In [1]:
import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext import vocab

import numpy as np
import os
import random

In [2]:
#define the SEED for reproduciability
SEED = 101
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [3]:
#create thr field objects
TEXT = data.Field(lower = True)
LABEL = data.LabelField(dtype = torch.float32)

In [None]:
#once our field objects are created we can now load our training and testing datasets
path = "F:/datasets/PRACTICE FOLDER/PYTORCH PRACTICE/TEXT DATA/SENTIMENT ANALYSIS/.data/"
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root = path)

In [5]:
#let's check the length of our train and test dataset
print(f"THE LENGTH OF OUR TRAINING DATA IS : {len(train_data)}")
print(f"THE LENGTH OF OUR TESTING DATA IS : {len(test_data)}")

THE LENGTH OF OUR TRAINING DATA IS : 25000
THE LENGTH OF OUR TESTING DATA IS : 25000


In [6]:
#we can also check few of the text examples
print(vars(train_data.examples[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........', 'high.', 'a', 'classic', 'line:', 'inspector:', "i'm", 'here', 'to', 'sack', 'one', 'of', '

In [7]:
#once we have our training and testing set, we would want to have a validation set as well
#so we will be deviding the train set further into training and validation
train_data, valid_data = train_data.split(split_ratio = 0.7, random_state = random.seed(SEED))

In [8]:
#let's check the size ouf our train, valid and test set
print(f"THE SIZE OF THE TRAINING SET IS : {len(train_data)}")
print(f"THE SIZE OF THE VALIDATION SET IS : {len(valid_data)}")
print(f"THE SOZE OF THE TESTING SET IS : {len(test_data)}")

THE SIZE OF THE TRAINING SET IS : 17500
THE SIZE OF THE VALIDATION SET IS : 7500
THE SOZE OF THE TESTING SET IS : 25000


In [9]:
#let's create the vocabulary
embedding_path = "F:/datasets/GLOVE PRETRAINED MODEL/glove.6B.100d.txt"
TEXT.build_vocab(train_data, max_size = 25000)
LABEL.build_vocab(test_data)

In [10]:
#load the pre-trained word embeddings
vectors = vocab.Vectors(embedding_path)

In [11]:
TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

In [12]:
#we can check the length of the vocabulary that we have
print(f"THE LENGTH OF THE TEXT VOCABULARY IS : {len(TEXT.vocab)}")
print(f"THE LENGTH OF THE LABEL VOCABULARY IS : {len(LABEL.vocab)}")

THE LENGTH OF THE TEXT VOCABULARY IS : 25002
THE LENGTH OF THE LABEL VOCABULARY IS : 2


You can see we got the length of the TEXT vocabulary to be 25002. That extra two is because of the tokens for the out of vocabulary word called $<unk>$ and the other one is the token for the padding $<pad>$

In [13]:
#the result displays the frequency occurence of each word in the vocabulary
import operator
sorted(TEXT.vocab.freqs.items(), key = operator.itemgetter(1))[::-1]

[('the', 224887),
 ('a', 111557),
 ('and', 110593),
 ('of', 100616),
 ('to', 93640),
 ('is', 72251),
 ('in', 63402),
 ('i', 49253),
 ('this', 48412),
 ('that', 46215),
 ('it', 45621),
 ('/><br', 35559),
 ('was', 32892),
 ('as', 31290),
 ('for', 29951),
 ('with', 29874),
 ('but', 27735),
 ('on', 21968),
 ('movie', 21391),
 ('his', 20331),
 ('are', 19934),
 ('not', 19931),
 ('film', 19337),
 ('you', 19287),
 ('have', 19128),
 ('he', 18308),
 ('be', 17863),
 ('at', 15796),
 ('one', 15652),
 ('by', 15214),
 ('an', 14738),
 ('they', 14545),
 ('from', 13882),
 ('all', 13712),
 ('who', 13591),
 ('like', 13099),
 ('so', 12547),
 ('just', 12044),
 ('her', 11652),
 ('has', 11640),
 ('or', 11598),
 ('about', 11569),
 ("it's", 11033),
 ('if', 10645),
 ('some', 10620),
 ('out', 9979),
 ('what', 9828),
 ('when', 9607),
 ('very', 9546),
 ('more', 9076),
 ('there', 9053),
 ('she', 8635),
 ('good', 8396),
 ('would', 8379),
 ('even', 8349),
 ('my', 8106),
 ('only', 8025),
 ('their', 7898),
 ('no', 7736)

In [14]:
#results will display the 300 dimensional vectors for each word in the vocabulary
TEXT.vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.7464,  0.3083, -0.4010,  ...,  0.7917,  0.3290,  0.0278],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4910,  0.0436,  0.1195,  ...,  0.3607, -0.7292,  0.1557]])

In [15]:
#the results gives the dictionery mapping each word in the vocabulary to it's corresponding integer value
TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index()>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             'a': 3,
             'and': 4,
             'of': 5,
             'to': 6,
             'is': 7,
             'in': 8,
             'i': 9,
             'this': 10,
             'that': 11,
             'it': 12,
             '/><br': 13,
             'was': 14,
             'as': 15,
             'for': 16,
             'with': 17,
             'but': 18,
             'on': 19,
             'movie': 20,
             'his': 21,
             'are': 22,
             'not': 23,
             'film': 24,
             'you': 25,
             'have': 26,
             'he': 27,
             'be': 28,
             'at': 29,
             'one': 30,
             'by': 31,
             'an': 32,
             'they': 33,
             'from': 34,
             'all': 35,
             'who': 36,
             'like': 37,
             'so': 38,
             '

In [16]:
#check for the GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


#### GENERATE BATCHES OF VECTORS

In [34]:
#the last step is to create the iterator object for the training, validation and the testing set
#the iterator will return the batches of examples during training of the model
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                         batch_size = BATCH_SIZE,
                                                                          device = device)

In [35]:
#let's see the sample example
batch = next(iter(train_iterator))
batch.text

tensor([[   10,    10,  2321,  ...,    10,   317,     7],
        [   91,    24,     7,  ...,   130,  7322,    52],
        [   28,     7, 12452,  ...,   148,    22,    93],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]], device='cuda:0')

In [36]:
batch.label

tensor([1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1.,
        1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1.,
        0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
        0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
        0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0.,
        0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
        0., 0.], device='cuda:0')

### BUILD THE MODEL

### Implementation Details

Another addition to this model is that we are not going to learn the embedding for the $$<pad>$$ token. This is because we want to explitictly tell our model that padding tokens are irrelevant to determining the sentiment of a sentence. This means the embedding for the pad token will remain at what it is initialized to (we initialize it to all zeros later). We do this by passing the index of our pad token as the padding_idx argument to the nn.Embedding layer.

To use an LSTM instead of the standard RNN, we use nn.LSTM instead of nn.RNN. Also, note that the LSTM returns the output and a tuple of the final hidden state and the final cell state, whereas the standard RNN only returned the output and final hidden state.

As the final hidden state of our LSTM has both a forward and a backward component, which will be concatenated together, the size of the input to the nn.Linear layer is twice that of the hidden dimension size.

Implementing bidirectionality and adding additional layers are done by passing values for the num_layers and bidirectional arguments for the RNN/LSTM.

Dropout is implemented by initializing an nn.Dropout layer (the argument is the probability of dropping out each neuron) and using it within the forward method after each layer we want to apply dropout to. Note: never use dropout on the input or output layers (text or fc in this case), you only ever want to use dropout on intermediate layers. The LSTM has a dropout argument which adds dropout on the connections between hidden states in one layer to hidden states in the next layer.

The final hidden state, hidden, has a shape of [num layers * num directions, batch size, hid dim]. These are ordered: [forward_layer_0, backward_layer_0, forward_layer_1, backward_layer 1, ..., forward_layer_n, backward_layer n]. As we want the final (top) layer forward and backward hidden states, we get the top two hidden layers from the first dimension, hidden[-2,:,:] and hidden[-1,:,:], and concatenate them together before passing them to the linear layer (after applying dropout).

In [37]:
#now we have our vocabulary build and our iterators created we can now start building the model
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [38]:
class UpdatedRNN(nn.Module):
    def __init__(self, input_dim, output_dim ,hidden_dim, embedding_dim, n_layers, bidirectional, dropout, pad_idx):
        super(UpdatedRNN, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional = bidirectional, dropout = dropout, num_layers = n_layers)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(p = dropout)
    
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell) = self.lstm(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        
        #hidden = [batch size, hid dim * num directions]
        
        out = self.fc(hidden.squeeze(0))
        
        return out

In [39]:
#define the parameters
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = 1
EMBEDDING_DIM = 100 
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = UpdatedRNN(input_dim = INPUT_DIM, 
                   output_dim = OUTPUT_DIM, 
                   hidden_dim = HIDDEN_DIM, 
                   embedding_dim = EMBEDDING_DIM, 
                   dropout = DROPOUT,
                   pad_idx = PAD_IDX,
                   bidirectional = BIDIRECTIONAL,
                   n_layers = N_LAYERS)

# #transfer the model to gpu
model = model.to(device)
print(model)

UpdatedRNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (lstm): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)




We'll print out the number of parameters in our model.

Notice how we have almost twice as many parameters as before!


In [40]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters




The final addition is copying the pre-trained word embeddings we loaded earlier into the embedding layer of our model.

We retrieve the embeddings from the field's vocab, and check they're the correct size, [vocab size, embedding dim]


In [41]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


We then replace the initial weights of the embedding layer with the pre-trained embeddings.

In [42]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.7464,  0.3083, -0.4010,  ...,  0.7917,  0.3290,  0.0278],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4910,  0.0436,  0.1195,  ...,  0.3607, -0.7292,  0.1557]],
       device='cuda:0')



As our $<unk>$ and $<pad>$ token aren't in the pre-trained vocabulary they have been initialized using unk_init (an $\mathcal{N}(0,1)$ distribution) when building our vocab. It is preferable to initialize them both to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment.

We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.


In [43]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.7464,  0.3083, -0.4010,  ...,  0.7917,  0.3290,  0.0278],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4910,  0.0436,  0.1195,  ...,  0.3607, -0.7292,  0.1557]],
       device='cuda:0')


We can now see the first two rows of the embedding weights matrix have been set to zeros. As we passed the index of the pad token to the padding_idx of the embedding layer it will remain zeros throughout training, however the <unk> token embedding will be learned.

### Train the Model

In [44]:
#define the optimizer
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [45]:
#define the loss function
criterion = nn.BCEWithLogitsLoss()

In [46]:
#define the function to calculate the accuracy
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    
    return acc

In [47]:
#define the training function
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [48]:
#define the function to evaluate the model
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            
            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

And also create a nice function to tell us how long our epochs are taking.

In [49]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [50]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'updated-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: CUDA out of memory. Tried to allocate 3.50 GiB (GPU 0; 6.00 GiB total capacity; 1.39 GiB already allocated; 2.94 GiB free; 274.26 MiB cached)