In [1]:
##OBJECTIVE 
##SENTIMENT CLASSIFICATION  FOR IMDB USING RNN
## REF :https://github.com/bentrevett/pytorch-sentiment-analysis

In [2]:

import random
import subprocess
import sys
import time

import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
from torchtext.legacy import data
from torchtext.legacy import datasets

In [3]:
def get_GPU_CPU(seed_val = 1):
    """Seed PyTorch's random number generators and report the compute device.

    The CPU generator is seeded on every path (previously it was skipped when
    CUDA was available), and when CUDA is available every visible GPU is
    seeded as well.

    Args:
        seed_val: Seed passed to the torch random number generators.

    Returns:
        Tuple ``(cuda, seed_val, device)``: ``cuda`` is the result of
        ``torch.cuda.is_available()``, ``seed_val`` is echoed back unchanged
        and ``device`` is the string ``'cuda'`` or ``'cpu'``.
    """
    print('The Seed is set to {}'.format(seed_val))
    # Always seed the CPU generator: this notebook builds the model on the CPU
    # and only afterwards moves it to the device, so weight initialisation
    # draws from the CPU generator even on a GPU machine.
    torch.manual_seed(seed_val)
    cuda = torch.cuda.is_available()
    if cuda:
        print('Model will Run on CUDA.')
        # Seed all GPUs, not only the currently selected one.
        torch.cuda.manual_seed_all(seed_val)
        # Print the GPU status report. This replaces the IPython-only
        # `!nvidia-smi` line so the function is plain Python; it is best
        # effort and must not abort the run if the tool cannot be started.
        try:
            report = subprocess.run(['nvidia-smi'],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
            print(report.stdout)
        except OSError:
            print('nvidia-smi could not be executed.')
        device = 'cuda'
    else:
        print ('Running in CPU')
        device = 'cpu'
    return cuda,seed_val,device

In [4]:
# Seed the torch RNGs and pick the device. SEED is reused for the train/validation split
# and `device` for the iterators, the model and the loss in later cells.
cuda,SEED,device = get_GPU_CPU(seed_val=1234)

The Seed is set to 1234
Model will Run on CUDA.
Thu May 27 13:24:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    37W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------

In [5]:
# Reproducibility: besides seeding the random number generators, ask cuDNN to use
# deterministic algorithms so repeated GPU runs with the same seed give the same results.
torch.backends.cudnn.deterministic = True

In [6]:
# TEXT: the review text, tokenized with spaCy using the 'en_core_web_sm' English model.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm')
# LABEL: the sentiment target, stored as torch.float because the loss used later
# (nn.BCEWithLogitsLoss) is fed float targets.
LABEL = data.LabelField(dtype = torch.float)



In [7]:
# WHY FLOAT FOR LABEL:
# When initializing the LABEL field we set dtype=torch.float. TorchText creates LongTensors by default,
# but our criterion expects both of its inputs to be FloatTensors; setting dtype=torch.float takes care of that.
# The alternative would be to do the conversion inside the train function by passing batch.label.float()
# instead of batch.label to the criterion.

In [None]:
# Download the IMDB dataset (if needed) and load its train and test splits,
# processing the review text with TEXT and the sentiment label with LABEL.
# torch, data and datasets are already imported in the imports cell at the top
# of the notebook, so they are not re-imported here.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
# Report how many examples the train and test splits contain.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

In [None]:
# Show one raw training example.
print(vars(train_data.examples[0]))
# vars() exposes the example's attributes as a dictionary; the keys are expected to be
# "text" and "label" (later cells read batch.text and batch.label).

In [None]:
# Split the training data into train and validation sets.
# No split_ratio is passed, so torchtext's default of 0.7 applies: the split is
# 70:30, not 80:20. Pass split_ratio=0.8 explicitly if an 80:20 split is wanted.
#
# Seed Python's RNG first, then hand its state to split(). The previous form,
# random_state=random.seed(SEED), passed None (random.seed returns None) and was
# only reproducible because split() falls back to the current global RNG state.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [None]:
# Report the size of each split after carving the validation set out of the training data.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

In [None]:
# Build the vocabularies (token -> index mappings) for both fields from the training split only.
MAX_VOCAB_SIZE = 25_000  # upper bound on the number of distinct tokens kept for TEXT

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)


#This assigns a unique index to each word. A document may contain 10 lakh (1,000,000) elements, but we should not
#create 10 lakh unique indices; instead we take the 25,000 most frequent words and assign indices only to them.
#What happens if a word is encountered that is not among the 25,000 most frequent? That word is mapped to <unk>, i.e. unknown.

In [None]:
#Why do we only build the vocabulary on the training set? 
#When testing any machine learning system you do not want to look at the test set in any way. We do not include the validation set as we want it to reflect the test set as much as possible.



In [None]:
# Report the size of each vocabulary built above.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

In [None]:
#Why is the vocab size 25002 and not 25000?
#One of the additional tokens is the <unk> token and the other is the <pad> token.

#When we feed sentences into our model, we feed a batch of them at a time, i.e. more than one at a time, and all sentences in the batch need to be the same size.
# Thus, to ensure each sentence in the batch is the same size, any shorter than the longest within the batch are padded.

#sentenc1 : I live in bangalore and this is good
#sentence2: I am   a  boy       <pad><pad><pad><pad>


In [None]:
# Show the 20 most frequent tokens in the training vocabulary together with their counts.
print(TEXT.vocab.freqs.most_common(20))

In [None]:


BATCH_SIZE = 64

# Resolve the device BEFORE building the iterators, so the device object handed
# to them is the one defined in this cell (previously it was re-created only
# after the iterators had already been built).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Create Iterator objects for multiple splits of a dataset.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

#POINT 01:creating the iterators. We iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.

#POINT 02 :BucketIterator which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.

#POINT 03 :We also want to place the tensors returned by the iterator on the GPU (if you're using one). PyTorch handles this using torch.device, we then pass this device to the iterator.



In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Sentiment classifier: embedding layer -> single RNN layer -> linear output layer."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the three layers.

        Args:
            input_dim: Number of rows in the embedding table (the vocabulary size).
            embedding_dim: Size of each embedding vector fed to the RNN.
            hidden_dim: Size of the RNN hidden state.
            output_dim: Number of values produced by the final linear layer.
        """
        super().__init__()

        # Layers are created in the same order as before so that a seeded
        # initialisation produces the same weights.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map a batch of token indices to one output row per sentence.

        Args:
            text: Tensor of token indices, [sent len, batch size].

        Returns:
            Output of the linear layer applied to the final hidden state.
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # all_states: [sent len, batch size, hid dim]
        # last_state: [1, batch size, hid dim]
        all_states, last_state = self.rnn(token_vectors)

        # Drop the leading dimension of size 1 -> [batch size, hid dim]
        final_hidden = last_state.squeeze(0)

        # Sanity check: the last step of the per-step outputs is the final hidden state.
        assert torch.equal(all_states[-1], final_hidden)

        return self.fc(final_hidden)

In [None]:
# In RNN we have 3 layers 
#IN RNN
#----------
#Embedding layer: transforms our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector. It is simply a single fully connected (FC) layer.
#RNN layer: takes in our dense vector and the previous hidden state h_{t-1}, which it uses to calculate the next hidden state, h_t.
#Output layer: finally, the linear layer takes the final hidden state, h_T, and feeds it through a fully connected layer, transforming it to the correct output dimension.

#In the forward method:
#---------------
#Each batch, text, is a tensor of size [sentence length, batch size]. That is a batch of sentences, each having each word converted into a one-hot vector.
#(The tensor stores one vocabulary index per word rather than the full one-hot vector; the embedding layer looks the index up directly.)
#E.g. batch 1: 64 sentences, each of length 772, with each word one-hot encoded (see below). The sentence length differs from batch to batch, but within a batch
#all sentences must have the same length.

#The input batch is then passed through the embedding layer to get embedded, which gives us a dense vector representation of our sentences.
# embedded is a tensor of size [sentence length, batch size, embedding dim].

#embedded is then fed into the RNN. In some frameworks you must feed the initial hidden state, ho, into the RNN, however in PyTorch,
# if no initial hidden state is passed as an argument it defaults to a tensor of all zeros.

#OUTPUT OF RNN:
#The RNN returns 2 tensors, output of size [sentence length, batch size, hidden dim] and hidden of size [1, batch size, hidden dim].
#output is the concatenation of the hidden state from every time step, whereas hidden is simply the final hidden state

#ASSERT:
#We verify this using the assert statement. Note the squeeze method, which is used to remove a dimension of size 1.

#Finally, we feed the last hidden state, hidden, through the linear layer, fc, to produce a prediction


In [None]:
# Peek at the first batch of every split and print its tensor sizes.
# The header strings are reproduced exactly as before (only 'Train' has no
# leading newline and no trailing colon).
for split_header, split_iterator in (('Train', train_iterator),
                                     ('\nValid:', valid_iterator),
                                     ('\nTest:', test_iterator)):
    print(split_header)
    for batch in split_iterator:
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        print(f'Text matrix size: {batch.text.size()}')
        break  # only the first batch of each split is inspected

In [None]:
INPUT_DIM = len(TEXT.vocab)  # vocabulary size: one embedding row per token index
EMBEDDING_DIM = 100          # size of each dense word vector
HIDDEN_DIM = 256             # size of the RNN hidden state
OUTPUT_DIM = 1               # a single output value for the binary sentiment label

# Build the model; it is moved to `device` in a later cell.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [None]:
#MEANING :
#The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size.: HERE:len(TEXT.vocab)

#The embedding dimension is the size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.: HERE :100

#The hidden dimension is the size of the hidden states. This is usually around 100-500 dimensions, but also depends on factors such as on the vocabulary size, 
#the size of the dense vectors and the complexity of the task. : HERE :256

#The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional,
# i.e. a single scalar real number.: HERE :1

In [None]:
# GET COUNT OF TRAINABLE PARAMETERS

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
    return sum(trainable_sizes)

# Print the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')


In [None]:
# DEFINE OPTIMIZER

In [None]:
import torch.optim as optim
# Plain stochastic gradient descent over all model parameters, learning rate 1e-3.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [None]:
# DEFINE LOSS
# nn.BCEWithLogitsLoss: binary cross-entropy computed directly on logits.
# The model outputs an unbounded real number. This loss applies the sigmoid itself
# (squashing the value into the range 0 to 1) and then computes binary cross-entropy
# on that bounded scalar, so the model does not need its own sigmoid layer.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss to `device` (the GPU when one is available, otherwise the CPU).
model = model.to(device)
criterion = criterion.to(device)

In [None]:
##BINARY ACCURACY FUNCTION  PER BATCH 
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, NOT 8).

    preds holds raw model outputs (logits) and y the 0/1 targets; the result
    is returned as a tensor.
    """
    # logits -> sigmoid -> rounded to the nearest integer (hard 0/1 decisions)
    hard_preds = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)

In [None]:
#TRAINING

def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update the model in place.

    Args:
        model: Module called on ``batch.text``.
        iterator: Iterable of batches exposing ``.text`` and ``.label``; must support ``len()``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called once per batch.
        criterion: Loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` over the pass.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        # gradients accumulate across backward() calls, so clear them per batch
        optimizer.zero_grad()

        # squeeze dim 1 so the predictions line up with batch.label
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
#MODEL.TRAIN:
#model.train() is used to put the model in "training mode", which turns on dropout and batch normalization. 
#Although we aren't using them in this model, it's good practice to include it.

#OPTIMIZER.ZEROGRAD :
#For each batch, we first zero the gradients. Each parameter in a model has a grad attribute which stores the gradient calculated by the criterion.
#PyTorch does not automatically remove (or "zero") the gradients calculated from the last gradient calculation, so they must be manually zeroed.

#CALL TO FORWARD:
#We then feed the batch of sentences, batch.text, into the model. Note, you do not need to do model.forward(batch.text),
#simply calling the model works. The squeeze is needed as the predictions are initially size [batch size, 1],
# and we need to remove the dimension of size 1 as PyTorch expects the predictions input to our criterion function to be of size [batch size].

#CALCULATE LOSS AND ACCURACY :
#The loss and accuracy are then calculated using our predictions and the labels, batch.label, with the loss being averaged over all examples in the batch.

#CALCULATE AND UPDATE GRADIENT :
#We calculate the gradient of each parameter with loss.backward(), and then update the parameters using the gradients and optimizer algorithm with optimizer.step().

#ACCUMULATE LOSS AND ACCURACY FOR ENTIRE SINGLE EPOCH 
#The loss and accuracy is accumulated across the epoch, the .item() method is used to extract a scalar from a tensor which only contains a single value.

#RETURN AVERAGE ACCURACY AND LOSS
#Finally, we return the loss and accuracy, averaged across the epoch. The len of an iterator is the number of batches in the iterator.

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating its parameters.

    Args:
        model: Module called on ``batch.text``.
        iterator: Iterable of batches exposing ``.text`` and ``.label``; must support ``len()``.
        criterion: Loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` over the pass.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    # no parameter update happens here, so gradient tracking is switched off
    with torch.no_grad():
        for batch in iterator:
            # squeeze dim 1 so the predictions line up with batch.label
            logits = model(batch.text).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
#evaluate is similar to train, with a few modifications as you don't want to update the parameters when evaluating.
#MODEL.EVAL()
#model.eval() puts the model in "evaluation mode", this turns off dropout and batch normalization. Again, we are not using them in this model, but it is good practice to include them.
#TORCH.NO_GRAD()
#No gradients are calculated on PyTorch operations inside the with no_grad() block.
#This causes less memory to be used and speeds up computation.
#REMOVE optimizer.zero_grad(), loss.backward() and optimizer.step()
#The rest of the function is the same as train, with the removal of optimizer.zero_grad(), loss.backward() and optimizer.step(), as we do not update the model's parameters when evaluating.

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    # what is left once the whole minutes are taken out, truncated to an integer
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [None]:
#We then train the model through multiple epochs, an epoch being a complete pass through all examples in the training and validation sets.

In [None]:
N_EPOCHS = 30

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

# Per-epoch history of train / validation loss and accuracy (read by the plotting cell).
train_loss_epoch=[]
valid_loss_epoch=[]
train_acc_epoch=[]
valid_acc_epoch=[]



for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One pass over the training data (updates the model), then score on the validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    # Wall-clock time of this epoch (training + validation), as whole minutes and seconds.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights whenever the validation loss improves; the file is overwritten each time.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Record rounded metrics for plotting; accuracies are stored as percentages.
    train_loss_epoch.append(round(train_loss,3))
    valid_loss_epoch.append(round(valid_loss,3))
    train_acc_epoch.append(round((train_acc*100),2))
    valid_acc_epoch.append(round((valid_acc*100),2))
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# PLOT TRAIN AND VALIDATION LOSS AND ACCURACY OVER THE N EPOCHS

In [None]:

# seaborn (sns) and matplotlib.pyplot (plt) are imported in the imports cell at
# the top of the notebook.
sns.set()
plt.style.use("dark_background")

# One panel per tracked metric on a 2x2 grid: losses on the top row,
# accuracies (stored as percentages) on the bottom row.
fig, axs = plt.subplots(2,2,figsize=(15,10))
panels = (
    (axs[0, 0], train_loss_epoch, "Training Loss", "Loss"),
    (axs[0, 1], valid_loss_epoch, "Valid Loss", "Loss"),
    (axs[1, 0], train_acc_epoch, "Training Accuracy ", "Accuracy (%)"),
    (axs[1, 1], valid_acc_epoch, "Validation Accuracy", "Accuracy (%)"),
)
for ax, history, title, y_label in panels:
    ax.plot(history)
    ax.set_title(title)
    # Label both axes so each panel can be read on its own; x is the zero-based epoch index.
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)

plt.show()

In [None]:
# CALCULATE TEST SET LOSS AND ACCURACY 

In [None]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss). map_location remaps the saved tensors onto the
# current device, so a checkpoint saved on a GPU machine can also be loaded in
# a CPU-only session instead of failing.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

# Score the restored model once on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')