In [None]:
# Pin torch/torchtext to a matching release pair; later cells import
# `torchtext.legacy`, so the torchtext version matters.
# NOTE(review): the saved output of this cell shows torch 1.8.0 / torchtext 0.9.0
# being installed, i.e. it was recorded with an earlier version of this command.
!pip install -U torch==1.10.0 torchtext==0.11.0

# Reload environment: exit() ends the running kernel — presumably so the next
# start picks up the freshly installed packages.
exit()

Collecting torch==1.8.0
  Downloading torch-1.8.0-cp37-cp37m-manylinux1_x86_64.whl (735.5 MB)
[K     |████████████████████████████████| 735.5 MB 14 kB/s 
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 48.6 MB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.8.0 which is incompatible.
torchaudio 0.11.0

In [None]:
import random
import re
import pandas as pd
import spacy
import torch
import torch.optim as optim
import torch.nn as nn
from torchtext.legacy import data

In [None]:
# Mount Google Drive inside the Colab VM (force_remount=True remounts even if
# a mount already exists).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Base folder on Drive: the dataset path and the checkpoint path are both
# built by concatenating onto this string, so the trailing slash is required.
g_path = "/content/drive/My Drive/pytorch/"

Mounted at /content/drive


In [None]:
# Dataset CSV, relative to g_path (the two strings are concatenated in load_data).
data_fl = 'data/IMDB_review_sentiment_small.csv'

In [None]:
# Fixed seed for reproducible runs: seeds torch's RNG here, and SEED is reused
# by split_data when the dataset is split.
SEED = 2021
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f7fcac6c2f0>

In [None]:
# Only the tokenizer of this pipeline is used (see `tokenizer`), so a blank
# English pipeline is sufficient. Unlike spacy.load('en') it needs no model
# download and does not rely on the 'en' shortcut link, which spaCy v3 removed.
spacy_en = spacy.blank('en')
def clean_data(texts):
    """Normalise a sequence of strings and drop the ones that end up empty.

    For every item:
      * a stand-alone ``br`` (what is left of an HTML ``<br />`` tag) is
        removed; ``br`` inside a word such as "brutality" is kept,
      * every run of characters other than a-z, A-Z, 0-9 (punctuation,
        whitespace, newlines) is collapsed to a single space,
      * the result is stripped and lower-cased.

    Args:
        texts: iterable of strings (e.g. the token list of one review).

    Returns:
        list[str]: the cleaned, non-empty strings in their original order.
    """
    cleaned_text = []
    for text in texts:
        # Match "br" only when it is not glued to a letter or digit, so that
        # real words containing "br" survive intact.
        text = re.sub(r'(?<![a-zA-Z0-9])br(?![a-zA-Z0-9])', ' ', text)
        # One pass handles punctuation, repeated spaces and newlines alike.
        text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
        text = text.strip().lower()

        if text:
            cleaned_text.append(text)
    return cleaned_text

def tokenizer(text):
    """Run the spaCy tokenizer over `text` and return the token strings as a list."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# TEXT: the review text. It is tokenized with `tokenizer` and the resulting
# token list is passed through `clean_data`. include_lengths=True makes each
# batch expose `.text` as a (token ids, lengths) pair, which the training and
# validation loops unpack.
TEXT = data.Field(preprocessing=clean_data,tokenize=tokenizer,batch_first=True,include_lengths=True)
# LABEL: the sentiment label, stored as float (the loss used later is nn.BCELoss).
LABEL = data.LabelField(dtype = torch.float,batch_first=True)
# (column name, field) pairs handed to TabularDataset in load_data.
# NOTE(review): assumes the CSV columns are ordered text, label — confirm against the file.
fields = [('text',TEXT),('label', LABEL)]

In [None]:
#loading the entire data
def load_data():
  """Read the CSV at g_path + data_fl into a TabularDataset built from `fields`.

  The first row of the file is treated as a header and skipped.
  """
  csv_path = g_path + data_fl
  return data.TabularDataset(
      path=csv_path,
      format='csv',
      fields=fields,
      skip_header=True,
  )

imdb_data = load_data() 
# Sanity check on the first example: all of its attributes as a dict, then the
# token list and the label on their own.
print(vars(imdb_data.examples[0]))
print(imdb_data.examples[0].text, imdb_data.examples[0].label)

{'text': ['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'oz', 'episode', 'you', 'll', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'utality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'word', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'the',

In [None]:
#splitting the data into training and validation dataset
def split_data(imdb_data, split_ratio=0.7, seed=None):
  """Split a dataset into a training part and a validation part, reproducibly.

  Args:
    imdb_data: dataset object exposing ``split(split_ratio=..., random_state=...)``.
    split_ratio: fraction of examples that goes to the training part.
    seed: integer seed for the shuffle; when None the notebook-level SEED is used.

  Returns:
    (train_data, valid_data) as returned by ``imdb_data.split``.
  """
  if seed is None:
    seed = SEED
  # random.seed() returns None, so it must not be passed as random_state itself.
  # Seed the generator first, then hand over the resulting state explicitly.
  random.seed(seed)
  shuffle_state = random.getstate()
  train_data, valid_data = imdb_data.split(split_ratio=split_ratio, random_state=shuffle_state)
  return train_data, valid_data

# Training and validation subsets used by every later cell.
train_data, valid_data = split_data(imdb_data)

In [None]:
# Build both vocabularies from the training split only.
# TEXT: min_freq=3 sets the minimum token frequency for inclusion, and the
# pretrained "glove.6B.100d" vectors are attached to the vocabulary (they are
# downloaded into .vector_cache on first use, see the cell output).
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

# Number of entries in the TEXT vocabulary
print("Size of TEXT vocabulary:",len(TEXT.vocab))
# Number of entries in the LABEL vocabulary
print("Size of LABEL vocabulary:",len(LABEL.vocab))

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:16<00:00, 24229.43it/s]


Size of TEXT vocabulary: 466
Size of LABEL vocabulary: 2


In [None]:
#preparing batches for training the model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 5

# One iterator per split, both placing their batches on `device`.
# sort_key orders examples by token count and sort_within_batch=True sorts each
# batch by that key — the layout pack_padded_sequence expects when its
# enforce_sorted argument is left at the default.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [None]:
class Classifier(nn.Module):
  """Bidirectional-LSTM binary classifier over token-id sequences.

  Pipeline: token ids -> embedding -> packed BiLSTM -> sum of the per-step
  LSTM outputs over the sequence axis -> linear layer -> sigmoid.

  Args:
    vocab_size: number of rows of the embedding table.
    **kwargs: forwarded unchanged to ``nn.Module.__init__``.
  """

  def __init__(self, vocab_size, **kwargs):
    super(Classifier, self).__init__(**kwargs)

    # fixed hyper-parameters
    self.embedding_dim = 100
    self.hidden_dim = 32
    self.num_layers = 1
    self.bidirectional = True
    self.batch_first = True
    self.output_dim = 1

    # embedding layer
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=self.embedding_dim)

    # lstm layer
    self.lstm = nn.LSTM(input_size=self.embedding_dim,
                        hidden_size=self.hidden_dim,
                        num_layers=self.num_layers,
                        bidirectional=self.bidirectional,
                        batch_first=self.batch_first)

    # dense layer; the input is 2 * hidden_dim because the forward and
    # backward LSTM outputs are concatenated
    self.fc = nn.Linear(self.hidden_dim * 2, self.output_dim)

    # activation function
    self.act = nn.Sigmoid()

  def forward(self, txt, txt_len):
    """Return one sigmoid score per input sequence.

    Args:
      txt: integer token ids, [batch_size, seq_len]. seq_len is the longest
        row of the batch; shorter rows are padded up to it.
      txt_len: unpadded length of every row, [batch_size]. May be a tensor on
        any device or a plain sequence of ints, and the rows may appear in
        any length order.

    Returns:
      Tensor of shape [batch_size, output_dim] with values between 0 and 1.
    """
    # Step 1: token ids -> vectors, [batch_size, seq_len, embedding_dim]
    embed_txt = self.embedding(txt)

    # Step 2.1: pack the embeddings so the LSTM skips the padded positions.
    # pack_padded_sequence needs the lengths as a CPU int64 tensor even when
    # the batch itself lives on the GPU. enforce_sorted=False lets it sort the
    # rows internally; the original row order is restored when unpacking.
    lengths_cpu = torch.as_tensor(txt_len, dtype=torch.int64).cpu()
    packed_embed = nn.utils.rnn.pack_padded_sequence(
        embed_txt, lengths_cpu, batch_first=True, enforce_sorted=False)

    # Step 2.2: run the LSTM; the result is still a PackedSequence and the
    # final hidden/cell states are not needed here.
    packed_out, _ = self.lstm(packed_embed)

    # Step 2.3: unpack to a zero-padded tensor,
    # [batch_size, max(txt_len), 2 * hidden_dim]
    lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

    # Step 3: sum the per-step outputs over the sequence axis; padded steps are
    # zero and do not contribute. Result: [batch_size, 2 * hidden_dim]
    sum_ip = lstm_out.sum(dim=1)

    # Step 4: linear layer, [batch_size, output_dim]
    fc_out = self.fc(sum_ip)

    # Step 5: squash to (0, 1), [batch_size, output_dim]
    out = self.act(fc_out)

    return out

In [None]:
#define hyperparameters
vocab_size = len(TEXT.vocab)

#instantiate the model
train_model = Classifier(vocab_size)

# Start from the pretrained vectors that build_vocab attached to the TEXT
# vocabulary instead of a random embedding table; without this copy the
# downloaded GloVe vectors are never used. Skipped when no vectors are
# attached or when their shape does not match the embedding layer.
pretrained_vectors = TEXT.vocab.vectors
if pretrained_vectors is not None and pretrained_vectors.shape == train_model.embedding.weight.shape:
  with torch.no_grad():
    train_model.embedding.weight.copy_(pretrained_vectors)

train_model = train_model.to(device)

In [None]:
#define metric
def binary_accuracy(preds, y):
    """Share of predictions that equal `y` once rounded to the nearest integer.

    Returns a 0-dim tensor: number of matches divided by ``len`` of the
    element-wise comparison.
    """
    hits = torch.eq(torch.round(preds), y).float()
    return hits.sum() / len(hits)

In [None]:
def valid_model(valid_iterator, train_model, criterion):
  """Score `train_model` on every batch of `valid_iterator` with gradients disabled.

  Each batch must expose `.text` as a (token ids, lengths) pair and `.label`.
  Switching the model to eval mode is left to the caller.

  Returns:
    (loss, accuracy): the per-batch loss and the per-batch binary accuracy,
    each averaged over the number of batches in the iterator.
  """
  batch_losses = []
  batch_accs = []

  with torch.no_grad():
    for valid_batch in valid_iterator:
      # token ids and the unpadded length of every row
      text, text_lengths = valid_batch.text
      target = valid_batch.label

      # [batch_size, 1] -> [batch_size]
      preds = train_model(text, text_lengths).squeeze(-1)

      batch_losses.append(criterion(preds, target).item())
      batch_accs.append(binary_accuracy(preds, target).item())

  n_batches = len(valid_iterator)
  return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
#training the model

#define the optimizer
optimizer = optim.Adam(train_model.parameters())

#define the loss (the model already applies a sigmoid, hence BCELoss)
criterion = nn.BCELoss()
criterion = criterion.to(device)

#set the model in training phase
train_model.train()

N_EPOCHS = 6
VALIDATION_EPOCH = 2

# Epochs are numbered 0..N_EPOCHS inclusive, so the last epoch is also one
# on which validation is reported.
for epoch in range(N_EPOCHS+1):

  #running totals for this epoch
  epoch_loss = 0
  epoch_acc = 0

  for batch in train_iterator:
    #clear the gradients left over from the previous batch
    optimizer.zero_grad() 

    #retrieve text and no. of words
    text, text_lengths = batch.text

    #get prediction
    predictions = train_model(text, text_lengths)
    preds = predictions.squeeze(-1) #convert to 1D tensor

    #compute the loss
    loss = criterion(preds, batch.label)

    #compute the binary accuracy
    acc = binary_accuracy(preds, batch.label)   

    #backpropagate the loss and compute the gradients
    loss.backward()

    #update the weights
    optimizer.step() 

    #accumulate loss and accuracy
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  if epoch%VALIDATION_EPOCH == 0:
    train_model.eval() # set the model in eval phase
    valid_epoc_loss, valid_epoch_acc = valid_model(valid_iterator, train_model, criterion)
    train_model.train() # return back to training phase

    print("epoch:- ",epoch)
    print("training===> ","loss:- ", epoch_loss / len(train_iterator), "  accuracy:- ", epoch_acc / len(train_iterator))
    print("validation===> ","loss:- ", valid_epoc_loss, "  accuracy:- ", valid_epoch_acc)

# Save once, after the last epoch has finished, so the checkpoint holds the
# final weights (saving inside the loop at epoch N_EPOCHS-1 stored the weights
# from one epoch before the end of training).
torch.save(train_model.state_dict(), g_path+"model/classification_model.pt")

epoch:-  0
training===>  loss:-  24.26220291001456   accuracy:-  0.4571428724697658
validation===>  loss:-  2.331149458885193   accuracy:-  0.5333333512147268
epoch:-  2
training===>  loss:-  2.5758355855941772   accuracy:-  0.5142857219491687
validation===>  loss:-  3.7094129721323648   accuracy:-  0.46666667858759564
epoch:-  4
training===>  loss:-  1.0857821806733097   accuracy:-  0.6857142938034875
validation===>  loss:-  10.070106188456217   accuracy:-  0.3333333383003871
epoch:-  6
training===>  loss:-  0.5492339253365311   accuracy:-  0.8000000076634544
validation===>  loss:-  2.233311414718628   accuracy:-  0.46666667858759564


In [None]:
###  Inference  ###

#define hyperparameters
vocab_size = len(TEXT.vocab)

#instantiate the model
test_model = Classifier(vocab_size)
test_model = test_model.to(device)

#loading the model
model_path = g_path+"model/classification_model.pt"
# map_location lets a checkpoint written on one device type (e.g. GPU) be
# loaded on the device used in this session.
test_model.load_state_dict(torch.load(model_path, map_location=device))

test_model.eval() # set the model in eval phase

# Candidate inputs; the last one is the sentence that gets scored.
test_sentences = [
    "Are there any sports that you don't like?",
    "I love the movie",
    "I dislike the movie",
    "I don't like the movie",
]
test_sentence = test_sentences[-1]

# Run the sentence through the TEXT field's own pipeline (tokenize, then
# clean_data) so it is prepared exactly like the training examples.
tokenized_test_data = TEXT.preprocess(test_sentence)
test_data = " ".join(tokenized_test_data)  #cleaned sentence, kept for inspection

if not tokenized_test_data:
  raise ValueError("test_sentence has no tokens left after cleaning")

indexed_test_data = [TEXT.vocab.stoi[t] for t in tokenized_test_data]  #convert to integer sequence
txt_tensor = torch.LongTensor(indexed_test_data).to(device) #convert to tensor
txt_tensor_ip = txt_tensor.unsqueeze(0) #reshape to [batch = 1, no. of words]

length = [len(indexed_test_data)]  #compute no. of words
length_tensor_ip = torch.LongTensor(length) #lengths stay on the CPU for sequence packing

# no gradients are needed for a prediction
with torch.no_grad():
  prediction = test_model(txt_tensor_ip, length_tensor_ip) #prediction

print(prediction.item())

0.5662572383880615


**Example: summing a padded batch over the sequence dimension**

---

In [None]:
# Toy batch before padding: 5 sequences of different lengths, each step a
# 4-dimensional vector -> [batch_size, (seq_len ~ variable), embed_dim].
# Kept as a nested Python list because the rows have unequal lengths.
x = [
        [
         [0.16, 0.57, 0.12, 0.84],
         [0.64, 0.28, 0.42, 0.86]
        ],

        [
         [0.20, 0.91, 0.26, 0.16],
         [0.75, 0.32, 0.25, 0.75],
         [0.15, 0.16, 0.70, 0.48]
        ],

        [
         [0.91, 0.10, 0.74, 0.22],
         [0.25, 0.42, 0.29, 0.26],
         [0.51, 0.70, 0.12, 0.26]
        ],

        [
         [0.17, 0.91, 0.77, 0.88]
        ],

        [
         [0.35, 0.90, 0.18, 0.46],
         [0.44, 0.33, 0.16, 0.43],
         [0.10, 0.97, 0.10, 0.70]
        ]
      ]
# Number of steps in each sequence of x, in the same order.
# NOTE(review): torch.Tensor(...) builds a float tensor; lengths are normally
# integers — confirm whether any later code relies on the dtype.
x_len = torch.Tensor([2, 3, 3, 1,  3])

In [None]:
'''
first define the input to the attention
#[batch_size, seq_len, embed_dim]
batch_size : the number of input sentences at a time
seq_len : the number (max) of words among the input sentences
embed_dim : the vector dimension for each word (depends on RNN/LSTM hidden_dim)
'''

# Dimensions of the padded toy batch below.
batch_size = 5
seq_len = 3
embed_dim = 4

# The toy batch x padded to a rectangular tensor [batch_size, seq_len, embed_dim]:
# sequences shorter than seq_len (the 1st and the 4th) are filled up with
# all-zero rows.
x_padded = torch.Tensor([
        [[0.16, 0.57, 0.12, 0.84],
         [0.64, 0.28, 0.42, 0.86],
         [0.00, 0.00, 0.00, 0.00]],

        [[0.20, 0.91, 0.26, 0.16],
         [0.75, 0.32, 0.25, 0.75],
         [0.15, 0.16, 0.70, 0.48]],

        [[0.91, 0.10, 0.74, 0.22],
         [0.25, 0.42, 0.29, 0.26],
         [0.51, 0.70, 0.12, 0.26]],

        [[0.17, 0.91, 0.77, 0.88],
         [0.00, 0.00, 0.00, 0.00],
         [0.00, 0.00, 0.00, 0.00]],

        [[0.35, 0.90, 0.18, 0.46],
         [0.44, 0.33, 0.16, 0.43],
         [0.10, 0.97, 0.10, 0.70]]
      ])

In [None]:
# Sum over the sequence axis (dim=1): [batch_size, seq_len, embed_dim] -> [batch_size, embed_dim].
# Despite the name, no weights are applied — this is a plain sum, and the
# all-zero padding rows add nothing to it.
weighted_sum_x = torch.sum(x_padded, dim=1)
print("weighted_sum_x shape:- \n", weighted_sum_x.shape)
print("weighted_sum_x:- \n", weighted_sum_x)

weighted_sum_x shape:- 
 torch.Size([5, 4])
weighted_sum_x:- 
 tensor([[0.8000, 0.8500, 0.5400, 1.7000],
        [1.1000, 1.3900, 1.2100, 1.3900],
        [1.6700, 1.2200, 1.1500, 0.7400],
        [0.1700, 0.9100, 0.7700, 0.8800],
        [0.8900, 2.2000, 0.4400, 1.5900]])
