--> Recurrent Neural Networks (RNNs) have been used successfully for many tasks involving sequential data such as machine translation, sentiment analysis, image captioning, time-series prediction etc. <br>
--> Improved RNN models such as Long Short-Term Memory networks (LSTMs) enable training on long sequences overcoming problems like vanishing gradients. <br>
--> To improve further, Attention is a mechanism combined in the RNN allowing it to focus on certain parts of the input sequence when predicting a certain part of the output sequence, enabling easier learning and of higher quality. Combination of attention mechanisms enabled improved performance in many tasks making it an integral part of modern RNN networks.

In [32]:
import random
import re
import spacy
import torch
import torch.optim as optim
import torch.nn as nn
from torchtext.legacy import data

In [33]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
g_path = "/content/drive/My Drive/pytorch/"

Mounted at /content/drive


In [34]:
data_fl = 'data/quora_question_pair_small.csv'

In [35]:
#reproducing the same result
SEED = 2021
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f4c8ccbbeb0>

In [36]:
spacy_en = spacy.load('en')
def clean_data(texts):
    cleaned_text = []
    for text in texts:
        # remove break
        text = text.replace('br', '')
        # remove punctuation
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # remove multiple spaces
        text = re.sub(r' +', ' ', text)
        # remove newline
        text = re.sub(r'\n', ' ', text)
        # strip the text
        text = text.strip()
        # lower the text
        text = text.lower()

        if text != '':
          cleaned_text.append(text)
    return cleaned_text

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(preprocessing=clean_data,tokenize=tokenizer,batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)
fields = [(None, None), (None, None), (None, None), ('q1_txt',TEXT), ('q2_txt',TEXT),('label', LABEL)]

In [37]:
#loading the entire data
def load_data():
  quora_data = data.TabularDataset(path = g_path+data_fl,format = 'csv', fields = fields, skip_header = True)
  return quora_data

quora_data = load_data() 
print(vars(quora_data.examples[1]))
print(quora_data.examples[1].q1_txt)
print(quora_data.examples[1].q2_txt)
print(quora_data.examples[1].label)

{'q1_txt': ['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'i', 'noor', 'diamond'], 'q2_txt': ['what', 'would', 'happen', 'if', 'the', 'indian', 'government', 'stole', 'the', 'kohinoor', 'koh', 'i', 'noor', 'diamond', 'back'], 'label': '0'}
['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'i', 'noor', 'diamond']
['what', 'would', 'happen', 'if', 'the', 'indian', 'government', 'stole', 'the', 'kohinoor', 'koh', 'i', 'noor', 'diamond', 'back']
0


In [38]:
#deleting blank data
blank_idx = []
for i in range(len(quora_data.examples)):
  if(len(quora_data.examples[i].q1_txt) == 0 or len(quora_data.examples[i].q2_txt) == 0):
    print(i)
    blank_idx.append(i)

for i in range(len(blank_idx)):
  del quora_data.examples[blank_idx[i]]

In [39]:
#splitting the data into training and validation dataset
def split_data(quora_data):
  train_data, valid_data = quora_data.split(split_ratio=0.7, random_state = random.seed(SEED))
  return train_data, valid_data

train_data, valid_data = split_data(quora_data)

In [40]:
#generate vocabulary
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))
#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

Size of TEXT vocabulary: 812
Size of LABEL vocabulary: 2


In [42]:
#preparing batches for training the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.q1_txt),
    sort_within_batch=True,
    device = device)

In [47]:
#define model architecture
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)


class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, 
                 embedding_dim, 
                 hidden_dim, output_dim, 
                 n_layers, bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=num_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions, hid dim]
        #cell = [batch size, num layers * num directions, hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [48]:
#define hyperparameters
vocab_size = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 32
output_dim = 1
num_layers = 2
bidirectional = True
dropout = 0.2

#instantiate the model
train_model = classifier(vocab_size, embedding_dim, 
                   hidden_dim,output_dim, 
                   num_layers, bidirectional = True, dropout = dropout)
train_model = train_model.to(device)

In [44]:
#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

In [53]:
#training the model

#define the optimizer
optimizer = optim.Adam(train_model.parameters())

#define the loss
criterion = nn.BCELoss()
criterion = criterion.to(device)

#set the model in training phase
train_model.train()

N_EPOCHS = 3

for epoch in range(N_EPOCHS):

  #initialize every epoch 
  epoch_loss = 0
  epoch_acc = 0

  for batch in train_iterator:
    #resets the gradients after every batch
    optimizer.zero_grad() 

    #retrieve text and no. of words
    q1_txt, q1_text_lengths = batch.q1_txt
    q2_txt, q2_text_lengths = batch.q2_txt

    #get prediction
    predictions = train_model(q1_txt, q1_text_lengths)
    preds = predictions.squeeze() #convert to 1D tensor

    #compute the loss
    loss = criterion(preds, batch.label)

    #compute the binary accuracy
    acc = binary_accuracy(predictions, batch.label)   

    #backpropage the loss and compute the gradients
    loss.backward()

    #update the weights
    optimizer.step() 

    #compute loss and accuracy
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  print("loss:- ", epoch_loss / len(train_iterator))
  print("accuracy:- ", epoch_acc / len(train_iterator))

  if epoch == N_EPOCHS-1:
    torch.save(train_model.state_dict(), g_path+"model/classification_model.pt")

loss:-  0.668851608579809
accuracy:-  39.65625
loss:-  0.6317416700449857
accuracy:-  41.18181818181818
loss:-  0.6169563369317488
accuracy:-  41.18181818181818


In [None]:
#define hyperparameters
vocab_size = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 32
output_dim = 1
num_layers = 2
bidirectional = True
dropout = 0.2

#instantiate the model
test_model = classifier(vocab_size, embedding_dim, 
                   hidden_dim,output_dim, 
                   num_layers, bidirectional = True, dropout = dropout)
test_model = test_model.to(device)

#loading the model
model_path = g_path+"model/classification_model.pt"
test_model.load_state_dict(torch.load(model_path))

test_model.eval()

def predict(test_model, sentence):
  tokenized = [tok.text for tok in spacy_en.tokenizer(clean_data(sentence))]  #tokenize the sentence
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]                           #convert to integer sequence
  txt_tensor = torch.LongTensor(indexed).to(device)                           #convert to tensor
  txt_tensor_ip = txt_tensor.unsqueeze(1).T                                   #reshape in form of batch,no. of words

  length = [len(indexed)]                                                     #compute no. of words
  length_tensor_ip = torch.LongTensor(length)                                 #convert to tensor

  prediction = test_model(txt_tensor_ip, length_tensor_ip)                    #prediction

  print(prediction.item())


predict(test_model, "Are there any sports that you don't like?")

**--- Resources ---**

1) NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND TRANSLATE (Dzmitry Bahdanau, KyungHyun Cho and Yoshua Bengio)
https://arxiv.org/pdf/1409.0473.pdf <br>
2) A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING
https://arxiv.org/pdf/1703.03130.pdf <br>
3) https://tomekkorbak.com/2020/06/26/implementing-attention-in-pytorch/