**Downloading the Dependencies**


In [1]:

!pip install -q -U trax                         
import pandas as pd
import numpy as np
import os
import nltk
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import random
from collections import defaultdict
from functools import partial

random.seed(111)

[K     |████████████████████████████████| 629 kB 5.3 MB/s 
[K     |████████████████████████████████| 4.4 MB 42.4 MB/s 
[K     |████████████████████████████████| 153 kB 40.3 MB/s 
[K     |████████████████████████████████| 2.9 MB 39.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 42.0 MB/s 
[K     |████████████████████████████████| 286 kB 25.9 MB/s 
[K     |████████████████████████████████| 366 kB 25.5 MB/s 
[K     |████████████████████████████████| 4.0 MB 38.5 MB/s 
[K     |████████████████████████████████| 90 kB 7.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 64.7 MB/s 
[K     |████████████████████████████████| 56 kB 2.8 MB/s 
[K     |████████████████████████████████| 596 kB 47.8 MB/s 
[K     |████████████████████████████████| 895 kB 65.7 MB/s 
[?25h

**Dataset**



In [2]:
# Mount Google Drive under /content/gdrive (Colab only); the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Load every question pair (duplicates and non-duplicates) from the mounted Drive.
data = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/Questions/Questions.csv")
# len(data) counts all rows, not only the duplicate pairs, so the label says "pairs".
print(f"No. of Question pairs: {len(data)}")
data.head(5)

No. of Question duplicate pairs: 404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


**Processing the Data**
* I'll divide the data into two sets: training and testing. The Test Set will be used to assess the Model afterwards. To train the Model, I'll only use Question Pairs that are duplicates. I'll build paired batches of Questions to feed into the Siamese Network. The Test Set keeps the original pairs of Questions together with the label indicating whether or not the Questions are duplicates.

In [4]:

# Sizes of the contiguous training and test slices taken from the full frame.
N_train, N_test = 300000, 10240
data_train = data.iloc[:N_train]
data_test = data.iloc[N_train:N_train + N_test]
# The full frame is no longer needed once both slices exist.
del data

print(f"Training Set: {len(data_train)} and Test Set: {len(data_test)}")

# Positions of the rows in the training slice that are labelled as duplicates.
is_dup_mask = (data_train["is_duplicate"] == 1).to_numpy()
train_idx = np.flatnonzero(is_dup_mask).tolist()
print(f"Number of Duplicate Questions: {len(train_idx)}")
print(f"Indexes of first Duplicate Questions: {train_idx[:10]}")

Training Set: 300000 and Test Set: 10240
Number of Duplicate Questions: 111486
Indexes of first Duplicate Questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [5]:
# Show one pair from the training slice (row label 20) together with its label.
print(data_train.loc[20, "question1"])
print(data_train.loc[20, "question2"])
print("Index 20 is duplicate:", data_train.loc[20, "is_duplicate"])

Why do rockets look white?
Why are rockets and boosters painted white?
Index 20 is duplicate: 1


**Preparing the Data**

In [6]:

# Training keeps only the duplicate pairs (rows selected by train_idx).
Q1_train_words = np.array(data_train["question1"][train_idx])
Q2_train_words = np.array(data_train["question2"][train_idx])

# The test set keeps every pair plus its duplicate / non-duplicate label.
Q1_test_words = np.array(data_test["question1"])
Q2_test_words = np.array(data_test["question2"])
y_test = np.array(data_test["is_duplicate"])

print("TRAINING QUESTIONS:\n")
print("Question 1:", Q1_train_words[7])
print("Question 2:", Q2_train_words[7], "\n")

print("TESTING QUESTIONS:\n")
print("Question 1:", Q1_test_words[7])
print("Question 2:", Q2_test_words[7], "\n")
# Report the label of the same test pair (index 7) that was just printed.
print("Inspecting Testing pairs is duplicate:", y_test[7])

TRAINING QUESTIONS:

Question 1: Why are so many Quora users posting questions that are readily answered on Google?
Question 2: Why do people ask Quora questions which can be answered easily by Google? 

TESTING QUESTIONS:

Question 1: Which is the best digital photo frame?
Question 2: What are the best 12-inch digital photo frames? 

Inspecting Testing pairs is duplicate: 0


**Preparing the Data**
* I'll use an Index, which will be a list of integers, to encode each word of the chosen pairings. To begin, I'll use NLTK to tokenize each word, then use Python's Default Dictionary to assign the value 0 to all Out of Vocabulary Words.

In [7]:
# Fetch the "punkt" package; the notebook later tokenizes with nltk.word_tokenize.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:

# Containers with one slot per question; each slot receives that question's token list.
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)
Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)


# Unknown words map to 0, <PAD> is 1, and every new training word gets the next id.
vocab = defaultdict(lambda: 0)
vocab["<PAD>"] = 1
for idx, (text1, text2) in enumerate(zip(Q1_train_words, Q2_train_words)):
  tokens1 = nltk.word_tokenize(text1)
  tokens2 = nltk.word_tokenize(text2)
  Q1_train[idx] = tokens1
  Q2_train[idx] = tokens2
  for word in tokens1 + tokens2:
    if word not in vocab:
      vocab[word] = len(vocab) + 1
print("The length of the Vocabulary is:", len(vocab))


# Test questions are tokenized only; they never add ids to the vocabulary here.
for idx, (text1, text2) in enumerate(zip(Q1_test_words, Q2_test_words)):
  Q1_test[idx] = nltk.word_tokenize(text1)
  Q2_test[idx] = nltk.word_tokenize(text2)

print("Training Set is reduced to:", len(Q1_train))
print("Test Set is:", len(Q1_test))

The length of the Vocabulary is: 36342
Training Set is reduced to: 111486
Test Set is: 10240


**Preparing the Data**
* I will convert each Question Pair into Tensors, i.e. arrays of numbers, using the Vocabulary. Then I will split the Training Set into Training and Validation Datasets so that I can use them to train and evaluate the Neural Network: the Siamese Network.

In [9]:

# Replace every token list by its list of vocabulary ids, in place.
# vocab is a defaultdict, so looking up an unseen word inserts it with id 0;
# len(vocab) therefore grows while the test questions are encoded.
for first_questions, second_questions in ((Q1_train, Q2_train), (Q1_test, Q2_test)):
  for i in range(len(first_questions)):
    first_questions[i] = [vocab[word] for word in first_questions[i]]
    second_questions[i] = [vocab[word] for word in second_questions[i]]

print("Question in the Training Set:")
print(Q1_train_words[7], "\n")
print("Encoded Version:")
print(Q1_train[7], "\n")
print("Question in the Test Set:")
print(Q1_test_words[7], "\n")
print("Encoded Version:")
print(Q1_test[7], "\n")

# First 80% of the duplicate pairs train the model; the rest validate it.
split = int(len(Q1_train) * 0.8)
train_Q1, val_Q1 = Q1_train[:split], Q1_train[split:]
train_Q2, val_Q2 = Q2_train[:split], Q2_train[split:]
print(f"Total numbers of questions pairs: {len(Q1_train)}")
print(f"The length of Training set: {len(train_Q1)}")
print(f"The length of Validation set: {len(val_Q1)}")

Question in the Training Set:
Why are so many Quora users posting questions that are readily answered on Google? 

Encoded Version:
[86, 87, 88, 89, 90, 91, 92, 93, 17, 87, 94, 95, 72, 96, 21] 

Question in the Test Set:
Which is the best digital photo frame? 

Encoded Version:
[283, 156, 78, 216, 1442, 1223, 4114, 21] 

Total numbers of questions pairs: 111486
The length of Training set: 89188
The length of Validation set: 22298


**Data Generator**
* Using batches for training is more efficient in most Natural Language Processing and AI applications. Now I'll create the Data Generator, which takes in Question pairs and returns batches as Tuples. Each Tuple is made up of two arrays, each holding `batch_size` Questions. The next batch is returned by calling `next(data_generator)`. The Data Generator delivers the data in a format that can be fed directly into the Model for the forward computation: a pair of Question arrays.

In [10]:

def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
  """Yield padded batches of encoded question pairs, cycling over the data forever.

  Args:
    Q1: Sequence of encoded questions, each a list of integer ids.
    Q2: Sequence of the paired questions, aligned index-by-index with Q1.
    batch_size: Number of question pairs in every yielded batch.
    pad: Id appended to each question to reach the common batch length.
    shuffle: If True, pairs are visited in a random order that is reshuffled
      at the start of every pass; otherwise they are visited in input order.

  Yields:
    A tuple (b1, b2) of numpy arrays with batch_size rows each. All rows are
    padded to the same length: the longest question in the batch rounded up
    to the next power of two, and never less than 1.
  """
  input1, input2 = [], []
  idx = 0
  len_q = len(Q1)
  question_index = [*range(len_q)]
  if shuffle:
    random.shuffle(question_index)

  while True:
    # Start a new pass (and a new order) once every pair has been served.
    if idx >= len_q:
      idx = 0
      if shuffle:
        random.shuffle(question_index)

    q1 = Q1[question_index[idx]]
    q2 = Q2[question_index[idx]]
    idx += 1
    #@ Adding the Data:
    input1.append(q1)
    input2.append(q2)
    if len(input1) == batch_size:
      longest = max(max(len(q) for q in input1),
                    max(len(q) for q in input2))
      # Round up to a power of two with exact integer arithmetic. The floor of 1
      # keeps a batch of empty questions from failing (log2(0) is -inf).
      max_len = 1 << (max(longest, 1) - 1).bit_length()
      b1, b2 = [], []
      for q1, q2 in zip(input1, input2):
        b1.append(q1 + [pad] * (max_len - len(q1)))
        b2.append(q2 + [pad] * (max_len - len(q2)))
      yield np.array(b1), np.array(b2)
      input1, input2 = [], []


# Pull a single batch of two pairs to inspect the padded output.
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size=2))
print(f"First Questions:\n{res1}")
print(f"\nSecond Questions:\n{res2}")

First Questions:
[[   30    87    78  1725    17  2274   286 11452 11453    21     1     1
      1     1     1     1]
 [  929     4 11746   129 16520    21     1     1     1     1     1     1
      1     1     1     1]]

Second Questions:
[[   32    89  1725   819   286 11452 11453    21     1     1     1     1
      1     1     1     1]
 [  929     4 11746   127 16520    21     1     1     1     1     1     1
      1     1     1     1]]


**Siamese Neural Network**
* A Siamese Neural Network is a Neural Network which uses the same weights while working in tandem on two different Input vectors to compute comparable output Vectors. Here, I will get the Embedding, run it through an LSTM (Long Short Term Memory) Network, Normalize the two Vectors and, finally, use Triplet Loss on the Cosine Similarity of each pair of Questions.

In [11]:

def Siamese(vocab_size=len(vocab), d_model=128, mode="train"):
  """Build the two-branch question encoder.

  Args:
    vocab_size: Size passed to the embedding layer. The default is len(vocab)
      as it was when this function was defined.
    d_model: Feature size of the embedding and number of LSTM units.
    mode: Not read by this function.

  Returns:
    A tl.Parallel layer that applies the same processor object to each of its
    two inputs (the notebook text describes this as sharing weights).
  """

  def l2_normalize(vec):
    # Scale each vector along its last axis to unit Euclidean length.
    squared_norm = fastnp.sum(vec * vec, axis=-1, keepdims=True)
    return vec / fastnp.sqrt(squared_norm)

  # Embedding -> LSTM -> mean over axis 1 -> unit-length vector.
  encoder_layers = [
      tl.Embedding(vocab_size=vocab_size, d_feature=d_model),
      tl.LSTM(n_units=d_model),
      tl.Mean(axis=1),
      tl.Fn("Normalize", lambda x: l2_normalize(x)),
  ]
  processor = tl.Serial(*encoder_layers)

  return tl.Parallel(processor, processor)

# Build the model with its default arguments and print its layer structure.
model = Siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_41789_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_41789_128
    LSTM_128
    Mean
    Normalize
  ]
]


**Triplet Loss**
* The Triplet Loss makes use of a Baseline or Anchor Input which is compared to a Positive or Truthy Input and a Negative or Falsy Input. The distance from the Anchor Input to the Positive Input is minimized and the distance from the Anchor Input to the Negative Input is maximized. The Triplet Loss is composed of two terms: one term uses the mean of all the non-duplicates and the other term uses the Closest Negative.

In [12]:

def TripletLossFn(v1, v2, margin=0.25):
  """Triplet loss combining a closest-negative term and a mean-negative term.

  Row i of v1 and row i of v2 form the positive pair; every other row of v2 is
  treated as a negative for row i of v1.

  Args:
    v1: 2-D array with one vector per row (first question of each pair).
    v2: 2-D array with one vector per row (second question of each pair).
      Assumes rows are unit length so scores lie in [-1, 1]; the "- 2.0"
      diagonal offset below relies on that. TODO confirm with the caller.
    margin: Required gap between the positive and the negative similarity.

  Returns:
    Scalar mean over the batch of the sum of both hinge terms. The batch must
    contain at least two pairs because the mean divides by batch_size - 1.
  """
  scores = fastnp.dot(v1, v2.T)
  batch_size = len(scores)
  identity = fastnp.eye(batch_size)
  # The diagonal holds the similarity of each true (duplicate) pair.
  positive = fastnp.diagonal(scores)
  # Zero the diagonal so the positives do not contribute to the negative mean.
  negative_zero_on_duplicate = scores * (1.0 - identity)
  mean_negative = fastnp.sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)
  # Push the diagonal down so the row-wise max is taken over off-diagonal entries.
  negative_without_positive = scores - 2.0 * identity
  closest_negative = negative_without_positive.max(axis=1)
  triplet_loss1 = fastnp.maximum(0, margin - positive + closest_negative)
  triplet_loss2 = fastnp.maximum(0, margin - positive + mean_negative)
  triplet_loss = fastnp.mean(triplet_loss1 + triplet_loss2)
  return triplet_loss


def TripletLoss(margin=0.25):
  """Return a layer named "TripletLoss" that calls TripletLossFn with a fixed margin."""
  return tl.Fn("TripletLoss", partial(TripletLossFn, margin=margin))

**Training the Model**
* Now, I will train the Model. I will define the Cost Function and the Optimizer as usual. I will use a Training Iterator to go through all the Data in each Epoch while training the Model.

In [15]:
#@ Preparing the Data:
# Both generators pad with the vocabulary's <PAD> id and keep the default shuffle=True.
batch_size = 256
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab["<PAD>"])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab["<PAD>"])

#@ Training the Model:
# Learning-rate schedule handed to the training task.
# NOTE(review): presumably 400 warm-up steps up to a peak rate of 0.01 - confirm against the trax.lr docs.
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)
def train_model(Siamese, TripletLoss, lr_schedule, train_generator=train_generator,
                val_generator=val_generator, output_dir="model/"):
  """Assemble the training loop for the Siamese model.

  Args:
    Siamese: Callable returning the model; called with no arguments.
    TripletLoss: Callable returning the loss layer; called with no arguments,
      once for training and once for evaluation.
    lr_schedule: Learning-rate schedule given to the training task.
    train_generator: Source of training batches.
    val_generator: Source of evaluation batches.
    output_dir: Directory handed to the loop; "~" is expanded first.

  Returns:
    The training.Loop instance. It is not run here.
  """
  checkpoint_dir = os.path.expanduser(output_dir)

  #@ Training: Adam optimizer on the triplet loss.
  fit_task = training.TrainTask(
      labeled_data=train_generator,
      loss_layer=TripletLoss(),
      optimizer=trax.optimizers.Adam(0.001),
      lr_schedule=lr_schedule,
  )

  #@ Evaluating: the same loss on three validation batches per evaluation.
  validation_task = training.EvalTask(
      labeled_data=val_generator,
      metrics=[TripletLoss()],
      n_eval_batches=3,
  )

  #@ Training the Model: bind the model and both tasks together.
  return training.Loop(
      Siamese(),
      fit_task,
      eval_tasks=validation_task,
      output_dir=checkpoint_dir,
  )

#@ Training the Model:
# Build the loop with the default generators and output directory, then run 1000 steps.
training_loop = train_model(Siamese, TripletLoss, lr_schedule)
training_loop.run(1000)

  "jax.host_id has been renamed to jax.process_index. This alias "
  "jax.host_count has been renamed to jax.process_count. This alias "



Step   1100: Ran 100 train steps in 88.72 secs
Step   1100: train TripletLoss |  0.07060241
Step   1100: eval  TripletLoss |  0.08669567

Step   1200: Ran 100 train steps in 81.57 secs
Step   1200: train TripletLoss |  0.06752674
Step   1200: eval  TripletLoss |  0.09445456

Step   1300: Ran 100 train steps in 73.97 secs
Step   1300: train TripletLoss |  0.06285258
Step   1300: eval  TripletLoss |  0.08742717

Step   1400: Ran 100 train steps in 77.71 secs
Step   1400: train TripletLoss |  0.05550867
Step   1400: eval  TripletLoss |  0.07100180

Step   1500: Ran 100 train steps in 75.23 secs
Step   1500: train TripletLoss |  0.05138228
Step   1500: eval  TripletLoss |  0.08083352

Step   1600: Ran 100 train steps in 75.73 secs
Step   1600: train TripletLoss |  0.05150433
Step   1600: eval  TripletLoss |  0.06651612

Step   1700: Ran 100 train steps in 72.20 secs
Step   1700: train TripletLoss |  0.04927659
Step   1700: eval  TripletLoss |  0.06092033

Step   1800: Ran 100 train steps 

**Model Evaluation**
* I will use the Test Set, which was configured earlier, to determine the accuracy of the Model. The Training Set only had Positive examples, whereas the Test Set and `y_test` are set up as pairs of Questions, some of which are duplicates and some of which are not. I will compute the Cosine Similarity of each pair, threshold it and compare the result to `y_test`. The results are accumulated to produce the Accuracy.

In [16]:
#@ Loading the Saved Model:
# Create a fresh model and initialise it from the checkpoint file at the given path.
model = Siamese()
model.init_from_file("/content/model/model.pkl.gz")

 
def classify(test_Q1, test_Q2, y, threshold, model, vocab, data_generator=data_generator, batch_size=64):
  """Compute the accuracy of the model on labelled question pairs.

  Args:
    test_Q1: Encoded first questions.
    test_Q2: Encoded second questions, aligned with test_Q1.
    y: Labels aligned with the questions; compared against the boolean
      "similarity > threshold".
    threshold: Similarity above which a pair is predicted to be a duplicate.
    model: Callable mapping a (q1_batch, q2_batch) tuple to a pair (v1, v2)
      of per-question vectors.
    vocab: Mapping that provides the "<PAD>" id.
    data_generator: Batch generator used to pad each slice.
    batch_size: Number of pairs evaluated per model call.

  Returns:
    Fraction of pairs whose thresholded similarity equals the label.
  """
  accuracy = 0
  for i in range(0, len(test_Q1), batch_size):
    q1, q2 = next(data_generator(test_Q1[i:i+batch_size], test_Q2[i:i+batch_size],
                                 batch_size, vocab["<PAD>"], shuffle=False))
    y_batch = y[i:i+batch_size]
    v1, v2 = model((q1, q2))
    # The final slice can hold fewer than batch_size pairs. Score only the rows
    # that have a label; indexing up to batch_size would run past y_batch.
    for j in range(len(y_batch)):
      d = np.dot(v1[j], v2[j].T)
      res = d > threshold
      accuracy += (y_batch[j] == res)
  accuracy = accuracy / len(test_Q1)
  return accuracy


# Score the held-out test pairs with a similarity threshold of 0.7, in batches of 512.
accuracy = classify(Q1_test, Q2_test, y_test, 0.7, model, vocab, batch_size=512)
print("Accuracy :", accuracy)

Accuracy : 0.7462890625


**Model Evaluation**
* Now, I will test the Model using my own Questions. I will build a reverse Vocabulary that allows mapping encoded Questions back to words.

In [17]:

def predict(question1, question2, threshold, model, vocab, data_generator=data_generator, verbose=False):
  """Decide whether two raw question strings are duplicates.

  Args:
    question1: First question as plain text.
    question2: Second question as plain text.
    threshold: Similarity above which the pair counts as a duplicate.
    model: Callable mapping a (q1_batch, q2_batch) tuple to a pair (v1, v2).
    vocab: Mapping from token to id; must also contain "<PAD>". If it is a
      defaultdict, unseen tokens are inserted into it by the lookups below.
    data_generator: Batch generator used to pad the single pair.
    verbose: If True, print the padded inputs, the similarity and the result.

  Returns:
    The result of "similarity > threshold" for the pair.
  """
  ids1 = [vocab[token] for token in nltk.word_tokenize(question1)]
  ids2 = [vocab[token] for token in nltk.word_tokenize(question2)]
  # A batch of one pair, padded by the generator.
  batch1, batch2 = next(data_generator([ids1], [ids2], 1, vocab["<PAD>"]))
  v1, v2 = model((batch1, batch2))
  d = fastnp.dot(v1[0], v2[0].T)
  res = d > threshold
  if verbose:
    print("Q1 = ", batch1, "\nQ2 = ", batch2)
    print("d = ", d)
    print("res = ", res)
  return res

In [18]:
#@ First example pair:
question1 = "What is your name?"
question2 = "What are you known as?"
#@ Predicting whether the pair is a duplicate (threshold 0.7, verbose output):
example1 = predict(question1, question2, 0.7, model, vocab, verbose=True)
print("Example1:", example1, "\n")

#@ Second example pair:
question1 = "Where are you taking us?"
question2 = "Where are we going?"
#@ Predicting whether the pair is a duplicate (threshold 0.7, verbose output):
example2 = predict(question1, question2, 0.7, model, vocab, verbose=True)
print("Example2:", example2)

Q1 =  [[  30  156   56 1377   21    1    1    1]] 
Q2 =  [[  30   87   53 2715  251   21    1    1]]
d =  0.75979096
res =  True
Example1: True 

Q1 =  [[676  87  53 906 603  21   1   1]] 
Q2 =  [[ 676   87  138 1479   21    1    1    1]]
d =  0.8487048
res =  True
Example2: True
