In [None]:
# Download the Quora duplicate-question TSV into the working directory.
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import the submodule explicitly.
import urllib.request
urllib.request.urlretrieve("http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv", "quora_duplicate_questions.tsv")

In [None]:
import pandas as pd
# Read the tab-delimited question pairs and list the available columns.
data = pd.read_csv('quora_duplicate_questions.tsv', delimiter='\t')
data.columns.tolist()

In [None]:
# Remove samples whose question1 or question2 is missing (NaN).
import numpy as np
dataq1 = data['question1']
dataq2 = data['question2']

# Row positions of the missing questions; kept only for the diagnostic print.
q1_nans = np.where(dataq1.isnull())[0]
q2_nans = np.where(dataq2.isnull())[0]
nan_indeces = np.concatenate([q1_nans,q2_nans])
print("Print NAN indices:",nan_indeces)

did = data['id']
# Filter with a boolean mask instead of data.drop(nan_indeces): np.where
# returns positions, while drop() expects index labels. The two only agree
# while the frame still carries its default 0..n-1 index.
data = data[~(dataq1.isnull() | dataq2.isnull())]
data = data[['question1', 'question2','is_duplicate']]
# head is a method; without the call the cell echoes the bound method instead
# of the first rows.
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Every split is padded to a multiple of this value. It has to match the
# BATCH_SIZE used when the iterators are built (32 in this notebook).
PAD_MULTIPLE = 32
# Fixed seed so the train/val/test CSVs are the same on every run.
SPLIT_SEED = 42


def _pad_to_multiple(frame, multiple):
    """Return `frame` with its leading rows repeated until len(frame) % multiple == 0.

    Replaces the hard-coded pad counts (9 / 6 / 18), which were only right for
    one particular dataset size, and uses pd.concat because DataFrame.append
    is no longer available in pandas 2.x.
    """
    missing = -len(frame) % multiple
    return pd.concat([frame, frame.iloc[:missing]])


inputs, test_set = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
train_set, val_set = train_test_split(inputs, test_size=0.2, random_state=SPLIT_SEED)
print("Train shape:", train_set.shape)
print("Test shape:",test_set.shape)
print("Val shape:",val_set.shape)
# Modifying the sizes so each split's row count is divisible by the batch size
train_set = _pad_to_multiple(train_set, PAD_MULTIPLE)
print("Modified Train shape:", train_set.shape)
test_set = _pad_to_multiple(test_set, PAD_MULTIPLE)
print("Modified Test shape:",test_set.shape)
val_set = _pad_to_multiple(val_set, PAD_MULTIPLE)
print("Modified Val shape:",val_set.shape)

In [None]:
# Write each split to its own CSV, leaving out the DataFrame index column.
for split_frame, csv_name in ((train_set, "train.csv"),
                              (val_set, "val.csv"),
                              (test_set, "test.csv")):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Stack both question columns of the training split into one flat array.
trn_q1_set, trn_q2_set = (train_set[column].values for column in ('question1', 'question2'))
trn_qcombined_set = np.concatenate([trn_q1_set, trn_q2_set])
print("Combined question set shape:",trn_qcombined_set.shape)
print("Sample question from the set:",trn_qcombined_set[19])

In [None]:
import spacy
spacy_en = spacy.load('en')

def tokenizer(text):
    """Split `text` with the spaCy tokenizer and return the token strings."""
    spacy_tokens = spacy_en.tokenizer(text)
    return [token.text for token in spacy_tokens]

# Collect every distinct token that occurs in the combined training questions.
trn_qcombined_len = len(trn_qcombined_set)
trn_word_list = set()
for question in trn_qcombined_set:
    trn_word_list.update(tokenizer(question))

In [None]:
# The number of distinct training tokens is used as the vocabulary size cap.
MAX_VOCAB_SIZE = len(trn_word_list)
print("Unique Word Count:", MAX_VOCAB_SIZE)
print("Max Vocab Size:", MAX_VOCAB_SIZE)

In [None]:
from torchtext import data
from torchtext import datasets

# One text field shared by both question columns: spaCy tokenisation,
# lower-casing, and a vocabulary lookup.
TEXT = data.Field(sequential=True, tokenize='spacy', use_vocab=True, lower=True)

# Label field: a single non-sequential value with no vocabulary, padding or
# unknown token.
LABELS = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

# (column name, field) pairs in the order the columns appear in the CSV files.
data_fields = [('question1', TEXT), ('question2', TEXT), ('is_duplicate', LABELS)]

In [None]:
# Read the three CSV splits from the working directory, skipping their header row.
train, val, test = data.TabularDataset.splits(
    path='.',
    format='csv',
    skip_header=True,
    fields=data_fields,
    train='train.csv',
    validation='val.csv',
    test='test.csv',
)

In [None]:
# Look at the size of the training set and the fields of its first example.
ex = train[0]
print("Length of the training set:",len(train))
print("Q1 field of the first sample:\n",ex.question1)
print("Q2 field of the first sample:\n",ex.question2)
print("Label field of the first sample:",ex.is_duplicate)

In [None]:
# Build the vocabulary from the training split (capped at MAX_VOCAB_SIZE
# entries) and attach the 300-d GloVe 6B vectors.
TEXT.build_vocab(train, vectors="glove.6B.300d", max_size=MAX_VOCAB_SIZE)
print("Vocabulary size: {}".format(len(TEXT.vocab)))

In [None]:
# Sanity check: show the pretrained vector stored for the token 'the'.
the_index = TEXT.vocab.stoi['the']
print("Word Vector of the:")
print(TEXT.vocab.vectors[the_index])

In [None]:
# Show only the first entries of the index-to-string table; printing the whole
# vocabulary floods the cell output.
print(TEXT.vocab.itos[:50])

In [None]:
import torch
import torch.nn as nn
BATCH_SIZE = 32
VOCAB_SIZE = len(TEXT.vocab)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all with the same batch size and device.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    datasets=(train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
    repeat=False,
)


In [None]:
from torchtext.vocab import Vectors
# Number of batches in each iterator, followed by the vector stored for 'the'.
print("Per batch length of train,val and test set:")
print(*(len(iterator) for iterator in (train_iter, val_iter, test_iter)))
print(TEXT.vocab.vectors[TEXT.vocab.stoi['the']])

In [None]:
#Glove Based Embeddings 
class Model1(nn.Module):
    """Mean-pooled word-embedding baseline.

    Each question is embedded with the frozen vectors from TEXT.vocab, averaged
    over dim 0, and the two averages are concatenated and passed through three
    tanh layers plus a linear output layer.

    Args:
        input_dim: width of the concatenated question representation.
        n_hidden1, n_hidden2, n_hidden3: widths of the three hidden layers.
        n_out: width of the output layer.
    """
    def __init__(self, input_dim, n_hidden1, n_hidden2, n_hidden3, n_out):
        super().__init__()
        self.input_dim = input_dim
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.n_out = n_out

        # Embedding table holding the vocabulary vectors; excluded from training.
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)

        self.l1 = nn.Linear(input_dim, n_hidden1)
        self.l2 = nn.Linear(n_hidden1, n_hidden2)
        self.l3 = nn.Linear(n_hidden2, n_hidden3)
        self.out = nn.Linear(n_hidden3, n_out)

        # Re-draw every linear weight matrix uniformly from [-1, 1].
        for layer in (self.l1, self.l2, self.l3, self.out):
            layer.weight.data.uniform_(-1, 1)

    def forward(self, q1, q2):
        """Return the output-layer activations for the question pair (q1, q2)."""
        # Average each question's embeddings over dim 0, then join them on dim 1.
        pooled = [self.qemb(tokens).mean(0) for tokens in (q1, q2)]
        hidden = torch.cat(pooled, dim=1)
        for layer in (self.l1, self.l2, self.l3):
            hidden = torch.tanh(layer(hidden))
        return self.out(hidden)

In [None]:
def binary_accuracy(preds, y):
    """Fraction of entries where round(sigmoid(preds)) equals y, as a 0-d tensor."""
    probabilities = torch.sigmoid(preds)
    hits = (torch.round(probabilities) == y).float()
    return hits.sum() / len(hits)

In [None]:
#Training Module for all the models
class Training_module():
    """Trains and evaluates a two-question classifier.

    Optimises with Adam (default settings) and nn.BCEWithLogitsLoss. The model
    is called as model(batch.question1, batch.question2) and its output is
    squeezed on dim 1 before being compared with batch.is_duplicate.

    Args:
        model: the nn.Module to optimise.
        useRNN: stored on the instance and kept in the signature for backward
            compatibility. It no longer triggers any work: the hidden state it
            used to allocate through model.init_hidden() was never passed to
            the model.
    """
    def __init__(self, model,useRNN):
        self.useRNN = useRNN
        self.model = model
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(self.model.parameters())

    def train_epoch(self, iterator):
        """Run one optimisation pass over `iterator`.

        Returns:
            (mean loss per batch, mean accuracy per batch).
        """
        # Switch on training-time behaviour (e.g. dropout) that evaluate()
        # switches off.
        self.model.train()
        epoch_loss = 0
        epoch_acc = 0

        for batch in iterator:
            y = batch.is_duplicate.float()

            self.optimizer.zero_grad()
            preds = self.model(batch.question1, batch.question2).squeeze(1)
            loss = self.loss_fn(preds, y)
            loss.backward()
            self.optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += binary_accuracy(preds, y).item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def train_model(self, train_iterator, dev_iterator):
        """Train for NUM_EPOCH epochs and return the model with the best dev accuracy.

        A deep copy of the trainer is taken whenever the dev accuracy exceeds
        every earlier epoch's value (starting from 0.0). If no epoch ever does,
        the current model is returned instead of raising on an unbound name.
        """
        best_model = self
        best_dev_acc = 0.
        for epoch in range(NUM_EPOCH):
            train_loss, train_acc = self.train_epoch(train_iterator)
            dev_loss, dev_acc = self.evaluate(dev_iterator)
            print("epoch: {}".format(epoch),
                  "train acc: {}".format(train_acc),
                  "train loss: {}".format(train_loss),
                  "dev acc: {}".format(dev_acc),
                  "dev loss:{}".format(dev_loss))
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                best_model = copy.deepcopy(self)
        return best_model.model

    def evaluate(self, iterator):
        """Score the model on `iterator` without updating it.

        Returns:
            (mean loss per batch, mean accuracy per batch).
        """
        # Evaluation mode disables dropout, so the reported metrics are
        # deterministic for a given set of weights.
        self.model.eval()
        epoch_loss = 0
        epoch_acc = 0

        with torch.no_grad():
            for batch in iterator:
                y = batch.is_duplicate.float()

                predictions = self.model(batch.question1, batch.question2).squeeze(1)
                loss = self.loss_fn(predictions, y)

                epoch_loss += loss.item()
                epoch_acc += binary_accuracy(predictions, y).item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import torch.optim as optim
import copy
import torch.nn.functional as F
# Model 1 settings: 600-wide input (two concatenated 300-d question averages),
# three hidden layers of 200 units, one output.
nin = 600
nhid1 = nhid2 = nhid3 = 200
nout = 1
NUM_EPOCH = 30
useRNN = False

model_1 = Model1(nin, nhid1, nhid2, nhid3, nout).to(device)
tm_1 = Training_module(model_1, useRNN)
mymodel_1 = tm_1.train_model(train_iter, val_iter)

In [None]:
# Score the checkpoint that train_model selected on the validation set
# (mymodel_1), not the final-epoch weights still held by tm_1.
tst_loss_1, tst_acc_1 = Training_module(mymodel_1, tm_1.useRNN).evaluate(test_iter)
print("Tst Acc:", tst_acc_1)

In [None]:
#simple LSTM 
class Model2(nn.Module):
    """Two-LSTM encoder followed by a tanh MLP.

    Each question is embedded with the frozen vectors from TEXT.vocab and
    encoded by its own LSTM. The two final hidden states are concatenated and
    passed through three tanh layers plus a linear output layer.

    Args:
        input_dim: input feature size of both LSTMs.
        sentemb_dim: hidden size of both LSTMs.
        n_hidden1, n_hidden2, n_hidden3: widths of the three hidden layers.
        n_out: width of the output layer.
    """
    def __init__(self, input_dim, sentemb_dim,n_hidden1, n_hidden2, n_hidden3, n_out):
        super().__init__()
        self.input_dim = input_dim
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)
        
        self.sentemb_dim = sentemb_dim
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.n_out = n_out
        
        self.rnnq1 = nn.LSTM(self.input_dim, self.sentemb_dim)#, 1)
        self.rnnq2 = nn.LSTM(self.input_dim, self.sentemb_dim)#, 1)
        
        self.l1 =nn.Linear(self.sentemb_dim * 2, self.n_hidden1)
        self.l2 = nn.Linear(self.n_hidden1, self.n_hidden2)
        self.l3 = nn.Linear(self.n_hidden2, self.n_hidden3)
        self.out = nn.Linear(self.n_hidden3, n_out)
        
        self.l1.weight.data.uniform_(-1, 1)
        self.l2.weight.data.uniform_(-1, 1)
        self.l3.weight.data.uniform_(-1, 1)
        self.out.weight.data.uniform_(-1, 1)
        
    def forward(self, q1, q2):
        """Return the output-layer activations for the question pair (q1, q2).

        The LSTMs use the default batch_first=False, so inputs are presumably
        (seq_len, batch) index tensors; confirm against the iterator.
        """
        q1out = self.qemb(q1)
        q2out = self.qemb(q2)
        
        _, (q1_h, _) = self.rnnq1(q1out)
        _, (q2_h, _) = self.rnnq2(q2out)
        
        # The final hidden state is (num_layers=1, batch, sentemb_dim). Remove
        # only the leading layer axis: a bare squeeze() would also remove the
        # batch axis for a batch of one and break the cat on dim 1.
        q1out = q1_h.squeeze(0)
        q2out = q2_h.squeeze(0)
        
        output = torch.cat([q1out,q2out],dim=1)
        
        output = torch.tanh(self.l1(output))
        output = torch.tanh(self.l2(output))
        output = torch.tanh(self.l3(output))
        return self.out(output)
    
    def init_hidden(self, bsz, requires_grad=True):
        """Return a zero (h, c) pair of shape (1, bsz, sentemb_dim) on the model's device/dtype."""
        weight = next(self.parameters())
        return (weight.new_zeros((1, bsz, self.sentemb_dim), requires_grad=requires_grad),
                    weight.new_zeros((1, bsz, self.sentemb_dim), requires_grad=requires_grad))
        

In [None]:
import torch.optim as optim
import copy
import torch.nn.functional as F
# Model 2 settings: 300-d LSTM input, 200-d LSTM hidden state, three hidden
# layers of 200 units, one output.
nin = 300
sentemb_dim = 200
nhid1 = nhid2 = nhid3 = 200
nout = 1
NUM_EPOCH = 20
useRNN = True

model_2 = Model2(nin, sentemb_dim, nhid1, nhid2, nhid3, nout).to(device)
tm_2 = Training_module(model_2, useRNN)
mymodel_2 = tm_2.train_model(train_iter, val_iter)

In [None]:
# Score the checkpoint that train_model selected on the validation set
# (mymodel_2), not the final-epoch weights still held by tm_2.
tst_loss_2, tst_acc_2 = Training_module(mymodel_2, tm_2.useRNN).evaluate(test_iter)
print("Tst Acc:", tst_acc_2)

In [None]:
#Siamese LSTM approach
class Model3(nn.Module):
    """LSTM encoders with difference and product features and a dropout MLP.

    Each question is embedded with the frozen vectors from TEXT.vocab and
    encoded by an LSTM. The classifier input is the concatenation of both final
    hidden states, their difference and their element-wise product.

    NOTE(review): rnnq1 and rnnq2 are two separate LSTMs (no weight sharing),
    and l3 is constructed but not used in forward(). Confirm both are intended.

    Args:
        input_dim: input feature size of both LSTMs.
        sentemb_dim: hidden size of both LSTMs.
        n_hidden1, n_hidden2, n_hidden3: widths of the hidden layers.
        n_out: width of the output layer.
    """
    def __init__(self, input_dim, sentemb_dim,n_hidden1, n_hidden2, n_hidden3, n_out):
        super().__init__()
        self.input_dim = input_dim
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)
        
        self.sentemb_dim = sentemb_dim
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.n_out = n_out
        
        self.rnnq1 = nn.LSTM(self.input_dim, self.sentemb_dim)
        self.rnnq2 = nn.LSTM(self.input_dim, self.sentemb_dim)
        
        self.l1 =nn.Linear(self.sentemb_dim * 4, self.n_hidden1)
        self.dropout = nn.Dropout(0.1)
        self.l2 = nn.Linear(self.n_hidden1, self.n_hidden2)
        self.l3 = nn.Linear(self.n_hidden2, self.n_hidden3)
        self.out = nn.Linear(self.n_hidden3, n_out)
        
        self.l1.weight.data.uniform_(-1, 1)
        self.l2.weight.data.uniform_(-1, 1)
        self.l3.weight.data.uniform_(-1, 1)
        self.out.weight.data.uniform_(-1, 1)
        
    def forward(self, q1, q2):
        """Return the output-layer activations for the question pair (q1, q2).

        The LSTMs use the default batch_first=False, so inputs are presumably
        (seq_len, batch) index tensors; confirm against the iterator.
        """
        q1out = self.qemb(q1)
        q2out = self.qemb(q2)
        
        _, (q1_h, _) = self.rnnq1(q1out)
        _, (q2_h, _) = self.rnnq2(q2out)
        
        # The final hidden state is (num_layers=1, batch, sentemb_dim). Remove
        # only the leading layer axis: a bare squeeze() would also remove the
        # batch axis for a batch of one and break the cat on dim 1.
        q1out = q1_h.squeeze(0)
        q2out = q2_h.squeeze(0)
        
        #computing the distance information
        qminus = torch.sub(q1out,q2out)
        qelemwiseproduct = torch.mul(q1out,q2out)
        
        output = torch.cat([q1out,q2out,qminus,qelemwiseproduct],dim=1)
        
        output = torch.tanh(self.l1(output))
        output = self.dropout(output)
        output = torch.tanh(self.l2(output))
        output = self.dropout(output)
        return self.out(output)
    
    def init_hidden(self, bsz, requires_grad=True):
        """Return a zero (h, c) pair of shape (1, bsz, sentemb_dim) on the model's device/dtype."""
        weight = next(self.parameters())
        return (weight.new_zeros((1, bsz, self.sentemb_dim), requires_grad=requires_grad),
                    weight.new_zeros((1, bsz, self.sentemb_dim), requires_grad=requires_grad))
        

In [None]:
import torch.optim as optim
import copy
import torch.nn.functional as F
# Model 3 settings: 300-d LSTM input, 200-d LSTM hidden state, hidden layers
# of 200 units, one output.
nin = 300
sentemb_dim = 200
nhid1 = nhid2 = nhid3 = 200
nout = 1
NUM_EPOCH = 20
useRNN = True

model_3 = Model3(nin, sentemb_dim, nhid1, nhid2, nhid3, nout).to(device)
tm_3 = Training_module(model_3, useRNN)
mymodel_3 = tm_3.train_model(train_iter, val_iter)

In [None]:
# Score the checkpoint that train_model selected on the validation set
# (mymodel_3), not the final-epoch weights still held by tm_3.
tst_loss_3, tst_acc_3 = Training_module(mymodel_3, tm_3.useRNN).evaluate(test_iter)
print("Tst Acc:", tst_acc_3)

Attention with LSTM

In [None]:
#Encoding of Question 1
class Q1Module(nn.Module):
    """Encoder for the first question: frozen vocabulary embeddings followed by an LSTM.

    Args:
        input_dim: input feature size of the LSTM.
        sentemb_dim: hidden size of the LSTM.
    """
    def __init__(self, input_dim, sentemb_dim):
        super().__init__()
        self.input_dim = input_dim
        self.sentemb_dim = sentemb_dim

        # Embedding table holding the vocabulary vectors; excluded from training.
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)

        self.rnnq1 = nn.LSTM(input_dim, sentemb_dim)

    def forward(self, q1):
        """Return (per-step LSTM outputs, (final hidden state, final cell state))."""
        embedded = self.qemb(q1)
        step_outputs, final_state = self.rnnq1(embedded)
        return step_outputs, final_state

In [None]:
#Encoding of Question 2
class Q2Module(nn.Module):
    """Encoder for the second question, started from a caller-supplied LSTM state.

    Args:
        input_dim: input feature size of the LSTM.
        out_dim: hidden size of the LSTM.
    """
    def __init__(self, input_dim, out_dim):
        super().__init__()
        self.input_dim = input_dim
        self.out_dim = out_dim

        # Embedding table holding the vocabulary vectors; excluded from training.
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)

        self.rnnq2 = nn.LSTM(input_dim, out_dim)

    def forward(self, q2, hidden):
        """Encode q2 starting from `hidden` (an (h, c) pair) and return the final hidden state."""
        embedded = self.qemb(q2)
        _, (final_h, _) = self.rnnq2(embedded, hidden)
        return final_h

In [None]:
#LSTM with attention
#Notation is based on Rocktaschel et al
#https://arxiv.org/pdf/1509.06664.pdf
class Model4(nn.Module):
    """Attention over the question-1 LSTM outputs, conditioned on question 2's final state.

    NOTE(review): k is sentemb_dim * BATCH_SIZE, so the batch is folded into
    the feature axis. Each square attention matrix is k x k, and forward()
    only works for batches of exactly BATCH_SIZE examples. Confirm this is
    intended rather than a per-example k = sentemb_dim.

    Args:
        Q1Module: encoder returning (per-step outputs, final LSTM state) for question 1.
        Q2Module: encoder returning the final hidden state for question 2.
        device: device on which helper tensors are created.
    """
    def __init__(self, Q1Module, Q2Module, device):
        super().__init__()

        self.Q1Module = Q1Module
        self.Q2Module = Q2Module
        self.device = device
        k = self.Q1Module.sentemb_dim * BATCH_SIZE

        # Attention parameters, drawn from a standard normal. W_alpha has a
        # single column; the others are square.
        for param_name, n_cols in (('W_y', k), ('W_h', k), ('W_alpha', 1), ('W_p', k), ('W_x', k)):
            self.register_parameter(param_name, nn.Parameter(torch.randn(k, n_cols)))

        self.l1 = nn.Linear(self.Q1Module.sentemb_dim, 1)

    def forward(self, q1, q2):
        """Return one output value per example for the question pair (q1, q2)."""
        enc_states, enc_final = self.Q1Module(q1)
        q2_final_h = self.Q2Module(q2, enc_final)

        seq_len = enc_states.shape[0]

        # Flatten everything after the step axis, then transpose so that each
        # column of Y holds one step.
        Y = enc_states.view(seq_len, -1).t()
        h_n = q2_final_h.view(-1, 1)

        # Vector of ones with one entry per question-1 step.
        ones_L = torch.ones(seq_len).to(self.device)

        projected_h = torch.mm(self.W_h, h_n).view(-1)
        M = torch.tanh(torch.mm(self.W_y, Y) + torch.ger(projected_h, ones_L))

        # Attention weights over the question-1 steps.
        alpha = F.softmax(torch.mm(self.W_alpha.t(), M), dim=1)

        # Attention-weighted combination of the columns of Y.
        r = torch.mm(Y, alpha.t())

        h_star = torch.tanh(torch.mm(self.W_p, r) + torch.mm(self.W_x, h_n))

        return self.l1(h_star.view(BATCH_SIZE, -1))

In [None]:
import torch.optim as optim
import copy
import torch.nn.functional as F
# Model 4 settings: 300-d LSTM input and 200-d LSTM hidden state.
INPUT_DIM, EMB_DIM = 300, 200
NUM_EPOCH = 30
useRNN = False

q1mod = Q1Module(INPUT_DIM, EMB_DIM)
q2mod = Q2Module(INPUT_DIM, EMB_DIM)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_4 = Model4(q1mod, q2mod, device).to(device)

In [None]:
#initializing weights
def init_weights(m):
    """Re-draw every trainable parameter of `m` from N(0, 0.01**2).

    Frozen parameters (requires_grad is False) are skipped. The encoders load
    pretrained vocabulary vectors into frozen embedding tables; re-initialising
    them would replace those vectors with noise that training never corrects.
    """
    for name, param in m.named_parameters():
        if param.requires_grad:
            nn.init.normal_(param.data, mean=0, std=0.01)
        
# Module.apply calls init_weights on every submodule of model_4 and on model_4 itself.
model_4.apply(init_weights)

In [None]:
# Train model 4; train_model returns the model from the epoch with the best
# validation accuracy.
tm_4 = Training_module(model_4,useRNN)
mymodel_4 = tm_4.train_model(train_iter,val_iter)

In [None]:
# Score the checkpoint that train_model selected on the validation set
# (mymodel_4), not the final-epoch weights still held by tm_4.
tst_loss_4, tst_acc_4 = Training_module(mymodel_4, tm_4.useRNN).evaluate(test_iter)
print("Tst Acc:", tst_acc_4)

In [None]:
# Word-by-word attention

In [None]:
#Encoding of Question 1
class Q1ModuleModel5(nn.Module):
    """Encoder for the first question: frozen vocabulary embeddings followed by an LSTM.

    Args:
        input_dim: input feature size of the LSTM.
        sentemb_dim: hidden size of the LSTM.
    """
    def __init__(self, input_dim, sentemb_dim):
        super().__init__()
        self.input_dim = input_dim
        self.sentemb_dim = sentemb_dim

        # Embedding table holding the vocabulary vectors; excluded from training.
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)

        self.rnnq1 = nn.LSTM(input_dim, sentemb_dim)

    def forward(self, q1):
        """Return (per-step LSTM outputs, (final hidden state, final cell state))."""
        embedded = self.qemb(q1)
        step_outputs, final_state = self.rnnq1(embedded)
        return step_outputs, final_state

In [None]:
#Encoding of Question 2
class Q2ModuleModel5(nn.Module):
    """Encoder for the second question that exposes every LSTM step output.

    Args:
        input_dim: input feature size of the LSTM.
        out_dim: hidden size of the LSTM.
    """
    def __init__(self, input_dim, out_dim):
        super().__init__()
        self.input_dim = input_dim
        self.out_dim = out_dim

        # Embedding table holding the vocabulary vectors; excluded from training.
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)

        self.rnnq2 = nn.LSTM(input_dim, out_dim)

    def forward(self, q2, hidden):
        """Encode q2 starting from `hidden` (an (h, c) pair) and return the per-step outputs."""
        embedded = self.qemb(q2)
        step_outputs, _ = self.rnnq2(embedded, hidden)
        return step_outputs

In [None]:
#This model is called model 5 version 1 in the paper
#LSTM with word by word attention
#Notation is based on Rocktaschel et al
#https://arxiv.org/pdf/1509.06664.pdf
class Model5(nn.Module):
    """Word-by-word attention: the attention summary r is updated for every question-2 step.

    NOTE(review): k is sentemb_dim * BATCH_SIZE, so the batch is folded into
    the feature axis. Each square attention matrix is k x k, and forward()
    only works for batches of exactly BATCH_SIZE examples. Confirm this is
    intended rather than a per-example k = sentemb_dim.

    Args:
        Q1Module: encoder returning (per-step outputs, final LSTM state) for question 1.
        Q2Module: encoder returning the per-step outputs for question 2.
        device: device on which the vector of ones is created.
    """
    def __init__(self, Q1Module, Q2Module, device):
        super().__init__()
        
        self.Q1Module = Q1Module
        self.Q2Module = Q2Module
        self.device = device
        k = self.Q1Module.sentemb_dim * BATCH_SIZE
        
        # Attention Parameters. Assigning an nn.Parameter to an attribute
        # already registers it, so no explicit register_parameter call is needed.
        self.W_y = nn.Parameter(torch.randn(k, k))
        self.W_h = nn.Parameter(torch.randn(k, k))
        self.W_r = nn.Parameter(torch.randn(k, k))
        self.W_alpha = nn.Parameter(torch.randn(k, 1))
        self.W_p = nn.Parameter(torch.randn(k, k))
        self.W_t = nn.Parameter(torch.randn(k, k))
        self.W_x = nn.Parameter(torch.randn(k, k))
        
        self.l1 =nn.Linear(self.Q1Module.sentemb_dim, 1)
        
    def forward(self, q1, q2):
        """Return one output value per example for the question pair (q1, q2)."""
        q1output, q1hidden_out = self.Q1Module(q1)
        q2output = self.Q2Module(q2,q1hidden_out)
        
        #convert to 2d
        Y = torch.t(q1output.view(q1output.shape[0],-1))

        #L is a output dimensional vector of ones
        L = torch.ones(q1output.shape[0]).to(self.device)
        
        # Random start value for r, created on the same device and dtype as
        # the parameters. The previous torch.cuda.FloatTensor call failed on
        # CPU-only machines although `device` falls back to the CPU.
        r = torch.randn(self.W_r.shape[1], 1, device=self.W_r.device, dtype=self.W_r.dtype)
        
        #loop through each word within a sentence
        for i in range(q2output.shape[0]):
            h_n = q2output[i,:,:]
            h_n = h_n.view(-1,1)

            product_h_n = torch.add(torch.mm(self.W_h,h_n),torch.mm(self.W_r,r))
            product_h_n = product_h_n.view(-1)

            M = torch.tanh(torch.add(torch.mm(self.W_y,Y),torch.ger(product_h_n,L)))

            alpha = F.softmax(torch.mm(torch.t(self.W_alpha),M),dim=1)

            r = torch.add(torch.mm(Y,torch.t(alpha)),torch.tanh(torch.mm(self.W_t,r)))

        # h_n still holds the last question-2 step here.
        h_star = torch.tanh(torch.add(torch.mm(self.W_p,r),torch.mm(self.W_x,h_n)))
        
        # h_star has k = sentemb_dim * BATCH_SIZE rows, so this is the same
        # (BATCH_SIZE, sentemb_dim) layout as view(BATCH_SIZE, -1).
        h_star = h_star.view(-1, self.Q1Module.sentemb_dim)
        
        outputs = self.l1(h_star)
 
        return outputs

In [None]:
import torch.optim as optim
import copy
import torch.nn.functional as F
# Model 5 settings: 300-d LSTM input and 200-d LSTM hidden state.
INPUT_DIM, EMB_DIM = 300, 200
NUM_EPOCH = 5
useRNN = False

q1mod5 = Q1ModuleModel5(INPUT_DIM, EMB_DIM)
q2mod5 = Q2ModuleModel5(INPUT_DIM, EMB_DIM)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_5 = Model5(q1mod5, q2mod5, device).to(device)

In [None]:
#initializing weights
def init_weights(m):
    """Re-draw every trainable parameter of `m` from N(0, 0.01**2).

    Frozen parameters (requires_grad is False) are skipped. The encoders load
    pretrained vocabulary vectors into frozen embedding tables; re-initialising
    them would replace those vectors with noise that training never corrects.
    """
    for name, param in m.named_parameters():
        if param.requires_grad:
            nn.init.normal_(param.data, mean=0, std=0.01)
        
# Module.apply calls init_weights on every submodule of model_5 and on model_5 itself.
model_5.apply(init_weights)

In [None]:
# Train model 5; train_model returns the model from the epoch with the best
# validation accuracy.
tm_5 = Training_module(model_5,useRNN)
mymodel_5 = tm_5.train_model(train_iter,val_iter)

In [None]:
# Score the checkpoint that train_model selected on the validation set
# (mymodel_5), not the final-epoch weights still held by tm_5.
tst_loss_5, tst_acc_5 = Training_module(mymodel_5, tm_5.useRNN).evaluate(test_iter)
# The print used the undefined name tst_acc5; the variable is tst_acc_5.
print("Tst Acc:", tst_acc_5)