In [1]:
import urllib
urllib.request.urlretrieve("http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv", "quora_duplicate_questions.tsv")

('quora_duplicate_questions.tsv', <http.client.HTTPMessage at 0x7f47f057d438>)

In [2]:
import pandas as pd
data = pd.read_csv('quora_duplicate_questions.tsv', sep='\t')
list(data.columns)

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

In [3]:
#Remove samples with nan
import numpy as np
dataq1 = data['question1']
dataq2 = data['question2']

q1_nans = np.where(dataq1.isnull())[0]
q2_nans = np.where(dataq2.isnull())[0]
nan_indeces = np.concatenate([q1_nans,q2_nans])
print("Print NAN indices:",nan_indeces)

did = data['id']
data = data.drop(nan_indeces)
data = data[['question1', 'question2','is_duplicate']]
data.head

Print NAN indices: [363362 105780 201841]


<bound method NDFrame.head of                                                 question1  \
0       What is the step by step guide to invest in sh...   
1       What is the story of Kohinoor (Koh-i-Noor) Dia...   
2       How can I increase the speed of my internet co...   
3       Why am I mentally very lonely? How can I solve...   
4       Which one dissolve in water quikly sugar, salt...   
5       Astrology: I am a Capricorn Sun Cap moon and c...   
6                                     Should I buy tiago?   
7                          How can I be a good geologist?   
8                         When do you use シ instead of し?   
9       Motorola (company): Can I hack my Charter Moto...   
10      Method to find separation of slits using fresn...   
11            How do I read and find my YouTube comments?   
12                   What can make Physics easy to learn?   
13            What was your first sexual experience like?   
14      What are the laws to change your status from a.

In [4]:
from sklearn.model_selection import train_test_split
inputs, test_set = train_test_split(data, test_size=0.2)
train_set, val_set = train_test_split(inputs, test_size=0.2)
print("Train shape:", train_set.shape)
print("Test shape:",test_set.shape)
print("Val shape:",val_set.shape)
train_set.to_csv("train.csv",index = False)
val_set.to_csv("val.csv",index = False)
test_set.to_csv("test.csv",index = False)

Train shape: (258743, 3)
Test shape: (80858, 3)
Val shape: (64686, 3)


In [5]:
trn_q1_set = train_set['question1'].values
trn_q2_set = train_set['question2'].values
trn_qcombined_set = np.concatenate((trn_q1_set, trn_q2_set), axis=0)
print("Combined question set shape:",trn_qcombined_set.shape)
print("Sample question from the set:",trn_qcombined_set[19])

Combined question set shape: (517486,)
Sample question from the set: What is a good replacement for red wine in cooking?


In [6]:
import spacy
spacy_en = spacy.load('en')

def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text)]

trn_qcombined_len = len(trn_qcombined_set)
trn_word_list = set()
for i in range(trn_qcombined_len):
    for words in tokenizer(trn_qcombined_set[i]):
        trn_word_list.add(words)

In [7]:
print("Unique Word Count:", len(trn_word_list))
MAX_VOCAB_SIZE = int(len(trn_word_list))
print("Max Vocab Size:", MAX_VOCAB_SIZE)

Unique Word Count: 97071
Max Vocab Size: 97071


In [8]:
from torchtext import data
from torchtext import datasets

TEXT = data.Field(sequential=True, 
                       tokenize='spacy',  
                       use_vocab=True,
                       lower=True)
                 
LABELS = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

data_fields = [
    ('question1', TEXT),
    ('question2', TEXT), 
    ('is_duplicate', LABELS) 
]

In [9]:
train, val, test = data.TabularDataset.splits(path='.', 
                                            format='csv', 
                                            train='train.csv', 
                                            validation='val.csv',
                                            test='test.csv',
                                            fields=data_fields, 
                                            skip_header=True)

In [10]:
print("Length of the training set:",len(train))
ex = train[0]
print("Q1 field of the first sample:\n",ex.question1)
print("Q2 field of the first sample:\n",ex.question2)
print("Label field of the first sample:",ex.is_duplicate)

Length of the training set: 258743
Q1 field of the first sample:
 ['how', 'do', 'i', 'delete', 'whatsapp', 'chats', 'permanently', '?']
Q2 field of the first sample:
 ['how', 'can', 'i', 'delete', 'permanently', 'my', 'whatsapp', 'chat', 'and', 'conversation', '?']
Label field of the first sample: 0


In [11]:
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.300d")
print("Vocabulary size: {}".format(len(TEXT.vocab)))

Vocabulary size: 78701


In [12]:
print("Word Vector of the:")
print(TEXT.vocab.vectors[TEXT.vocab.stoi['the']])

Word Vector of the:
tensor([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
         2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
        -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
         2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
        -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
        -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
         9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
        -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
         1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
         2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
         1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
         2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
        -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.6425e-02,
        -4.4640e-01,  1.7197e-0

In [13]:
print(TEXT.vocab.itos)



In [14]:
import torch
import torch.nn as nn
BATCH_SIZE = 32
VOCAB_SIZE = len(TEXT.vocab)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = data.BucketIterator.splits(datasets=(train, val, test),  
                                                             batch_size=BATCH_SIZE, 
                                                             device= device,
                                                             sort = False,
                                                             repeat=False)
#BucketIterator pads the batch according the maximum length sample---- double check this

In [15]:
from torchtext.vocab import Vectors
print("Per batch length of train,val and test set:")
print(len(train_iter), len(val_iter),len(test_iter))
print(TEXT.vocab.vectors[TEXT.vocab.stoi['the']])

Per batch length of train,val and test set:
8086 2022 2527
tensor([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
         2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
        -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
         2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
        -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
        -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
         9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
        -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
         1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
         2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
         1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
         2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
        -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.64

In [16]:
class Model1(nn.Module):
    def __init__(self, input_dim, n_hidden1, n_hidden2, n_hidden3, n_out):
        super().__init__()
        self.input_dim = input_dim
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)
        
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.n_out = n_out
        
        self.l1 =nn.Linear(self.input_dim, self.n_hidden1)
        self.l2 = nn.Linear(self.n_hidden1, self.n_hidden2)
        self.l3 = nn.Linear(self.n_hidden2, self.n_hidden3)
        self.out = nn.Linear(self.n_hidden3, n_out)
        
        self.l1.weight.data.uniform_(-1, 1)
        self.l2.weight.data.uniform_(-1, 1)
        self.l3.weight.data.uniform_(-1, 1)
        self.out.weight.data.uniform_(-1, 1)
        
    def forward(self, q1, q2):
        q1out = self.qemb(q1)
        q2out = self.qemb(q2)
        q1output = q1out.mean(0)
        q2output = q2out.mean(0)
        output = torch.cat([q1output,q2output],dim=1)
        output = torch.tanh(self.l1(output))
        output = torch.tanh(self.l2(output))
        output = torch.tanh(self.l3(output))
        return self.out(output)

In [21]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum() / len(correct)
    return acc

In [22]:
class Training_module():
    def __init__(self, model,useRNN):
        self.useRNN = useRNN
        self.model = model
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(self.model.parameters())
    
    def train_epoch(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
        
        if (self.useRNN == True):
            hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
            
        for batch in iterator:
            Q1 = batch.question1
            Q2 = batch.question2
            y = batch.is_duplicate
            
            y = y.float()
            
            self.optimizer.zero_grad()
            preds = self.model(Q1,Q2).squeeze(1)
            loss = self.loss_fn(preds, y)
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()      
            acc = binary_accuracy(preds, y)
            epoch_acc += acc.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        dev_accs = [0.]
        for epoch in range(NUM_EPOCH):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print("dev acc: {}".format(dev_acc[1]), "dev loss:{}".format(dev_acc[0]))
            if dev_acc[1] > max(dev_accs):
                best_model = copy.deepcopy(self)
            dev_accs.append(dev_acc[1])
        return best_model.model
                
    def evaluate(self, iterator):
        epoch_loss = 0
        epoch_acc = 0
    
        with torch.no_grad():
            
            if (self.useRNN == True):
                hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
            
            for batch in iterator:
                Q1 = batch.question1
                Q2 = batch.question2
                y = batch.is_duplicate
                
                y = y.float()
                
                predictions = self.model(Q1,Q2).squeeze(1)
                loss = self.loss_fn(predictions, y)
                acc = binary_accuracy(predictions, y)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
import torch.optim as optim
import copy
import torch.nn.functional as F
nin = 600
nhid1 = 200
nhid2 = 200
nhid3 = 200
nout = 1
NUM_EPOCH = 10
useRNN = False
model = Model1(nin,nhid1,nhid2,nhid3,nout)
model = model.to(device)
tm = Training_module(model,useRNN)
mymodel = tm.train_model(train_iter,val_iter)

dev acc: 0.7063330154272734 dev loss:0.5683282782074016
dev acc: 0.7148994983834399 dev loss:0.5474766705233077
dev acc: 0.7231944150316255 dev loss:0.5336545515661777
dev acc: 0.7333439310568612 dev loss:0.5190180379871563
dev acc: 0.7313193267047347 dev loss:0.5131674942170831
dev acc: 0.7438069627086911 dev loss:0.5059354817080215
dev acc: 0.746496131848157 dev loss:0.49524871534810694
dev acc: 0.7501170164163931 dev loss:0.486488876781407
dev acc: 0.7494634908986021 dev loss:0.48835972278691187
dev acc: 0.7526626748748633 dev loss:0.4818936772312892


In [24]:
tst_loss, tst_acc = tm.evaluate(test_iter)
print("Tst Acc:", tst_acc)

Tst Acc: 0.7556714407530613


In [69]:
class Model2(nn.Module):
    def __init__(self, input_dim, sentemb_dim,n_hidden1, n_hidden2, n_hidden3, n_out):
        super().__init__()
        self.input_dim = input_dim
        self.qemb = nn.Embedding(VOCAB_SIZE,300)
        self.qemb.weight = torch.nn.Parameter(TEXT.vocab.vectors, requires_grad = False)
        
        self.sentemb_dim = sentemb_dim
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.n_out = n_out
        
        self.rnnq1 = nn.LSTM(self.input_dim, self.sentemb_dim)#, 1)
        self.rnnq2 = nn.LSTM(self.input_dim, self.sentemb_dim)#, 1)
        
        self.l1 =nn.Linear(self.sentemb_dim * 2, self.n_hidden1)
        self.l2 = nn.Linear(self.n_hidden1, self.n_hidden2)
        self.l3 = nn.Linear(self.n_hidden2, self.n_hidden3)
        self.out = nn.Linear(self.n_hidden3, n_out)
        
        self.l1.weight.data.uniform_(-1, 1)
        self.l2.weight.data.uniform_(-1, 1)
        self.l3.weight.data.uniform_(-1, 1)
        self.out.weight.data.uniform_(-1, 1)
        
    def forward(self, q1, q2):
        q1out = self.qemb(q1)
        q2out = self.qemb(q2)
        #print(q1out.shape)
        #print(q2out.shape)
        
        q1output, q1hidden_out = self.rnnq1(q1out)
        q2output, q2hidden_out = self.rnnq2(q2out)
        
        #print(q1output.shape)
        #print(q2output.shape)
        q1a,q1b = q1hidden_out
        q2a,q2b = q2hidden_out
        #print("-------")
        #print(q1a.shape)
        #print(q2a.shape)
        #print(q1b.shape)
        #print(q2b.shape)
        #print("-------")
        q1output = q1a.squeeze()
        #q1output.view(q1output.size(0)*q1output.size(1),q1output.size(2))
        q2output = q2a.squeeze()
        #q2output.view(q2output.size(0)*q2output.size(1),q2output.size(2))
        #print(q1output.shape)
        #print(q2output.shape)
        
        output = torch.cat([q1output,q2output],dim=1)
        #print(output.shape)
        
        output = torch.tanh(self.l1(output))
        #print(output.shape)
        output = torch.tanh(self.l2(output))
        #print(output.shape)
        output = torch.tanh(self.l3(output))
        #print(output.shape)
        return self.out(output)
    
    def init_hidden(self, bsz, requires_grad=True):
        weight = next(self.parameters())
        return (weight.new_zeros((1, bsz, self.sentemb_dim), requires_grad=requires_grad),
                    weight.new_zeros((1, bsz, self.sentemb_dim), requires_grad=requires_grad))
        

In [70]:
nin = 300
sentemb_dim = 200
nhid1 = 200
nhid2 = 200
nhid3 = 200
nout = 1
NUM_EPOCH = 10
useRNN = True
model = Model2(nin,sentemb_dim,nhid1,nhid2,nhid3,nout)
model = model.to(device)
tm = Training_module(model,useRNN)
mymodel = tm.train_model(train_iter,val_iter)

dev acc: 0.6336680443775052 dev loss:0.6422898337672419
dev acc: 0.6493747350782834 dev loss:0.6319362703051222
dev acc: 0.7157362759643917 dev loss:0.575108023133759
dev acc: 0.7265636039534851 dev loss:0.5673176585507794
dev acc: 0.7225762152294495 dev loss:0.5510003158912933
dev acc: 0.7145550727962151 dev loss:0.5469622145656544
dev acc: 0.7368455913606902 dev loss:0.5326912405759715
dev acc: 0.7212029285176218 dev loss:0.5641023771640219
dev acc: 0.7248392680514342 dev loss:0.5432439318809265
dev acc: 0.7419413240326147 dev loss:0.5294932090151322
