In [1]:
import urllib
urllib.request.urlretrieve("http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv", "quora_duplicate_questions.tsv")

('quora_duplicate_questions.tsv', <http.client.HTTPMessage at 0x7fca4c76b438>)

In [2]:
import pandas as pd
data = pd.read_csv('quora_duplicate_questions.tsv', sep='\t')
list(data.columns)

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

In [3]:
#Remove samples with nan
import numpy as np
dataq1 = data['question1']
dataq2 = data['question2']

q1_nans = np.where(dataq1.isna())[0]
q2_nans = np.where(dataq2.isna())[0]
nan_indeces = np.concatenate([q1_nans,q2_nans])
print("Print NAN indices:",nan_indeces)

did = data['id']
data = data.drop(nan_indeces)
#data = data[['question1', 'question2','is_duplicate']]
data.head

Print NAN indices: [363362 105780 201841]


<bound method NDFrame.head of             id    qid1    qid2  \
0            0       1       2   
1            1       3       4   
2            2       5       6   
3            3       7       8   
4            4       9      10   
5            5      11      12   
6            6      13      14   
7            7      15      16   
8            8      17      18   
9            9      19      20   
10          10      21      22   
11          11      23      24   
12          12      25      26   
13          13      27      28   
14          14      29      30   
15          15      31      32   
16          16      33      34   
17          17      35      36   
18          18      37      38   
19          19      39      40   
20          20      41      42   
21          21      43      44   
22          22      45      46   
23          23      47      48   
24          24      49      50   
25          25      51      52   
26          26      53      54   
27          27    

In [4]:
from sklearn.model_selection import train_test_split
inputs, test_set = train_test_split(data, test_size=0.2)
train_set, val_set = train_test_split(inputs, test_size=0.2)
print("Train shape:", train_set.shape)
print("Test shape:",test_set.shape)
print("Val shape:",val_set.shape)
train_set.to_csv("train.csv")
val_set.to_csv("val.csv")
test_set.to_csv("test.csv")

Train shape: (258743, 6)
Test shape: (80858, 6)
Val shape: (64686, 6)


In [5]:
trn_q1_set = train_set['question1'].values
trn_q2_set = train_set['question2'].values
trn_qcombined_set = np.concatenate((trn_q1_set, trn_q2_set), axis=0)
print("Combined question set shape:",trn_qcombined_set.shape)
print("Sample question from the set:",trn_qcombined_set[19])

Combined question set shape: (517486,)
Sample question from the set: Are courses on Shaw Academy good?


In [6]:
import spacy
spacy_en = spacy.load('en')

def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text)]

trn_qcombined_len = len(trn_qcombined_set)
trn_word_list = set()
for i in range(trn_qcombined_len):
    for words in tokenizer(trn_qcombined_set[i]):
        trn_word_list.add(words)

In [7]:
print("Unique Word Count:", len(trn_word_list))
MAX_VOCAB_SIZE = int(len(trn_word_list) / 2)
print("Max Vocab Size:", MAX_VOCAB_SIZE)

Unique Word Count: 97274
Max Vocab Size: 48637


In [8]:
from torchtext import data
from torchtext import datasets

TEXT_Q1 = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=True,
                       lower=True)

TEXT_Q2 = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=True,
                       lower=True)

LABELS = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

#INDEX is there because of the pandas(i.e index is not a column label
#in the original dataset)
data_fields = [
    ('index', None),
    ('id', None),
    ('qid1', None),
    ('qid2', None),
    ('question1', TEXT_Q1),
    ('question2', TEXT_Q2), 
    ('is_duplicate', LABELS) 
]

In [9]:
train, val, test = data.TabularDataset.splits(path='.', 
                                            format='csv', 
                                            train='train.csv', 
                                            validation='val.csv',
                                            test='test.csv',
                                            fields=data_fields, 
                                            skip_header=True)

In [10]:
print("Length of the training set:",len(train))
ex = train[0]
print("Q1 field of the first sample:\n",ex.question1)
print("Q2 field of the first sample:\n",ex.question2)
print("Label field of the first sample:",ex.is_duplicate)

Length of the training set: 258743
Q1 field of the first sample:
 ['what', 'are', 'the', 'overall', 'best', 'web', 'hosting', 'services', '?']
Q2 field of the first sample:
 ['what', 'is', 'the', 'best', 'web', 'hosting', 'service', '?']
Label field of the first sample: 1


In [11]:
TEXT_Q1.build_vocab(train, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.300d")
TEXT_Q2.build_vocab(train, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.300d")
print("Q1 vocabulary size: {}".format(len(TEXT_Q1.vocab)))
print("Q2 vocabulary size: {}".format(len(TEXT_Q2.vocab)))

Q1 vocabulary size: 48639
Q2 vocabulary size: 48639


In [12]:
print("TEXT_Q1 Vocab Shape:",TEXT_Q1.vocab.vectors.shape)
print("TEXT_Q2 Vocab Shape:",TEXT_Q2.vocab.vectors.shape)

TEXT_Q1 Vocab Shape: torch.Size([48639, 300])
TEXT_Q2 Vocab Shape: torch.Size([48639, 300])


In [13]:
print("Word Vector of the:")
print(TEXT_Q1.vocab.vectors[TEXT_Q1.vocab.stoi['the']])

Word Vector of the:
tensor([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
         2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
        -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
         2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
        -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
        -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
         9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
        -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
         1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
         2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
         1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
         2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
        -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.6425e-02,
        -4.4640e-01,  1.7197e-0

In [14]:
print(TEXT_Q1.vocab.itos)



In [15]:
import torch
import torch.nn as nn
BATCH_SIZE = 32
VOCAB_SIZE = len(TEXT_Q1.vocab)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = data.BucketIterator.splits(datasets=(train, val, test),  
                                                             batch_size=BATCH_SIZE, 
                                                             device= device,
                                                             repeat=False)
#BucketIterator pads the batch according the maximum length sample---- double check this

In [16]:
from torchtext.vocab import Vectors
print("Per batch length of train,val and test set:")
print(len(train_iter), len(val_iter),len(test_iter ))

batch = next(iter(train))
print("Batch Check:")
print("Q1:\n",batch.question1)
print("Q2:\n",batch.question2)
print("Label:",batch.is_duplicate)

Per batch length of train,val and test set:
8086 2022 2527
Batch Check:
Q1:
 ['what', 'are', 'the', 'overall', 'best', 'web', 'hosting', 'services', '?']
Q2:
 ['what', 'is', 'the', 'best', 'web', 'hosting', 'service', '?']
Label: 1


In [17]:
Embedding_Dim = 100
n_hidden = 64
n_out = 2

In [18]:
'''
train_iter, val_iter, test_iter = data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device= device, bptt_len=32, repeat=False)
'''

'\ntrain_iter, val_iter, test_iter = data.BPTTIterator.splits(\n    (train, val, test), batch_size=BATCH_SIZE, device= device, bptt_len=32, repeat=False)\n'

In [19]:
'''
from torchtext import data
from torchtext import datasets
TEXT = data.Field(lower=True)
LABELS = data.Field()
#train, val, test = datasets.LanguageModelingDataset.splits(path=".", 
#    train="train.csv", validation="val.csv", test="test.csv", text_field=TEXT)
train, val, test = data.TabularDataset.splits(
     path='.', train='train.csv',
     validation='val.csv', test='test.csv', format='csv',
     fields=[('text', TEXT), ('labels', LABELS)])

print("vocabulary size: {}".format(len(TEXT.vocab)))
'''

'\nfrom torchtext import data\nfrom torchtext import datasets\nTEXT = data.Field(lower=True)\nLABELS = data.Field()\n#train, val, test = datasets.LanguageModelingDataset.splits(path=".", \n#    train="train.csv", validation="val.csv", test="test.csv", text_field=TEXT)\ntrain, val, test = data.TabularDataset.splits(\n     path=\'.\', train=\'train.csv\',\n     validation=\'val.csv\', test=\'test.csv\', format=\'csv\',\n     fields=[(\'text\', TEXT), (\'labels\', LABELS)])\n\nprint("vocabulary size: {}".format(len(TEXT.vocab)))\n'