In [3]:
import pandas as pd
import nltk
import re
import utilities as u
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import collections
from gensim.models import KeyedVectors
EMBEDDING_FILE = '../GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

from keras.preprocessing.text import Tokenizer

In [4]:
# Read training and test data from .csv files
train_df = pd.read_csv("../train.csv")
test_df = pd.read_csv("../test.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Display top 3 rows of train data
train_df.head(3)
# assert the shape of train dataframe
assert train_df.shape == (404290, 6)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# Display top 3 rows of test data and assert test data shape
test_df.head(3)
assert test_df.shape == (3563475, 3)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


# Identify Feature and Target columns

In [7]:
# Extract feature (X) and target (y) columns
feature_cols = list(train_df.columns[:-1])
target_col = train_df.columns[-1]
print ("Feature Columns {}".format(feature_cols))
print ("Target Columns {}".format(target_col))
X_all = train_df[feature_cols]
y_all = pd.DataFrame(data=train_df[target_col], columns=[target_col])

Feature Columns ['id', 'qid1', 'qid2', 'question1', 'question2']
Target Columns is_duplicate


In [8]:
# To check if the data is balanced or not
collections.Counter(y_all['is_duplicate'])

Counter({0: 255027, 1: 149263})

In [9]:
# Check the 5 rows of train output
y_all.head(5)

Unnamed: 0,is_duplicate
0,0
1,0
2,0
3,0
4,0
5,1
6,0


In [10]:
X_all.head(3)
assert X_all.shape == (train_df.shape[0],train_df.shape[0]-1)

Unnamed: 0,id,qid1,qid2,question1,question2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...


# Split data in training and validation sets

# check whether data is balanced or not

In [11]:
# Split data into 30% and have a constant random state so that the results are consistent
X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
print ("Total data set: {} samples".format(X_all.shape[0]))
print ("Training set: {} samples".format(X_train.shape[0]))
print ("Valid set: {} samples".format(X_valid.shape[0]))

Total data set: 404290 samples
Training set: 283003 samples
Valid set: 121287 samples


In [None]:
assert (X_train.shape[0] + X_valid.shape[0]) == X_all.shape[0]
assert (y_train.shape[0] + y_valid.shape[0]) == y_all.shape[0]

# check whether data is balanced or not

In [12]:
# TO check how many question pairs in training set are duplicate or not
collections.Counter(y_train['is_duplicate'])

Counter({0: 178677, 1: 104326})

In [13]:
collections.Counter(y_valid['is_duplicate'])

Counter({0: 76350, 1: 44937})

In [14]:
X_valid.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2
8067,8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?
368101,368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...
70497,70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?
226567,226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...
73186,73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?


In [15]:
y_valid.head(5)

Unnamed: 0,is_duplicate
8067,0
368101,0
70497,0
226567,1
73186,1


# Data Preprocessing

In [16]:
def create_vocabulary(words_list,df,column_name):
    for sentence in df[column_name]:
        for word in sentence:
            words_list.append(word)
    return words_list

In [17]:
def preprocess_baseline_text(text):
    text = str(text)
    text = text.lower()
    text = text.split()
    
    return text

In [18]:
preprocess_baseline_text('How do I play Pokémon GO in Korea?')

['how', 'do', 'i', 'play', 'pokémon', 'go', 'in', 'korea?']

In [19]:
X_all_1 = pd.DataFrame()
test_df_1 = pd.DataFrame()

In [20]:
X_all_1['question1'] = X_all['question1'].apply(lambda x:preprocess_baseline_text(x))
X_all_1['question2'] = X_all['question2'].apply(lambda x:preprocess_baseline_text(x))
test_df_1['question1'] = test_df['question1'].apply(lambda x:preprocess_baseline_text(x))
test_df_1['question2'] = test_df['question2'].apply(lambda x:preprocess_baseline_text(x))

In [21]:
X_all_1.head(5)

Unnamed: 0,question1,question2
0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv..."
1,"[what, is, the, story, of, kohinoor, (koh-i-no...","[what, would, happen, if, the, indian, governm..."
2,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,..."
3,"[why, am, i, mentally, very, lonely?, how, can...","[find, the, remainder, when, [math]23^{24}[/ma..."
4,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water?]"


In [22]:
test_df_1.head(5)

Unnamed: 0,question1,question2
0,"[how, does, the, surface, pro, himself, 4, com...","[why, did, microsoft, choose, core, m3, and, n..."
1,"[should, i, have, a, hair, transplant, at, age...","[how, much, cost, does, hair, transplant, requ..."
2,"[what, but, is, the, best, way, to, send, mone...","[what, you, send, money, to, china?]"
3,"[which, food, not, emulsifiers?]","[what, foods, fibre?]"
4,"[how, ""aberystwyth"", start, reading?]","[how, their, can, i, start, reading?]"


In [23]:
words_list = create_vocabulary([],X_all_1,'question1')
print ("Lenght of words in X_all question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_all_1,'question2')
print ("Lenght of words after adding X_all question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,test_df_1,'question1')
print ("Lenght of words after adding test_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,test_df_1,'question2')
print ("Lenght of words after adding test_df question 2 {}".format(len(words_list)))



Lenght of words in X_all question 1 4423826
Lenght of words after adding X_all question 2 8944593
Lenght of words after adding test_df question 1 48146246
Lenght of words after adding test_df question 2 87470531


In [24]:
print (len(set(words_list)))
print (words_list[0:10])
words_freq = collections.Counter(words_list)
words_freq.most_common(10)
words_freq_10000 = words_freq.most_common(10000)


word_in_word2vec = []
word_notin_word2vec = []

for word in words_freq.most_common(10000):
    if word[0] in model.vocab:
        word_in_word2vec.append(word[0])
    else:
        word_notin_word2vec.append(word[0])
        
print (len(word_in_word2vec))
print (len(word_notin_word2vec))
     
        

327522
['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in']
6614
3386


In [25]:
print (words_freq_10000[0:100])

[('the', 3398855), ('what', 2980203), ('is', 2573361), ('how', 2090590), ('i', 2060339), ('a', 2028806), ('in', 1944305), ('to', 1898636), ('of', 1467452), ('do', 1374660), ('are', 1294421), ('and', 1223643), ('can', 1072247), ('for', 1012448), ('why', 707330), ('you', 699351), ('my', 666512), ('best', 632025), ('it', 581168), ('on', 547035), ('does', 521099), ('which', 466737), ('or', 461708), ('if', 408308), ('get', 386807), ('with', 384513), ('be', 375348), ('should', 372922), ('an', 362490), ('have', 361547), ('that', 357477), ('some', 351606), ('from', 325810), ('your', 272379), ('when', 258746), ('will', 250735), ('who', 247316), ('at', 246414), ('good', 233478), ('like', 223882), ('there', 214642), ('people', 212807), ('as', 210562), ('would', 208053), ('between', 196776), ('where', 190411), ('one', 181467), ('about', 178274), ('most', 171376), ('way', 167888), ('make', 167313), ('any', 165852), ('not', 161751), ('we', 159834), ('by', 155638), ('india?', 155451), ('after', 14893

In [26]:
print (word_notin_word2vec[0:100])  

['a', 'to', 'of', 'and', 'india?', 'quora', 'quora?', 'why?', 'life?', 'it?', "i'm", 'do?', 'time?', 'me?', 'english?', '2016?', 'mean?', 'online?', 'work?', '?', '10', 'instagram', 'you?', '2016', 'world?', '500', 'them?', 'engineering?', '1000', 'weight?', 'money?', 'account?', 'like?', 'whatsapp', 'not?', 'exam?', 'for?', 'language?', 'possible?', 'this?', 'sentence?', 'people?', 'day?', 'phone?', '-', 'number?', 'us?', 'sydney?', 'skills?', 'now?', 'country?', 'sex?', 'so,', 'instagram?', 'system?', 'facebook?', 'in?', 'free?', 'university?', 'job?', 'business?', 'movies?', 'bangalore?', 'year?', 'from?', 'person?', 'love?', 'about?', '2017?', 'college?', 'company?', 'water?', 'u.s.', 'programming?', 'companies?', 'exist?', 'app?', 'china?', 'fat?', 'card?', 'website?', 'years?', 'youtube?', 'how?', 'delhi?', 'one?', 'have?', 'science?', '/', 'girl?', "someone's", 'usa?', '12', 'be?', 'school?', "i've", 'jio', 'interview?', 'snapchat', 'month?']


In [27]:
print (word_in_word2vec[0:100])

['the', 'what', 'is', 'how', 'i', 'in', 'do', 'are', 'can', 'for', 'why', 'you', 'my', 'best', 'it', 'on', 'does', 'which', 'or', 'if', 'get', 'with', 'be', 'should', 'an', 'have', 'that', 'some', 'from', 'your', 'when', 'will', 'who', 'at', 'good', 'like', 'there', 'people', 'as', 'would', 'between', 'where', 'one', 'about', 'most', 'way', 'make', 'any', 'not', 'we', 'by', 'after', 'did', 'was', 'am', 'difference', 'they', 'has', 'much', "what's", 'so', 'learn', 'use', 'know', 'this', 'me', 'their', 'many', 'than', 'find', 'time', 'india', 'more', 'but', 'money', 'someone', 'all', 'without', 'indian', 'other', 'new', 'want', 'become', 'think', 'better', 'first', 'start', 'ever', 'into', 'take', 'out', "don't", 'feel', 'he', 'used', 'could', 'life', 'improve', 'possible', 'english']


# Further Preprocessing Text

In [28]:
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    
    text = re.sub(r"[^A-Za-z0-9^?,!.\/'+-=]", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\?", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    
    text = text.split()
    
    return text

In [29]:
X_train.reset_index(drop = True, inplace = True)
X_valid.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
y_valid.reset_index(drop = True, inplace = True)

X_train_df = pd.DataFrame()
X_valid_df = pd.DataFrame()
X_test_df = pd.DataFrame()

In [30]:
X_train.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2
0,20128,37998,37999,"How is the working environment at SBI Life, Mu...",How stressful is work of SBI clerk?
1,296237,418414,31812,How can a US citizen work in Canada?,Will a US graduate degree help a non-US citize...
2,107095,176273,176274,What are the benefits of washing your hands wi...,Why is it important to wash your hands with soap?


In [31]:
test_df.head(3)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?


In [32]:
preprocess_text('What is= the step by step guide to invest in?')

['what', 'is', '=', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in']

In [33]:
X_train_df['question1'] = X_train['question1'].apply(lambda x:preprocess_text(x))
X_train_df['question2'] = X_train['question2'].apply(lambda x:preprocess_text(x))
X_valid_df['question1'] = X_valid['question1'].apply(lambda x:preprocess_text(x))
X_valid_df['question2'] = X_valid['question2'].apply(lambda x:preprocess_text(x))
X_test_df['question1'] = test_df['question1'].apply(lambda x:preprocess_text(x))
X_test_df['question2'] = test_df['question2'].apply(lambda x:preprocess_text(x))

In [34]:
X_train_df.head(3)

Unnamed: 0,question1,question2
0,"[how, is, the, working, environment, at, sbi, ...","[how, stressful, is, work, of, sbi, clerk]"
1,"[how, can, a, us, citizen, work, in, canada]","[will, a, us, graduate, degree, help, a, non, ..."
2,"[what, are, the, benefits, of, washing, your, ...","[why, is, it, important, to, wash, your, hands..."


In [35]:
X_train_df['question1'].shape

(283003,)

In [36]:
words_list = create_vocabulary([],X_train_df,'question1')
print ("Lenght of words in X_train_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_train_df,'question2')
print ("Lenght of words after adding X_train_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question1')
print ("Lenght of words after adding X_valid_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question2')
print ("Lenght of words after adding X_valid_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question1')
print ("Lenght of words after adding X_test_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question2')
print ("Lenght of words after adding X_test_df question 2 {}".format(len(words_list)))

Lenght of words in X_train_df question 1 3143028
Lenght of words after adding X_train_df question 2 6358368
Lenght of words after adding X_valid_df question 1 7705257
Lenght of words after adding X_valid_df question 2 9080346
Lenght of words after adding X_test_df question 1 48938305
Lenght of words after adding X_test_df question 2 88928769


In [37]:
print (len(set(words_list)))
print (words_list[0:10])
words_freq = collections.Counter(words_list)
words_freq.most_common(10)
words_freq_10000 = words_freq.most_common(10000)


word_in_word2vec = []
word_notin_word2vec = []

for word in words_freq.most_common(10000):
    if word[0] in model.vocab:
        word_in_word2vec.append(word[0])
    else:
        word_notin_word2vec.append(word[0])
        
print (len(word_in_word2vec))
print (len(word_notin_word2vec))

131743
['how', 'is', 'the', 'working', 'environment', 'at', 'sbi', 'life', 'mumbai', 'how']
9206
794


In [38]:
print (word_notin_word2vec[0:100]) 

['a', 'to', 'of', 'and', '-', 'quora', '2016', '10', 'instagram', '500', '1000', 'whatsapp', '2017', '2015', 'snapchat', '20', ':', '12', '100', '000', '15', '30', '50', 'jio', '12th', 'sbi', '16', '11', 'brexit', '!', '18', 'upsc', 'ece', '13', 'tcs', 'narendra', 'better:', '2014', '25', '17', '14', '70', 'mbbs', 'manipal', '2000', 'gmat', '40', 'iim', '24', 'btech', 'cgpa', '200', 'iiit', 'cgl', '10th', 'obc', 'redmi', 'favourite', '90', '60', 'iits', '21', 'pilani', 'aiims', 'centre', 'mightn', '80', 'flipkart', 'mustn', 'xiaomi', '19', 'travelling', 'ielts', '22', '300', 'india:', 'bba', 'colour', 'ibps', '23', 'ps4', '2013', 'mtech', 'accenture', 'x^2', 'paytm', '25000', 'elon', 'hadoop', 'kohli', 'srm', 'kejriwal', 'bitsat', 'spotify', '11th', 'grey', "'", '32', 'ncr', 'virat']


# Tokenizer

# Keras Implementation

In [39]:
words_to_index = {}
index_to_words = {}

i = 0
for word in set(words_list):
    words_to_index[word] = i
    i = i + 1
    
j=0
for word in set(words_list):
    index_to_words[j] = word
    j=j+1

In [40]:
import pickle
with open('words_to_index.pickle', 'wb') as handle:
    pickle.dump(words_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('index_to_words.pickle', 'wb') as handle:
    pickle.dump(index_to_words, handle, protocol=pickle.HIGHEST_PROTOCOL)    
    
    



In [41]:
print (words_to_index['the'])
print (len(words_to_index))
print (index_to_words[104295])
print (index_to_words[0])

14632
131743
diam
levothyroxine


In [42]:
def word_2_integer(wordslist):
    question2integer = []
    for word in wordslist:
        question2integer.append(words_to_index[word])

    return question2integer

word_2_integer(['what', 'is', 'the', 'story', 'of', 'kohinoor'])

[72487, 100724, 14632, 121183, 66543, 114399]

In [43]:
X_train_df['question1'] = X_train_df['question1'].apply(lambda x:word_2_integer(x))


In [44]:
X_train_df['question2'] = X_train_df['question2'].apply(lambda x:word_2_integer(x))
X_valid_df['question1'] = X_valid_df['question1'].apply(lambda x:word_2_integer(x))
X_valid_df['question2'] = X_valid_df['question2'].apply(lambda x:word_2_integer(x))
X_test_df['question1'] = X_test_df['question1'].apply(lambda x:word_2_integer(x))
X_test_df['question2'] = X_test_df['question2'].apply(lambda x:word_2_integer(x))

In [45]:
X_train_df['question1'].head(5)

0    [62398, 100724, 14632, 93, 65626, 8375, 49702,...
1    [62398, 48508, 78971, 87119, 54862, 72873, 410...
2    [72487, 62225, 14632, 11156, 66543, 5548, 7760...
3    [62398, 10287, 14632, 24307, 31939, 66543, 236...
4    [100724, 14632, 112889, 23163, 65800, 128187, ...
Name: question1, dtype: object

In [46]:
import numpy as np
embed_length = 300
embed_matrix = np.random.randn(len(words_to_index)+1,embed_length)
# To ignore padding
embed_matrix[0] = 0



In [47]:
print (embed_matrix.shape)

(131744, 300)


In [48]:
print (embed_matrix[1])

[ 6.28003944e-01  1.54579331e+00 -3.83079812e-01 -8.36466124e-02
 -7.39653684e-01  1.46393452e-01  1.52999916e+00 -1.76033992e+00
 -3.98542902e-01  3.17350711e-02  8.24731838e-01  3.70434018e-02
 -2.28599266e-01 -5.19194792e-01  1.80413798e+00 -3.04626298e-01
 -1.10937024e+00 -9.37830008e-01 -7.49936143e-01  1.11140699e+00
  7.10048587e-01  5.03501099e-01 -2.83290933e-01 -5.15819979e-01
 -1.00338804e+00  2.18563589e+00 -2.49315225e-01 -1.20536892e+00
 -4.44473838e-01  8.41733037e-01 -3.83638942e-03 -5.04689165e-01
  5.32180596e-01 -2.98095953e-01 -2.93073026e+00  4.40414144e-01
  1.47632395e+00  6.15557787e-01 -6.16197931e-01 -9.71731738e-01
  9.47916680e-01  9.58477165e-01  2.42288950e-01 -1.73315113e+00
 -1.49556719e+00 -3.28203444e-01  1.06648372e+00  1.15324732e+00
 -2.38861136e-01  8.73026918e-01  1.04939494e+00 -3.50163012e-01
  2.87915726e-02 -8.54624938e-01 -1.63895964e+00 -1.12389430e+00
 -7.30194390e-01 -5.16509667e-02  4.09188479e-01  9.26831636e-01
 -2.91892644e-01 -6.16139

In [49]:
#Updating embedding matrix 
count = 0
for word, index in words_to_index.items():
    if word in model.vocab:
        count = count + 1
        embed_matrix[index] = model.word_vec(word)

print (count)


58547


In [50]:
print (len(words_to_index))

131743


In [51]:
max_seq_length = max([
max(list(X_train_df['question1'].map(lambda x: len(x)))),
max(list(X_train_df['question2'].map(lambda x: len(x)))),
max(list(X_valid_df['question1'].map(lambda x: len(x)))),
max(list(X_valid_df['question2'].map(lambda x: len(x)))),
max(list(X_test_df['question1'].map(lambda x: len(x)))),
max(list(X_test_df['question2'].map(lambda x: len(x)))),
])

In [52]:
print (max_seq_length)

242


In [53]:
# Convert labels to their numpy representations
Y_train =  y_train.values
Y_valid =  y_valid.values


In [54]:
print (type(Y_train))
print (type(Y_valid))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [55]:
print (Y_train[0:10])

[[0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]]


In [56]:
from tensorflow.python.keras import utils

In [57]:
Y_train = utils.to_categorical(Y_train, num_classes=2)
Y_valid = utils.to_categorical(Y_valid, num_classes=2)

In [58]:
# assert X_train_dict['left'].shape == X_train_dict['right'].shape

In [59]:
# assert len(X_train_dict['left']) == len(Y_train)


In [60]:
#padding to max seq length

In [61]:
X_train_df['question1'][0]

[62398, 100724, 14632, 93, 65626, 8375, 49702, 93246, 102404]

In [62]:
def pad_left_zeros(question_list,max_seq_length):
    question_list = [0] * (max_seq_length - len(question_list)) + question_list
    return question_list

In [63]:
# len (pad_left_zeros([31586, 69984, 104295, 57112, 55384, 21628, 63296, 131479, 95639]))

In [64]:
X_train_df['question1'] = X_train_df['question1'].apply(lambda x: pad_left_zeros(x,max_seq_length))

In [65]:
X_train_df['question2'] = X_train_df['question2'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_valid_df['question1'] = X_valid_df['question1'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_valid_df['question2'] = X_valid_df['question2'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_test_df['question1'] = X_test_df['question1'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_test_df['question2'] = X_test_df['question2'].apply(lambda x: pad_left_zeros(x,max_seq_length))

In [66]:
# Split to dicts
X_train_dict = {'left': X_train_df['question1'], 'right': X_train_df['question2']}
X_valid_dict = {'left': X_valid_df['question1'], 'right': X_valid_df['question2']}
X_test_dict = {'left': X_test_df['question1'], 'right': X_test_df['question2']}

In [67]:
from keras.layers import Input, Embedding, LSTM

In [68]:
import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

In [69]:
data_dim = 300
timesteps = 242
nb_classes = 2

In [70]:
# expected input data shape: (batch_size, timesteps, data_dim)
#Input None,max_seq_length,1

left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

In [71]:
embedding_layer = Embedding(len(embed_matrix),
                            embed_length,
                            weights=[embed_matrix],
                            input_length=max_seq_length,
                            trainable=False)

In [72]:
#Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

In [73]:
encoded_left.shape

TensorShape([Dimension(None), Dimension(242), Dimension(300)])

In [74]:
# This layer can take as input a matrix
# and will return a vector of size 64
shared_lstm = LSTM(64)

In [75]:
import tensorflow as tf;
import keras
print(keras.__version__)
print(tf.__version__)
# python -c 'import keras; print(keras.__version__)'
# python3 -c 'import tensorflow as tf; print(tf.__version__)'  # for Python 3


2.1.2
1.4.1


In [76]:
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)


In [77]:
left_output.shape

TensorShape([Dimension(None), Dimension(64)])

In [78]:
# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate([left_output,right_output], axis=-1)

In [79]:
# And add a logistic regression on top
predictions = Dense(2, activation='softmax')(merged_vector)

In [80]:
X_train_dict['left'] = np.concatenate(X_train_dict['left']).reshape(X_train_dict['left'].shape[0],max_seq_length)

In [81]:
X_train_dict['left'].shape

(283003, 242)

In [82]:
X_train_dict['right'] = np.concatenate(X_train_dict['right']).reshape(X_train_dict['right'].shape[0],max_seq_length)

In [83]:
X_train_dict['right'].shape

(283003, 242)

In [84]:
X_valid_dict['left'] = np.concatenate(X_valid_dict['left']).reshape(X_valid_dict['left'].shape[0],max_seq_length)

In [85]:
X_valid_dict['right'] = np.concatenate(X_valid_dict['right']).reshape(X_valid_dict['right'].shape[0],max_seq_length)

In [86]:
X_train_dict['right'][0]

array([     0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [176]:

model = Model(inputs=[left_input, right_input], outputs=predictions)

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

n_epoch = 1


model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))



Train on 283003 samples, validate on 121287 samples
Epoch 1/1


<keras.callbacks.History at 0x39663d438>

In [177]:

def preprocess_model_input(input_questions,preprocess_text,words_to_index):

    final_new_words_list = []

    # i = 0
    for words_list in input_questions:
        new_words_list = []
      
        for word in preprocess_text(words_list):

            if words_to_index.get(word):
                new_words_list.append(words_to_index.get(word))
             
        final_new_words_list.append(np.array(pad_left_zeros(new_words_list,max_seq_length)))
        
         
    return [final_new_words_list[0].reshape(1,max_seq_length), final_new_words_list[1].reshape(1,max_seq_length)]

In [178]:
def model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index):
    y_prob = model.predict(preprocess_model_input(input_questions,preprocess_text,words_to_index),batch_size=1, verbose=0, steps=None)

    print ("The probabilities predicted by model for the question pair \n {} for classes [0,1] are \n {}".format(input_questions,y_prob))
    y_classes = y_prob.argmax(axis=-1)
    print ("The class that model predicted for the question pair \n {} is {}".format(input_questions,y_classes))
    return y_classes

In [179]:
input_questions = ["What is the capital of India?", "What is the capital of India?"]
model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index)


The probabilities predicted by model for the question pair 
 ['What is the capital of India?', 'What is the capital of India?'] for classes [0,1] are 
 [[0.489049   0.51095104]]
The class that model predicted for the question pair 
 ['What is the capital of India?', 'What is the capital of India?'] is [1]


array([1])

In [180]:
input_questions = ["How can I be a good geologist?", "What should I do to be a great geologist?"]
model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index)

The probabilities predicted by model for the question pair 
 ['How can I be a good geologist?', 'What should I do to be a great geologist?'] for classes [0,1] are 
 [[0.2599712 0.7400288]]
The class that model predicted for the question pair 
 ['How can I be a good geologist?', 'What should I do to be a great geologist?'] is [1]


array([1])

In [181]:
input_questions = ["What is the best travel website in spain?", "What is the best travel website?"]
model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index)

The probabilities predicted by model for the question pair 
 ['What is the best travel website in spain?', 'What is the best travel website?'] for classes [0,1] are 
 [[0.17791672 0.8220833 ]]
The class that model predicted for the question pair 
 ['What is the best travel website in spain?', 'What is the best travel website?'] is [1]


array([1])

In [182]:

n_epoch = 4


model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))



Train on 283003 samples, validate on 121287 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x39658c518>

In [183]:

n_epoch = 6


model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

Train on 283003 samples, validate on 121287 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x39663b630>

In [None]:
n_epoch = 5


model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

Train on 283003 samples, validate on 121287 samples
Epoch 1/5
Epoch 2/5

In [None]:
input_questions = ["What is the capital of India?", "What is the capital of India?"]
model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index)

In [None]:
input_questions = ["How can I be a good geologist?", "What should I do to be a great geologist?"]
model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index)

In [None]:
input_questions = ["What is the best travel website in spain?", "What is the best travel website?"]
model_prediction(model,input_questions,preprocess_model_input,preprocess_text,words_to_index)

In [None]:
# n_epoch = 4
# training_start_time = time()

# model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
#          validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

# print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

In [None]:
# n_epoch = 20
# training_start_time = time()

# model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
#          validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

# print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

In [None]:
import pickle
 
# # save the tokenizer and model
# with open("keras_tokenizer.pickle", "wb") as f:
#    pickle.dump(tokenizer, f)
model.save("quora_keras_model_v1_tokenizer.hdf5")

In [None]:
with open('words_to_index.pickle', 'rb') as handle:
    words_to_index_1 = pickle.load(handle)

In [None]:
with open('index_to_words.pickle', 'rb') as handle:
    index_to_words_1 = pickle.load(handle)

In [None]:
print (len(words_to_index_1))

In [None]:
print (len(index_to_words))

In [None]:
print (index_to_words[0])

In [None]:
newtexts = ["How do I read and find my YouTube comments?", "How can I see all my Youtube comments?"]

In [None]:
print (newtexts)

In [None]:

final_new_words_list = []
print (new_words_list)
i = 0
for words_list in newtexts:
    new_words_list = []
    print (words_list)
    for word in preprocess_text(words_list):
        print (word)
        if words_to_index_1.get(word):
            
            new_words_list.append(words_to_index_1.get(word))
        else:
            print (word)
    final_new_words_list.append(np.array(pad_left_zeros(new_words_list,max_seq_length)))
    

In [None]:
print (final_new_words_list)

In [None]:
from keras.models import load_model

predict_model = load_model("quora_keras_model_v1.hdf5")

In [None]:
y_prob = predict_model.predict([final_new_words_list[0].reshape(1,max_seq_length), final_new_words_list[1].reshape(1,max_seq_length)],batch_size=1, verbose=1, steps=None)

In [None]:
y_classes = y_prob.argmax(axis=-1)

In [None]:
print (y_classes)

In [None]:
print (y_prob)

In [None]:
s1rnn = Sequential()
s1rnn.add(embedding_layer_1)
s1rnn.add(LSTM(128, input_shape=(100, 1)))
s1rnn.add(Dense(1))

s2rnn = Sequential()
s2rnn.add(embedding_layer_2)
s2rnn.add(LSTM(128, input_shape=(100, 1)))
s2rnn.add(Dense(1))

In [None]:
preprocess_baseline_text(X_train['question1'][0])

In [None]:
X_test_df.head(5)

In [None]:
X_train_df.head(5)

In [None]:
# re.sub(r"[^A-Za-z0-9,!.\/'+-=]", " ", 'why am i mentally very lonely? how can i solve')

In [None]:
# re.sub(r"\'s", " ", 'what\'s')

In [None]:
# words_list = []
# # train_subset_df = train_df['question1'][0:10]
# train_subset_df['question1'] = pd.DataFrame(data=train_df['question1'][0:10], columns=['question1'])
# train_subset_df['question2'] = pd.DataFrame(data=train_df['question2'][0:10], columns=['question2'])

In [None]:
# for sentence in train_subset_df['question1']:
#     for word in sentence:
#         words_list.append(word)
# print (len(set(words_list)))

## Feature based on how many words are common in question 1 and question 2

In [None]:
# u.termfrequency(['What is the step by step guide to invest in share market in india?'], ['What is the step by step guide to invest in share market?'])

In [None]:
def termfrequency(sentence1, sentence2):
   
    question_dict ={}
    sentence1_words = sentence1   
    sentence2_words = sentence2
    searchtermfreq = []
    i = 0
    
    for key in sentence1_words:
#         print (key)
        question_dict[key] = question_dict.get(key,0) + 1
    
    for key in set(sentence2_words):
        value =  question_dict.get(key,0)
        if value >= 1:
            value = 1
        searchtermfreq.append(value)
        
    
#     print (question_dict)
#     print (searchtermfreq)
#     print (sum(searchtermfreq))
    return sum(searchtermfreq)

termfrequency(['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'], ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market?'])
    

In [None]:
X_train_df['common_term_freq'] = X_train_df.apply(lambda x: termfrequency(x['question1'],x['question2']), axis=1 )

In [None]:
X_train_df.head(10)

In [None]:
X_valid_df['common_term_freq'] = X_valid_df.apply(lambda x: termfrequency(x['question1'],x['question2']), axis=1 )

In [None]:
X_valid_df.head(10)

# Total words frequency

In [None]:
def total_words_freq(sentence):
    return len(sentence)

In [None]:
total_words_freq(['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'])

In [None]:
X_train_df['question1_words_freq'] = X_train_df['question1'].map(lambda x: total_words_freq(x))

In [None]:
X_train_df['question2_words_freq'] = X_train_df['question2'].map(lambda x: total_words_freq(x))

In [None]:
X_train_df.head(5)

In [None]:
X_valid_df['question1_words_freq'] = X_valid_df['question1'].map(lambda x: total_words_freq(x))

In [None]:
X_valid_df['question2_words_freq'] = X_valid_df['question2'].map(lambda x: total_words_freq(x))

In [None]:
X_valid_df.head(5)

In [None]:
X_train_model_input = X_train_df.drop(['question1','question2'],axis =1)

In [None]:
X_valid_model_input = X_valid_df.drop(['question1','question2'],axis =1)

In [None]:
X_train_model_input.head(3)

# Baseline Model

In [None]:

# Train a model# Train  
import time

def train_classifier(clf, X_train, y_train):
    print ("Training {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print ("Done!\nTraining time (secs): {:.3f}".format(end - start))

In [None]:
from sklearn.linear_model import LogisticRegression

clf =  LogisticRegression()

In [None]:
train_classifier(clf, X_train_model_input, y_train.values.ravel())

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

def predict_labels(clf, X_train, y_train):
    print ("Predicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(X_train)
    end = time.time()
    print ("Done!\nPrediction time (secs): {:.3f}".format(end - start))
    return log_loss(y_train, y_pred, eps=1e-15), confusion_matrix(y_train, y_pred)

train_metrics = predict_labels(clf, X_train_model_input, y_train.values.ravel())

print 
print ("Log loss for training set: {}".format(train_metrics[0]))

print ("Confusion matrix for training set: {}".format(train_metrics[1]))

In [None]:
# Predict on test data
print ("Log loss for validation set: {}".format(predict_labels(clf, X_valid_model_input, y_valid.values.ravel())[0]))

In [None]:

print ("Confusion matrix for validation set: {}".format(predict_labels(clf, X_valid_model_input, y_valid.values.ravel())[1]))

# Text Preprocessing

In [None]:
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

In [None]:
words_list = create_vocabulary([],X_train_df,'question1')
print ("Lenght of words in X_train_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_train_df,'question2')
print ("Lenght of words after adding X_train_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question1')
print ("Lenght of words after adding X_valid_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question2')
print ("Lenght of words after adding X_valid_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question1')
print ("Lenght of words after adding X_test_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question2')
print ("Lenght of words after adding X_test_df question 2 {}".format(len(words_list)))



In [None]:

def preprocess_text(list_words):
     list_words_processed = []
     for text in list_words:
         text = re.sub(r"\?", '', text)
         text = re.sub(r"i'm", "i am ", text)
#          print (text)
         list_words_processed.append(str(text))
#          print (list_words_processed)
     return list_words_processed

In [None]:
preprocess_text(['India?'])

In [None]:
preprocess_text(['how',
 'do',
 'the',
 'holy',
 'scriptures',
 'of',
 'hinduism',
 'compare',
 'and',
 'contrast',
 'to',
 'those',
 'of',
 'taoism?'])

In [None]:
X_train_df['question1'][3]

In [None]:
preprocess_text(X_train_df['question1'][3])

In [None]:
X_train_p_df = pd.DataFrame()
X_valid_p_df = pd.DataFrame()
X_test_p_df = pd.DataFrame()

In [None]:
X_train_p_df['question1'] = X_train_df['question1'].apply(lambda x:preprocess_text(x))
X_train_p_df['question2'] = X_train_df['question2'].apply(lambda x:preprocess_text(x))
X_valid_p_df['question1'] = X_valid_df['question1'].apply(lambda x:preprocess_text(x))
X_valid_p_df['question2'] = X_valid_df['question2'].apply(lambda x:preprocess_text(x))
X_test_p_df['question1'] = X_test_df['question1'].apply(lambda x:preprocess_text(x))
X_test_p_df['question2'] = X_test_df['question2'].apply(lambda x:preprocess_text(x))

In [None]:
X_train_p_df['question1'][2]

In [None]:
proc_words_list = create_vocabulary([],X_train_p_df,'question1')
print ("Lenght of words in X_train_df question 1 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_train_p_df,'question2')
print ("Lenght of words after adding X_train_df question 2 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_valid_p_df,'question1')
print ("Lenght of words after adding X_valid_df question 1 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_valid_p_df,'question2')
print ("Lenght of words after adding X_valid_df question 2 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_test_p_df,'question1')
print ("Lenght of words after adding X_test_df question 1 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_test_p_df,'question2')
print ("Lenght of words after adding X_test_df question 2 {}".format(len(words_list)))



In [None]:
proc_words_freq = collections.Counter(proc_words_list)

In [None]:
proc_words_freq_10000 = proc_words_freq.most_common(10000)

In [None]:
proc_word_in_word2vec = []
proc_word_notin_word2vec = []

for word in proc_words_freq.most_common(10000):
    if word[0] in model.vocab:
        proc_word_in_word2vec.append(word[0])
    else:
        proc_word_notin_word2vec.append(word[0])

In [None]:
print (len(proc_word_in_word2vec))
print (len(proc_word_notin_word2vec))
print (proc_word_notin_word2vec[0:100])

In [None]:
print (proc_word_in_word2vec[0:100])

# Pipeline

# Keras

## Create Vocabulary

In [None]:
#