In [1]:
# Download the NLTK 'stopwords' corpus.
# NOTE(review): presumably needed by the stop-word removal helper used later — confirm.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# import libraries
import pandas as pd
# NOTE(review): star import. clean_str, remove_punctuation, remove_stopwords,
# get_w2v_vector and similarity are used below but never defined in this
# notebook, so they presumably come from one of the two star imports — confirm
# and import them by name.
from utils import *
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): np and norm are not referenced in the visible cells.
import numpy as np
from numpy.linalg import norm
# Random seed; passed to TruncatedSVD and to the word2vec model further down.
seed=10
# NOTE(review): star import. Word2VecModel and the `multiprocessing` name used in
# the word2vec parameter cell are presumably provided here — confirm.
from w2vec_model import *
import warnings
# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [6]:
# Load the question-pair training set (path is relative to the working directory).
data = pd.read_csv('train.csv')

In [7]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [9]:
# Count repeated values in each id column; every occurrence after the first
# one is counted as a duplicate.
duplicated_id = data['id'].duplicated().sum()
duplicated_qid1 = data['qid1'].duplicated().sum()
duplicated_qid2 = data['qid2'].duplicated().sum()

print("The number of duplicated 'id' is: {0}".format(duplicated_id))
print("The number of duplicated 'qid1' is {0}".format(duplicated_qid1))
print("The number of duplicated 'qid2' is {0}".format(duplicated_qid2))


The number of duplicated 'id' is: 0
The number of duplicated 'qid1' is 113636
The number of duplicated 'qid2' is 104926


In [10]:
# let's remove missing question entries
# Count missing entries per question column.
q1_nan = data['question1'].isnull().sum()
q2_nan = data['question2'].isnull().sum()

print('Question 1 has {0} missing values'.format(q1_nan))
print('Question 2 has {0} missing values'.format(q2_nan))

Question 1 has 0 missing values
Question 2 has 2 missing values


In [11]:
# Show the rows whose question2 is missing.
data[data.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [12]:
# Drop, in place, every row whose question2 is missing. The original index
# labels are kept, so the index has gaps afterwards.
data.dropna(subset=['question2'], inplace=True)

In [13]:
# Check: an empty frame here means no row with a missing question2 remains.
data[data.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [14]:
# Re-inspect non-null counts after the rows with a missing question2 were dropped.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 6 columns):
id              404288 non-null int64
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(4), object(2)
memory usage: 21.6+ MB


In [15]:
# Repeated question ids, column by column.
# Note: the figures below are ROW counts — every row whose id occurs more than
# once in its column is included — not counts of distinct ids.

# rows whose qid1 occurs more than once
q1 = data['qid1']
q1_dup = int(q1.duplicated(keep=False).sum())
print('The total number of qid1 that appears more than once is {0}'.format(q1_dup))

# rows whose qid2 occurs more than once
q2 = data['qid2']
q2_dup = int(q2.duplicated(keep=False).sum())
print('The number of qid2 that appears more than once is {0}'.format(q2_dup))

The total number of qid1 that appears more than once is 167707
The number of qid2 that appears more than once is 150555


In [16]:
# Collect the rows behind those repeated ids, sorted so that rows sharing an
# id sit next to each other.
data_q1_dup = data.loc[q1.duplicated(keep=False)].sort_values(by='qid1')
data_q2_dup = data.loc[q2.duplicated(keep=False)].sort_values(by='qid2')

In [17]:
# Inspect the start of the qid1-sorted frame of repeated question1 ids.
print('First 11 entries for qid1')
data_q1_dup.head(11)

First 11 entries for qid1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
359232,359232,3,488853,What is the story of Kohinoor (Koh-i-Noor) Dia...,Could India keep the Koh-I-Noor safe?,0
263614,263614,3,380197,What is the story of Kohinoor (Koh-i-Noor) Dia...,What are some interesting facts about Kohinoor...,0
184732,184732,3,282170,What is the story of Kohinoor (Koh-i-Noor) Dia...,Is it possible to melt down diamonds?,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
126071,126071,9,109465,"Which one dissolve in water quikly sugar, salt...","Which freezes faster, sugar water or salt wat...",0
351364,351364,9,480204,"Which one dissolve in water quikly sugar, salt...",Which are the companies in Dubai who imports w...,0
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
327315,327315,27,50277,What was your first sexual experience like?,What is your first sexual experience?,1
80628,80628,29,44255,What are the laws to change your status from a...,What are the laws to change your status from a...,0


In [18]:
# Inspect the end of the qid1-sorted frame of repeated question1 ids.
print('Last 10 entries for qid1')
data_q1_dup.tail(10)

Last 10 entries for qid1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
398892,398892,532136,116962,Which part of speech is the word hello?,"What part of speech is ""this""?",0
400113,400113,532136,533449,Which part of speech is the word hello?,What part of speech are words ending in -ing?,0
399490,399490,532767,532768,Why is poverty considered a social problem in ...,How can we short out the problem of poverty?,0
403926,403926,532767,537550,Why is poverty considered a social problem in ...,Why is poverty considered a social problem?,0
399595,399595,532883,532884,What does “天人” mean in English?,What does ''badam'' mean in English?,0
403750,403750,532883,537349,What does “天人” mean in English?,What does ''bientot'' mean in English?,0
400745,400745,534098,78019,What is a career path?,What is the career path of a recruiter?,0
403461,403461,534098,537043,What is a career path?,What is the career path for a doctor?,0
404120,404120,535331,254941,What should I say when someone is expressing c...,"What does it mean to be ""people smart""?",0
401886,401886,535331,535332,What should I say when someone is expressing c...,How can you tell when someone is faking confid...,0


In [19]:
# Inspect the start of the qid2-sorted frame of repeated question2 ids.
print('First 11 entries for qid2')
data_q2_dup.head(11)

First 11 entries for qid2


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
402369,402369,38436,18,"When do you use ""into"" instead of ""in to""?","When do you use ""&"" instead of ""and""?",0
65735,65735,114035,26,Is there a way to make learning physics easier?,How can you make physics easy to learn?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
27051,27051,50277,28,What is your first sexual experience?,What was your first sexual experience?,1
14,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
272517,272517,47048,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
401723,401723,44256,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
109466,109466,179664,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0


In [20]:
# Inspect the end of the qid2-sorted frame of repeated question2 ids.
print('Last 10 entries for qid2')
data_q2_dup.tail(10)

Last 10 entries for qid2


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
402164,402164,141495,530954,What is the difference between the following s...,What is the difference between the following s...,0
397806,397806,530953,530954,"What is the difference between ""at"" and ""on"" i...",What is the difference between the following s...,1
403573,403573,79240,532396,What is the cell membrane? What are the functi...,Within the cell membrane. what is the role of ...,0
399151,399151,193935,532396,What is the function of a cell membrane?,Within the cell membrane. what is the role of ...,0
400440,400440,219589,532565,How do astronauts have a shower?,How do astronauts bathe in space?,1
399312,399312,219590,532565,How do astronauts shower in space?,How do astronauts bathe in space?,1
400906,400906,534275,533534,"What is the origin of the saying ""knock on wood""?","What is the origin of ""knocking on wood""?",1
400191,400191,533533,533534,Why is knocking on wood a part of many cultures?,"What is the origin of ""knocking on wood""?",0
403058,403058,177065,534864,What are mind-boggling facts about rich people?,What are some mind boggling facts about billio...,1
401473,401473,177066,534864,What are some mind-boggling facts about rich p...,What are some mind boggling facts about billio...,1


In [21]:
# Look for rows that repeat an earlier (qid1, qid2) pair; an empty frame
# means every pair is unique.
d = data.duplicated(subset=['qid1', 'qid2'])
ddup = data.loc[d]
ddup

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [22]:
# Frame summary before the text-processing steps.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 6 columns):
id              404288 non-null int64
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(4), object(2)
memory usage: 41.6+ MB


## Text Analysis

In [23]:
# Normalise the raw question text with clean_str (defined outside this notebook).
# NOTE(review): the cleaned text still contains '?', ',' and parentheses, so
# clean_str does not strip all punctuation; that is left to remove_punctuation.
data['question1'] = data['question1'].map(clean_str)
data['question2'] = data['question2'].map(clean_str)

In [24]:
# Preview the questions after clean_str.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor ( koh i noor ) ...,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely ? how can i sol...,find the remainder when math 23 24 math i...,0
4,4,9,10,"which one dissolve in water quikly sugar , sa...",which fish would survive in salt water ?,0


In [25]:
# Print the first ten question pairs together with their labels.
# Rows are taken by position: rows were dropped from `data` earlier, so its
# index has gaps and a label lookup such as data['question1'][i] raises
# KeyError whenever label i is one of the dropped rows.
for pair in data.head(10).itertuples(index=False):
    print("question 1: {0}".format(pair.question1))
    print("question 2: {0}".format(pair.question2))
    print("is duplicate: {0}".format(pair.is_duplicate))
    print()

question 1: what is the step by step guide to invest in share market in india ?
question 2: what is the step by step guide to invest in share market ?
is duplicate: 0

question 1: what is the story of kohinoor  ( koh i noor )  diamond ?
question 2: what would happen if the indian government stole the kohinoor  ( koh i noor )  diamond back ?
is duplicate: 0

question 1: how can i increase the speed of my internet connection while using a vpn ?
question 2: how can internet speed be increased by hacking through dns ?
is duplicate: 0

question 1: why am i mentally very lonely ?  how can i solve it ?
question 2: find the remainder when  math 23  24   math  is divided by 24 , 23 ?
is duplicate: 0

question 1: which one dissolve in water quikly sugar ,  salt ,  methane and carbon di oxide ?
question 2: which fish would survive in salt water ?
is duplicate: 0

question 1: astrology  i am a capricorn sun cap moon and cap rising   what does that say about me ?
question 2: i'm a triple capricorn 

In [26]:
# Split each question on whitespace and hand the token list to
# remove_punctuation (defined outside this notebook).
data['question1'] = data['question1'].str.split().map(remove_punctuation)
data['question2'] = data['question2'].str.split().map(remove_punctuation)

In [27]:
# Filter stop words out of every question with remove_stopwords
# (defined outside this notebook).
data['question1'] = data['question1'].map(remove_stopwords)
data['question2'] = data['question2'].map(remove_stopwords)

In [28]:
# Preview the questions after punctuation and stop-word removal.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0
1,1,3,4,"[story, kohinoor, koh, noor, diamond]","[would, happen, indian, government, stole, koh...",0
2,2,5,6,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0
3,3,7,8,"[mentally, lonely, solve]","[find, remainder, math, 23, 24, math, divided,...",0
4,4,9,10,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0


In [29]:
# Persist the tokenised frame.
# NOTE(review): assumes the data/ directory already exists — confirm.
data.to_pickle('data/quora.pickle')

## Prepare data for convolutional neural network

In [30]:
# Work on a copy so the CNN-specific steps below do not modify `data`.
data_cnn = data.copy()

In [31]:
# Number of tokens left in each question.
data_cnn['q1_len'] = data_cnn['question1'].map(len)
data_cnn['q2_len'] = data_cnn['question2'].map(len)

In [32]:
# Masks flagging the rows where a question has no tokens left.
d_1 = data_cnn['q1_len'].eq(0)
d_2 = data_cnn['q2_len'].eq(0)

# Row labels selected by each mask, concatenated; a label shows up twice when
# both questions of a pair are empty.
ind_1 = data_cnn.index[d_1.values]
ind_2 = data_cnn.index[d_2.values]
ind = ind_1.append(ind_2)

In [33]:
# Remove every pair in which at least one question has no tokens.
data_cnn = data_cnn.drop(ind, axis='index')
data_cnn.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
0,0,1,2,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0,7,6
1,1,3,4,"[story, kohinoor, koh, noor, diamond]","[would, happen, indian, government, stole, koh...",0,5,10
2,2,5,6,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0,6,5
3,3,7,8,"[mentally, lonely, solve]","[find, remainder, math, 23, 24, math, divided,...",0,3,9
4,4,9,10,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0,10,5


In [34]:
# Keep only the two questions and the label; ids and length helpers are dropped.
data_cnn.drop(
    labels=['qid1', 'qid2', 'id', 'q1_len', 'q2_len'],
    axis='columns',
    inplace=True,
)

In [35]:
# Preview the frame after dropping the id and length columns.
data_cnn.head()

Unnamed: 0,question1,question2,is_duplicate
0,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0
1,"[story, kohinoor, koh, noor, diamond]","[would, happen, indian, government, stole, koh...",0
2,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0
3,"[mentally, lonely, solve]","[find, remainder, math, 23, 24, math, divided,...",0
4,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0


In [36]:
# Every question is forced to exactly this many tokens.
_MAX_QUESTION_TOKENS = 20
# Token appended to questions that are too short.
_PAD_TOKEN = ''


def _pad_or_truncate(tokens, length=_MAX_QUESTION_TOKENS, pad=_PAD_TOKEN):
    """Return a new list holding exactly `length` tokens.

    Questions shorter than `length` are right-padded with `pad`; longer ones
    are cut after the first `length` tokens. The input list is not modified.

    Args:
        tokens: list of token strings for one question.
        length: target number of tokens.
        pad: filler token used for padding.

    Returns:
        A list of `length` tokens.
    """
    # `[pad] * n` is empty for n <= 0, so one expression covers both cases.
    return (list(tokens) + [pad] * (length - len(tokens)))[:length]


data_cnn['question1'] = data_cnn['question1'].apply(_pad_or_truncate)
data_cnn['question2'] = data_cnn['question2'].apply(_pad_or_truncate)

In [37]:
# get all the questions: every question1 token list followed by every question2 token list
# pd.concat is used because Series.append was deprecated in pandas 1.4 and removed in 2.0.
questions = pd.concat(
    [data_cnn['question1'], data_cnn['question2']],
    ignore_index=True,
).tolist()

In [38]:
# build word2vec model using only 100 features
# Hyper-parameters handed to Word2VecModel.create_w2v_model in the next cell.
num_features = 100
min_word_count = 1
# NOTE(review): `multiprocessing` is not imported in this notebook and presumably
# leaks in through a star import. num_workers is not among the arguments passed
# to create_w2v_model in the next cell — confirm whether it should be.
num_workers = multiprocessing.cpu_count()
context_size = 5
downsampling = 1e-3
# Re-assigns the seed already set in the import cell (same value).
seed = 10
# NOTE(review): presumably selects skip-gram (1) over CBOW (0) — confirm against w2vec_model.
sg=1

In [39]:
# Build the word2vec model from the padded token lists of both question columns.
# Word2VecModel is defined outside this notebook; the meaning and order of the
# positional arguments should be checked against its definition.
model = Word2VecModel()
model.create_w2v_model(questions, num_features, min_word_count, context_size,
                         sg, downsampling, seed)

In [40]:
# Replace every token list with its word2vec representation.
# vector_size is tied to num_features (the size the model was built with)
# instead of repeating the literal 100, so the two cannot drift apart.
# NOTE(review): this reaches into the private attribute model._model; get_w2v_vector
# is defined outside this notebook.
data_cnn['question1'] = data_cnn['question1'].apply(
    lambda tokens: get_w2v_vector(model._model, tokens, vector_size=num_features))
data_cnn['question2'] = data_cnn['question2'].apply(
    lambda tokens: get_w2v_vector(model._model, tokens, vector_size=num_features))

In [41]:
# Preview the embedded questions.
data_cnn.head()

Unnamed: 0,question1,question2,is_duplicate
0,"[[-0.0378785, -0.146129, 0.280466, -0.119127, ...","[[-0.0378785, -0.146129, 0.280466, -0.119127, ...",0
1,"[[-0.571352, 0.495741, -0.673742, -0.712127, 0...","[[0.332913, -0.283903, 0.451347, -0.089472, 0....",0
2,"[[0.275699, 0.271085, 0.055919, -0.223083, -0....","[[-0.489543, 0.215172, 0.306797, 0.614354, 0.1...",0
3,"[[-0.104706, -0.198482, 0.0512632, -0.0317959,...","[[-0.365623, 0.437891, 0.0427879, -0.144401, 0...",0
4,"[[0.330548, -0.125617, 0.167564, -0.124792, 0....","[[0.373634, -0.218972, 0.736113, -0.248313, -0...",0


In [42]:
# Persist the CNN input frame.
# NOTE(review): assumes the data/ directory already exists; each cell holds a
# nested vector structure, so the file is presumably large — confirm.
data_cnn.to_pickle('data/quora_cnn.pickle')

## Prepare data for xgboost model

In [28]:
# Start the XGBoost feature set from a copy of the tokenised frame `data`.
# NOTE(review): the execution counter restarts at In [28] here, so this section
# was not run in sequence after the CNN section — verify with Restart & Run All.
data_xgb = data.copy()

In [29]:
## prepare the data to build the LSA model
data1 = data_xgb.question1
data2 = data_xgb.question2

# Join every token list back into one space-separated string per question.
data_q1 = [' '.join(tokens) for tokens in data1]
data_q2 = [' '.join(tokens) for tokens in data2]

# The document set is every question1 followed by every question2.
q1_q2 = data_q1 + data_q2

In [30]:
# LSA pipeline: TF-IDF weighting followed by a 100-component truncated SVD.
svd_clf = make_pipeline(
    TfidfVectorizer(min_df=1),
    TruncatedSVD(n_components=100, algorithm='arpack', random_state=seed),
)
svd_clf

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...vd', TruncatedSVD(algorithm='arpack', n_components=100, n_iter=5, random_state=10,
       tol=0.0))])

In [31]:
# Fit the TF-IDF + truncated-SVD pipeline on the combined question1/question2 corpus.
svd_clf.fit(q1_q2)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...vd', TruncatedSVD(algorithm='arpack', n_components=100, n_iter=5, random_state=10,
       tol=0.0))])

In [32]:
# Apply the fitted pipeline to question1 and question2 separately to get their SVD vectors.
questions1_transform = svd_clf.transform(data_q1)
questions2_transform = svd_clf.transform(data_q2)

In [33]:
# Similarity score between the SVD vectors of each question pair.
# `similarity` is defined outside this notebook.
# NOTE(review): the result contains NaN for some rows (removed further down),
# presumably where a question maps to an all-zero vector — confirm.
sim = similarity(questions1_transform, questions2_transform)


In [34]:
# Store the similarity scores as a feature column.
data_xgb['similarity'] = sim

In [35]:
# Preview the frame with the new similarity column.
data_xgb.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,similarity
0,0,1,2,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0,0.279268
1,1,3,4,"[story, kohinoor, koh, noor, diamond]","[would, happen, indian, government, stole, koh...",0,0.001085
2,2,5,6,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0,0.476306
3,3,7,8,"[mentally, lonely, solve]","[find, remainder, math, 23, 24, math, divided,...",0,0.133187
4,4,9,10,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0,0.022395


In [36]:
# Count-based features derived from the token lists.

# token count per question
data_xgb['q1_len'] = data_xgb['question1'].map(len)
data_xgb['q2_len'] = data_xgb['question2'].map(len)

# distinct-token count per question
data_xgb['q1_voc_len'] = data_xgb['question1'].map(lambda tokens: len(set(tokens)))
data_xgb['q2_voc_len'] = data_xgb['question2'].map(lambda tokens: len(set(tokens)))

# number of distinct tokens the two questions of a pair have in common
data_xgb['shared_token_len'] = [
    len(set(first) & set(second))
    for first, second in zip(data_xgb['question1'], data_xgb['question2'])
]

In [37]:
# Preview the frame with the count-based features added.
data_xgb.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,similarity,q1_len,q2_len,q1_voc_len,q2_voc_len,shared_token_len
0,0,1,2,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0,0.279268,7,6,6,5,5
1,1,3,4,"[story, kohinoor, koh, noor, diamond]","[would, happen, indian, government, stole, koh...",0,0.001085,5,10,5,10,4
2,2,5,6,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0,0.476306,6,5,6,5,2
3,3,7,8,"[mentally, lonely, solve]","[find, remainder, math, 23, 24, math, divided,...",0,0.133187,3,9,3,6,0
4,4,9,10,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0,0.022395,10,5,10,5,2


In [38]:
# Collect the labels of the rows whose similarity score is NaN.
ind_sim = data_xgb.index[data_xgb['similarity'].isnull().values]

In [39]:
# Number of rows with a NaN similarity.
len(ind_sim)

260

In [40]:
# Remove, in place, the rows whose similarity is NaN.
data_xgb.drop(ind_sim, inplace=True)

In [41]:
# Confirm that no null values remain in any column.
data_xgb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404028 entries, 0 to 404289
Data columns (total 12 columns):
id                  404028 non-null int64
qid1                404028 non-null int64
qid2                404028 non-null int64
question1           404028 non-null object
question2           404028 non-null object
is_duplicate        404028 non-null int64
similarity          404028 non-null float64
q1_len              404028 non-null int64
q2_len              404028 non-null int64
q1_voc_len          404028 non-null int64
q2_voc_len          404028 non-null int64
shared_token_len    404028 non-null int64
dtypes: float64(1), int64(9), object(2)
memory usage: 40.1+ MB


In [42]:
# Drop the ids and the token lists so that only the label and the numeric
# features remain.
data_xgb.drop(
    labels=['qid1', 'qid2', 'id', 'question1', 'question2'],
    axis='columns',
    inplace=True,
)

In [43]:
# Preview the final feature table.
data_xgb.head()

Unnamed: 0,is_duplicate,similarity,q1_len,q2_len,q1_voc_len,q2_voc_len,shared_token_len
0,0,0.279268,7,6,6,5,5
1,0,0.001085,5,10,5,10,4
2,0,0.476306,6,5,6,5,2
3,0,0.133187,3,9,3,6,0
4,0,0.022395,10,5,10,5,2


In [44]:
### save data for xgboost learning
# NOTE(review): assumes the data/ directory already exists — confirm.
data_xgb.to_pickle('data/quora_xgb.pickle')