In [95]:
from utils import save, load
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt

%matplotlib inline

In [17]:
train_df = load('train')
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66.0,57.0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51.0,88.0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73.0,59.0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50.0,65.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76.0,39.0


Let's make a stack of questions maintaining the `id` of the question pair.

In [60]:
question_df = train_df.loc[:, ['id', 'question1']]
question_df = question_df.append(train_df.loc[:,['id', 'question2']], sort=False)
question_df.loc[question_df['question1'].isna(), 'question1'] = question_df.loc[question_df['question1'].isna(), 'question2']
question_df = question_df.drop(columns='question2')
question_df = question_df.dropna()
question_df = question_df.sort_values('id')
question_df.head(6)

Unnamed: 0,id,question1
0,0,What is the step by step guide to invest in sh...
0,0,What is the step by step guide to invest in sh...
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...
1,1,What would happen if the Indian government sto...
2,2,How can I increase the speed of my internet co...
2,2,How can Internet speed be increased by hacking...


Let's now calcualte the tf-idf term matrix.

In [61]:
tf = TfidfVectorizer()

question_tf = tf.fit_transform(question_df['question1'])

In [62]:
# first 10 terms
tf.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '000000',
 '00000000',
 '0000000000',
 '0000001',
 '00000074',
 '0000021210',
 '00001']

In [63]:
# last 10 terms
tf.get_feature_names()[-10:]

['분위기', '불타오르네', '슬마', '심하잖아', '이정현', '친구해도', '쾌지나칭칭나네', '하지만', '한글', 'ﬁnd']

In [64]:
# total terms
len(tf.get_feature_names())

86150

Lots of words, but some cleanup will probably needed given the numbers, and what appears to be Korean characters.

Let's now transform the 86,150 into a 100 dimension matrix with NMF

In [65]:
nmf = NMF(n_components=20, verbose=True)

question_nmf = nmf.fit_transform(question_tf)

violation: 1.0
violation: 0.42240262588085903
violation: 0.2942718246718631
violation: 0.18655910085084124
violation: 0.121261883550437
violation: 0.08672423561452133
violation: 0.06656799204998462
violation: 0.05224402001840111
violation: 0.04329858747112655
violation: 0.038354657038831036
violation: 0.03459003451153603
violation: 0.03093199367926663
violation: 0.027424938643366523
violation: 0.02393247256721756
violation: 0.020735893458415356
violation: 0.017867243888563512
violation: 0.015303153865422267
violation: 0.013048250047164077
violation: 0.01114851742822003
violation: 0.009531273696985133
violation: 0.008148825471230513
violation: 0.006864251207376008
violation: 0.005807410586736534
violation: 0.004930069573083649
violation: 0.004114396356012972
violation: 0.0034272332686147468
violation: 0.0028983928271391762
violation: 0.002468711188727635
violation: 0.0021079392028253963
violation: 0.0018044447823109626
violation: 0.0015482621364678554
violation: 0.00133086320705158
viol

In [66]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [67]:
print_top_words(nmf, tf.get_feature_names(), 10)

Topic #0: of the what meaning most world all was life rid
Topic #1: do how get people become with think start from find
Topic #2: are some what ways good examples that about things books
Topic #3: you have if ever would think when thing that know
Topic #4: to way learn want ways get need from lose weight
Topic #5: it like be possible feel an at to with that
Topic #6: why people so not we do did don many and
Topic #7: in india the world there engineering where life job which
Topic #8: can how get where we find one learn be become
Topic #9: is what the way thing there or most better an
Topic #10: and between difference the 500 notes 1000 what differences rupee
Topic #11: my improve english account skills me password if gmail increase
Topic #12: for prepare good year 2017 exam an free book engineering
Topic #13: on quora questions question ask answer answers google asked delete
Topic #14: does mean how work what have feel much one long
Topic #15: money make online earn from ways how 1000 

We now have mapped the term matrix into a 20 topic space. Let's now calculate the `cosine_similarity` between each pair of questions. The goal is to determine if `cosine_similarity` will indicate whether or not the pair of questions have the same intent.

In [74]:
odd_idx = [i for i in range(question_nmf.shape[0]) if i % 2 == 1]
even_idx = [i for i in range(question_nmf.shape[0]) if i % 2 == 0]
len(odd_idx)

404267

In [85]:
# takes a long time...probably a better way to do this calculation
sim_list = []
for i in range(len(odd_idx)):
    q1 = question_nmf[odd_idx[i]].reshape(1,-1)
    q2 = question_nmf[even_idx[i]].reshape(1,-1)
    sim = cosine_similarity(q1, q2)
    sim_list.append(sim[0, 0])
sim_list[:10]

[0.90378566222836,
 0.3807641162406918,
 0.5020035439261036,
 0.17318937545084193,
 0.6538234746746348,
 0.9131094031253592,
 0.016503444580548966,
 0.15402678346824233,
 0.8575985480199524,
 0.03296844155586009]

In [87]:
train_df_cosine = pd.concat([train_df, pd.Series(sim_list)], axis=1)
train_df_cosine = train_df_cosine.rename(columns={0:'cosine_similarity'})
train_df_cosine.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,cosine_similarity
0,0.0,1.0,2.0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0,66.0,57.0,0.903786
1,1.0,3.0,4.0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0,51.0,88.0,0.380764
2,2.0,5.0,6.0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0,73.0,59.0,0.502004
3,3.0,7.0,8.0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0,50.0,65.0,0.173189
4,4.0,9.0,10.0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0,76.0,39.0,0.653823


Let's look at the first pair. The cosine similarity is very high, but has identified as not the same intent.

In [94]:
print(train_df_cosine.loc[:, 'question1'].head(1).values)
print(train_df_cosine.loc[:, 'question2'].head(1).values)

['What is the step by step guide to invest in share market in india?']
['What is the step by step guide to invest in share market?']


Ah! These two questions are exactly the same except for the last two words. Maybe this is an outlier. Let's plot the similarity distribution for each class.

In [101]:
train_df_cosine.tail() # I messed something up

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,cosine_similarity
404285,404285.0,433578.0,379845.0,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0.0,85.0,79.0,
404286,404286.0,18840.0,155606.0,Do you believe there is life after death?,Is it true that there is life after death?,1.0,41.0,42.0,
404287,404287.0,537928.0,537929.0,What is one coin?,What's this coin?,0.0,17.0,17.0,
404288,404288.0,537930.0,537931.0,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0.0,94.0,127.0,
404289,404289.0,537932.0,537933.0,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0.0,37.0,45.0,
