In [16]:
# data manipulation
from utils import save, load
import pandas as pd

# modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline

In [2]:
train_df = load('train')
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66.0,57.0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51.0,88.0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73.0,59.0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50.0,65.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76.0,39.0


Let's make a stack of questions maintaining the `id` of the question pair.

In [3]:
question_df = train_df.loc[:, ['id', 'question1']]
question_df = question_df.append(train_df.loc[:,['id', 'question2']], sort=False)
question_df.loc[question_df['question1'].isna(), 'question1'] = question_df.loc[question_df['question1'].isna(), 'question2']
question_df = question_df.drop(columns='question2')
question_df = question_df.sort_values('id')
question_df.head(6)

Unnamed: 0,id,question1
0,0,What is the step by step guide to invest in sh...
0,0,What is the step by step guide to invest in sh...
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...
1,1,What would happen if the Indian government sto...
2,2,How can I increase the speed of my internet co...
2,2,How can Internet speed be increased by hacking...


Let's now calcualte the tf-idf term matrix.

In [4]:
tf = TfidfVectorizer(stop_words='english', token_pattern='\\b[a-zA-Z0-9][a-zA-Z0-9]+\\b')

question_tf = tf.fit_transform(question_df['question1'])

In [5]:
# first 10 terms
tf.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '000000',
 '00000000',
 '0000000000',
 '0000001',
 '00000074',
 '0000021210',
 '00001']

In [6]:
# last 10 terms
tf.get_feature_names()[-10:]

['zyl',
 'zylber',
 'zynga',
 'zyropathy',
 'zyrtec',
 'zyzz',
 'zz',
 'zzz',
 'zzzquil',
 'zzzz']

In [7]:
# total terms
len(tf.get_feature_names())

84697

Lots of words, but some cleanup will probably needed given the numbers, and what appears to be Korean characters.

Let's now transform the 86,150 into a 20 dimension matrix with NMF

In [27]:
def calc_NMF_sim(n_components, col_name, tf_df, df):
    nmf = NMF(n_components=n_components)

    nmf_topics = nmf.fit_transform(tf_df)

    odd_idx = [i for i in range(nmf_topics.shape[0]) if i % 2 == 1]
    even_idx = [i for i in range(nmf_topics.shape[0]) if i % 2 == 0]

    # takes a long time...probably a better way to do this calculation
    sim_list = [1 - distance.cosine(nmf_topics[odd_idx[i]], nmf_topics[even_idx[i]]) 
                for i in range(len(odd_idx))]

#     sim_list = [cosine_similarity(
#                                     nmf_topics[odd_idx[i]].reshape(1,-1),
#                                     nmf_topics[even_idx[i]].reshape(1,-1)
#                 )[0,0]
#                 for i in range(len(odd_idx))]

#     sim_list = []
#     for i in range(len(odd_idx)):
#         q1 = nmf_topics[odd_idx[i]].reshape(1,-1)
#         q2 = nmf_topics[even_idx[i]].reshape(1,-1)
#         sim = cosine_similarity(q1, q2)
#         sim_list.append(sim[0, 0])

    df = pd.concat([df.sort_values('id').reset_index(), pd.Series(sim_list)], axis=1)
    df = df.rename(columns={0:col_name})
    
    return df

In [28]:
train_df_cosine = calc_NMF_sim(5, 'cos_sim_5', question_tf, train_df)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [13]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66.0,57.0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51.0,88.0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73.0,59.0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50.0,65.0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76.0,39.0


In [29]:
train_df_cosine[train_df_cosine['cos_sim_5'].isna()] ## have some null values when using distance

Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,cos_sim_5
115,115,115,231,232,Why do we cry when we are happy and when we ar...,Why do we cry?,0,52.0,14.0,
154,154,154,309,310,How forgetful are you?,How can I forget my wife?,0,22.0,25.0,
3103,3103,3103,6153,6154,What is formal theory?,What is formalism?,0,22.0,18.0,
4241,4242,4242,8388,8389,What is the importance of formatting a document?,What is SEL?,0,48.0,12.0,
4641,4642,4642,9169,9170,Where is Nangli Wazidpur?,What is Nangli Wazidpur?,0,25.0,24.0,
6881,6882,6882,13469,13470,What is a lobotomy?,Where can I get a lobotomy?,0,19.0,27.0,
7119,7120,7120,13921,13922,Is it proper to use a comma after saying thank...,What is here and not there?,0,51.0,27.0,
7409,7410,7410,14479,232,Why don't you cry?,Why do we cry?,0,18.0,14.0,
7819,7820,7820,15264,15265,Why and how is 0! =1?,Why is 0! equal to 1?,1,21.0,21.0,
8054,8055,8055,15715,15716,What is a boobsize?,Why hasn't my sunburn faded yet?,0,19.0,32.0,


In [24]:
train_df_cosine.head()

Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,cos_sim_5
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66.0,57.0,0.588704
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51.0,88.0,0.83445
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73.0,59.0,0.955411
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50.0,65.0,0.89252
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76.0,39.0,0.994637


In [None]:
nmf = NMF(n_components=50)

question_nmf = nmf.fit_transform(question_tf)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [None]:
print_top_words(nmf, tf.get_feature_names(), 10)

We now have mapped the term matrix into a 20 topic space. Let's now calculate the `cosine_similarity` between each pair of questions. The goal is to determine if `cosine_similarity` will indicate whether or not the pair of questions have the same intent.

In [None]:
odd_idx = [i for i in range(question_nmf.shape[0]) if i % 2 == 1]
even_idx = [i for i in range(question_nmf.shape[0]) if i % 2 == 0]
print(len(odd_idx))
train_df.shape[0]

In [None]:
# takes a long time...probably a better way to do this calculation
sim_list = []
for i in range(len(odd_idx)):
    q1 = question_nmf[odd_idx[i]].reshape(1,-1)
    q2 = question_nmf[even_idx[i]].reshape(1,-1)
    sim = cosine_similarity(q1, q2)
    sim_list.append(sim[0, 0])
sim_list[:10]

In [None]:
train_df_cosine = pd.concat([train_df.sort_values('id').reset_index(), pd.Series(sim_list)], axis=1)
train_df_cosine = train_df_cosine.rename(columns={0:'cosine_similarity'})
train_df_cosine.head()

Let's look at the first pair. The cosine similarity is very high, but has identified as not the same intent.

In [None]:
print(train_df_cosine.loc[:, 'question1'].head(1).values)
print(train_df_cosine.loc[:, 'question2'].head(1).values)

Ah! These two questions are exactly the same except for the last two words. Maybe this is an outlier. Let's plot the similarity distribution for each class.

In [None]:
sns.kdeplot(train_df_cosine.loc[train_df_cosine['is_duplicate'] == 0, 'cosine_similarity'], 
             shade=True, 
             label = 'No Intent')
sns.kdeplot(train_df_cosine.loc[train_df_cosine['is_duplicate'] == 1, 'cosine_similarity'], 
             shade=True,
             label = 'Intent')

plt.xlabel('cosine similarity')
plt.ylabel('density')
plt.title('KDE comparing pairs with intent and no intent')
;

The distribution of cosine similarity is somewhat different, but there is a lot of overlap. This will be difficult to classify the pairs. Let's take a look at an example where the pair has a cosine similarity of 0 and is marked as a duplicate.

In [None]:
print(train_df_cosine.loc[
    (train_df_cosine['cosine_similarity'] == 0) & (train_df_cosine['is_duplicate'] == 1),
    'question1'
    ].head(1).values)

In [None]:
print(train_df_cosine.loc[
    (train_df_cosine['cosine_similarity'] == 0) & (train_df_cosine['is_duplicate'] == 1),
    'question2'
    ].head(1).values)

In [None]:
dup_cos_zero_df = train_df_cosine[
    (train_df_cosine['cosine_similarity'] == 0) & (train_df_cosine['is_duplicate'] == 1)]
dup_cos_zero_df

This seems odd....

In [None]:
def stack_questions(df):
    comb_df = df.loc[:, ['id', 'question1']]
    comb_df = comb_df.append(df.loc[:,['id', 'question2']], sort=False)
    comb_df.loc[comb_df['question1'].isna(), 'question1'] = comb_df.loc[comb_df['question1'].isna(), 'question2']
    comb_df = comb_df.drop(columns='question2')
    comb_df = comb_df.sort_values('id')
    return comb_df

In [None]:
comb_cos_zero_df = stack_questions(dup_cos_zero_df)
comb_cos_zero_df.head()

In [None]:
tf_sub = TfidfVectorizer(stop_words='english')
tf_cos_zero = tf_sub.fit_transform(comb_cos_zero_df.loc[:, 'question1'])

nmf_sub = NMF(n_components=20)
nmf_cos_zero = nmf_sub.fit_transform(tf_cos_zero)

cosine_similarity(nmf_cos_zero[0].reshape(1, -1), nmf_cos_zero[1].reshape(1, -1))