## Import packages

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
import re, sys

## Read the data

In [4]:
# read the data
train = pd.read_csv('../data/train.csv')

In [5]:
# first look

In [6]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
# summary information about the data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


## Todo : summary of data variable
###
###

In [9]:
# check for duplicated ids
duplicated_id = sum(train.duplicated('id'))
duplicated_qid1 = sum(train.duplicated('qid1'))
duplicated_qid2 = sum(train.duplicated('qid2'))
print(f"The number of duplicated 'id' is: {duplicated_id}")
print(f"The number of duplicated 'qid1' is {duplicated_qid1}")
print(f"The number of duplicated 'qid2' is {duplicated_qid2}")

The number of duplicated 'id' is: 0
The number of duplicated 'qid1' is 113636
The number of duplicated 'qid2' is 104926


In [10]:
# let's remove missing value
q1_nan = sum(train.question1.isnull())
q2_nan = sum(train.question2.isnull())
print(f'Question 1 has {q1_nan} missing values')
print(f'Question 2 has {q2_nan} missing values')

Question 1 has 0 missing values
Question 2 has 2 missing values


In [11]:
# question 2 missing entries
train[train.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [12]:
# question 1 missing entries
train[train.question1.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [13]:
# let's remove question 2 missing entries
train.drop(train.index[[105780,201841]], inplace=True)

# verify 
train[train.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [14]:
# Let's look at the repeated questions in each columns.

# get the number question1 ids that appear more than once
q1 = train['qid1']
q1_dup = train[q1.isin(q1[q1.duplicated()])].shape[0]
print(f'The total number of qid1 that appears more than once is {q1_dup}')

# get the number of question2 ids that appears more than once
q2 = train['qid2']
q2_dup = train[q1.isin(q1[q1.duplicated()])].shape[0]
print(f'The number of qid2 that appears more than once is {q2_dup}')

The total number of qid1 that appears more than once is 167707
The number of qid2 that appears more than once is 167707


In [15]:
# Now let's look at the questions corresponding to those ids
train_q1_dup = train[q1.isin(q1[q1.duplicated()])].sort_values('qid1')
train_q2_dup = train[q2.isin(q2[q2.duplicated()])].sort_values('qid2')

In [16]:
# let's check
print('First 5 entries for qid1')
train_q1_dup.head()

First 5 entries for qid1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
359232,359232,3,488853,What is the story of Kohinoor (Koh-i-Noor) Dia...,Could India keep the Koh-I-Noor safe?,0
263614,263614,3,380197,What is the story of Kohinoor (Koh-i-Noor) Dia...,What are some interesting facts about Kohinoor...,0
184732,184732,3,282170,What is the story of Kohinoor (Koh-i-Noor) Dia...,Is it possible to melt down diamonds?,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [17]:
print('Last 5 entries for qid1')
train_q1_dup.tail()

Last 5 entries for qid1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
403750,403750,532883,537349,What does “天人” mean in English?,What does ''bientot'' mean in English?,0
400745,400745,534098,78019,What is a career path?,What is the career path of a recruiter?,0
403461,403461,534098,537043,What is a career path?,What is the career path for a doctor?,0
404120,404120,535331,254941,What should I say when someone is expressing c...,"What does it mean to be ""people smart""?",0
401886,401886,535331,535332,What should I say when someone is expressing c...,How can you tell when someone is faking confid...,0


In [18]:
print('First 5 entries for qid2')
train_q2_dup.head()

First 5 entries for qid2


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
402369,402369,38436,18,"When do you use ""into"" instead of ""in to""?","When do you use ""&"" instead of ""and""?",0
65735,65735,114035,26,Is there a way to make learning physics easier?,How can you make physics easy to learn?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [19]:
print('Last 5 entries for qid2')
train_q2_dup.tail()

Last 5 entries for qid2


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
399312,399312,219590,532565,How do astronauts shower in space?,How do astronauts bathe in space?,1
400906,400906,534275,533534,"What is the origin of the saying ""knock on wood""?","What is the origin of ""knocking on wood""?",1
400191,400191,533533,533534,Why is knocking on wood a part of many cultures?,"What is the origin of ""knocking on wood""?",0
403058,403058,177065,534864,What are mind-boggling facts about rich people?,What are some mind boggling facts about billio...,1
401473,401473,177066,534864,What are some mind-boggling facts about rich p...,What are some mind boggling facts about billio...,1


## say something about your analysis
##
##
##
##
##


In [20]:
# duplicate pair
q1_ids = set(train_q2_dup['id'])
q2_ids = set(train_q1_dup['id'])


In [21]:
dup_pairs = list(q1_ids & q2_ids)

In [22]:
len(dup_pairs)

99870

In [23]:
dup = train.filter(items=dup_pairs,  axis=0).sort_values(['qid1', 'qid2'], ascending=[True, False])

In [24]:
dup.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
126071,126071,9,109465,"Which one dissolve in water quikly sugar, salt...","Which freezes faster, sugar water or salt wat...",0
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
80628,80628,29,44255,What are the laws to change your status from a...,What are the laws to change your status from a...,0
14,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
238955,238955,31,93145,What would a Trump presidency mean for current...,Will the Trump factor affect the admission of ...,1
215442,215442,31,88834,What would a Trump presidency mean for current...,How would a Trump presidency affect schools ac...,1
364513,364513,31,12544,What would a Trump presidency mean for current...,How will Trump's presidency affect prospective...,1
238721,238721,31,11435,What would a Trump presidency mean for current...,What will happen to international students in ...,1
160090,160090,31,6937,What would a Trump presidency mean for current...,How will Trump’s presidency affect internation...,1
218075,218075,31,1101,What would a Trump presidency mean for current...,How would Trump presidency affect Indian stude...,1


In [25]:
dup2 = train.filter(items=dup_pairs,  axis=0).sort_values(['qid2'])
dup2.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
402369,402369,38436,18,"When do you use ""into"" instead of ""in to""?","When do you use ""&"" instead of ""and""?",0
65735,65735,114035,26,Is there a way to make learning physics easier?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
185471,185471,261411,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
14,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0


In [26]:
# check if we have duplicated questions pairs

In [27]:
d = train.duplicated(['qid1', 'qid2']) 
ddup = train[d]
ddup

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


from above analysis we don't have any duplicated pairs of question.
### write some conclusion
# todo visualizations
#
#

## Text analysis

Now let's analyse text data

In [35]:
# analyse text data
question_1 = train['question1']
question_2 = train['question2']
is_dup = train['is_duplicate']

In [36]:
def clean_str(txt):
    txt = re.sub(r"[^A-Za-z0-9(),!?\'\`&%]", " ", txt)
    txt = re.sub(r"\'s", " \'s", txt)
    txt = re.sub(r"\'ve", " \'ve", txt)
    txt = re.sub(r"\'t", " n\'t", txt)
    txt = re.sub(r"\'re", " \'re", txt)
    txt = re.sub(r"\'d", " \'d", txt) 
    txt = re.sub(r"\'ll", " \'ll", txt) 
    txt = re.sub(r",", " , ", txt) 
    txt = re.sub(r"!", " ! ", txt) 
    txt = re.sub(r"\(", " ( ", txt) 
    txt = re.sub(r"\)", " ) ", txt) 
    txt = re.sub(r"\?", " ? ", txt) 
    txt = re.sub(r"\&", " & ", txt)
    txt = re.sub(r"\%", " percent ", txt)
    return txt.strip().lower()
    
    

In [37]:
question1_clean = question_1.apply(clean_str)
question2_clean = question_2.apply(clean_str)

In [38]:
for i in range(10):
    print(f"question 1: {question1_clean[i]}")
    print(f"question 2: {question2_clean[i]}")
    print(f"is duplicate: {is_dup[i]}")
    print()

question 1: what is the step by step guide to invest in share market in india ?
question 2: what is the step by step guide to invest in share market ?
is duplicate: 0

question 1: what is the story of kohinoor  ( koh i noor )  diamond ?
question 2: what would happen if the indian government stole the kohinoor  ( koh i noor )  diamond back ?
is duplicate: 0

question 1: how can i increase the speed of my internet connection while using a vpn ?
question 2: how can internet speed be increased by hacking through dns ?
is duplicate: 0

question 1: why am i mentally very lonely ?  how can i solve it ?
question 2: find the remainder when  math 23  24   math  is divided by 24 , 23 ?
is duplicate: 0

question 1: which one dissolve in water quikly sugar ,  salt ,  methane and carbon di oxide ?
question 2: which fish would survive in salt water ?
is duplicate: 0

question 1: astrology  i am a capricorn sun cap moon and cap rising   what does that say about me ?
question 2: i'm a triple capricorn 

In [39]:
# check size
q1_len = len(question1_clean)
q2_len = len(question2_clean)
print(f"question 1 length {q1_len}")
print(f"question 2 length {q2_len}")

question 1 length 404288
question 2 length 404288


In [40]:
question_1_len = question1_clean.apply(len)
question_2_len = question2_clean.apply(len)

In [44]:
# filter empty list
empty_q1_ind = question_1_len[question_1_len == 0] 
empty_q2_ind = question_2_len[question_2_len == 0]

In [43]:
empty_q1_ind

3306      0
190570    0
290090    0
301583    0
Name: question1, dtype: int64

In [45]:
empty_q2_ind

20072     0
144343    0
384293    0
Name: question2, dtype: int64

In [46]:
# print empty record
for i in empty_q1_ind:
    print(f"raw data question 1: {question_1[i]}")
    print(f"clean data question 1: {question1_clean[i]}")
    print(f"raw data question 2: {question_2[i]}")
    print(f"clean data question 2: {question2_clean[i]}")
    print(f"is duplicate 2: {is_dup[i]}")
    print()

raw data question 1: What is the step by step guide to invest in share market in india?
clean data question 1: what is the step by step guide to invest in share market in india ?
raw data question 2: What is the step by step guide to invest in share market?
clean data question 2: what is the step by step guide to invest in share market ?
is duplicate 2: 0

raw data question 1: What is the step by step guide to invest in share market in india?
clean data question 1: what is the step by step guide to invest in share market in india ?
raw data question 2: What is the step by step guide to invest in share market?
clean data question 2: what is the step by step guide to invest in share market ?
is duplicate 2: 0

raw data question 1: What is the step by step guide to invest in share market in india?
clean data question 1: what is the step by step guide to invest in share market in india ?
raw data question 2: What is the step by step guide to invest in share market?
clean data question 2: w

In [47]:
# question 2 
for i in empty_q2_ind:
    print(f"raw data question 1: {question_1[i]}")
    print(f"clean data question 1: {question1_clean[i]}")
    print(f"raw data question 2: {question_2[i]}")
    print(f"clean data question 2: {question2_clean[i]}")
    print(f"is duplicate 2: {is_dup[i]}")
    print()

raw data question 1: What is the step by step guide to invest in share market in india?
clean data question 1: what is the step by step guide to invest in share market in india ?
raw data question 2: What is the step by step guide to invest in share market?
clean data question 2: what is the step by step guide to invest in share market ?
is duplicate 2: 0

raw data question 1: What is the step by step guide to invest in share market in india?
clean data question 1: what is the step by step guide to invest in share market in india ?
raw data question 2: What is the step by step guide to invest in share market?
clean data question 2: what is the step by step guide to invest in share market ?
is duplicate 2: 0

raw data question 1: What is the step by step guide to invest in share market in india?
clean data question 1: what is the step by step guide to invest in share market in india ?
raw data question 2: What is the step by step guide to invest in share market?
clean data question 2: w

we will remove those entries

In [51]:
# vocabulary in question 1
q1_vocabulary = question1_clean.apply(lambda x: list(set(x.split())))
q1_vocabulary_len = q1_vocabulary.apply(len)

In [52]:
q1_vocabulary[:3]

0    [in, what, to, the, step, share, guide, india,...
1    [what, kohinoor, of, i, noor, the, koh, (, ?, ...
2    [how, i, of, can, vpn, increase, while, the, a...
Name: question1, dtype: object

In [53]:
q1_vocabulary_len[:3]

0    13
1    13
2    15
Name: question1, dtype: int64

In [54]:
# vocabulary in question 2
q2_vocabulary = question2_clean.apply(lambda x: list(set(x.split())))
q2_vocabulary_len = q2_vocabulary.apply(len)

In [56]:
q2_vocabulary[:3]

0    [in, what, to, the, step, share, guide, by, ma...
1    [would, what, kohinoor, i, noor, back, if, ind...
2    [increased, dns, how, can, be, internet, by, h...
Name: question2, dtype: object

In [57]:
q2_vocabulary_len[:5]

0    12
1    17
2    11
3    12
4     8
Name: question2, dtype: int64

In [61]:
voc_df = pd.DataFrame({'voc_1': q1_vocabulary, 'voc_2':q2_vocabulary})

In [62]:
voc_df.head()

Unnamed: 0,voc_1,voc_2
0,"[in, what, to, the, step, share, guide, india,...","[in, what, to, the, step, share, guide, by, ma..."
1,"[what, kohinoor, of, i, noor, the, koh, (, ?, ...","[would, what, kohinoor, i, noor, back, if, ind..."
2,"[how, i, of, can, vpn, increase, while, the, a...","[increased, dns, how, can, be, internet, by, h..."
3,"[how, why, i, can, mentally, it, lonely, very,...","[remainder, ,, find, 24, the, 23, math, divide..."
4,"[which, in, di, ,, oxide, water, methane, and,...","[would, which, in, water, fish, ?, survive, salt]"


In [66]:
shared_token = voc_df.apply(lambda row: list(set(row['voc_1']) & set(row['voc_2'])), axis=1 )

In [69]:
shared_token_len = shared_token.apply(len)

In [75]:
# check
for i in range(10):
    print(f"{q1_vocabulary[i]}")
    print(f"{q2_vocabulary[i]}")
    print(f"{shared_token[i]}")
    print(f"{shared_token_len[i]}")
    print()

['in', 'what', 'to', 'the', 'step', 'share', 'guide', 'india', 'by', 'market', 'invest', '?', 'is']
['in', 'what', 'to', 'the', 'step', 'share', 'guide', 'by', 'market', 'invest', '?', 'is']
['in', 'what', 'to', 'the', 'step', 'share', 'guide', 'by', 'market', 'invest', '?', 'is']
12

['what', 'kohinoor', 'of', 'i', 'noor', 'the', 'koh', '(', '?', 'diamond', 'story', ')', 'is']
['would', 'what', 'kohinoor', 'i', 'noor', 'back', 'if', 'indian', 'the', 'stole', 'koh', 'government', '(', 'diamond', 'happen', '?', ')']
['what', 'kohinoor', 'noor', 'i', 'the', 'koh', '(', 'diamond', '?', ')']
10

['how', 'i', 'of', 'can', 'vpn', 'increase', 'while', 'the', 'a', 'internet', 'using', 'my', 'connection', '?', 'speed']
['increased', 'dns', 'how', 'can', 'be', 'internet', 'by', 'hacking', '?', 'through', 'speed']
['how', 'can', 'internet', '?', 'speed']
5

['how', 'why', 'i', 'can', 'mentally', 'it', 'lonely', 'very', '?', 'am', 'solve']
['remainder', ',', 'find', '24', 'the', '23', 'math', 'div

In [89]:
import math
def average_word_len(arr):
    m = len(arr)
    if m > 0:
        l = [len(txt) for txt in arr]
        return round(float(sum(l))/m, 2) 
    else:
        return 0

In [90]:
q1_average_word_len = question1_clean.apply( lambda x: average_word_len(x.split()))
q2_average_word_len = question2_clean.apply( lambda x: average_word_len(x.split()))

In [92]:
q1_average_word_len[:3]

0    3.53
1    3.23
2    4.00
Name: question1, dtype: float64

In [93]:
q2_average_word_len[:3]

0    3.54
1    4.11
2    4.55
Name: question2, dtype: float64

In [94]:
shared_token_average_len = shared_token.apply(lambda x: average_word_len(x))

In [96]:
shared_token_average_len[:10]

0    3.50
1    3.30
2    4.00
3    1.00
4    3.40
5    3.55
6    1.00
7    2.80
8    3.14
9    4.20
dtype: float64

In [None]:
# create a new dataframe 

In [119]:
df = pd.DataFrame()

In [123]:
# question1 len
df['q1_len'] = question_1_len
# question 2 len
df['q2_len'] = question_2_len
# question 1 vocabulary len
df['q1_voc_len'] = q1_vocabulary_len
# question 2 vocabulary len
df['q2_voc_len'] = q2_vocabulary_len
# question 1 average word len
df['q1_average_word_len'] = q1_average_word_len
# question 2 average word len
df['q2_average_word_len'] = q2_average_word_len
# shared token len
df['shared_token_len'] = shared_token_len
#average word in shared token
df['shared_token_average_word_len'] = shared_token_average_len
# question 1
df['question_1'] = question1_clean.apply(lambda x: x.split())
# question 2
df['question_2'] =  question2_clean.apply(lambda x: x.split())

In [124]:
df['is_duplicate'] = is_dup

In [125]:
df.head()

Unnamed: 0,q1_len,q2_len,q1_voc_len,q2_voc_len,q1_average_word_len,q2_average_word_len,shared_token_len,shared_token_average_word_len,question_1,is_duplicate,question_2
0,67,58,13,12,3.53,3.54,12,3.5,"[what, is, the, step, by, step, guide, to, inv...",0,"[what, is, the, step, by, step, guide, to, inv..."
1,56,93,13,17,3.23,4.11,10,3.3,"[what, is, the, story, of, kohinoor, (, koh, i...",0,"[what, would, happen, if, the, indian, governm..."
2,74,60,15,11,4.0,4.55,5,4.0,"[how, can, i, increase, the, speed, of, my, in...",0,"[how, can, internet, speed, be, increased, by,..."
3,53,68,11,12,3.08,3.27,1,1.0,"[why, am, i, mentally, very, lonely, ?, how, c...",0,"[find, the, remainder, when, math, 23, 24, mat..."
4,81,40,15,8,4.0,4.12,5,3.4,"[which, one, dissolve, in, water, quikly, suga...",0,"[which, fish, would, survive, in, salt, water, ?]"


In [126]:
df.shape

(404288, 11)

In [127]:
train.shape

(404288, 6)

In [128]:
# save our data
df.to_pickle('quora.pickle')

In [129]:
## TODO
## some cleaning in my notebook later