## Import packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Read the data

In [2]:
# read the data
train = pd.read_csv('data/train.csv')

In [3]:
# first look

In [4]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
# summary information about the data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


## Todo : summary of data variable
###
###

In [6]:
# check for duplicated ids
duplicated_id = sum(train.duplicated('id'))
duplicated_qid1 = sum(train.duplicated('qid1'))
duplicated_qid2 = sum(train.duplicated('qid2'))
print(f"The number of duplicated 'id' is: {duplicated_id}")
print(f"The number of duplicated 'qid1' is {duplicated_qid1}")
print("The number of duplicated 'qid2' is {duplicated_qid2}")

The number of duplicated 'id' is: 0
The number of duplicated 'qid1' is 113636
The number of duplicated 'qid2' is {duplicated_qid2}


In [7]:
# let's remove missing value
q1_nan = sum(train.question1.isnull())
q2_nan = sum(train.question2.isnull())
print(f'Question 1 has {q1_nan} missing values')
print(f'Question 2 has {q2_nan} missing values')

Question 1 has 0 missing values
Question 2 has 2 missing values


In [8]:
# question 2 missing entries
train[train.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [9]:
# question 1 missing entries
train[train.question1.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [10]:
# let's remove question 2 missing entries
train.drop(train.index[[105780,201841]], inplace=True)

# verify 
train[train.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [11]:
# Let's look at the repeated questions in each columns.

# get the number question1 ids that appear more than once
q1 = train['qid1']
q1_dup = train[q1.isin(q1[q1.duplicated()])].shape[0]
print(f'The total number of qid1 that appears more than once is {q1_dup}')

# get the number of question2 ids that appears more than once
q2 = train['qid2']
q2_dup = train[q1.isin(q1[q1.duplicated()])].shape[0]
print(f'The number of qid2 that appears more than once is {q2_dup}')

The total number of qid1 that appears more than once is 167707
The number of qid2 that appears more than once is 167707


In [12]:
# Now let's look at the questions corresponding to those ids
train_q1_dup = train[q1.isin(q1[q1.duplicated()])].sort_values('qid1')
train_q2_dup = train[q2.isin(q2[q2.duplicated()])].sort_values('qid2')

In [13]:
# let's check
print('First 5 entries for qid1')
train_q1_dup.head()

First 5 entries for qid1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
359232,359232,3,488853,What is the story of Kohinoor (Koh-i-Noor) Dia...,Could India keep the Koh-I-Noor safe?,0
263614,263614,3,380197,What is the story of Kohinoor (Koh-i-Noor) Dia...,What are some interesting facts about Kohinoor...,0
184732,184732,3,282170,What is the story of Kohinoor (Koh-i-Noor) Dia...,Is it possible to melt down diamonds?,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [14]:
print('Last 5 entries for qid1')
train_q1_dup.tail()

Last 5 entries for qid1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
403750,403750,532883,537349,What does “天人” mean in English?,What does ''bientot'' mean in English?,0
400745,400745,534098,78019,What is a career path?,What is the career path of a recruiter?,0
403461,403461,534098,537043,What is a career path?,What is the career path for a doctor?,0
404120,404120,535331,254941,What should I say when someone is expressing c...,"What does it mean to be ""people smart""?",0
401886,401886,535331,535332,What should I say when someone is expressing c...,How can you tell when someone is faking confid...,0


In [15]:
print('First 5 entries for qid2')
train_q2_dup.head()

First 5 entries for qid2


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
402369,402369,38436,18,"When do you use ""into"" instead of ""in to""?","When do you use ""&"" instead of ""and""?",0
65735,65735,114035,26,Is there a way to make learning physics easier?,How can you make physics easy to learn?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [16]:
print('Last 5 entries for qid2')
train_q2_dup.tail()

Last 5 entries for qid2


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
399312,399312,219590,532565,How do astronauts shower in space?,How do astronauts bathe in space?,1
400906,400906,534275,533534,"What is the origin of the saying ""knock on wood""?","What is the origin of ""knocking on wood""?",1
400191,400191,533533,533534,Why is knocking on wood a part of many cultures?,"What is the origin of ""knocking on wood""?",0
403058,403058,177065,534864,What are mind-boggling facts about rich people?,What are some mind boggling facts about billio...,1
401473,401473,177066,534864,What are some mind-boggling facts about rich p...,What are some mind boggling facts about billio...,1


## say something about your analysis
##
##
##
##
##


In [21]:
# duplicate pair
q1_ids = set(train_q2_dup['id'])
q2_ids = set(train_q1_dup['id'])


In [25]:
dup_pairs = list(q1_ids & q2_ids)

In [26]:
len(dup_pairs)

99870

In [54]:
dup = train.filter(items=dup_pairs,  axis=0).sort_values(['qid1', 'qid2'], ascending=[True, False])

In [55]:
dup.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
126071,126071,9,109465,"Which one dissolve in water quikly sugar, salt...","Which freezes faster, sugar water or salt wat...",0
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
80628,80628,29,44255,What are the laws to change your status from a...,What are the laws to change your status from a...,0
14,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
238955,238955,31,93145,What would a Trump presidency mean for current...,Will the Trump factor affect the admission of ...,1
215442,215442,31,88834,What would a Trump presidency mean for current...,How would a Trump presidency affect schools ac...,1
364513,364513,31,12544,What would a Trump presidency mean for current...,How will Trump's presidency affect prospective...,1
238721,238721,31,11435,What would a Trump presidency mean for current...,What will happen to international students in ...,1
160090,160090,31,6937,What would a Trump presidency mean for current...,How will Trump’s presidency affect internation...,1
218075,218075,31,1101,What would a Trump presidency mean for current...,How would Trump presidency affect Indian stude...,1


In [48]:
dup2 = train.filter(items=dup_pairs,  axis=0).sort_values(['qid2'])
dup2.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
402369,402369,38436,18,"When do you use ""into"" instead of ""in to""?","When do you use ""&"" instead of ""and""?",0
65735,65735,114035,26,Is there a way to make learning physics easier?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1
185471,185471,261411,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
14,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0


In [None]:
# check if we have duplicated questions pairs

In [80]:
d = train.duplicated(['qid1', 'qid2']) 
ddup = train[d]
ddup

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


from above analysis we don't have any duplicated pairs of question.
### write some conclusion
# todo visualizations
#
#

## Test analysis

Now let's analyse text data