In [5]:
import pandas as pd
import numpy as np
from utils import save

from sklearn.model_selection import train_test_split

In [6]:
# List the contents of the data directory before choosing the file to load.
! ls -l ../data/

total 61920
drwxrwxr-x 3 ubuntu ubuntu     4096 Dec  7 23:12 pkl
-rw-r--r-- 1 ubuntu ubuntu 63399110 Dec  7 23:40 train.csv


In [7]:
# Location of the raw training CSV, relative to this notebook; reused by the
# pandas load and by the shell sanity checks further down.
TRAIN_PATH = '../data/train.csv'

In [8]:
# Load the full training set with pandas' default parsing options.
train_df = pd.read_csv(TRAIN_PATH)

In [9]:
# Peek at the first rows to see which columns are available.
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


Let's see how long these questions are.

In [10]:
# Add one character-count column per question column, then summarise both.
for text_col, len_col in (('question1', 'q1_len'), ('question2', 'q2_len')):
    train_df[len_col] = train_df[text_col].str.len()

train_df[['q1_len', 'q2_len']].describe()

Unnamed: 0,q1_len,q2_len
count,404289.0,404288.0
mean,59.536856,60.108663
std,29.940546,33.86369
min,1.0,1.0
25%,39.0,39.0
50%,52.0,51.0
75%,72.0,72.0
max,623.0,1169.0


There seem to be some very short questions. Let's take a look at them.

In [11]:
# Rows whose first question is exactly one character long.
train_df.loc[train_df['q1_len'].eq(1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
3306,3306,6553,6554,.,Why is Cornell's endowment the lowest in the I...,0,1.0,56.0
13016,13016,25026,25027,?,Why should one not work at Google?,0,1.0,34.0
20794,20794,39204,39205,?,What is the Gmail tech support help phone number?,0,1.0,49.0
96725,96725,161071,161072,?,"Who are Moses, Noah and Exodus?",0,1.0,31.0
104101,104101,171925,171926,?,"Problem of solving a problem is not a problem,...",0,1.0,149.0
108978,108978,178936,178937,i,What questions to ask any drdummer?,0,1.0,35.0
115347,115347,188110,52215,o,Where can I watch free streaming movies online?,0,1.0,47.0
134403,134403,214814,214815,?,What is your take on the undercover report tha...,0,1.0,144.0
151922,151922,188110,238787,o,What is this - “This website/URL has been bloc...,0,1.0,132.0
158778,158778,247989,48850,A,Do men forgive their wives when they cheat on ...,0,1.0,51.0


In [12]:
# Rows whose second question is exactly one character long.
train_df.loc[train_df['q2_len'].eq(1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
47056,47056,84067,84068,Is there anywhere in the world offering pain m...,?,0,117.0,1.0
198913,198913,300250,188110,What is this?,o,0,13.0,1.0


Let's confirm we did not make a mistake reading the CSV file.

In [13]:
! cat {TRAIN_PATH} | head -n 47060 | tail -n 1

"47056","84067","84068","Is there anywhere in the world offering pain management for peripheral neuropathy as opioids haved been banned in US?","?","0"
cat: write error: Broken pipe


The number of questions does not seem to match the number of lines in `train.csv` (id 47056 was found on line 47060). Let's confirm this.

In [14]:
print('Rows:',train_df.shape[0])
! wc -l {TRAIN_PATH}

Rows: 404290
404302 ../data/train.csv


Let's confirm the last row id is present in both.

In [15]:
print(train_df['id'].tail(1))
! cat {TRAIN_PATH} | tail -n 1

404289    404289
Name: id, dtype: int64
"404289","537932","537933","What is like to have sex with cousin?","What is it like to have sex with your cousin?","0"


In [16]:
# Smallest id in the parsed frame, to check where the id numbering starts.
train_df['id'].min()

0

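This seems accurate. The `id` column in `train.csv` is auto-incrementing and starts at 0, and its maximum value matches the last row in the flat file, so every row was read. The extra lines reported by `wc -l` are presumably the header row plus line breaks embedded inside quoted question text.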

I am going to remove all single-character questions, as they would only add noise to my classifier.

In [17]:
# Flag pairs where either question is a single character, then keep the rest.
# Rows with a missing length are not flagged here, so they are kept for now.
is_single_char = train_df['q1_len'].eq(1) | train_df['q2_len'].eq(1)
train_df = train_df[~is_single_char]

train_df.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate,q1_len,q2_len
count,404269.0,404269.0,404269.0,404269.0,404268.0,404267.0
mean,202146.488467,217244.037886,220958.026858,0.369217,59.53958,60.108641
std,116708.990923,157752.561226,159904.900421,0.482593,29.938408,33.863037
min,0.0,1.0,2.0,0.0,2.0,2.0
25%,101072.0,74436.0,74727.0,0.0,39.0,39.0
50%,202149.0,192183.0,197055.0,0.0,52.0,51.0
75%,303220.0,346574.0,354697.0,1.0,72.0,72.0
max,404289.0,537932.0,537933.0,1.0,623.0,1169.0


In [18]:
# Column dtypes and non-null counts after removing the single-character questions.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404269 entries, 0 to 404289
Data columns (total 8 columns):
id              404269 non-null int64
qid1            404269 non-null int64
qid2            404269 non-null int64
question1       404268 non-null object
question2       404267 non-null object
is_duplicate    404269 non-null int64
q1_len          404268 non-null float64
q2_len          404267 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 27.8+ MB


There appear to be some null values among the questions. Let's take a look at these rows and confirm whether we can drop them.

In [19]:
# Rows where the first question is missing.
train_df.loc[train_df['question1'].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0,,123.0


In [20]:
# Rows where the second question is missing.
train_df.loc[train_df['question2'].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
105780,105780,174363,174364,How can I develop android app?,,0,30.0,
201841,201841,303951,174364,How can I create an Android app?,,0,32.0,


In [21]:
!cat {TRAIN_PATH} | head -n 105785 | tail -n 1

"105780","174363","174364","How can I develop android app?","","0"
cat: write error: Broken pipe


It appears there are indeed some blank questions. Any row with a blank question on either side of the pair will be dropped.

In [22]:
# Drop only the pairs where a question text is missing. Scoping the check to the
# two question columns keeps a NaN in any other column from silently removing rows.
train_df = train_df.dropna(subset=['question1', 'question2'])
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404266 entries, 0 to 404289
Data columns (total 8 columns):
id              404266 non-null int64
qid1            404266 non-null int64
qid2            404266 non-null int64
question1       404266 non-null object
question2       404266 non-null object
is_duplicate    404266 non-null int64
q1_len          404266 non-null float64
q2_len          404266 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 27.8+ MB


How many pairs of questions were flagged as duplicates in the dataset?

In [23]:
# Share of question pairs labelled as duplicates.
len(train_df.loc[train_df['is_duplicate'].eq(1)]) / len(train_df)

0.3692197711407835

About 37% of the question pairs are identified as duplicates. The training data is not that imbalanced.

Let's now sort by id, perform train / test split and save the altered data frame.

In [24]:
def _sort_split_by_id(features, labels):
    """Reorder one split so that features and labels are both sorted by `id`.

    Parameters
    ----------
    features : pandas.DataFrame
        Split features; must contain an `id` column.
    labels : numpy.ndarray
        Labels positionally aligned with the rows of `features`.

    Returns
    -------
    tuple
        `(features, labels)` in ascending `id` order. The frame keeps its
        original index labels.

    Raises
    ------
    ValueError
        If `features` and `labels` do not have the same length.
    """
    if len(features) != len(labels):
        raise ValueError('features and labels must have the same length')
    # A single stable permutation is applied to both objects, so each row and
    # its label cannot drift apart even if an id were ever repeated.
    order = np.argsort(features['id'].values, kind='mergesort')
    return features.iloc[order], labels[order]


train_df = train_df.sort_values('id')
X = train_df.loc[:, ['id', 'question1', 'question2']]
y = train_df['is_duplicate'].values

# Stratify on the label so both splits keep the same duplicate ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# need to sort by id to keep consistent with the stacking and unstacking of questions
X_train, y_train = _sort_split_by_id(X_train, y_train)
X_test, y_test = _sort_split_by_id(X_test, y_test)

In [25]:
# Persist the cleaned frame and each split under its own name.
outputs = (
    (train_df, 'train'),
    (X_train, 'X_train'),
    (X_test, 'X_test'),
    (y_train, 'y_train'),
    (y_test, 'y_test'),
)
for obj, name in outputs:
    save(obj, name)