In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


# Imports

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

#  Extracting Data 

In [4]:
!unzip -q '/kaggle/input/quora-question-pairs/train.csv.zip' -d '/kaggle/working/'

In [5]:
train_df = pd.read_csv("/kaggle/working/train.csv")
print(train_df.shape)
train_df.sample(10)

(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
306284,306284,429772,429773,Which colleges in India offer Genetical Engine...,Where would it be sensible and profitable to i...,0
248522,248522,361967,60986,"All my dreams are coming true, literally. If I...",How do I make dreams come true successfully?,0
295165,295165,417170,417171,How do I login to Facebook?,How do I login to Facebook without my email?,0
257823,257823,30047,29055,I have an untreatable and life-long disabling ...,What is the easiest and painless way to commit...,1
319412,319412,264822,121750,What is the value of 0?,What is the value of 1÷0?,0
374236,374236,505118,92173,ACA or ICSA? I am unsure which of either would...,I am planning to do a certified computer cours...,0
220477,220477,327574,327575,What is the difference between stable and expe...,Why is the Tor browser slower than others?,0
162515,162515,252984,252985,Which type of schools do you need to attend to...,What type of schools do you need to attend to ...,1
136456,136456,34627,217722,Does Tinder really work in India?,How do I win a girl through Tinder in India?,0
152833,152833,240035,240036,Is the skanda avatar is before or after krishn...,What will be the result of the WC '14 if Neyma...,0


# Preprocessing

In [7]:
new_df = train_df.sample(10000)
new_df.sample(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
185604,185604,96918,198462,When will India take back PoK from Pakistan?,Why India not demand POK (pakistan occupied Ka...,1
140302,140302,222982,222983,What is the connection between operating syste...,How penguin publishing company manage all book...,0
177553,177553,272923,272924,Where was first guitar made?,Who made the first guitar?,0
244653,244653,41013,117216,How do I get that peace of mind?,What do you do to achieve peace of mind?,1
396434,396434,194511,160426,Is the intelligence acquired or inherited?,How much intelligence is inherited?,1


In [9]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [11]:
question_df = new_df[['question1','question2']]
print(question_df.shape)
question_df.head()

(10000, 2)


Unnamed: 0,question1,question2
378930,What were some of the movies you’ve watched ag...,What are the some movies that You Can Not watc...
67738,How should I live happily at home?,How do I live happily with no friends?
307561,Is imo video call saved on server or can anyon...,Can a IMO call can be seen by others without t...
314275,What is the problem with Zee News channel?,Is zee news biased towards right wing?
139109,Do class 12 marks matter for IIT?,Does board marks of 12 class count in JEE 2018?


In [14]:
documents = pd.Series(question_df.question1.to_list() + question_df.question2.to_list())
print(documents.shape)
documents[0:5]

(20000,)


0    What were some of the movies you’ve watched ag...
1                   How should I live happily at home?
2    Is imo video call saved on server or can anyon...
3           What is the problem with Zee News channel?
4                    Do class 12 marks matter for IIT?
dtype: object

In [25]:
cv = CountVectorizer(max_features=3000)

vectorized_documents = cv.fit_transform(documents).toarray()
vectorized_documents.shape

(20000, 3000)

In [27]:
vec_size = vectorized_documents.shape[0]
question1_vectors = vectorized_documents[0:int(vec_size/2)]
question2_vectors = vectorized_documents[int(vec_size/2):]

In [31]:
print(question1_vectors.shape)
print(question2_vectors.shape)

(10000, 3000)
(10000, 3000)


In [56]:
vector_question1_df = pd.DataFrame(question1_vectors)
vector_question2_df = pd.DataFrame(question2_vectors)

vector_question_df = pd.concat([vector_question1_df,vector_question2_df],axis=1)

column_names = [str(i) for i in range(6000)]
vector_question_df.columns= column_names
vector_question_df.index = question_df.index
vector_question_df['is_duplicate'] = new_df.is_duplicate

print(vector_question_df.shape)
vector_question_df.head()

(10000, 6001)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5991,5992,5993,5994,5995,5996,5997,5998,5999,is_duplicate
378930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
307561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
314275,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
139109,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train test split

In [57]:
xtrain,xtest,ytrain,ytest = train_test_split(vector_question_df.iloc[:,0:-1],vector_question_df.iloc[:,-1], random_state=42, test_size=0.2)

print("xtrain.shape : ",xtrain.shape)
print("xtest.shape : ",xtest.shape)
print("ytrain.shape : ",ytrain.shape)
print("ytest.shape : ",ytest.shape)

xtrain.shape :  (8000, 6000)
xtest.shape :  (2000, 6000)
ytrain.shape :  (8000,)
ytest.shape :  (2000,)


# Random forest model

In [58]:
rf = RandomForestClassifier()
rf.fit(xtrain,ytrain)

In [59]:
ytrain_pred = rf.predict(xtrain)
ytest_pred = rf.predict(xtest)

In [60]:
print("train accuracy = ",accuracy_score(ytrain_pred,ytrain))
print("test accuracy = ",accuracy_score(ytest_pred,ytest))

train accuracy =  0.999
test accuracy =  0.7075


# XGBoost model

In [61]:
xgb = XGBClassifier()
xgb.fit(xtrain,ytrain)

In [62]:
train_pred = xgb.predict(xtrain)
test_pred = xgb.predict(xtest)

In [64]:
print("train accuracy = ",accuracy_score(train_pred,ytrain))
print("test accuracy = ",accuracy_score(test_pred,ytest))

train accuracy =  0.804875
test accuracy =  0.7165
