# Quora Question Pairs


In this project, I will build regression model to predict the possibility of duplicates on question pairs. We will use the [Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs/data) dataset from Kaggle

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from gensim import models

In [4]:
data=pd.read_csv('train.csv')

In [5]:
data.shape

(404290, 6)

In [6]:
data.head(30)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [7]:
data=data.drop(['id','qid1','qid2'], axis=1)

I will check how many null values in data. 

In [8]:
np.sum(pd.isnull(data),axis=0)

question1       1
question2       2
is_duplicate    0
dtype: int64

In [9]:
#drop_index=[]
#for i in range(data.shape[0]):
#    if(any(pd.isnull(data.iloc[i]))):
#        print(i)
#        drop_index.append(i)
print(data.index[data.isna().any(axis=1)].tolist())
data.dropna(axis=0, how="any", inplace=True)

105780
201841
363362


next,i check this is balance data?

In [10]:
print('Duplicate pairs: {}%'.format(round(data['is_duplicate'].mean()*100, 2)))

Duplicate pairs: 36.92%


In [11]:
data_df=data.drop(drop_index)
np.sum(pd.isnull(data_df),axis=0)

question1       0
question2       0
is_duplicate    0
dtype: int64

In [12]:
import string

translator = str.maketrans('', '', string.punctuation)

def pre_process(text):
    return text.translate(translator).lower()
data_df["question1"] = data_df["question1"].apply(pre_process)
data_df["question2"] = data_df["question2"].apply(pre_process)


In [13]:
train_data=data_df.drop(['is_duplicate'], axis=1)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_1gram = TfidfVectorizer()
X1=vectorizer_1gram.fit_transform(train_data["question1"])

In [15]:
X2=vectorizer_1gram.transform(train_data["question2"])

In [16]:
labels=data_df['is_duplicate']

In [17]:
m=int(X1.shape[0]*0.8)
X1_train=X1[:m]
X2_train=X2[:m]
X1_test=X1[m:]
X2_test=X2[m:]
label_train=labels[:m]
label_test=labels[m:]


In [18]:
(label_test==1).sum()/len(label_test)

0.35972940216181454

# Attempt simple cosine similarity
Calculate the cosine similarity between vectors and use them as proof. As you can see below, this is not a good approach since even with very high threshold the accuracy never came up to higher than 70%, while recall increased significantly.

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score,f1_score,classification_report

In [20]:
def getsimilarity(X1,X2):
    k=[]
    for i in range(X1.shape[0]):
        k.append(cosine_similarity(X1_train[i],X2_train[i])[0][0])
    return k

In [21]:
similarity=getsimilarity(X1_train,X2_train)

In [22]:
def choisethreshold(similarity,labels,metric,threshold):
    predict=[1 if i>threshold else 0 for i in similarity]
    score=metric(predict,labels)
    return score

In [23]:
for i in range(2,10):
    s=choisethreshold(similarity,label_train,accuracy_score,i/10)
    print(s,'----', i/10)

0.539061123152222 ---- 0.2
0.5993556545640619 ---- 0.3
0.6431086884602185 ---- 0.4
0.6622999174471058 ---- 0.5
0.6617495648194812 ---- 0.6
0.6531541698487149 ---- 0.7
0.6515123875719246 ---- 0.8
0.654004433739708 ---- 0.9


----------------------------------threshold=0.5------------------------------------

In [24]:
test_similarity=getsimilarity(X1_test,X2_test)

In [25]:
print(choisethreshold(test_similarity,label_test,accuracy_score,0.5))

0.5010388582453189


# Attempt out-of-the-box LogisticRegression

In [34]:
from scipy.sparse import hstack
X = hstack([X1_train, X2_train])
print(X.shape, X1_train.shape, X2_train.shape,len(label_train))

(323429, 161804) (323429, 80902) (323429, 80902) 323429


In [51]:
X_test = hstack([X1_test, X2_test], format='csr')
#X_test=X1_test+X2_test

In [36]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(solver='lbfgs').fit(X, label_train)#.fit(X_train,y_train)



In [37]:
clf.score(X_test, label_test)

0.7621014618219595

In [37]:
import warnings
warnings.filterwarnings('ignore')

In [38]:
from sklearn.model_selection import GridSearchCV
log_reg_params = {"C": [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],"solver":['newton-cg', 'lbfgs']}
grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params, cv=5, n_jobs=7)
grid_log_reg.fit(X, label_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'solver': ['newton-cg', 'lbfgs'], 'C': [0.001, 0.01, 0.1, 1, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
grid_log_reg.best_params_

{'C': 1, 'solver': 'newton-cg'}

In [40]:
grid_log_reg.best_estimator_.score(X_test,label_test)
    

0.7629177075861386

# Try with 1-3 Ngram
We create the 2-3 gram and append it to the original unigram to reduce running time

In [41]:
vectorizer_2_3gram = TfidfVectorizer(ngram_range=(2,3))

In [42]:
X_1=vectorizer_2_3gram.fit_transform(train_data["question1"])
X_2=vectorizer_2_3gram.transform(train_data["question2"])

In [43]:
X_1_train=X_1[:m]
X_2_train=X_2[:m]
X_1_test=X_1[m:]
X_2_test=X_2[m:]

In [44]:
#X_=X_1_train+X_2_train
X_ = hstack([X_1_train, X_2_train], format='csr')

In [50]:
#X_test_=X_1_test+X_2_test
X_test_ = hstack([X_1_test, X_2_test], format='csr')

In [46]:
X.shape

(323429, 161804)

In [47]:
X_.shape

(323429, 4538354)

In [48]:
X_and_X_= hstack((X, X_), format='csr')

In [52]:
test_and_test_= hstack((X_test, X_test_), format='csr')

In [53]:
clf_=LogisticRegression(solver='lbfgs').fit(X_and_X_,label_train)



In [54]:
clf_.score(test_and_test_,label_test)

0.8072175913329541

# Additional methods: Naive Bayes/SVM
Naive Bayes using Bernoulli method
SVM using classical classifier (SVC)

In [62]:
from sklearn.model_selection import train_test_split
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3), stop_words="english", max_df=0.95, min_df=5)
size = len(train_data["question1"].values)
full_data = np.concatenate([train_data["question1"].values, train_data["question2"].values], axis=0)
X_1_2 = tfidf_vectorizer.fit_transform(full_data)
print(size, len(full_data), X_1_2.shape)
# X_1_2 & full_data should double the length of the original data (size)

404287 808574 (808574, 185649)


In [63]:
X1, X2 = X_1_2[:size], X_1_2[size:]
X_all = hstack([X1, X2])
X_train, X_test, y_train, y_test = train_test_split(X_all, data_df['is_duplicate'], train_size=0.8, random_state=44)



In [67]:
from sklearn.naive_bayes import BernoulliNB
nb_model = BernoulliNB().fit(X_train, y_train)
nb_model.score(X_test, y_test)

0.7811719310396003

In [71]:
log_reg_params = {"alpha": [float(i) / 10.0 for i in range(0, 11)]}
grid_log_reg = GridSearchCV(BernoulliNB(), log_reg_params, cv=5, n_jobs=5)
grid_log_reg.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=5,
       param_grid={'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [73]:
print(grid_log_reg.best_params_)
grid_log_reg.best_estimator_.score(X_test, y_test)

{'alpha': 0.1}


0.7875163867520839

In [None]:
from sklearn import svm
svm_model = svm.SVC().fit(X_train, y_train)
svm_model.score(X_test, y_test)

