# Quora-Question-Pair

## Import Libraries

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import os
%matplotlib inline

## Load Training Data

In [None]:
# Read the question-pair table from disk, then replace every missing cell
# with an empty string.
data = pd.read_csv('qq-train.csv')
data = data.fillna("")

# NumPy array form of the same table, used for positional indexing.
np_data = data.values

## Extract Questions with qid

In [None]:
# Give both question columns of each pair the same (qid, question) schema so
# they can be stacked into a single table.
q1 = data[['qid1', 'question1']].rename(
    columns={'qid1': 'qid', 'question1': 'question'})
q2 = data[['qid2', 'question2']].rename(
    columns={'qid2': 'qid', 'question2': 'question'})

# One row per distinct qid, ordered by qid, as a NumPy array of
# [qid, question] rows.
question_data = (
    pd.concat((q1, q2), axis=0)
    .fillna("")
    .sort_values(by='qid')
    .drop_duplicates('qid')
    .values
)

# Preview the first questions. Slicing (instead of indexing 0..9) keeps this
# from raising IndexError when fewer than 10 unique questions are present.
for qid, question in question_data[:10]:
    print('{}: {}'.format(qid, question))
print('...')

## TF-IDF Vectorization

In [None]:
# Build a TF-IDF representation (vocabulary capped at 4096 terms) of every
# unique question; row i of the result corresponds to question_data[i].
tfidf = TfidfVectorizer(max_features=4096)
vectors = tfidf.fit_transform(question_data[:, 1])
# Convert the sparse result to a dense matrix for row-wise arithmetic later.
vectors = vectors.todense()

## Difference between Vectors

In [None]:
# Feature vector of a pair = TF-IDF(question1) - TF-IDF(question2).
#
# `question_data` holds one row per qid, sorted by qid, and `vectors` follows
# the same row order. Locate each qid's row with a binary search instead of
# assuming `row == qid - 1`: that shortcut only holds when the qids are
# exactly 1..N with no gaps, and silently picks the wrong question (or raises
# IndexError) otherwise. When the qids are contiguous from 1 the result is
# unchanged.
sorted_qids = question_data[:, 0].astype(np.int64)
rows_q1 = np.searchsorted(sorted_qids, data['qid1'].values.astype(np.int64))
rows_q2 = np.searchsorted(sorted_qids, data['qid2'].values.astype(np.int64))

vector_data = np.zeros((data.shape[0], vectors.shape[1]))
# Fill row by row so that only one difference vector is materialised at a
# time; a fully vectorised gather would need two extra arrays of this size.
for i, (row_q1, row_q2) in enumerate(zip(rows_q1, rows_q2)):
    vector_data[i] = vectors[row_q1] - vectors[row_q2]

## Stratified Sampling: 25% of the Data for Training

In [None]:
# The last column of the table, cast to int, is the class label.
labels = np_data[:, -1].astype('int')

# Keep 25% of the pairs for training and hold out the remaining 75%,
# stratifying on the label.
X_train, X_test, y_train, y_test = train_test_split(
    vector_data,
    labels,
    test_size=0.75,
    random_state=10,
    stratify=labels,
)

## PCA

In [None]:
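# Disabled exploratory check: collects the set of feature columns that are
# non-zero in at least one training row and prints how many there are.
# s = set(np.nonzero(X_train[0])[0].tolist())
# for i in range(1, X_train.shape[0]):
#     s = s.union(set(np.nonzero(X_train[i])[0].tolist()))
# np.median(np.count_nonzero(X_train, axis=1))
# print(len(s))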

In [None]:
# Fit a 12-component PCA on the training features only; the fitted object is
# reused below to project both the training and the held-out data.
pca = PCA(n_components=12, random_state=10).fit(X_train)

In [None]:
# Plot the running total of the variance explained by the PCA components.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
axes.set(
    title='Variance Explanation vs Eigen Vectors',
    xlabel='Eigen Vectors',
    ylabel='Explained Variance',
)
cumulative_variance = np.cumsum(pca.explained_variance_)
axes.plot(cumulative_variance)
plt.show()

In [None]:
# Gaussian Naive Bayes on the PCA-projected features.
#
# The model is fit once on the whole training set. Feeding the training part
# of each of 10 stratified folds to `partial_fit` showed every sample to the
# model exactly 9 times, which leaves the per-class means, variances and
# priors essentially unchanged while doing nine times the work; the held-out
# part of each fold was never used for validation.
clf = GaussianNB()
g_X = pca.transform(X_train)
clf.fit(g_X, y_train)

# Predict on the held-out data, projected with the same fitted PCA.
y_pred = clf.predict(pca.transform(X_test))

In [None]:
# Report accuracy, precision, recall and F1 (in that order) for the latest
# predictions on the held-out set.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# L1-regularised logistic regression on the PCA-projected features, with the
# inverse regularisation strength C chosen by 5-fold cross-validation.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
# 'saga' shuffles the data internally, so seed it like the other estimators
# in this notebook to make runs reproducible.
logistic = LogisticRegression(penalty='l1', solver='saga', random_state=10)
# `scoring` must be passed by keyword: recent scikit-learn releases accept
# only `estimator` and `param_grid` positionally.
clf = GridSearchCV(logistic, param_grid,
                   scoring=['accuracy'],
                   cv=5, refit='accuracy', verbose=1, n_jobs=4)

g_X = pca.transform(X_train)
print(g_X.shape)
clf.fit(g_X, y_train)
# The refit best estimator predicts on the held-out data, projected with the
# same fitted PCA.
y_pred = clf.predict(pca.transform(X_test))

In [None]:
# Report accuracy, precision, recall and F1 (in that order) for the latest
# predictions on the held-out set.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# Support-vector classifier on the PCA-projected features, tuned by 5-fold
# cross-validation.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.1, 0.3, 0.5, 0.7, 0.9]
coef0_values = [0.1, 0.5, 1.0, 1.5, 5.0, 10.0]

# One sub-grid per kernel, listing only the parameters that kernel uses:
# `gamma` applies to rbf/sigmoid/poly, `coef0` to sigmoid/poly and `degree`
# to poly only. A single combined grid would refit identical models for every
# value of a parameter the kernel ignores.
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
    {'kernel': ['sigmoid'], 'C': c_values, 'gamma': gamma_values,
     'coef0': coef0_values},
    {'kernel': ['poly'], 'C': c_values, 'gamma': gamma_values,
     'coef0': coef0_values, 'degree': [2, 3]},
]

svc = SVC(random_state=10)
# `scoring` must be passed by keyword: recent scikit-learn releases accept
# only `estimator` and `param_grid` positionally.
clf = GridSearchCV(svc, param_grid,
                   scoring=['f1', 'accuracy', 'recall', 'precision'],
                   cv=5, refit='accuracy', verbose=1, n_jobs=4)

# Train in the same PCA space that is used for prediction below. Fitting on
# the raw features and predicting on the projected ones gives the model
# inputs with a different number of features and fails at predict time.
g_X = pca.transform(X_train)
clf.fit(g_X, y_train)
y_pred = clf.predict(pca.transform(X_test))

In [None]:
# Report accuracy, precision, recall and F1 (in that order) for the latest
# predictions on the held-out set.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))