In [None]:
from __future__ import print_function, division
import logging
# Configure the root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', None)
plt.rcParams.update({'font.size': 22})
from tqdm import tqdm
from numpy import random
from sklearn.decomposition import TruncatedSVD, NMF

from sklearn import pipeline, preprocessing

from sklearn.model_selection import train_test_split, cross_val_predict,cross_val_score, StratifiedShuffleSplit,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, \
                            accuracy_score, f1_score, roc_auc_score, roc_curve, \
                             precision_recall_curve,log_loss, confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
#from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.stem.lancaster import LancasterStemmer
from xgboost import XGBClassifier
from fuzzywuzzy import fuzz
from sklearn.metrics.pairwise import pairwise_distances
#from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import gensim
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.manifold import TSNE

In [None]:
# Read the tab-separated question-pair file and replace every missing cell
# with an empty string in the same expression.
df = pd.read_csv('../data/quora_duplicate_questions.tsv', sep='\t').fillna('')

In [None]:
# Inputs are the two question-text columns; the target is the `is_duplicate` column.
X,y  = df[['question1','question2']], df['is_duplicate']

In [None]:
# Hold out 30% of the rows as a test split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=89)

In [None]:
# Load the feature tables that were saved to disk earlier.
# NOTE(review): these frames are later concatenated column-wise and looked up by
# the index of `X`, so they are presumably row-aligned with `df` — confirm
# against whatever produced the CSV files.
df_s2vec = pd.read_csv('../data/df_s2vec.csv')

df_tfidf = pd.read_csv('../data/df_tfidf.csv')
df_tfidf_stop = pd.read_csv('../data/df_tfidf_stop.csv')

df_reduced_tf = pd.read_csv('../data/df_reduced_tf.csv')
df_reduced_stop_tf = pd.read_csv('../data/df_reduced_stop_tf.csv')

df_raw = pd.read_csv('../data/df_raw.csv')

df_fuzz = pd.read_csv('../data/df_fuzz.csv')

In [None]:
# Place all feature tables side by side (column-wise) in one frame.
feature_frames = [
    df_raw,
    df_fuzz,
    df_tfidf,
    df_s2vec,
    df_reduced_tf,
    df_tfidf_stop,
    df_reduced_stop_tf,
]
df_basic = pd.concat(feature_frames, axis=1)

In [None]:
def _fill_missing_with_max(column):
    """Return `column` with its missing entries replaced by the column's maximum."""
    return column.fillna(column.max())


# Applied column by column (axis=0), so each column is filled with its own maximum.
df_basic = df_basic.apply(_fill_missing_with_max, axis=0)

In [None]:
class Get_Precalcualted_Features(BaseEstimator, TransformerMixin):
    """Transformer that looks up rows of the module-level ``df_basic`` frame.

    The incoming ``X`` is used only for its index: ``transform`` returns the
    rows of ``df_basic`` whose labels match ``X.index``, with every column.
    """

    def __init__(self, **kwargs):
        # Arbitrary keyword arguments are accepted and stored unchanged.
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return the ``df_basic`` rows labelled by ``X.index`` (all columns)."""
        return df_basic.loc[X.index, :]

    def _get_param_names(self):
        """Return the column labels of ``df_basic``.

        NOTE(review): this replaces the inherited method of the same name, and
        ``print_feature_importances`` reads ``get_params().keys()`` as feature
        names, so this override presumably exists to make those keys the
        column labels — confirm it still works with the installed
        scikit-learn version.
        """
        return df_basic.columns

In [None]:
class Column_Selector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a fixed selection of its input's columns."""

    def __init__(self,selected_columns, **kwargs):
        # `selected_columns` is whatever key(s) `X[...]` should be indexed with;
        # extra keyword arguments are stored but not otherwise used here.
        self.selected_columns = selected_columns
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the selection is fixed at construction."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column selection."""
        wanted = self.selected_columns
        return X[wanted]
    

In [None]:
# For every feature column on its own: fit a logistic regression on the
# training split and report its log loss on the held-out split.
for c in df_basic.columns:
    selected_col = [c]
    steps = [
        ('test', Get_Precalcualted_Features()),
        ('selector', Column_Selector(selected_col)),
        ('lr', LogisticRegression()),
    ]
    lr = pipeline.Pipeline(steps)
    lr.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability used for scoring.
    lr_pred_proba = lr.predict_proba(X_test)[:, 1]
    single_feature_loss = log_loss(y_test, lr_pred_proba)
    print(c, 'has log loss:', single_feature_loss)

In [None]:
# Both selections share the same leading (s2vec) and trailing (raw + fuzz)
# columns; they differ only in which tf-idf table sits in the middle.
_s2vec_cols = list(df_s2vec.columns)
_raw_and_fuzz_cols = list(df_raw.columns) + list(df_fuzz.columns)

selected_tfidf = _s2vec_cols + list(df_tfidf.columns) + _raw_and_fuzz_cols
selected_tfidf_stop = _s2vec_cols + list(df_tfidf_stop.columns) + _raw_and_fuzz_cols

In [None]:
# Logistic regression on every column of df_basic: fit on the training split,
# then show the log loss on the held-out split as the cell output.
steps = [
    ('test', Get_Precalcualted_Features()),
    ('lr', LogisticRegression()),
]
lr = pipeline.Pipeline(steps)
lr.fit(X_train, y_train)
lr_pred_proba = lr.predict_proba(X_test)[:, 1]
log_loss(y_test, lr_pred_proba)

In [None]:
# Same all-column logistic regression, scored by cross-validation on the full
# data set; the per-fold negative log losses are the cell output.
steps = [
    ('test', Get_Precalcualted_Features()),
    ('lr', LogisticRegression()),
]
lr = pipeline.Pipeline(steps)

cross_val_score(lr, X, y, scoring='neg_log_loss')

In [None]:
def print_model_result(model, model_name,feature, feature_name,selected_col, X,y):
    """Print the mean cross-validated negative log loss of one model.

    A three-step pipeline is built: the `feature` transformer (named
    `feature_name`), a `Column_Selector` restricted to `selected_col`, and
    `model` (named `model_name`). It is scored with `cross_val_score` using
    the 'neg_log_loss' scorer, and the mean of the fold scores is printed.

    Parameters
    ----------
    model : estimator used as the final pipeline step.
    model_name : str, step name for `model` and prefix of the printed line.
    feature : transformer used as the first pipeline step.
    feature_name : str, step name for `feature`.
    selected_col : column selection handed to `Column_Selector`.
    X, y : data and target passed to `cross_val_score`.

    Returns
    -------
    None; the result is printed.
    """
    steps = [(feature_name, feature),
             ('selector', Column_Selector(selected_col)),
             (model_name, model)]
    reg = pipeline.Pipeline(steps)

    # No separate reg.fit(X, y) here: cross_val_score clones and fits the
    # pipeline on every fold, so a full-data fit only added training time.
    fold_scores = cross_val_score(reg, X, y, scoring='neg_log_loss')

    # The figure is a cross-validated mean, not a test-set score.
    print(model_name + ' has mean cross-validated negative log loss:', np.mean(fold_scores))
    

In [None]:
# Candidate classifiers, keyed by the label used in the printed report.
model_dict = {
    'logistic regression': LogisticRegression(),
    'gaussian naive bayes': GaussianNB(),
    'decision tree classifier': DecisionTreeClassifier(),
    'random forest classifier': RandomForestClassifier(class_weight='balanced_subsample'),
    'gradient boosting classifier': GradientBoostingClassifier(),
    'xgb': XGBClassifier(),
}

feature_name = 'raw'
feature = Get_Precalcualted_Features()

# Score every candidate on the selection built from the `df_tfidf_stop` columns.
for model_name, model in model_dict.items():
    print_model_result(model, model_name, feature, feature_name,
                       selected_tfidf_stop, X, y)
    

In [None]:
# Cross validated log loss
# Fresh, unfitted instances of the same candidate classifiers.
model_dict = {
    'logistic regression': LogisticRegression(),
    'gaussian naive bayes': GaussianNB(),
    'decision tree classifier': DecisionTreeClassifier(),
    'random forest classifier': RandomForestClassifier(class_weight='balanced_subsample'),
    'gradient boosting classifier': GradientBoostingClassifier(),
    'xgb': XGBClassifier(),
}

feature_name = 'raw'
feature = Get_Precalcualted_Features()

# Score every candidate on the selection built from the `df_tfidf` columns.
for model_name, model in model_dict.items():
    print_model_result(model, model_name, feature, feature_name,
                       selected_tfidf, X, y)
    

In [None]:
# "simple" model: XGBoost restricted to the columns of df_raw.
simple_columns = list(df_raw.columns)
steps = [
    ('feature', Get_Precalcualted_Features()),
    ('selector', Column_Selector(simple_columns)),
    ('xgb', XGBClassifier()),
]
simple = pipeline.Pipeline(steps)
simple.fit(X_train, y_train)
# Keep column 1 of the predicted probabilities for the held-out split.
simple_pred_proba = simple.predict_proba(X_test)[:, 1]

In [None]:
# Hard class predictions of the `simple` pipeline on the held-out split.
simple_pred_predict = simple.predict(X_test)

In [None]:
# Side-by-side frame of the test questions, the prediction (re-indexed to match
# y_test so the columns line up), and the true label.
simple_data = pd.concat([X_test,pd.Series(simple_pred_predict,index=y_test.index,name='predicted'), y_test], axis=1)

In [None]:
# Save the rows where the prediction disagrees with the label, for error analysis.
simple_data[simple_data.predicted != simple_data.is_duplicate].to_csv('simple_wrong.csv',index=False)

In [None]:
# Held-out log loss of the `simple` pipeline (shown as the cell output).
log_loss(y_test,simple_pred_proba)

In [None]:
# "medium" model: XGBoost on the df_raw, df_tfidf_stop and df_fuzz columns.
medium_columns = (
    list(df_raw.columns)
    + list(df_tfidf_stop.columns)
    + list(df_fuzz.columns)
)
steps = [
    ('feature', Get_Precalcualted_Features()),
    ('selector', Column_Selector(medium_columns)),
    ('xgb', XGBClassifier()),
]
medium = pipeline.Pipeline(steps)
medium.fit(X_train, y_train)
# Keep column 1 of the predicted probabilities for the held-out split.
medium_pred_proba = medium.predict_proba(X_test)[:, 1]

In [None]:
# Held-out log loss of the `medium` pipeline (shown as the cell output).
log_loss(y_test,medium_pred_proba)

In [None]:
# Hard class predictions of the `medium` pipeline on the held-out split.
medium_pred_predict = medium.predict(X_test)

In [None]:
# Give the predictions y_test's index so they align row-for-row with the
# test questions and true labels when concatenated column-wise.
medium_predicted = pd.Series(medium_pred_predict, index=y_test.index, name='predicted')
medium_data = pd.concat([X_test, medium_predicted, y_test], axis=1)

In [None]:
# Save the rows where the prediction disagrees with the label, for error analysis.
medium_data[medium_data.predicted != medium_data.is_duplicate].to_csv('medium_wrong.csv',index=False)

In [None]:
# "advance" model: XGBoost on the `selected_tfidf_stop` column selection.
steps = [
    ('feature', Get_Precalcualted_Features()),
    ('selector', Column_Selector(selected_tfidf_stop)),
    ('xgb', XGBClassifier()),
]
advance = pipeline.Pipeline(steps)
advance.fit(X_train, y_train)
# Keep column 1 of the predicted probabilities for the held-out split.
advance_pred_proba = advance.predict_proba(X_test)[:, 1]

In [None]:
# Held-out log loss of the `advance` pipeline (shown as the cell output).
log_loss(y_test,advance_pred_proba)

In [None]:
def print_feature_importances(model,feature, feature_names=None):
    """Print feature names with their importances, largest magnitude first.

    Importances come from ``model.feature_importances_`` when available,
    otherwise from the first row of ``model.coef_``. If the model exposes
    neither, or the importances are empty, nothing is printed.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` or ``coef_``.
    feature : object whose ``get_params().keys()`` supplies the feature names
        when `feature_names` is not given.
    feature_names : optional iterable of names, in the same order as the
        model's inputs. Pass this when the model was trained on a subset or
        reordering of the columns reported by `feature`; when given,
        `feature` is not consulted.

    Notes
    -----
    Names and importances are paired positionally with ``zip``, so if the two
    sequences differ in length the surplus of the longer one is dropped.
    """
    if hasattr(model, 'feature_importances_'):
        coefs = list(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        coefs = list(model.coef_[0])
    else:
        # Without this branch `coefs` stayed unbound and the check below
        # raised UnboundLocalError for models with neither attribute.
        coefs = []

    if not coefs:
        return

    if feature_names is None:
        feature_names = feature.get_params().keys()

    ranked = sorted(zip(feature_names, coefs), key=lambda pair: abs(pair[1]), reverse=True)
    for name, weight in ranked:
        print("{}: {}".format(name, weight))

In [None]:
# Pull the fitted classifier and the feature-lookup step out of the `advance` pipeline.
# NOTE(review): the names printed below come from the 'feature' step, which
# reports every column of df_basic, while the classifier was trained on the
# `selected_tfidf_stop` subset chosen by the 'selector' step. The positional
# pairing of names and importances may therefore be misaligned — verify, and
# consider pairing the importances with the selector's column list instead.
model =  advance.named_steps['xgb']
feature_union_step = advance.named_steps['feature']

print_feature_importances(model,feature_union_step) 

In [None]:
# Hard class predictions of the `advance` pipeline on the held-out split.
advance_pred_predict = advance.predict(X_test)

In [None]:
# Give the predictions y_test's index so they align row-for-row with the
# test questions and true labels when concatenated column-wise.
advance_predicted = pd.Series(advance_pred_predict, index=y_test.index, name='predicted')
advance_data = pd.concat([X_test, advance_predicted, y_test], axis=1)

In [None]:
# Save the rows where the prediction disagrees with the label, for error analysis.
advance_data[advance_data.predicted != advance_data.is_duplicate].to_csv('advance_wrong.csv',index=False)

In [None]:
# Precision/recall pairs over all decision thresholds for the `simple` pipeline.
prec_simple, recall_simple, thresholds_simple = precision_recall_curve(y_test,simple_pred_proba)

In [None]:
# Precision/recall pairs over all decision thresholds for the `medium` pipeline.
prec_medium, recall_medium, thresholds_medium = precision_recall_curve(y_test,medium_pred_proba)

In [None]:
# Precision/recall pairs over all decision thresholds for the `advance` pipeline.
prec_advance, recall_advance, thresholds_advance = precision_recall_curve(y_test,advance_pred_proba)

In [None]:
# Overlay the three models' curves, with precision on the x-axis and recall on
# the y-axis. The final element of each array is dropped before plotting.
plt.figure(figsize=(20,10))
plt.plot(prec_simple[0:-1], recall_simple[0:-1], color='blue')
plt.plot(prec_medium[0:-1], recall_medium[0:-1], color='red')
plt.plot(prec_advance[0:-1], recall_advance[0:-1], color='black')

# Legend entries follow the plotting order above (simple, medium, advance).
plt.legend(['basic','basic+fuzz+tfidf','basic+fuzz+tfidf+question2vec'],fontsize=22)

plt.title('precision-recall curve',fontsize=22)
plt.xlabel('precision',fontsize=22)
plt.ylabel('recall',fontsize=22)
# The savefig keyword is `format`; `fmt` is not a savefig parameter and is
# rejected by recent Matplotlib releases.
plt.savefig('precision_recall.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
# Sweep candidate decision thresholds and record the held-out accuracy that
# results from labelling every probability above the threshold as class 1.
accuracy_list = []

for threshold in np.arange(0.01, 0.99, 0.01):
    above_threshold = advance_pred_proba > threshold
    y_test_pred = np.array(above_threshold).astype(int)
    threshold_accuracy = accuracy_score(y_test, y_test_pred)
    accuracy_list.append(threshold_accuracy)

In [None]:
# Best accuracy reached anywhere in the threshold sweep.
max(accuracy_list)

In [None]:
# Threshold at which the sweep reached its best accuracy. The grid here must
# stay identical to the one used in the sweep so the argmax position maps to
# the right value.
# NOTE(review): the threshold is picked using y_test accuracy, so the accuracy
# reported for it on the same split is optimistic.
threshold = np.arange(0.01,0.99,0.01)[np.argmax(accuracy_list)]
threshold

In [None]:
# Binarise the `advance` probabilities with the selected threshold (1 if strictly above it).
y_advance_pred = np.array(advance_pred_proba>threshold).astype(int)

In [None]:
# Held-out accuracy of the thresholded `advance` predictions (shown as the cell output).
accuracy_score(y_test,y_advance_pred)