In [1]:
# data manipulation
import utils
import pandas as pd
import numpy as np


from sklearn import metrics

In [2]:
# Fetch the four stored artifacts by name through utils.load, in the same order as before
# (presumably written by earlier notebooks in this project -- verify).
mvp_pipe, train_df, results_df, y = (
    utils.load(artifact_name)
    for artifact_name in ('mvp_model', 'X_train', 'results', 'y_train')
)

In [3]:
# Keep only column 1 of predict_proba (the second class) for every training row.
y_probs = mvp_pipe.predict_proba(train_df)[:, 1]
# Baseline "prediction" vector: one float64 zero per row, i.e. always predict 0.
y_dummy = np.zeros(y_probs.shape[0])

### Add the all-negative baseline to the results

In [4]:
# Rebuild the 'all_neg' baseline row. Any existing 'all_neg' row is dropped first
# so that re-running this cell never produces a duplicate.
# p_cut=1 presumably sets the probability cut-off so that no pair is predicted
# positive -- TODO confirm against utils.log_scores.
all_neg_scores = utils.log_scores(mvp_pipe, train_df, y, 'all_neg', p_cut=1)
if isinstance(all_neg_scores, pd.Series):
    # DataFrame.append accepted a named Series as one row; pd.concat needs a one-row frame.
    all_neg_scores = all_neg_scores.to_frame().T
# pd.concat replaces DataFrame.append, which is deprecated in pandas 1.4 and removed in 2.0.
results_df = pd.concat(
    [results_df.drop(index='all_neg', errors='ignore'), all_neg_scores]
)
results_df

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,accuracy,precision,recall,f1,auc,log_loss
"mvp (tf-idf, nmf(5), xgboost)",0.631325,0.823529,0.001876,0.003743,0.571099,0.654121
mvp (+ lemma),0.631466,0.819018,0.002385,0.004756,0.571259,0.654228
all_neg,0.63078,0.0,0.0,0.0,0.571099,0.654121


In [5]:
# Overwrite the 'auc' and 'log_loss' entries of the 'all_neg' row with scores
# computed from the all-zero vector y_dummy instead of the model's probabilities.
for score_column, score_fn in (
    ('auc', metrics.roc_auc_score),
    ('log_loss', metrics.log_loss),
):
    results_df.loc['all_neg', score_column] = score_fn(y, y_dummy)
results_df

Unnamed: 0,accuracy,precision,recall,f1,auc,log_loss
"mvp (tf-idf, nmf(5), xgboost)",0.631325,0.823529,0.001876,0.003743,0.571099,0.654121
mvp (+ lemma),0.631466,0.819018,0.002385,0.004756,0.571259,0.654228
all_neg,0.63078,0.0,0.0,0.0,0.5,12.752392


In [6]:
# Store the updated results table under the same name it was loaded from ('results').
utils.save(results_df, 'results')

### MVP Analysis

Combine the MVP model predictions with the ground truth to analyze where the classification model does poorly.

In [7]:
# Build the per-row analysis table from the labels and the model probabilities.
# In the displayed rows the columns are gt, prob and diff, with diff equal to gt - prob.
train_probs_df = utils.ground_truth_analysis(y, y_probs)
train_probs_df.head()

Unnamed: 0,gt,prob,diff
0,0,0.387433,-0.387433
1,0,0.375376,-0.375376
2,1,0.378481,0.621519
3,0,0.367141,-0.367141
4,1,0.365472,0.634528


Top pairs that are **not** duplicates, but classified as duplicates.

In [8]:
# Ascending sort on diff: the most negative values come first, i.e. the rows
# with gt = 0 that received the highest probability.
train_probs_df.sort_values('diff').head()

Unnamed: 0,gt,prob,diff
96365,0,0.610469,-0.610469
302061,0,0.607997,-0.607997
221399,0,0.605531,-0.605531
150608,0,0.602485,-0.602485
296074,0,0.584514,-0.584514


In [9]:
# Index labels of the five rows with the smallest diff.
idx_false_pos = (
    train_probs_df
    .sort_values(by='diff', ascending=True)
    .head(5)
    .index
)
# Those labels are used as row *positions* in train_df.
# NOTE(review): this assumes train_probs_df carries a default 0..n-1 index aligned with train_df's row order -- confirm.
train_df.iloc[idx_false_pos]

Unnamed: 0,id,question1,question2
86283,86283,What are the parts of a cell? What function do...,What are the two major types of cells? What fu...
397157,397157,What is a better pick: judiciary line or civil...,Can a color blinded person opt for IPS in UPSC...
164671,164671,What makes a great politician leader?,What makes a great leader?
396903,396903,"Which one is better, a 70K private job or a 40...",Is a government job like IT officer in SBI bet...
371929,371929,I'm pursuing my BBA (UG course) and after that...,I have a family business and 2 older brothers....


Top pairs that **are** duplicates, but classified as not duplicates.

In [10]:
# Descending sort on diff: the largest values come first, i.e. the rows
# with gt = 1 that received the lowest probability.
train_probs_df.sort_values('diff', ascending=False).head()

Unnamed: 0,gt,prob,diff
299139,1,0.171967,0.828033
202205,1,0.182819,0.817181
277438,1,0.196578,0.803422
28157,1,0.233236,0.766764
107724,1,0.235218,0.764782


In [11]:
# Index labels of the five rows with the largest diff, used as row positions in train_df.
idx_false_neg = (
    train_probs_df
    .sort_values(by='diff', ascending=False)
    .head(5)
    .index
)
false_neg_df = train_df.iloc[idx_false_neg]

# Print the values in column positions 1 and 2 of each selected row (the two
# question texts in the frame displayed earlier), followed by a blank line.
for record in false_neg_df.itertuples(index=False, name=None):
    print(record[1], record[2], sep='\n', end='\n\n')

How can someone control their anger?
What should I do to control my anger?

How do I stop being so gullible and easily influenced?
Is it bad to be gullible? If so how can I stop being so easily influenced?

What are the best PS3 games?
Which are the best PS3 games?

What has been your best sexual experience?
What was the best sexual experience you've ever had?

Why has Ernest W. Adams disabled comments on his answers?
Why does Ernest W Adams disable comments?



In [12]:
# Refit the MVP pipeline on only the five false-negative pairs selected above.
# NOTE(review): fit() is called on mvp_pipe itself, so the model loaded from
# 'mvp_model' is replaced in memory by this five-row fit. Any later call to
# mvp_pipe.predict_proba (including re-running earlier cells) uses the refit
# model. Consider fitting a clone under a separate name instead.
# NOTE(review): y[idx_false_neg] selects by position only if y is a NumPy array
# or has a default integer index -- confirm what utils.load('y_train') returns.
mvp_pipe.fit(false_neg_df, y[idx_false_neg])

Pipeline(memory=None,
     steps=[('stack', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function stack_questions at 0x1a20861158>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('tf', TfidfVectorizer(analyzer='word', binary=False,...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])

In [13]:
# Class probabilities from the pipeline just refit above, for the same five rows it was fit on.
y_fn_probs = mvp_pipe.predict_proba(false_neg_df)
y_fn_probs

array([[0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454]], dtype=float32)

In [14]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [15]:
# Candidate pipeline: text preparation -> TF-IDF -> 5-component NMF -> cosine-similarity
# step -> XGBoost classifier. What each utils helper does is inferred from its name
# only -- see utils for the actual behaviour.
pipe_cos_sim = Pipeline(
    [
        ('stack', FunctionTransformer(utils.stack_questions, validate=False)),
        ('lemma', FunctionTransformer(utils.cleanup_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        # random_state pins NMF's randomised initialisation so repeated runs yield
        # the same factors; it uses the same seed as the classifier below.
        ('nmf', NMF(n_components=5, random_state=42)),
        ('unstack', FunctionTransformer(utils.unstack_questions, validate=True)),
        ('cos_sim', FunctionTransformer(utils.calc_cos_sim, validate=True)),
        ('xgb', XGBClassifier(n_estimators=500, random_state=42))
    ]
)
# Smoke test only: fit on the five false-negative pairs selected earlier.
pipe_cos_sim.fit(false_neg_df, y[idx_false_neg])

Pipeline(memory=None,
     steps=[('stack', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function stack_questions at 0x1a20861158>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('lemma', FunctionTransformer(accept_sparse=False, ch...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])

In [16]:
# Class probabilities from the cosine-similarity pipeline for the same five rows it was fit on.
pipe_cos_sim.predict_proba(false_neg_df)

array([[0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454]], dtype=float32)

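No change in the result, but this is an unrealistic test: both pipelines were fitted on only the five false-negative pairs shown above, which is far too little data to reveal any difference between them. Let's now expand this into a full model to see whether there is an impact.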