In [1]:
# data manipulation
import utils
import pandas as pd
import numpy as np


from sklearn import metrics

In [2]:
# Fetch the three stored objects this notebook works with, in one pass:
# the MVP pipeline, the training frame and the running results table.
mvp_pipe, train_df, results_df = (
    utils.load(artifact) for artifact in ('mvp_model', 'train', 'results')
)

In [3]:
# Ground-truth labels from the training frame.
y = train_df['is_duplicate'].values

# Second column of predict_proba, used below as the score for each pair.
y_probs = mvp_pipe.predict_proba(train_df)[:, 1]

# All-zero score vector of the same length, used for the 'all_neg' row.
y_dummy = np.zeros(len(y_probs))

### Add an all-negative baseline prediction to the results

In [4]:
# Drop any 'all_neg' row left over from an earlier run so that re-executing
# this cell does not add a duplicate entry.
results_df = results_df.drop(index='all_neg', errors='ignore')

# Score the 'all_neg' entry with p_cut=1
# (presumably a probability cut-off that no prediction reaches -- confirm in utils).
all_neg_scores = utils.log_scores(mvp_pipe, train_df, y, 'all_neg', p_cut=1)

# DataFrame.append is deprecated and no longer exists in pandas 2.x; pd.concat
# is the supported way to add rows. If the helper returns a Series, turn it
# into a one-row frame first so it is still added as a row (as append did).
if isinstance(all_neg_scores, pd.Series):
    all_neg_scores = all_neg_scores.to_frame().T
results_df = pd.concat([results_df, all_neg_scores])
results_df

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,accuracy,precision,recall,f1,auc,log_loss
"mvp (tf-idf, nmf(5), xgboost)",0.704833,0.670638,0.39413,0.496481,0.74804,0.563257
mvp (+ lemma),0.699275,0.655821,0.390391,0.489436,0.742537,0.568832
all_neg,0.63078,0.0,0.0,0.0,0.74804,0.563257


In [5]:
# Replace the auc / log_loss entries of the 'all_neg' row with values
# computed from the all-zero score vector instead of the model scores.
dummy_auc = metrics.roc_auc_score(y, y_dummy)
dummy_log_loss = metrics.log_loss(y, y_dummy)

results_df.loc['all_neg', 'auc'] = dummy_auc
results_df.loc['all_neg', 'log_loss'] = dummy_log_loss
results_df

Unnamed: 0,accuracy,precision,recall,f1,auc,log_loss
"mvp (tf-idf, nmf(5), xgboost)",0.704833,0.670638,0.39413,0.496481,0.74804,0.563257
mvp (+ lemma),0.699275,0.655821,0.390391,0.489436,0.742537,0.568832
all_neg,0.63078,0.0,0.0,0.0,0.5,12.752399


In [6]:
# Hand the updated table to utils.save under 'results', the same name it was
# loaded with at the top of the notebook.
utils.save(results_df, 'results')

### MVP Analysis

Combine the MVP model predictions with the ground truth to analyze where the classification model does poorly.

In [7]:
# Build the per-pair comparison frame from labels and scores. In the output
# below it has columns gt, prob and diff, where diff equals gt - prob.
train_probs_df = utils.ground_truth_analysis(y, y_probs)
train_probs_df.head()

Unnamed: 0,gt,prob,diff
0,0,0.111957,-0.111957
1,0,0.189304,-0.189304
2,0,0.465162,-0.465162
3,0,0.276392,-0.276392
4,0,0.297288,-0.297288


Top pairs that are **not** duplicates but were classified as duplicates (false positives).

In [8]:
# Ascending sort puts the most negative diff first, i.e. gt == 0 with the
# highest predicted probability.
train_probs_df.sort_values('diff').head()

Unnamed: 0,gt,prob,diff
129758,0,0.976695,-0.976695
228518,0,0.974996,-0.974996
220182,0,0.972504,-0.972504
4106,0,0.97007,-0.97007
119978,0,0.967884,-0.967884


In [9]:
# Five rows with the most negative diff.
worst_false_pos = train_probs_df.sort_values('diff').head()
idx_false_pos = worst_false_pos.index

# The index values are used as row positions (iloc), not labels: in the
# outputs, position 129758 corresponds to the train_df row labelled 129767.
train_df.iloc[idx_false_pos]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len
129767,129767,208389,208390,How do I do earn money online for free in India?,How can one earn money online in India?,0,48.0,39.0
228537,228537,49755,337625,What is the reason behind banning of ₹500 and ...,How can I convert 500 and 1000 rupee notes ban...,0,67.0,77.0
220201,220201,327260,327261,Do people make money from Quora posts?,Are there people making money on Quora?,0,38.0,39.0
4107,4107,8128,8129,Why is the Indian government not abolishing th...,What is the percentage of domestic black money...,0,100.0,150.0
119987,119987,128615,194699,Can a foreigner exchange 500 and 1000 rupee no...,Will the ban on 500 and 1000 rupee notes have ...,0,85.0,105.0


Top pairs that **are** duplicates but were classified as non-duplicates (false negatives).

In [10]:
# Descending sort puts the largest diff first, i.e. gt == 1 with the lowest
# predicted probability.
train_probs_df.sort_values('diff', ascending=False).head()

Unnamed: 0,gt,prob,diff
401440,1,0.02868,0.97132
195563,1,0.035745,0.964255
145494,1,0.038411,0.961589
356667,1,0.039607,0.960393
205507,1,0.041198,0.958802


In [11]:
# Row positions of the five largest diffs (gt == 1 with the lowest scores),
# looked up positionally in the training frame.
idx_false_neg = train_probs_df.sort_values('diff', ascending=False).head().index
false_neg_df = train_df.iloc[idx_false_neg]

# Print each question pair in full. The columns are addressed by name rather
# than by position in `.values`, so the loop keeps printing the two questions
# even if the column order of train_df changes.
for q1, q2 in zip(false_neg_df['question1'], false_neg_df['question2']):
    print(q1)
    print(q2)
    print()

Do angel investors invests in early stage startups? Like you have the idea and you want to build your product?
Do angel investors invest just on idea or product? Can I ask them funding for the development of the product?

How can MSG the warrior Lion Heart get 9.8 on IMDB with over 3K votes?
How movies like MSG the Warrior get rating of 9.4 on IMDB with more than 4000 people voting.

How do I know the balance in my PF (provident fund) account online?
How do we check if PF (Provident Found) paid regularly into my account from my company?

Should fairness creams like Ponds, Fair and Lovely etc. be banned for misleading people/consumers?
Should fairness creams be banned in India?

How is the criminal justice system in the United States institutionally racist, specifically against African Americans?
Why are there so many African Americans fathers in prison? Is the criminal justice system rigged against black people to fail in life?



In [12]:
# Load the object stored under 'mvp_lemma_model'; the cells below use it as a
# pipeline (fit / predict_proba / steps).
mvp_lemma_model = utils.load('mvp_lemma_model')

In [13]:
# Fit on the five false-negative rows only. Every one of them has gt == 1 in
# the table above, so the labels passed here contain a single class.
# NOTE(review): this presumably refits the loaded pipeline in place, replacing
# whatever it learned before -- confirm that is intended.
mvp_lemma_model.fit(false_neg_df, false_neg_df.is_duplicate.values)

Pipeline(memory=None,
     steps=[('stack', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function stack_questions at 0x1a1d276158>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('lemma', FunctionTransformer(accept_sparse=False, ch...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])

In [14]:
# Score the same five rows the pipeline was just fitted on; the output below
# is identical for every row.
y_fn_probs = mvp_lemma_model.predict_proba(false_neg_df)
y_fn_probs

array([[0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454]], dtype=float32)

In [15]:
from sklearn.pipeline import Pipeline
# Keep only the first five steps of the fitted pipeline so that transform()
# returns the features fed to the later steps (10 columns in the output below).
# NOTE(review): assumes the classifier is the step right after these five and
# that the step objects are reused already fitted -- confirm.
lemma_only = Pipeline(mvp_lemma_model.steps[:5])

In [16]:
# Show the transformed features for the five false-negative pairs as a table.
pd.DataFrame(lemma_only.transform(false_neg_df))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.7806189,0.0,0.027203,0.0,0.0,0.778823
1,0.590685,0.017913,0.018372,0.043033,0.0,0.593884,0.0,0.0,0.0,0.0
2,0.012376,0.0,0.797876,0.017034,0.0,0.0,0.0,0.799114,0.0,0.0
3,0.0,0.0,0.0,0.882146,8.864733e-16,0.02657,0.01532,0.015649,0.877722,0.0
4,0.0,0.79946,0.0,0.0,0.03019659,0.013109,0.800101,0.0,0.01797,0.0


In [17]:
# Apply utils.calc_cos_sim to the same features; in the output each row keeps
# its 10 original values and gains an 11th value.
utils.calc_cos_sim(lemma_only.transform(false_neg_df))

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        7.80618879e-01, 0.00000000e+00, 2.72030480e-02, 0.00000000e+00,
        0.00000000e+00, 7.78823252e-01, 9.99390561e-01],
       [5.90684975e-01, 1.79134591e-02, 1.83721565e-02, 4.30328362e-02,
        0.00000000e+00, 5.93883972e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 9.96422003e-01],
       [1.23763045e-02, 0.00000000e+00, 7.97875843e-01, 1.70338613e-02,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.99114126e-01,
        0.00000000e+00, 0.00000000e+00, 9.99651987e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.82145595e-01,
        8.86473338e-16, 2.65704493e-02, 1.53204005e-02, 1.56490514e-02,
        8.77721710e-01, 0.00000000e+00, 9.99231414e-01],
       [0.00000000e+00, 7.99460446e-01, 0.00000000e+00, 0.00000000e+00,
        3.01965934e-02, 1.31091077e-02, 8.00101227e-01, 0.00000000e+00,
        1.79701148e-02, 0.00000000e+00, 9.98901485e-

In [18]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from xgboost import XGBClassifier

In [19]:
# Pipeline that adds a utils.calc_cos_sim step between 'unstack' and the
# classifier.
pipe_cos_sim = Pipeline(
    [
        ('stack', FunctionTransformer(utils.stack_questions, validate=False)),
        ('lemma', FunctionTransformer(utils.cleanup_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        # Seed the factorisation as well as the classifier so that repeated
        # runs of the notebook produce the same features.
        ('nmf', NMF(n_components=5, random_state=42)),
        ('unstack', FunctionTransformer(utils.unstack_questions, validate=True)),
        ('cos_sim', FunctionTransformer(utils.calc_cos_sim, validate=True)),
        ('xgb', XGBClassifier(n_estimators=500, random_state=42))
    ]
)
# Fitted on the five false-negative rows only; their labels are all 1, so
# this is a smoke test of the pipeline rather than a meaningful fit.
pipe_cos_sim.fit(false_neg_df, false_neg_df.is_duplicate.values)

Pipeline(memory=None,
     steps=[('stack', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function stack_questions at 0x1a1d276158>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('lemma', FunctionTransformer(accept_sparse=False, ch...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])

In [20]:
# Score the same five rows with the cos_sim pipeline, for comparison with the
# probabilities produced by mvp_lemma_model above.
pipe_cos_sim.predict_proba(false_neg_df)

array([[0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454],
       [0.9955355 , 0.00446454]], dtype=float32)

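No change in the result, but this is an unrealistic test: both models were fit on only five examples, all of them labelled as duplicates, so the classifier never saw a negative pair. Let's now expand this into a full model to see whether the cosine-similarity feature has an impact.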