In [1]:
# data manipulation
import utils
import pandas as pd
import numpy as np

# modeling
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import FunctionTransformer

from xgboost import XGBClassifier

In [2]:
# Load the training set through the project helper.
# NOTE(review): later cells use `train_df.is_duplicate` and `train_df.iloc`, so this is
# presumably a pandas DataFrame — confirm in utils.load.
train_df = utils.load('train')

In [4]:
# Lemmatizing the entire corpus can take ~13 minutes, so reuse the cached result
# when one exists and only rebuild (then cache) it on a miss.
try:
    train_lemma = utils.load('train_lemma')
except Exception:
    # Any load failure is treated as a cache miss. `Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate, so interrupting the
    # kernel does not silently kick off the long rebuild.
    # NOTE(review): narrow this to the exact error utils.load raises on a missing
    # cache (likely FileNotFoundError) once confirmed.

    # Named `pipe_lemma` so it is not confused with the model pipeline
    # (`pipe_cos_sim`) defined in a later cell.
    pipe_lemma = Pipeline(
        [
            ('stack', FunctionTransformer(utils.stack_questions, validate=False)),
            ('lemma', FunctionTransformer(utils.cleanup_text, validate=False)),
        ]
    )

    # Both steps are stateless FunctionTransformers, so fit_transform yields the same
    # result as transform while avoiding "pipeline is not fitted" deprecations in
    # newer scikit-learn releases.
    train_lemma = pipe_lemma.fit_transform(train_df)
    utils.save(train_lemma, 'train_lemma')  # cache for subsequent runs

In [6]:
# Model pipeline: TF-IDF features -> 5-component NMF -> project helpers that
# unstack the questions and compute a cosine-similarity feature -> XGBoost.
# NOTE(review): the exact output of utils.unstack_questions / utils.calc_cos_sim
# is not visible here — confirm in utils.
pipe_cos_sim = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        # Seed NMF as well as XGBoost: without a random_state the factorization
        # (and therefore every downstream score) can differ between runs.
        ('nmf', NMF(n_components=5, random_state=42)),
        ('unstack', FunctionTransformer(utils.unstack_questions, validate=True)),
        ('cos_sim', FunctionTransformer(utils.calc_cos_sim, validate=True)),
        ('xgb', XGBClassifier(n_estimators=500, random_state=42))
    ]
)

# Labels are taken positionally (.values), so row i of y corresponds to row i of train_df.
y = train_df.is_duplicate.values
pipe_cos_sim.fit(train_lemma, y)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ate=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1))])

In [7]:
# Predicted probability of the positive class (column 1) for every training pair.
train_class_probs = pipe_cos_sim.predict_proba(train_lemma)
y_probs = train_class_probs[:, 1]

In [11]:
# Reload the running score table so this model's metrics can be added to it.
results_df = utils.load('results')

# Remove any earlier 'cos_sim_model' row first, so re-running this cell replaces
# the entry instead of duplicating it.
results_df = results_df.drop(index='cos_sim_model', errors='ignore')

cos_sim_scores = utils.log_scores(pipe_cos_sim, train_lemma, y, 'cos_sim_model')
if isinstance(cos_sim_scores, pd.Series):
    # DataFrame.append treated a named Series as a single row; reshape it so
    # pd.concat does the same.
    cos_sim_scores = cos_sim_scores.to_frame().T

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
results_df = pd.concat([results_df, cos_sim_scores])
results_df

Unnamed: 0,accuracy,precision,recall,f1,auc,log_loss
"mvp (tf-idf, nmf(5), xgboost)",0.704833,0.670638,0.39413,0.496481,0.74804,0.563257
mvp (+ lemma),0.699275,0.655821,0.390391,0.489436,0.742537,0.568832
all_neg,0.63078,0.0,0.0,0.0,0.5,12.752399
cos_sim_model,0.71305,0.664312,0.450433,0.536855,0.752023,0.560813


In [12]:
# Persist the updated score table (under the same 'results' key it was loaded from)
# and the fitted pipeline, so later notebooks can reuse them.
utils.save(results_df, 'results')
utils.save(pipe_cos_sim, 'cos_sim_model')

### Results

Adding the cosine similarity feature to the model produced a marginal improvement in the training-set metrics, which makes this model a possible candidate for hyperparameter tuning via cross-validation. 

Let's now take a look at where the classifier was wrong.

In [14]:
# Compare the ground-truth labels with the predicted probabilities for each training pair.
# NOTE(review): judging by the displayed output, the helper returns columns
# gt / prob / diff with diff = gt - prob — confirm in utils.ground_truth_analysis.
class_errors_df = utils.ground_truth_analysis(y, y_probs)
class_errors_df.head()

Unnamed: 0,gt,prob,diff
0,0,0.219144,-0.219144
1,0,0.187922,-0.187922
2,0,0.295018,-0.295018
3,0,0.132087,-0.132087
4,0,0.356076,-0.356076


In [19]:
# Rows with the largest `diff` first; presumably the worst false negatives
# (true duplicates given a low probability) — verify against the helper's definition of diff.
largest_diff_rows = class_errors_df.sort_values('diff', ascending=False).head()
fn_idx = largest_diff_rows.index

# Positional lookup: y was built from train_df positionally.
# NOTE(review): columns 3 and 4 are assumed to hold the two question texts — confirm.
for record in train_df.iloc[fn_idx].values:
    print(record[3], record[4], '', sep='\n')

How can I get unlimited Ola Credits? Please help. I know there's a hack for that.
What is the best Ola hack to get unlimited Ola Credits?

What is the best Ola hack to get unlimited Ola Credits?
How can I get unlimited Ola credits? I know there's a hack for that.

If somebody has a good startup idea, then how can he find angel investors to invest in his idea?
How can I invite investors to invest money for my idea?

What arguments do climate change skeptics use to defend their position?
What are some of the best arguments to refute Climate deniers?

How do I become a quick learner in life?
How do I become a quick learner?

