# Model
This notebook will train different models on the new features generated in previous feature engineering

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
import re
import difflib
from nltk.corpus import stopwords
import nltk
import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import pickle
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt  
%matplotlib inline

stops = set(stopwords.words("english"))
seed = 1024
np.random.seed(seed)

## Load data

In [2]:
with open("../data/is_duplicate.pickle", "rb") as handler:
    is_duplicate = pickle.load(handler)

In [54]:
train_sim_feats = ["train_common_ratio","train_common_ratio_stem","train_interaction_ratio",
                  'train_interaction_ratio_stem','train_common_bigrams_ratio','train_common_bigrams_ratio_stem',
                 "train_jaccard","train_jaccard_stem","train_nlevenshtein_1",
                  'train_nlevenshtein_1_stem','train_nlevenshtein_2','train_nlevenshtein_2_stem',
                  "train_sorensen", "train_sorensen_stem",
                 "train_word_len_diff","train_char_len_diff","train_word_match_share",
                 "train_word_len_diff_stem","train_char_len_diff_stem","train_word_match_share_stem",
                 "train_cos_dist", "train_cos_dist_stem","train_tfidf_sum_dist","train_tfidf_sum_dist_stem",
                   "train_tfidf_sum_q1","train_tfidf_sum_q1_stem",'train_tfidf_sum_q2',"train_tfidf_sum_q2_stem"]
train_tfidf = ["train_tfidf_q1","train_tfidf_q2","train_tfidf_q1_stem","train_tfidf_q2_stem"]

In [72]:
test_sim_feats = ["test_common_ratio","test_common_ratio_stem","test_interaction_ratio",
                  'test_interaction_ratio_stem','test_common_bigrams_ratio','test_common_bigrams_ratio_stem',
                "test_jaccard","test_jaccard_stem","test_nlevenshtein_1",
                  'test_nlevenshtein_1_stem','test_nlevenshtein_2','test_nlevenshtein_2_stem',
                  "test_sorensen", "test_sorensen_stem",
                "test_word_len_diff","test_char_len_diff","test_word_match_share",
                 "test_word_len_diff_stem","test_char_len_diff_stem","test_word_match_share_stem",
                 "test_cos_dist", "test_cos_dist_stem","test_tfidf_sum_dist","test_tfidf_sum_dist_stem",
                   "test_tfidf_sum_q1","test_tfidf_sum_q1_stem",'test_tfidf_sum_q2',"test_tfidf_sum_q2_stem"]

# test_tfidf = ["test_tfidf_q1","test_tfidf_q2","test_tfidf_q1_stem","test_tfidf_q2_stem"]

In [None]:
# load train data 
for feat in train_sim_feats:
    with open("../data/{}.pickle".format(feat), "rb") as handler:
        exec("{} = np.array(pickle.load(handler)).reshape(404290,-1)".format(feat))

In [76]:
# load test data 
for feat in test_sim_feats:
    with open("../data/{}.pickle".format(feat), "rb") as handler:
        exec("{} = np.array(pickle.load(handler)).reshape(2345796,-1)".format(feat))

## reconstruct train and test

In [56]:
# check shape
for feat in train_sim_feats:
    assert len(eval("{}.shape".format(feat))) == 2

In [77]:
for feat in test_sim_feats:
    assert len(eval("{}.shape".format(feat))) == 2

In [57]:
# train = hstack([globals()[comp] for comp in train_sim_feats]).tocsr()
train = pd.concat([pd.DataFrame(globals()[comp]) for comp in train_sim_feats],axis=1)
train.columns = train_sim_feats
train["is_duplicate"] = is_duplicate

In [82]:
# train = hstack([globals()[comp] for comp in train_sim_feats]).tocsr()
test = pd.concat([pd.DataFrame(globals()[comp]) for comp in test_sim_feats],axis=1)
test.columns = test_sim_feats

In [80]:
train.tail(2)

Unnamed: 0,train_common_ratio,train_common_ratio_stem,train_interaction_ratio,train_interaction_ratio_stem,train_common_bigrams_ratio,train_common_bigrams_ratio_stem,train_jaccard,train_jaccard_stem,train_nlevenshtein_1,train_nlevenshtein_1_stem,...,train_word_match_share_stem,train_cos_dist,train_cos_dist_stem,train_tfidf_sum_dist,train_tfidf_sum_dist_stem,train_tfidf_sum_q1,train_tfidf_sum_q1_stem,train_tfidf_sum_q2,train_tfidf_sum_q2_stem,is_duplicate
192891,0.571429,0.571429,0.4,0.4,0.4,0.4,0.5,0.5,0.4,0.4,...,0.571429,0.296471,0.348425,0.243042,0.251994,1.96393,1.962776,1.720888,1.710782,0
192892,0.363636,0.727273,0.222222,0.571429,0.0,0.444444,0.7,0.375,0.571429,0.285714,...,0.727273,0.709881,0.312223,0.207441,0.207526,2.377108,2.369066,2.169667,2.16154,0


In [83]:
test.tail(2)

Unnamed: 0,test_common_ratio,test_common_ratio_stem,test_interaction_ratio,test_interaction_ratio_stem,test_common_bigrams_ratio,test_common_bigrams_ratio_stem,test_jaccard,test_jaccard_stem,test_nlevenshtein_1,test_nlevenshtein_1_stem,...,test_char_len_diff_stem,test_word_match_share_stem,test_cos_dist,test_cos_dist_stem,test_tfidf_sum_dist,test_tfidf_sum_dist_stem,test_tfidf_sum_q1,test_tfidf_sum_q1_stem,test_tfidf_sum_q2,test_tfidf_sum_q2_stem
2345794,0.869565,0.869565,0.769231,0.769231,0.666667,0.666667,0.214286,0.214286,0.125,0.125,...,10,0.869565,0.280878,0.309392,0.21382,0.226343,3.102861,3.077048,3.316681,3.303391
2345795,0.444444,0.444444,0.285714,0.285714,0.285714,0.285714,0.625,0.625,0.5,0.5,...,8,0.444444,0.358283,0.309387,0.240722,0.236988,2.140926,2.134253,1.900204,1.897265


In [86]:
test.columns = train_sim_feats

In [58]:
# resampling
pos_train = train[train['is_duplicate'] == 1]
neg_train = train[train['is_duplicate'] == 0]
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1

while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
    
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
train = pd.concat([pos_train, neg_train])

## predict with train test split 

In [69]:
X_train, X_test, y_train, y_test = train_test_split(train[train_sim_feats], train["is_duplicate"],
                                                    test_size = 0.3, random_state = 1234)

In [70]:
params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.7,
    'max_depth': 12,
    'subsample': 0.3,
    'reg_alpha': 1,
    'gamma': 0.04,
    'silent':True,
    "eval_metric":"logloss"}

xgtrain = xgb.DMatrix(X_train, label=y_train)
xgval = xgb.DMatrix(X_test, label=y_test)

In [71]:
# fit model no training data
gb_model = xgb.train(params, 
                     dtrain=xgtrain, 
                     verbose_eval = 50,
                     evals=[(xgtrain,"train"),(xgval,"validation")], 
                     early_stopping_rounds = 50,
                     num_boost_round = 2000)

[0]	train-logloss:0.687978	validation-logloss:0.688037
Multiple eval metrics have been passed: 'validation-logloss' will be used for early stopping.

Will train until validation-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.516952	validation-logloss:0.519952
[100]	train-logloss:0.435586	validation-logloss:0.441448
[150]	train-logloss:0.391846	validation-logloss:0.400636
[200]	train-logloss:0.365837	validation-logloss:0.377812
[250]	train-logloss:0.349085	validation-logloss:0.364409
[300]	train-logloss:0.337601	validation-logloss:0.356148
[350]	train-logloss:0.329262	validation-logloss:0.350797
[400]	train-logloss:0.323048	validation-logloss:0.347175
[450]	train-logloss:0.317757	validation-logloss:0.344426
[500]	train-logloss:0.313364	validation-logloss:0.34253
[550]	train-logloss:0.30956	validation-logloss:0.340998
[600]	train-logloss:0.305895	validation-logloss:0.339515
[650]	train-logloss:0.302324	validation-logloss:0.337993
[700]	train-logloss:0.299138	validation-loglos

## Predict with how train

In [88]:
xgtest = xgb.DMatrix(test)

In [101]:
pred = gb_model.predict(xgtest,ntree_limit=1000)

In [102]:
test_id = np.array(range(test.shape[0]))

In [103]:
sub =  pd.DataFrame({"test_id":test_id,"is_duplicate":pred},columns=["test_id","is_duplicate"])

In [104]:
sub.to_csv("../submit/sub11.csv",index=False, header=True, cols=["test_id","is_duplicate"])