# XGBoost with Stratified K-Fold

In [1]:
import pandas as pd
import numpy as np
import time, re, string
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack

## Read Train and Test Set with Derived Features

In [2]:
# Load the pre-computed (v3) feature tables for the train and test splits in one pass.
train, test = map(pd.read_csv, ('input/train_featv3.csv', 'input/test_featv3.csv'))
train.head()

Unnamed: 0,user_id,is_open,is_click,date,month,hour,mins,communication_type,total_links,no_of_internal_links,...,is_open_confidence,email_3_similar,sub_3_similar,campaign_id,sub_period,comm_type_click_percent,body_polarity,title_polarity,body_subjectivity,title_subjectivity
0,14051,0,0,1,9,19,55,3,88.0,79.0,...,0.0,0.0,2.0,42,247,12.70333,0.278346,0.4625,0.644388,0.65
1,134438,0,0,2,11,12,53,3,67.0,62.0,...,0.0,1.0,2.0,52,150,12.70333,0.071053,0.170455,0.376754,0.454545
2,181789,0,0,24,7,15,15,4,7.0,3.0,...,0.0,1.0,1.0,33,249,14.27746,0.408333,0.0,0.579365,0.0
3,231448,0,0,5,9,11,36,5,60.0,56.0,...,0.0,1.0,1.0,44,216,13.02976,0.166667,0.0,0.766667,0.0
4,185580,0,0,1,7,18,1,3,67.0,61.0,...,0.0,0.0,1.0,29,220,12.70333,0.390083,0.0,0.588671,0.0


In [3]:
# List every column available in the training feature table.
train.columns

Index(['user_id', 'is_open', 'is_click', 'date', 'month', 'hour', 'mins',
       'communication_type', 'total_links', 'no_of_internal_links',
       'no_of_images', 'no_of_sections', 'count_sent', 'count_word',
       'count_unique_word', 'count_letters', 'count_punctuations',
       'count_stopwords', 'mean_word_len', 'word_unique_percent',
       'punct_percent', 'email_count_word', 'email_count_unique_word',
       'email_count_letters', 'email_count_punctuations', 'email_cap_count',
       'day_of_week', 'count_click', 'count_user', 'click_confidence',
       'count_is_open', 'is_open_confidence', 'email_3_similar',
       'sub_3_similar', 'campaign_id', 'sub_period', 'comm_type_click_percent',
       'body_polarity', 'title_polarity', 'body_subjectivity',
       'title_subjectivity'],
      dtype='object')

## Feature Engineering

- Date 
- Time (in minutes)
- Day of Week
- Communication Type
- Total Links
- No of Internal Links, No of Images
- Subject - Count of Sentences, Letters, Punctuations and Stopwords
- Subject - Unique Word Percentage
- Subject - Punctuation Percentage
- Email - Count of Word, Punctuation and Capital Letters
- Count Click
- Count User
- Click Confidence
- Count of People Opening the Mail
- Open Confidence
- Email Similarity, Subject Similarity
- Subscription Period
- Communication Type Click Percentage
- Count User Frequency
- Sentiment of Mail

In [5]:
# Collapse the (hour, mins) pair into a single total-minutes feature on both splits.
for frame in (train, test):
    frame['time'] = frame['hour'] * 60 + frame['mins']

In [6]:
# Target labels, taken from the `is_click` column as a NumPy array.
Y_train = train['is_click'].values

# Columns fed to the model. The four polarity/subjectivity columns are only
# needed to derive the combined sentiment feature later in the notebook.
cols = ['user_id', 'date', 'time', 'communication_type', 'total_links',
        'no_of_internal_links', 'no_of_images', 'count_sent', 'count_letters', 'count_punctuations',
        'count_stopwords', 'word_unique_percent', 'punct_percent', 'email_count_word',
        'email_count_unique_word', 'email_count_punctuations', 'email_cap_count', 'day_of_week',
        'count_click', 'count_user', 'click_confidence','count_is_open','is_open_confidence',
        'body_polarity', 'title_polarity','body_subjectivity', 'title_subjectivity',
        'email_3_similar', 'sub_3_similar', 'sub_period', 'comm_type_click_percent']

# Take explicit copies: these frames are mutated in later cells, and assigning
# into a bare `train[cols]` selection triggers pandas' SettingWithCopyWarning
# (and, depending on the pandas version, may not behave as intended).
X_train = train[cols].copy()
X_test = test[cols].copy()

### Fill NaN values with mean 

In [7]:
# Fallback values for test users without history. Per the original note these are
# means computed over first-time users; the numbers are hard-coded here
# -- TODO confirm where they were computed.
MEAN_CLICK_CONFIDENCE = 0.0072169867589168555
MEAN_OPEN_CONFIDENCE = 0.10831444590242156

# Missing confidences fall back to the means above.
X_test['click_confidence'] = X_test['click_confidence'].fillna(MEAN_CLICK_CONFIDENCE)
X_test['is_open_confidence'] = X_test['is_open_confidence'].fillna(MEAN_OPEN_CONFIDENCE)

# Rows with no recorded emails for the user. The mask is computed once, before
# `count_user` itself is overwritten on the last line.
# NOTE(review): the *count* columns receive the confidence means, not counts --
# confirm this is intended.
no_history = X_test['count_user'] == 0
X_test.loc[no_history, 'count_click'] = MEAN_CLICK_CONFIDENCE
X_test.loc[no_history, 'count_is_open'] = MEAN_OPEN_CONFIDENCE
X_test.loc[no_history, 'count_user'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Calculate User Frequency and Sentiment Score from Polarity and Subjectivity Scores

In [8]:
# Raw polarity/subjectivity columns that are folded into `sentiment` and then removed.
dropcols = ['body_polarity', 'title_polarity','body_subjectivity', 'title_subjectivity']


def _add_user_freq_and_sentiment(features):
    """Return a new frame with `count_user_freq` and `sentiment` appended.

    - `count_user_freq` is the reciprocal of `count_user`.
    - `sentiment` is the plain sum of the body/title polarity and subjectivity
      columns (NaN in any of them propagates to the sum).

    The four source columns listed in `dropcols` are removed from the result.
    The input frame is not modified, which avoids the SettingWithCopyWarning
    raised by in-place writes on a column selection.

    NOTE(review): a `count_user` of 0 produces inf here. Only the test frame is
    patched to 1 earlier in the notebook -- confirm train contains no zeros.
    """
    sentiment = (features['body_polarity'] + features['title_polarity']
                 + features['body_subjectivity'] + features['title_subjectivity'])
    return (features
            .assign(count_user_freq=1 / features['count_user'], sentiment=sentiment)
            .drop(dropcols, axis=1))


# Same transformation on both splits so the column order stays aligned.
X_train = _add_user_freq_and_sentiment(X_train)
X_test = _add_user_freq_and_sentiment(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
# Preview the engineered training features after the column additions/drops above.
X_train.head()

Unnamed: 0,user_id,date,time,communication_type,total_links,no_of_internal_links,no_of_images,count_sent,count_letters,count_punctuations,...,count_user,click_confidence,count_is_open,is_open_confidence,email_3_similar,sub_3_similar,sub_period,comm_type_click_percent,count_user_freq,sentiment
0,14051,1,1195,3,88.0,79.0,13.0,1.0,88.0,5.0,...,9.0,0.0,0.0,0.0,0.0,2.0,247,12.70333,0.111111,2.035234
1,134438,2,773,3,67.0,62.0,10.0,1.0,111.0,5.0,...,4.0,0.0,0.0,0.0,1.0,2.0,150,12.70333,0.25,1.072807
2,181789,24,915,4,7.0,3.0,1.0,1.0,71.0,3.0,...,7.0,0.0,0.0,0.0,1.0,1.0,249,14.27746,0.142857,0.987698
3,231448,5,696,5,60.0,56.0,19.0,1.0,73.0,3.0,...,6.0,0.0,0.0,0.0,1.0,1.0,216,13.02976,0.166667,0.933333
4,185580,1,1081,3,67.0,61.0,12.0,1.0,55.0,2.0,...,5.0,0.0,0.0,0.0,0.0,1.0,220,12.70333,0.2,0.978753


## Under-Sampling using the Repeated Edited Nearest Neighbours Algorithm

In [10]:
from imblearn.under_sampling import (AllKNN, EditedNearestNeighbours, RepeatedEditedNearestNeighbours)

print('RENN')
# Under-sample with Repeated Edited Nearest Neighbours; `return_indices=True`
# makes the sampler also hand back the indices of the rows it kept.
# NOTE(review): `return_indices` / `fit_sample` belong to older imbalanced-learn
# releases (newer ones use `fit_resample` and `sample_indices_`) -- confirm
# against the installed version before upgrading.
renn = RepeatedEditedNearestNeighbours(return_indices=True)
X_res, Y_res, idx_res = renn.fit_sample(X_train, Y_train)

# Report what share of the original rows was removed.
kept_fraction = float(len(X_res)) / len(X_train)
reduction_str = 'Reduced {:.2f}%'.format(100 * (1 - kept_fraction))
print(reduction_str)

RENN
Reduced 5.09%


In [11]:
# Sanity check on the resampled data: feature/label shapes must agree, and the
# positive-label totals before and after resampling are printed side by side.
print(X_res.shape, Y_res.shape)
print(Y_res.sum(), Y_train.sum())

(971083, 29) (971083,)
12782 12782


## Train the Model with 10-Fold Stratified Cross-Validation

In [28]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb


def _take_rows(data, idx):
    """Return the rows of `data` at integer positions `idx`.

    Works for NumPy arrays and for pandas objects (via `.iloc`), so either kind
    can be passed without falling into label-based indexing.
    """
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


def fit_cv(X, y, label, n_splits=10, num_boost_round=7200,
           early_stopping_rounds=50, verbose_eval=100, random_state=1234):
    """Train one XGBoost booster per stratified fold and return them all.

    Parameters
    ----------
    X : array-like
        Feature matrix (NumPy array or pandas DataFrame), one row per sample.
    y : array-like
        Binary target aligned with `X`; also used to stratify the folds.
    label : str
        Unused. Retained so existing calls such as
        `fit_cv(X, y, 'is_click')` keep working.
    n_splits : int, default 10
        Number of stratified folds (and therefore of returned boosters).
    num_boost_round : int, default 7200
        Upper bound on boosting rounds per fold.
    early_stopping_rounds : int, default 50
        Stop a fold when the validation AUC has not improved for this many rounds.
    verbose_eval : int or bool, default 100
        Logging period forwarded to `xgb.train`.
    random_state : int, default 1234
        Seed for the fold shuffling.

    Returns
    -------
    list
        The boosters returned by `xgb.train`, in fold order.
    """
    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eta': 0.01,
        'eval_metric': 'auc',
        'max_depth': 3,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        # 'min_child_weight': 5,
        # NOTE(review): newer XGBoost releases replace `silent` with
        # `verbosity` -- confirm against the installed version.
        'silent': 1,
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    estimators = []
    # `skf.split` is a generator, so tell tqdm how many folds to expect.
    for train_idx, valid_idx in tqdm(skf.split(X, y), total=n_splits):
        d_train = xgb.DMatrix(_take_rows(X, train_idx), label=_take_rows(y, train_idx))
        d_valid = xgb.DMatrix(_take_rows(X, valid_idx), label=_take_rows(y, valid_idx))

        # The validation set goes last: XGBoost uses the last entry of `evals`
        # for early stopping.
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        estimator_ = xgb.train(
            dict(params),  # defensive copy so folds cannot affect each other
            d_train,
            num_boost_round=num_boost_round,
            evals=watchlist,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )
        estimators.append(estimator_)
    return estimators

In [29]:
# Fit one booster per fold on the under-sampled data (default: 10 folds).
estimators = fit_cv(X=X_res, y=Y_res, label='is_click')

[0]	train-auc:0.976037	valid-auc:0.976025
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.

[100]	train-auc:0.978533	valid-auc:0.977895
[200]	train-auc:0.978936	valid-auc:0.978352
[300]	train-auc:0.97925	valid-auc:0.978628
[400]	train-auc:0.97946	valid-auc:0.978839
[500]	train-auc:0.979616	valid-auc:0.978985
[600]	train-auc:0.979724	valid-auc:0.979093
[700]	train-auc:0.979798	valid-auc:0.979164
[800]	train-auc:0.97986	valid-auc:0.979213
[900]	train-auc:0.979914	valid-auc:0.979265
[1000]	train-auc:0.979967	valid-auc:0.979311
[1100]	train-auc:0.980015	valid-auc:0.979344
[1200]	train-auc:0.980097	valid-auc:0.979392
[1300]	train-auc:0.980206	valid-auc:0.979469
[1400]	train-auc:0.980292	valid-auc:0.979528
[1500]	train-auc:0.980371	valid-auc:0.979558
[1600]	train-auc:0.98045	valid-auc:0.9796
[1700]	train-auc:0.980524	valid-auc:0.979632
[1800]	train-auc:0.980596	valid-auc:0.979654
[1900]	train-auc:0.

[0]	train-auc:0.974096	valid-auc:0.973971
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[100]	train-auc:0.978339	valid-auc:0.978281
[200]	train-auc:0.97885	valid-auc:0.978818
[300]	train-auc:0.979181	valid-auc:0.979174
[400]	train-auc:0.979387	valid-auc:0.979395
[500]	train-auc:0.979528	valid-auc:0.979581
[600]	train-auc:0.979651	valid-auc:0.979713
[700]	train-auc:0.979745	valid-auc:0.979789
[800]	train-auc:0.97982	valid-auc:0.97986
[900]	train-auc:0.979863	valid-auc:0.979903
[1000]	train-auc:0.97991	valid-auc:0.979949
[1100]	train-auc:0.979959	valid-auc:0.980004
[1200]	train-auc:0.980045	valid-auc:0.980061
[1300]	train-auc:0.98012	valid-auc:0.980099
[1400]	train-auc:0.980206	valid-auc:0.980137
[1500]	train-auc:0.980288	valid-auc:0.980177
[1600]	train-auc:0.980374	valid-auc:0.980216
[1700]	train-auc:0.980459	valid-auc:0.980258
[1800]	train-auc:0.980533	valid-auc:0.980275
Stopping. Best iter

NameError: name 'scores' is not defined

## Predicting on Test Data

In [30]:
# Wrap the resampled training data and the test features as DMatrix objects.
d_train = xgb.DMatrix(X_res, label=Y_res)
d_test = xgb.DMatrix(X_test)
# Force the test matrix to use the training matrix's feature names so the fold
# boosters accept it. Presumably X_res is a plain array (default names) while
# X_test is a DataFrame (column names) -- TODO confirm.
d_test.feature_names = d_train.feature_names

In [31]:
# Score the test set with every fold's booster, then average across folds.
# NOTE(review): the boosters were trained with early stopping; depending on the
# XGBoost version, `predict` may use all trees rather than the best iteration
# -- confirm.
fold_preds = [estimator.predict(d_test) for estimator in estimators]
pred_click = np.mean(np.stack(fold_preds, axis=-1), axis=-1)

In [32]:
# Start from the sample submission and set `is_click` to the fold-averaged predictions.
sub = pd.read_csv('input/sample_submission.csv').assign(is_click=pred_click)
sub.head()

Unnamed: 0,id,is_click
0,63_122715,7.8e-05
1,56_76206,7.8e-05
2,57_96189,9.2e-05
3,56_166917,7.8e-05
4,56_172838,9.1e-05


In [33]:
# Write the submission to disk without the DataFrame index column.
sub.to_csv('sub_xgb_10fold.csv', index=False)

## Output Distribution

In [34]:
# Summary statistics of the submission's numeric columns (the predicted click probabilities).
sub.describe()

Unnamed: 0,is_click
count,773858.0
mean,0.01356
std,0.062454
min,7.8e-05
25%,7.8e-05
50%,7.9e-05
75%,8.9e-05
max,0.987405


## End