In [1]:
import xgboost as xgb

In [2]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import seaborn as sns
from tqdm import tqdm
import pickle
%matplotlib inline

# 1. Read Data

In [3]:
# read saved
data = pd.read_csv("data/interim/data_train_concat.csv", index_col=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
Y = data['x_exit_19'].between(3750901.5068, 3770901.5068) \
    & data['y_exit_19'].between((-19268905.6133), -19208905.6133)

In [5]:
X = data.drop([x+str(y) for y in range(20) for x in ['Unnamed: 0_', 'hash_', 'trajectory_id_', 'time_entry_', 'time_exit_']] \
    + ['x_exit_19', 'y_exit_19','vmax_19', 'vmin_19', 'vmean_19'] + ['Unnamed: 0'], axis=1)

## 1.2 More Feature Engineering

In [6]:
# extrapolation
for i in range(18,-1,-1):
    cur = str(i)
    nex = str(i+1)
    nan = X['x_entry_'+cur].isnull()
    for x,y in zip(['x_entry_'+cur, 'y_entry_'+cur, 'entry_hour_'+cur, 'entry_minute_'+cur, 'entry_second_'+cur],
                   ['x_entry_'+nex, 'y_entry_'+nex, 'entry_hour_'+nex, 'entry_minute_'+nex, 'entry_second_'+nex]):
        X.iloc[np.where(nan)[0], X.columns.get_loc(x)] = X[y][nan]
    for x,y in zip(['x_exit_'+cur, 'y_exit_'+cur, 'exit_hour_'+cur, 'exit_minute_'+cur, 'exit_second_'+cur],
                   ['x_entry_'+nex, 'y_entry_'+nex, 'entry_hour_'+nex, 'entry_minute_'+nex, 'entry_second_'+nex]):
        X.iloc[np.where(nan)[0], X.columns.get_loc(x)] = X[y][nan]

In [7]:
# space and velocity inference
for i in range(19):
    X['distance_'+str(i)] = ((X['x_entry_'+str(i)]-X['x_exit_'+str(i)]).pow(2)+(X['y_entry_'+str(i)]-X['y_exit_'+str(i)]).pow(2)).pow(0.5)
    X['speed_'+str(i)] = X['distance_'+str(i)]/X['duration_'+str(i)]

# missing values    
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

In [8]:
# time feature
for i in range(20):
    X['entry_hour_quarter_'+str(i)] = X['entry_hour_'+str(1)]*4 + np.ceil(X['entry_minute_'+str(1)]/15)
    X['exit_hour_quarter_'+str(i)] = X['exit_hour_'+str(1)]*4 + np.ceil(X['exit_minute_'+str(1)]/15)
    

In [9]:
sortcol = sorted(X.columns.tolist(), key=lambda x: (int(x[x.rfind("_")+1:]), x[:x.rfind("_")]))
X = X[sortcol]

In [10]:
X.columns.tolist()

['distance_0',
 'duration_0',
 'entry_hour_0',
 'entry_hour_quarter_0',
 'entry_minute_0',
 'entry_second_0',
 'exit_hour_0',
 'exit_hour_quarter_0',
 'exit_minute_0',
 'exit_second_0',
 'speed_0',
 'vmax_0',
 'vmean_0',
 'vmin_0',
 'x_entry_0',
 'x_exit_0',
 'y_entry_0',
 'y_exit_0',
 'distance_1',
 'duration_1',
 'entry_hour_1',
 'entry_hour_quarter_1',
 'entry_minute_1',
 'entry_second_1',
 'exit_hour_1',
 'exit_hour_quarter_1',
 'exit_minute_1',
 'exit_second_1',
 'speed_1',
 'vmax_1',
 'vmean_1',
 'vmin_1',
 'x_entry_1',
 'x_exit_1',
 'y_entry_1',
 'y_exit_1',
 'distance_2',
 'duration_2',
 'entry_hour_2',
 'entry_hour_quarter_2',
 'entry_minute_2',
 'entry_second_2',
 'exit_hour_2',
 'exit_hour_quarter_2',
 'exit_minute_2',
 'exit_second_2',
 'speed_2',
 'vmax_2',
 'vmean_2',
 'vmin_2',
 'x_entry_2',
 'x_exit_2',
 'y_entry_2',
 'y_exit_2',
 'distance_3',
 'duration_3',
 'entry_hour_3',
 'entry_hour_quarter_3',
 'entry_minute_3',
 'entry_second_3',
 'exit_hour_3',
 'exit_hour_quar

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()

In [13]:
X[:] = scaler.fit_transform(X)

In [14]:
X[:5]

Unnamed: 0,distance_0,duration_0,entry_hour_0,entry_hour_quarter_0,entry_minute_0,entry_second_0,exit_hour_0,exit_hour_quarter_0,exit_minute_0,exit_second_0,...,entry_hour_19,entry_hour_quarter_19,entry_minute_19,entry_second_19,exit_hour_19,exit_hour_quarter_19,exit_minute_19,exit_second_19,x_entry_19,y_entry_19
0,-0.014382,-0.019108,0.068311,-0.017664,-1.40864,0.405377,0.068284,-0.017764,-1.408663,0.405325,...,0.394396,-0.017664,-1.359113,0.399336,-0.013928,-0.017764,-0.409425,0.48256,-1.697862,-0.785789
1,-0.014382,-0.019108,1.816504,1.784561,0.025205,-0.065709,1.816494,1.784582,0.025155,-0.065751,...,0.394396,1.784561,-1.470466,0.458538,-0.013928,1.784582,0.237699,1.368141,-1.244502,-0.578554
2,-0.014382,-0.019108,1.067279,1.100958,0.484035,-1.066766,1.067261,1.100934,0.483977,-1.066786,...,-2.440666,1.100958,0.422544,0.636141,-0.013928,1.100934,-0.350596,1.545257,-0.18857,-2.021392
3,-0.014382,-0.019108,0.567795,0.665939,1.573758,-0.948995,0.567773,0.665885,1.573679,-0.949017,...,0.394396,0.665939,0.088484,1.760966,-0.013928,0.665885,0.178869,1.722373,0.810236,0.577988
4,-0.014382,-0.019108,-1.679881,-1.819888,-1.638055,-1.184537,-1.679926,-1.820111,-1.638074,-1.184555,...,0.394396,-1.819888,-1.025052,-1.139898,-0.013928,-1.820111,-0.99772,-1.170525,-1.402849,-0.065226


## 1.2 Dimensionality Reduction

dimensionality reduction was found to be unhelpful for prediction, hence removed

In [27]:
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
n_comp = 1

In [28]:
# all features
# pca = PCA(n_components=n_comp, random_state=42)
# pca.fit(pd.concat([pd.DataFrame(X[X.columns[X.columns.str.endswith("_"+str(i))]].values) 
#          for i in range(19)], axis=0))

# for i in range(19): #timesteps
#     trans = pca.transform(X[X.columns[X.columns.str.endswith("_"+str(i))]])

#     for j in range(n_comp): #components
#         X['pca_'+str(j)+"_"+str(i)] = trans[:,j]

In [29]:
# xy
pca = PCA(n_components=n_comp, random_state=42)
# pca = FastICA(n_components=n_comp, random_state=42)
# pca = TruncatedSVD(n_components=n_comp, random_state=420)
pca.fit(pd.concat([pd.DataFrame(X[['x_entry_'+str(i),'y_entry_'+str(i)]].values)
           for i in range(20)] + [pd.DataFrame(X[['x_exit_'+str(i),'y_exit_'+str(i)]].values)
           for i in range(19)], axis=0))

for i in range(19): #timesteps
    transEntry = pca.transform(X[['x_entry_'+str(i),'y_entry_'+str(i)]])
    for j in range(1): #components
        X['entry_pca_'+str(j)+"_"+str(i)] = transEntry[:,j]
    if i<19:
        transExit = pca.transform(X[['x_exit_'+str(i),'y_exit_'+str(i)]])
        for j in range(1): #components
            X['exit_pca_'+str(j)+"_"+str(i)] = transExit[:,j]
            
# ica = FastICA(n_components=n_comp, random_state=42)
# # pca = TruncatedSVD(n_components=n_comp, random_state=420)
# ica.fit(pd.concat([pd.DataFrame(X[['x_entry_'+str(i),'y_entry_'+str(i)]].values)
#            for i in range(20)] + [pd.DataFrame(X[['x_exit_'+str(i),'y_exit_'+str(i)]].values)
#            for i in range(19)], axis=0))

# for i in range(19): #timesteps
#     transEntry = ica.transform(X[['x_entry_'+str(i),'y_entry_'+str(i)]])
#     for j in range(1): #components
#         X['entry_ica_'+str(j)+"_"+str(i)] = transEntry[:,j]
#     if i<19:
#         transExit = ica.transform(X[['x_exit_'+str(i),'y_exit_'+str(i)]])
#         for j in range(1): #components
#             X['exit_ica_'+str(j)+"_"+str(i)] = transExit[:,j]
            
# svd = TruncatedSVD(n_components=n_comp, random_state=420)
# svd.fit(pd.concat([pd.DataFrame(X[['x_entry_'+str(i),'y_entry_'+str(i)]].values)
#            for i in range(20)] + [pd.DataFrame(X[['x_exit_'+str(i),'y_exit_'+str(i)]].values)
#            for i in range(19)], axis=0))

# for i in range(19): #timesteps
#     transEntry = svd.transform(X[['x_entry_'+str(i),'y_entry_'+str(i)]])
#     for j in range(1): #components
#         X['entry_svd_'+str(j)+"_"+str(i)] = transEntry[:,j]
#     if i<19:
#         transExit = svd.transform(X[['x_exit_'+str(i),'y_exit_'+str(i)]])
#         for j in range(1): #components
#             X['exit_svd_'+str(j)+"_"+str(i)] = transExit[:,j]
        

In [30]:
# dds
pca2 = PCA(n_components=n_comp, random_state=42)
# pca2 = TruncatedSVD(n_components=n_comp, random_state=420)
# pca2 = FastICA(n_components=n_comp, random_state=42)

pca2.fit(pd.concat([pd.DataFrame(X[['distance_'+str(i),'duration_'+str(i),'speed_'+str(i)]].values)
           for i in range(19)], axis=0))

for i in range(19): #timesteps
    trans = pca2.transform(X[['distance_'+str(i),'duration_'+str(i),'speed_'+str(i)]])
    for j in range(n_comp): #components
        X['dds_pca_'+str(j)+"_"+str(i)] = trans[:,j]

# pca2 = TruncatedSVD(n_components=n_comp, random_state=420)
# ica2 = FastICA(n_components=n_comp, random_state=42)

# ica2.fit(pd.concat([pd.DataFrame(X[['distance_'+str(i),'duration_'+str(i),'speed_'+str(i)]].values)
#            for i in range(19)], axis=0))

# for i in range(19): #timesteps
#     trans = ica2.transform(X[['distance_'+str(i),'duration_'+str(i),'speed_'+str(i)]])
#     for j in range(n_comp): #components
#         X['dds_ica_'+str(j)+"_"+str(i)] = trans[:,j]
        
# svd2 = TruncatedSVD(n_components=n_comp, random_state=420)

# svd2.fit(pd.concat([pd.DataFrame(X[['distance_'+str(i),'duration_'+str(i),'speed_'+str(i)]].values)
#            for i in range(19)], axis=0))

# for i in range(19): #timesteps
#     trans = svd2.transform(X[['distance_'+str(i),'duration_'+str(i),'speed_'+str(i)]])
#     for j in range(n_comp): #components
#         X['dds_svd_'+str(j)+"_"+str(i)] = trans[:,j]
        

## 1.3 Train/Test Split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X[:5]

Unnamed: 0,distance_0,duration_0,entry_hour_0,entry_hour_quarter_0,entry_minute_0,entry_second_0,exit_hour_0,exit_hour_quarter_0,exit_minute_0,exit_second_0,...,entry_hour_19,entry_hour_quarter_19,entry_minute_19,entry_second_19,exit_hour_19,exit_hour_quarter_19,exit_minute_19,exit_second_19,x_entry_19,y_entry_19
0,-0.014382,-0.019108,0.068311,-0.017664,-1.40864,0.405377,0.068284,-0.017764,-1.408663,0.405325,...,0.394396,-0.017664,-1.359113,0.399336,-0.013928,-0.017764,-0.409425,0.48256,-1.697862,-0.785789
1,-0.014382,-0.019108,1.816504,1.784561,0.025205,-0.065709,1.816494,1.784582,0.025155,-0.065751,...,0.394396,1.784561,-1.470466,0.458538,-0.013928,1.784582,0.237699,1.368141,-1.244502,-0.578554
2,-0.014382,-0.019108,1.067279,1.100958,0.484035,-1.066766,1.067261,1.100934,0.483977,-1.066786,...,-2.440666,1.100958,0.422544,0.636141,-0.013928,1.100934,-0.350596,1.545257,-0.18857,-2.021392
3,-0.014382,-0.019108,0.567795,0.665939,1.573758,-0.948995,0.567773,0.665885,1.573679,-0.949017,...,0.394396,0.665939,0.088484,1.760966,-0.013928,0.665885,0.178869,1.722373,0.810236,0.577988
4,-0.014382,-0.019108,-1.679881,-1.819888,-1.638055,-1.184537,-1.679926,-1.820111,-1.638074,-1.184555,...,0.394396,-1.819888,-1.025052,-1.139898,-0.013928,-1.820111,-0.99772,-1.170525,-1.402849,-0.065226


In [17]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=420)

In [18]:
(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

((107250, 353), (26813, 353), (107250,), (26813,))

# 2. Model Building

In [None]:
#change to the round for pca to evaluate

In [63]:
# 261 rounds (0.983522)
# models = xgb.XGBClassifier(max_depth=10, learning_rate=0.03, n_estimators=262, n_jobs=-1,
#                            subsample=0.8, colsample_bytree=0.8, seed=420)
models = xgb.XGBClassifier(max_depth=10, learning_rate=0.03, min_child_weight=5,
                           gamma=0.2, n_estimators=20000, n_jobs=-1,
                           subsample=0.82, colsample_bytree=0.87, seed=420)
#
# models = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=20000,
#                            subsample=0.8, colsample_bytree=0.9, reg_alpha=65, seed=420, n_jobs=-1)

In [64]:
models.fit(train_x, train_y, eval_metric=['auc'], early_stopping_rounds=100,
           eval_set=[(train_x, train_y), (test_x, test_y)])


[0]	validation_0-auc:0.98235	validation_1-auc:0.977015
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.979902	validation_1-auc:0.972154
[2]	validation_0-auc:0.983411	validation_1-auc:0.976369
[3]	validation_0-auc:0.981748	validation_1-auc:0.973889
[4]	validation_0-auc:0.979995	validation_1-auc:0.971589
[5]	validation_0-auc:0.982155	validation_1-auc:0.974047
[6]	validation_0-auc:0.983766	validation_1-auc:0.976028
[7]	validation_0-auc:0.984709	validation_1-auc:0.977095
[8]	validation_0-auc:0.98529	validation_1-auc:0.977576
[9]	validation_0-auc:0.985794	validation_1-auc:0.978179
[10]	validation_0-auc:0.986212	validation_1-auc:0.97871
[11]	validation_0-auc:0.986572	validation_1-auc:0.979073
[12]	validation_0-auc:0.986925	validation_1-auc:0.979473
[13]	validation_0-auc:0.987076	validation_1-auc:0.979487
[14]	validation_0-auc:0.986842	validation_1-auc:0.97907
[

[142]	validation_0-auc:0.994148	validation_1-auc:0.983634
[143]	validation_0-auc:0.994171	validation_1-auc:0.98363
[144]	validation_0-auc:0.994192	validation_1-auc:0.98364
[145]	validation_0-auc:0.994223	validation_1-auc:0.983651
[146]	validation_0-auc:0.994248	validation_1-auc:0.983661
[147]	validation_0-auc:0.994265	validation_1-auc:0.983668
[148]	validation_0-auc:0.994291	validation_1-auc:0.983684
[149]	validation_0-auc:0.994324	validation_1-auc:0.983684
[150]	validation_0-auc:0.994349	validation_1-auc:0.983689
[151]	validation_0-auc:0.99438	validation_1-auc:0.983688
[152]	validation_0-auc:0.994408	validation_1-auc:0.9837
[153]	validation_0-auc:0.994433	validation_1-auc:0.983717
[154]	validation_0-auc:0.994455	validation_1-auc:0.983737
[155]	validation_0-auc:0.994473	validation_1-auc:0.983743
[156]	validation_0-auc:0.994497	validation_1-auc:0.983766
[157]	validation_0-auc:0.994528	validation_1-auc:0.983769
[158]	validation_0-auc:0.994552	validation_1-auc:0.983777
[159]	validation_0-

[284]	validation_0-auc:0.996536	validation_1-auc:0.984043
[285]	validation_0-auc:0.996542	validation_1-auc:0.984045
[286]	validation_0-auc:0.996564	validation_1-auc:0.984048
[287]	validation_0-auc:0.996584	validation_1-auc:0.984044
[288]	validation_0-auc:0.99662	validation_1-auc:0.984044
[289]	validation_0-auc:0.996632	validation_1-auc:0.984039
[290]	validation_0-auc:0.996637	validation_1-auc:0.984038
[291]	validation_0-auc:0.996654	validation_1-auc:0.984045
[292]	validation_0-auc:0.996678	validation_1-auc:0.984042
[293]	validation_0-auc:0.996706	validation_1-auc:0.984042
[294]	validation_0-auc:0.996716	validation_1-auc:0.984037
[295]	validation_0-auc:0.996731	validation_1-auc:0.984038
[296]	validation_0-auc:0.996736	validation_1-auc:0.984042
[297]	validation_0-auc:0.996768	validation_1-auc:0.984041
[298]	validation_0-auc:0.996808	validation_1-auc:0.984048
[299]	validation_0-auc:0.996818	validation_1-auc:0.984047
[300]	validation_0-auc:0.996838	validation_1-auc:0.984044
[301]	validatio

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.87, gamma=0.2, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=5, missing=None,
       n_estimators=20000, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.82)

In [42]:
feature_selected = [x for x,y in sorted(zip(train_x.columns.values, models.feature_importances_), 
                     key=lambda x: x[1], reverse=True) if y>0]


In [66]:
sorted(zip(train_x.columns.values, models.feature_importances_), key=lambda x: x[1], reverse=True)


[('y_entry_19', 0.10096904),
 ('x_entry_19', 0.038991664),
 ('y_entry_18', 0.024488797),
 ('duration_19', 0.022922177),
 ('y_exit_18', 0.019942867),
 ('y_entry_2', 0.015547923),
 ('entry_hour_19', 0.014663386),
 ('y_exit_1', 0.012257992),
 ('y_exit_0', 0.0102666),
 ('x_exit_18', 0.0098255025),
 ('x_exit_9', 0.00720055),
 ('y_entry_1', 0.006573375),
 ('y_exit_2', 0.0059630927),
 ('y_entry_8', 0.005912143),
 ('y_entry_6', 0.0058701728),
 ('exit_hour_quarter_1', 0.005706703),
 ('y_entry_0', 0.005680445),
 ('y_exit_9', 0.005580068),
 ('x_exit_2', 0.0054984167),
 ('entry_hour_5', 0.0053022564),
 ('x_exit_6', 0.005152616),
 ('x_entry_1', 0.005135173),
 ('x_exit_1', 0.0049819807),
 ('x_entry_11', 0.0049576005),
 ('x_exit_3', 0.0049458733),
 ('x_entry_8', 0.004873053),
 ('y_entry_3', 0.0047189128),
 ('x_exit_11', 0.0046945093),
 ('y_entry_4', 0.0043952013),
 ('exit_hour_7', 0.004375856),
 ('x_entry_2', 0.0043409057),
 ('x_exit_5', 0.0043303426),
 ('y_entry_7', 0.0043128654),
 ('vmin_9', 0.0042

In [65]:
pred_y = models.predict(test_x)
(pred_y==test_y).sum()/test_y.count()

  if diff:


0.9419684481408273

In [72]:
param_test = {
    'subsample': [0.8]
}

gsearch = GridSearchCV(estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.03, 
                        n_estimators=262, n_jobs=-1,
                        subsample=0.8, colsample_bytree=0.8, seed=420),
                        param_grid = param_test, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch.fit(X,Y)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] subsample=0.8 ...................................................
[CV] .......... subsample=0.8, score=0.9831463016614883, total=  36.7s
[CV] subsample=0.8 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.3s remaining:    0.0s


[CV] .......... subsample=0.8, score=0.9831311762047277, total=  39.4s
[CV] subsample=0.8 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV] .......... subsample=0.8, score=0.9834275824803944, total=  35.7s
[CV] subsample=0.8 ...................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV] .......... subsample=0.8, score=0.9828726510007434, total=  40.5s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=262,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.8]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc',
       verbose=10)

In [73]:
(gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_)



([mean: 0.98314, std: 0.00020, params: {'subsample': 0.8}],
 {'subsample': 0.8},
 0.9831444298640706)

## 2.1 Fine Tuning

In [20]:
from sklearn.model_selection import GridSearchCV

### 2.1.1 Max_Depth and Min_Child Weight

In [25]:
param_test1 = {
    'max_depth': range(3,11,2),
    'min_child_weight': range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        n_estimators=261, n_jobs=-1, subsample=0.8, 
                        colsample_bytree=0.8, seed=420), 
                        param_grid = param_test1, 
                        scoring='roc_auc', cv=5, verbose=10)
gsearch1.fit(X, Y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] max_depth=3, min_child_weight=1 .................................
[CV]  max_depth=3, min_child_weight=1, score=0.9754736948707831, total=  18.9s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.5s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.9744238726575192, total=  20.6s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   40.6s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.9748612977003671, total=  19.4s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.0min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.9752800675225428, total=  19.7s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.9740269609531568, total=  18.8s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.9756109536371733, total=  19.7s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.0min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.9744078122949807, total=  19.2s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.3min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.9748459935747857, total=  19.2s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.7min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.9751098105233276, total=  19.8s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.0min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.9739585690099366, total=  19.9s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.9756103064390335, total=  20.3s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.9744540236491265, total=  18.9s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.9748393351721828, total=  19.1s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.9754333382185223, total=  19.3s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.9740500115319728, total=  20.2s
[CV] max_depth=5, min_child_weight=1 .................................
[CV]  max_depth=5, min_child_weight=1, score=0.9822001974911054, total=  32.9s
[CV] max_depth=5, min

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 45.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=261,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(3, 11, 2), 'min_child_weight': range(1, 6, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [26]:
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)



([mean: 0.97481, std: 0.00053, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.97479, std: 0.00057, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.97488, std: 0.00059, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.98182, std: 0.00031, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.98181, std: 0.00032, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.98184, std: 0.00035, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.98300, std: 0.00023, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.98304, std: 0.00025, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.98302, std: 0.00026, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.98329, std: 0.00025, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.98337, std: 0.00027, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.98342, std: 0.00029, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 9, 'min_child_weight': 5

In [21]:
param_test2 = {
    'max_depth': [9,10],
    'min_child_weight': [1,4,5,6]
}
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        n_estimators=261, n_jobs=-1, subsample=0.8, 
                        colsample_bytree=0.8, seed=420), 
                        param_grid = param_test2, 
                        scoring='roc_auc', cv=5, verbose=10)
gsearch2.fit(X, Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] max_depth=9, min_child_weight=1 .................................
[CV]  max_depth=9, min_child_weight=1, score=0.983520703291647, total=  32.3s
[CV] max_depth=9, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.9s remaining:    0.0s


[CV]  max_depth=9, min_child_weight=1, score=0.9831789615694589, total=  28.3s
[CV] max_depth=9, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=1, score=0.9829782457289215, total=  27.1s
[CV] max_depth=9, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.5min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=1, score=0.9836540934113434, total=  29.2s
[CV] max_depth=9, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=1, score=0.9831340360520199, total=  27.9s
[CV] max_depth=9, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=4, score=0.9835821097325571, total=  27.1s
[CV] max_depth=9, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.9min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=4, score=0.9832919609577442, total=  28.5s
[CV] max_depth=9, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.4min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=4, score=0.9830352519259562, total=  26.5s
[CV] max_depth=9, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=4, score=0.9838545908845523, total=  27.0s
[CV] max_depth=9, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.3min remaining:    0.0s


[CV]  max_depth=9, min_child_weight=4, score=0.9831370435668814, total=  27.8s
[CV] max_depth=9, min_child_weight=5 .................................
[CV]  max_depth=9, min_child_weight=5, score=0.9836253277953058, total=  27.1s
[CV] max_depth=9, min_child_weight=5 .................................
[CV]  max_depth=9, min_child_weight=5, score=0.9832516025259074, total=  26.4s
[CV] max_depth=9, min_child_weight=5 .................................
[CV]  max_depth=9, min_child_weight=5, score=0.9830510906934791, total=  26.6s
[CV] max_depth=9, min_child_weight=5 .................................
[CV]  max_depth=9, min_child_weight=5, score=0.9838805997323727, total=  26.8s
[CV] max_depth=9, min_child_weight=5 .................................
[CV]  max_depth=9, min_child_weight=5, score=0.9832926323357191, total=  26.5s
[CV] max_depth=9, min_child_weight=6 .................................
[CV]  max_depth=9, min_child_weight=6, score=0.983567709573943, total=  27.1s
[CV] max_depth=9, min_

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 19.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=261,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [9, 10], 'min_child_weight': [1, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [22]:
(gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_)



([mean: 0.98329, std: 0.00025, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.98338, std: 0.00030, params: {'max_depth': 9, 'min_child_weight': 4},
  mean: 0.98342, std: 0.00029, params: {'max_depth': 9, 'min_child_weight': 5},
  mean: 0.98342, std: 0.00033, params: {'max_depth': 9, 'min_child_weight': 6},
  mean: 0.98333, std: 0.00026, params: {'max_depth': 10, 'min_child_weight': 1},
  mean: 0.98344, std: 0.00031, params: {'max_depth': 10, 'min_child_weight': 4},
  mean: 0.98345, std: 0.00032, params: {'max_depth': 10, 'min_child_weight': 5},
  mean: 0.98341, std: 0.00030, params: {'max_depth': 10, 'min_child_weight': 6}],
 {'max_depth': 10, 'min_child_weight': 5},
 0.9834503614855639)

### 2.1.2 Gamma

In [24]:
param_test3 = {
    'gamma': [i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        max_depth=10, min_child_weight=5,
                        n_estimators=261, n_jobs=-1, subsample=0.8, 
                        colsample_bytree=0.8, seed=420), 
                        param_grid = param_test3, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch3.fit(X, Y)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] gamma=0.0 .......................................................
[CV] .............. gamma=0.0, score=0.9831392263064357, total=  29.3s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.9s remaining:    0.0s


[CV] .............. gamma=0.0, score=0.9833505437274163, total=  28.1s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   58.7s remaining:    0.0s


[CV] .............. gamma=0.0, score=0.9836445885498695, total=  28.0s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.5min remaining:    0.0s


[CV] .............. gamma=0.0, score=0.9829391638627483, total=  29.6s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s


[CV] .............. gamma=0.1, score=0.9831369684059846, total=  29.3s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min remaining:    0.0s


[CV] .............. gamma=0.1, score=0.9833661891991361, total=  29.9s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.0min remaining:    0.0s


[CV] .............. gamma=0.1, score=0.9836839881248394, total=  29.2s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.5min remaining:    0.0s


[CV] .............. gamma=0.1, score=0.9829606339309472, total=  31.0s
[CV] gamma=0.2 .......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.0min remaining:    0.0s


[CV] .............. gamma=0.2, score=0.9831878961047931, total=  29.7s
[CV] gamma=0.2 .......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.5min remaining:    0.0s


[CV] .............. gamma=0.2, score=0.9833666124147743, total=  30.4s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.9837317709712748, total=  29.5s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.9829321467501786, total=  29.5s
[CV] gamma=0.3 .......................................................
[CV] .............. gamma=0.3, score=0.9831684394411851, total=  30.4s
[CV] gamma=0.3 .......................................................
[CV] .............. gamma=0.3, score=0.9833746039866598, total=  28.6s
[CV] gamma=0.3 .......................................................
[CV] .............. gamma=0.3, score=0.9835951848775868, total=  35.0s
[CV] gamma=0.3 .......................................................
[CV] .............. gamma=0.3, score=0.9829944182775409, total=  29.2s
[CV] gamma=0.4 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 10.1min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=5, missing=None, n_estimators=261,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [25]:
(gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_)



([mean: 0.98327, std: 0.00026, params: {'gamma': 0.0},
  mean: 0.98329, std: 0.00027, params: {'gamma': 0.1},
  mean: 0.98330, std: 0.00029, params: {'gamma': 0.2},
  mean: 0.98328, std: 0.00022, params: {'gamma': 0.3},
  mean: 0.98330, std: 0.00028, params: {'gamma': 0.4}],
 {'gamma': 0.2},
 0.9833046093384997)

### 2.1.3 Subsample and Colsample_bytree

In [46]:
param_test4 = {
    'subsample': [i/10.0 for i in range(6,10)],
    'colsample_bytree': [i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        max_depth=10, min_child_weight=5, gamma=0.2,
                        n_estimators=261, n_jobs=-1, seed=420), 
                        param_grid = param_test4, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch4.fit(X, Y)

Fitting 4 folds for each of 16 candidates, totalling 64 fits
[CV] colsample_bytree=0.6, subsample=0.6 .............................
[CV]  colsample_bytree=0.6, subsample=0.6, score=0.982647087300539, total=  29.1s
[CV] colsample_bytree=0.6, subsample=0.6 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.6s remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.6, score=0.9827103490338155, total=  27.8s
[CV] colsample_bytree=0.6, subsample=0.6 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   58.1s remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.6, score=0.9830936203191591, total=  32.8s
[CV] colsample_bytree=0.6, subsample=0.6 .............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.5min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.6, score=0.9822252058645683, total=  33.0s
[CV] colsample_bytree=0.6, subsample=0.7 .............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.1min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.7, score=0.9828910193394014, total=  30.6s
[CV] colsample_bytree=0.6, subsample=0.7 .............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.6min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.7, score=0.9829356213151115, total=  31.1s
[CV] colsample_bytree=0.6, subsample=0.7 .............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.1min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.7, score=0.9832063645148635, total=  31.0s
[CV] colsample_bytree=0.6, subsample=0.7 .............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.7min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.7, score=0.9824204986134618, total=  28.0s
[CV] colsample_bytree=0.6, subsample=0.8 .............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.1min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.8, score=0.9826937603314985, total=  28.4s
[CV] colsample_bytree=0.6, subsample=0.8 .............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.6min remaining:    0.0s


[CV]  colsample_bytree=0.6, subsample=0.8, score=0.9830470643478443, total=  29.0s
[CV] colsample_bytree=0.6, subsample=0.8 .............................
[CV]  colsample_bytree=0.6, subsample=0.8, score=0.9833456362269345, total=  31.7s
[CV] colsample_bytree=0.6, subsample=0.8 .............................
[CV]  colsample_bytree=0.6, subsample=0.8, score=0.9822438236127076, total=  28.6s
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV]  colsample_bytree=0.6, subsample=0.9, score=0.9829353399217352, total=  29.6s
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV]  colsample_bytree=0.6, subsample=0.9, score=0.9829742937695973, total=  27.9s
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV]  colsample_bytree=0.6, subsample=0.9, score=0.98317777945013, total=  29.4s
[CV] colsample_bytree=0.6, subsample=0.9 .............................
[CV]  colsample_bytree=0.6, subsample=0.9, score=0.9823974841951753, total=  2

[CV]  colsample_bytree=0.9, subsample=0.9, score=0.9831150982058074, total=  36.3s


[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed: 36.0min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=5, missing=None, n_estimators=261,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [47]:
(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)



([mean: 0.98267, std: 0.00031, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.98286, std: 0.00028, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.98283, std: 0.00041, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.98287, std: 0.00029, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.98301, std: 0.00035, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.98308, std: 0.00029, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.98320, std: 0.00034, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.98321, std: 0.00031, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.98316, std: 0.00024, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.98325, std: 0.00026, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.98330, std: 0.00029, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.98333, std: 0.00019, params: {'colsample_bytree': 0.8, 'subsample'

In [58]:
param_test5 = {
    'subsample':[i/100.0 for i in range(82,93,5)],
    'colsample_bytree':[i/100.0 for i in range(82,93,5)]
}
gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        max_depth=10, min_child_weight=5, gamma=0.2,
                        n_estimators=261, n_jobs=-1, seed=420), 
                        param_grid = param_test5, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch5.fit(X, Y)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] colsample_bytree=0.82, subsample=0.82 ...........................
[CV]  colsample_bytree=0.82, subsample=0.82, score=0.9831901450006562, total=  37.1s
[CV] colsample_bytree=0.82, subsample=0.82 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.7s remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.82, score=0.9833833519439409, total=  34.7s
[CV] colsample_bytree=0.82, subsample=0.82 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.82, score=0.9836860659335295, total=  37.5s
[CV] colsample_bytree=0.82, subsample=0.82 ...........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.82, score=0.9829935065256671, total=  35.9s
[CV] colsample_bytree=0.82, subsample=0.87 ...........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.5min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.87, score=0.9832919553753326, total=  30.8s
[CV] colsample_bytree=0.82, subsample=0.87 ...........................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.87, score=0.9834003255923967, total=  33.5s
[CV] colsample_bytree=0.82, subsample=0.87 ...........................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.6min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.87, score=0.9836480868323231, total=  36.9s
[CV] colsample_bytree=0.82, subsample=0.87 ...........................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.2min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.87, score=0.9831192089685767, total=  37.2s
[CV] colsample_bytree=0.82, subsample=0.92 ...........................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.8min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.92, score=0.9832111999786411, total=  37.1s
[CV] colsample_bytree=0.82, subsample=0.92 ...........................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.4min remaining:    0.0s


[CV]  colsample_bytree=0.82, subsample=0.92, score=0.9833872374236803, total=  36.5s
[CV] colsample_bytree=0.82, subsample=0.92 ...........................
[CV]  colsample_bytree=0.82, subsample=0.92, score=0.9835957251528693, total=  36.6s
[CV] colsample_bytree=0.82, subsample=0.92 ...........................
[CV]  colsample_bytree=0.82, subsample=0.92, score=0.9830280675497827, total=  36.1s
[CV] colsample_bytree=0.87, subsample=0.82 ...........................
[CV]  colsample_bytree=0.87, subsample=0.82, score=0.983149826957706, total=  35.8s
[CV] colsample_bytree=0.87, subsample=0.82 ...........................
[CV]  colsample_bytree=0.87, subsample=0.82, score=0.983328052517639, total=  33.7s
[CV] colsample_bytree=0.87, subsample=0.82 ...........................
[CV]  colsample_bytree=0.87, subsample=0.82, score=0.9837107407559068, total=  34.2s
[CV] colsample_bytree=0.87, subsample=0.82 ...........................
[CV]  colsample_bytree=0.87, subsample=0.82, score=0.9831166920831

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 22.9min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=5, missing=None, n_estimators=261,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.82, 0.87, 0.92], 'colsample_bytree': [0.82, 0.87, 0.92]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [59]:
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)



([mean: 0.98331, std: 0.00026, params: {'colsample_bytree': 0.82, 'subsample': 0.82},
  mean: 0.98336, std: 0.00019, params: {'colsample_bytree': 0.82, 'subsample': 0.87},
  mean: 0.98331, std: 0.00021, params: {'colsample_bytree': 0.82, 'subsample': 0.92},
  mean: 0.98333, std: 0.00024, params: {'colsample_bytree': 0.87, 'subsample': 0.82},
  mean: 0.98329, std: 0.00024, params: {'colsample_bytree': 0.87, 'subsample': 0.87},
  mean: 0.98326, std: 0.00018, params: {'colsample_bytree': 0.87, 'subsample': 0.92},
  mean: 0.98330, std: 0.00022, params: {'colsample_bytree': 0.92, 'subsample': 0.82},
  mean: 0.98329, std: 0.00021, params: {'colsample_bytree': 0.92, 'subsample': 0.87},
  mean: 0.98327, std: 0.00017, params: {'colsample_bytree': 0.92, 'subsample': 0.92}],
 {'colsample_bytree': 0.82, 'subsample': 0.87},
 0.9833648960247675)

In [61]:
param_test5 = {
    'subsample': [0.85],
    'colsample_bytree': [0.85]
}
gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        max_depth=10, min_child_weight=5, gamma=0.2,
                        n_estimators=261, n_jobs=-1, seed=420), 
                        param_grid = param_test5, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch5.fit(X, Y)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] colsample_bytree=0.85, subsample=0.85 ...........................
[CV]  colsample_bytree=0.85, subsample=0.85, score=0.9831339811339023, total=  38.7s
[CV] colsample_bytree=0.85, subsample=0.85 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   39.3s remaining:    0.0s


[CV]  colsample_bytree=0.85, subsample=0.85, score=0.9834318866734775, total=  38.2s
[CV] colsample_bytree=0.85, subsample=0.85 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  colsample_bytree=0.85, subsample=0.85, score=0.9837344655942456, total=  37.3s
[CV] colsample_bytree=0.85, subsample=0.85 ...........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  colsample_bytree=0.85, subsample=0.85, score=0.983075057665492, total=  37.5s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.2, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=5, missing=None, n_estimators=261,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.85], 'colsample_bytree': [0.85]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [62]:
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)



([mean: 0.98334, std: 0.00026, params: {'colsample_bytree': 0.85, 'subsample': 0.85}],
 {'colsample_bytree': 0.85, 'subsample': 0.85},
 0.983343849771733)

### 2.1.4 Reg_alpha

In [68]:
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        max_depth=10, min_child_weight=5, gamma=0.2,
                        subsample=0.87, colsample_bytree=0.82,
                        n_estimators=261, n_jobs=-1, seed=420), 
                        param_grid = param_test6, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch6.fit(X,Y)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] reg_alpha=1e-05 .................................................
[CV] ........ reg_alpha=1e-05, score=0.9833101198805568, total=  39.8s
[CV] reg_alpha=1e-05 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.4s remaining:    0.0s


[CV] ........ reg_alpha=1e-05, score=0.9834003255923968, total=  40.5s
[CV] reg_alpha=1e-05 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s


[CV] .......... reg_alpha=1e-05, score=0.98364808908347, total=  41.5s
[CV] reg_alpha=1e-05 .................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.1min remaining:    0.0s


[CV] ........ reg_alpha=1e-05, score=0.9831177906878841, total=  39.8s
[CV] reg_alpha=0.01 ..................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.7min remaining:    0.0s


[CV] .......... reg_alpha=0.01, score=0.983290883829356, total=  36.4s
[CV] reg_alpha=0.01 ..................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.4min remaining:    0.0s


[CV] ......... reg_alpha=0.01, score=0.9833002170848595, total=  38.0s
[CV] reg_alpha=0.01 ..................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.0min remaining:    0.0s


[CV] ......... reg_alpha=0.01, score=0.9836515716078946, total=  32.4s
[CV] reg_alpha=0.01 ..................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.5min remaining:    0.0s


[CV] ......... reg_alpha=0.01, score=0.9830516920538911, total=  38.3s
[CV] reg_alpha=0.1 ...................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.2min remaining:    0.0s


[CV] .......... reg_alpha=0.1, score=0.9832918293111002, total=  39.3s
[CV] reg_alpha=0.1 ...................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.9min remaining:    0.0s


[CV] .......... reg_alpha=0.1, score=0.9834333859373862, total=  32.8s
[CV] reg_alpha=0.1 ...................................................
[CV] .......... reg_alpha=0.1, score=0.9836747944404503, total=  35.4s
[CV] reg_alpha=0.1 ...................................................
[CV] .......... reg_alpha=0.1, score=0.9830604156057703, total=  36.4s
[CV] reg_alpha=1 .....................................................
[CV] ............. reg_alpha=1, score=0.983314608667695, total=  41.2s
[CV] reg_alpha=1 .....................................................
[CV] ............. reg_alpha=1, score=0.983287345026256, total=  34.7s
[CV] reg_alpha=1 .....................................................
[CV] ............ reg_alpha=1, score=0.9836050921755781, total=  37.5s
[CV] reg_alpha=1 .....................................................
[CV] ............ reg_alpha=1, score=0.9831802490680995, total=  37.1s
[CV] reg_alpha=100 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 12.3min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.82, gamma=0.2, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=5, missing=None,
       n_estimators=261, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.87),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [1e-05, 0.01, 0.1, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [69]:
(gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_)



([mean: 0.98337, std: 0.00019, params: {'reg_alpha': 1e-05},
  mean: 0.98332, std: 0.00021, params: {'reg_alpha': 0.01},
  mean: 0.98337, std: 0.00022, params: {'reg_alpha': 0.1},
  mean: 0.98335, std: 0.00016, params: {'reg_alpha': 1},
  mean: 0.98059, std: 0.00050, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.983369083185499)

In [74]:
param_test7 = {
    'reg_alpha': [1e-4, 1e-3, 0.05]
}

gsearch7 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, 
                        max_depth=10, min_child_weight=5, gamma=0.2,
                        subsample=0.87, colsample_bytree=0.82,
                        n_estimators=261, n_jobs=-1, seed=420), 
                        param_grid = param_test7, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch7.fit(X,Y)

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV] reg_alpha=0.0001 ................................................
[CV] ....... reg_alpha=0.0001, score=0.9833425071325905, total=  35.1s
[CV] reg_alpha=0.0001 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.8s remaining:    0.0s


[CV] ....... reg_alpha=0.0001, score=0.9834439393145694, total=  35.2s
[CV] reg_alpha=0.0001 ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV] ....... reg_alpha=0.0001, score=0.9836936522989536, total=  37.2s
[CV] reg_alpha=0.0001 ................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s


[CV] ....... reg_alpha=0.0001, score=0.9830220905097212, total=  37.0s
[CV] reg_alpha=0.001 .................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.4min remaining:    0.0s


[CV] ........ reg_alpha=0.001, score=0.9832600633756412, total=  38.4s
[CV] reg_alpha=0.001 .................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.1min remaining:    0.0s


[CV] ........ reg_alpha=0.001, score=0.9834023178575004, total=  35.7s
[CV] reg_alpha=0.001 .................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.7min remaining:    0.0s


[CV] ........ reg_alpha=0.001, score=0.9836558127688616, total=  37.6s
[CV] reg_alpha=0.001 .................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.3min remaining:    0.0s


[CV] ........ reg_alpha=0.001, score=0.9830635493307291, total=  36.2s
[CV] reg_alpha=0.05 ..................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.0min remaining:    0.0s


[CV] .......... reg_alpha=0.05, score=0.983285751214173, total=  32.6s
[CV] reg_alpha=0.05 ..................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.5min remaining:    0.0s


[CV] ......... reg_alpha=0.05, score=0.9834113517104519, total=  33.0s
[CV] reg_alpha=0.05 ..................................................
[CV] ......... reg_alpha=0.05, score=0.9836906672780181, total=  37.0s
[CV] reg_alpha=0.05 ..................................................
[CV] ......... reg_alpha=0.05, score=0.9831482184368384, total=  38.1s


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  7.3min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.82, gamma=0.2, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=5, missing=None,
       n_estimators=261, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.87),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [0.0001, 0.001, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [75]:
(gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_)



([mean: 0.98338, std: 0.00024, params: {'reg_alpha': 0.0001},
  mean: 0.98335, std: 0.00022, params: {'reg_alpha': 0.001},
  mean: 0.98338, std: 0.00020, params: {'reg_alpha': 0.05}],
 {'reg_alpha': 0.05},
 0.9833839989185863)

### 2.1.5 Learning Rate

In [21]:
# test final model
# [65]	validation_0-auc:0.995769	validation_1-auc:0.983838 (lr=0.1)
# [210]	validation_0-auc:0.997505	validation_1-auc:0.98399
# [255]	validation_0-auc:0.996459	validation_1-auc:0.984336 (lr=0.03)
# [1288]	validation_0-auc:0.998344	validation_1-auc:0.984239 (lr=0.01)
models = xgb.XGBClassifier(learning_rate=0.03, reg_alpha=0.05,
                           max_depth=10, min_child_weight=5, gamma=0.2,
                           subsample=0.87, colsample_bytree=0.82,
                           n_estimators=20000, n_jobs=-1, seed=420)
models.fit(train_x, train_y, eval_metric=['auc'], early_stopping_rounds=100,
           eval_set=[(train_x, train_y), (test_x, test_y)])

[0]	validation_0-auc:0.982544	validation_1-auc:0.977668
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.980008	validation_1-auc:0.972932
[2]	validation_0-auc:0.983344	validation_1-auc:0.976778
[3]	validation_0-auc:0.981828	validation_1-auc:0.974417
[4]	validation_0-auc:0.980049	validation_1-auc:0.972135
[5]	validation_0-auc:0.982203	validation_1-auc:0.974491
[6]	validation_0-auc:0.983639	validation_1-auc:0.976311
[7]	validation_0-auc:0.984593	validation_1-auc:0.97727
[8]	validation_0-auc:0.98523	validation_1-auc:0.977719
[9]	validation_0-auc:0.985783	validation_1-auc:0.978263
[10]	validation_0-auc:0.98619	validation_1-auc:0.978875
[11]	validation_0-auc:0.986527	validation_1-auc:0.979253
[12]	validation_0-auc:0.986867	validation_1-auc:0.979573
[13]	validation_0-auc:0.987046	validation_1-auc:0.979695
[14]	validation_0-auc:0.986842	validation_1-auc:0.979253


[141]	validation_0-auc:0.994381	validation_1-auc:0.983817
[142]	validation_0-auc:0.994397	validation_1-auc:0.983828
[143]	validation_0-auc:0.994428	validation_1-auc:0.983845
[144]	validation_0-auc:0.99447	validation_1-auc:0.98386
[145]	validation_0-auc:0.994491	validation_1-auc:0.983867
[146]	validation_0-auc:0.994523	validation_1-auc:0.983876
[147]	validation_0-auc:0.99456	validation_1-auc:0.983881
[148]	validation_0-auc:0.994589	validation_1-auc:0.983913
[149]	validation_0-auc:0.994608	validation_1-auc:0.983916
[150]	validation_0-auc:0.99464	validation_1-auc:0.983929
[151]	validation_0-auc:0.994667	validation_1-auc:0.983938
[152]	validation_0-auc:0.994686	validation_1-auc:0.983937
[153]	validation_0-auc:0.994715	validation_1-auc:0.98394
[154]	validation_0-auc:0.994749	validation_1-auc:0.98395
[155]	validation_0-auc:0.994778	validation_1-auc:0.983957
[156]	validation_0-auc:0.9948	validation_1-auc:0.983979
[157]	validation_0-auc:0.994834	validation_1-auc:0.983993
[158]	validation_0-auc

[283]	validation_0-auc:0.996819	validation_1-auc:0.984316
[284]	validation_0-auc:0.996839	validation_1-auc:0.984319
[285]	validation_0-auc:0.996866	validation_1-auc:0.984319
[286]	validation_0-auc:0.996872	validation_1-auc:0.984319
[287]	validation_0-auc:0.996885	validation_1-auc:0.984317
[288]	validation_0-auc:0.996908	validation_1-auc:0.984324
[289]	validation_0-auc:0.996917	validation_1-auc:0.984327
[290]	validation_0-auc:0.996921	validation_1-auc:0.984328
[291]	validation_0-auc:0.996933	validation_1-auc:0.984327
[292]	validation_0-auc:0.996973	validation_1-auc:0.984322
[293]	validation_0-auc:0.996978	validation_1-auc:0.984322
[294]	validation_0-auc:0.996984	validation_1-auc:0.984324
[295]	validation_0-auc:0.996994	validation_1-auc:0.984326
[296]	validation_0-auc:0.997006	validation_1-auc:0.984328
[297]	validation_0-auc:0.997017	validation_1-auc:0.98433
[298]	validation_0-auc:0.997046	validation_1-auc:0.98433
[299]	validation_0-auc:0.99706	validation_1-auc:0.984326
[300]	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.82, gamma=0.2, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=5, missing=None,
       n_estimators=20000, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.05,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.87)

In [22]:
pred_y = models.predict(test_x)
(pred_y==test_y).sum()/test_y.count()

  if diff:


0.9425278782680043

In [81]:
param_test = {
    'subsample': [0.87]
}

gsearch = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.03, reg_alpha=0.05,
                           max_depth=10, min_child_weight=5, gamma=0.2,
                           subsample=0.87, colsample_bytree=0.82,
                           n_estimators=256, n_jobs=-1, seed=420),
                        param_grid = param_test, 
                        scoring='roc_auc', cv=4, verbose=10)
gsearch.fit(X,Y)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV] subsample=0.87 ..................................................
[CV] ......... subsample=0.87, score=0.9833011648177508, total=  34.9s
[CV] subsample=0.87 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.5s remaining:    0.0s


[CV] ......... subsample=0.87, score=0.9833985471862587, total=  35.8s
[CV] subsample=0.87 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV] ......... subsample=0.87, score=0.9837048450018876, total=  35.9s
[CV] subsample=0.87 ..................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s


[CV] ......... subsample=0.87, score=0.9831469172205841, total=  33.1s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.4min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.82, gamma=0.2, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=5, missing=None,
       n_estimators=255, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.05,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.87),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'subsample': [0.87]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc',
       verbose=10)

In [82]:
(gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_)



([mean: 0.98339, std: 0.00020, params: {'subsample': 0.87}],
 {'subsample': 0.87},
 0.9833878703539197)

## 2.2 Final Model

In [19]:
models2 = xgb.XGBClassifier(learning_rate=0.03, reg_alpha=0.05,
                           max_depth=10, min_child_weight=5, gamma=0.2,
                           subsample=0.87, colsample_bytree=0.82,
                           n_estimators=256, n_jobs=-1, seed=420)

In [20]:
models2.fit(X, Y, eval_metric='auc', eval_set=[(X, Y)])


[0]	validation_0-auc:0.9448
[1]	validation_0-auc:0.975744
[2]	validation_0-auc:0.981692
[3]	validation_0-auc:0.983878
[4]	validation_0-auc:0.984974
[5]	validation_0-auc:0.985543
[6]	validation_0-auc:0.985405
[7]	validation_0-auc:0.985128
[8]	validation_0-auc:0.985685
[9]	validation_0-auc:0.98607
[10]	validation_0-auc:0.986353
[11]	validation_0-auc:0.986703
[12]	validation_0-auc:0.98695
[13]	validation_0-auc:0.987119
[14]	validation_0-auc:0.987326
[15]	validation_0-auc:0.987086
[16]	validation_0-auc:0.987245
[17]	validation_0-auc:0.987426
[18]	validation_0-auc:0.987615
[19]	validation_0-auc:0.987743
[20]	validation_0-auc:0.98787
[21]	validation_0-auc:0.988038
[22]	validation_0-auc:0.988158
[23]	validation_0-auc:0.988276
[24]	validation_0-auc:0.98835
[25]	validation_0-auc:0.988488
[26]	validation_0-auc:0.988528
[27]	validation_0-auc:0.988623
[28]	validation_0-auc:0.988677
[29]	validation_0-auc:0.988758
[30]	validation_0-auc:0.988855
[31]	validation_0-auc:0.988814
[32]	validation_0-auc:0.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.82, gamma=0.2, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=5, missing=None,
       n_estimators=256, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0.05,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.87)

In [106]:
sorted(zip(X.columns.values, models2.feature_importances_), key=lambda x: x[1], reverse=True)


[('y_entry_19', 0.11676897),
 ('x_entry_19', 0.041110184),
 ('duration_19', 0.025248382),
 ('entry_hour_19', 0.02436599),
 ('y_entry_2', 0.019651614),
 ('y_entry_18', 0.01964161),
 ('y_exit_18', 0.014286882),
 ('y_exit_1', 0.014022528),
 ('vmax_9', 0.010110569),
 ('x_entry_2', 0.009191091),
 ('x_exit_18', 0.008180014),
 ('x_exit_2', 0.007038316),
 ('y_entry_3', 0.00698125),
 ('y_exit_2', 0.0068428223),
 ('x_entry_3', 0.0062673013),
 ('x_entry_12', 0.0062101083),
 ('x_entry_18', 0.0061816396),
 ('x_entry_4', 0.0061089606),
 ('y_entry_0', 0.0059725656),
 ('y_entry_8', 0.0059519703),
 ('x_exit_6', 0.005729021),
 ('y_exit_0', 0.005675687),
 ('x_entry_7', 0.0053745396),
 ('x_entry_8', 0.0053210724),
 ('entry_minute_19', 0.0052934885),
 ('y_exit_9', 0.0052486104),
 ('x_entry_1', 0.0052465196),
 ('x_exit_11', 0.005195101),
 ('speed_5', 0.005096183),
 ('x_exit_5', 0.0046734386),
 ('y_entry_1', 0.0044644587),
 ('y_entry_6', 0.0043883733),
 ('y_entry_12', 0.0043539233),
 ('y_entry_7', 0.00421940

In [21]:
pickle.dump(models2, open("models/final.pickle", "wb"))

# 3. Make Prediction

In [22]:
models2 = pickle.load(open("models/final.pickle", "rb"))

In [23]:
test2 = pd.read_csv("data/interim/data_test_concat.csv", index_col=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
test2.shape

(33515, 381)

In [25]:
tname = test2['trajectory_id_19']

In [26]:
tX = test2.drop([x+str(y) for y in range(20) for x in ['Unnamed: 0_', 'hash_', 'trajectory_id_', 'time_entry_', 'time_exit_']] \
    + ['x_exit_19', 'y_exit_19','vmax_19', 'vmin_19', 'vmean_19'] + ['Unnamed: 0'], axis=1)
# X = X.fillna(0)

In [27]:
# extrapolation
for i in range(18,-1,-1):
    cur = str(i)
    nex = str(i+1)
    nan = tX['x_entry_'+cur].isnull()
    for x,y in zip(['x_entry_'+cur, 'y_entry_'+cur, 'entry_hour_'+cur, 'entry_minute_'+cur, 'entry_second_'+cur],
                   ['x_entry_'+nex, 'y_entry_'+nex, 'entry_hour_'+nex, 'entry_minute_'+nex, 'entry_second_'+nex]):
        tX.iloc[np.where(nan)[0], tX.columns.get_loc(x)] = tX[y][nan]
    for x,y in zip(['x_exit_'+cur, 'y_exit_'+cur, 'exit_hour_'+cur, 'exit_minute_'+cur, 'exit_second_'+cur],
                   ['x_entry_'+nex, 'y_entry_'+nex, 'entry_hour_'+nex, 'entry_minute_'+nex, 'entry_second_'+nex]):
        tX.iloc[np.where(nan)[0], tX.columns.get_loc(x)] = tX[y][nan]

In [28]:
# space and velocity inference
for i in range(19):
    tX['distance_'+str(i)] = ((tX['x_entry_'+str(i)]-tX['x_exit_'+str(i)]).pow(2)+(tX['y_entry_'+str(i)]-tX['y_exit_'+str(i)]).pow(2)).pow(0.5)
    tX['speed_'+str(i)] = tX['distance_'+str(i)]/tX['duration_'+str(i)]
tX = tX.replace([np.inf, -np.inf], np.nan)
tX = tX.fillna(0)

In [29]:
# time feature
for i in range(20):
    tX['entry_hour_quarter_'+str(i)] = tX['entry_hour_'+str(1)]*4 + np.ceil(tX['entry_minute_'+str(1)]/15)
    tX['exit_hour_quarter_'+str(i)] = tX['exit_hour_'+str(1)]*4 + np.ceil(tX['exit_minute_'+str(1)]/15)
    

In [30]:
sortcol = sorted(tX.columns.tolist(), key=lambda x: (int(x[x.rfind("_")+1:]), x[:x.rfind("_")]))
tX = tX[sortcol]

In [31]:
tX.columns.tolist()

['distance_0',
 'duration_0',
 'entry_hour_0',
 'entry_hour_quarter_0',
 'entry_minute_0',
 'entry_second_0',
 'exit_hour_0',
 'exit_hour_quarter_0',
 'exit_minute_0',
 'exit_second_0',
 'speed_0',
 'vmax_0',
 'vmean_0',
 'vmin_0',
 'x_entry_0',
 'x_exit_0',
 'y_entry_0',
 'y_exit_0',
 'distance_1',
 'duration_1',
 'entry_hour_1',
 'entry_hour_quarter_1',
 'entry_minute_1',
 'entry_second_1',
 'exit_hour_1',
 'exit_hour_quarter_1',
 'exit_minute_1',
 'exit_second_1',
 'speed_1',
 'vmax_1',
 'vmean_1',
 'vmin_1',
 'x_entry_1',
 'x_exit_1',
 'y_entry_1',
 'y_exit_1',
 'distance_2',
 'duration_2',
 'entry_hour_2',
 'entry_hour_quarter_2',
 'entry_minute_2',
 'entry_second_2',
 'exit_hour_2',
 'exit_hour_quarter_2',
 'exit_minute_2',
 'exit_second_2',
 'speed_2',
 'vmax_2',
 'vmean_2',
 'vmin_2',
 'x_entry_2',
 'x_exit_2',
 'y_entry_2',
 'y_exit_2',
 'distance_3',
 'duration_3',
 'entry_hour_3',
 'entry_hour_quarter_3',
 'entry_minute_3',
 'entry_second_3',
 'exit_hour_3',
 'exit_hour_quar

In [32]:
tX[:] = scaler.transform(tX)

In [33]:
(X.shape, tX.shape)

((134063, 353), (33515, 353))

In [34]:
tX[:5]

Unnamed: 0,distance_0,duration_0,entry_hour_0,entry_hour_quarter_0,entry_minute_0,entry_second_0,exit_hour_0,exit_hour_quarter_0,exit_minute_0,exit_second_0,...,entry_hour_19,entry_hour_quarter_19,entry_minute_19,entry_second_19,exit_hour_19,exit_hour_quarter_19,exit_minute_19,exit_second_19,x_entry_19,y_entry_19
0,-0.014382,-0.019108,1.067279,1.100958,0.828158,-0.419023,1.067261,1.100934,0.828093,-0.419057,...,0.394396,1.100958,-1.303436,0.458538,-0.013928,1.100934,-0.880061,0.423521,1.384915,1.01214
1,-0.014382,-0.019108,0.318053,0.230919,-1.179225,-0.065709,0.318028,0.230835,-1.179252,-0.065751,...,0.394396,0.230919,0.14416,-0.903092,-0.013928,0.230835,0.296528,1.84045,1.041301,-1.567424
2,-0.014382,-0.019108,0.318053,0.35521,0.942866,-1.125652,0.318028,0.355135,0.942799,-1.125671,...,0.394396,0.35521,-0.02287,-0.962294,-0.013928,0.355135,0.06121,-0.993409,-0.32755,-0.218088
3,-0.014382,-0.019108,0.567795,0.479502,-1.064517,-1.243423,0.567773,0.479435,-1.064546,-1.24344,...,0.394396,0.479502,0.478221,-0.370281,-0.013928,0.479435,1.002482,-1.170525,0.004736,-1.746494
4,-0.014382,-0.019108,0.318053,0.417356,1.114927,0.817577,0.318028,0.417285,1.114857,0.817516,...,-2.440666,0.417356,1.53608,-1.021495,-0.013928,0.417285,-1.174208,-0.639176,0.640926,0.680892


In [35]:
pred_y = models2.predict(tX)
print(pred_y.sum(), pred_y.shape)

  if diff:


In [40]:
output = pd.DataFrame({'id': tname, 'target': pred_y.astype(int)})
output.to_csv('data/submissions/submission10.csv', index=False, columns=['id', 'target'])