# Fine-tuning XGBoost models

Based on the Analytics Vidhya tutorial "Complete Guide to Parameter Tuning in XGBoost (with codes in Python)": http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import copy
import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
import sklearn
import time
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling for the whole notebook: white-grid theme, plus context overrides
# requesting a 16x10 figure size and 14pt axes titles.
sns.set_style("whitegrid")
sns.set_context(rc={"figure.figsize": (16, 10), "axes.titlesize": 14})

from IPython.display import Image, display
from IPython.core.display import HTML
# Widen the notebook container to the full browser width.
# A rich object is only rendered automatically when it is the *last* expression
# of a cell; in the middle of a cell it has to be handed to display()
# explicitly, otherwise the style override is silently discarded.
display(HTML("<style>.container { width:100% !important; }</style>"))

from os.path import expanduser
# Make the local `datsci` checkout in the user's home directory importable.
# Inserted at position 1 so that sys.path[0] is left untouched.
sys.path.insert(1, expanduser('~') + '/datsci')
from datsci import eda, munge, ml, ensemble
from datsci import kaggle as kg

In [2]:
import santander

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import SGDClassifier as SGDClf
from sklearn.cross_validation import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer

import xgboost as xgb

In [3]:
# Load the de-duplicated "NZ" train/test files and unpack the seven values
# returned by santander.read_split.
# NOTE(review): presumably X_train/X_test are a hold-out split of df_train and
# df_test is the unlabeled Kaggle test set — confirm in santander.read_split.
(X_train, y_train,
 X_test, y_test,
 feature_cols,
 df_train, df_test) = santander.read_split(santander.FILE_TRAIN_DEDUP_NZ,
                                           santander.FILE_TEST_DEDUP_NZ)

## Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

In [5]:
# Step 1 baseline: fixed learning rate with a generous round budget, so that
# cross-validated early stopping (50 rounds without improvement) decides how
# many estimators to keep.
# NOTE(review): a regressor wrapper with a logistic objective — presumably so
# that predict() yields scores usable by roc_auc_score; confirm.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=1000,
    # tree shape
    max_depth=5, min_child_weight=1, gamma=0,
    # row / column sampling
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1.0,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)
santander.cv_fit_xgb_model(
    model,
    X_train, y_train,
    X_test, y_test,
    cv_nfold=5,
    early_stopping_rounds=50,
)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 68



Model Report
best n_estimators: 69
AUC Score (Train): 0.881826
AUC Score (Test) : 0.843402


## Step 2: Tune max_depth and min_child_weight

In [10]:
%%time

# Step 2: grid-search max_depth and min_child_weight with everything else held
# at the baseline. n_estimators=69 is the "best n_estimators" value reported by
# the Step 1 CV run.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=69,
    # tree shape (the two parameters searched below start from these values)
    max_depth=5, min_child_weight=1, gamma=0,
    # row / column sampling
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1.0,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)

# Candidate values for this run; earlier grids are recorded in the results
# tables that follow this cell.
param_grid = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [4, 5, 6, 7],
}

# A single 5-fold search run scored with ROC AUC; returns the best score and
# the corresponding model.
best_score, best_model = ml.fine_tune_params(
    model,
    X_train, y_train,
    X_test, y_test,
    param_grid,
    n_runs=1,
    n_cv=5,
    scorer=roc_auc_score,
    n_jobs=1,
    gscv_kwargs={'iid': False},
)

iteration 0
Each iteration time(secs): 394.706
CPU times: user 23min 9s, sys: 24.2 s, total: 23min 33s
Wall time: 6min 34s


In [12]:
# AUC of the selected model on the training split, for comparison with
# best_score (recorded as "test score" in the tables below).
train_score = roc_auc_score(y_train, best_model.predict(X_train))

# One pipe-separated row, ready to paste into the markdown results table.
fine_tune_results = (
    best_model.max_depth,
    best_model.min_child_weight,
    train_score,
    best_score,
)
print(" | ".join("{}".format(value) for value in fine_tune_results))

5 | 5 | 0.8737533002769353 | 0.8456223716451572


FILE_TRAIN

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
5 | 5 |  | 0.8357748141861523 | {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]}
5 | 9 |  | 0.83552176985969 | {'max_depth': [4, 5, 6], 'min_child_weight': [8, 9, 10]}


FILE_TRAIN_DEDUP

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
7 | 5 | 0.8901070517971329 | 0.843010994860875 | {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]}


FILE_TRAIN_DEDUP_NZ

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
5 | 5 | 0.8737533002769353 | 0.8456223716451572 | {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]}
5 | 5 | 0.8737533002769353 | 0.8456223716451572 | {'max_depth': [4, 5, 6], 'min_child_weight': [4, 5, 6, 7]}


FILE_TRAIN_DEDUP_VAR3_DELTA1_1HOT

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
5 | 9 | 0.8664432060811502 | 0.843041822112627 | {'max_depth': [4, 5, 6], 'min_child_weight': [8, 9, 10]}


FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
6 | 6 | 0.8789324145479674 | 0.8429895974606881 | {'max_depth': [5, 6, 7], 'min_child_weight': [5, 6, 7]}
7 | 6 | 0.8879859317528758 | 0.8435746519822442 | {'max_depth': [6, 7, 8], 'min_child_weight': [4, 5, 6, 7]}

## Step 3: Tune gamma

In [15]:
%%time

# Step 3: grid-search gamma, with max_depth=5 / min_child_weight=5 carried over
# from Step 2 and n_estimators still at 69.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=69,
    # tree shape (gamma is the parameter searched below)
    max_depth=5, min_child_weight=5, gamma=0,
    # row / column sampling
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1.0,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)

# Fine grid centered on 0.2; the coarser grid tried earlier is listed in the
# results tables that follow this cell.
param_grid = {
    'gamma': [0.16, 0.18, 0.20, 0.22, 0.24],
}

# A single 5-fold search run scored with ROC AUC.
best_score, best_model = ml.fine_tune_params(
    model,
    X_train, y_train,
    X_test, y_test,
    param_grid,
    n_runs=1,
    n_cv=5,
    scorer=roc_auc_score,
    n_jobs=1,
    gscv_kwargs={'iid': False},
)

iteration 0
Each iteration time(secs): 168.140
CPU times: user 9min 51s, sys: 10.4 s, total: 10min 1s
Wall time: 2min 48s


In [17]:
# AUC of the selected model on the training split, for comparison with
# best_score (recorded as "test score" in the tables below).
train_score = roc_auc_score(y_train, best_model.predict(X_train))

# One pipe-separated row, ready to paste into the markdown results table.
fine_tune_results = (
    best_model.gamma,
    train_score,
    best_score,
)
print(" | ".join("{}".format(value) for value in fine_tune_results))

0.2 | 0.8733990019044412 | 0.8446681687141747


FILE_TRAIN

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.3 |  | 0.8360898605728038 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],}


FILE_TRAIN_DEDUP

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.0 | 0.8879859317528758 | 0.8435746519822442 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]}
0.01 | 0.8886836614252743 | 0.8432648510789971 | {'gamma': [0.0, 0.01, 0.02, 0.03, 0.04]}


FILE_TRAIN_DEDUP_NZ

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.2 | 0.8733990019044412 | 0.8446681687141747 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]}
0.2 | 0.8733990019044412 | 0.8446681687141747 | {'gamma': [0.16, 0.18, 0.20, 0.22, 0.24]}


FILE_TRAIN_DEDUP_VAR3_DELTA1_1HOT

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.0 | 0.8664432060811502 | 0.843041822112627 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],}


FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.0 | 0.8789324145479674 | 0.8429895974606881 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]}

In [18]:
# Re-calibrate the number of boosting rounds now that the tree parameters are
# tuned (gamma=0.2): the round budget goes back to 1000 and cross-validated
# early stopping picks the count again.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=1000,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1.0,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)
santander.cv_fit_xgb_model(
    model,
    X_train, y_train,
    X_test, y_test,
    cv_nfold=5,
    early_stopping_rounds=50,
)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 62



Model Report
best n_estimators: 63
AUC Score (Train): 0.871316
AUC Score (Test) : 0.844491


## Step 4: Tune subsample, colsample_bytree, and colsample_bylevel

In [22]:
%%time

# Step 4: jointly grid-search the three sampling ratios. n_estimators=63 is the
# "best n_estimators" value reported by the preceding CV run.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=63,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling (all three are searched below)
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1.0,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)

# 5 x 5 x 5 candidates; the recorded wall time for this cell is close to an
# hour (see the timing output below).
param_grid = {
    'subsample': [0.76, 0.78, 0.80, 0.82, 0.84],
    'colsample_bytree': [0.76, 0.78, 0.80, 0.82, 0.84],
    'colsample_bylevel': [0.86, 0.88, 0.90, 0.92, 0.94],
}

# A single 5-fold search run scored with ROC AUC.
best_score, best_model = ml.fine_tune_params(
    model,
    X_train, y_train,
    X_test, y_test,
    param_grid,
    n_runs=1,
    n_cv=5,
    scorer=roc_auc_score,
    n_jobs=1,
    gscv_kwargs={'iid': False},
)

iteration 0
Each iteration time(secs): 3514.352
CPU times: user 3h 22min 38s, sys: 3min 59s, total: 3h 26min 37s
Wall time: 58min 34s


In [24]:
# AUC of the selected model on the training split, for comparison with
# best_score (recorded as "test score" in the tables below).
train_score = roc_auc_score(y_train, best_model.predict(X_train))

# One pipe-separated row, ready to paste into the markdown results table.
fine_tune_results = (
    best_model.subsample,
    best_model.colsample_bytree,
    best_model.colsample_bylevel,
    train_score,
    best_score,
)
print(" | ".join("{}".format(value) for value in fine_tune_results))

0.84 | 0.8 | 0.9 | 0.8713303353642213 | 0.842270967794029


FILE_TRAIN

subsample | colsample_bytree | train score | test score | gridcv params
----------|------------------|-------------|------------|---------------
0.7 | 0.7 |  | 0.8364270758593435 | {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
0.7 | 0.7 |  | 0.8364270758593435 | {'subsample': [0.65, 0.7, 0.75], 'colsample_bytree': [0.65, 0.7, 0.75]}


FILE_TRAIN_DEDUP

subsample | colsample_bytree | colsample_bylevel | train score | test score | gridcv params
----------|------------------|-------------------|-------------|------------|---------------
0.8 | 0.7 | 0.8 | 0.8858305565754111 | 0.8443039515115303 | {'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8, 0.9], 'colsample_bylevel': [0.8, 0.9, 1.0]}
0.8 | 0.68 | 0.82 | 0.8861695721331364 | 0.8433246599846411 | {'subsample': [0.76, 0.78, 0.8, 0.82, 0.84], 'colsample_bytree': [0.66, 0.68, 0.7, 0.72, 0.74], 'colsample_bylevel': [0.76, 0.78, 0.8, 0.82, 0.84]}


FILE_TRAIN_DEDUP_NZ

subsample | colsample_bytree | colsample_bylevel | train score | test score | gridcv params
----------|------------------|-------------------|-------------|------------|---------------
0.8 | 0.8 | 0.9 | 0.8717281762412291 | 0.843654445252214 | {'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8, 0.9], 'colsample_bylevel': [0.8, 0.9, 1.0]}
0.84 | 0.8 | 0.9 | 0.8713303353642213 | 0.842270967794029 | {'subsample': [0.76, 0.78, 0.80, 0.82, 0.84], 'colsample_bytree': [0.76, 0.78, 0.80, 0.82, 0.84], 'colsample_bylevel': [0.86, 0.88, 0.90, 0.92, 0.94]}


FILE_TRAIN_DEDUP_VAR3_DELTA1_1HOT

subsample | colsample_bytree | train score | test score | gridcv params
----------|------------------|-------------|------------|---------------
0.8 | 0.7 | 0.8720600990462103 | 0.8450419887931837 | {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9],}


FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT

subsample | colsample_bytree | colsample_bylevel | train score | test score | gridcv params
----------|------------------|-------------------|-------------|------------|---------------
0.8 | 0.8 | 1.0 | 0.8735857141145673 | 0.8422202426903511 | {'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8, 0.9]}
0.76 | 0.82 | 1.0 | 0.8739140693391325 | 0.8422564048734166 | {'subsample': [0.76, 0.78, 0.8, 0.82, 0.84], 'colsample_bytree': [0.76, 0.78, 0.8, 0.82, 0.84]}
0.8 | 0.8 | 0.9 | 0.8735938935745703 | 0.8430431197986221 | {'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8, 0.9], 'colsample_bylevel': [0.8, 0.9, 1.0]}
0.76 | 0.8 | 0.88 | 0.8718073998509144 | 0.8441056362540015 | {'subsample': [0.74, 0.76, 0.78], 'colsample_bytree': [0.78, 0.8, 0.82], 'colsample_bylevel': [0.86, 0.88, 0.90]}


## Step 5: Tuning Regularization Parameters

In [25]:
%%time

# Step 5: grid-search the L1 (reg_alpha) and L2 (reg_lambda) penalties, using
# the sampling ratios selected in Step 4 (0.84 / 0.8 / 0.9).
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=63,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling
    subsample=0.84, colsample_bytree=0.8, colsample_bylevel=0.9,
    # regularization (both are searched below)
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)

# Coarse candidates spanning several orders of magnitude, zero included.
param_grid = {
    'reg_alpha': [1e-2, 0.1, 0, 1, 10],
    'reg_lambda': [1e-2, 0.1, 0, 1, 10],
}

# A single 5-fold search run scored with ROC AUC.
best_score, best_model = ml.fine_tune_params(
    model,
    X_train, y_train,
    X_test, y_test,
    param_grid,
    n_runs=1,
    n_cv=5,
    scorer=roc_auc_score,
    n_jobs=1,
    gscv_kwargs={'iid': False},
)

iteration 0
Each iteration time(secs): 684.254
CPU times: user 39min 22s, sys: 46.9 s, total: 40min 9s
Wall time: 11min 24s


In [26]:
# AUC of the selected model on the training split, for comparison with
# best_score (recorded as "test score" in the tables below).
train_score = roc_auc_score(y_train, best_model.predict(X_train))

# One pipe-separated row, ready to paste into the markdown results table.
fine_tune_results = (
    best_model.reg_alpha,
    best_model.reg_lambda,
    train_score,
    best_score,
)
print(" | ".join("{}".format(value) for value in fine_tune_results))

0 | 1 | 0.8713303353642213 | 0.842270967794029


FILE_TRAIN

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | | 0.8364270758593435 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10],}
0.01 | 0.99 | | 0.8362743989494978 | {'reg_alpha': [0, 0.01, 0.02, 0.03, 0.05], 'reg_lambda': [0.98, 0.99, 1, 1.01, 1.02],}


FILE_TRAIN_DEDUP

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | 0.8861695721331364 | 0.8433246599846411 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10]}


FILE_TRAIN_DEDUP_NZ

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | 0.8713303353642213 | 0.842270967794029 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10]}


FILE_TRAIN_DEDUP_VAR3_DELTA1_1HOT

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | 0.8720600990462103 | 0.8450419887931837 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10],}


FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | 0.8739140693391325 | 0.8422564048734166 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10]}
0.1 | 0 | 0.8750670975469783 | 0.8437352478335133 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10]}
0.1 | 0 | 0.8750670975469783 | 0.8437352478335133 | {'reg_alpha': [0.06, 0.08, 0.1, 0.2, 0.3], 'reg_lambda': [0, 0.01, 0.02, 0.03, 0.04]}

In [27]:
# Re-calibrate the number of boosting rounds with the tuned sampling and
# regularization settings: the round budget goes back to 1000 and
# cross-validated early stopping picks the count again.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=1000,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling
    subsample=0.84, colsample_bytree=0.8, colsample_bylevel=0.9,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)
santander.cv_fit_xgb_model(
    model,
    X_train, y_train,
    X_test, y_test,
    cv_nfold=5,
    early_stopping_rounds=50,
)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 72



Model Report
best n_estimators: 73
AUC Score (Train): 0.874694
AUC Score (Test) : 0.842927


## Step 6: Handle Imbalanced Data Set

In [31]:
%%time

# Step 6: grid-search the class-imbalance controls (scale_pos_weight and
# max_delta_step). n_estimators=73 is the "best n_estimators" value reported by
# the preceding CV run.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=73,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling
    subsample=0.84, colsample_bytree=0.8, colsample_bylevel=0.9,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls (both are searched below)
    scale_pos_weight=1, max_delta_step=0,
)

# This run covers weights <= 1; the [1, 10, 100, 1000] grid is recorded in the
# results tables that follow this cell.
param_grid = {
    'scale_pos_weight': [.1, .5, .8, 1],
    'max_delta_step': [0, .5, 1, 1.5, 2],
}

# A single 5-fold search run scored with ROC AUC.
best_score, best_model = ml.fine_tune_params(
    model,
    X_train, y_train,
    X_test, y_test,
    param_grid,
    n_runs=1,
    n_cv=5,
    scorer=roc_auc_score,
    n_jobs=1,
    gscv_kwargs={'iid': False},
)

iteration 0
Each iteration time(secs): 554.303
CPU times: user 31min 54s, sys: 37.8 s, total: 32min 32s
Wall time: 9min 14s


In [33]:
# AUC of the selected model on the training split, for comparison with
# best_score (recorded as "test score" in the tables below).
train_score = roc_auc_score(y_train, best_model.predict(X_train))

# One pipe-separated row, ready to paste into the markdown results table.
fine_tune_results = (
    best_model.scale_pos_weight,
    best_model.max_delta_step,
    train_score,
    best_score,
)
print(" | ".join("{}".format(value) for value in fine_tune_results))

1 | 0 | 0.8746939752843542 | 0.8429269336458545


FILE_TRAIN_DEDUP

scale_pos_weight | max_delta_step | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
1 | 0 | 0.8861695721331364 | 0.8433246599846411 | {'scale_pos_weight': [1, 10, 100, 1000], 'max_delta_step': [0, .5, 1, 1.5, 2]}
1 | 0 | 0.8861695721331364 | 0.8433246599846411 | {'scale_pos_weight': [.1, .5, .8, 1], 'max_delta_step': [0, .5, 1, 1.5, 2]}


FILE_TRAIN_DEDUP_NZ

scale_pos_weight | max_delta_step | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
1 | 0 | 0.8746939752843542 | 0.8429269336458545 | {'scale_pos_weight': [1, 10, 100, 1000], 'max_delta_step': [0, .5, 1, 1.5, 2]}
1 | 0 | 0.8746939752843542 | 0.8429269336458545 | {'scale_pos_weight': [.1, .5, .8, 1], 'max_delta_step': [0, .5, 1, 1.5, 2]}


FILE_TRAIN_DEDUP_VAR3_DELTANAN_1HOT

scale_pos_weight | max_delta_step | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
1 | 0 | 0.8802559656782861 | 0.8450920217976642 | {'scale_pos_weight': [1, 10, 100, 1000], 'max_delta_step': [0, .5, 1, 1.5, 2]}
0.5 | 0 | 0.8707687197517684 | 0.8439434543420719 | {'scale_pos_weight': [.1, .5, .8, 1], 'max_delta_step': [0, .5, 1, 1.5, 2]}
0.6 | 0 | 0.8741375175556232 | 0.844573726011193 | {'scale_pos_weight': [.4, .5, .6, .7], 'max_delta_step': [0, 0.1, 0.2, 0.3]}

In [34]:
# Re-run the cross-validated fit after the imbalance search. The search kept
# scale_pos_weight=1 / max_delta_step=0, so the configuration is the same as
# in the post-Step-5 CV run.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.1, n_estimators=1000,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling
    subsample=0.84, colsample_bytree=0.8, colsample_bylevel=0.9,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)
santander.cv_fit_xgb_model(
    model,
    X_train, y_train,
    X_test, y_test,
    cv_nfold=5,
    early_stopping_rounds=50,
)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 72



Model Report
best n_estimators: 73
AUC Score (Train): 0.874694
AUC Score (Test) : 0.842927


## Step 7: Reducing Learning Rate

In [35]:
# Step 7: final configuration with a 10x smaller learning rate. The round
# budget (5000) and the early-stopping patience (200) are raised to match the
# slower learning schedule.
model = xgb.XGBRegressor(
    # task / runtime
    objective='binary:logistic', nthread=4, seed=santander.RANDOM_SEED,
    # boosting schedule
    learning_rate=0.01, n_estimators=5000,
    # tree shape
    max_depth=5, min_child_weight=5, gamma=0.2,
    # row / column sampling
    subsample=0.84, colsample_bytree=0.8, colsample_bylevel=0.9,
    # regularization
    reg_alpha=0, reg_lambda=1,
    # class-imbalance controls
    scale_pos_weight=1, max_delta_step=0,
)
santander.cv_fit_xgb_model(
    model,
    X_train, y_train,
    X_test, y_test,
    cv_nfold=5,
    early_stopping_rounds=200,
)

Will train until cv error hasn't decreased in 200 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 743



Model Report
best n_estimators: 744
AUC Score (Train): 0.876384
AUC Score (Test) : 0.844422


## Predict Test and Save

In [36]:
# Write three Kaggle submission files from the tuned model; every file name is
# derived from out_prefix. The statements are order-dependent because `model`
# is refit in place between the saves.
# NOTE(review): each predict() call receives df_test as-is, while the model is
# fitted on feature-column frames — confirm df_test holds exactly the feature
# columns, in the same order.
out_prefix = 'xgb.nz.fine_tuned.dedup'

# Fine-tuned
# Uses `model` in whatever state the preceding cell left it (presumably fitted
# on X_train by santander.cv_fit_xgb_model — confirm).
kg.save_submission(model.predict(df_test), 'submissions/{}.csv'.format(out_prefix))  # Score

# Retrained on all data
# Same hyper-parameters, refit on the full training frame; predict() is chained
# on the value returned by fit().
kg.save_submission(
    model.fit(
        df_train[feature_cols],
        df_train[santander.TARGET_COL]
    ).predict(df_test),
    'submissions/{}.fit_all.csv'.format(out_prefix))  # Score

# Retrained on all data w cv
# Restore the large round budget so that early stopping (200 rounds) chooses
# the number of estimators again.
model.n_estimators = 5000
# NOTE(review): X_test/y_test appear to be a held-out slice of the same
# training file as df_train. If so, the "AUC Score (Test)" printed by this call
# is computed on rows the model was trained on and is optimistically biased
# (in the recorded output it exceeds the train AUC) — confirm against
# santander.read_split.
santander.cv_fit_xgb_model(model,
                           df_train[feature_cols],
                           df_train[santander.TARGET_COL],
                           X_test, y_test,
                           cv_nfold=5, early_stopping_rounds=200)

kg.save_submission(model.predict(df_test), 'submissions/{}.fit_all_cv.csv'.format(out_prefix))  # Score

Will train until cv error hasn't decreased in 200 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 809



Model Report
best n_estimators: 810
AUC Score (Train): 0.873587
AUC Score (Test) : 0.877753


In [72]:
# Interactive help lookup (IPython `?` suffix) left over from development; it
# has no effect on the analysis above.
xgb.XGBClassifier?

In [5]:
# Interactive help lookup (IPython `?` suffix) left over from development; it
# has no effect on the analysis above.
xgb.cv?

In [6]:
# Interactive help lookup (IPython `?` suffix) left over from development; it
# has no effect on the analysis above.
xgb.DMatrix?