# Fine-tuning XGBoost models

Based on the tutorial at http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
import sklearn
import time
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context(rc={
       "figure.figsize": (16, 10),
       "axes.titlesize": 14})

from IPython.display import Image, display
from IPython.core.display import HTML
# Widen the notebook container to the full browser width.
# The HTML object has to be handed to display() explicitly: a bare expression is
# only rendered when it is the last statement of a cell, and this line is not.
display(HTML("<style>.container { width:100% !important; }</style>"))

from os.path import expanduser
sys.path.insert(1, '{}/datsci'.format(expanduser('~')))
from datsci import eda, munge, ml
from datsci import kaggle as kg

In [2]:
import santander

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import SGDClassifier as SGDClf
from sklearn.cross_validation import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer

import xgboost as xgb

In [3]:
# Load the data through the project-local santander module; read_split returns
# the seven values unpacked here.
# NOTE(review): the constant names suggest a de-duplicated, one-hot encoded
# variant of the train/test files — confirm in santander.py.
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(
    santander.FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT,
    santander.FILE_TEST_DEDUP_VAR3_DELTA_1HOT
)

## Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

In [6]:
# Seed handed to every XGBoost model in this notebook (seed=RANDOM_SEED).
RANDOM_SEED = 5

def cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=np.nan):
    """Choose n_estimators by cross-validation, refit the model, and print AUC.

    The model's current ``n_estimators`` is used as the upper bound on boosting
    rounds for ``xgb.cv``. The number of rows in the CV result then replaces
    ``n_estimators`` on ``model`` (the model is modified in place), the model is
    fitted on the training data, and AUC on the train and test data is printed.

    Parameters
    ----------
    model : object exposing get_xgb_params / get_params / set_params / fit / predict
    X_train, y_train : training features and labels (``.values`` is read from both)
    X_test, y_test : data used only for the reported "Test" AUC
    cv_nfold : number of folds passed to ``xgb.cv``
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``
    missing : value treated as missing when building the CV ``DMatrix``

    Returns
    -------
    None; results are printed.
    """
    # Cross-validate, using the model's current n_estimators as the round ceiling.
    booster_params = model.get_xgb_params()
    train_matrix = xgb.DMatrix(X_train.values, label=y_train.values, missing=missing)
    round_ceiling = model.get_params()['n_estimators']
    cv_history = xgb.cv(
        booster_params,
        train_matrix,
        num_boost_round=round_ceiling,
        nfold=cv_nfold,
        metrics=['auc'],
        early_stopping_rounds=early_stopping_rounds,
        show_progress=False,
    )

    # One row per boosting round kept by the CV run.
    best_n_estimators = cv_history.shape[0]
    model.set_params(n_estimators=best_n_estimators)

    # Refit on the training data with the selected number of rounds.
    model.fit(X_train, y_train, eval_metric='auc')

    # Score the training data first, then the test data.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Report.
    print("\nModel Report")
    print("best n_estimators: {}".format(best_n_estimators))
    print("AUC Score (Train): %f" % roc_auc_score(y_train, train_pred))
    print("AUC Score (Test) : %f" % roc_auc_score(y_test,  test_pred))

    # Feature-importance plot, left disabled:
    #     feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
    #     feat_imp.plot(kind='bar', title='Feature Importances')
    #     plt.ylabel('Feature Importance Score')

# Step 1 baseline: learning rate fixed at 0.1 with a generous n_estimators
# ceiling; cv_fit_model replaces n_estimators with the round count found by
# cross-validation.
# NOTE(review): XGBRegressor is paired with the 'binary:logistic' objective,
# presumably so predict() yields scores that roc_auc_score can rank — confirm.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 56



Model Report
best n_estimators: 57
AUC Score (Train): 0.877107
AUC Score (Test) : 0.844208


## Step 2: Tune max_depth and min_child_weight

In [10]:
%%time

# Same configuration as the Step 1 baseline, with n_estimators pinned to 57
# (the "best n_estimators" printed by the Step 1 run).
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=57,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)

# Candidate values for the two parameters tuned in this step.
param_grid = {
 'max_depth': [3, 4, 5],
 'min_child_weight': [6, 7, 8, 9],
}

# NOTE(review): ml.fine_tune_params comes from the local datsci package; it
# presumably runs a cross-validated grid search (n_cv folds, extra options in
# gscv_kwargs) and returns the best score together with the fitted model —
# confirm in datsci.ml.
best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 321.738
CPU times: user 18min 28s, sys: 20.6 s, total: 18min 49s
Wall time: 5min 21s


In [11]:
# AUC of the selected model on the training data.
train_score = roc_auc_score(y_train, best_model.predict(X_train))
# Chosen parameter values followed by both scores, printed joined by " | " in
# the same row layout as the markdown results tables in this notebook.
fine_tune_results = best_model.max_depth, best_model.min_child_weight, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

5 | 9 | 0.8664432060811502 | 0.843041822112627


FILE_TRAIN

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
5 | 5 |  | 0.8357748141861523 | {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5],}
5 | 9 |  | 0.83552176985969 | {'max_depth': [4, 5, 6], 'min_child_weight': [8, 9, 10],}


FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT

max_depth | min_child_weight | train score | test score | gridcv params
----------|------------------|-------------|------------|--------------
5 | 9 | 0.8664432060811502 | 0.843041822112627 | {'max_depth': [4, 5, 6], 'min_child_weight': [8, 9, 10],}

## Step 3: Tune gamma

In [15]:
%%time

# Carries forward max_depth=5 and min_child_weight=9 (the values printed by the
# Step 2 report) and keeps n_estimators at 57.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=57,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)

# Candidate gamma values for this step.
param_grid = {
  'gamma': [0.0, 0.02, 0.04, 0.06, 0.08],
}

# NOTE(review): ml.fine_tune_params is a datsci helper; presumably a
# cross-validated grid search returning (best score, fitted model) — confirm.
best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 160.232
CPU times: user 9min 22s, sys: 9.15 s, total: 9min 31s
Wall time: 2min 40s


In [17]:
# AUC of the selected model on the training data.
train_score = roc_auc_score(y_train, best_model.predict(X_train))
# Selected gamma followed by both scores, printed joined by " | " in the same
# row layout as the markdown results tables in this notebook.
fine_tune_results = best_model.gamma, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

0.0 | 0.8664432060811502 | 0.843041822112627


FILE_TRAIN

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.3 |  | 0.8360898605728038 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],}


FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT

gamma | train score | test score | gridcv params
------|-------------|------------|--------------
0.0 | 0.8664432060811502 | 0.843041822112627 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],}

In [19]:
# Re-run the CV-based n_estimators search with the parameters chosen in
# Steps 2-3 (max_depth=5, min_child_weight=9, gamma=0); the ceiling is reset
# to 1000 so cv_fit_model can pick a new round count.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 75



Model Report
best n_estimators: 76
AUC Score (Train): 0.870831
AUC Score (Test) : 0.844556


Parameters selected so far (after Steps 1–3)

- n_estimators: 76
- max_depth: 5
- min_child_weight: 9
- gamma: 0.0

## Step 4: Tune subsample and colsample_bytree

In [23]:
%%time

# n_estimators=76 is the "best n_estimators" printed by the preceding
# cv_fit_model run; the tree parameters are those chosen in Steps 2-3.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=76,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)

# Candidate row- and column-sampling fractions for this step.
param_grid = {
    'subsample': [0.75, 0.8, 0.85],
    'colsample_bytree': [0.65, 0.7, 0.75],
}

# NOTE(review): ml.fine_tune_params is a datsci helper; presumably a
# cross-validated grid search returning (best score, fitted model) — confirm.
best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 333.234
CPU times: user 19min 45s, sys: 17.6 s, total: 20min 3s
Wall time: 5min 33s


In [25]:
# AUC of the selected model on the training data.
train_score = roc_auc_score(y_train, best_model.predict(X_train))
# Selected sampling fractions followed by both scores, printed joined by " | "
# in the same row layout as the markdown results tables in this notebook.
fine_tune_results = best_model.subsample, best_model.colsample_bytree, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

0.8 | 0.7 | 0.8720600990462103 | 0.8450419887931837


FILE_TRAIN

subsample | colsample_bytree | train score | test score | gridcv params
----------|------------------|-------------|------------|---------------
0.7 | 0.7 |  | 0.8364270758593435 | {'subsample': [0.6, 0.7, 0.8], 'colsample_bytree': [0.6, 0.7, 0.8],}
0.7 | 0.7 |  | 0.8364270758593435 | {'subsample': [0.65, 0.7, 0.75], 'colsample_bytree': [0.65, 0.7, 0.75],}


FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT

subsample | colsample_bytree | train score | test score | gridcv params
----------|------------------|-------------|------------|---------------
0.8 | 0.7 | 0.8720600990462103 | 0.8450419887931837 | {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9],}

## Step 5: Tuning Regularization Parameters

In [26]:
%%time

# Carries forward subsample=0.8 and colsample_bytree=0.7 (the values printed by
# the Step 4 report) on top of the earlier choices.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=76,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)

# Coarse 5 x 5 grid over the two regularization parameters.
param_grid = {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10],}

# NOTE(review): ml.fine_tune_params is a datsci helper; presumably a
# cross-validated grid search returning (best score, fitted model) — confirm.
best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 917.254
CPU times: user 54min 28s, sys: 48.4 s, total: 55min 16s
Wall time: 15min 17s


In [27]:
# AUC of the selected model on the training data.
train_score = roc_auc_score(y_train, best_model.predict(X_train))
# Selected regularization values followed by both scores, printed joined by
# " | " in the same row layout as the markdown results tables in this notebook.
fine_tune_results = best_model.reg_alpha, best_model.reg_lambda, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

0 | 1 | 0.8720600990462103 | 0.8450419887931837


FILE_TRAIN

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | | 0.8364270758593435 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10],}
0.01 | 0.99 | | 0.8362743989494978 | {'reg_alpha': [0, 0.01, 0.02, 0.03, 0.05], 'reg_lambda': [0.98, 0.99, 1, 1.01, 1.02],}


FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT

reg_alpha | reg_lambda | train score | test score | gridcv params
----------|------------|-------------|------------|--------------
0 | 1 | 0.8720600990462103 | 0.8450419887931837 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10],}

In [29]:
# Re-run the CV-based n_estimators search with all parameters chosen so far,
# now including reg_alpha=0 and reg_lambda=1 from the Step 5 report; the
# ceiling is reset to 1000 so cv_fit_model can pick a new round count.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    reg_alpha=0,
    reg_lambda=1,
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 74



Model Report
best n_estimators: 75
AUC Score (Train): 0.871799
AUC Score (Test) : 0.845032


## Step 6: Reducing Learning Rate

In [31]:
# Same tuned parameters, but with the learning rate lowered from 0.1 to 0.01
# and the n_estimators ceiling raised to 5000; cv_fit_model again selects the
# actual round count by cross-validation.
model = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    reg_alpha=0,
    reg_lambda=1,
    nthread=4,
    scale_pos_weight=1,
    seed=RANDOM_SEED
)
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 733



Model Report
best n_estimators: 734
AUC Score (Train): 0.871176
AUC Score (Test) : 0.845529


## Predict Test and Save

In [33]:
# FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT
# Predict df_test with the model as last fitted above and write the submission file.
# NOTE(review): the trailing "Score" is presumably the leaderboard result for
# this submission — confirm.
kg.save_submission(model.predict(df_test), 'submissions/xgb.fine_tuned.1hot.csv')  # Score 0.837155

In [34]:
# FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT trained on all data
# Refit the same model object on every row of df_train (feature columns only),
# then predict df_test and write the submission file. model.fit() is chained
# directly into predict(), so `model` itself is left fitted on all data.
kg.save_submission(
    model.fit(
        df_train[feature_cols],
        df_train[santander.TARGET_COL]
    ).predict(df_test),
    'submissions/xgb.fine_tuned.1hot.fit_all.csv')  # Score 0.838335

In [39]:
# FILE_TRAIN_DEDUP_VAR3_DELTA_1HOT trained on all data w cv
# Raise the round ceiling, then let cv_fit_model pick n_estimators by
# cross-validation on all of df_train and refit on it.
model.n_estimators = 2000
# NOTE(review): X_test/y_test look like a hold-out taken from df_train; if so,
# the "Test" AUC printed by this call is measured on rows the model was fitted
# on and is not a clean estimate — confirm against santander.read_split.
cv_fit_model(model,
             df_train[feature_cols],
             df_train[santander.TARGET_COL],
             X_test, y_test,
             cv_nfold=5, early_stopping_rounds=50)

kg.save_submission(model.predict(df_test), 'submissions/xgb.fine_tuned.1hot.fit_all_cv.csv')  # Score 0.838332

Will train until cv error hasn't decreased in 50 rounds.
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Stopping. Best iteration: 756



Model Report
best n_estimators: 757
AUC Score (Train): 0.869186
AUC Score (Test) : 0.873916


In [72]:
# Leftover interactive help lookup (IPython `?` shows the docstring); it is not
# part of the analysis and can be dropped from a finished notebook.
xgb.XGBClassifier?

In [71]:
# Leftover interactive help lookup (IPython `?` shows the docstring); it is not
# part of the analysis and can be dropped from a finished notebook.
xgb.cv?