In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
import sklearn
import time
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context(rc={
       "figure.figsize": (16, 10),
       "axes.titlesize": 14})

from IPython.display import Image, display
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

from os.path import expanduser
sys.path.insert(1, '{}/datsci'.format(expanduser('~')))
from datsci import eda, munge, ml
from datsci import kaggle as kg

In [69]:
import santander

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import SGDClassifier as SGDClf

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import make_scorer

import xgboost as xgb

# Following tutorial

http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [70]:
X_train, y_train, X_test, y_test, feature_cols, df_train, df_test = santander.read_split(santander.FILE_TRAIN,
                                                                                         santander.FILE_TEST)

## Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

In [9]:
def cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50, missing=missing):
    
    # Train cv
    xgb_param = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values, missing=missing)
    cv_result = xgb.cv(
        xgb_param, dtrain, num_boost_round=model.get_params()['n_estimators'], nfold=cv_nfold,
        metrics=['auc'], early_stopping_rounds=early_stopping_rounds, show_progress=False)
    best_n_estimators = cv_result.shape[0]
    model.set_params(n_estimators=best_n_estimators)
    
    # Train model
    model.fit(X_train, y_train, eval_metric='auc')
        
    # Predict training data
    y_hat_train = model.predict(X_train)

    # Predict test data
    y_hat_test = model.predict(X_test)
    
    # Print model report:
    print("\nModel Report")
    print("best n_estimators: {}".format(best_n_estimators))
    print("AUC Score (Train): %f" % roc_auc_score(y_train, y_hat_train))
    print("AUC Score (Test) : %f" % roc_auc_score(y_test,  y_hat_test))
                    
#     feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')

model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)
cv_fit_model(model, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.
Stopping. Best iteration: 57



Model Report
best n_estimators: 58
AUC Score (Train): 0.878746
AUC Score (Test) : 0.834606


## Step 2: Tune max_depth and min_child_weight

In [30]:
%%time

model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=58,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

param_grid = {
 'max_depth': [5],
 'min_child_weight': [8, 9, 10],
}

best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 112.355
CPU times: user 6min 33s, sys: 6.79 s, total: 6min 40s
Wall time: 1min 52s


In [29]:
train_score = roc_auc_score(y_train, best_model.predict(X_train))
fine_tune_results = best_model.reg_alpha, best_model.reg_lambda, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

5 | 9 | 0.83552176985969


max_depth | min_child_weight | test score | gridcv params
----------|------------------|------------|--------------
5 | 5 | 0.8357748141861523 | {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5],}
5 | 9 | 0.83552176985969 | {'max_depth': [4, 5, 6], 'min_child_weight': [8, 9, 10],}

## Step 3: Tune gamma

In [33]:
%%time

model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=58,
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

param_grid = {
  'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 187.796
CPU times: user 10min 58s, sys: 11.3 s, total: 11min 9s
Wall time: 3min 7s


In [35]:
train_score = roc_auc_score(y_train, best_model.predict(X_train))
fine_tune_results = best_model.reg_alpha, best_model.reg_lambda, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

0.3 | 0.8360898605728038


gamma | test score | gridcv params
----------|------------------|------------|--------------
0.3 | 0.8360898605728038 | {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],}

In [36]:
model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=9,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)
cv_fit_model(model1, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.



Model Report
best n_estimators: 58
AUC Score (Train): 0.878746
AUC Score (Test) : 0.834606


Final parameters

- n_estimators: 58
- max_depth: 5
- min_child_weight: 9
- gamma: 0.3

## Step 4: Tune subsample and colsample_bytree

In [43]:
%%time

model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=58,
    max_depth=5,
    min_child_weight=9,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

param_grid = {
    'subsample': [0.65, 0.7, 0.75],
    'colsample_bytree': [0.65, 0.7, 0.75],
}

best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 310.089
CPU times: user 17min 51s, sys: 20.1 s, total: 18min 12s
Wall time: 5min 10s


In [44]:
train_score = roc_auc_score(y_train, best_model.predict(X_train))
fine_tune_results = best_model.reg_alpha, best_model.reg_lambda, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

0.7 | 0.7 | 0.8364270758593435


subsample | colsample_bytree | test score | gridcv params
----------|------------------|------------|--------------
0.7 | 0.7 | 0.8364270758593435 | {'subsample': [0.6, 0.7, 0.8], 'colsample_bytree': [0.6, 0.7, 0.8],}
0.7 | 0.7 | 0.8364270758593435 | {'subsample': [0.65, 0.7, 0.75], 'colsample_bytree': [0.65, 0.7, 0.75],}

## Step 5: Tuning Regularization Parameters

In [62]:
%%time

model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=58,
    max_depth=5,
    min_child_weight=9,
    gamma=0.3,
    subsample=0.7,
    colsample_bytree=0.7,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

param_grid = {
    'reg_alpha': [0, 0.01, 0.02, 0.03, 0.05],
    'reg_lambda': [0.98, 0.99, 1, 1.01, 1.02],
}

best_score, best_model = ml.fine_tune_params(model,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=5,
                                             scorer=roc_auc_score,
                                             n_jobs=1,
                                             gscv_kwargs={'iid': False})

iteration 0
Each iteration time(secs): 847.463
CPU times: user 48min 43s, sys: 54.6 s, total: 49min 37s
Wall time: 14min 7s


In [65]:
train_score = roc_auc_score(y_train, best_model.predict(X_train))
fine_tune_results = best_model.reg_alpha, best_model.reg_lambda, train_score, best_score
print(" | ".join(["{}"] * len(fine_tune_results)).format(*fine_tune_results))

0.01 | 0.99 | 0.8681091030364465 | 0.8362743989494978


reg_alpha | reg_lambda | train score | test score | gridcv params
----------|---------|---------|------------|--------------
0 | 1 | | 0.8364270758593435 | {'reg_alpha': [1e-2, 0.1, 0, 1, 10], 'reg_lambda': [1e-2, 0.1, 0, 1, 10],}
0.01 | 0.99 | | 0.8362743989494978 | {'reg_alpha': [0, 0.01, 0.02, 0.03, 0.05], 'reg_lambda': [0.98, 0.99, 1, 1.01, 1.02],}


In [64]:
model = xgb.XGBRegressor(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=9,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    reg_alpha=0,
    reg_lambda=1,
    nthread=4,
    scale_pos_weight=1,
    seed=27
)
cv_fit_model(model1, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.



Model Report
best n_estimators: 58
AUC Score (Train): 0.878746
AUC Score (Test) : 0.834606


## Step 6: Reducing Learning Rate

In [67]:
model = xgb.XGBRegressor(
    learning_rate =0.005,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=9,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    reg_alpha=0,
    reg_lambda=1,
    nthread=4,
    scale_pos_weight=1,
    seed=27
)
cv_fit_model(model1, X_train, y_train, X_test, y_test, cv_nfold=5, early_stopping_rounds=50)

Will train until cv error hasn't decreased in 50 rounds.



Model Report
best n_estimators: 58
AUC Score (Train): 0.878746
AUC Score (Test) : 0.834606


In [72]:
xgb.XGBClassifier?

In [71]:
xgb.cv?