In [6]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit

In [4]:
# Inputs for the baseline model: the training feature table and the matching
# target table, both read from CSV files under data/.
trn_path = 'data/baseline.feature_engineer.trn.csv'
y_path = 'data/baseline.clean.y.csv'

trn = pd.read_csv(trn_path)
y = pd.read_csv(y_path)

In [7]:
print('# Cross validation..')

# --- Training budget ---------------------------------------------------
# num_round : upper bound on boosting rounds per fit.
# early_stop: patience (in rounds) handed to xgb.train's early stopping.
num_round, early_stop = 500, 10

# --- Booster parameters ------------------------------------------------
# Only max_depth is set among the model-complexity knobs. The remaining
# tuning options are left disabled and listed here for easy re-enabling:
#   complexity      : gamma=3 (lower -> more complex),
#                     min_child_weight=5 (lower -> more complex)
#   random sampling : colsample_bylevel=0.7, colsample_bytree=1, subsample=0.8
#   regularization  : reg_alpha=2, reg_lambda=3
#   step size       : learning_rate=0.03
xgb_params = dict(
    booster='gbtree',
    max_depth=2,                 # higher -> more complex
    nthread=4,
    num_class=15,
    objective='multi:softprob',
    silent=1,
    eval_metric='mlogloss',
    seed=777,
)

# ----------------------------------------------------------------------
# Stratified shuffle-split cross validation.
# Every fold holds out 10% of the rows, trains with early stopping on that
# hold-out, and records the train / validation multi-class log loss together
# with the best boosting round.
# ----------------------------------------------------------------------
trn_scores = []
vld_scores = []
best_iters = []
n_splits = 5

# Loop-invariant: convert the frames to arrays once instead of twice per fold.
x_all = np.asarray(trn)
y_all = np.asarray(y)

# xgboost's multi-class objectives expect labels encoded as 0..num_class-1.
# Handing the full label set to log_loss keeps its columns aligned with the
# num_class probability columns even when a fold happens to miss a class
# (without `labels`, log_loss infers the classes from y_true only).
class_labels = np.arange(xgb_params['num_class'])


def _predict_at_best_iteration(booster, dmat):
    """Return predictions that use only the trees up to the best round.

    With early stopping, xgb.train returns the booster from the *last* round,
    which includes the extra rounds trained while waiting for an improvement.
    Scoring with those extra trees does not match the recorded best iteration.

    Parameters
    ----------
    booster : xgb.Booster
        Booster returned by xgb.train with early stopping enabled.
    dmat : xgb.DMatrix
        Data to predict on.

    Returns
    -------
    Whatever ``booster.predict`` returns, restricted to the best iteration.
    """
    try:
        # Newer xgboost releases select trees with `iteration_range`.
        return booster.predict(
            dmat, iteration_range=(0, booster.best_iteration + 1))
    except TypeError:
        # Older releases do not know `iteration_range`; they use `ntree_limit`.
        return booster.predict(dmat, ntree_limit=booster.best_ntree_limit)


sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=777)
for i, (t_ind, v_ind) in enumerate(sss.split(trn, y)):
    print('# Iter {} / {}'.format(i+1, n_splits))
    x_trn, x_vld = x_all[t_ind], x_all[v_ind]
    y_trn, y_vld = y_all[t_ind], y_all[v_ind]

    dtrn = xgb.DMatrix(x_trn, label=y_trn)
    dvld = xgb.DMatrix(x_vld, label=y_vld)
    # Early stopping monitors the last entry of the watch list ('eval').
    watch_list = [(dtrn, 'train'), (dvld, 'eval')]

    # fit xgb
    bst = xgb.train(xgb_params, dtrn, num_round, watch_list,
                    early_stopping_rounds=early_stop, verbose_eval=True)

    # eval _ trn (scored at the best iteration, with the full label set)
    score = log_loss(y_trn, _predict_at_best_iteration(bst, dtrn),
                     labels=class_labels)
    trn_scores.append(score)

    # eval _ vld (scored at the best iteration, with the full label set)
    score = log_loss(y_vld, _predict_at_best_iteration(bst, dvld),
                     labels=class_labels)
    vld_scores.append(score)

    # best iters (0-based index of the best boosting round)
    best_iters.append(bst.best_iteration)

print('# TRN logloss: {}'.format(np.mean(trn_scores)))
print('# VLD logloss: {}'.format(np.mean(vld_scores)))
print('# Best Iters : {}'.format(np.mean(best_iters)))
# Values recorded from an earlier run, which scored each fold with the booster
# from the last round rather than the best one, so a re-run may differ slightly:
# TRN logloss : 1.8049220522722245
# VLD logloss : 1.8451546928938647
# Best Iters  : 63.6

##################################################################################################################
### Model Fit
##################################################################################################################
# Refit on every training row, using the round count learned during cross
# validation. No prediction is made in this cell despite the log message.

print('# Refit and predict on test data..')

# The CV folds trained on 90% of the rows, so the mean best iteration is scaled
# by 1 / 0.9 for the full-data fit.
# NOTE(review): best_iteration is a 0-based index, so this may be one round
# short of the tree count at the optimum - confirm whether that is intended.
num_round = int(np.mean(best_iters) / 0.9)

dtrn = xgb.DMatrix(trn, label=y)
bst = xgb.train(xgb_params, dtrn, num_boost_round=num_round, verbose_eval=False)

# Cross validation..
# Iter 1 / 5
[0]	train-mlogloss:2.39523	eval-mlogloss:2.38703
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.25494	eval-mlogloss:2.24478
[2]	train-mlogloss:2.16127	eval-mlogloss:2.15028
[3]	train-mlogloss:2.0936	eval-mlogloss:2.08212
[4]	train-mlogloss:2.04285	eval-mlogloss:2.03103
[5]	train-mlogloss:2.00399	eval-mlogloss:1.99158
[6]	train-mlogloss:1.97403	eval-mlogloss:1.96157
[7]	train-mlogloss:1.9502	eval-mlogloss:1.93736
[8]	train-mlogloss:1.93118	eval-mlogloss:1.91814
[9]	train-mlogloss:1.91563	eval-mlogloss:1.90289
[10]	train-mlogloss:1.90299	eval-mlogloss:1.89023
[11]	train-mlogloss:1.89265	eval-mlogloss:1.88013
[12]	train-mlogloss:1.8841	eval-mlogloss:1.87167
[13]	train-mlogloss:1.87699	eval-mlogloss:1.86485
[14]	train-mlogloss:1.87083	eval-mlogloss:1.85889
[15]	train-mlogloss:1.86576	eval-mlogloss:1.85399
[16]	train-mlogloss:1.86145	

[70]	train-mlogloss:1.80795	eval-mlogloss:1.82701
[71]	train-mlogloss:1.80754	eval-mlogloss:1.82707
[72]	train-mlogloss:1.80712	eval-mlogloss:1.82717
[73]	train-mlogloss:1.80675	eval-mlogloss:1.8271
[74]	train-mlogloss:1.80633	eval-mlogloss:1.8269
[75]	train-mlogloss:1.80588	eval-mlogloss:1.82695
[76]	train-mlogloss:1.80539	eval-mlogloss:1.82668
[77]	train-mlogloss:1.80498	eval-mlogloss:1.82657
[78]	train-mlogloss:1.80447	eval-mlogloss:1.82661
[79]	train-mlogloss:1.80411	eval-mlogloss:1.82675
[80]	train-mlogloss:1.80375	eval-mlogloss:1.8266
[81]	train-mlogloss:1.80339	eval-mlogloss:1.8268
[82]	train-mlogloss:1.80292	eval-mlogloss:1.82686
[83]	train-mlogloss:1.80243	eval-mlogloss:1.82682
[84]	train-mlogloss:1.80195	eval-mlogloss:1.82708
[85]	train-mlogloss:1.80155	eval-mlogloss:1.82714
[86]	train-mlogloss:1.8011	eval-mlogloss:1.82724
[87]	train-mlogloss:1.80074	eval-mlogloss:1.82715
Stopping. Best iteration:
[77]	train-mlogloss:1.80498	eval-mlogloss:1.82657

# Iter 3 / 5
[0]	train-mlogl

# Iter 5 / 5
[0]	train-mlogloss:2.39409	eval-mlogloss:2.39826
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.25318	eval-mlogloss:2.25845
[2]	train-mlogloss:2.15938	eval-mlogloss:2.16523
[3]	train-mlogloss:2.09147	eval-mlogloss:2.09791
[4]	train-mlogloss:2.04047	eval-mlogloss:2.04814
[5]	train-mlogloss:2.00145	eval-mlogloss:2.00934
[6]	train-mlogloss:1.9713	eval-mlogloss:1.97978
[7]	train-mlogloss:1.94757	eval-mlogloss:1.9566
[8]	train-mlogloss:1.92823	eval-mlogloss:1.93813
[9]	train-mlogloss:1.91269	eval-mlogloss:1.92299
[10]	train-mlogloss:1.90021	eval-mlogloss:1.91128
[11]	train-mlogloss:1.8896	eval-mlogloss:1.9012
[12]	train-mlogloss:1.88116	eval-mlogloss:1.89315
[13]	train-mlogloss:1.87395	eval-mlogloss:1.88687
[14]	train-mlogloss:1.86794	eval-mlogloss:1.88149
[15]	train-mlogloss:1.86287	eval-mlogloss:1.87672
[16]	train-mlogloss:1.85837	eval-mlogloss:1.87307


In [8]:
# Persist the refit booster. The file name records the tree depth and the
# rounded train / dev log loss from cross validation.
model_path = 'model/baseline.depth2.trn-1.80.dev-1.83.xgb'

# save_model does not create missing directories; make sure the target exists
# so a long training run is not lost at the final step.
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

bst.save_model(model_path)