## ML Fit
- train / dev / test split
- ML fit

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss

In [None]:
# Load the cached feature-engineering outputs from CSV.
# `trn` is used as the feature matrix and `trg` as the labels (`label=`) when
# the xgboost DMatrix objects are built further down.
trn = pd.read_csv('data/adv.feature_engineer.trn.csv')
trg = pd.read_csv('data/adv.feature_engineer.y.csv')

print('=' * 50)
print('# Cross validation..')

# XGB Model Param
num_round = 500   # upper bound on boosting rounds for each cross-validation fit
early_stop = 50   # passed to xgb.train as early_stopping_rounds
xgb_params = {
    'booster': 'gbtree',             # tree-based booster
    'gamma': 1,                      # minimum loss reduction required to make a split
    'learning_rate': 0.1,            # shrinkage applied to each boosting step
    'max_depth': 4,                  # maximum depth of each tree
    'min_child_weight': 3,           # minimum sum of instance weight needed in a child
    'nthread': 4,                    # number of threads used by xgboost
    'num_class': 15,                 # number of target classes for the multi-class objective
    'objective': 'multi:softprob',   # predict() yields one probability per class per row
    'silent': 1,                     # NOTE(review): newer xgboost replaces this with 'verbosity' - confirm for the installed version
    'eval_metric': 'mlogloss',       # multi-class log-loss, the metric early stopping watches
    'seed': 777,                     # random seed for reproducibility
    }

# Repeated stratified hold-out validation: every split trains on 95% of the
# rows with early stopping monitored on the remaining 5%, then records the
# train / validation log-loss and the best boosting iteration of that fit.
trn_scores = []
vld_scores = []
best_iters = []
n_splits = 2
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.05, random_state=777)

# Convert the frames to arrays once; the conversion does not depend on the split.
trn_arr = np.asarray(trn)
trg_arr = np.asarray(trg)

# Score against the full label set. Without `labels`, log_loss infers the
# classes from y_true and raises when a small validation fold happens to miss
# a rare class while the prediction still has `num_class` probability columns.
class_labels = np.arange(xgb_params['num_class'])

for i, (t_ind, v_ind) in enumerate(sss.split(trn, trg)):
    print('# Iter {} / {}'.format(i+1, n_splits))
    x_trn, x_vld = trn_arr[t_ind], trn_arr[v_ind]
    y_trn, y_vld = trg_arr[t_ind], trg_arr[v_ind]

    dtrn = xgb.DMatrix(x_trn, label=y_trn)
    dvld = xgb.DMatrix(x_vld, label=y_vld)
    # xgboost uses the last entry of the watch list ('eval') for early stopping.
    watch_list = [(dtrn, 'train'), (dvld, 'eval')]

    # fit xgb; `evals` is given by keyword because newer xgboost releases
    # deprecate passing it positionally.
    bst = xgb.train(xgb_params, dtrn, num_round, evals=watch_list,
                    early_stopping_rounds=early_stop, verbose_eval=True)

    # NOTE(review): with the native Booster API, predict() may score with every
    # tree that was trained (including the rounds after the best iteration)
    # rather than only the trees up to bst.best_iteration - confirm for the
    # installed xgboost version before comparing these scores with the
    # early-stopping log.

    # eval _ trn
    score = log_loss(y_trn, bst.predict(dtrn), labels=class_labels)
    trn_scores.append(score)

    # eval _ vld
    score = log_loss(y_vld, bst.predict(dvld), labels=class_labels)
    vld_scores.append(score)

    # best iters
    best_iters.append(bst.best_iteration)

# Averages over all splits.
print('# TRN logloss: {}'.format(np.mean(trn_scores)))
print('# VLD logloss: {}'.format(np.mean(vld_scores)))
print('# Best Iters : {}'.format(np.mean(best_iters)))

# ------------------------------------------------------------------
# Final model: refit on the complete training set
# ------------------------------------------------------------------

print('=' * 50)
print('# Refit and predict on test data..')

# Round budget for the refit: the best iteration averaged over the
# cross-validation splits, scaled up by 1 / 0.9 and truncated to an int.
# NOTE(review): presumably the scaling compensates for the refit seeing more
# rows than each CV fit - confirm the intended factor.
mean_best_iter = np.mean(best_iters)
num_round = int(mean_best_iter / 0.9)

# No validation set here, so there is no early stopping: the booster is
# trained for exactly `num_round` rounds on all rows.
dtrn = xgb.DMatrix(data=trn, label=trg)
bst = xgb.train(params=xgb_params, dtrain=dtrn, num_boost_round=num_round,
                verbose_eval=False)

In [None]:
# Persist the refit booster to disk.
# The output directory is created first so that a missing 'model/' folder does
# not make the save fail after the (long) training run has already finished.
import os

if not os.path.isdir('model'):
    os.makedirs('model')

# NOTE(review): the scores embedded in the file name are hard-coded; they are
# not derived from the log-loss values computed during cross validation.
bst.save_model('model/adv.trn-0.85.dev-1.83.xgb')