In [39]:
import pickle


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

from ml_toolbox.kaggle import KaggleResult
from shared_functions_redhat import load_test_train, load_leaked_predictions_as_validation_set, merge_prediction_with_leak_data

In [3]:
# Output location handed to KaggleResult for every model trained in this notebook.
dir_out = 'model_xgboost_v1'

In [4]:
# Feature files to load: three filtered groups, two standalone categorical
# groups, then the activity-level char_1..char_9 and the people-level
# p_char_2..p_char_9 files.
feature_files = (
    ['numerical', 'bool', 'dates', 'activity_category', 'p_group_1']
    + list(map('char_{}'.format, range(1, 10)))
    + list(map('p_char_{}'.format, range(2, 10)))
)

# Column whitelist per feature file; files without an entry here are not filtered
# by this dict.
feat_filter = {
    'numerical': ['p_char_38', 'unique_act_cat_group'],
    'dates': [
        'months_since_pdate',
        'months_since_first_activity_group',
        'months_since_first_activity_people_id',
        'range_date_group',
    ],
    # Boolean people characteristics p_char_10 .. p_char_37.
    'bool': list(map('p_char_{}'.format, range(10, 38))),
}

In [6]:
# Pickled cross-validation definition; the cell that opens it reads the
# 'folds', 'y' and 'seed' entries.
cv_set_kfold = './data_ori/cv_StratifiedKFold_V4.pickle'

### Load data

In [5]:
# Build the train and test feature matrices from the listed feature files.
# Per the helper's log output, files with a feat_filter entry are loaded with
# only the named columns; the others are loaded whole.
Xtrain, Xtest = load_test_train(feature_files, feat_filter)

Loaded ['p_char_38' 'unique_act_cat_group'] from numerical.
Loaded ['p_char_10' 'p_char_11' 'p_char_12' 'p_char_13' 'p_char_14' 'p_char_15'
 'p_char_16' 'p_char_17' 'p_char_18' 'p_char_19' 'p_char_20' 'p_char_21'
 'p_char_22' 'p_char_23' 'p_char_24' 'p_char_25' 'p_char_26' 'p_char_27'
 'p_char_28' 'p_char_29' 'p_char_30' 'p_char_31' 'p_char_32' 'p_char_33'
 'p_char_34' 'p_char_35' 'p_char_36' 'p_char_37'] from bool.
Loaded ['months_since_pdate' 'months_since_first_activity_group'
 'months_since_first_activity_people_id' 'range_date_group'] from dates.
Loaded activity_category.
Loaded p_group_1.
Loaded char_1.
Loaded char_2.
Loaded char_3.
Loaded char_4.
Loaded char_5.
Loaded char_6.
Loaded char_7.
Loaded char_8.
Loaded char_9.
Loaded p_char_2.
Loaded p_char_3.
Loaded p_char_4.
Loaded p_char_5.
Loaded p_char_6.
Loaded p_char_7.
Loaded p_char_8.
Loaded p_char_9.
Loaded ['p_char_38' 'unique_act_cat_group'] from numerical.
Loaded ['p_char_10' 'p_char_11' 'p_char_12' 'p_char_13' 'p_char_14'

In [19]:
# Derive a labelled validation set from the test matrix; the helper's log shows
# it reading ./data_ori/cv_test_V3.csv.
# NOTE(review): presumably Xval holds the Xtest rows whose outcome is known
# from the leak and yval their outcomes - confirm against the helper.
Xval, yval = load_leaked_predictions_as_validation_set(Xtest)

Loading samples from ./data_ori/cv_test_V3.csv


In [20]:
# Sanity check: report the dimensions of each matrix, one line per matrix.
for matrix_name, matrix in (('Xtrain', Xtrain), ('Xtest', Xtest), ('Xval', Xval)):
    print('{} shape: {}'.format(matrix_name, matrix.shape))

Xtrain shape: (2197291, 14145)
Xtest shape: (498687, 14145)
Xval shape: (384061, 14145)


In [7]:
# Load the pre-computed CV definition: fold indices, training labels and the
# seed the folds were generated with.
# NOTE: pickle.load executes arbitrary code embedded in the file - only open
# pickles produced by this project.
with open(cv_set_kfold, 'rb') as f:
    fold_data = pickle.load(f)
kfolds = fold_data['folds']
y = fold_data['y']
# Function-call form prints the same text on Python 2 and also parses on
# Python 3, matching the print(...) calls used elsewhere in this notebook.
print(fold_data['seed'])

1980


In [21]:
# Wrap the feature matrices in xgboost's DMatrix container.
# Training labels come from the CV pickle (`y`); validation labels are the
# `yval` returned together with `Xval` (the previous `y_val` spelling was never
# defined and raised NameError on a fresh kernel).
d_train = xgb.DMatrix(Xtrain, label=y)
d_test = xgb.DMatrix(Xtest)
d_val = xgb.DMatrix(Xval, label=yval)

### Train bag of XGBoost models

In [26]:
# Seed NumPy's global RNG before drawing so a Restart & Run All reproduces the
# same bag; the np.random.choice draws in the training loop use the same
# global stream and become reproducible as well.
BAG_SEED = 1712
np.random.seed(BAG_SEED)

# One xgboost seed per bagged model: 10 integers drawn from [1, 99999).
seeds = np.random.randint(1, 99999, 10)

In [None]:
# Base configuration for every model in the bag. 'num_round' and
# 'early_stopping' are consumed below as num_boost_round /
# early_stopping_rounds; they live in this dict so they are recorded with the
# submission description.
params = {'max_depth': 11, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'subsample': 0.86,
          'colsample_bytree': 0.92,
          'colsample_bylevel': 0.9,
          'min_child_weight': 0,
          'gamma': 0.005,
          'booster': 'gbtree',
          'seed': 1712,  # default only; replaced by each bag seed in the loop
          'num_round': 550,
          'early_stopping': 20}

# Per the training log, the last entry ('leak_eval') is the one used for
# early stopping.
watchlist = [(d_train, 'train'), (d_val, 'leak_eval')]

for s in seeds:

    # Per-model randomisation. Values are cast to plain Python scalars so the
    # stored parameter snapshot does not carry numpy types.
    params['seed'] = int(s)
    params['max_depth'] = int(np.random.choice([10, 11, 12]))
    params['subsample'] = float(np.random.choice([0.8, 0.85, 0.9]))

    eval_result = {}

    bst = xgb.train(params,
                    d_train,
                    num_boost_round=params['num_round'],
                    evals=watchlist,
                    evals_result=eval_result,
                    early_stopping_rounds=params['early_stopping'],
                    verbose_eval=20)

    # Predict with the trees up to the best early-stopping iteration.
    y_pred_test = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)

    # Report the validation AUC of that same best iteration. The last logged
    # value belongs to the model after the early-stopping patience window,
    # which is not the model used for the predictions above.
    cv_score = eval_result['leak_eval']['auc'][bst.best_iteration]

    description = {'description': 'XGBoost gbtree model V0 - seed {}'.format(s),
                   # Snapshot: `params` is mutated on every iteration, so a
                   # shared reference would let later models rewrite what was
                   # recorded for this one.
                   'params': dict(params),
                   'feature_files': feature_files,
                   'feature_filter': feat_filter,
                   'train': 'all',
                   'val': 'cv_test_V3.csv'
                   }

    # Combine the model predictions with the leak data before building the result.
    y_pred_leak = merge_prediction_with_leak_data(y_pred_test)

    kag = KaggleResult(y_pred_leak[['activity_id', 'outcome']], None, cv_score, description, dir_out)

    print('Seed: {} | max_depth: {} | subsample: {} | cv_score {}'.format(s,
                                                                          params['max_depth'],
                                                                          params['subsample'],
                                                                          cv_score))


[0]	train-auc:0.947327	leak_eval-auc:0.964024
Multiple eval metrics have been passed: 'leak_eval-auc' will be used for early stopping.

Will train until leak_eval-auc hasn't improved in 20 rounds.
