In [3]:
# required to get access to model_stacking package
import os
os.chdir('../..')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
from framework.model_stacking import getConfigParameters
import pickle

retrieving configuration file: config.yml from current working directory


In [5]:
CONFIG = getConfigParameters()
ROOT_DIR = CONFIG['ROOT_DIR']
DATA_DIR = CONFIG['DATA_DIR']
ID_VAR = CONFIG['ID_VAR']
TARGET_VAR = CONFIG['TARGET_VAR']

In [6]:
from xgboost import XGBRegressor as ThisModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import randint, uniform

## Get Training Data

In [7]:
# get training data
train_df = pd.read_csv(os.path.join(ROOT_DIR,DATA_DIR,'KFS01','train.csv.gz'))

In [8]:
X_train = train_df.drop(ID_VAR + [TARGET_VAR],axis=1)
y_train = train_df.loc[:,TARGET_VAR]

In [9]:
X_train.shape

(4459, 4730)

In [10]:
y_train.shape

(4459,)

In [11]:
y_train[:10]

0    17.453097
1    13.304687
2    16.118096
3    14.508658
4    16.482739
5    14.845130
6    12.007628
7    13.304687
8    13.794288
9    13.038984
Name: target, dtype: float64

## Setup pipeline for hyper-parameter tuning

In [16]:
# set up pipeline
pipe = Pipeline([('this_model',ThisModel(n_jobs=6))])

In [17]:
def kag_rmsle(y,y_hat):
    return np.sqrt(mean_squared_error(y,y_hat))

this_scorer = make_scorer(kag_rmsle, greater_is_better=False)

In [20]:
param_grid = dict(this_model__n_estimators=randint(100,500),
                  this_model__colsample_bytree=uniform(0.6,0.4),
                  this_model__colsample_bylevel=uniform(0.6,0.4),
                 this_model__max_depth=randint(3,10))
grid_search = RandomizedSearchCV(pipe, param_distributions=param_grid,scoring=this_scorer,cv=5,
                                 n_iter=20,
                                 verbose=2,
                                 n_jobs=1,
                                  refit=False)

In [21]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433 
[CV]  this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433, total= 6.3min
[CV] this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.3min remaining:    0.0s


[CV]  this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433, total= 6.3min
[CV] this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433 
[CV]  this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433, total= 6.2min
[CV] this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433 
[CV]  this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433, total= 6.3min
[CV] this_model__colsample_bylevel=0.944570835134, this_model__colsample_bytree=0.62637706151, this_model__max_depth=7, this_model__n_estimators=433 
[CV]  this_model__colsample_bylevel=0.944570835134, thi

[CV]  this_model__colsample_bylevel=0.845969733205, this_model__colsample_bytree=0.610417090902, this_model__max_depth=3, this_model__n_estimators=145, total=  56.7s
[CV] this_model__colsample_bylevel=0.845969733205, this_model__colsample_bytree=0.610417090902, this_model__max_depth=3, this_model__n_estimators=145 
[CV]  this_model__colsample_bylevel=0.845969733205, this_model__colsample_bytree=0.610417090902, this_model__max_depth=3, this_model__n_estimators=145, total=  55.4s
[CV] this_model__colsample_bylevel=0.845969733205, this_model__colsample_bytree=0.610417090902, this_model__max_depth=3, this_model__n_estimators=145 
[CV]  this_model__colsample_bylevel=0.845969733205, this_model__colsample_bytree=0.610417090902, this_model__max_depth=3, this_model__n_estimators=145, total=  53.7s
[CV] this_model__colsample_bylevel=0.959128187437, this_model__colsample_bytree=0.793159557304, this_model__max_depth=3, this_model__n_estimators=114 
[CV]  this_model__colsample_bylevel=0.95912818743

[CV]  this_model__colsample_bylevel=0.983632514601, this_model__colsample_bytree=0.758808180868, this_model__max_depth=7, this_model__n_estimators=179, total= 2.9min
[CV] this_model__colsample_bylevel=0.983632514601, this_model__colsample_bytree=0.758808180868, this_model__max_depth=7, this_model__n_estimators=179 
[CV]  this_model__colsample_bylevel=0.983632514601, this_model__colsample_bytree=0.758808180868, this_model__max_depth=7, this_model__n_estimators=179, total= 2.7min
[CV] this_model__colsample_bylevel=0.807459250911, this_model__colsample_bytree=0.909648330223, this_model__max_depth=4, this_model__n_estimators=437 
[CV]  this_model__colsample_bylevel=0.807459250911, this_model__colsample_bytree=0.909648330223, this_model__max_depth=4, this_model__n_estimators=437, total= 3.7min
[CV] this_model__colsample_bylevel=0.807459250911, this_model__colsample_bytree=0.909648330223, this_model__max_depth=4, this_model__n_estimators=437 
[CV]  this_model__colsample_bylevel=0.80745925091

[CV]  this_model__colsample_bylevel=0.925967599102, this_model__colsample_bytree=0.637004486682, this_model__max_depth=6, this_model__n_estimators=312, total= 4.0min
[CV] this_model__colsample_bylevel=0.776277246403, this_model__colsample_bytree=0.900485215273, this_model__max_depth=4, this_model__n_estimators=281 
[CV]  this_model__colsample_bylevel=0.776277246403, this_model__colsample_bytree=0.900485215273, this_model__max_depth=4, this_model__n_estimators=281, total= 2.4min
[CV] this_model__colsample_bylevel=0.776277246403, this_model__colsample_bytree=0.900485215273, this_model__max_depth=4, this_model__n_estimators=281 
[CV]  this_model__colsample_bylevel=0.776277246403, this_model__colsample_bytree=0.900485215273, this_model__max_depth=4, this_model__n_estimators=281, total= 2.4min
[CV] this_model__colsample_bylevel=0.776277246403, this_model__colsample_bytree=0.900485215273, this_model__max_depth=4, this_model__n_estimators=281 
[CV]  this_model__colsample_bylevel=0.77627724640

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 396.2min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('this_model', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=6, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
          fit_params={}, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'this_model__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f81ee10>, 'this_model__colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f82e080>, 'this_model__colsample_bylevel': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f82e668>, 'this_model__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10f82e8d0>},
          pre_dispatch='2*n_jobs', 

In [22]:
grid_search.best_params_

{'this_model__colsample_bylevel': 0.7720117072989221,
 'this_model__colsample_bytree': 0.9381525821154606,
 'this_model__max_depth': 7,
 'this_model__n_estimators': 283}

In [23]:
grid_search.best_score_

-1.4793025306974654

In [24]:
pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_this_model__colsample_bylevel,param_this_model__colsample_bytree,param_this_model__max_depth,param_this_model__n_estimators,params,rank_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
1,253.662403,0.113333,-1.479303,-0.798423,0.772012,0.938153,7,283,{'this_model__colsample_bylevel': 0.7720117072...,1,...,-1.419588,-0.812352,-1.45636,-0.793297,-1.552818,-0.787922,1.377626,0.006621,0.050483,0.00837
7,139.848399,0.126982,-1.480402,-0.891976,0.928744,0.873204,8,134,{'this_model__colsample_bylevel': 0.9287436923...,2,...,-1.409529,-0.907508,-1.445914,-0.89887,-1.56664,-0.887022,2.804241,0.012997,0.056615,0.00967
10,165.105571,0.119043,-1.483389,-0.894094,0.983633,0.758808,7,179,{'this_model__colsample_bylevel': 0.9836325146...,3,...,-1.422486,-0.905678,-1.461773,-0.899576,-1.570875,-0.891476,3.881372,0.010008,0.055647,0.007712
15,234.363294,0.121983,-1.483604,-0.848415,0.925968,0.637004,6,312,{'this_model__colsample_bylevel': 0.9259675991...,4,...,-1.408223,-0.865613,-1.458586,-0.850493,-1.572385,-0.83963,2.950153,0.005641,0.058737,0.009904
9,346.587832,0.117835,-1.485196,-0.697435,0.899408,0.988449,8,323,{'this_model__colsample_bylevel': 0.8994077388...,5,...,-1.409079,-0.710037,-1.459825,-0.698722,-1.552473,-0.689359,4.426426,0.004959,0.062296,0.007023
18,245.999105,0.120372,-1.488225,-0.734639,0.717789,0.988609,9,217,{'this_model__colsample_bylevel': 0.7177888401...,6,...,-1.408172,-0.744494,-1.456517,-0.73253,-1.560537,-0.713223,0.596058,0.005967,0.057645,0.011435
3,111.282358,0.123494,-1.489835,-1.102282,0.724275,0.836508,4,217,{'this_model__colsample_bylevel': 0.7242749220...,7,...,-1.422289,-1.120841,-1.453841,-1.1068,-1.581598,-1.090134,0.481526,0.006134,0.056143,0.010697
16,145.505184,0.127608,-1.491408,-1.050969,0.776277,0.900485,4,281,{'this_model__colsample_bylevel': 0.7762772464...,8,...,-1.419797,-1.072106,-1.477214,-1.056338,-1.573954,-1.03952,1.08303,0.009235,0.055229,0.012006
0,376.38505,0.128946,-1.491469,-0.695062,0.944571,0.626377,7,433,{'this_model__colsample_bylevel': 0.9445708351...,9,...,-1.420051,-0.70941,-1.46478,-0.700762,-1.562672,-0.676413,2.666069,0.01618,0.056952,0.010877
8,345.860662,0.11931,-1.494331,-0.697195,0.679918,0.670678,8,342,{'this_model__colsample_bylevel': 0.6799184995...,10,...,-1.421186,-0.697912,-1.46189,-0.698834,-1.567048,-0.677587,5.078798,0.003972,0.066666,0.01106


In [25]:
df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

In [27]:
hyper_parameters = dict(FeatureSet='KFS01',cv_run=df)

In [28]:
with open(os.path.join(CONFIG['ROOT_DIR'],'eda','hyper-parameter_tuning','xgboost_df_hyp.pkl'),'wb') as f:
    pickle.dump(hyper_parameters,f)

In [29]:
with open(os.path.join(CONFIG['ROOT_DIR'],'eda','hyper-parameter_tuning','xgboost_df_hyp.pkl'),'rb') as f:
    z=pickle.load(f)


In [31]:
z['cv_run']

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_this_model__colsample_bylevel,param_this_model__colsample_bytree,param_this_model__max_depth,param_this_model__n_estimators,params,rank_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
1,253.662403,0.113333,-1.479303,-0.798423,0.772012,0.938153,7,283,{'this_model__colsample_bylevel': 0.7720117072...,1,...,-1.419588,-0.812352,-1.45636,-0.793297,-1.552818,-0.787922,1.377626,0.006621,0.050483,0.00837
7,139.848399,0.126982,-1.480402,-0.891976,0.928744,0.873204,8,134,{'this_model__colsample_bylevel': 0.9287436923...,2,...,-1.409529,-0.907508,-1.445914,-0.89887,-1.56664,-0.887022,2.804241,0.012997,0.056615,0.00967
10,165.105571,0.119043,-1.483389,-0.894094,0.983633,0.758808,7,179,{'this_model__colsample_bylevel': 0.9836325146...,3,...,-1.422486,-0.905678,-1.461773,-0.899576,-1.570875,-0.891476,3.881372,0.010008,0.055647,0.007712
15,234.363294,0.121983,-1.483604,-0.848415,0.925968,0.637004,6,312,{'this_model__colsample_bylevel': 0.9259675991...,4,...,-1.408223,-0.865613,-1.458586,-0.850493,-1.572385,-0.83963,2.950153,0.005641,0.058737,0.009904
9,346.587832,0.117835,-1.485196,-0.697435,0.899408,0.988449,8,323,{'this_model__colsample_bylevel': 0.8994077388...,5,...,-1.409079,-0.710037,-1.459825,-0.698722,-1.552473,-0.689359,4.426426,0.004959,0.062296,0.007023
18,245.999105,0.120372,-1.488225,-0.734639,0.717789,0.988609,9,217,{'this_model__colsample_bylevel': 0.7177888401...,6,...,-1.408172,-0.744494,-1.456517,-0.73253,-1.560537,-0.713223,0.596058,0.005967,0.057645,0.011435
3,111.282358,0.123494,-1.489835,-1.102282,0.724275,0.836508,4,217,{'this_model__colsample_bylevel': 0.7242749220...,7,...,-1.422289,-1.120841,-1.453841,-1.1068,-1.581598,-1.090134,0.481526,0.006134,0.056143,0.010697
16,145.505184,0.127608,-1.491408,-1.050969,0.776277,0.900485,4,281,{'this_model__colsample_bylevel': 0.7762772464...,8,...,-1.419797,-1.072106,-1.477214,-1.056338,-1.573954,-1.03952,1.08303,0.009235,0.055229,0.012006
0,376.38505,0.128946,-1.491469,-0.695062,0.944571,0.626377,7,433,{'this_model__colsample_bylevel': 0.9445708351...,9,...,-1.420051,-0.70941,-1.46478,-0.700762,-1.562672,-0.676413,2.666069,0.01618,0.056952,0.010877
8,345.860662,0.11931,-1.494331,-0.697195,0.679918,0.670678,8,342,{'this_model__colsample_bylevel': 0.6799184995...,10,...,-1.421186,-0.697912,-1.46189,-0.698834,-1.567048,-0.677587,5.078798,0.003972,0.066666,0.01106
