In [1]:
# required to get access to model_stacking package
import os
os.chdir('../..')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
from framework.model_stacking import getConfigParameters
import pickle

retrieving configuration file: config.yml from current working directory


In [3]:
CONFIG = getConfigParameters()
ROOT_DIR = CONFIG['ROOT_DIR']
DATA_DIR = CONFIG['DATA_DIR']
ID_VAR = CONFIG['ID_VAR']
TARGET_VAR = CONFIG['TARGET_VAR']

In [23]:
from sklearn.ensemble import RandomForestRegressor as ThisModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import randint

## Get Training Data

In [6]:
# get training data
train_df = pd.read_csv(os.path.join(ROOT_DIR,DATA_DIR,'KFS01','train.csv.gz'))

In [7]:
X_train = train_df.drop(ID_VAR + [TARGET_VAR],axis=1)
y_train = train_df.loc[:,TARGET_VAR]

In [8]:
X_train.shape

(4459, 4730)

In [9]:
y_train.shape

(4459,)

In [10]:
y_train[:10]

0    17.453097
1    13.304687
2    16.118096
3    14.508658
4    16.482739
5    14.845130
6    12.007628
7    13.304687
8    13.794288
9    13.038984
Name: target, dtype: float64

## Setup pipeline for hyper-parameter tuning

In [24]:
# set up pipeline
pipe = Pipeline([('this_model',ThisModel(n_jobs=-1))])

In [25]:
def kag_rmsle(y,y_hat):
    return np.sqrt(mean_squared_error(y,y_hat))

this_scorer = make_scorer(kag_rmsle, greater_is_better=False)

In [27]:
param_grid = dict(this_model__n_estimators=randint(50,500),
                 this_model__max_depth=randint(3,10))
grid_search = RandomizedSearchCV(pipe, param_distributions=param_grid,scoring=this_scorer,cv=5,
                                 n_iter=10,
                                 verbose=1,
                                 n_jobs=1,
                                  refit=False)

In [28]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] this_model__max_depth=3, this_model__n_estimators=456 ...........
[CV]  this_model__max_depth=3, this_model__n_estimators=456, total=  14.5s
[CV] this_model__max_depth=3, this_model__n_estimators=456 ...........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.7s remaining:    0.0s


[CV]  this_model__max_depth=3, this_model__n_estimators=456, total=  15.5s
[CV] this_model__max_depth=3, this_model__n_estimators=456 ...........
[CV]  this_model__max_depth=3, this_model__n_estimators=456, total=  16.1s
[CV] this_model__max_depth=3, this_model__n_estimators=456 ...........
[CV]  this_model__max_depth=3, this_model__n_estimators=456, total=  16.6s
[CV] this_model__max_depth=3, this_model__n_estimators=456 ...........
[CV]  this_model__max_depth=3, this_model__n_estimators=456, total=  16.6s
[CV] this_model__max_depth=9, this_model__n_estimators=357 ...........
[CV]  this_model__max_depth=9, this_model__n_estimators=357, total=  39.7s
[CV] this_model__max_depth=9, this_model__n_estimators=357 ...........
[CV]  this_model__max_depth=9, this_model__n_estimators=357, total=  41.7s
[CV] this_model__max_depth=9, this_model__n_estimators=357 ...........
[CV]  this_model__max_depth=9, this_model__n_estimators=357, total=  41.2s
[CV] this_model__max_depth=9, this_model__n_estim

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 17.7min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('this_model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'this_model__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x112967b38>, 'this_model__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10644f2b0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=False,
          return_train_score=True,
          scoring=make_scorer(kag_rmsle, greater_is_better=False),
          verbose=2)

In [29]:
grid_search.best_params_

{'this_model__max_depth': 9, 'this_model__n_estimators': 357}

In [30]:
grid_search.best_score_

-1.5102115780948988

In [33]:
pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_this_model__max_depth,param_this_model__n_estimators,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
1,40.336227,0.117225,-1.510212,-1.302399,9,357,"{'this_model__max_depth': 9, 'this_model__n_es...",1,-1.47651,-1.300003,...,-1.456272,-1.324416,-1.48689,-1.29843,-1.602173,-1.292912,0.839359,0.002473,0.051778,0.011262
8,37.264799,0.117023,-1.511644,-1.302622,9,327,"{'this_model__max_depth': 9, 'this_model__n_es...",2,-1.479084,-1.298343,...,-1.459901,-1.32668,-1.491348,-1.298687,-1.599829,-1.291844,0.680744,0.002512,0.049382,0.012284
3,33.122976,0.116198,-1.523093,-1.349697,8,330,"{'this_model__max_depth': 8, 'this_model__n_es...",3,-1.485444,-1.347518,...,-1.472438,-1.371839,-1.500321,-1.345976,-1.617054,-1.340453,0.613498,0.002241,0.052192,0.011343
2,26.753457,0.115146,-1.536568,-1.3992,7,309,"{'this_model__max_depth': 7, 'this_model__n_es...",4,-1.501755,-1.402139,...,-1.486199,-1.419668,-1.511875,-1.393706,-1.632572,-1.386504,0.14629,0.002334,0.052466,0.011368
7,7.713429,0.116984,-1.552546,-1.448691,6,103,"{'this_model__max_depth': 6, 'this_model__n_es...",5,-1.508138,-1.448687,...,-1.508645,-1.468553,-1.53408,-1.448003,-1.645241,-1.43444,0.046275,0.001497,0.051043,0.011156
5,10.658119,0.118517,-1.601178,-1.551342,4,214,"{'this_model__max_depth': 4, 'this_model__n_es...",6,-1.551608,-1.55731,...,-1.55567,-1.565704,-1.574972,-1.549195,-1.711592,-1.538051,0.224537,0.001142,0.05919,0.009456
6,19.190133,0.11903,-1.602158,-1.551575,4,390,"{'this_model__max_depth': 4, 'this_model__n_es...",7,-1.552526,-1.55866,...,-1.559717,-1.567027,-1.575387,-1.548163,-1.70894,-1.53591,0.505802,0.000949,0.057486,0.010564
9,3.724215,0.118103,-1.633703,-1.601944,3,97,"{'this_model__max_depth': 3, 'this_model__n_es...",8,-1.579065,-1.610576,...,-1.594646,-1.615252,-1.612481,-1.604739,-1.738243,-1.581767,0.047146,0.0016,0.056553,0.011725
0,15.741,0.119691,-1.634104,-1.60237,3,456,"{'this_model__max_depth': 3, 'this_model__n_es...",9,-1.581099,-1.611867,...,-1.593807,-1.614486,-1.611828,-1.605013,-1.736617,-1.581402,0.760373,0.003138,0.055864,0.011785
4,14.973077,0.116919,-1.634656,-1.60283,3,409,"{'this_model__max_depth': 3, 'this_model__n_es...",10,-1.581227,-1.611328,...,-1.594994,-1.615254,-1.609146,-1.603829,-1.738645,-1.582309,0.28908,0.002118,0.056746,0.011407


In [20]:
np.sqrt(13)

3.6055512754639891