In [1]:
# required to get access to model_stacking package
import os
os.chdir('../..')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
from framework.model_stacking import getConfigParameters
import pickle

retrieving configuration file: config.yml from current working directory


In [3]:
CONFIG = getConfigParameters()
ROOT_DIR = CONFIG['ROOT_DIR']
DATA_DIR = CONFIG['DATA_DIR']
ID_VAR = CONFIG['ID_VAR']
TARGET_VAR = CONFIG['TARGET_VAR']

In [4]:
from sklearn.ensemble import RandomForestRegressor as ThisModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import randint

In [5]:
FEATURE_SET = 'L1FS01'
MODEL_ALGO = 'randomforest_level1_df_hyp.pkl'

In [6]:
PARAM_GRID = dict(this_model__n_estimators=randint(100,800),
                  this_model__min_samples_split=randint(2,8),
                 this_model__max_depth=randint(2,15))

N_ITER = 30

## Get Training Data

In [7]:
# get training data
train_df = pd.read_csv(os.path.join(ROOT_DIR,DATA_DIR,FEATURE_SET,'train.csv.gz'))

In [8]:
X_train = train_df.drop(ID_VAR + [TARGET_VAR],axis=1)
y_train = train_df.loc[:,TARGET_VAR]

In [9]:
X_train.shape

(4459, 2)

In [10]:
y_train.shape

(4459,)

In [11]:
y_train[:10]

0    17.453097
1    13.304687
2    16.118096
3    14.508658
4    16.482739
5    14.845130
6    12.007628
7    13.304687
8    13.794288
9    13.038984
Name: target, dtype: float64

## Setup pipeline for hyper-parameter tuning

In [12]:
# set up pipeline
pipe = Pipeline([('this_model',ThisModel(n_jobs=-1))])

In [13]:
def kag_rmsle(y,y_hat):
    return np.sqrt(mean_squared_error(y,y_hat))

this_scorer = make_scorer(kag_rmsle, greater_is_better=False)

In [17]:

grid_search = RandomizedSearchCV(pipe, 
                                 param_distributions=PARAM_GRID,
                                 scoring=this_scorer,cv=5,
                                 n_iter=N_ITER,
                                 verbose=2,
                                 n_jobs=1,
                                  refit=False)

In [18]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211 
[CV]  this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211, total=   0.3s
[CV] this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211, total=   0.3s
[CV] this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211 
[CV]  this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211, total=   0.3s
[CV] this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211 
[CV]  this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211, total=   0.4s
[CV] this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211 
[CV]  this_model__max_depth=3, this_model__min_samples_split=7, this_model__n_estimators=211, total=   0.3s
[CV] this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=161 
[CV]  this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=161, total=   0.3s
[CV] this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=

[CV]  this_model__max_depth=11, this_model__min_samples_split=4, this_model__n_estimators=635, total=   1.0s
[CV] this_model__max_depth=11, this_model__min_samples_split=4, this_model__n_estimators=635 
[CV]  this_model__max_depth=11, this_model__min_samples_split=4, this_model__n_estimators=635, total=   1.0s
[CV] this_model__max_depth=11, this_model__min_samples_split=4, this_model__n_estimators=635 
[CV]  this_model__max_depth=11, this_model__min_samples_split=4, this_model__n_estimators=635, total=   1.1s
[CV] this_model__max_depth=9, this_model__min_samples_split=4, this_model__n_estimators=697 
[CV]  this_model__max_depth=9, this_model__min_samples_split=4, this_model__n_estimators=697, total=   1.0s
[CV] this_model__max_depth=9, this_model__min_samples_split=4, this_model__n_estimators=697 
[CV]  this_model__max_depth=9, this_model__min_samples_split=4, this_model__n_estimators=697, total=   1.0s
[CV] this_model__max_depth=9, this_model__min_samples_split=4, this_model__n_estima

[CV]  this_model__max_depth=3, this_model__min_samples_split=6, this_model__n_estimators=184, total=   0.3s
[CV] this_model__max_depth=3, this_model__min_samples_split=6, this_model__n_estimators=184 
[CV]  this_model__max_depth=3, this_model__min_samples_split=6, this_model__n_estimators=184, total=   0.3s
[CV] this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=260 
[CV]  this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=260, total=   0.5s
[CV] this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=260 
[CV]  this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=260, total=   0.5s
[CV] this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=260 
[CV]  this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=260, total=   0.5s
[CV] this_model__max_depth=2, this_model__min_samples_split=6, this_model__n_estimators=

[CV]  this_model__max_depth=7, this_model__min_samples_split=3, this_model__n_estimators=320, total=   0.5s
[CV] this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138 
[CV]  this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138, total=   0.3s
[CV] this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138 
[CV]  this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138, total=   0.3s
[CV] this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138 
[CV]  this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138, total=   0.3s
[CV] this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138 
[CV]  this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=138, total=   0.3s
[CV] this_model__max_depth=2, this_model__min_samples_split=7, this_model__n_estimators=

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.1min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('this_model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'this_model__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11db4f2b0>, 'this_model__min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11db4f400>, 'this_model__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11db4f588>},
          pre_dispatch='2*n_jobs', random_state=None, refit=False,
          return_train_score=True,
          scoring=make_scorer(kag_rmsle, greater_is_better

In [19]:
grid_search.best_params_

{'this_model__max_depth': 4,
 'this_model__min_samples_split': 2,
 'this_model__n_estimators': 161}

In [20]:
grid_search.best_score_

-1.4347663661450438

In [21]:
df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
df

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_this_model__max_depth,param_this_model__min_samples_split,param_this_model__n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
1,0.199424,0.106746,-1.434766,-1.400247,4,2,161,"{'this_model__max_depth': 4, 'this_model__min_...",1,-1.404956,...,-1.36928,-1.415705,-1.409672,-1.407017,-1.496967,-1.38412,0.002849,0.001198,0.051125,0.012976
15,0.443443,0.108531,-1.435437,-1.400084,4,5,399,"{'this_model__max_depth': 4, 'this_model__min_...",2,-1.405107,...,-1.370899,-1.41563,-1.409717,-1.406422,-1.496384,-1.383426,0.005716,0.001592,0.051039,0.013007
0,0.253399,0.107734,-1.436827,-1.414089,3,7,211,"{'this_model__max_depth': 3, 'this_model__min_...",3,-1.406406,...,-1.371091,-1.429569,-1.413471,-1.420904,-1.498742,-1.398038,0.036672,0.000624,0.050892,0.012924
16,0.210717,0.107581,-1.437812,-1.414009,3,6,184,"{'this_model__max_depth': 3, 'this_model__min_...",4,-1.406899,...,-1.371771,-1.429093,-1.414849,-1.42108,-1.499756,-1.398031,0.001108,0.001258,0.051086,0.012874
19,0.717133,0.108202,-1.437879,-1.379346,5,7,613,"{'this_model__max_depth': 5, 'this_model__min_...",5,-1.408236,...,-1.373198,-1.395277,-1.411892,-1.386534,-1.497862,-1.362566,0.05309,0.002077,0.050955,0.013424
26,0.67463,0.109931,-1.438208,-1.378867,5,5,640,"{'this_model__max_depth': 5, 'this_model__min_...",6,-1.408638,...,-1.373267,-1.394823,-1.412471,-1.385767,-1.49801,-1.362312,0.003722,0.001355,0.050972,0.013071
12,0.57663,0.109793,-1.438253,-1.378486,5,2,463,"{'this_model__max_depth': 5, 'this_model__min_...",7,-1.408538,...,-1.374596,-1.394067,-1.411391,-1.385287,-1.497824,-1.361663,0.003635,0.001387,0.050778,0.013181
18,0.313348,0.106589,-1.438272,-1.378626,5,4,217,"{'this_model__max_depth': 5, 'this_model__min_...",8,-1.408376,...,-1.37339,-1.394367,-1.411209,-1.385714,-1.499423,-1.36149,0.042672,0.002071,0.051509,0.013312
13,0.664791,0.10959,-1.438586,-1.379178,5,6,617,"{'this_model__max_depth': 5, 'this_model__min_...",9,-1.408853,...,-1.375067,-1.395113,-1.411946,-1.386412,-1.498584,-1.362271,0.005945,0.001673,0.05064,0.013324
10,0.196841,0.107823,-1.438708,-1.379463,5,6,159,"{'this_model__max_depth': 5, 'this_model__min_...",10,-1.410671,...,-1.373016,-1.395868,-1.413142,-1.386444,-1.498325,-1.362702,0.001776,0.00103,0.050749,0.013462


In [22]:
hyper_parameters = dict(FeatureSet=FEATURE_SET,cv_run=df)
with open(os.path.join(CONFIG['ROOT_DIR'],'eda','hyper-parameter_tuning',MODEL_ALGO),'wb') as f:
    pickle.dump(hyper_parameters,f)