In [1]:
# Move the working directory two levels up; this is required to get access
# to the model_stacking package.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up two more levels.
import os
os.chdir(os.path.join(os.pardir, os.pardir))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
from framework.model_stacking import getConfigParameters
import pickle

retrieving configuration file: config.yml from current working directory


In [3]:
# Load the project configuration once and unpack the entries this notebook
# uses into module-level constants.
CONFIG = getConfigParameters()
ROOT_DIR, DATA_DIR, ID_VAR, TARGET_VAR = (
    CONFIG[key] for key in ('ROOT_DIR', 'DATA_DIR', 'ID_VAR', 'TARGET_VAR')
)

In [4]:
from sklearn.ensemble import RandomForestRegressor as ThisModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import randint

In [5]:
# Feature set to tune on; it also determines the name of the pickle file
# that the tuning results are written to.
FEATURE_SET = 'KFS01'
MODEL_ALGO = 'randomforest_level0_{}_hyp.pkl'.format(FEATURE_SET)

In [6]:
# Search space for the randomized search. Keys use the '<step>__<param>'
# form so they address the 'this_model' step of the pipeline.
# scipy's randint(low, high) samples integers from low up to high - 1.
PARAM_GRID = {
    'this_model__n_estimators': randint(100, 800),
    'this_model__min_samples_split': randint(2, 8),
    'this_model__max_depth': randint(2, 15),
}

# Number of parameter combinations sampled from PARAM_GRID.
N_ITER = 30

## Get Training Data

In [7]:
# get training data
# Read the gzip-compressed training set of the selected feature set.
train_path = os.path.join(ROOT_DIR, DATA_DIR, FEATURE_SET, 'train.csv.gz')
train_df = pd.read_csv(train_path)

In [8]:
# Split the frame into the feature matrix (everything except the id and
# target columns) and the target vector.
non_feature_cols = ID_VAR + [TARGET_VAR]
X_train = train_df.drop(non_feature_cols, axis=1)
y_train = train_df[TARGET_VAR]

In [9]:
# Sanity check: (rows, columns) of the feature matrix.
X_train.shape

(4459, 4730)

In [10]:
# Sanity check: the target length should equal the number of rows in X_train.
y_train.shape

(4459,)

In [11]:
# Peek at the first ten target values.
y_train.head(10)

0    17.453097
1    13.304687
2    16.118096
3    14.508658
4    16.482739
5    14.845130
6    12.007628
7    13.304687
8    13.794288
9    13.038984
Name: target, dtype: float64

## Set up pipeline for hyper-parameter tuning

In [12]:
# set up pipeline
# Single-step pipeline; the step name 'this_model' is the prefix used by the
# keys of PARAM_GRID. The forest uses all cores (n_jobs=-1) and a fixed
# random_state so that repeated runs produce the same CV scores for a given
# parameter combination.
pipe = Pipeline(steps=[
    ('this_model', ThisModel(n_jobs=-1, random_state=42)),
])

In [13]:
def kag_rmsle(y, y_hat):
    """Return the square root of the mean squared error between y and y_hat.

    NOTE(review): the name suggests RMSLE, but no log transform is applied
    here; presumably the target is already on a log scale - confirm.
    """
    mse = mean_squared_error(y, y_hat)
    return np.sqrt(mse)


# greater_is_better=False makes the scorer report the negated metric, so that
# scikit-learn's "higher score is better" convention selects the lowest error.
this_scorer = make_scorer(kag_rmsle, greater_is_better=False)

In [None]:

# Randomized search: N_ITER parameter combinations are sampled from
# PARAM_GRID and each one is evaluated with 5-fold cross-validation.
grid_search = RandomizedSearchCV(
    pipe,
    param_distributions=PARAM_GRID,
    n_iter=N_ITER,
    scoring=this_scorer,
    cv=5,
    # The search itself runs serially; parallelism is left to the estimator,
    # which is created with n_jobs=-1.
    n_jobs=1,
    # Only the CV results are needed, so no final model is refit on the full
    # training set.
    refit=False,
    # Fix the seed so the same candidates are sampled on every run and the
    # saved tuning results can be reproduced.
    random_state=42,
    verbose=2,
)

In [None]:
# Run the search: every sampled candidate is fitted once per CV fold
# (N_ITER candidates x 5 folds). This is the long-running cell of the notebook.
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202 
[CV]  this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202, total=   4.7s
[CV] this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s remaining:    0.0s


[CV]  this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202, total=   4.6s
[CV] this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202 
[CV]  this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202, total=   4.8s
[CV] this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202 
[CV]  this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202, total=   5.0s
[CV] this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202 
[CV]  this_model__max_depth=2, this_model__min_samples_split=4, this_model__n_estimators=202, total=   4.8s
[CV] this_model__max_depth=5, this_model__min_samples_split=6, this_model__n_estimators=460 
[CV]  this_model__max_depth=5, this_model__min_samples_split=6, this_model__n_estimators=460, total=  27.5s
[CV] this_model__max_depth=5, this_model__min_samples_split=6, this_model__n_estimators=

[CV]  this_model__max_depth=9, this_model__min_samples_split=3, this_model__n_estimators=128, total=  14.9s
[CV] this_model__max_depth=9, this_model__min_samples_split=3, this_model__n_estimators=128 
[CV]  this_model__max_depth=9, this_model__min_samples_split=3, this_model__n_estimators=128, total=  15.1s
[CV] this_model__max_depth=9, this_model__min_samples_split=3, this_model__n_estimators=128 
[CV]  this_model__max_depth=9, this_model__min_samples_split=3, this_model__n_estimators=128, total=  14.6s
[CV] this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=385 
[CV]  this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=385, total=  19.2s
[CV] this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=385 
[CV]  this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=385, total=  19.3s
[CV] this_model__max_depth=4, this_model__min_samples_split=2, this_model__n_estimators=

[CV]  this_model__max_depth=13, this_model__min_samples_split=7, this_model__n_estimators=718, total= 1.8min
[CV] this_model__max_depth=13, this_model__min_samples_split=7, this_model__n_estimators=718 
[CV]  this_model__max_depth=13, this_model__min_samples_split=7, this_model__n_estimators=718, total= 1.8min
[CV] this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_estimators=435 
[CV]  this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_estimators=435, total= 1.0min
[CV] this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_estimators=435 
[CV]  this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_estimators=435, total= 1.0min
[CV] this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_estimators=435 
[CV]  this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_estimators=435, total= 1.0min
[CV] this_model__max_depth=11, this_model__min_samples_split=2, this_model__n_e

In [None]:
# Sampled parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the best candidate. The scorer is built with
# greater_is_better=False, so this is the negated error (closer to 0 is better).
grid_search.best_score_

In [None]:
# Tabulate the per-candidate CV results, best-ranked candidate first.
cv_results = pd.DataFrame(grid_search.cv_results_)
df = cv_results.sort_values(by='rank_test_score')
df

In [None]:
# Persist the tuning run (feature set name plus the ranked CV results table)
# as a pickle under <ROOT_DIR>/eda/hyper-parameter_tuning/.
output_path = os.path.join(
    CONFIG['ROOT_DIR'], 'eda', 'hyper-parameter_tuning', MODEL_ALGO
)
hyper_parameters = {'FeatureSet': FEATURE_SET, 'cv_run': df}
with open(output_path, 'wb') as f:
    pickle.dump(hyper_parameters, f)