# Hyperparameter optimization of random forest model

In this notebook, Bayesian hyperparameter optimization is performed for the random forest model.

In [None]:
# general dependencies

import pandas as pd
import numpy as np

# bayesian optimization dependencies

from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_predict
from skopt import gp_minimize
from skopt.plots import plot_convergence

In [None]:
# suppress all warnings raised in this notebook
# optional: comment out the two lines below to see the warnings again
import warnings
warnings.filterwarnings('ignore')

__Restore the split data__

In [None]:
%store -r X_train
%store -r y_train

In [None]:
%store -r X_val
%store -r y_val

__Metrics function__

In [None]:
# function returning r2, rmse and AARD
from sklearn.metrics import r2_score, mean_squared_error

def result_stats(actual, predicted):
    """
    Return the r2, rmse and AARD values for two arrays of equal length.

    Parameters
    ----------
    actual : array-like
        Reference (true) values. AARD divides by these values, so a zero
        entry makes the AARD infinite or NaN.
    predicted : array-like
        Predicted values, with as many elements as ``actual``.

    Returns
    -------
    tuple
        ``(r2, rmse, aard)``; ``aard`` is the average absolute relative
        deviation expressed in percent.
    """
    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Compute AARD on plain float arrays so the arithmetic is positional:
    # lists are accepted and pandas objects are not re-aligned by index label.
    actual_arr = np.asarray(actual, dtype=float)
    predicted_arr = np.asarray(predicted, dtype=float)

    # An (n,) array against an (n, 1) array would broadcast to an (n, n)
    # matrix; bring the prediction to the shape of the reference instead.
    same_size = predicted_arr.size == actual_arr.size
    if same_size and predicted_arr.shape != actual_arr.shape:
        predicted_arr = predicted_arr.reshape(actual_arr.shape)

    relative_dev = np.abs((actual_arr - predicted_arr) / actual_arr)
    aard = (100 / len(actual_arr)) * np.sum(relative_dev)

    return r2, rmse, aard

__Import the random forest model__

In [None]:
from sklearn.ensemble import RandomForestRegressor as RF
# Shared regressor instance; the optimization objective below reconfigures it
# with rf.set_params() for every evaluated hyperparameter configuration.
rf = RF(n_estimators=20)     # use 20 estimators

## Bayesian hyperparameter optimization

In [None]:
# searched hyperparameter space: one integer dimension per tuned parameter,
# max_features is bounded by the number of columns in X_train
space = [
    Integer(low=1, high=10, name='max_depth'),
    Integer(low=3, high=10, name='min_samples_leaf'),
    Integer(low=1, high=len(X_train.columns), name='max_features'),
]

# call counter, incremented by objective() to label its progress output
run = 0

# function returning rmse on validation data for given hyperparameter configuration
@use_named_args(space)
def objective(**params):
    """
    Fit the random forest with ``params`` and return its validation RMSE.

    The hyperparameters arrive as keyword arguments (named after the
    dimensions of ``space``) and are applied to the shared ``rf`` instance.
    The model is fitted on ``X_train``/``y_train`` and scored on
    ``X_val``/``y_val``.

    Side effects: increments the global ``run`` counter and prints one
    progress line per call.

    Returns
    -------
    float
        RMSE of the prediction on the validation data (value to minimize).
    """
    global run

    rf.set_params(**params)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_val)

    # result_stats expects (actual, predicted); RMSE is symmetric, but the
    # call follows the documented order so r2/AARD would be correct as well
    _, rmse, _ = result_stats(y_val, rf_pred)

    run += 1
    print("Run #{0}: {1:.2f}; {2}".format(run, rmse, params))

    return rmse

In [None]:
# Bayesian optimization over `space`: 50 evaluations of the objective,
# with a fixed seed for the optimizer
bayes = gp_minimize(
    func=objective,
    dimensions=space,
    n_calls=50,
    random_state=0,
)


In [None]:
# report the ten lowest objective values together with their call numbers
print("Top 10 results")
print("-------------------------")

# (rmse, 1-based call number) pairs, ordered from best to worst rmse
results = sorted(
    (value, call_no)
    for call_no, value in enumerate(bayes.func_vals, start=1)
)

for r in results[:10]:
    print("RMSE {0:.2f} in run {1}".format(*r))


In [None]:
# plot the convergence of the optimization result stored in `bayes`
# to see how the search converged towards the minimum rmse
plot_convergence(bayes)

__NOTE:__ Because hyperparameter optimization for the random forest model is quick, it is advisable to rerun the optimization several times to minimize the chance of getting stuck in a local minimum.

In [None]:
# save the optimal hyperparameters
# the results might vary depending on the train/test split of the shuffled data
# NOTE: the values below are typed in by hand, they are not read from `bayes`;
# update them after re-running the optimization

rf_params = {'max_depth': 10, 'min_samples_leaf': 3, 'max_features': 13}

# persist rf_params with the IPython %store magic (restorable via `%store -r`)
%store rf_params