# Hyperparameter Exploration

This notebook is concerned with exploring the hyperparameters associated with the Random Forest-style regressor used here (scikit-learn's `ExtraTreesRegressor`). It is _extremely_ computationally intensive so you should only get stuck into this if you have: a) time, and b) an interest in whether I've selected the optimal parameters.

In [None]:
# Needed on a Mac
import matplotlib as mpl
mpl.use('TkAgg')
%matplotlib inline
import matplotlib.pyplot as plt 

In [None]:
# For reproducibility
import random
import numpy as np
# One seed shared by Python's `random` module and NumPy's global RNG; the same
# value is passed as `random_state` to the estimator and the train/test split
# further down the notebook.
r_state = 42
random.seed(r_state) 
np.random.seed(r_state)

In [None]:
import os
import re
import pandas as pd
import seaborn as sns

import sklearn
print('Your scikit-learn version is {}.'.format(sklearn.__version__))
print('Please check it is at least 0.18.0.')

from sklearn.preprocessing import scale
from sklearn import linear_model
from sklearn import tree
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics  
from sklearn import ensemble

from sklearn.externals.six import StringIO
#from sklearn.model_selection import GridSearchCV
#from sklearn.feature_selection import SelectKBest 
#from sklearn.feature_selection import f_regression

from timeit import default_timer as timer
import datetime

In [None]:
# Relative directory that the score/data CSV files are read from and that the
# hyperparameter result CSVs are written to.
analytical = os.path.join('data','analytical')

def load_status_scores(dtype):
    """Load the SES score table for one data variant.

    Reads ``<dtype>-Scores.csv.gz`` from the ``analytical`` directory (first
    column used as the index), discards the two rank columns and gives the
    remaining score columns human-readable names.

    Parameters
    ----------
    dtype : str
        Prefix of the scores file name (e.g. the value of ``to_use``).

    Returns
    -------
    pandas.DataFrame
        The scores with the renamed columns.
    """
    scores_path = os.path.join(analytical, dtype + '-Scores.csv.gz')

    readable_names = {
        'SES_01': 'SES 2001',
        'SES_11': 'SES 2011',
        'SES_ASC': 'SES Ascent 2001-2011',
        'SES_PR_01': 'SES 2001 Percentile',  # 99 = High-status
        'SES_PR_11': 'SES 2011 Percentile',  # 99 = High-status
        'SES_PR_ASC': 'SES Percentile Ascent 2001-2011',
    }

    # Build the result as one chain instead of mutating the frame in place
    return (
        pd.read_csv(scores_path, index_col=0)   # SES scores
          .drop(['RANK_01', 'RANK_11'], axis=1)
          .rename(columns=readable_names)
    )

def classifier_report(clf, y_true, y_hat):
    """Format a short text report of regression metrics.

    Only regression metrics are computed (R2, mean squared error, mean
    absolute error and explained variance), each on its own line, followed
    by one blank line.

    Parameters
    ----------
    clf : object
        The fitted estimator. Accepted for call-site symmetry; it is not
        used when building the report.
    y_true : array-like
        Observed target values.
    y_hat : array-like
        Predicted target values.

    Returns
    -------
    str
        The multi-line report.
    """
    # (line template, scoring function) in the order they are reported
    report_spec = (
        ("R2:        {0:8.5f}", metrics.r2_score),                  # Coefficient of determination
        ("MSE:       {0:8.5f}", metrics.mean_squared_error),        # Mean squared error regression loss
        ("MAE:       {0:8.5f}", metrics.mean_absolute_error),       # Mean absolute error regression loss
        ("Expl. Var: {0:8.5f}", metrics.explained_variance_score),  # Explained variance regression score
    )

    rows = [template.format(scorer(y_true, y_hat)) for template, scorer in report_spec]

    # Every metric line ends with a newline and the report closes with a blank line
    return "\n".join(rows) + "\n" + "\n"

## Exploring Hyperparameters

The code below is concerned with exploring the impact that different hyperparameter settings can have on the performance of the overall prediction.

In [None]:
# Take a parameter grid and explore a hyperparameter space
# using Cross-Fold Validation...
def explore_extr_hyper(params, x_train, y_train, x_holdout=None, y_holdout=None):
    """Grid-search an ExtraTreesRegressor and report hold-out performance.

    Runs a 4-fold ``GridSearchCV`` over ``params`` scored by negative mean
    absolute error, prints the best cross-validated score and parameters,
    then prints a regression report for the best estimator's predictions on
    the hold-out data.

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    x_train, y_train
        Training features and target used for the cross-validated search.
    x_holdout, y_holdout : optional
        Hold-out features and target for the final report. When omitted,
        the notebook-level ``X_test`` / ``y_test`` are used, so existing
        three-argument calls keep working.

    Returns
    -------
    GridSearchCV
        The fitted search object (``cv_results_``, ``best_estimator_``, ...).
    """
    # Prefer explicitly supplied hold-out data; fall back to the notebook's
    # global split only when the caller did not provide any.
    if x_holdout is None:
        x_holdout = X_test
    if y_holdout is None:
        y_holdout = y_test

    clf = ensemble.ExtraTreesRegressor(n_jobs=-1, random_state=r_state)
    cv  = model_selection.GridSearchCV(estimator=clf, param_grid=params, cv=4, n_jobs=2,
                                       refit=True, return_train_score=True, verbose=1,
                                       scoring='neg_mean_absolute_error')

    cv.fit(x_train, y_train)

    print("Best score: " + str(cv.best_score_))
    print("Best parameters: " + str(cv.best_params_))

    # With refit=True the search has already re-fitted the best estimator on
    # the whole of x_train/y_train (same random_state), so fitting it again
    # here would only repeat that work.
    best_clf = cv.best_estimator_
    y_pred   = best_clf.predict(x_holdout)

    print(classifier_report(best_clf, y_holdout, y_pred))
    return cv

# Flatten the results of a Cross-Validation search into a
# data frame: mean/std of training and test scores, the
# rank of each candidate, and one column per parameter.
def cv_to_df(cvr):
    """Convert a fitted search object's ``cv_results_`` into a DataFrame.

    Parameters
    ----------
    cvr : object
        Anything exposing a ``cv_results_`` mapping with the keys
        ``params``, ``mean_train_score``, ``mean_test_score``,
        ``std_train_score``, ``std_test_score`` and ``rank_test_score``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the score columns, a ``rank`` column and
        one column for every parameter named in the first candidate.
    """
    results     = cvr.cv_results_
    candidates  = results['params']

    # Score summary columns
    df = pd.DataFrame.from_dict({
        'Training Score':          results['mean_train_score'],
        'Test Score':              results['mean_test_score'],
        'Std. of Training Scores': results['std_train_score'],
        'Std. of Test Scores':     results['std_test_score'],
    })

    # Rank of each candidate, aligned on the frame's own index
    df['rank'] = pd.Series(results['rank_test_score'], index=df.index)

    # The parameter names are taken from the first candidate; each one
    # becomes a column holding that parameter's value for every candidate.
    for name in list(candidates[0].keys()):
        df[name] = pd.Series([candidate[name] for candidate in candidates], index=df.index)

    return df

In [None]:
# Can override to_use here if have already generated data above
to_use = 'Untransformed'

SES = load_status_scores(to_use)  # SES scores in 2011

#  Read the transformed data
d01_trs2 = pd.read_csv(os.path.join(analytical,to_use+'-2001-Data-Transformed_and_Scaled.csv.gz'), index_col=0)
d11_trs2 = pd.read_csv(os.path.join(analytical,to_use+'-2011-Data-Transformed_and_Scaled.csv.gz'), index_col=0)

# Data about variables used later in process
vardb = pd.read_csv(os.path.join('data','variables.csv'), index_col=False)
vardb.drop('Description', axis=1, inplace=True)

To evaluate the models reliably, a portion of the dataset must be kept back as a holdout on which the fitted model can be evaluated independently. The code below splits the data into training and test sets using a test size of 20%.

In [None]:
# Hold out 20% of the rows as a test set. Features are the 2001 data frame and
# the target is the 'SES Ascent 2001-2011' column of SES; the split is seeded
# with r_state.
# NOTE(review): this presumably pairs features and target by row position, so
# d01_trs2 and SES need to share the same row order -- confirm against the files.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    d01_trs2, SES['SES Ascent 2001-2011'], test_size=0.2, random_state=r_state)

### n_estimators

This one is a beast since computations pile up as you increase the number of trees. For 400 fits on a MacBook Air I get a total running time of 5:10:40.

In [None]:
# Candidate forest sizes: 20, 40, ..., 2000 trees
param_grid = {"n_estimators": list(range(20, 2001, 20))}

# Time the whole search
start = timer()
cv1 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save results to CSV file
cv_to_df(cv1).to_csv(
    os.path.join(analytical, to_use+'-Scores-n_estimators.csv'),
    index=False)

### max_depth

This appears to take approximately 36 seconds on a MacBook Air.

In [None]:
# Candidate tree depths: 10, 20, ..., 160
param_grid = {"max_depth": list(range(10, 161, 10))}

# Time the whole search
start = timer()
cv2 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save results to CSV file
cv_to_df(cv2).to_csv(
    os.path.join(analytical, to_use+'-Scores-max_depth.csv'),
    index=False)

### min_samples_leaf

This is relatively quick since increasing the minimum size of terminal leaves reduces the depth of the trees substantially. It should take approximately 21 seconds on a MacBook Air.

In [None]:
# Candidate minimum leaf sizes: 1, 2, ..., 25 samples
param_grid = {"min_samples_leaf": list(range(1, 26))}

# Time the whole search
start = timer()
cv3 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save results to CSV file
cv_to_df(cv3).to_csv(
    os.path.join(analytical, to_use+'-Scores-min_samples_leaf.csv'),
    index=False)

### max_features & bootstrap

The `max_features` parameter limits how many features each tree can employ, expressed as a share of the total number of features (1.0). Bootstrapping should not be necessary with a `k`-folds approach, but in some cases it can change the results. Running this appears to take about 40 seconds on a MacBook Air.

In [None]:
param_grid = {
    # Feature shares from 0.1 in steps of 0.1 (stopping before 1.01), followed by the
    # two named settings. For regression normally n_features (worth trying after shorter runs)
    "max_features"  : np.arange(start=0.1, stop=1.01, step=0.1).tolist() + ['auto', 'sqrt'],
    "bootstrap"     : [True, False]    # Not normally needed for ExtraTrees, but seems to improve performance?
}

# Time the whole search
start = timer()
cv4 = explore_extr_hyper(param_grid, X_train, y_train)
duration = timer() - start
print(
    "Execution complete in: {0:15.1f}s".format(duration)
    + " (" + str(datetime.timedelta(seconds=duration)) + ")"
)

# Save results to CSV file
cv_to_df(cv4).to_csv(
    os.path.join(analytical, to_use+'-Scores-max_features_and_bootstrap.csv'),
    index=False)

## Important Caveat

Although this exploration provides a useful overview of how the tuning of different hyperparameters can impact overall performance of the regressor, the hyperparameters _do not act independently of one another_. In other words: this is just exploration to get a 'feel' for the algorithm, and we will actually need to undertake a much, much, much more computationally challenging 'grid search' in [Notebook 8](08-Neighbourhood%20Prediction.ipynb) (or, I would suggest, [Script 8](08-Neighbourhood%20Prediction.py)).