# How to Train & Tune a Random Forest

### Loading Libraries

In [2]:
# OS
import os, sys

# Numerical Computing
import numpy as np
from numpy.random import choice, normal

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# SciPy
from scipy.stats import spearmanr

# Warnings
import warnings

# Path
from pathlib import Path

# JobLib
import joblib

# Scikit-Learn
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, RandomForestClassifier

In [5]:
# Utils - project-local helper module (not part of an installed library).
# The parent directory has to be on sys.path *before* the import runs,
# otherwise a fresh kernel (Restart & Run All) cannot resolve `utils`.
# NOTE(review): assumes utils.py lives one directory above the notebook - confirm.
_parent_dir = os.path.join(sys.path[0], '..')
if _parent_dir not in sys.path:
    sys.path.insert(1, _parent_dir)
from utils import MultipleTimeSeriesCV

In [6]:
# Make modules in the parent directory importable.
# Guarded so that re-running this cell does not keep adding duplicate entries.
_parent_dir = os.path.join(sys.path[0], '..')
if _parent_dir not in sys.path:
    sys.path.insert(1, _parent_dir)

In [7]:
# Plot styling: plain white background for all seaborn/matplotlib figures.
sns.set_style('white')

# Seed NumPy's global random number generator.
np.random.seed(42)

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [8]:
# Directory where fitted grid-search objects are persisted.
results_path = Path('results', 'random_forest')

# exist_ok=True makes the call a no-op when the directory is already there,
# which avoids the check-then-create race of a separate exists() test.
results_path.mkdir(parents=True, exist_ok=True)

### Getting Data

In [10]:
# Open the store read-only: the default mode ('a') opens the file for
# reading and writing and silently creates an empty file when the path
# does not exist, which is never wanted for a pure load.
with pd.HDFStore('data.h5', mode='r') as store:
    data = store['us/equities/monthly']

# Column dtypes, non-null counts and memory footprint of the loaded frame.
data.info()

In [11]:
# Continuous target for the regressor, and its sign (1 if > 0, else 0)
# as the label for the classifier.
y = data['target']
y_binary = y.gt(0).astype(int)

# Feature matrix: every column except the target, with non-numeric
# columns expanded into indicator (dummy) variables.
X = pd.get_dummies(data.drop(columns='target'))

### Random Forests

#### Cross-Validation Parameters

In [12]:
# Cross-validation layout.
# NOTE(review): period lengths are presumably in months (monthly data set) - confirm.
n_splits = 10
train_period_length = 60
test_period_length = 6
lookahead = 1

# Collect the settings once and hand them to the project-specific splitter.
cv_settings = dict(n_splits=n_splits,
                   train_period_length=train_period_length,
                   test_period_length=test_period_length,
                   lookahead=lookahead)
cv = MultipleTimeSeriesCV(**cv_settings)

#### Classifier

In [13]:
# Baseline random-forest classifier with (mostly) library-default settings.
#
# Two arguments from the original call no longer exist in current scikit-learn:
#   * `min_impurity_split` was removed in 1.0 - passing it raises TypeError.
#     Its old default (None) is what you get by simply omitting it.
#   * `max_features='auto'` was removed in 1.3. For classifiers it was an
#     alias for 'sqrt', which is accepted by old and new versions alike.
rf_clf = RandomForestClassifier(n_estimators=100,
                                criterion='gini',
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features='sqrt',
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                bootstrap=True,
                                oob_score=True,   # needs bootstrap=True
                                n_jobs=-1,        # use all available cores
                                random_state=42,
                                verbose=1)

#### Cross-Validation with Default Settings

In [14]:
# Out-of-sample ROC AUC of the baseline classifier, one value per CV split.
cv_score = cross_val_score(rf_clf, X, y_binary,
                           scoring='roc_auc',
                           cv=cv,
                           n_jobs=-1,
                           verbose=1)

In [15]:
# Average score across all CV splits.
cv_score.mean()

#### Regression RF

In [16]:
def rank_correl(y, y_pred):
    """Return the Spearman rank correlation between `y` and `y_pred`.

    Only the correlation coefficient is returned; the accompanying
    p-value computed by `spearmanr` is discarded.
    """
    rho, _pvalue = spearmanr(y, y_pred)
    return rho

# Wrap the rank correlation as a scikit-learn scorer (higher is better).
# NOTE(review): `ic` presumably stands for "information coefficient" - confirm.
ic = make_scorer(score_func=rank_correl)

In [41]:
# Baseline random-forest regressor with (mostly) library-default settings.
#
# Changes relative to the original call:
#   * `min_impurity_split` was removed in scikit-learn 1.0 - passing it raises
#     TypeError. Omitting it is equivalent to the old default (None).
#   * `max_features='auto'` was removed in 1.3. For regressors it meant
#     "use all features", which is spelled 1.0 (fraction of features) and is
#     accepted by old and new versions alike.
#   * `random_state` is fixed (same seed as the classifier) so repeated runs
#     give the same forest; the global NumPy seed does not reliably carry over
#     to the worker processes used by parallel cross-validation.
rf_reg = RandomForestRegressor(n_estimators=100,
                               max_depth=None,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0,
                               max_features=1.0,
                               max_leaf_nodes=None,
                               min_impurity_decrease=0.0,
                               bootstrap=True,
                               oob_score=False,
                               n_jobs=-1,        # use all available cores
                               random_state=42,
                               verbose=0,
                               warm_start=False)

In [42]:
# Out-of-sample rank correlation of the baseline regressor, one value per CV split.
cv_score = cross_val_score(rf_reg, X, y,
                           scoring=ic,
                           cv=cv,
                           n_jobs=-1,
                           verbose=1)

In [43]:
# Average score across all CV splits.
cv_score.mean()

### Parameter Tuning

#### Defining Parameter Grid

In [44]:
# Hyper-parameter values to try; the grid search evaluates every combination
# (3 x 3 x 3 = 27 candidates).
param_grid = dict(n_estimators=[50, 100, 250],
                  max_depth=[5, 15, None],
                  min_samples_leaf=[5, 25, 100])

In [45]:
# Exhaustive search over `param_grid` for the classifier, scored by ROC AUC.
# refit=True retrains the best configuration on the full data afterwards.
gridsearch_clf = GridSearchCV(rf_clf, param_grid,
                              scoring='roc_auc',
                              n_jobs=-1,
                              cv=cv,
                              refit=True,
                              return_train_score=True,
                              verbose=1)

#### Fitting Classifier

In [46]:
# Run the classifier grid search against the binary label.
gridsearch_clf.fit(X, y_binary)

#### Persisting Result

In [47]:
# Persist the fitted classifier search so it does not have to be re-run.
joblib.dump(value=gridsearch_clf, filename=results_path / 'gridsearch_clf.joblib')

In [48]:
# Restore the persisted classifier search (joblib files are pickles -
# only load files you produced yourself).
gridsearch_clf = joblib.load(filename=results_path / 'gridsearch_clf.joblib')

In [49]:
# Parameter combination that achieved the best mean cross-validated ROC AUC.
gridsearch_clf.best_params_

In [50]:
# Mean cross-validated ROC AUC of the best parameter combination.
gridsearch_clf.best_score_

#### Feature Importance

In [51]:
fig, ax = plt.subplots(figsize=(12, 5))

# Importance of every feature in the refitted best classifier.
clf_importance = pd.Series(data=gridsearch_clf.best_estimator_.feature_importances_,
                           index=X.columns)

# Keep the 20 most important features, then order them ascending so the
# largest bar ends up at the top of the horizontal bar chart.
top_20 = clf_importance.sort_values(ascending=False).head(20)
top_20.sort_values().plot.barh(ax=ax, title='RF Feature Importance')

sns.despine()
fig.tight_layout()
plt.show()

#### Fitting Regressor

In [52]:
# Exhaustive search over `param_grid` for the regressor, scored by the
# rank-correlation scorer. refit=True retrains the best configuration on
# the full data afterwards.
gridsearch_reg = GridSearchCV(rf_reg, param_grid,
                              scoring=ic,
                              n_jobs=-1,
                              cv=cv,
                              refit=True,
                              return_train_score=True,
                              verbose=1)

In [53]:
# Short alias bound to the same GridSearchCV object (no copy is made).
# NOTE(review): not used in the cells visible here - confirm it is needed later.
gs_reg = gridsearch_reg

In [54]:
# Run the regressor grid search against the continuous target.
gridsearch_reg.fit(X, y)

In [55]:
# Persist the fitted regressor search so it does not have to be re-run.
joblib.dump(value=gridsearch_reg, filename=results_path / 'rf_reg_gridsearch.joblib')

In [56]:
# Restore the persisted regressor search (joblib files are pickles -
# only load files you produced yourself).
gridsearch_reg = joblib.load(filename=results_path / 'rf_reg_gridsearch.joblib')

In [57]:
# Parameter combination that achieved the best mean cross-validated rank correlation.
gridsearch_reg.best_params_

In [58]:
# Best mean cross-validated rank correlation, scaled by 100 and shown with two decimals.
format(gridsearch_reg.best_score_ * 100, '.2f')

### Comparing Results

#### Best Parameters

In [59]:
# Winning hyper-parameters of both searches side by side
# (rows: parameter names, columns: model type).
best_params_by_model = {'Regression': gridsearch_reg.best_params_,
                        'Classification': gridsearch_clf.best_params_}
pd.DataFrame({model: pd.Series(params)
              for model, params in best_params_by_model.items()})

In [60]:
# Feature importances of the refitted best classifier and best regressor.
fi_clf, fi_reg = (search.best_estimator_.feature_importances_
                  for search in (gridsearch_clf, gridsearch_reg))

In [61]:
# Display labels for the features: upper-case, underscores shown as spaces.
idx = [col.upper().replace('_', ' ') for col in X.columns]

In [62]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))

# (target axis, importances, panel title): regression on the left,
# classifier on the right.
importance_panels = [(axes[1], fi_clf, 'Classifier'),
                     (axes[0], fi_reg, 'Regression')]

for panel_ax, importances, panel_title in importance_panels:
    # Top 15 features, re-sorted ascending so the largest bar is drawn on top.
    top_15 = (pd.Series(importances, index=idx)
              .sort_values(ascending=False)
              .head(15))
    top_15.sort_values().plot.barh(ax=panel_ax, title=panel_title)

sns.despine()
fig.tight_layout()
plt.show()