# Random Forest Hyperparameter Tuning 

This notebook shows a simple way to plot a Random Forest's cross-validated score (e.g. F1) against `n_estimators`.

In [1]:
%matplotlib notebook 

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score

In [2]:
# Machine-specific absolute path to the input CSV (presumably the dummy-encoded
# spending data, judging by the file name -- TODO confirm).
dummies_csv_path = r'/home/team/Documents/Data-Oriented-Proposal-Engine/SpendingData/Dummies.csv'
data = pd.read_csv(dummies_csv_path)

In [3]:
# Separate the prediction target from the feature columns.
target_column = "set_aside"
y = data[target_column]
X = data.drop(columns=[target_column])

In [4]:
# Hold out half of the rows as a test set; the fixed seed keeps the split
# identical from run to run.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# N-Estimators Visualizer
The cell below tests a range of estimator counts (decision trees) and returns a graph used to determine an appropriate number of trees to use. Each decision tree makes its own guess as to what the target is; the purpose of running multiple decision trees is that a Random Forest model combines (averages) their guesses, resulting in a more accurate prediction. While adding decision trees generally improves predictions, the benefit of additional trees levels out relatively quickly, and every extra tree costs more training and prediction time.

In [5]:
%%time 

def n_estimators_tuning(X_train, y_train, min_estimators=1, max_estimators=50, cv=5, ax=None, save=None,
                        scoring=None, random_state=None):
    """Plot the cross-validated score of a random forest against ``n_estimators``.

    For every tree count in ``[min_estimators, max_estimators]`` a fresh
    ``RandomForestClassifier`` is scored with ``cross_val_score``. The mean
    score is drawn as a line with a +/- one standard deviation band, and the
    best mean score is marked with dashed red guide lines.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed unchanged to ``cross_val_score``.
    min_estimators, max_estimators : int
        Inclusive range of tree counts to evaluate.
    cv
        Forwarded to ``cross_val_score``.
    ax : matplotlib Axes, optional
        Axes to draw on. A new figure and axes are created when omitted.
    save : optional
        When truthy, the figure owning ``ax`` is written with ``savefig(save)``.
    scoring : optional
        Forwarded to ``cross_val_score``. ``None`` keeps the estimator's default
        scorer (accuracy for classifiers); pass e.g. ``"f1_weighted"`` for F1.
        The y-axis label names the metric that was actually used.
    random_state : optional
        Forwarded to every ``RandomForestClassifier`` so the curve can be
        reproduced. ``None`` leaves the forests unseeded.

    Returns
    -------
    (ax, scores)
        The axes drawn on, and the per-fold scores of the LAST model evaluated
        (``n_estimators == max_estimators``), not of the best one.

    Raises
    ------
    ValueError
        If the requested range of tree counts is empty or starts below 1.
    """
    if min_estimators < 1:
        raise ValueError("min_estimators must be at least 1, got {}".format(min_estimators))
    if max_estimators < min_estimators:
        raise ValueError(
            "max_estimators ({}) must be >= min_estimators ({})".format(max_estimators, min_estimators)
        )

    if ax is None:
        _, ax = plt.subplots()

    means = []
    stds = []
    n_estimators = np.arange(min_estimators, max_estimators + 1)

    for n in n_estimators:
        model = RandomForestClassifier(n_estimators=n, random_state=random_state)
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
        means.append(scores.mean())
        stds.append(scores.std())

    means = np.array(means)
    stds = np.array(stds)

    ax.plot(n_estimators, means, label="CV={} scores".format(cv))
    ax.fill_between(n_estimators, means - stds, means + stds, alpha=0.3)

    # argmax yields one scalar index even when several tree counts tie for the
    # best mean; axvline expects a scalar x position, not an array of them.
    best_idx = int(np.argmax(means))
    max_score = means[best_idx]
    ax.axhline(max_score, ls="--", lw=1, c='r')
    ax.axvline(n_estimators[best_idx], ls="--", lw=1, c='r', label="Max Score = {:0.2f}".format(max_score))

    # Label the axis with the metric that was really computed instead of a
    # fixed "F1 Score", which was wrong for the default (accuracy) scorer.
    if scoring is None:
        metric_label = "Accuracy"
    elif isinstance(scoring, str):
        metric_label = scoring
    else:
        metric_label = "Score"

    ax.set_xlim(min_estimators, max_estimators)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel(metric_label)
    ax.set_title("Random Forest Hyperparameter Tuning")
    ax.legend(loc='best')

    if save:
        # Save the figure that owns `ax`; plt.savefig would write whichever
        # figure is current, which may differ when the caller supplied `ax`.
        ax.figure.savefig(save)

    return ax, scores
    

# ax1, score_result = n_estimators_tuning(X_train, y_train)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs


In [6]:
from sklearn.metrics import classification_report

# Fit the forest used for the evaluation cells below.
# n_estimators=17 is presumably the value read off the tuning plot -- TODO confirm.
# random_state matches the seed used for the train/test split so that the
# reported test metrics can be reproduced on a fresh kernel.
model = RandomForestClassifier(n_estimators=17, random_state=42)
model.fit(X_train, y_train)


In [None]:
# Compute the ROC AUC of the fitted model on the held-out test set.
# Scoring against a single probability column only works for a binary target,
# so a multiclass target is scored one-vs-rest on the full probability matrix.

from sklearn.metrics import roc_auc_score
# Actual class predictions
rf_predictions = model.predict(X_test)
# Probabilities for each class; columns follow the order of model.classes_
rf_class_probs = model.predict_proba(X_test)
if len(model.classes_) == 2:
    # Binary target: use the probability of the second (positive) class
    rf_probs = rf_class_probs[:, 1]
    roc_value = roc_auc_score(y_test, rf_probs)
else:
    # Multiclass target: one-vs-rest AUC averaged over the classes
    rf_probs = rf_class_probs
    roc_value = roc_auc_score(y_test, rf_probs, multi_class="ovr", labels=model.classes_)
print(roc_value)

In [11]:
# Report the mean accuracy of the fitted model on the held-out test set.
# The two commented-out lines print a slice of predictions next to the true labels.

test_accuracy = model.score(X_test, y_test)
print('Model Accuracy: {:.2%}'.format(test_accuracy))
# print(model.predict(X_test[50:75]))
# print(y_test[50:75])

Model Accuracy: 91.31%
