# Random Forest Hyperparameter Tuning 

This notebook contains simple code showing how to plot the F1 score against `n_estimators`.

In [1]:
%matplotlib notebook 

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score

In [2]:
# Read the set-aside dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; it will need
# adjusting when the notebook is run elsewhere.
DATA_CSV = r'/home/team/Documents/Data-Oriented-Proposal-Engine/SpendingData/DummiesAllSetAsides.csv'
data = pd.read_csv(DATA_CSV)
# Uncomment to preview the first rows:
# data.head()

In [3]:
# Target: the set-aside code column. Features: every remaining column.
y = data["type_of_set_aside_code"]
X = data.drop(columns=["type_of_set_aside_code"])

In [4]:
# Hold out half of the rows for evaluation; the fixed random_state keeps the
# train/test partition the same on every run.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Build a 17-tree random forest and fit it on the training split.
# random_state is fixed so that the fitted forest -- and every score computed
# from it further down -- is reproducible across kernel restarts. Without a
# seed the forest's randomised training gives different results on each run.

from sklearn.metrics import classification_report
model = RandomForestClassifier(n_estimators=17, random_state=42)
model.fit(X_train, y_train)

In [107]:
# Report the fitted model's mean accuracy on the held-out test split.
test_accuracy = model.score(X_test, y_test)
print('Model Accuracy: {:.2%}'.format(test_accuracy))

# Optional spot check: uncomment to compare a slice of predictions with the true labels.
# print(model.predict(X_test[50:75]))
# print(y_test[50:75])

Model Accuracy: 83.55%


In [None]:
# Next, I test the accuracy of the model on each specific set-aside code. Because the data set is
# unbalanced, the model appears to predict set-asides well in general, but it is also skewed toward
# predicting certain categories better than others.

# Create a dictionary object to capture each set-aside code and its score
class scores(dict):
    """Dictionary that maps a set-aside code to its score.

    Behaves exactly like the built-in ``dict``; ``add`` is a small
    convenience wrapper around item assignment.
    """

    def __init__(self, *args, **kwargs):
        """Initialise like ``dict``.

        With no arguments the mapping starts empty. Any positional or keyword
        arguments are forwarded to ``dict`` so the container can also be
        seeded from an existing mapping or iterable of pairs.
        """
        # Delegate to dict's own initialiser. Rebinding the local name `self`
        # to a fresh dict would have no effect on the instance being built.
        super().__init__(*args, **kwargs)

    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any existing entry."""
        self[key] = value

# Instantiate the score container. Note that this rebinds the name `scores`
# from the class to the instance.
scores = scores()
percent = ''
set_aside_codes = data['type_of_set_aside_code'].unique()

# Score the fitted model separately on the rows belonging to each set-aside
# code and record the result (rounded to 4 decimal places) under that code.
# NOTE(review): the rows are taken from the full `data` frame, which as written
# also contains the rows the model was trained on -- consider scoring on the
# held-out test split only; confirm this is intended.
for set_aside in set_aside_codes:
    dataPoint = data[data['type_of_set_aside_code'] == set_aside]
    yPoint = dataPoint["type_of_set_aside_code"]
    XPoint = dataPoint.drop(columns=["type_of_set_aside_code"])
    percent = round(model.score(XPoint, yPoint), 4)
    scores[set_aside] = percent

In [117]:
# Order the (set-aside code, score) pairs from lowest to highest score
import operator
sortedScores = list(scores.items())
sortedScores.sort(key=operator.itemgetter(1))

In [119]:
# Print one line per set-aside code, highest score first
for score in sortedScores[::-1]:
    print("{:<8} {:.2%}".format(*score[:2]))

NONE     96.64%
SBA      90.22%
SDVOSBC  82.29%
HZC      81.48%
SBP      80.95%
8A       78.31%
WOSB     75.62%
8AN      74.68%
SDVOSBS  71.43%
HZS      50.00%
WOSBSS   33.33%
EDWOSB   14.29%
