# Random Forest Hyperparameter Tuning 

This shows some simple code of how to plot n_estimators to F1 score.

In [1]:
%matplotlib notebook 

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import psycopg2

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score

In [2]:
conn = psycopg2.connect(database='usaspending', user='team', password='ZAQ!@#zaq123', host='127.0.0.1', port='5432')

In [5]:
sql_cols = ('federal_action_obligation, '
            #'total_dollars_obligated, '
            'base_and_exercised_options_value, '
            'base_and_all_options_value, '
            'awarding_sub_agency_name, '
            'awarding_office_name, '
            'funding_sub_agency_name, '
            #'funding_office_name, '
            'primary_place_of_performance_state_code, '
            'award_or_idv_flag, '
            'award_type, '
            'type_of_contract_pricing, '
            'dod_claimant_program_description, '
            'type_of_set_aside_code, '
            #'multi_year_contract, '
            #'dod_acquisition_program_description, '
            #'subcontracting_plan, ' 
            'contract_bundling, '
            #'evaluated_preference, ' 
            'national_interest_action, '
            #'cost_or_pricing_data, ' 
            'gfe_gfp, '
            'contract_financing, '
            'portfolio_group, '
            'product_or_service_code_description, '
            #'naics_bucket_title, '
            'naics_description'
            )

sql_tbl_name = 'consolidated_data2'

df = pd.read_sql_query('SELECT ' + sql_cols + ' FROM ' + sql_tbl_name, con=conn)
df.isna().sum()

federal_action_obligation                     0
base_and_exercised_options_value              0
base_and_all_options_value                    0
awarding_sub_agency_name                      0
awarding_office_name                          0
funding_sub_agency_name                       0
primary_place_of_performance_state_code       0
award_or_idv_flag                             0
award_type                                    0
type_of_contract_pricing                      0
dod_claimant_program_description              3
type_of_set_aside_code                     1803
contract_bundling                             0
national_interest_action                      0
gfe_gfp                                       0
contract_financing                            3
portfolio_group                               0
product_or_service_code_description           0
naics_description                           470
dtype: int64

In [6]:
df = df[pd.notnull(df['type_of_set_aside_code'])]
df = df.dropna()

def contract_value(c):
    if c['base_and_exercised_options_value'] > 0:
        return c['base_and_exercised_options_value']
    elif c['base_and_all_options_value'] > 0:
        return c['base_and_all_options_value']
    # elif c['total_dollars_obligated'] > 0: # Total Dollars Obligated has too many NaN values
        # return c['total_dollars_obligated'] # Total Dollars Obligated has too many NaN values
    elif c['federal_action_obligation'] > 0:
        return c['federal_action_obligation'] 
    else:
        return 0
    
df['contract_value'] = df.apply(contract_value, axis=1)

del df['base_and_exercised_options_value']
del df['base_and_all_options_value']
# del df['total_dollars_obligated']
del df['federal_action_obligation']

df = df.dropna()
# non_dummy_cols = ['type_of_set_aside_code']
# dummy_cols = list(set(df_data.columns) - set(non_dummy_cols))

X = df.drop(["type_of_set_aside_code"],axis=1)
y = df["type_of_set_aside_code"]

X = pd.get_dummies(X)
X.shape

(270531, 3394)

In [7]:
# Split the data into test and training data sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# N-Estimators Visualizer
The cell below tests a number of estimators (decision trees) and returns a graph used to determine the appropriate number of trees to use. Each decision tree makes a guess as to what the target is; the purposes of running multiple decision trees is that a Random Forest model will the average resulting in a more accurate prediction. While more decision trees will always result in better predictions, the benefit of additional trees levels out relatively quickly with the trade off of performance. 

In [None]:
%%time 

def n_estimators_tuning(X_train, y_train, min_estimators=1, max_estimators=50, cv=5, ax=None, save=None):
    
    if ax is None:
        _, ax = plt.subplots()
    
    means = []
    stds = []
    n_estimators = np.arange(min_estimators, max_estimators+1)

    for n in n_estimators:
        model = RandomForestClassifier(n_estimators=n)
        scores = cross_val_score(model, X_train, y_train, cv=cv)
        means.append(scores.mean())
        stds.append(scores.std())
    
    means = np.array(means)
    stds = np.array(stds)
    
    ax.plot(n_estimators, means, label="CV={} scores".format(cv))
    ax.fill_between(n_estimators, means-stds, means+stds, alpha=0.3)
    
    max_score = means.max()
    max_score_idx = np.where(means==max_score)[0]
    ax.axhline(max_score, ls="--", lw=1, c='r')
    ax.axvline(n_estimators[max_score_idx], ls="--", lw=1, c='r', label="Max Score = {:0.2f}".format(max_score))
    
    
    ax.set_xlim(min_estimators, max_estimators)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("F1 Score")
    ax.set_title("Random Forest Hyperparameter Tuning")
    ax.legend(loc='best')
    
    if save:
        plt.savefig(save)
    
    return ax, scores
    

#ax1, score_result = n_estimators_tuning(X_train, y_train)

In [8]:
# Below we create the model with model.fit() 

from sklearn.metrics import classification_report
model = RandomForestClassifier(n_estimators=17)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=17,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [9]:
# And here we return the accuracy. The bottom two lines show what the model is predicting and what the true values are. 

print('Model Accuracy: {:.2%}'.format(model.score(X_test, y_test)))
#print(model.predict(X_test[50:75]))
#print(y_test[50:75])

Model Accuracy: 86.01%


In [22]:
# Feature Selection

feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
contract_value,0.137832
dod_claimant_program_description_CONSTRUCTION,0.016067
naics_description_COMMERCIAL AND INSTITUTIONAL BUILDING CONSTRUCTION,0.015518
award_type_DO,0.012372
award_type_DELIVERY ORDER,0.012135
...,...
product_or_service_code_description_REPAIR OR ALTERATION OF EPG FACILITIES - SOLAR,0.000000
awarding_office_name_NROTCU ILLINOIS INST OF TECHNOLOGY,0.000000
naics_description_OTHER NONFERROUS METAL FOUNDRIES (EXCEPT DIE-CASTING),0.000000
product_or_service_code_description_REPAIR OR ALTERATION OF MINE FIRE CONTROL FACILITIES,0.000000


In [None]:
# Next I am testing the accuracy of the model on each specific set aside. Because we have an unbalanced data set
# it seems that the model is great for predicting set asides in general, however it is also skewed to better
# predict certain categories compared to others.

# Create a dictionary object to capture set aside code and it's score
class scores(dict):  
  
    # __init__ function  
    def __init__(self):  
        self = dict()  
          
    # Function to add key:value  
    def add(self, key, value):  
        self[key] = value  

scores = scores()
percent = ''
set_aside_codes = data['type_of_set_aside_code'].unique()

# Loop through each set aside, test it, and append to the dictionary
for set_aside in set_aside_codes:
    dataPoint = data.loc[data['type_of_set_aside_code'] == set_aside]
    XPoint = dataPoint.drop(["type_of_set_aside_code"],axis=1)
    yPoint = dataPoint["type_of_set_aside_code"]
    percent = model.score(XPoint, yPoint)
    percent = round(percent, 4)
    scores.add(set_aside, percent)

In [None]:
# Sort the dictionary by score
import operator
sortedScores = sorted(scores.items(), key=operator.itemgetter(1))

In [None]:
# Print scores
for score in reversed(sortedScores):
    print("{:<8} {:.2%}".format(score[0], score[1]))

In [None]:
print(classification_report(y_test, model.predict(X_test), target_names=set_aside_codes))