# Random Forest Hyperparameter Tuning 

This shows some simple code of how to plot n_estimators to F1 score.

In [1]:
%matplotlib notebook 

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import psycopg2
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score

In [2]:
conn = psycopg2.connect(database='usaspending', user='team', password='ZAQ!@#zaq123', host='127.0.0.1', port='5432')

In [3]:
sql_cols = ('federal_action_obligation, '
            #'total_dollars_obligated, '
            'base_and_exercised_options_value, '
            'base_and_all_options_value, '
            'awarding_sub_agency_name, '
            'awarding_office_name, '
            'funding_sub_agency_name, '
            #'funding_office_name, '  too many NaN
            'primary_place_of_performance_state_code, '
            'award_or_idv_flag, '
            'award_type, '
            'type_of_contract_pricing, '
            'dod_claimant_program_description, '
            'type_of_set_aside_code, '
            #'multi_year_contract, ' too many NaN
            #'dod_acquisition_program_description, ' too many NaN
            #'subcontracting_plan, ' too many NaN
            'contract_bundling, '
            #'evaluated_preference, ' too many NaN
            'national_interest_action, '
            #'cost_or_pricing_data, ' too many NaN
            'gfe_gfp, '
            'contract_financing, '
            'portfolio_group, '
            'product_or_service_code_description, '
            #'naics_bucket_title, ' too many NaN
            'naics_description'
            )

sql_tbl_name = 'consolidated_data2'

df = pd.read_sql_query('SELECT ' + sql_cols + ' FROM ' + sql_tbl_name, con=conn)
df.isna().sum()

federal_action_obligation                     0
base_and_exercised_options_value              0
base_and_all_options_value                    0
awarding_sub_agency_name                      0
awarding_office_name                          0
funding_sub_agency_name                       0
primary_place_of_performance_state_code       0
award_or_idv_flag                             0
award_type                                    0
type_of_contract_pricing                      0
dod_claimant_program_description              3
type_of_set_aside_code                     1803
contract_bundling                             0
national_interest_action                      0
gfe_gfp                                       0
contract_financing                            3
portfolio_group                               0
product_or_service_code_description           0
naics_description                           470
dtype: int64

In [4]:
df = df[pd.notnull(df['type_of_set_aside_code'])]
df = df.dropna()

def contract_value(c):
    if c['base_and_exercised_options_value'] > 0:
        return c['base_and_exercised_options_value']
    elif c['base_and_all_options_value'] > 0:
        return c['base_and_all_options_value']
    # elif c['total_dollars_obligated'] > 0: # Total Dollars Obligated has too many NaN values
        # return c['total_dollars_obligated'] # Total Dollars Obligated has too many NaN values
    elif c['federal_action_obligation'] > 0:
        return c['federal_action_obligation'] 
    else:
        return 0
    
df['contract_value'] = df.apply(contract_value, axis=1)

del df['base_and_exercised_options_value']
del df['base_and_all_options_value']
# del df['total_dollars_obligated']
del df['federal_action_obligation']
df = df.dropna()

X = df.drop(["type_of_set_aside_code"],axis=1)
y = df["type_of_set_aside_code"]

X = pd.get_dummies(X)
X.shape

(270531, 3394)

In [None]:
def set_aside(c):
    
    if c['type_of_set_aside_code'] == 'NONE':
        return 0
    else:
        return 1
    
df_corr = df
df_corr['set_aside'] = df_corr.apply(set_aside, axis=1)
del df_corr['type_of_set_aside_code']

non_dummy_cols = ['set_aside', 'contract_value']
dummy_cols = list(set(df.columns) - set(non_dummy_cols))
df_corr = pd.get_dummies(df, columns=dummy_cols)

In [None]:
df_corr.head()

In [None]:
df_corr.corr()['set_aside'][:]
# Correlation with output variable
cor_target = abs(df_corr.corr()['set_aside'][:])

In [None]:
# Selecting highly correlated features
relevant_features = cor_target[cor_target>0.05]
relevant_features

In [None]:
print(relevant_features)

In [None]:
#Drop columns that we dont need anymore.
X1 = X[relevant_features]

In [5]:
# Split the data into test and training data sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# N-Estimators Visualizer
The cell below tests a number of estimators (decision trees) and returns a graph used to determine the appropriate number of trees to use. Each decision tree makes a guess as to what the target is; the purposes of running multiple decision trees is that a Random Forest model will the average resulting in a more accurate prediction. While more decision trees will always result in better predictions, the benefit of additional trees levels out relatively quickly with the trade off of performance. 

In [None]:
%%time 

def n_estimators_tuning(X_train, y_train, min_estimators=1, max_estimators=50, cv=5, ax=None, save=None):
    
    if ax is None:
        _, ax = plt.subplots()
    
    means = []
    stds = []
    n_estimators = np.arange(min_estimators, max_estimators+1)

    for n in n_estimators:
        model = RandomForestClassifier(n_estimators=n)
        scores = cross_val_score(model, X_train, y_train, cv=cv)
        means.append(scores.mean())
        stds.append(scores.std())
    
    means = np.array(means)
    stds = np.array(stds)
    
    ax.plot(n_estimators, means, label="CV={} scores".format(cv))
    ax.fill_between(n_estimators, means-stds, means+stds, alpha=0.3)
    
    max_score = means.max()
    max_score_idx = np.where(means==max_score)[0]
    ax.axhline(max_score, ls="--", lw=1, c='r')
    ax.axvline(n_estimators[max_score_idx], ls="--", lw=1, c='r', label="Max Score = {:0.2f}".format(max_score))
    
    
    ax.set_xlim(min_estimators, max_estimators)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("F1 Score")
    ax.set_title("Random Forest Hyperparameter Tuning")
    ax.legend(loc='best')
    
    if save:
        plt.savefig(save)
    
    return ax, scores
    

#ax1, score_result = n_estimators_tuning(X_train, y_train)

In [6]:
# Below we create the model with model.fit() 

from sklearn.metrics import classification_report
model = RandomForestClassifier(n_estimators=17)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=17,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [7]:
# And here we return the accuracy. The bottom two lines show what the model is predicting and what the true values are. 

print('Model Accuracy: {:.2%}'.format(model.score(X_test, y_test)))
#print(model.predict(X_test[50:75]))
#print(y_test[50:75])

Model Accuracy: 85.85%


In [51]:
# Feature Selection

feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
contract_value,0.137628
dod_claimant_program_description_CONSTRUCTION,0.014926
portfolio_group_Electronic & Communication Services,0.014391
award_type_DELIVERY ORDER,0.012498
award_type_DO,0.011440
...,...
product_or_service_code_description_R&D- SPACE: COMMERCIAL PROGRAMS (BASIC RESEARCH),0.000000
product_or_service_code_description_R&D- SPACE: OTHER (ENGINEERING DEVELOPMENT),0.000000
naics_description_OUTDOOR POWER EQUIPMENT STORES,0.000000
product_or_service_code_description_R&D- SPACE: SCIENCE/APPLICATIONS (MANAGEMENT/SUPPORT),0.000000


In [55]:
feature_importances['cumpercentage'] = feature_importances['importance'].cumsum()/feature_importances['importance'].sum()*100
relevant_features = feature_importances[feature_importances.cumpercentage < 80]
relevant_features

Unnamed: 0,importance,cumpercentage
contract_value,0.137628,13.762796
dod_claimant_program_description_CONSTRUCTION,0.014926,15.255356
portfolio_group_Electronic & Communication Services,0.014391,16.694476
award_type_DELIVERY ORDER,0.012498,17.944322
award_type_DO,0.011440,19.088278
...,...,...
product_or_service_code_description_R&D- DEFENSE SYSTEM: ELECTRONICS/COMMUNICATION EQUIPMENT (BASIC RESEARCH),0.000542,79.732143
product_or_service_code_description_ARCHITECT AND ENGINEERING- CONSTRUCTION: OFFICE BUILDINGS,0.000541,79.786278
contract_financing_PERFORMANCE-BASED FINANCING,0.000539,79.840208
product_or_service_code_description_CONSTRUCTION OF OFFICE BUILDINGS,0.000538,79.893960


In [56]:
list_relevant_features = list(relevant_features.index)
print(list_relevant_features)

['contract_value', 'dod_claimant_program_description_CONSTRUCTION', 'portfolio_group_Electronic & Communication Services', 'award_type_DELIVERY ORDER', 'award_type_DO', 'naics_description_COMMERCIAL AND INSTITUTIONAL BUILDING CONSTRUCTION', 'dod_claimant_program_description_SERVICES', 'primary_place_of_performance_state_code_CA', 'primary_place_of_performance_state_code_VA', 'portfolio_group_Facility Related Services', 'award_type_PURCHASE ORDER', 'awarding_sub_agency_name_DEPT OF THE ARMY', 'award_type_PO', 'primary_place_of_performance_state_code_WA', 'primary_place_of_performance_state_code_FL', 'naics_description_ENGINEERING SERVICES', 'product_or_service_code_description_REPAIR OR ALTERATION OF MISCELLANEOUS BUILDINGS', 'contract_financing_NOT APPLICABLE', 'primary_place_of_performance_state_code_MD', 'award_type_DEFINITIVE CONTRACT', 'type_of_contract_pricing_FIRM FIXED PRICE', 'primary_place_of_performance_state_code_TX', 'funding_sub_agency_name_DEPT OF THE ARMY', 'awarding_sub

In [58]:
X1 = X[list_relevant_features]
X1.head()

Unnamed: 0,contract_value,dod_claimant_program_description_CONSTRUCTION,portfolio_group_Electronic & Communication Services,award_type_DELIVERY ORDER,award_type_DO,naics_description_COMMERCIAL AND INSTITUTIONAL BUILDING CONSTRUCTION,dod_claimant_program_description_SERVICES,primary_place_of_performance_state_code_CA,primary_place_of_performance_state_code_VA,portfolio_group_Facility Related Services,...,product_or_service_code_description_R&D- DEFENSE SYSTEM: MISSILE/SPACE SYSTEMS (APPLIED RESEARCH/EXPLORATORY DEVELOPMENT),product_or_service_code_description_OPERATION OF RECREATION FACILITIES (NON-BUILDING),product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- SHIP AND MARINE EQUIPMENT,awarding_office_name_W6QM MICC-FT RUCKER,awarding_sub_agency_name_U.S. SPECIAL OPERATIONS COMMAND (USSOCOM),product_or_service_code_description_R&D- DEFENSE SYSTEM: ELECTRONICS/COMMUNICATION EQUIPMENT (BASIC RESEARCH),product_or_service_code_description_ARCHITECT AND ENGINEERING- CONSTRUCTION: OFFICE BUILDINGS,contract_financing_PERFORMANCE-BASED FINANCING,product_or_service_code_description_CONSTRUCTION OF OFFICE BUILDINGS,awarding_office_name_US ARMY ENGINEER DISTRICT HUNTINGTO
0,10000.0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,998435.39,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,124345.9,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,129999.4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,200.0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [59]:
# Re-reun model with relevant features

# Drop non-relevant columns that we dont need anymore


# Split the data into test and training data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.5, random_state=42)

# Below we create the model with model.fit() 

from sklearn.metrics import classification_report
model = RandomForestClassifier(n_estimators=17)
model.fit(X_train, y_train)

# And here we return the accuracy. The bottom two lines show what the model is predicting and what the true values are. 

print('Model Accuracy: {:.2%}'.format(model.score(X_test, y_test)))
#print(model.predict(X_test[50:75]))
#print(y_test[50:75])

Model Accuracy: 84.31%


In [61]:
df_new = X1
df_new['type_of_set_aside_code'] = y
df_new.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(270531, 324)

In [67]:
from sqlalchemy import create_engine

db_name, ip, user_name, password, table_name = 'usaspending', 'dopelytics.site:5432','team', 'ZAQ!@#zaq123', 'featured_dummy'
print('Connecting to Postgresql...\n')
engine = create_engine('postgresql+psycopg2://{}:{}@{}/{}'.format(user_name,password,ip,db_name)) #create connection to db
print('Successfully Connected to Postgres\n')

Connecting to Postgresql...

Successfully Connected to Postgres



In [68]:
#Create Pandas DataFrame to open our csv file.
print('Creating Data Frame...\n')

#Define function to connect db , create table and populate csv file values to Postgres Database.
def upload_DF_to_postgres(df_to_upload=df_new, table_name=table_name, user_name=user_name, password=password, db_name=db_name):
    df_to_upload.to_sql(table_name, engine, if_exists='append')

upload_DF_to_postgres()
print('Data Frame Successfully Created\n')  
print('CSV file Successfully Uploaded to Postgres')

Creating Data Frame...



ProgrammingError: (psycopg2.errors.DuplicateColumn) column "product_or_service_code_description_ARCHITECT AND ENGINEERING- " specified more than once

[SQL: 
CREATE TABLE featured_dummy (
	index BIGINT, 
	contract_value FLOAT(53), 
	"dod_claimant_program_description_CONSTRUCTION" BIGINT, 
	"portfolio_group_Electronic & Communication Services" BIGINT, 
	"award_type_DELIVERY ORDER" BIGINT, 
	"award_type_DO" BIGINT, 
	"naics_description_COMMERCIAL AND INSTITUTIONAL BUILDING CONSTRUCTION" BIGINT, 
	"dod_claimant_program_description_SERVICES" BIGINT, 
	"primary_place_of_performance_state_code_CA" BIGINT, 
	"primary_place_of_performance_state_code_VA" BIGINT, 
	"portfolio_group_Facility Related Services" BIGINT, 
	"award_type_PURCHASE ORDER" BIGINT, 
	"awarding_sub_agency_name_DEPT OF THE ARMY" BIGINT, 
	"award_type_PO" BIGINT, 
	"primary_place_of_performance_state_code_WA" BIGINT, 
	"primary_place_of_performance_state_code_FL" BIGINT, 
	"naics_description_ENGINEERING SERVICES" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF MISCELLANEOUS BUILDINGS" BIGINT, 
	"contract_financing_NOT APPLICABLE" BIGINT, 
	"primary_place_of_performance_state_code_MD" BIGINT, 
	"award_type_DEFINITIVE CONTRACT" BIGINT, 
	"type_of_contract_pricing_FIRM FIXED PRICE" BIGINT, 
	"primary_place_of_performance_state_code_TX" BIGINT, 
	"funding_sub_agency_name_DEPT OF THE ARMY" BIGINT, 
	"awarding_sub_agency_name_DEFENSE INFORMATION SYSTEMS AGENCY (DISA)" BIGINT, 
	"naics_description_FACILITIES SUPPORT SERVICES" BIGINT, 
	"funding_sub_agency_name_DEPT OF THE AIR FORCE" BIGINT, 
	"primary_place_of_performance_state_code_GA" BIGINT, 
	"type_of_contract_pricing_COST PLUS FIXED FEE" BIGINT, 
	"naics_description_GENERAL MEDICAL AND SURGICAL HOSPITALS" BIGINT, 
	"primary_place_of_performance_state_code_NC" BIGINT, 
	"dod_claimant_program_description_ALL OTHERS NOT IDENTIFIABLE TO ANY OTHER PROCUREMENT PROGRAM" BIGINT, 
	"awarding_sub_agency_name_DEPT OF THE NAVY" BIGINT, 
	"award_type_DCA" BIGINT, 
	"awarding_sub_agency_name_DEPT OF THE AIR FORCE" BIGINT, 
	"funding_sub_agency_name_DEPT OF THE NAVY" BIGINT, 
	"primary_place_of_performance_state_code_PA" BIGINT, 
	"contract_financing_FAR 52.232-16 PROGRESS PAYMENTS" BIGINT, 
	"portfolio_group_Knowledge Based Services" BIGINT, 
	"gfe_gfp_TRANSACTION DOES NOT USE GFE/GFP" BIGINT, 
	"naics_description_RESEARCH AND DEVELOPMENT IN THE PHYSICAL, ENGINEERING, AND LIFE SCIENCES (EXCEPT BIOTECHNOLOGY)" BIGINT, 
	"primary_place_of_performance_state_code_AL" BIGINT, 
	"funding_sub_agency_name_U.S. ARMY CORPS OF ENGINEERS - CIVIL PROGRAM FINANCING ONLY" BIGINT, 
	"primary_place_of_performance_state_code_SC" BIGINT, 
	"gfe_gfp_TRANSACTION USES GFE/GFP" BIGINT, 
	"primary_place_of_performance_state_code_AK" BIGINT, 
	"naics_description_WIRED TELECOMMUNICATIONS CARRIERS" BIGINT, 
	"portfolio_group_Construction Services" BIGINT, 
	"primary_place_of_performance_state_code_KS" BIGINT, 
	"award_type_BPA CALL" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- WASTE TREATMENT/STORAGE" BIGINT, 
	"portfolio_group_Equipment Related Services" BIGINT, 
	"primary_place_of_performance_state_code_HI" BIGINT, 
	"naics_description_SEPTIC TANK AND RELATED SERVICES" BIGINT, 
	"primary_place_of_performance_state_code_IN" BIGINT, 
	"contract_financing_PERCENTAGE OF COMPLETION PROGRESS PAYMENTS" BIGINT, 
	"product_or_service_code_description_SUPPORT- PROFESSIONAL: OTHER" BIGINT, 
	"primary_place_of_performance_state_code_IL" BIGINT, 
	"awarding_office_name_NAVAL FAC ENGINEERING CMD MID LANT" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- FACILITIES OPERATIONS SUPPORT" BIGINT, 
	"primary_place_of_performance_state_code_NY" BIGINT, 
	"primary_place_of_performance_state_code_AZ" BIGINT, 
	"portfolio_group_Transportation Services" BIGINT, 
	"awarding_office_name_DITCO-SCOTT" BIGINT, 
	"naics_description_PLUMBING, HEATING, AND AIR-CONDITIONING CONTRACTORS" BIGINT, 
	"primary_place_of_performance_state_code_OH" BIGINT, 
	"naics_description_SHIP BUILDING AND REPAIRING" BIGINT, 
	"primary_place_of_performance_state_code_NJ" BIGINT, 
	"primary_place_of_performance_state_code_MS" BIGINT, 
	"primary_place_of_performance_state_code_MO" BIGINT, 
	"awarding_office_name_DLA DISPOSTION SERVICE - EBS" BIGINT, 
	"portfolio_group_Research and Development" BIGINT, 
	"award_type_BPA" BIGINT, 
	"naics_description_ALL OTHER SPECIALTY TRADE CONTRACTORS" BIGINT, 
	"primary_place_of_performance_state_code_OK" BIGINT, 
	"awarding_office_name_NAV FAC ENGINEERING CMD WASHINGTON" BIGINT, 
	"naics_description_SATELLITE TELECOMMUNICATIONS" BIGINT, 
	"primary_place_of_performance_state_code_LA" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF OFFICE BUILDINGS" BIGINT, 
	"primary_place_of_performance_state_code_TN" BIGINT, 
	"naics_description_OTHER COMPUTER RELATED SERVICES" BIGINT, 
	"awarding_office_name_NAVFAC SOUTHWEST" BIGINT, 
	"primary_place_of_performance_state_code_CO" BIGINT, 
	"naics_description_ELECTRICAL CONTRACTORS AND OTHER WIRING INSTALLATION CONTRACTORS" BIGINT, 
	"portfolio_group_Medical Services" BIGINT, 
	"naics_description_REMEDIATION SERVICES" BIGINT, 
	"naics_description_OTHER ELECTRONIC AND PRECISION EQUIPMENT REPAIR AND MAINTENANCE" BIGINT, 
	"naics_description_HAZARDOUS WASTE TREATMENT AND DISPOSAL" BIGINT, 
	"naics_description_HIGHWAY, STREET, AND BRIDGE CONSTRUCTION" BIGINT, 
	"primary_place_of_performance_state_code_DC" BIGINT, 
	"product_or_service_code_description_SUPPORT- PROFESSIONAL: ENGINEERING/TECHNICAL" BIGINT, 
	"naics_description_COMMERCIAL AND INDUSTRIAL MACHINERY AND EQUIPMENT (EXCEPT AUTOMOTIVE AND ELECTRONIC) REPAIR AND MAINTENANCE" BIGINT, 
	"awarding_office_name_SOUTHWEST REGIONAL MAINT CENTER" BIGINT, 
	"primary_place_of_performance_state_code_UT" BIGINT, 
	"primary_place_of_performance_state_code_MA" BIGINT, 
	"naics_description_LANDSCAPING SERVICES" BIGINT, 
	"primary_place_of_performance_state_code_KY" BIGINT, 
	"naics_description_INDUSTRIAL BUILDING CONSTRUCTION" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- LANDSCAPING/GROUNDSKEEPING" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF OFFICE BUILDINGS" BIGINT, 
	"naics_description_TEMPORARY HELP SERVICES" BIGINT, 
	"naics_description_ALL OTHER PROFESSIONAL, SCIENTIFIC, AND TECHNICAL SERVICES" BIGINT, 
	"naics_description_ADMINISTRATIVE MANAGEMENT AND GENERAL MANAGEMENT CONSULTING SERVICES" BIGINT, 
	"naics_description_HOTELS (EXCEPT CASINO HOTELS) AND MOTELS" BIGINT, 
	"primary_place_of_performance_state_code_CT" BIGINT, 
	"awarding_office_name_NAVFAC SOUTHEAST" BIGINT, 
	"awarding_sub_agency_name_DEFENSE LOGISTICS AGENCY" BIGINT, 
	"primary_place_of_performance_state_code_NM" BIGINT, 
	"primary_place_of_performance_state_code_GU" BIGINT, 
	"naics_description_PAINTING AND WALL COVERING CONTRACTORS" BIGINT, 
	"product_or_service_code_description_ARCHITECT AND ENGINEERING- GENERAL: OTHER" BIGINT, 
	"product_or_service_code_description_ARCHITECT AND ENGINEERING- GENERAL: LANDSCAPING, INTERIOR LAYOUT, AND DESIGNING" BIGINT, 
	"product_or_service_code_description_MEDICAL- GENERAL HEALTH CARE" BIGINT, 
	"product_or_service_code_description_ARCHITECT AND ENGINEERING- CONSTRUCTION: RESTORATION OF REAL PROPERTY (PUBLIC OR PRIVATE)" BIGINT, 
	"contract_financing_UNUSUAL PROGRESS PAYMENTS OR ADVANCE PAYMENTS" BIGINT, 
	"funding_sub_agency_name_DEFENSE LOGISTICS AGENCY" BIGINT, 
	"awarding_office_name_NAVSUP FLT LOG CTR NORFOLK" BIGINT, 
	"awarding_office_name_W6QK ACC-PICA" BIGINT, 
	"product_or_service_code_description_IT AND TELECOM- TELECOMMUNICATIONS AND TRANSMISSION" BIGINT, 
	"naics_description_COMPUTER SYSTEMS DESIGN SERVICES" BIGINT, 
	"naics_description_ENVIRONMENTAL CONSULTING SERVICES" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF OTHER NON-BUILDING FACILITIES" BIGINT, 
	"funding_sub_agency_name_DEFENSE INFORMATION SYSTEMS AGENCY (DISA)" BIGINT, 
	"naics_description_ROOFING CONTRACTORS" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF MISCELLANEOUS BUILDINGS" BIGINT, 
	"awarding_office_name_W6QM MICC-FT LEONARD WOOD" BIGINT, 
	"awarding_sub_agency_name_USTRANSCOM" BIGINT, 
	"primary_place_of_performance_state_code_MI" BIGINT, 
	"product_or_service_code_description_SUPPORT- PROFESSIONAL: PROGRAM MANAGEMENT/SUPPORT" BIGINT, 
	"naics_description_OTHER SCIENTIFIC AND TECHNICAL CONSULTING SERVICES" BIGINT, 
	"dod_claimant_program_description_ELECTRONICS AND COMMUNICATION EQUIPMENT" BIGINT, 
	"awarding_office_name_NAVAL MEDICAL LOGISTICS COMMAND" BIGINT, 
	"awarding_office_name_W6QK ACC-RSA" BIGINT, 
	"naics_description_JANITORIAL SERVICES" BIGINT, 
	"naics_description_OFFICE ADMINISTRATIVE SERVICES" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF OTHER ADMINISTRATIVE FACILITIES AND SERVICE BUILDINGS" BIGINT, 
	"product_or_service_code_description_EDUCATION/TRAINING- OTHER" BIGINT, 
	"primary_place_of_performance_state_code_WI" BIGINT, 
	"awarding_office_name_DLA DISPOSTION SERVICES -  PACIFIC" BIGINT, 
	"type_of_contract_pricing_COST NO FEE" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- MISCELLANEOUS" BIGINT, 
	"product_or_service_code_description_OTHER ENVIRONMENTAL SERVICES" BIGINT, 
	"product_or_service_code_description_CONSTRUCTION OF MISCELLANEOUS BUILDINGS" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- MEDICAL, DENTAL, AND VETERINARY EQUIPMENT AND SUPPLIES" BIGINT, 
	"naics_description_CUSTOM COMPUTER PROGRAMMING SERVICES" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF OTHER NON-BUILDING FACILITIES" BIGINT, 
	"awarding_office_name_NAVAL RESEARCH LABORATORY" BIGINT, 
	"product_or_service_code_description_LEASE OR RENTAL OF EQUIPMENT- TOILETRIES" BIGINT, 
	"naics_description_ARCHITECTURAL SERVICES" BIGINT, 
	"primary_place_of_performance_state_code_NV" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF OTHER ADMINISTRATIVE FACILITIES AND SERVICE BUILDINGS" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- CUSTODIAL JANITORIAL" BIGINT, 
	"primary_place_of_performance_state_code_RI" BIGINT, 
	"awarding_office_name_W2V6 USA ENG SPT CTR HUNTSVIL" BIGINT, 
	"product_or_service_code_description_SUPPORT- ADMINISTRATIVE: OTHER" BIGINT, 
	"product_or_service_code_description_IT AND TELECOM- OTHER IT AND TELECOMMUNICATIONS" BIGINT, 
	"naics_description_NONSCHEDULED CHARTERED PASSENGER AIR TRANSPORTATION" BIGINT, 
	"dod_claimant_program_description_SHIPS" BIGINT, 
	"awarding_office_name_SPACE AND NAVAL WARFARE SYSTEMS" BIGINT, 
	"awarding_office_name_W6QM MICC-FT RILEY" BIGINT, 
	"awarding_office_name_W6QM MICC-FT BELVOIR" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- INSTRUMENTS AND LABORATORY EQUIPMENT" BIGINT, 
	"product_or_service_code_description_EDUCATION/TRAINING- TUITION/REGISTRATION/MEMBERSHIP FEES" BIGINT, 
	"naics_description_OTHER AIRCRAFT PARTS AND AUXILIARY EQUIPMENT MANUFACTURING" BIGINT, 
	"product_or_service_code_description_CONSTRUCTION OF RESTORATION OF REAL PROPERTY (PUBLIC OR PRIVATE)" BIGINT, 
	"product_or_service_code_description_LEASE OR RENTAL OF EQUIPMENT- PLUMBING, HEATING, AND WASTE DISPOSAL EQUIPMENT" BIGINT, 
	"naics_description_PROFESSIONAL AND MANAGEMENT DEVELOPMENT TRAINING" BIGINT, 
	"naics_description_TESTING LABORATORIES" BIGINT, 
	"naics_description_OTHER HEAVY AND CIVIL ENGINEERING CONSTRUCTION" BIGINT, 
	"naics_description_ALL OTHER SUPPORT SERVICES" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF HIGHWAYS/ROADS/STREETS/BRIDGES/RAILWAYS" BIGINT, 
	"awarding_office_name_NAVAL FACILITIES ENGINEERING AND" BIGINT, 
	"primary_place_of_performance_state_code_AR" BIGINT, 
	"primary_place_of_performance_state_code_OR" BIGINT, 
	"awarding_office_name_FA8101  AFSC PZIO" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- ELECTRICAL AND ELECTRONIC EQUIPMENT COMPONENTS" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- OTHER" BIGINT, 
	"awarding_office_name_NAVFAC ENGINEERING COMMAND MARIANAS" BIGINT, 
	"awarding_office_name_W0ML USA DEP TOBYHANNA" BIGINT, 
	"primary_place_of_performance_state_code_ME" BIGINT, 
	"awarding_office_name_W6QM MICC-FT CAMPBELL" BIGINT, 
	"awarding_office_name_NAVAIR WARFARE CTR AIRCRAFT DIV LKE" BIGINT, 
	"naics_description_COMPUTER FACILITIES MANAGEMENT SERVICES" BIGINT, 
	"awarding_office_name_W6QM MICC-FT BLISS" BIGINT, 
	"funding_sub_agency_name_DEPT OF DEFENSE" BIGINT, 
	"product_or_service_code_description_SUPPORT- MANAGEMENT: OTHER" BIGINT, 
	"awarding_office_name_COMMANDING GENERAL" BIGINT, 
	"product_or_service_code_description_ENVIRONMENTAL SYSTEMS PROTECTION- ENVIRONMENTAL REMEDIATION" BIGINT, 
	"naics_description_WIRELESS TELECOMMUNICATIONS CARRIERS (EXCEPT SATELLITE)" BIGINT, 
	"naics_description_RV (RECREATIONAL VEHICLE) PARKS AND CAMPGROUNDS" BIGINT, 
	"primary_place_of_performance_state_code_MN" BIGINT, 
	"naics_description_WATER AND SEWER LINE AND RELATED STRUCTURES CONSTRUCTION" BIGINT, 
	"awarding_office_name_W6QM MICC-FDO FT HOOD" BIGINT, 
	"naics_description_FOOD SERVICE CONTRACTORS" BIGINT, 
	"awarding_office_name_COMMANDER" BIGINT, 
	"product_or_service_code_description_NON-NUCLEAR SHIP REPAIR (WEST)" BIGINT, 
	"awarding_office_name_W6QM MICC-FT DRUM" BIGINT, 
	"awarding_office_name_W071 ENDIST OMAHA" BIGINT, 
	"primary_place_of_performance_state_code_NE" BIGINT, 
	"naics_description_OTHER BUILDING EQUIPMENT CONTRACTORS" BIGINT, 
	"awarding_office_name_W4MM USA JOINT MUNITIONS CMD" BIGINT, 
	"product_or_service_code_description_SUPPORT- PROFESSIONAL: COMMUNICATIONS" BIGINT, 
	"awarding_office_name_0413 AQ HQ     CONTRACT" BIGINT, 
	"awarding_office_name_W6QK ACC-APG DIR" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF OTHER UTILITIES" BIGINT, 
	"awarding_office_name_W2SD ENDIST BALTIMORE" BIGINT, 
	"awarding_office_name_FA4484  87 CONS LGC" BIGINT, 
	"dod_claimant_program_description_MISSILE AND SPACE SYSTEMS" BIGINT, 
	"naics_description_SOLID WASTE COLLECTION" BIGINT, 
	"naics_description_FLOORING CONTRACTORS" BIGINT, 
	"awarding_office_name_W6QM MICC FT MCCOY (RC)" BIGINT, 
	"naics_description_COLLEGES, UNIVERSITIES, AND PROFESSIONAL SCHOOLS" BIGINT, 
	"awarding_office_name_US ARMY ENGINEER DISTRICT MOBILE" BIGINT, 
	"awarding_office_name_NAVFAC NORTHWEST" BIGINT, 
	"naics_description_SOFTWARE PUBLISHERS" BIGINT, 
	"naics_description_OTHER SERVICES TO BUILDINGS AND DWELLINGS" BIGINT, 
	"awarding_office_name_NAVSEA HQ" BIGINT, 
	"awarding_office_name_FA2521  45 CONS LGC" BIGINT, 
	"naics_description_OTHER COMMERCIAL AND INDUSTRIAL MACHINERY AND EQUIPMENT RENTAL AND LEASING" BIGINT, 
	"product_or_service_code_description_IT AND TELECOM- ANNUAL SOFTWARE MAINTENANCE SERVICE PLANS" BIGINT, 
	"awarding_office_name_FA8650  USAF AFMC AFRL/RQK" BIGINT, 
	"awarding_office_name_W6QM MICC-FT KNOX" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- TRASH/GARBAGE COLLECTION" BIGINT, 
	"primary_place_of_performance_state_code_ND" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- FOOD" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- COMMUNICATION, DETECTION, AND COHERENT RADIATION EQUIPMENT" BIGINT, 
	"awarding_office_name_W2SD ENDIST PHILADELPHIA" BIGINT, 
	"awarding_office_name_NAVSUP FLT LOG CTR JACKSONVILLE" BIGINT, 
	"primary_place_of_performance_state_code_MT" BIGINT, 
	"product_or_service_code_description_HOUSEKEEPING- LAUNDRY/DRYCLEANING" BIGINT, 
	"awarding_office_name_W6QK ACC-APG NATICK" BIGINT, 
	"product_or_service_code_description_EDUCATION/TRAINING- TRAINING/CURRICULUM DEVELOPMENT" BIGINT, 
	"awarding_office_name_FA8903  772 ESS PK" BIGINT, 
	"product_or_service_code_description_REPAIR OR ALTERATION OF RESTORATION OF REAL PROPERTY (PUBLIC OR PRIVATE)" BIGINT, 
	"awarding_office_name_W7M7 USPFO ACTIVITY IN ARNG" BIGINT, 
	"portfolio_group_Logistics Management Services" BIGINT, 
	"type_of_contract_pricing_COST PLUS AWARD FEE" BIGINT, 
	"product_or_service_code_description_TRANSPORTATION/TRAVEL/RELOCATION- TRAVEL/LODGING/RECRUITMENT: LODGING, HOTEL/MOTEL" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF HIGHWAYS/ROADS/STREETS/BRIDGES/RAILWAYS" BIGINT, 
	"awarding_office_name_W6QK ACC-APG" BIGINT, 
	"naics_description_SECURITY SYSTEMS SERVICES (EXCEPT LOCKSMITHS)" BIGINT, 
	"awarding_office_name_FA5000  673 CONS LGC" BIGINT, 
	"contract_financing_COMMERCIAL FINANCING" BIGINT, 
	"product_or_service_code_description_EDUCATION/TRAINING- GENERAL" BIGINT, 
	"naics_description_SITE PREPARATION CONTRACTORS" BIGINT, 
	"naics_description_ELECTROPLATING, PLATING, POLISHING, ANODIZING, AND COLORING" BIGINT, 
	"naics_description_EDUCATIONAL SUPPORT SERVICES" BIGINT, 
	"dod_claimant_program_description_OTHER AIRCRAFT EQUIPMENT " BIGINT, 
	"type_of_contract_pricing_TIME AND MATERIALS" BIGINT, 
	"awarding_office_name_NAVSUP FLT LOG CTR PUGET SOUND" BIGINT, 
	"awarding_office_name_FA2823  AFTC PZIO" BIGINT, 
	"naics_description_ALL OTHER MISCELLANEOUS SCHOOLS AND INSTRUCTION" BIGINT, 
	"naics_description_OTHER MANAGEMENT CONSULTING SERVICES" BIGINT, 
	"product_or_service_code_description_IT AND TELECOM- TELECOMMUNICATIONS NETWORK MANAGEMENT" BIGINT, 
	"primary_place_of_performance_state_code_SD" BIGINT, 
	"awarding_office_name_W0L6 USA DEP LETTERKENY" BIGINT, 
	"awarding_office_name_COMMANDING OFFICER" BIGINT, 
	"awarding_office_name_W6QM MICC-WEST POINT" BIGINT, 
	"awarding_office_name_W076 ENDIST FT WORTH" BIGINT, 
	"awarding_office_name_FA7000  10 CONS LGC" BIGINT, 
	"product_or_service_code_description_R&D- DEFENSE OTHER: OTHER (APPLIED RESEARCH/EXPLORATORY DEVELOPMENT)" BIGINT, 
	"naics_description_SURVEYING AND MAPPING (EXCEPT GEOPHYSICAL) SERVICES" BIGINT, 
	"product_or_service_code_description_UTILITIES- ELECTRIC" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- AIRCRAFT COMPONENTS AND ACCESSORIES" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF MAINTENANCE BUILDINGS" BIGINT, 
	"awarding_office_name_US ARMY ENGINEER DISTRICT CHARLESTO" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- REFRIGERATION, AIR CONDITIONING, AND AIR CIRCULATING EQUIPMENT" BIGINT, 
	"awarding_office_name_FA8517  AFSC PZAAC" BIGINT, 
	"product_or_service_code_description_MAINTENANCE OF HOSPITALS AND INFIRMARIES" BIGINT, 
	"primary_place_of_performance_state_code_NH" BIGINT, 
	"product_or_service_code_description_SUPPORT- MANAGEMENT: LOGISTICS SUPPORT" BIGINT, 
	"awarding_office_name_NAVSUP FLT LOG CTR PEARL HARBOR" BIGINT, 
	"awarding_office_name_W07V ENDIST N ORLEANS" BIGINT, 
	"awarding_office_name_W0LX ANNISTON DEPOT PROP DIV" BIGINT, 
	"awarding_office_name_W6QM MICC-JB LEWIS-MC CHORD" BIGINT, 
	"product_or_service_code_description_IT AND TELECOM- INTEGRATED HARDWARE/SOFTWARE/SERVICES SOLUTIONS, PREDOMINANTLY SERVICES" BIGINT, 
	"naics_description_POWER AND COMMUNICATION LINE AND RELATED STRUCTURES CONSTRUCTION" BIGINT, 
	"product_or_service_code_description_CONSTRUCTION OF OTHER NON-BUILDING FACILITIES" BIGINT, 
	"product_or_service_code_description_TRANSPORTATION/TRAVEL/RELOCATION- TRANSPORTATION: OTHER" BIGINT, 
	"naics_description_INDUSTRIAL LAUNDERERS" BIGINT, 
	"funding_sub_agency_name_DEFENSE COMMISSARY AGENCY  (DECA)" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- MATERIALS HANDLING EQUIPMENT" BIGINT, 
	"naics_description_COMMUNICATION EQUIPMENT REPAIR AND MAINTENANCE" BIGINT, 
	"awarding_office_name_W075 ENDIST LOS ANGELES" BIGINT, 
	"awarding_office_name_W4PZ USA MED RSCH ACQUIS ACT" BIGINT, 
	"awarding_office_name_NAVAL SAFETY AND ENVIRONMENTAL" BIGINT, 
	"primary_place_of_performance_state_code_WV" BIGINT, 
	"awarding_office_name_FA4800  633 CONS LGCP" BIGINT, 
	"product_or_service_code_description_CONSTRUCTION OF OTHER ADMINISTRATIVE FACILITIES AND SERVICE BUILDINGS" BIGINT, 
	"awarding_office_name_FA8604  AFLCMC PZI" BIGINT, 
	"naics_description_ALL OTHER MISCELLANEOUS AMBULATORY HEALTH CARE SERVICES" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- ADP EQUIPMENT/SOFTWARE/SUPPLIES/SUPPORT EQUIPMENT" BIGINT, 
	"awarding_office_name_W6QM MICC-FT STEWART" BIGINT, 
	"awarding_office_name_MSC NORFOLK" BIGINT, 
	"awarding_office_name_NAVFAC ENGINEERING COMMAND HAWAII" BIGINT, 
	"naics_description_COMPUTER AND OFFICE MACHINE REPAIR AND MAINTENANCE" BIGINT, 
	"awarding_office_name_US ARMY ENGINEER DISTRICT FT WORTH" BIGINT, 
	"funding_sub_agency_name_U.S. SPECIAL OPERATIONS COMMAND (USSOCOM)" BIGINT, 
	"awarding_office_name_DEFENSE COMMISSARY AGENCY" BIGINT, 
	"product_or_service_code_description_MEDICAL- LABORATORY TESTING" BIGINT, 
	"naics_description_OTHER SUPPORT ACTIVITIES FOR AIR TRANSPORTATION" BIGINT, 
	"awarding_office_name_FA8201  OL H  PZIO" BIGINT, 
	"awarding_office_name_NAVSUP FLT LOG CTR  SAN DIEGO" BIGINT, 
	"awarding_office_name_W2R2 USA ENGR R AND D CTR" BIGINT, 
	"awarding_office_name_SPAWAR SYSTEMS CENTER ATLANTIC" BIGINT, 
	"awarding_office_name_W6QM MICC-FT GORDON" BIGINT, 
	"product_or_service_code_description_SUPPORT- MANAGEMENT: ADVERTISING" BIGINT, 
	"product_or_service_code_description_CONSTRUCTION OF HIGHWAYS, ROADS, STREETS, BRIDGES, AND RAILWAYS" BIGINT, 
	"awarding_office_name_W074 ENDIST MOBILE" BIGINT, 
	"naics_description_AUTOMOTIVE BODY, PAINT, AND INTERIOR REPAIR AND MAINTENANCE" BIGINT, 
	"awarding_office_name_FA8501  AFSC PZIO" BIGINT, 
	"awarding_office_name_W6QM MICC FT LEE" BIGINT, 
	"product_or_service_code_description_MEDICAL- NURSING" BIGINT, 
	"product_or_service_code_description_LEASE/RENTAL OF CONFERENCE SPACE AND FACILITIES" BIGINT, 
	"product_or_service_code_description_R&D- DEFENSE SYSTEM: MISSILE/SPACE SYSTEMS (APPLIED RESEARCH/EXPLORATORY DEVELOPMENT)" BIGINT, 
	"product_or_service_code_description_OPERATION OF RECREATION FACILITIES (NON-BUILDING)" BIGINT, 
	"product_or_service_code_description_MAINT/REPAIR/REBUILD OF EQUIPMENT- SHIP AND MARINE EQUIPMENT" BIGINT, 
	"awarding_office_name_W6QM MICC-FT RUCKER" BIGINT, 
	"awarding_sub_agency_name_U.S. SPECIAL OPERATIONS COMMAND (USSOCOM)" BIGINT, 
	"product_or_service_code_description_R&D- DEFENSE SYSTEM: ELECTRONICS/COMMUNICATION EQUIPMENT (BASIC RESEARCH)" BIGINT, 
	"product_or_service_code_description_ARCHITECT AND ENGINEERING- CONSTRUCTION: OFFICE BUILDINGS" BIGINT, 
	"contract_financing_PERFORMANCE-BASED FINANCING" BIGINT, 
	"product_or_service_code_description_CONSTRUCTION OF OFFICE BUILDINGS" BIGINT, 
	"awarding_office_name_US ARMY ENGINEER DISTRICT HUNTINGTO" BIGINT, 
	type_of_set_aside_code TEXT
)

]
(Background on this error at: http://sqlalche.me/e/f405)

In [None]:
# Next I am testing the accuracy of the model on each specific set aside. Because we have an unbalanced data set
# it seems that the model is great for predicting set asides in general, however it is also skewed to better
# predict certain categories compared to others.

# Create a dictionary object to capture set aside code and it's score
class scores(dict):  
  
    # __init__ function  
    def __init__(self):  
        self = dict()  
          
    # Function to add key:value  
    def add(self, key, value):  
        self[key] = value  

scores = scores()
percent = ''
set_aside_codes = data['type_of_set_aside_code'].unique()

# Loop through each set aside, test it, and append to the dictionary
for set_aside in set_aside_codes:
    dataPoint = data.loc[data['type_of_set_aside_code'] == set_aside]
    XPoint = dataPoint.drop(["type_of_set_aside_code"],axis=1)
    yPoint = dataPoint["type_of_set_aside_code"]
    percent = model.score(XPoint, yPoint)
    percent = round(percent, 4)
    scores.add(set_aside, percent)

In [None]:
# Sort the dictionary by score
import operator
sortedScores = sorted(scores.items(), key=operator.itemgetter(1))

In [None]:
# Print scores
for score in reversed(sortedScores):
    print("{:<8} {:.2%}".format(score[0], score[1]))

In [None]:
print(classification_report(y_test, model.predict(X_test), target_names=set_aside_codes))