<h1 align = 'center'> Homework 11: Data Competition </h1>
<h3 align = 'center'> By: Josh Jaeger and Max Butler </h3>
<h3 align = 'center'> Tuning Hyper-Parameters </h3>

In [405]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn import tree
import sklearn
import lightgbm as lgb

In [406]:
reducedTrainingDF = pd.read_csv("reducedTrainingDF.csv", index_col = 0)
reducedTestDF = pd.read_csv("reducedTestDF.csv", index_col = 0)

In [407]:
train_X = reducedTrainingDF.drop(columns = ['class'])
train_y = reducedTrainingDF['class']

In [408]:
test_X = reducedTestDF.drop(columns = 'class')
test_y = reducedTestDF['class']

<h3 align = 'center'> Without Hypertuning </h3>

In [409]:
clf1 = BernoulliNB()
clf2 = RandomForestClassifier(n_estimators = 50, random_state=1984)
clf3 = GaussianNB()
clf4 = SVC(kernel = 'rbf', probability=True)
eclf = VotingClassifier(estimators = [('bnb',clf1),('rf',clf2),('gnb',clf3),('svc',clf4)], voting = 'soft', weights =[1,2,1,1])
eclf.fit(train_X,train_y)

VotingClassifier(estimators=[('bnb', BernoulliNB()),
                             ('rf',
                              RandomForestClassifier(n_estimators=50,
                                                     random_state=1984)),
                             ('gnb', GaussianNB()),
                             ('svc', SVC(probability=True))],
                 voting='soft', weights=[1, 2, 1, 1])

In [410]:
pred_y = eclf.predict(test_X)

In [411]:
cm = metrics.confusion_matrix(test_y, pred_y)
print(cm)

[[803  34]
 [ 47 497]]


In [412]:
metrics.roc_auc_score(test_y,pred_y)

0.9364908373743762

In [413]:
clf2.fit(train_X, train_y)

RandomForestClassifier(n_estimators=50, random_state=1984)

In [414]:
pred_y_rf = clf2.predict(test_X)

In [415]:
metrics.roc_auc_score(test_y,pred_y_rf)

0.9464166490969147

<h3 align = 'center'> Tuning Random Forests Hyper-Parameters </h3>

In [416]:
import time
start = time.time()
print('Time Start:' + time.strftime("%m/%d/%Y %H:%M:%S"))

max_features = [7,9,13,15,17]
n_estimators = [10,20,30,40,50,60,70,80,90,100,140,150]
criterion = ['gini', 'entropy']
bootstrap = [True, False]
param_grid = dict(max_features = max_features, n_estimators = n_estimators, criterion = criterion, bootstrap = bootstrap,
                  random_state = [1984])

rf = RandomForestClassifier()

grid = GridSearchCV(estimator=rf, param_grid = param_grid, scoring = 'roc_auc', cv=5)

grid.fit(train_X, train_y)

end = time.time()
print('Time End:'+ time.strftime("%m/%d/%Y %H:%M:%S"))
print('Execution Time (Seconds):' + str(end - start))

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Time Start:11/22/2021 19:01:46
Time End:11/22/2021 19:12:41
Execution Time (Seconds):654.603963136673
The best parameters are {'bootstrap': True, 'criterion': 'entropy', 'max_features': 7, 'n_estimators': 150, 'random_state': 1984} with a score of 0.99


In [417]:
rf_final12 = RandomForestClassifier(n_estimators = 150,
                                   max_features = 7,
                                   criterion = 'entropy',
                                   bootstrap = True,
                                   random_state = 1984)

In [418]:
rf_final12.fit(train_X,train_y)

RandomForestClassifier(criterion='entropy', max_features=7, n_estimators=150,
                       random_state=1984)

In [419]:
pred_y_rf_final12 = rf_final12.predict(test_X)

In [420]:
metrics.roc_auc_score(test_y, pred_y_rf_final12)

0.9498174942019817

I Found That the Model Will Preform Slightly Beter Using 140 n_estimators and Bootstrap set to False 

In [421]:
rf_final13 = RandomForestClassifier(n_estimators = 140,
                                   max_features = 7,
                                   criterion = 'entropy',
                                   bootstrap = False,
                                   random_state = 1984)

In [422]:
rf_final13.fit(train_X,train_y)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=7,
                       n_estimators=140, random_state=1984)

In [423]:
pred_y_rf_final13 = rf_final13.predict(test_X)

In [424]:
metrics.roc_auc_score(test_y, pred_y_rf_final13)

0.9513801040129313

<h3 align = 'center'> Model Comparison </h3>

In [425]:
acc = [metrics.accuracy_score(test_y, pred_y_rf),
       metrics.accuracy_score(test_y, pred_y_rf_final12),
       metrics.accuracy_score(test_y, pred_y_rf_final13)]

pre = [metrics.precision_score(test_y, pred_y_rf),
       metrics.precision_score(test_y, pred_y_rf_final12),
       metrics.precision_score(test_y, pred_y_rf_final13)]

rec = [metrics.recall_score(test_y, pred_y_rf),
       metrics.recall_score(test_y, pred_y_rf_final12),
       metrics.recall_score(test_y, pred_y_rf_final13)]

f1  = [metrics.f1_score(test_y, pred_y_rf),
       metrics.f1_score(test_y, pred_y_rf_final12),
       metrics.f1_score(test_y, pred_y_rf_final13)]

auc = [metrics.roc_auc_score(test_y, pred_y_rf),
       metrics.roc_auc_score(test_y, pred_y_rf_final12),
       metrics.roc_auc_score(test_y, pred_y_rf_final13)]

In [426]:
pd.DataFrame({'Accuracy':acc, 'Precision':pre, 'Recall':rec, 'F1 Score': f1, 'AUC Score':auc},
             index = ['RF without Hyperparameter Tunning','RF with GridSearchCV Hyperparameter Tunning',
                      'RF with Manual Hyperparameter Tunning'])

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,AUC Score
RF without Hyperparameter Tunning,0.952209,0.957854,0.919118,0.938086,0.946417
RF with GridSearchCV Hyperparameter Tunning,0.954381,0.954631,0.928309,0.941286,0.949817
RF with Manual Hyperparameter Tunning,0.955105,0.951311,0.933824,0.942486,0.95138


<h3 align = 'center'> Fine Tuning for the Best Hyper Parameters </h3>

In [427]:
rf_final1= RandomForestClassifier(n_estimators = 250,
                                   max_features = 7,
                                   criterion = 'gini',
                                   random_state = 1984)

In [428]:
rf_final2 = RandomForestClassifier(n_estimators = 50,
                                   max_features = 3,
                                   criterion = 'entropy',
                                   random_state = 1984)

In [429]:
rf_final3 = RandomForestClassifier(n_estimators = 500,
                                   max_features = 3,
                                   criterion = 'gini',
                                   random_state = 1984)

In [430]:
rf_final4 = RandomForestClassifier(n_estimators = 50,
                                   max_features = 3,
                                   criterion = 'gini',
                                   random_state = 1984)

In [431]:
rf_final5 = RandomForestClassifier(n_estimators = 250,
                                   max_features = 6,
                                   criterion = 'gini',
                                   random_state = 1984)

In [432]:
rf_final6 = RandomForestClassifier(n_estimators = 200,
                                   max_features = 6,
                                   criterion = 'gini',
                                   random_state = 1984)

In [433]:
rf_final7 = RandomForestClassifier(n_estimators = 200,
                                   max_features = 6,
                                   criterion = 'entropy',
                                   random_state = 1984)

In [434]:
rf_final8 = RandomForestClassifier(n_estimators = 150,
                                   max_features = 6,
                                   criterion = 'entropy',
                                   random_state = 1984)

In [435]:
rf_final9 = RandomForestClassifier(n_estimators = 150,
                                   max_features = 7,
                                   criterion = 'entropy',
                                   random_state = 1984)

In [436]:
rf_final10 = RandomForestClassifier(n_estimators = 150,
                                   max_features = 7,
                                   criterion = 'entropy',
                                   bootstrap = False,
                                   random_state = 1984)

In [437]:
rf_final11 = RandomForestClassifier(n_estimators = 150,
                                   max_features = 7,
                                   criterion = 'entropy',
                                   bootstrap = False,
                                   random_state = 1984)

In [438]:
rf_final1.fit(train_X,train_y)

RandomForestClassifier(max_features=7, n_estimators=250, random_state=1984)

In [439]:
rf_final2.fit(train_X,train_y)

RandomForestClassifier(criterion='entropy', max_features=3, n_estimators=50,
                       random_state=1984)

In [440]:
rf_final3.fit(train_X,train_y)

RandomForestClassifier(max_features=3, n_estimators=500, random_state=1984)

In [441]:
rf_final4.fit(train_X,train_y)

RandomForestClassifier(max_features=3, n_estimators=50, random_state=1984)

In [442]:
rf_final5.fit(train_X,train_y)

RandomForestClassifier(max_features=6, n_estimators=250, random_state=1984)

In [443]:
rf_final6.fit(train_X,train_y)

RandomForestClassifier(max_features=6, n_estimators=200, random_state=1984)

In [444]:
rf_final7.fit(train_X,train_y)

RandomForestClassifier(criterion='entropy', max_features=6, n_estimators=200,
                       random_state=1984)

In [445]:
rf_final8.fit(train_X,train_y)

RandomForestClassifier(criterion='entropy', max_features=6, n_estimators=150,
                       random_state=1984)

In [446]:
rf_final9.fit(train_X,train_y)

RandomForestClassifier(criterion='entropy', max_features=7, n_estimators=150,
                       random_state=1984)

In [447]:
rf_final10.fit(train_X,train_y)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=7,
                       n_estimators=150, random_state=1984)

In [448]:
rf_final11.fit(train_X,train_y)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=7,
                       n_estimators=150, random_state=1984)

In [449]:
pred_y_rf_final1 = rf_final1.predict(test_X)

In [450]:
pred_y_rf_final2 = rf_final2.predict(test_X)

In [451]:
pred_y_rf_final3 = rf_final3.predict(test_X)

In [452]:
pred_y_rf_final4 = rf_final4.predict(test_X)

In [453]:
pred_y_rf_final5 = rf_final5.predict(test_X)

In [454]:
pred_y_rf_final6 = rf_final6.predict(test_X)

In [455]:
pred_y_rf_final7 = rf_final7.predict(test_X)

In [456]:
pred_y_rf_final8 = rf_final8.predict(test_X)

In [457]:
pred_y_rf_final9 = rf_final9.predict(test_X)

In [458]:
pred_y_rf_final10 = rf_final10.predict(test_X)

In [459]:
pred_y_rf_final11 = rf_final11.predict(test_X)

In [460]:
metrics.roc_auc_score(test_y, pred_y_rf_final1)

0.946462769695692

In [461]:
metrics.roc_auc_score(test_y, pred_y_rf_final2)

0.9421889275423432

In [462]:
metrics.roc_auc_score(test_y, pred_y_rf_final3)

0.9437054167545154

In [463]:
metrics.roc_auc_score(test_y, pred_y_rf_final4)

0.9391098285192214

In [464]:
metrics.roc_auc_score(test_y, pred_y_rf_final5)

0.9446245344015741

In [465]:
metrics.roc_auc_score(test_y, pred_y_rf_final6)

0.9458653981305783

In [466]:
metrics.roc_auc_score(test_y, pred_y_rf_final7)

0.9461871442125238

In [467]:
metrics.roc_auc_score(test_y, pred_y_rf_final8)

0.946462769695692

In [468]:
metrics.roc_auc_score(test_y, pred_y_rf_final9)

0.9498174942019817

In [469]:
metrics.roc_auc_score(test_y, pred_y_rf_final10)

0.9498636148007591

In [470]:
metrics.roc_auc_score(test_y, pred_y_rf_final11)

0.9498636148007591

In [471]:
metrics.accuracy_score(test_y, pred_y_rf_final11)

0.9536567704561911

In [472]:
metrics.recall_score(test_y, pred_y_rf_final11)

0.9319852941176471

In [473]:
metrics.f1_score(test_y, pred_y_rf_final11)

0.9406307977736549