### Random Forest Classification
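This program takes a CSV file as input and attempts to classify it using a random forest algorithm. For comparison, it also trains logistic regression and Gaussian naive Bayes models and combines all three with a majority vote.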

#### Results #1, No Hyperparameters (ended up being the same with optimized parameters)
Overall Results
Train acc:  1.0
Test acc:  0.985

In [88]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

import statistics
from statistics import mode

In [89]:
# Load heart.csv from the current directory and preview its first rows
df = pd.read_csv('./heart.csv')
display(df.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [90]:
# Split data into features and target
# NOTE(review): the label used here is the `thal` column; the `target` column is
# dropped from the features and not used anywhere else — confirm that `thal`
# (and not `target`) is the intended label.
y = df['thal']
x = df.drop(['target', 'thal'], axis=1)

# Get test and train split: 20% of the rows are held out for testing, and
# random_state=0 pins the shuffle so the same split is produced on every run
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

# Sanity-check the split sizes and preview the training features and labels
print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)
display(xtrain.head())
display(ytrain.head())

(820, 12) (820,) (205, 12) (205,)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca
315,42,1,3,148,244,0,0,178,0,0.8,2,2
204,66,0,2,146,278,0,0,152,0,0.0,1,1
363,53,1,2,130,246,1,0,173,0,0.0,2,3
5,58,0,0,100,248,0,0,122,0,1.0,1,0
1017,53,1,0,123,282,0,1,95,1,2.0,1,2


315     2
204     2
363     2
5       2
1017    3
Name: thal, dtype: int64

In [91]:
# Create random forest (20 trees, each limited to depth 7).
# random_state is fixed so the forest -- and therefore the accuracy printed
# below -- is reproducible after a kernel restart; 0 matches the seed already
# used for the train/test split.
rf = RandomForestClassifier(n_estimators=20, max_depth=7, random_state=0)
rf.fit(xtrain, ytrain)

# Get rf prediction on the held-out test set and report accuracy as a percentage
rf_predicted = rf.predict(xtest)
rf_acc = np.round(100*accuracy_score(ytest, rf_predicted), 2)
print("Accuracy of RandomForestClassifier: ", rf_acc, '%')

Accuracy of RandomForestClassifier:  94.15 %


In [92]:
# Create logistic regression model (a classifier, despite the "regression" in its name)
# NOTE(review): max_iter=10000 is presumably set this high so the solver
# converges on the unscaled features — confirm.
lr = LogisticRegression(max_iter=10000)
lr = lr.fit(xtrain, ytrain) 

# Get the prediction on the held-out test set and report accuracy as a percentage
lr_predicted = lr.predict(xtest)
lr_acc = np.round(100*accuracy_score(ytest, lr_predicted), 2)
print("Accuracy of LogisticRegression Model: ", lr_acc, '%')

Accuracy of LogisticRegression Model:  74.15 %


In [93]:
# Create a Gaussian naive Bayes model with default settings
gaus = GaussianNB()
gaus.fit(xtrain, ytrain)

# Get the prediction on the held-out test set and report accuracy as a percentage
gaus_predicted = gaus.predict(xtest)
gaus_acc = np.round(100*accuracy_score(ytest, gaus_predicted), 2)
print("Accuracy of GaussianNB Model: ", gaus_acc, '%')

Accuracy of GaussianNB Model:  69.27 %


In [94]:
# Combines arrays into an array of tuples, useful for voting
# combine_arrs([1, 2, 3], [4, 5, 6], [7, 8, 9]) ==> [(1, 4, 7), (2, 5, 8), (3, 6, 9)]
def combine_arrs(*args):
    """Zip equal-length sequences into a list of per-position tuples.

    Args:
        *args: Sized sequences (lists, NumPy arrays, ...) of the same length,
            e.g. one prediction array per model.

    Returns:
        A list whose i-th element is a tuple holding the i-th item of every
        input, in argument order. An empty list when no inputs are given.

    Raises:
        ValueError: If the inputs do not all have the same length.
    """
    # No voters at all: nothing to combine (previously an IndexError on args[0]).
    if not args:
        return []

    # Refuse mismatched inputs instead of silently dropping trailing items,
    # which would misalign the votes with the entries they belong to.
    lengths = [len(arr) for arr in args]
    if len(set(lengths)) > 1:
        raise ValueError(
            "combine_arrs expects inputs of equal length, got lengths {}".format(lengths)
        )

    # zip pairs items by position, independent of any label-based indexing
    # the inputs may implement.
    return list(zip(*args))

# Returns array of most common vote amongst the models
def ensemble_vote(votes_arr):
    """Pick the most common vote for each entry.

    Args:
        votes_arr: Iterable of vote collections (e.g. tuples), one per entry,
            each holding the models' predictions for that entry.

    Returns:
        A list containing the `statistics.mode` of each vote collection, in
        the same order as `votes_arr`.

    NOTE(review): when no single value is the most common (e.g. every model
    votes differently), the outcome is whatever `statistics.mode` does with
    tied data — this is believed to differ between Python versions; confirm.
    """
    return [mode(ballot) for ballot in votes_arr]

In [95]:
# Combines individual voting arrays into a single array of tuples,
# where the ith tuple contains the guesses for the ith entry in xtest
# (tuple order: random forest, logistic regression, Gaussian naive Bayes)
ensemble_arr = combine_arrs(rf_predicted, lr_predicted, gaus_predicted)
print(ensemble_arr[:5])

[(2, 2, 2), (2, 2, 2), (3, 3, 3), (2, 2, 2), (3, 3, 3)]


In [96]:
# Determines the most common vote for each test entry, then scores the voted
# predictions against ytest as a percentage
res_from_vote = ensemble_vote(ensemble_arr)
acc_from_vote = np.round(100 * accuracy_score(ytest, res_from_vote), 2)
print(res_from_vote[:5])

[2, 2, 3, 2, 3]


In [97]:
# Side-by-side summary: test accuracy of each individual model and of the voting ensemble
print("Accuracy of RandomForestClassifier: ", rf_acc, '%')
print("Accuracy of LogisticRegression Model: ", lr_acc, '%')
print("Accuracy of GaussianNB Model: ", gaus_acc, '%')
print("Accuracy after ensemble voting: ", acc_from_vote, '%')

Accuracy of RandomForestClassifier:  94.15 %
Accuracy of LogisticRegression Model:  74.15 %
Accuracy of GaussianNB Model:  69.27 %
Accuracy after ensemble voting:  78.54 %
