### Random Forest Classification
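This program takes a CSV file as input and attempts to classify it using a random forest algorithm, then compares the result against logistic regression, Gaussian naive Bayes, and MLP classifiers, as well as a majority vote across all four models.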

In [53]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neural_network import MLPClassifier

import statistics
from statistics import mode

In [52]:
# Read the dataset from the CSV that sits next to the notebook
df = pd.read_csv('./heart.csv')

# Preview the first rows, then the per-column summary statistics
display(df.head(), df.describe())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [51]:
# Separate the label column from the feature columns.
# NOTE(review): the label used here is 'thal', while 'target' is dropped from
# the features and never used afterwards -- confirm 'thal' is the intended label.
x = df.drop(columns=['target', 'thal'])
y = df['thal']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

# Sanity-check the shapes and peek at the training features and labels
print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)
display(xtrain.head(), ytrain.head())

(820, 12) (820,) (205, 12) (205,)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca
315,42,1,3,148,244,0,0,178,0,0.8,2,2
204,66,0,2,146,278,0,0,152,0,0.0,1,1
363,53,1,2,130,246,1,0,173,0,0.0,2,3
5,58,0,0,100,248,0,0,122,0,1.0,1,0
1017,53,1,0,123,282,0,1,95,1,2.0,1,2


315     2
204     2
363     2
5       2
1017    3
Name: thal, dtype: int64

In [42]:
def results(model):
    """Score an already-fitted model and print a short accuracy report.

    Reads the notebook-level variables ``xtrain``, ``ytrain``, ``xtest``,
    ``ytest``, ``x`` and ``y``; they must exist before this is called.

    Args:
        model: A fitted estimator exposing ``predict``; it is also handed to
            ``cross_val_score`` for the 10-fold validation score.

    Returns:
        A tuple ``(test_predictions, test_accuracy, validation_accuracy)``
        where both accuracies are percentages rounded to two decimals.
    """
    def _as_percent(fraction):
        # Convert a 0..1 score into a percentage with two decimals
        return np.round(100 * fraction, 2)

    # Accuracy on the data the model was fitted on
    train_acc = _as_percent(accuracy_score(ytrain, model.predict(xtrain)))

    # Accuracy on the held-out split; the raw predictions are returned too
    test_predictions = model.predict(xtest)
    test_acc = _as_percent(accuracy_score(ytest, test_predictions))

    # 10-fold cross-validation over the full x / y
    folds = model_selection.KFold(n_splits=10)
    fold_scores = model_selection.cross_val_score(model, x, y, cv=folds)
    validation_acc = _as_percent(fold_scores.mean())

    print("Results for", model.__class__.__name__)
    print("\t Train Accuracy: ", train_acc)
    print("\t Test Accuracy: ", test_acc)
    print("\t Validation Accuracy: ", validation_acc)

    return (test_predictions, test_acc, validation_acc)

In [43]:
# Create random forest. random_state is pinned (same seed as the train/test
# split) so the forest, and every accuracy derived from it, is identical on
# each fresh run of the notebook instead of drifting between executions.
rf = RandomForestClassifier(n_estimators=20, max_depth=7, random_state=0)
rf.fit(xtrain, ytrain)

# Get rf prediction: (test predictions, test accuracy %, validation accuracy %)
rf_predicted, rf_acc, rf_val = results(rf)

Results for RandomForestClassifier
	 Train Accuracy:  94.63
	 Test Accuracy:  91.22
	 Validation Accuracy:  90.54


In [44]:
# Build and fit a logistic regression classifier; the iteration cap is
# raised to 10000 via max_iter
lr = LogisticRegression(max_iter=10000).fit(xtrain, ytrain)

# Score it: (test predictions, test accuracy %, validation accuracy %)
lr_predicted, lr_acc, lr_val = results(lr)

Results for LogisticRegression
	 Train Accuracy:  69.15
	 Test Accuracy:  74.15
	 Validation Accuracy:  68.79


In [45]:
# Build and fit a Gaussian naive Bayes classifier
gaus = GaussianNB().fit(xtrain, ytrain)

# Score it: (test predictions, test accuracy %, validation accuracy %)
gaus_predicted, gaus_acc, gaus_val = results(gaus)

Results for GaussianNB
	 Train Accuracy:  68.29
	 Test Accuracy:  69.27
	 Validation Accuracy:  67.62


In [56]:
# Create a fully connected ANN (MLP). random_state is pinned (same seed as
# the train/test split) so weight initialisation, and therefore the reported
# accuracies and the ensemble vote, are reproducible across runs.
mlp1 = MLPClassifier(random_state=0)
mlp1.fit(xtrain, ytrain)

# Get the prediction: (test predictions, test accuracy %, validation accuracy %)
mlp1_predicted, mlp1_acc, mlp1_val = results(mlp1)

Results for MLPClassifier
	 Train Accuracy:  64.39
	 Test Accuracy:  69.76
	 Validation Accuracy:  68.19


In [46]:
# Combines arrays into an array of tuples, useful for voting
# combine_arrs([1, 2, 3], [4, 5, 6], [7, 8, 9]) ==> [(1, 4, 7), (2, 5, 8), (3, 6, 9)]
def combine_arrs(*args):
    """Combine equal-length sequences into a list of per-position tuples.

    Example:
        combine_arrs([1, 2, 3], [4, 5, 6], [7, 8, 9])
        ==> [(1, 4, 7), (2, 5, 8), (3, 6, 9)]

    Args:
        *args: Any number of sized iterables (lists, arrays, ...). Elements
            are taken in iteration order.

    Returns:
        A list whose i-th entry is a tuple holding the i-th element of every
        input, in argument order. An empty list when no inputs are given.

    Raises:
        ValueError: If the inputs do not all have the same length. Without
            this check, extra entries in a longer input would be dropped
            silently and the resulting vote would be incomplete.
    """
    # Nothing to combine: return an empty result instead of failing on args[0]
    if not args:
        return []

    expected_len = len(args[0])
    if any(len(arr) != expected_len for arr in args):
        raise ValueError(
            "combine_arrs expects inputs of equal length, got lengths "
            + str([len(arr) for arr in args])
        )

    # zip already yields one tuple per position across all inputs
    return list(zip(*args))

# Returns array of most common vote amongst the models
def ensemble_vote(votes_arr):
    """Return the most common vote in each row of ``votes_arr``.

    Args:
        votes_arr: Iterable of rows, each row an iterable of hashable votes
            (for example the tuples produced by ``combine_arrs``).

    Returns:
        A list with one winning vote per row. When several values are tied
        for most common, the one that appears first in the row wins.

    Raises:
        statistics.StatisticsError: If a row contains no votes.
    """
    # Local imports keep this block self-contained in the notebook.
    from collections import Counter
    from statistics import StatisticsError

    conclusion = []
    for votes in votes_arr:
        tally = Counter(votes)
        if not tally:
            raise StatisticsError("no mode for empty data")
        # statistics.mode raised on ties before Python 3.8; with an even
        # number of voters a tie is possible, so count explicitly.
        # most_common(1) keeps the first-seen value among equally common ones.
        conclusion.append(tally.most_common(1)[0][0])
    return conclusion

In [57]:
# Line up the four models' test-set predictions so that the ith tuple holds
# every model's guess for the ith entry in xtest
ensemble_arr = combine_arrs(
    rf_predicted,
    lr_predicted,
    gaus_predicted,
    mlp1_predicted,
)

# Show the first few rows of votes
print(ensemble_arr[:5])

[(2, 2, 2, 3), (2, 2, 2, 2), (3, 3, 3, 3), (2, 2, 2, 2), (3, 3, 3, 3)]


In [58]:
# Pick the winning label for each test row from the models' votes
res_from_vote = ensemble_vote(ensemble_arr)

# Accuracy of the voted labels on the held-out split, as a percentage
acc_from_vote = np.round(accuracy_score(ytest, res_from_vote) * 100, 2)

print(res_from_vote[:5])

[2, 2, 3, 2, 3]


In [60]:
# Test-set accuracy of each individual model, followed by the ensemble vote
print("Accuracy of RandomForestClassifier: ", rf_acc, '%')
print("Accuracy of LogisticRegression Model: ", lr_acc, '%')
print("Accuracy of GaussianNB Model: ", gaus_acc, '%')
# This line reports the MLP; it was previously mislabeled as GaussianNB
print("Accuracy of MLPClassifier Model: ", mlp1_acc, '%')
print("Accuracy after ensemble voting: ", acc_from_vote, '%')

Accuracy of RandomForestClassifier:  91.22 %
Accuracy of LogisticRegression Model:  74.15 %
Accuracy of GaussianNB Model:  69.27 %
Accuracy of GaussianNB Model:  69.76 %
Accuracy after ensemble voting:  84.39 %
