### ML Classification
This program dates a CSV file as input and attempts to classify it using a random forest algorithm.

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

import statistics
from statistics import mode

In [2]:
df = pd.read_csv('./heart.csv')
display(df.head())
display(df.describe())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [3]:
# Split data into features and target
y = df['thal']
x = df.drop(['target', 'thal'], axis=1)

# Get test and train split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)
display(xtrain.head())
display(ytrain.head())

(820, 12) (820,) (205, 12) (205,)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca
974,43,1,0,110,211,0,1,161,0,0.0,2,0
464,67,0,2,115,564,0,0,160,0,1.6,1,0
718,55,0,1,135,250,0,0,161,0,1.4,1,0
143,34,1,3,118,182,0,0,174,0,0.0,2,0
717,56,1,2,130,256,1,0,142,1,0.6,1,1


974    3
464    3
718    2
143    2
717    1
Name: thal, dtype: int64

In [4]:
def results(model):   
    # Train results
    train_res = model.predict(xtrain)
    train_acc = np.round(100*accuracy_score(ytrain, train_res), 2)
    
    # Test results
    model_res = model.predict(xtest)
    model_acc = np.round(100*accuracy_score(ytest, model_res), 2)
    
    # K-Fold Cross validation (src: https://www.pluralsight.com/guides/validating-machine-learning-models-scikit-learn)
    kfold = model_selection.KFold(n_splits=10)
    results_kfold = model_selection.cross_val_score(model, x, y, cv=kfold)
    validation_acc = np.round(100 * results_kfold.mean(), 2)
    
    print("Results for", model.__class__.__name__)
    print("\t Train Accuracy: ", train_acc)
    print("\t Test Accuracy: ", model_acc)
    print("\t Validation Accuracy: ", validation_acc)
    
    return (model_res, model_acc, validation_acc)

In [5]:
# Create random forest 
rf = RandomForestClassifier(n_estimators=20, max_depth=7)
rf.fit(xtrain, ytrain)

# Get rf prediction
rf_predicted, rf_acc, rf_val = results(rf)

Results for RandomForestClassifier
	 Train Accuracy:  94.51
	 Test Accuracy:  88.29
	 Validation Accuracy:  92.1


In [6]:
# Create linear regression model
lr = LogisticRegression(max_iter=10000)
lr = lr.fit(xtrain, ytrain) 

# Get the prediction
lr_predicted, lr_acc, lr_val = results(lr)

Results for LogisticRegression
	 Train Accuracy:  70.73
	 Test Accuracy:  68.78
	 Validation Accuracy:  68.69


In [7]:
# Create a gaussian model
gaus = GaussianNB()
gaus.fit(xtrain, ytrain)

# Get the prediction
gaus_predicted, gaus_acc, gaus_val = results(gaus)

Results for GaussianNB
	 Train Accuracy:  69.76
	 Test Accuracy:  64.88
	 Validation Accuracy:  67.62


In [8]:
# Create a fully connected ANN (MLP)
mlp1 = MLPClassifier()
mlp1.fit(xtrain, ytrain)

# Get the prediction
mlp1_predicted, mlp1_acc, mlp1_val = results(mlp1)

Results for MLPClassifier
	 Train Accuracy:  67.93
	 Test Accuracy:  62.44
	 Validation Accuracy:  68.78


In [19]:
mlp2 = MLPClassifier(hidden_layer_sizes=(200,100,50),
                    activation = 'logistic',
                    solver = 'adam',
                    learning_rate = 'adaptive',
                    max_iter = 1000,
                    shuffle = False,
                    random_state = 0
                    )
mlp2.fit(xtrain, ytrain)
mlp2_predicted, mlp2_acc, mlp2_val = results(mlp2)

Results for MLPClassifier
	 Train Accuracy:  72.44
	 Test Accuracy:  73.17
	 Validation Accuracy:  88.59


In [10]:
# Create a XBG model
xgb = XGBClassifier(learning_rate=0.01, n_estimators=25, max_depth=15,gamma=0.6, subsample=0.52,colsample_bytree=0.6,seed=27, reg_lambda=2, booster='dart', colsample_bylevel=0.6, colsample_bynode=0.5)
xgb.fit(xtrain, ytrain)

xgb_res, xgb_acc, xgb_validation_acc = results(xgb)

Results for XGBClassifier
	 Train Accuracy:  84.88
	 Test Accuracy:  76.59
	 Validation Accuracy:  80.39


In [11]:
# Combines arrays into a array of tuples, useful for voting
# combine_arrs([1, 2, 3], [4, 5, 6], [7, 8, 9]) ==> [(1, 4, 7), (2, 5, 8), (3, 6, 9)]
def combine_arrs(*args):
    arr_of_tuples = []
    for i in range(len(args[0])):
        arr_of_tuples.append(tuple([x[i] for x in args]))
    return arr_of_tuples

# Returns array of most common vote amongst the models
def ensemble_vote(votes_arr):
    conclusion = []
    # TODO: Weight towards more performant models
    for votes in votes_arr:
        conclusion.append(mode(votes))
    return conclusion

In [12]:
# Combines individual voting arrays into a single array of tuples, 
# where the ith tuple contains the guesses for the ith entry in xtest
ensemble_arr = combine_arrs(rf_predicted, lr_predicted, gaus_predicted, mlp1_predicted)
print(ensemble_arr[:5])

[(2, 2, 2, 2), (3, 3, 1, 3), (2, 2, 2, 2), (2, 2, 2, 2), (3, 2, 3, 3)]


In [13]:
# Determines the most common vote
res_from_vote = ensemble_vote(ensemble_arr)
acc_from_vote = np.round(100 * accuracy_score(ytest, res_from_vote), 2)
print(res_from_vote[:5])
print(list(ytrain[:5].values))

[2, 3, 2, 2, 3]
[3, 3, 2, 2, 1]


In [14]:
print("Accuracy of RandomForestClassifier: ", rf_acc, '%')
print("Accuracy of LogisticRegression Model: ", lr_acc, '%')
print("Accuracy of GaussianNB Model: ", gaus_acc, '%')
print("Accuracy of MLP1 Model: ", mlp1_acc, '%')
print("Accuracy after ensemble voting: ", acc_from_vote, '%')

Accuracy of RandomForestClassifier:  88.29 %
Accuracy of LogisticRegression Model:  68.78 %
Accuracy of GaussianNB Model:  64.88 %
Accuracy of MLP1 Model:  62.44 %
Accuracy after ensemble voting:  77.56 %
