### Random Forest Classification
This program dates a CSV file as input and attempts to classify it using a random forest algorithm.

#### Results #1, No Hyperparameters (ended up being the same with optimized parameters)
Overall Results
Train acc:  1.0
Test acc:  0.985

In [61]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

In [55]:
df = pd.read_csv('./heart.csv')
display(df.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [81]:
# Split data into features and target
y = df['thal']
x = df.drop(['target', 'thal'], axis=1)

# Get test and train split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)
display(xtrain.head())
display(ytrain.head())

(820, 12) (205, 12) (820,) (205,)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca
315,42,1,3,148,244,0,0,178,0,0.8,2,2
204,66,0,2,146,278,0,0,152,0,0.0,1,1
363,53,1,2,130,246,1,0,173,0,0.0,2,3
5,58,0,0,100,248,0,0,122,0,1.0,1,0
1017,53,1,0,123,282,0,1,95,1,2.0,1,2


315     2
204     2
363     2
5       2
1017    3
Name: thal, dtype: int64

In [74]:
# Define some test parameters
parameters = {
    'n_estimators': (50, 70, 100, 120, 150),
    'criterion': ('gini', 'entropy'),
    'max_depth': (5, 7, 9, 11, 13),
    'max_features': ('auto', 'sqrt'),
    'min_samples_split': (2, 4, 6)
}

# Create the model, use grid search to find the best parameters
grid = GridSearchCV(RandomForestClassifier(n_jobs = -1, oob_score=False), param_grid=parameters, cv=3, verbose=True)
grid_model = grid.fit(xtrain, ytrain)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [75]:
# Display the best parameters
grid.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=11, n_estimators=120,
                       n_jobs=-1)

In [86]:
# 1: max_depth=11, max_features='sqrt', n_estimators=150, n_jobs=-1
model = RandomForestClassifier(n_estimators=20, random_state=101, max_depth=5)
model.fit(xtrain, ytrain)

RandomForestClassifier(max_depth=5, n_estimators=20, random_state=101)

In [87]:
predicted = model.predict(xtest)
acc = accuracy_score(ytest, predicted)
acc

0.8682926829268293