### Random Forest Classification
This program dates a CSV file as input and attempts to classify it using a random forest algorithm.

#### Results #1, No Hyperparameters (ended up being the same with optimized parameters)
Overall Results
Train acc:  1.0
Test acc:  0.985

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [40]:
df = pd.read_csv('./heart.csv')
display(df.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [42]:
# Split data into features and target
y = df['thal']
x = df.drop(['target', 'thal'], axis=1)

# Get test and train split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=101)

display(xtrain.head())
display(ytrain.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca
900,61,1,3,134,234,0,1,145,0,2.6,1,2
713,66,0,3,150,226,0,1,114,0,2.6,0,0
971,52,1,2,172,199,1,1,162,0,0.5,2,0
798,59,1,3,170,288,0,0,159,0,0.2,1,0
727,56,1,1,130,221,0,0,163,0,0.0,2,0


900    2
713    2
971    3
798    3
727    3
Name: thal, dtype: int64

In [43]:
# Define some test parameters
parameters = {
    'n_estimators': (50, 70, 100, 120, 150),
    'criterion': ('gini', 'entropy'),
    'max_depth': (5, 7, 9, 11, 13),
    'max_features': ('auto', 'sqrt'),
    'min_samples_split': (2, 4, 6)
}

# Create the model, use grid search to find the best parameters
grid = GridSearchCV(RandomForestClassifier(n_jobs = -1, oob_score=False), param_grid=parameters, cv=3, verbose=True)
grid_model = grid.fit(xtrain, ytrain)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [44]:
# Display the best parameters
grid.best_estimator_

RandomForestClassifier(max_depth=11, max_features='sqrt', n_estimators=150,
                       n_jobs=-1)

In [45]:
# 1: max_depth=11, max_features='sqrt', n_estimators=150, n_jobs=-1
model = RandomForestClassifier(max_depth=11, max_features='sqrt', n_estimators=150, n_jobs=-1)
model.fit(xtrain, ytrain)

RandomForestClassifier(max_depth=11, max_features='sqrt', n_estimators=150,
                       n_jobs=-1)

In [46]:
# Calculates the models results on certain data
def results(label, xtest, ytest):
    print('\n', label)
    print('Train acc: ', np.round(model.score(xtrain, ytrain), 3))
    print('Test acc: ', np.round(model.score(xtest, ytest), 3))

In [47]:
# Outputs those results
results("Overall Results", xtest, ytest)


 Overall Results
Train acc:  1.0
Test acc:  0.985
