In [63]:
import pandas as pd
import numpy as np
import json
import time
from itertools import combinations

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

#models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import joblib

#metrics
from sklearn.metrics import f1_score, recall_score, accuracy_score, roc_auc_score, classification_report, confusion_matrix

In [90]:
features_to_keep = ['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall'] # all

# features_to_keep = ['cp', 'trtbps', 'exng', 'slp', 'caa', 'thall'] #subset best on xgboost and rf


In [91]:
df = pd.read_csv("heart.csv")
df.columns = df.columns.str.strip()

print(df.columns)

X = df[features_to_keep]
y = df["output"]

smote = SMOTE(random_state=32, sampling_strategy=0.91)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=32, stratify=y_resampled
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "scaler_cp_trtbps_exng_slp_caa_thall.pkl")

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')


['scaler_cp_trtbps_exng_slp_caa_thall.pkl']

In [98]:
rf = RandomForestClassifier(n_estimators=200, max_depth=5, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred))
joblib.dump(rf, "RandomForest_model.pkl")

0.8571428571428571


['RandomForest_model.pkl']

In [86]:
print("On test dataset")

print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

On test dataset
[[27  3]
 [ 3 30]]
0.9047619047619048
0.9045454545454545


In [37]:
y_pred2 = rf.predict(X_train_scaled)

print("On train dataset")

print(confusion_matrix(y_train, y_pred2))
print(accuracy_score(y_train, y_pred2))
print(roc_auc_score(y_train, y_pred2))



On train dataset
[[106  14]
 [ 11 121]]
0.9007936507936508
0.8999999999999999


In [39]:
y_pred3 = rf.predict(scaler.fit_transform(X))

print("On whole dataset")

print(confusion_matrix(y_pred3, y))
print(accuracy_score(y_pred3, y))
print(roc_auc_score(y_pred3, y))


On whole dataset
[[122  14]
 [ 16 151]]
0.900990099009901
0.9006252201479396
