In [285]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from scipy import stats
from sklearn.svm import SVC
from catboost import CatBoostClassifier
import lightgbm as lgb


import plotly.express as px
import plotly.graph_objects as go

sns.set()

In [286]:
#obtaining my dataframe.
df = pd.read_csv('C:/Users/Samuel Kim/Downloads/heart.csv')
print(df)

     Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0     40   M           ATA        140          289          0     Normal   
1     49   F           NAP        160          180          0     Normal   
2     37   M           ATA        130          283          0         ST   
3     48   F           ASY        138          214          0     Normal   
4     54   M           NAP        150          195          0     Normal   
..   ...  ..           ...        ...          ...        ...        ...   
913   45   M            TA        110          264          0     Normal   
914   68   M           ASY        144          193          1     Normal   
915   57   M           ASY        130          131          0     Normal   
916   57   F           ATA        130          236          0        LVH   
917   38   M           NAP        138          175          0     Normal   

     MaxHR ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0      172              N  

In [287]:
scores=[]
cv_scores=[]
models_name=['LogisticRegression','RandomForest','SVC','CatBoost','LGBM']

In [288]:
#Get dummies
df_e=pd.get_dummies(df)

#Define inputs & Output
X=df_e.drop(['HeartDisease'], axis=1)
y=df_e['HeartDisease']

#Normalize
scaler=StandardScaler()
X=scaler.fit_transform(X)

#Split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [289]:
#Parameter Tuning
#Param grid
parameters = {'n_estimators':[10,15,16,18,20,25,40,50,180,200],
              'max_depth':[5,7,8,9,10,12,15,16,18,19],
             #'min_samples_split':[2,3,5,8],
             #'min_samples_leaf':[2,3,5,8],
             }

#Define grid search
clf = GridSearchCV(RandomForestClassifier(), parameters)

In [290]:
#CV function
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return np.mean(scores)

In [291]:
scores=[]
cv_scores=[]
models_name=['LogisticRegression','RandomForest','SVC','CatBoost','LGBM']

In [292]:
#Logistic Regression
Model_LR=LogisticRegression(solver='liblinear')

#Fit and predict
Model_LR.fit(X_train,y_train)
y_pred=Model_LR.predict(X_test)

#Get scores
scores.append(accuracy_score(y_pred,y_test))
cv_scores.append(evaluate_model(Model_LR,X,y))

In [293]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86       119
           1       0.88      0.92      0.90       157

    accuracy                           0.88       276
   macro avg       0.88      0.87      0.88       276
weighted avg       0.88      0.88      0.88       276



In [294]:
#Random Forest Classification
Model_RF=RandomForestClassifier(n_estimators=200,max_depth=5,
                                min_samples_split=2,min_samples_leaf=2,
                                criterion='entropy')

#Fit and predict
Model_RF.fit(X_train,y_train)
y_pred=Model_RF.predict(X_test)

#Get scores
scores.append(accuracy_score(y_pred,y_test))
cv_scores.append(evaluate_model(Model_RF,X,y))

In [295]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86       116
           1       0.89      0.91      0.90       160

    accuracy                           0.88       276
   macro avg       0.88      0.88      0.88       276
weighted avg       0.88      0.88      0.88       276



In [296]:
Model_SVC=SVC(gamma='scale')

#Fit and predict
Model_SVC.fit(X_train,y_train)
y_pred=Model_SVC.predict(X_test)

#Get scores
scores.append(accuracy_score(y_pred,y_test))
cv_scores.append(evaluate_model(Model_SVC,X,y))
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87       113
           1       0.91      0.91      0.91       163

    accuracy                           0.89       276
   macro avg       0.89      0.89      0.89       276
weighted avg       0.89      0.89      0.89       276



In [297]:
Model_CatBoost=CatBoostClassifier(verbose=False)
#Fit and predict
Model_CatBoost.fit(X_train,y_train,eval_set=(X_test, y_test))
y_pred=Model_CatBoost.predict(X_test)

#Get scores
scores.append(accuracy_score(y_pred,y_test))
cv_scores.append(evaluate_model(Model_CatBoost,X,y))
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       114
           1       0.91      0.93      0.92       162

    accuracy                           0.91       276
   macro avg       0.90      0.90      0.90       276
weighted avg       0.91      0.91      0.91       276



In [298]:
Model_lgb=lgb.LGBMClassifier()

#Fit and predict
Model_lgb.fit(X_train,y_train)
y_pred=Model_lgb.predict(X_test)

#Get scores
scores.append(accuracy_score(y_pred,y_test))
cv_scores.append(evaluate_model(Model_lgb,X,y))
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       117
           1       0.88      0.91      0.89       159

    accuracy                           0.87       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.87      0.87      0.87       276



In [299]:
pd.DataFrame({'Model':models_name,'Validation Score':scores,'Cross_Validation Score':cv_scores}).style.background_gradient()

Unnamed: 0,Model,Validation Score,Cross_Validation Score
0,LogisticRegression,0.880435,0.864496
1,RandomForest,0.884058,0.8719
2,SVC,0.894928,0.86623
3,CatBoost,0.905797,0.877568
4,LGBM,0.873188,0.865816
