# Heart Attack Analytics & Prediction

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc

In [2]:
df = pd.read_csv('heart_data.csv')

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [6]:
print(df.shape)

(918, 12)


## Data exploration

In [7]:
import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.express as px

color_discrete_map = {0: 'red', 1: 'blue'}

# Create subplot figure
fig = sp.make_subplots(rows=3, cols=3, subplot_titles=[
    'Age Distribution', 'Sex Distribution', 'Exercise Induced Angina',
    'Chest Pain Type', 'Resting Blood Pressure by Target',
    'Fasting Blood Sugar', 'Resting Electrocardiographic Results'
])

# Add Age Histogram
age_hist = px.histogram(df, x='Age', color='HeartDisease', color_discrete_map=color_discrete_map, marginal="box")
for trace in age_hist.data:
    fig.add_trace(trace, row=1, col=1)

# Add Sex Count Plot
sex_hist = px.histogram(df, x='Sex', color='HeartDisease', color_discrete_map=color_discrete_map)
for trace in sex_hist.data:
    fig.add_trace(trace, row=1, col=2)

# Add Exercise Induced Angina Count Plot
exng_hist = px.histogram(df, x='ExerciseAngina', color='HeartDisease', color_discrete_map=color_discrete_map)
for trace in exng_hist.data:
    fig.add_trace(trace, row=1, col=3)

# Add Chest Pain Type Count Plot
cp_hist = px.histogram(df, x='ChestPainType', color='HeartDisease', color_discrete_map=color_discrete_map)
for trace in cp_hist.data:
    fig.add_trace(trace, row=2, col=1)

# Add Resting Blood Pressure Box Plot by Target
trtbps_box = px.box(df, x='HeartDisease', y='RestingBP', color='HeartDisease', color_discrete_map=color_discrete_map)
for trace in trtbps_box.data:
    fig.add_trace(trace, row=2, col=2)

# Add Fasting Blood Sugar Count Plot
fbs_hist = px.histogram(df, x='FastingBS', color='HeartDisease', color_discrete_map=color_discrete_map)
for trace in fbs_hist.data:
    fig.add_trace(trace, row=2, col=3)

# Add Resting Electrocardiographic Results Count Plot
restecg_hist = px.histogram(df, x='RestingECG', color='HeartDisease', color_discrete_map=color_discrete_map)
for trace in restecg_hist.data:
    fig.add_trace(trace, row=3, col=1)

# Update layout for the subplot figure
fig.update_layout(height=900, width=900, title_text="Heart Disease Data Visualization", showlegend=False)

# Show plot
fig.show()

## Binary Column

In [8]:
# take the data type that is object 

object_col = df.select_dtypes(include='object').columns

# turn object_col to binary data

df = pd.get_dummies(df, columns=object_col, drop_first=True)

In [9]:
df.sample(5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
660,58,140,211,1,165,0.0,0,1,0,1,0,0,0,0,0,1
495,64,142,276,0,140,1.0,1,0,0,0,0,1,0,1,1,0
268,54,130,242,0,91,1.0,1,1,0,0,0,1,0,1,1,0
634,40,140,199,0,178,1.4,0,1,0,0,1,1,0,1,0,1
612,55,122,223,1,100,0.0,1,1,0,0,0,0,1,0,1,0


In [10]:
df.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease', 'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

## Split data

In [11]:
from sklearn.model_selection import train_test_split

X = df.drop(['HeartDisease'], axis=1)
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size= 0.2, 
    stratify=y,
    random_state=123)

## Rescaling

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

In [13]:
from collections import Counter 

def class_perc(df): 
    lendata = len(df) 
    classes = Counter(df) 
    
    for sclass, freq in classes.items(): 
        perc = (freq / lendata) * 100 
        print(f"Class '{sclass}': {perc:.2f}%")  

class_perc(df['HeartDisease']) 

Class '0': 44.66%
Class '1': 55.34%


## Decision Tree

In [14]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

model = DecisionTreeClassifier() # by default Gini index, no random replication
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accDT=accuracy_score(y_test, predictions)
print("Accuracy test set Decision Trees (not pruned) = {0:.2%}".format(accDT))
print(classification_report(y_test, predictions))


Accuracy test set Decision Trees (not pruned) = 78.26%
              precision    recall  f1-score   support

           0       0.74      0.78      0.76        82
           1       0.82      0.78      0.80       102

    accuracy                           0.78       184
   macro avg       0.78      0.78      0.78       184
weighted avg       0.78      0.78      0.78       184



In [15]:
feature_names = X_train.columns.tolist()

importance = model.feature_importances_

# Summarize feature importance
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Get top 5 most important features
top_features = feature_importance.head(5)

# Plot feature importance using Plotly
fig = px.bar(top_features, x='Feature', y='Importance', title='Top 5 Most Important Features')
fig.show()

In [16]:
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [17]:
confusion_matrix(y_test, model.predict(X_test))

array([[64, 18],
       [22, 80]])

In [18]:
fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.21951219512195122, 1.0]
[0.0, 0.7843137254901961, 1.0]
0.7824007651841224


## Decision Tree Pruned

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

param_grid = {
    'classifier__max_depth': range(1, 20),
    'classifier__min_samples_split': range(2, 10),
    'classifier__min_samples_leaf': range(1, 10),
    'classifier__max_features': [None, 'sqrt', 'log2']
}


pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier(criterion="entropy"))
])

cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=cv_strategy, n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Train the final model with the best parameters
best_params = grid_search.best_params_
tree_pruned = DecisionTreeClassifier(
    max_depth=best_params["classifier__max_depth"],
    min_samples_split=best_params["classifier__min_samples_split"],
    min_samples_leaf=best_params["classifier__min_samples_leaf"],
    max_features=best_params["classifier__max_features"],
    criterion="entropy"
)
tree_pruned.fit(X_train, y_train)

# Evaluate the model
predictions = tree_pruned.predict(X_test)
accDTGrid = accuracy_score(y_test, predictions)
print("Accuracy of Test set (pruned) = {0:.2%}".format(accDTGrid))
print(classification_report(y_test, predictions))

Fitting 10 folds for each of 4104 candidates, totalling 41040 fits
Best parameters found:  {'classifier__max_depth': 6, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 9, 'classifier__min_samples_split': 8}
Best cross-validation score: 0.84
Accuracy of Test set (pruned) = 82.07%
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        82
           1       0.84      0.83      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.82      0.82      0.82       184



In [20]:
# print tree_pruned confusion matrix

confusion_matrix(y_test, tree_pruned.predict(X_test))

array([[66, 16],
       [17, 85]])

In [21]:
y_pred_proba = tree_pruned.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [22]:
fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.036585365853658534, 0.036585365853658534, 0.06097560975609756, 0.10975609756097561, 0.13414634146341464, 0.13414634146341464, 0.15853658536585366, 0.17073170731707318, 0.1951219512195122, 0.23170731707317074, 0.3048780487804878, 0.34146341463414637, 0.34146341463414637, 0.6097560975609756, 0.7560975609756098, 1.0]
[0.0, 0.21568627450980393, 0.27450980392156865, 0.3235294117647059, 0.3333333333333333, 0.35294117647058826, 0.3627450980392157, 0.5980392156862745, 0.6274509803921569, 0.6862745098039216, 0.7549019607843137, 0.7745098039215687, 0.8333333333333334, 0.8627450980392157, 0.8725490196078431, 0.8823529411764706, 0.9509803921568627, 0.9901960784313726, 0.9901960784313726, 1.0]
0.8748804399808704


## Bagging

In [23]:
bgg_model = RandomForestClassifier(n_estimators=200, oob_score=True) # no max_features option=Bagging
bgg_model.fit(X_train, y_train)
accBagg=bgg_model.oob_score_
print("Accuracy OOB set - Bagging = {0:.2%}".format(accBagg))
print(classification_report(y_test, bgg_model.predict(X_test)))

Accuracy OOB set - Bagging = 87.33%
              precision    recall  f1-score   support

           0       0.86      0.85      0.86        82
           1       0.88      0.89      0.89       102

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.88      0.87       184



In [24]:
y_pred_proba = bgg_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [25]:
confusion_matrix(y_test, bgg_model.predict(X_test))

array([[70, 12],
       [11, 91]])

In [26]:
fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.04878048780487805, 0.04878048780487805, 0.04878048780487805, 0.04878048780487805, 0.04878048780487805, 0.06097560975609756, 0.06097560975609756, 0.08536585365853659, 0.08536585365853659, 0.0975609756097561, 0.0975609756097561, 0.10975609756097561, 0.10975609756097561, 0.10975609756097561, 0.12195121951219512, 0.12195121951219512, 0.13414634146341464, 0.13414634146341464, 0.14634146341463414, 0.14634146341463414, 0.15853658536585366, 0.15853658536585366, 0.18292682926829268, 0.18292682926829268, 0.2073170731707317, 0.2073170731707317, 0.21951219512195122, 0.24390243902439024, 0.280487804

## Bagging Grid

In [27]:
param_grid = {'min_samples_leaf' :[2,3,4,5,6]}

BaggGrid = GridSearchCV(bgg_model, param_grid=param_grid, cv=20, scoring='accuracy', verbose=1) 
BaggGrid.fit(X_train,y_train)
print("Best param: ", BaggGrid.best_params_)
BaggGrid_model = RandomForestClassifier(n_estimators = 200, oob_score = True,
                               min_samples_leaf = BaggGrid.best_params_["min_samples_leaf"])
BaggGrid_model.fit(X_train,y_train)
accBaggGrid=BaggGrid_model.oob_score_
print("Accuracy OOB set - Bagging with Grid Searching: {0:.2%}".format(accBaggGrid))
print(classification_report(y_test, BaggGrid_model.predict(X_test)))

Fitting 20 folds for each of 5 candidates, totalling 100 fits
Best param:  {'min_samples_leaf': 3}
Accuracy OOB set - Bagging with Grid Searching: 87.06%
              precision    recall  f1-score   support

           0       0.86      0.87      0.86        82
           1       0.89      0.88      0.89       102

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.88      0.88      0.88       184



In [28]:
confusion_matrix(y_test, BaggGrid_model.predict(X_test))

array([[71, 11],
       [12, 90]])

In [29]:
y_pred_proba = BaggGrid_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [30]:
fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.0, 0.0, 0.012195121951219513, 0.012195121951219513, 0.024390243902439025, 0.024390243902439025, 0.036585365853658534, 0.036585365853658534, 0.04878048780487805, 0.04878048780487805, 0.06097560975609756, 0.06097560975609756, 0.07317073170731707, 0.07317073170731707, 0.08536585365853659, 0.08536585365853659, 0.10975609756097561, 0.10975609756097561, 0.12195121951219512, 0.12195121951219512, 0.14634146341463414, 0.14634146341463414, 0.15853658536585366, 0.15853658536585366, 0.1951219512195122, 0.1951219512195122, 0.2926829268292683, 0.2926829268292683, 0.3048780487804878, 0.3048780487804878, 0.3170731707317073, 0.3170731707317073, 0.32926829268292684, 0.32926829268292684, 0.524390243902439, 0.524390243902439, 0.5853658536585366, 0.5853658536585366, 1.0]
[0.0, 0.00980392156862745, 0.029411764705882353, 0.029411764705882353, 0.16666666666666666, 0.16666666666666666, 0.6078431372549019, 0.6078431372549019, 0.6470588235294118, 0.6470588235294118, 0.7058823529411765, 0.7058823529411765

## Random Forest

In [31]:
rf_model = RandomForestClassifier(n_estimators=200, oob_score=True, max_features=8) # 62 explanatories
rf_model.fit(X_train, y_train)
accRF=rf_model.oob_score_
print("Accuracy OOB set Random Forests = {0:.2%}".format(accRF))
print(classification_report(y_test, rf_model.predict(X_test)))

Accuracy OOB set Random Forests = 86.38%
              precision    recall  f1-score   support

           0       0.85      0.82      0.83        82
           1       0.86      0.88      0.87       102

    accuracy                           0.85       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184



In [32]:
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [48]:
confusion_matrix(y_test, rf_model.predict(X_test))

array([[67, 15],
       [12, 90]])

In [33]:

fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.04878048780487805, 0.04878048780487805, 0.06097560975609756, 0.06097560975609756, 0.06097560975609756, 0.07317073170731707, 0.07317073170731707, 0.08536585365853659, 0.08536585365853659, 0.0975609756097561, 0.0975609756097561, 0.10975609756097561, 0.12195121951219512, 0.12195121951219512, 0.13414634146341464, 0.14634146341463414, 0.14634146341463414, 0.18292682926829268, 0.18292682926829268, 0.1951219512195122, 0.1951219512195122, 0.24390243902439024, 0.2682926829268293, 0.2682926829268293, 0.3048780487

## RF Grid

In [34]:
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score

param_grid = {
    'min_samples_leaf' :[1,2,4,6,8,10], 
    'max_features' : [5,10,15,20,25],
}

RFGrid = GridSearchCV(rf_model, param_grid=param_grid, cv=10, scoring=accuracy_score, verbose=1) 
RFGrid.fit(X_train,y_train)
print("Best param: ", RFGrid.best_params_)
rfgrid_model = RandomForestClassifier(n_estimators=200, oob_score=True, 
                               max_features=RFGrid.best_params_["max_features"], 
                               min_samples_leaf=RFGrid.best_params_["min_samples_leaf"])
rfgrid_model.fit(X_train,y_train)
accRFGrid=rfgrid_model.oob_score_
print("Accuracy OOB set - RF with Grid Searching: {0:.2%}".format(accRFGrid))

Fitting 10 folds for each of 30 candidates, totalling 300 fits



Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 191, in wrapper
    params = func_sig.bind(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/inspect.py", line 3212, in bind
    return self._bind(args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/inspect.py", line 3138, in _bind
    raise TypeError(
TypeError: too many positional arguments




Best param:  {'max_features': 5, 'min_samples_leaf': 1}
Accuracy OOB set - RF with Grid Searching: 86.65%


In [35]:
y_pred_proba = rfgrid_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [36]:
confusion_matrix(y_test, rf_model.predict(X_test))

array([[67, 15],
       [12, 90]])

In [37]:
fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.0, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.012195121951219513, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.024390243902439025, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.036585365853658534, 0.04878048780487805, 0.06097560975609756, 0.06097560975609756, 0.07317073170731707, 0.07317073170731707, 0.07317073170731707, 0.07317073170731707, 0.0975609756097561, 0.0975609756097561, 0.10975609756097561, 0.10975609756097561, 0.12195121951219512, 0.12195121951219512, 0.13414634146341464, 0.13414634146341464, 0.15853658536585366, 0.15853658536585366, 0.17073170731707318, 0.1951219512195122, 0.1951219512195122, 0.2073170731707317, 0.23170731707317074, 0.28

## XGBoost

In [38]:
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


xgb = XGBClassifier(n_estimators=200)
xgb.fit(X_train, y_train)
predictions_XGBC = xgb.predict(X_test)
accBoost = accuracy_score(y_test, predictions_XGBC)
print("Accuracy test set of XGBClassifier: {0:.2%}".format(accBoost))
print(classification_report(y_test, predictions_XGBC))

Accuracy test set of XGBClassifier: 88.59%
              precision    recall  f1-score   support

           0       0.88      0.87      0.87        82
           1       0.89      0.90      0.90       102

    accuracy                           0.89       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.89      0.89      0.89       184



In [39]:
y_pred_proba = xgb.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [49]:
confusion_matrix(y_test, xgb.predict(X_test))

array([[71, 11],
       [10, 92]])

In [40]:


fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.0, 0.0, 0.012195121951219513, 0.012195121951219513, 0.024390243902439025, 0.024390243902439025, 0.04878048780487805, 0.04878048780487805, 0.06097560975609756, 0.06097560975609756, 0.07317073170731707, 0.07317073170731707, 0.0975609756097561, 0.0975609756097561, 0.10975609756097561, 0.10975609756097561, 0.13414634146341464, 0.13414634146341464, 0.14634146341463414, 0.14634146341463414, 0.2073170731707317, 0.2073170731707317, 0.21951219512195122, 0.21951219512195122, 0.23170731707317074, 0.23170731707317074, 0.25609756097560976, 0.25609756097560976, 0.2682926829268293, 0.2682926829268293, 0.2804878048780488, 0.2804878048780488, 0.3048780487804878, 0.3048780487804878, 0.5121951219512195, 0.5121951219512195, 0.9512195121951219, 0.9512195121951219, 1.0]
[0.0, 0.00980392156862745, 0.13725490196078433, 0.13725490196078433, 0.4117647058823529, 0.4117647058823529, 0.5686274509803921, 0.5686274509803921, 0.7745098039215687, 0.7745098039215687, 0.8235294117647058, 0.8235294117647058, 0.86

## XGBoost Grid

In [41]:
xgb_grid = XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'subsample': [0.6, 0.7, 1.0],
    'max_depth': [2, 3, 5, 7, 9], 
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'min_child_weight': [1, 3, 5],
    'colsample_bytree': [0.3, 0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5]
}

# Initialize GridSearchCV with StratifiedKFold
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=xgb_grid, param_grid=param_grid, scoring='accuracy', cv=cv_strategy, n_jobs=-1, verbose=3)

grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

best_params = grid_search.best_params_
xgboost_model = XGBClassifier(n_estimators=200, 
                            subsample=best_params["subsample"],
                            max_depth=best_params["max_depth"],
                            learning_rate=best_params["learning_rate"],
                            min_child_weight=best_params["min_child_weight"],
                            colsample_bytree=best_params["colsample_bytree"],
                            gamma=best_params["gamma"],
                            use_label_encoder=False, eval_metric='logloss')
xgboost_model.fit(X_train, y_train)

# Evaluate the model
predictions_XGB_grid = xgboost_model.predict(X_test)
accBoost_grid = accuracy_score(y_test, predictions_XGB_grid)
print("Accuracy test set of XGB Grid: {0:.2%}".format(accBoost_grid))

Fitting 5 folds for each of 2880 candidates, totalling 14400 fits
[CV 1/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001, max_depth=2, min_child_weight=1, subsample=0.6;, score=0.728 total time=   0.0s
[CV 3/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001, max_depth=2, min_child_weight=1, subsample=0.6;, score=0.646 total time=   0.0s
[CV 5/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001, max_depth=2, min_child_weight=1, subsample=0.6;, score=0.685 total time=   0.0s
[CV 4/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001, max_depth=2, min_child_weight=1, subsample=0.6;, score=0.673 total time=   0.0s
[CV 2/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001, max_depth=2, min_child_weight=1, subsample=0.6;, score=0.707 total time=   0.0s
[CV 1/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001, max_depth=2, min_child_weight=1, subsample=0.7;, score=0.721 total time=   0.0s
[CV 5/5] END colsample_bytree=0.3, gamma=0, learning_rate=0.001,

In [42]:
print(classification_report(y_test, predictions_XGB_grid))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86        82
           1       0.88      0.89      0.89       102

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.88      0.87       184



In [43]:
importance = xgboost_model.feature_importances_

# Summarize feature importance
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Get top 5 most important features
top_features = feature_importance.head(5)

# Plot feature importance using Plotly
fig = px.bar(top_features, x='Feature', y='Importance', title='Top 5 Most Important Features')
fig.show()

In [44]:
y_pred_proba = xgboost_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='No Skill', line=dict(dash='dash')))

fig.update_layout(
    title='ROC Curve for Heart Disease Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True
)

# Show the plot
fig.show()

In [45]:
print(fpr)

[0.         0.         0.         0.01219512 0.01219512 0.02439024
 0.02439024 0.03658537 0.03658537 0.04878049 0.04878049 0.06097561
 0.06097561 0.07317073 0.07317073 0.08536585 0.08536585 0.09756098
 0.09756098 0.14634146 0.14634146 0.23170732 0.23170732 0.24390244
 0.24390244 0.26829268 0.26829268 0.53658537 0.53658537 0.58536585
 0.58536585 0.6097561  0.6097561  1.        ]


In [46]:
fpr = fpr.tolist()
tpr = tpr.tolist()

print(fpr)
print(tpr)
print(roc_auc)

[0.0, 0.0, 0.0, 0.012195121951219513, 0.012195121951219513, 0.024390243902439025, 0.024390243902439025, 0.036585365853658534, 0.036585365853658534, 0.04878048780487805, 0.04878048780487805, 0.06097560975609756, 0.06097560975609756, 0.07317073170731707, 0.07317073170731707, 0.08536585365853659, 0.08536585365853659, 0.0975609756097561, 0.0975609756097561, 0.14634146341463414, 0.14634146341463414, 0.23170731707317074, 0.23170731707317074, 0.24390243902439024, 0.24390243902439024, 0.2682926829268293, 0.2682926829268293, 0.5365853658536586, 0.5365853658536586, 0.5853658536585366, 0.5853658536585366, 0.6097560975609756, 0.6097560975609756, 1.0]
[0.0, 0.00980392156862745, 0.029411764705882353, 0.029411764705882353, 0.46078431372549017, 0.46078431372549017, 0.6078431372549019, 0.6078431372549019, 0.6176470588235294, 0.6176470588235294, 0.6666666666666666, 0.6666666666666666, 0.7941176470588235, 0.7941176470588235, 0.803921568627451, 0.803921568627451, 0.8137254901960784, 0.8137254901960784, 0.

## Result

In [47]:
import plotly.graph_objects as go

# Results from the analysis
models = ['Decision Trees', 'Decision Trees (pruned)', 'Bagging', 'Bagging (Grid Search)', 'Random Forests', 
          'Random Forests (Grid Search)', 'XGBClassifier', 'XGBClassifier (Grid Search)']
accuracy = [accDT, accDTGrid, accBagg, accBaggGrid, accRF, accRFGrid, accBoost, accBoost_grid]

# Assign a unique color to each model
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']

# Create a bar chart using Plotly
fig = go.Figure(data=[
    go.Bar(name='Accuracy', x=models, y=accuracy, marker=dict(color=colors))
])

# Update layout for better visualization
fig.update_layout(
    title='Accuracy of Different Models',
    xaxis_title='Models',
    yaxis_title='Accuracy',
    xaxis_tickangle=-45,
    yaxis_tickformat='.2%',
    height=600
)

# Show the plot
fig.show()

XGBoost has the best result