# ❤️ Heart Disease Prediction using Machine Learning (Scikit-learn Pipeline)

# Importing Necessary Libraries 

In [70]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler  ,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve)


# Loading the dataset

In [71]:
# Read the heart-disease table from the CSV file in the working directory
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Heart_dataset.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Inspecting the dataset

In [72]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1918, 12)

In [74]:
# Separate the target column from the predictors.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')
print(f'Shape of X :{X.shape} , Shape of y is {y.shape}')

Shape of X :(1918, 11) , Shape of y is (1918,)


In [75]:
# Numerical features: every predictor stored as float64 or int64.
num_cols = list(X.select_dtypes(include=['float64', 'int64']).columns)
num_cols

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

In [76]:
# Categorical features: every predictor stored with the object dtype.
cat_cols = list(X.select_dtypes(include=['object']).columns)
cat_cols

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Building a PREPROCESSING PIPELINE

In [77]:
# Numerical columns: fill missing values with the column median, then
# standardise with StandardScaler.
num_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# One-hot encoded categorical columns: fill missing values with the most
# frequent category. handle_unknown='ignore' means a category that was not
# seen during fit is encoded as all zeros instead of raising.
cat_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


def _ordinal_pipeline(ordered_levels):
    """Build an impute-then-ordinal-encode pipeline for a single column.

    Parameters
    ----------
    ordered_levels : list of str
        Category labels in the order they should be mapped to 0, 1, 2, ...

    Returns
    -------
    Pipeline
        Steps ``'imputer'`` (most-frequent imputation) and ``'ordinal'``
        (OrdinalEncoder restricted to ``ordered_levels``).

    Notes
    -----
    A label outside ``ordered_levels`` is encoded as -1 at transform time
    rather than raising, which keeps this encoder consistent with the
    one-hot branch (``handle_unknown='ignore'``). During ``fit`` nothing
    changes for data that only contains the listed labels.
    Requires scikit-learn >= 0.24 for ``handle_unknown='use_encoded_value'``.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(
            categories=[ordered_levels],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        )),
    ])


# Chest pain type, encoded in the fixed order ATA < NAP < TA < ASY.
cp_transformer = _ordinal_pipeline(['ATA', 'NAP', 'TA', 'ASY'])

# ST slope, encoded in the fixed order Down < Flat < Up.
slope_trans = _ordinal_pipeline(['Down', 'Flat', 'Up'])

# COLUMN TRANSFORMER
# Routes each group of columns to its own sub-pipeline. Columns not listed
# here are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer([
    ('numerical', num_trans,
     ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']),
    ('one_hot', cat_trans, ['Sex', 'RestingECG', 'ExerciseAngina']),
    ('cp_ord', cp_transformer, ['ChestPainType']),
    ('slope_ord', slope_trans, ['ST_Slope']),
])

# Splitting the dataset

In [79]:
# Hold out 20% of the rows for testing. stratify=y keeps the class
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [82]:
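# Best parameters (presumably from a hyper-parameter search whose cell is not included in this notebook — TODO confirm): {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}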

# Model Training 

In [83]:
from sklearn.ensemble import RandomForestClassifier


def _print_metrics(banner, y_true, y_pred, y_prob):
    """Print the evaluation summary for one data split.

    Parameters
    ----------
    banner : str
        Heading line printed before the metrics.
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Predicted class labels for the split.
    y_prob : array-like
        Predicted probability of the positive class, used only for AUC-ROC.
    """
    print(banner)
    # Each entry: (printed label, metric function, estimate it is computed from).
    for label, scorer, estimate in (
        ("Accuracy :", accuracy_score, y_pred),
        ("Precision:", precision_score, y_pred),
        ("Recall   :", recall_score, y_pred),
        ("F1 Score :", f1_score, y_pred),
        ("AUC-ROC  :", roc_auc_score, y_prob),
    ):
        print(label, scorer(y_true, estimate))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))


# Preprocessing followed by a random forest. The hyper-parameters are the
# same values listed in the comment of the preceding cell.
final_pipe = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        max_features='sqrt',
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
    ),
)
final_pipe.fit(X_train, y_train)

# ----------------------------
# TRAINING METRICS
# ----------------------------
y_train_pred = final_pipe.predict(X_train)
# Column 1 of predict_proba is the probability of class 1.
y_train_prob = final_pipe.predict_proba(X_train)[:, 1]
_print_metrics(
    "\n========== TRAINING METRICS ==========\n",
    y_train,
    y_train_pred,
    y_train_prob,
)

# ----------------------------
# TESTING METRICS
# ----------------------------
y_test_pred = final_pipe.predict(X_test)
y_test_prob = final_pipe.predict_proba(X_test)[:, 1]
_print_metrics(
    "\n========== TESTING METRICS ==========\n",
    y_test,
    y_test_pred,
    y_test_prob,
)



Accuracy : 0.9719687092568449
Precision: 0.9693530079455165
Recall   : 0.9816091954022989
F1 Score : 0.9754426042261565
AUC-ROC  : 0.9977028804874671

Confusion Matrix:
 [[637  27]
 [ 16 854]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       664
           1       0.97      0.98      0.98       870

    accuracy                           0.97      1534
   macro avg       0.97      0.97      0.97      1534
weighted avg       0.97      0.97      0.97      1534



Accuracy : 0.9140625
Precision: 0.918552036199095
Recall   : 0.9311926605504587
F1 Score : 0.9248291571753986
AUC-ROC  : 0.959323532662761

Confusion Matrix:
 [[148  18]
 [ 15 203]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90       166
           1       0.92      0.93      0.92       218

    accuracy                           0.91       384
   macro avg       0.91   

In [84]:
# Display the fitted pipeline object as this cell's output.
final_pipe

In [85]:
# Persist the fitted pipeline (preprocessing + classifier) to disk so it can
# be reloaded later with joblib.load. Only load such files from trusted
# sources: joblib files are pickle-based.
import joblib

joblib.dump(value=final_pipe, filename="heart_model.pkl")

['heart_model.pkl']