In [5]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, matthews_corrcoef)

Load the cleaned data

In [4]:
df = pd.read_csv('Heart_Disease_Cleaned.csv')
df.head(5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,target
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2,1
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,0


In [3]:
# 2Define Features (X) and Target (y)
X = df.drop(['num', 'target'], axis=1) 
y = df['target']

# Identify column types automatically
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Preprocessor
# Note: sparse_output=False is required for Naive Bayes to work in a pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ])

# Initialize the 6 Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Create directory for saved models
if not os.path.exists('model'):
    os.makedirs('model')

# 7. Training and Evaluation Loop
results_list = []

print(f"{'Model':<20} | {'Acc':<6} | {'AUC':<6} | {'Prec':<6} | {'Rec':<6} | {'F1':<6} | {'MCC':<6}")
print("-" * 75)

for name, model in models.items():
    # Create a pipeline that combines preprocessing + the model
    clf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train
    clf_pipeline.fit(X_train, y_train)
    
    # Predict
    y_pred = clf_pipeline.predict(X_test)
    y_prob = clf_pipeline.predict_proba(X_test)[:, 1]
    
    # Calculate Metrics
    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    
    results_list.append(metrics)
    
    # Print formatted results for your Assignment Screenshot
    print(f"{name:<20} | {metrics['Accuracy']:.3f} | {metrics['AUC']:.3f} | {metrics['Precision']:.3f} | {metrics['Recall']:.3f} | {metrics['F1']:.3f} | {metrics['MCC']:.3f}")
    
    # Save the pipeline for Streamlit
    joblib.dump(clf_pipeline, f'model/{name.lower().replace(" ", "_")}.pkl')

# Convert results to DataFrame for easy copy-pasting into your README
results_df = pd.DataFrame(results_list)
results_df.to_csv('model_comparison_results.csv', index=False)

Model                | Acc    | AUC    | Prec   | Rec    | F1     | MCC   
---------------------------------------------------------------------------
Logistic Regression  | 0.842 | 0.913 | 0.877 | 0.853 | 0.865 | 0.676
Decision Tree        | 0.793 | 0.801 | 0.874 | 0.761 | 0.814 | 0.591
kNN                  | 0.875 | 0.906 | 0.884 | 0.908 | 0.896 | 0.740
Naive Bayes          | 0.804 | 0.902 | 0.876 | 0.780 | 0.825 | 0.610
Random Forest        | 0.880 | 0.940 | 0.914 | 0.881 | 0.897 | 0.755
XGBoost              | 0.842 | 0.933 | 0.892 | 0.835 | 0.863 | 0.680


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
