In [1]:
# === Imports ===
import pandas as pd
import numpy as np
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import joblib

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics & Utils
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize


                                 `Load Env & MongoDB Connection`                              

In [2]:
# === Load environment variables ===
# Reads key/value pairs from a local .env file (if present) into os.environ.
load_dotenv()

# === MongoDB Setup ===
MONGO_URI = os.getenv("MONGO_URI")  # Your MongoDB Atlas URI
# os.getenv returns None when the variable is missing, and MongoClient(None)
# falls back to the driver's default host instead of the intended cluster.
# Fail here with a clear message rather than later with a confusing
# connection error or an unexpectedly empty collection.
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )

client = MongoClient(MONGO_URI)
db = client["healthcare"]
gold_collection = db["heart_disease_gold"]

                                `Fetch Data from Gold Layer`                                    

In [3]:
# === Fetch Data from Gold Layer ===
# The projection {"_id": 0} strips MongoDB's internal _id from every document,
# so it never shows up as a DataFrame column.
cursor = gold_collection.find({}, {"_id": 0})
gold_data = list(cursor)  # materialise the cursor into a list of documents
df = pd.DataFrame(gold_data)


In [4]:
# === Check your data ===
# Quick sanity check: dimensions, column names, then the first rows as the
# cell's rich output.
print(" Shape of Gold Layer Data:", df.shape)
print(" Columns:", list(df.columns))
df.head()

 Shape of Gold Layer Data: (920, 10)
 Columns: ['oldpeak', 'thalch', 'exang', 'age', 'ca', 'cp', 'dataset', 'id', 'sex', 'num']


Unnamed: 0,oldpeak,thalch,exang,age,ca,cp,dataset,id,sex,num
0,0.556818,0.633803,False,0.714286,0.0,1.0,0.0,0.0,1.0,0
1,0.465909,0.338028,True,0.795918,1.0,0.0,0.0,0.001088,1.0,2
2,0.590909,0.485915,True,0.795918,0.666667,0.0,0.0,0.002176,1.0,1
3,0.693182,0.894366,False,0.183673,0.0,0.666667,0.0,0.003264,1.0,0
4,0.454545,0.788732,False,0.265306,0.0,0.333333,0.0,0.004353,0.0,0


                                 `Train-Test Split`                                              

In [5]:
# === Split features (X) and target (y) ===
# NOTE(review): the feature matrix keeps every non-target column, including
# `id`, which looks like a (scaled) row identifier rather than a clinical
# measurement. Confirm it is meant to be a model input; removing it would also
# change the input schema expected by whatever loads the saved model.
y = df["num"]
X = df.drop(columns=["num"])


In [6]:
# === Train-Test Split ===
# Hold out 20% of the rows for testing. Stratifying on y keeps each class's
# share the same in both partitions (it preserves the class distribution; it
# does not rebalance it). The fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for label, features, target in (
    (" Training Set:", X_train, y_train),
    (" Testing Set:", X_test, y_test),
):
    print(label, features.shape, target.shape)

 Training Set: (736, 9) (736,)
 Testing Set: (184, 9) (184,)


                                       `Train Models & Evaluate`                                     

In [7]:
# === Models ===
# Every estimator is seeded with random_state=42 so the comparison is
# reproducible across runs.
# NOTE(review): 'logloss' is XGBoost's binary log-loss metric; for a multiclass
# target 'mlogloss' is presumably what was intended. No eval_set is passed to
# fit() below, so confirm whether this setting has any effect before changing it.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
}

# === Results Dictionary ===
# Maps model name -> {"accuracy", "precision", "recall", "f1", "roc_auc"}.
results = {}


def _roc_auc_or_none(model, X_eval, y_eval):
    """Return the ROC-AUC of a fitted classifier on an evaluation split.

    Parameters
    ----------
    model : fitted estimator
        Scored through ``predict_proba``; its ``classes_`` attribute is used
        to label the probability columns.
    X_eval, y_eval :
        Features and true labels of the evaluation split.

    Returns
    -------
    float or None
        Binary problems (two probability columns): AUC of the second column.
        Multiclass problems: macro-averaged one-vs-rest AUC.
        ``None`` when the model exposes no ``predict_proba``.
    """
    if not hasattr(model, "predict_proba"):
        return None

    y_proba = model.predict_proba(X_eval)

    # Binary classification: score the probability of the second class.
    if y_proba.shape[1] == 2:
        return roc_auc_score(y_eval, y_proba[:, 1])

    # Multiclass: pass the raw labels and name the columns via model.classes_,
    # so each probability column is tied to the class the estimator reports
    # for it instead of to whatever labels happen to occur in the test split.
    return roc_auc_score(
        y_eval,
        y_proba,
        multi_class='ovr',
        average='macro',
        labels=model.classes_,
    )


# === Loop through models ===
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # === Store metrics ===
    # Precision/recall/F1 are macro-averaged (every class counts equally);
    # zero_division=0 scores a class with no predicted samples as 0 instead of
    # emitting a warning.
    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "f1": f1_score(y_test, y_pred, average='macro', zero_division=0),
        "roc_auc": _roc_auc_or_none(model, X_test, y_test),
    }


                                         `Display Results`                                          

In [8]:
# One row per model, one column per metric, best ROC-AUC first.
results_df = (
    pd.DataFrame(results)
    .T
    .sort_values(by="roc_auc", ascending=False)
)
results_df


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
XGBoost,0.641304,0.500733,0.492027,0.494543,0.866878
Random Forest,0.663043,0.416782,0.428494,0.421875,0.864948
Logistic Regression,0.61413,0.487089,0.347145,0.322925,0.826774


In [9]:
# Save model - adjust path to go outside notebooks folder
# The path is relative to the notebook's working directory; the target folder
# is created on first run and left untouched if it already exists.
save_path = "../api/model/heart_disease_model.pkl"
model_dir = os.path.dirname(save_path)
os.makedirs(model_dir, exist_ok=True)

In [10]:
# Save model
# Pick the winner from the computed metrics instead of hard-coding a name, so
# the saved artifact stays consistent with the results if the data or the
# model list changes. Models without a ROC-AUC (None) rank last; ties keep
# the first entry in `results`.
best_model_name = max(
    results,
    key=lambda model_name: (
        results[model_name]["roc_auc"]
        if results[model_name]["roc_auc"] is not None
        else float("-inf")
    ),
)
best_model = models[best_model_name]
joblib.dump(best_model, save_path)
print(f" Model saved to {save_path}")

 Model saved to ../api/model/heart_disease_model.pkl


                                    `Model Training Summary`                                        

- **Data Source:** Gold Layer (MongoDB collection `heart_disease_gold`)

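- **Target Variable:** `num` (0 = No Disease; values ≥ 1 indicate disease). The column is multiclass rather than binary — e.g. `2` appears in the sample rows — and is modeled as a multiclass target.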

- **Features Used:** Selected via correlation-based feature selection

- **Models Trained:** Logistic Regression, Random Forest, XGBoost

- **Evaluation Metrics:** Accuracy, plus macro-averaged Precision, Recall, and F1-score, and macro-averaged one-vs-rest ROC-AUC

- **Best Model:** XGBoost
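  - Highest ROC-AUC (0.867, narrowly ahead of Random Forest at 0.865) and highest macro F1 (0.495); note that Random Forest has the higher accuracy (0.663 vs 0.641)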
  - Robust to feature types and distributions
  - Handles interactions well
  
- **Model Saved:** `heart_disease_model.pkl`
