In [1]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pymongo import MongoClient
import joblib
import os
from dotenv import load_dotenv
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize



In [2]:
# === Load environment variables ===
# Pull settings (notably MONGO_URI) from a local .env file into the process environment.
load_dotenv()

# === MongoDB Setup ===
MONGO_URI = os.getenv("MONGO_URI")  # Your MongoDB Atlas URI
if not MONGO_URI:
    # Fail fast: MongoClient(None) does not raise, it falls back to its default
    # host (localhost:27017), so a missing variable would silently point the
    # notebook at the wrong server instead of the intended Atlas cluster.
    raise RuntimeError(
        "MONGO_URI is not set. Define it in the environment or in a .env file."
    )

client = MongoClient(MONGO_URI)
db = client["healthcare"]  # database holding the pipeline layers
gold_collection = db["heart_disease_gold"]  # model-ready ("gold") records

In [3]:
# === Fetch Data from Gold Layer ===
# Materialise every document of the gold collection; the projection drops the
# MongoDB-generated _id so it does not end up as a column.
gold_data = list(gold_collection.find({}, {"_id": 0}))  # Exclude MongoDB _id
if not gold_data:
    # An empty result would otherwise surface much later as a cryptic
    # KeyError when the target column is selected.
    raise ValueError(
        "The gold-layer collection returned no documents; "
        "check MONGO_URI and that the gold layer has been populated."
    )
df = pd.DataFrame(gold_data)


In [22]:
# Schema overview of the gold-layer frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    float64
 1   age       920 non-null    float64
 2   dataset   920 non-null    float64
 3   cp        920 non-null    float64
 4   trestbps  920 non-null    float64
 5   chol      920 non-null    float64
 6   thalch    920 non-null    float64
 7   oldpeak   920 non-null    float64
 8   num       920 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 64.8 KB


In [4]:
# === Check your data ===
# Quick sanity check of the loaded frame: dimensions, column names and the
# first rows (rendered as the cell output).
print("✅ Shape of Gold Layer Data:", df.shape)
print("🧾 Columns:", df.columns.tolist())
df.head()

✅ Shape of Gold Layer Data: (920, 9)
🧾 Columns: ['id', 'age', 'dataset', 'cp', 'trestbps', 'chol', 'thalch', 'oldpeak', 'num']


Unnamed: 0,id,age,dataset,cp,trestbps,chol,thalch,oldpeak,num
0,0.0,0.714286,0.0,1.0,0.725,0.386401,0.633803,0.556818,0
1,0.001088,0.795918,0.0,0.0,0.8,0.474295,0.338028,0.465909,2
2,0.002176,0.795918,0.0,0.0,0.6,0.379768,0.485915,0.590909,1
3,0.003264,0.183673,0.0,0.666667,0.65,0.414594,0.894366,0.693182,0
4,0.004353,0.265306,0.0,0.333333,0.65,0.338308,0.788732,0.454545,0


In [5]:
# === Separate the label (y) from the predictors (X) ===
# `num` is the prediction target; every remaining column becomes a feature.
# NOTE(review): X therefore still contains the `id` column. If that is only a
# row identifier it carries no clinical signal and should probably be dropped —
# confirm against the API that consumes the saved model before changing it.
y = df["num"]
X = df.drop(columns=["num"])


In [6]:
# === Train-Test Split ===
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible; stratify=y keeps the class proportions of the target
# the same in the training and the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # ensures class balance
)

print("✅ Training Set:", X_train.shape, y_train.shape)
print("✅ Testing Set:", X_test.shape, y_test.shape)

✅ Training Set: (736, 8) (736,)
✅ Testing Set: (184, 8) (184,)


In [7]:
# === Models ===
# The target can hold more than two classes. XGBoost's "logloss" is the binary
# log-loss; "mlogloss" is its multiclass counterpart, so pick the metric that
# matches the number of classes actually present in the training labels.
n_classes = np.unique(y_train).size
xgb_eval_metric = "mlogloss" if n_classes > 2 else "logloss"

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric=xgb_eval_metric, random_state=42),
}

# === Results Dictionary ===
# Maps model name -> dict of test-set metrics.
results = {}

# === Binarize true labels for multiclass ROC AUC ===
# One indicator column per class found in y_test (sorted). The multiclass AUC
# below compares these columns position-by-position with predict_proba's
# columns, so it assumes every class seen in training also occurs in y_test.
classes = np.unique(y_test)
y_test_binarized = label_binarize(y_test, classes=classes)

# === Loop through models ===
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Handle probability prediction safely
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
        # If binary classification, take probability of class 1
        if y_proba.shape[1] == 2:
            y_proba_for_auc = y_proba[:, 1]
            roc_auc = roc_auc_score(y_test, y_proba_for_auc)
        else:
            # Multiclass: macro average over the one-vs-rest indicator columns
            roc_auc = roc_auc_score(y_test_binarized, y_proba, multi_class='ovr', average='macro')
    else:
        # No probability estimates available -> AUC cannot be computed
        roc_auc = None

    # === Store metrics ===
    # Macro averaging weights every class equally regardless of its frequency;
    # zero_division=0 scores a class that was never predicted as 0 instead of
    # emitting a warning.
    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "f1": f1_score(y_test, y_pred, average='macro', zero_division=0),
        "roc_auc": roc_auc
    }


In [8]:
# Metrics table: one row per model, one column per metric, best ROC AUC first.
results_df = (
    pd.DataFrame(results)
    .transpose()
    .sort_values("roc_auc", ascending=False)
)
results_df


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
Random Forest,0.673913,0.440785,0.451076,0.444274,0.873475
XGBoost,0.646739,0.434901,0.440664,0.437598,0.865633
Logistic Regression,0.592391,0.290445,0.32055,0.286154,0.8087


In [19]:
# Target file for the serialized model. The path is relative to the working
# directory and climbs one level so the artifact lands outside the notebooks folder.
save_path = "../api/model/heart_disease_model.pkl"

# Create the destination folder up front; exist_ok makes re-running the cell harmless.
model_dir = os.path.dirname(save_path)
os.makedirs(model_dir, exist_ok=True)

In [21]:
# Save model
# The exported model is hard-coded to the fitted Random Forest (it ranked first
# by ROC AUC in the recorded comparison); revisit this choice if the ranking changes.
rf_model = models["Random Forest"]
joblib.dump(rf_model, save_path)
print(f"✅ Model saved to {save_path}")

✅ Model saved to ../api/model/heart_disease_model.pkl
