In [6]:
# ==============================================================
#  Cardiac Risk – Benchmark with RandomForest & CatBoost + SHAP
# ==============================================================

import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")


# 1. LOAD & CLEAN
DATA_PATH = Path("D:/generated_health_risk_data_100k.csv")
LABEL_COL = "Cardiac Risk"

# Header names are stripped of surrounding whitespace so the exact-name
# lookups below ("Gender", LABEL_COL) match.
df = pd.read_csv(DATA_PATH).rename(columns=str.strip)

# Drop identifier columns. "id" has to be its own token ("id", "Patient ID",
# "user_id") or a camelCase suffix ("patientId", "PatientID"); a plain
# substring test would also throw away genuine features whose names merely
# contain the letters "id".
id_token = df.columns.str.contains(
    r"(?:^|[\W_])id(?:$|[\W_])", case=False, regex=True
)
id_camel = df.columns.str.contains(r"[a-z0-9]I[dD](?![a-z])", regex=True)
df.drop(columns=df.columns[id_token | id_camel], inplace=True)

# Encode Gender as 1 (male) / 0 (female). Text is trimmed and lower-cased
# first; any value outside the mapping becomes NaN.
if "Gender" in df:
    df["Gender"] = (
        df["Gender"]
        .astype(str)
        .str.strip()
        .str.lower()
        .map({"male": 1, "m": 1, "female": 0, "f": 0})
    )

# 2. SPLIT
# Normalise the label text once and reuse it for both stratification and
# integer encoding, so the two can never disagree.
LABEL_TO_INT = {"low": 0, "moderate": 1, "high": 2}
labels = df[LABEL_COL].str.strip().str.lower()
y = labels.map(LABEL_TO_INT)

# Fail early, naming the offending values, instead of passing NaN targets
# on to the classifiers.
if y.isna().any():
    unknown = sorted(labels[y.isna()].dropna().unique())
    raise ValueError(
        f"{int(y.isna().sum())} rows in {LABEL_COL!r} have a missing or "
        f"unrecognised label (unrecognised values: {unknown}); "
        f"expected one of {list(LABEL_TO_INT)}"
    )

# 80/20 split, stratified on the class label, with a fixed seed.
train_df, test_df = train_test_split(
    df, test_size=0.2,
    stratify=labels,
    random_state=42
)
# Targets are looked up by row index so they stay aligned with each split.
y_train = y.loc[train_df.index]
y_test  = y.loc[test_df.index]
X_train = train_df.drop(columns=[LABEL_COL])
X_test  = test_df.drop(columns=[LABEL_COL])

# 3. PREPROCESSOR
# Numeric columns are standardised; every other column is one-hot encoded.
# The categorical list is defined as "everything that is not numeric" rather
# than "object dtype only": a column named in neither list is not passed to
# any transformer, so bool / category / string-dtype columns would otherwise
# vanish from the feature matrix without notice.
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=["number"]).columns.tolist()
pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    # Categories unseen during fit are encoded as all zeros instead of raising.
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])

# 4. MODEL ZOO
# Classifiers to benchmark, keyed by the display name used in the results
# table and in the SHAP plot file names. Insertion order is the order in
# which they are trained.
models = {}

# Class-weighted random forest with a fixed seed.
models["RandomForest"] = RandomForestClassifier(
    n_estimators=400,
    max_depth=18,
    class_weight="balanced",
    random_state=42,
)

# CatBoost with library defaults, silent training output and a fixed seed.
models["CatBoost"] = CatBoostClassifier(verbose=0, random_state=42)

# 5. FIT & EVALUATE
# For each model: fit a preprocessing + classifier pipeline on the training
# split, score it on the test split, and keep both the fitted pipeline
# (for the SHAP section) and its metrics.
results = {}
pipes   = {}
for name, clf in models.items():
    # NOTE: the same `pre` object is handed to every pipeline and is refit
    # here on the same X_train each time.
    pipe = Pipeline([("prep", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)
    pipes[name] = pipe

    y_pred = pipe.predict(X_test)

    # ROC-AUC is best-effort: a failure here must not abort the benchmark,
    # but it should not be invisible either. `except Exception` (rather than
    # a bare `except`) lets KeyboardInterrupt / SystemExit propagate. The
    # reason is printed because warnings are silenced globally in this script.
    try:
        y_prob = pipe.predict_proba(X_test)
        auc    = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except Exception as err:
        print(f"[{name}] ROC-AUC unavailable ({type(err).__name__}): {err}")
        auc = np.nan

    # Macro averaging weights every class equally regardless of support.
    results[name] = {
        "Accuracy":  accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_test, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_test, y_pred, average="macro", zero_division=0),
        "ROC-AUC":   auc
    }

# One row per model, best macro-F1 first, rounded for display.
res_df = pd.DataFrame(results).T.sort_values("F1-Score", ascending=False).round(3)
print("\n=== Model Comparison ===")
print(res_df)


# 6. SHAP EXPLAINABILITY (one plot per model)
# Small fixed-seed samples keep the explanation tractable: `bg` is the
# background data given to the explainer, `test` holds the rows to explain.
# Both sample sizes are capped at the available row count so a small
# dataset does not make `sample` raise.
bg   = X_train.sample(n=min(100, len(X_train)), random_state=42)
test = X_test.sample(n=min(200, len(X_test)), random_state=42)

for name, pipe in pipes.items():
    # Explain the whole pipeline through its predict_proba callable, so the
    # attributions refer to the raw input columns rather than to the
    # scaled / one-hot-encoded features.
    explainer   = shap.Explainer(pipe.predict_proba, bg)
    shap_vals   = explainer(test)

    # Draw without showing, then title, save and close the figure so
    # figures do not accumulate across models.
    plt.figure(figsize=(8,6))
    shap.summary_plot(shap_vals, show=False)
    plt.title(f"SHAP Summary – {name}")
    plt.tight_layout()
    plt.savefig(f"shap_summary_{name}.png")
    plt.close()

    print(f"Saved SHAP summary plot for {name}")



=== Model Comparison ===
              Accuracy  Precision  Recall  F1-Score  ROC-AUC
CatBoost         0.943      0.938   0.937     0.938    0.992
RandomForest     0.934      0.931   0.920     0.925    0.989


PermutationExplainer explainer: 201it [13:40,  4.14s/it]                                                               


Saved SHAP summary plot for RandomForest


PermutationExplainer explainer: 201it [02:34,  1.23it/s]                                                               


Saved SHAP summary plot for CatBoost
