In [7]:
"""
CVD-Risk Benchmark + SHAP  (stable, no “Independent masker” error)
------------------------------------------------------------------
• Reads  /mnt/data/CVD_Data.csv      (change CSV or TARGET only)
• Dense OneHotEncoder (sklearn 0.24 → 1.4+)
• Models: LogReg, SVC, RF, XGB, LightGBM, CatBoost, HistGB, GradBoost
      └─ GradBoost is *automatically skipped* if task is multi-class
• Metrics: Accuracy / Precision / Recall / F1 / ROC-AUC
• Prints each model’s top-10 SHAP features as % of total importance
------------------------------------------------------------------
pip install --upgrade scikit-learn shap xgboost lightgbm catboost
"""

# ------------------- imports ---------------------------------------------
import warnings, shap, pandas as pd, numpy as np
import sklearn
from packaging.version import Version
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import StandardScaler, OneHotEncoder
from sklearn.compose        import ColumnTransformer
from sklearn.metrics        import (accuracy_score, precision_score,
                                    recall_score, f1_score, roc_auc_score)
from sklearn.linear_model   import LogisticRegression
from sklearn.svm            import SVC
from sklearn.ensemble       import (RandomForestClassifier,
                                    GradientBoostingClassifier,
                                    HistGradientBoostingClassifier)
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

# ---------- 1 · FILE + TARGET --------------------------------------------
CSV    = "D:\CVD_Data.csv"   # ⚙️  change if needed
TARGET = "Cardiac Risk"                  # ⚙️  label column name

# ---------- 2 · Load & clean ---------------------------------------------
df = pd.read_csv(CSV, encoding="latin1")
df.drop(columns=[c for c in df.columns if "id" in c.lower()], inplace=True)

g = next((c for c in df.columns if "gender" in c.lower()), None)
if g:
    df[g] = df[g].astype(str).str.lower().str.strip() \
                   .map({"male": 1, "m": 1, "female": 0, "f": 0})

df = df.dropna(subset=[TARGET]).copy()
for col in df.columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

X = df.drop(columns=[TARGET])
y = df[TARGET].astype("category").cat.codes
binary = y.nunique() == 2

cat_cols = X.select_dtypes("object").columns.tolist()
num_cols = X.select_dtypes("number").columns.tolist()

# ---------- 3 · Dense One-Hot encoder (version-safe) ----------------------
if Version(sklearn.__version__) >= Version("1.4"):
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse=False)

preproc = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", enc_dense, cat_cols)
])

# ---------- 4 · Model zoo -------------------------------------------------
models = {
    "LogReg"   : LogisticRegression(max_iter=1000, class_weight="balanced"),
    "SVC"      : SVC(probability=True, kernel="rbf", class_weight="balanced"),
    "RandFor"  : RandomForestClassifier(n_estimators=150, max_depth=15,
                                        class_weight="balanced"),
    "XGBoost"  : xgb.XGBClassifier(eval_metric="logloss", max_depth=6),
    "LightGBM" : lgb.LGBMClassifier(),
    "CatBoost" : CatBoostClassifier(verbose=0),
    "HistGB"   : HistGradientBoostingClassifier(max_iter=150),
    "GradBoost": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1)
}

# ---------- 5 · Train / test split ---------------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

X_tr_p = preproc.fit_transform(X_tr)
X_te_p = preproc.transform(X_te)
feat_names = preproc.get_feature_names_out()

# ---------- 6 · SHAP helper ----------------------------------------------
def shap_top10(clf, bg_rows=200, sample_rows=300):
    """
    Return list of (feature, percent) or raise RuntimeError to skip.
    """
    # --- skip multi-class GradientBoosting (unsupported)
    if isinstance(clf, GradientBoostingClassifier) and clf.n_classes_ != 2:
        raise RuntimeError("SHAP can't handle multi-class GradientBoosting.")

    # --- choose explainer
    if isinstance(clf, LogisticRegression):
        expl = shap.LinearExplainer(
            clf,
            shap.maskers.Independent(X_tr_p[:bg_rows])
        )
    elif isinstance(clf, (RandomForestClassifier, xgb.XGBClassifier,
                          lgb.LGBMClassifier, CatBoostClassifier,
                          HistGradientBoostingClassifier,
                          GradientBoostingClassifier)):
        expl = shap.TreeExplainer(clf)
    else:                                # SVC, etc. → Kernel
        expl = shap.KernelExplainer(clf.predict_proba, X_tr_p[:bg_rows])

    sv  = expl(X_te_p[:sample_rows], check_additivity=False)
    val = sv.values
    if isinstance(val, list):
        val = np.stack(val, axis=-1)
    val = np.abs(np.asarray(val))
    if val.ndim == 3:
        val = val.mean(axis=2)           # average over classes
    imp = val.mean(axis=0)
    imp /= imp.sum()                     # % share
    idx = np.argsort(imp)[-10:][::-1]
    return [(feat_names[i], imp[i]*100) for i in idx]

# ---------- 7 · Fit, score, explain --------------------------------------
results = {}
for name, clf in models.items():
    clf.fit(X_tr_p, y_tr)
    y_pred = clf.predict(X_te_p)

    # metrics
    auc = np.nan
    try:
        if hasattr(clf, "predict_proba"):
            p = clf.predict_proba(X_te_p)
            auc = roc_auc_score(y_te, p[:, 1]) if binary else \
                  roc_auc_score(y_te, p, multi_class="ovr")
        elif hasattr(clf, "decision_function"):
            d = clf.decision_function(X_te_p)
            auc = roc_auc_score(y_te, d) if binary else \
                  roc_auc_score(y_te, d, multi_class="ovr")
    except ValueError:
        pass

    results[name] = {
        "Accuracy":  accuracy_score(y_te, y_pred),
        "Precision": precision_score(y_te, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_te, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_te, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc
    }

    # SHAP feature ranking
    try:
        top = shap_top10(clf)
        print(f"\n▶ Top-10 SHAP features for {name}")
        for feat, pct in top:
            print(f"   {feat:40s} : {pct:6.2f}%")
    except Exception as e:
        print(f"(SHAP skipped for {name}: {e})")

# ---------- 8 · Results table --------------------------------------------
print("\n=== Model Comparison ===")
(pd.DataFrame(results)
   .T.sort_values("F1-Score", ascending=False)
   .round(3)
   .pipe(print))


Using 154 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


(SHAP skipped for LogReg: LinearExplainer.explain_row() got an unexpected keyword argument 'check_additivity')
(SHAP skipped for SVC: KernelExplainer.__call__() got an unexpected keyword argument 'check_additivity')

▶ Top-10 SHAP features for RandFor
   cat__WaistHeightRatio_#VALUE!            :  14.79%
   num__Diastolic BP (mmHg)                 :  11.55%
   num__BMI                                 :   8.85%
   num__Systolic BP (mmHg)                  :   7.13%
   num__RespiratoryRate(breaths/min)        :   5.87%
   num__Heart Rate (bpm)                    :   5.44%
   num__Socioeconomic                       :   3.72%
   num__Age                                 :   3.69%
   num__Tobacco                             :   3.00%
   num__Breathlessness                      :   2.90%

▶ Top-10 SHAP features for XGBoost
   num__Diastolic BP (mmHg)                 :  23.49%
   num__Systolic BP (mmHg)                  :  11.15%
   num__RespiratoryRate(breaths/min)        :   8.42%
   num__Ag