In [3]:
"""
Diabetic-Risk Benchmark + SHAP (readable % importances)
-------------------------------------------------------
• Single target column: “Diabetic Risk”
• Dense One-Hot encoder that works on any scikit-learn version
• Models: LogReg, SVC, RF, XGB, LightGBM, CatBoost, GBDT, HistGB
• Scores: Accuracy / Precision / Recall / F1 / ROC-AUC
• Prints top-10 SHAP features for every model as % of total importance
   (no more “0.0000”)

⇢  pip install --upgrade scikit-learn shap xgboost lightgbm catboost
"""

# ---------------- imports --------------------------------------------------
import warnings, shap, pandas as pd, numpy as np, sklearn
from packaging.version import Version

from sklearn.model_selection  import train_test_split
from sklearn.preprocessing    import OneHotEncoder, StandardScaler
from sklearn.compose          import ColumnTransformer
from sklearn.metrics          import (accuracy_score, precision_score,
                                      recall_score, f1_score, roc_auc_score)
from sklearn.linear_model     import LogisticRegression
from sklearn.svm              import SVC
from sklearn.ensemble         import (RandomForestClassifier,
                                      GradientBoostingClassifier,
                                      HistGradientBoostingClassifier)
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

CSV    = r"D:\Diabetes_Data.csv"   # ← your file
TARGET = "Diabetic Risk"           # ← single label

# ---------------- 1 · Load & basic cleaning -------------------------------
df = pd.read_csv(CSV, encoding="latin1")
df.drop(columns=[c for c in df.columns if "id" in c.lower()], inplace=True)

g = next((c for c in df.columns if "gender" in c.lower()), None)
if g:
    df[g] = df[g].astype(str).str.lower().str.strip() \
                   .map({"male":1,"m":1,"female":0,"f":0})

df = df.dropna(subset=[TARGET]).copy()
for c in df.columns:
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].mode()[0])

X = df.drop(columns=[TARGET])
y = df[TARGET].astype("category").cat.codes
binary = y.nunique() == 2

cat_cols = X.select_dtypes("object").columns.tolist()
num_cols = X.select_dtypes("number").columns.tolist()

# ---------------- 2 · Dense OneHotEncoder (sklearn-safe) -------------------
if Version(sklearn.__version__) >= Version("1.4"):
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse=False)

preproc = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", enc_dense, cat_cols)
])

# ---------------- 3 · Model zoo -------------------------------------------
models = {
    "LogReg"   : LogisticRegression(max_iter=1000, class_weight="balanced"),
    "SVC"      : SVC(probability=True, kernel="rbf", class_weight="balanced"),
    "RandFor"  : RandomForestClassifier(n_estimators=150, max_depth=15,
                                        class_weight="balanced"),
    "XGBoost"  : xgb.XGBClassifier(eval_metric="logloss", max_depth=6),
    "LightGBM" : lgb.LGBMClassifier(),
    "CatBoost" : CatBoostClassifier(verbose=0),
    "GradBoost": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1),
    "HistGB"   : HistGradientBoostingClassifier(max_iter=150)
}

# ---------------- 4 · Train / test split ----------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

X_tr_p = preproc.fit_transform(X_tr)
X_te_p = preproc.transform(X_te)
feat_names = preproc.get_feature_names_out()

# ---------------- 5 · SHAP helper (robust) --------------------------------
def shap_top10(clf, background, sample):
    # choose explainer
    if isinstance(clf, LogisticRegression):
        expl = shap.LinearExplainer(clf, background,
                                    masker=shap.maskers.Independent)
    elif isinstance(clf, (RandomForestClassifier, xgb.XGBClassifier,
                          lgb.LGBMClassifier, CatBoostClassifier,
                          HistGradientBoostingClassifier)):
        expl = shap.TreeExplainer(clf)
    elif isinstance(clf, GradientBoostingClassifier) and clf.n_classes_ == 2:
        expl = shap.TreeExplainer(clf)
    else:
        expl = shap.KernelExplainer(clf.predict_proba, background)

    sv = expl(sample, check_additivity=False)
    arr = sv.values
    if isinstance(arr, list):         # tree multi-class returns list
        arr = np.stack(arr, axis=-1)
    arr = np.abs(np.asarray(arr))
    if arr.ndim == 3:                 # multi-class: mean over classes
        arr = arr.mean(axis=2)
    import_vec = arr.mean(axis=0)
    import_vec = import_vec / import_vec.sum()  # % share
    idx = np.argsort(import_vec)[-10:][::-1]
    return [(feat_names[i], import_vec[i]*100) for i in idx]

# ---------------- 6 · Fit, score, explain ---------------------------------
results = {}
BG, EX = 200, 300     # rows for SHAP background / explanation

for name, clf in models.items():
    clf.fit(X_tr_p, y_tr)
    y_pred = clf.predict(X_te_p)

    # metrics
    auc = np.nan
    try:
        if hasattr(clf, "predict_proba"):
            p = clf.predict_proba(X_te_p)
            auc = roc_auc_score(y_te, p[:,1]) if binary else \
                  roc_auc_score(y_te, p, multi_class="ovr")
        elif hasattr(clf, "decision_function"):
            d = clf.decision_function(X_te_p)
            auc = roc_auc_score(y_te, d) if binary else \
                  roc_auc_score(y_te, d, multi_class="ovr")
    except ValueError:
        pass

    results[name] = {
        "Accuracy":  accuracy_score(y_te, y_pred),
        "Precision": precision_score(y_te, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_te, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_te, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc
    }

    # SHAP
    try:
        top = shap_top10(clf,
                         shap.maskers.Independent(X_tr_p[:BG]),
                         X_te_p[:EX])
        print(f"\n▶ Top-10 SHAP features for {name}")
        for feat, pct in top:
            print(f"   {feat:40s} : {pct:6.2f}%")
    except Exception as e:
        print(f"(SHAP skipped for {name}: {e})")

# ---------------- 7 · Results table ---------------------------------------
print("\n=== Model Comparison ===")
(pd.DataFrame(results)
   .T.sort_values("F1-Score", ascending=False)
   .round(3)
   .pipe(print))


(SHAP skipped for LogReg: LinearExplainer.__init__() got multiple values for argument 'masker')
(SHAP skipped for SVC: Unknown type passed as data object: <class 'shap.maskers._tabular.Independent'>)

▶ Top-10 SHAP features for RandFor
   num__Diastolic BP (mmHg)                 :  12.63%
   cat__WaistHeightRatio_#VALUE!            :  11.46%
   num__BMI                                 :  11.20%
   num__Systolic BP (mmHg)                  :   9.10%
   num__Blood Glucose (mg/dL)               :   8.34%
   num__Heart Rate (bpm)                    :   6.68%
   num__Fatigue_Weakness                    :   4.43%
   num__Socioeconomic                       :   4.02%
   num__Age                                 :   3.89%
   num__Smoking                             :   3.47%

▶ Top-10 SHAP features for XGBoost
   num__BMI                                 :  21.49%
   num__Diastolic BP (mmHg)                 :  12.07%
   num__Blood Glucose (mg/dL)               :  11.88%
   num__Socioeconomic     