In [2]:

# ---------- imports -------------------------------------------------------
import warnings, shap, pandas as pd, numpy as np, sklearn
from packaging.version import Version

from sklearn.model_selection  import train_test_split
from sklearn.preprocessing    import StandardScaler, OneHotEncoder
from sklearn.compose          import ColumnTransformer
from sklearn.metrics          import (accuracy_score, precision_score,
                                      recall_score, f1_score, roc_auc_score)
from sklearn.linear_model     import LogisticRegression
from sklearn.svm              import SVC
from sklearn.ensemble         import (RandomForestClassifier,
                                      GradientBoostingClassifier,
                                      HistGradientBoostingClassifier)
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

CSV    = r"D:\Diabetes_Final.csv"   # <— your file
TARGET = "Diabetic Risk"

# ---------- 1 · Load & clean ----------------------------------------------
df = pd.read_csv(CSV, encoding="latin1")
df.drop(columns=[c for c in df.columns if "id" in c.lower()], inplace=True)

g = next((c for c in df.columns if "gender" in c.lower()), None)
if g:
    df[g] = df[g].astype(str).str.lower().str.strip() \
                   .map({"male":1,"m":1,"female":0,"f":0})

df = df.dropna(subset=[TARGET]).copy()
for c in df.columns:
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].mode()[0])

X = df.drop(columns=[TARGET])
y = df[TARGET].astype("category").cat.codes
binary_task = y.nunique() == 2

cat_cols = X.select_dtypes("object").columns.tolist()
num_cols = X.select_dtypes("number").columns.tolist()

# ---------- 2 · Dense OneHotEncoder (version-safe) -------------------------
if Version(sklearn.__version__) >= Version("1.4"):
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse=False)

preproc = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", enc_dense, cat_cols)
])

# ---------- 3 · Model zoo --------------------------------------------------
models = {
    "LogReg"   : LogisticRegression(max_iter=1000, class_weight="balanced"),
    "SVC"      : SVC(probability=True, kernel="rbf", class_weight="balanced"),
    "RandFor"  : RandomForestClassifier(n_estimators=150, max_depth=15,
                                        class_weight="balanced"),
    "XGBoost"  : xgb.XGBClassifier(eval_metric="logloss", max_depth=6),
    "LightGBM" : lgb.LGBMClassifier(),
    "CatBoost" : CatBoostClassifier(verbose=0),
    "GradBoost": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1),
    "HistGB"   : HistGradientBoostingClassifier(max_iter=150)
}

# ---------- 4 · Train / test split ----------------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

X_tr_p = preproc.fit_transform(X_tr)
X_te_p = preproc.transform(X_te)
feat_names = preproc.get_feature_names_out()

# ---------- 5 · SHAP helper -----------------------------------------------
def top10_shap(clf, background, sample):
    """Return list of (feat, % share) tuples or raise if unsupported."""
    if isinstance(clf, LogisticRegression):
        expl = shap.LinearExplainer(clf, background,
                                    masker=shap.maskers.Independent)
    elif isinstance(clf, (RandomForestClassifier, xgb.XGBClassifier,
                          lgb.LGBMClassifier, CatBoostClassifier,
                          HistGradientBoostingClassifier)):
        expl = shap.TreeExplainer(clf)
    elif isinstance(clf, GradientBoostingClassifier) and clf.n_classes_ == 2:
        expl = shap.TreeExplainer(clf)
    else:
        expl = shap.KernelExplainer(clf.predict_proba, background)

    sv  = expl(sample, check_additivity=False)
    val = sv.values
    if isinstance(val, list):                      # tree multi-class ⇒ list
        val = np.stack(val, axis=-1)
    val = np.abs(np.asarray(val))
    if val.ndim == 3:                              # (rows, feats, classes)
        val = val.mean(axis=2)                     # mean over classes
    imp = val.mean(axis=0)                         # mean |SHAP| over rows
    imp /= imp.sum()                               # convert to %
    idx = np.argsort(imp)[-10:][::-1]
    return [(feat_names[i], imp[i]*100) for i in idx]

# ---------- 6 · Fit, score, explain ---------------------------------------
results = {}
BG_ROWS, EX_ROWS = 200, 300

for name, clf in models.items():
    clf.fit(X_tr_p, y_tr)
    y_pred = clf.predict(X_te_p)

    # ---- metrics
    auc = np.nan
    try:
        if hasattr(clf, "predict_proba"):
            p = clf.predict_proba(X_te_p)
            auc = roc_auc_score(y_te, p[:,1]) if binary_task else \
                  roc_auc_score(y_te, p, multi_class="ovr")
        elif hasattr(clf, "decision_function"):
            d = clf.decision_function(X_te_p)
            auc = roc_auc_score(y_te, d) if binary_task else \
                  roc_auc_score(y_te, d, multi_class="ovr")
    except ValueError:
        pass

    results[name] = {
        "Accuracy":  accuracy_score(y_te, y_pred),
        "Precision": precision_score(y_te, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_te, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_te, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc
    }

    # ---- SHAP output
    try:
        bg   = shap.maskers.Independent(X_tr_p[:BG_ROWS])
        top  = top10_shap(clf, bg, X_te_p[:EX_ROWS])
        print(f"\n▶ Top-10 SHAP features for {name}")
        for feat, pct in top:
            print(f"   {feat:40s} : {pct:6.2f}%")
    except Exception as e:
        print(f"(SHAP skipped for {name}: {e})")

# ---------- 7 · Results table ---------------------------------------------
print("\n=== Model Comparison ===")
(pd.DataFrame(results)
   .T.sort_values("F1-Score", ascending=False)
   .round(3)
   .pipe(print))


(SHAP skipped for LogReg: LinearExplainer.__init__() got multiple values for argument 'masker')
(SHAP skipped for SVC: Unknown type passed as data object: <class 'shap.maskers._tabular.Independent'>)

▶ Top-10 SHAP features for RandFor
   num__BMI                                 :  19.39%
   cat__WaistHeightRatio_#VALUE!            :  15.42%
   num__Blood Glucose (mg/dL)               :   9.60%
   num__Heart Rate (bpm)                    :   7.19%
   num__Fatigue_Weakness                    :   5.28%
   num__Age                                 :   5.14%
   num__Alcohol                             :   4.33%
   num__Socioeconomic                       :   4.31%
   num__Smoking                             :   3.35%
   num__PhysicalActivity                    :   3.22%

▶ Top-10 SHAP features for XGBoost
   num__BMI                                 :  31.27%
   num__Blood Glucose (mg/dL)               :  15.14%
   num__Socioeconomic                       :   9.17%
   num__Age               