In [24]:
import pandas as pd
import numpy as np
import warnings
from pathlib import Path
from scipy import sparse
import shap

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    HistGradientBoostingClassifier
)
from sklearn.svm import SVC

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

# 1) Load & split data
DATA_PATH = Path("D:\synthetic_diabetes_data_1000.csv")
LABEL     = "DiabeticRisk"

df = pd.read_csv(DATA_PATH)
df[LABEL] = df[LABEL].str.lower().map({"low":0,"moderate":1,"high":2})
df["Gender"] = df["Gender"].str.lower().map({"male":1,"female":0})
df.drop(columns=["PatientID"], inplace=True)

X = df.drop(columns=[LABEL])
y = df[LABEL]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# 2) Preprocess
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])
X_train_p = pre.fit_transform(X_train)
X_test_p  = pre.transform(X_test)
if sparse.issparse(X_train_p): X_train_p = X_train_p.toarray()
if sparse.issparse(X_test_p):  X_test_p  = X_test_p.toarray()

# 3) Define models (no GradientBoostingClassifier)
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42),
    "RandomForest":       RandomForestClassifier(n_estimators=300, max_depth=18, class_weight="balanced", random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=300, random_state=42),
    "SVC-RBF":            SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42),
    "XGBoost":            xgb.XGBClassifier(use_label_encoder=False, eval_metric="mlogloss",
                                            max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42),
    "LightGBM":           lgb.LGBMClassifier(random_state=42),
    "CatBoost":           CatBoostClassifier(verbose=0, random_state=42)
}

# 4) Train & evaluate
print("\n=== Model Performance ===")
metrics = {}
for name, m in models.items():
    m.fit(X_train_p, y_train)
    y_pred = m.predict(X_test_p)
    try:
        y_prob = m.predict_proba(X_test_p)
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except:
        auc = np.nan
    metrics[name] = {
        "Accuracy":  accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_test, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_test, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc
    }
perf_df = pd.DataFrame(metrics).T.sort_values("F1-Score", ascending=False).round(3)
print(perf_df)

# Prepare feature names
feature_names = num_cols + pre.named_transformers_["cat"].get_feature_names_out(cat_cols).tolist()

# 5) SHAP explainability on all test rows using unified Explainer
print("\n=== SHAP Mean(|Value|) for High-risk Class ===")
explained = {}
masker = X_train_p  # use entire training set as masker

for name, m in models.items():
    print(f"\n-- {name} --")
    # build an explainer on the model's predict_proba function
    explainer = shap.Explainer(m.predict_proba, masker)
    shap_exp = explainer(X_test_p)        # explain all test rows
    vals = shap_exp.values                # shape: (n_samples, n_classes, n_features)

    # extract the "High" risk class (class index 2)
    arr = np.abs(vals[:, 2, :])           # (n_samples, n_features)

    mean_abs = arr.mean(axis=0)
    top10 = np.argsort(mean_abs)[::-1][:10]
    for idx in top10:
        print(f"{feature_names[idx]:<30} {mean_abs[idx]:.4f}")



=== Model Performance ===
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 596
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 36
[LightGBM] [Info] Start training from score -3.361977
[LightGBM] [Info] Start training from score -1.047919
[LightGBM] [Info] Start training from score -0.486675
                      Accuracy  Precision  Recall  F1-Score  ROC AUC
SVC-RBF                  0.888      0.875   0.853     0.862    0.971
LogisticRegression       0.868      0.795   0.895     0.832    0.971
CatBoost                 0.864      0.900   0.652     0.694    0.954
XGBoost                  0.860      0.897   0.650     0.691    0.929
HistGradientBoosting     0.832      0.875   0.637     0.673    0.911
LightGBM                 0.820      0.867   0.6

PermutationExplainer explainer: 251it [00:23, 10.48it/s]                                                               


BloodGlucose                   0.0553
Gender                         0.0361
Age                            0.0258

-- RandomForest --


PermutationExplainer explainer: 251it [04:54,  1.22s/it]                                                               


BloodGlucose                   0.0211
Gender                         0.0164
Age                            0.0063

-- HistGradientBoosting --


PermutationExplainer explainer: 251it [05:48,  1.43s/it]                                                               


BloodGlucose                   0.0563
Gender                         0.0548
Age                            0.0105

-- SVC-RBF --


PermutationExplainer explainer: 251it [10:41,  2.60s/it]                                                               


BloodGlucose                   0.0629
Gender                         0.0537
Age                            0.0106

-- XGBoost --


PermutationExplainer explainer: 251it [00:17,  6.46it/s]                                                               


BloodGlucose                   0.0537
Gender                         0.0518
Age                            0.0086

-- LightGBM --


PermutationExplainer explainer: 251it [00:37,  4.85it/s]                                                               


BloodGlucose                   0.0523
Gender                         0.0513
Age                            0.0113

-- CatBoost --


PermutationExplainer explainer: 251it [02:25,  1.60it/s]                                                               

BloodGlucose                   0.0546
Gender                         0.0521
Age                            0.0066



