In [1]:


# ---------------- imports --------------------------------------------------
import warnings, shap, pandas as pd, numpy as np
from packaging.version import Version
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import OneHotEncoder, StandardScaler
from sklearn.compose        import ColumnTransformer
from sklearn.metrics        import (accuracy_score, precision_score,
                                    recall_score, f1_score, roc_auc_score)
from sklearn.linear_model   import LogisticRegression
from sklearn.svm            import SVC
from sklearn.ensemble       import (RandomForestClassifier,
                                    GradientBoostingClassifier,
                                    HistGradientBoostingClassifier)
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

# ---------- 1 · FILE + TARGET ---------------------------------------------
CSV    = "D:\HTN_Data.csv"     # ← uploaded file
TARGET = "Hypertension Risk"           # ⚙️ change if label col differs

# ---------- 2 · Load & basic cleaning -------------------------------------
df = pd.read_csv(CSV, encoding="latin1")
df.drop(columns=[c for c in df.columns if "id" in c.lower()], inplace=True)

g = next((c for c in df.columns if "gender" in c.lower()), None)
if g:
    df[g] = df[g].astype(str).str.lower().str.strip() \
                   .map({"male": 1, "m": 1, "female": 0, "f": 0})

df = df.dropna(subset=[TARGET]).copy()
for col in df.columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

X = df.drop(columns=[TARGET])
y = df[TARGET].astype("category").cat.codes
binary = y.nunique() == 2

cat_cols = X.select_dtypes("object").columns.tolist()
num_cols = X.select_dtypes("number").columns.tolist()

# ---------- 3 · Dense One-Hot encoder (sklearn-safe) -----------------------
if Version(pd.__version__) >= Version("1.4"):
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    enc_dense = OneHotEncoder(handle_unknown="ignore", sparse=False)

preproc = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", enc_dense, cat_cols),
])

# ---------- 4 · Model zoo --------------------------------------------------
models = {
    "LogReg"   : LogisticRegression(max_iter=1000, class_weight="balanced"),
    "SVC"      : SVC(probability=True, kernel="rbf", class_weight="balanced"),
    "RandFor"  : RandomForestClassifier(n_estimators=150, max_depth=15,
                                        class_weight="balanced"),
    "XGBoost"  : xgb.XGBClassifier(eval_metric="logloss", max_depth=6),
    "LightGBM" : lgb.LGBMClassifier(),
    "CatBoost" : CatBoostClassifier(verbose=0),
    "HistGB"   : HistGradientBoostingClassifier(max_iter=150),
    "GradBoost": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1),
}

# ---------- 5 · Split ------------------------------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

X_tr_p = preproc.fit_transform(X_tr)
X_te_p = preproc.transform(X_te)
feat_names = preproc.get_feature_names_out()

# ---------- 6 · SHAP helper -----------------------------------------------
def shap_top10(clf, bg_rows=200, ex_rows=300):
    # Skip multi-class GradientBoosting — SHAP not supported
    if isinstance(clf, GradientBoostingClassifier) and clf.n_classes_ != 2:
        raise RuntimeError("skip-GB multi-class")

    # choose explainer
    if isinstance(clf, LogisticRegression):
        expl = shap.LinearExplainer(
            clf,
            shap.maskers.Independent(X_tr_p[:bg_rows]),
            masker=shap.maskers.Independent,
        )
    elif isinstance(clf, (RandomForestClassifier, xgb.XGBClassifier,
                          lgb.LGBMClassifier, CatBoostClassifier,
                          HistGradientBoostingClassifier,
                          GradientBoostingClassifier)):
        expl = shap.TreeExplainer(clf)
    else:  # SVC → kernel
        expl = shap.KernelExplainer(clf.predict_proba, X_tr_p[:bg_rows])

    sv = expl(X_te_p[:ex_rows], check_additivity=False)
    arr = sv.values
    if isinstance(arr, list):
        arr = np.stack(arr, axis=-1)
    arr = np.abs(np.asarray(arr))
    if arr.ndim == 3:
        arr = arr.mean(2)
    imp = arr.mean(0)
    imp /= imp.sum()               # percent share
    idx = np.argsort(imp)[-10:][::-1]
    return [(feat_names[i], imp[i]*100) for i in idx]

# ---------- 7 · Fit, score, explain ---------------------------------------
results = {}
for name, clf in models.items():
    clf.fit(X_tr_p, y_tr)
    y_pred = clf.predict(X_te_p)

    # metrics
    auc = np.nan
    try:
        if hasattr(clf, "predict_proba"):
            p = clf.predict_proba(X_te_p)
            auc = roc_auc_score(y_te, p[:, 1]) if binary else \
                  roc_auc_score(y_te, p, multi_class="ovr")
        elif hasattr(clf, "decision_function"):
            d = clf.decision_function(X_te_p)
            auc = roc_auc_score(y_te, d) if binary else \
                  roc_auc_score(y_te, d, multi_class="ovr")
    except ValueError:
        pass

    results[name] = {
        "Accuracy":  accuracy_score(y_te, y_pred),
        "Precision": precision_score(y_te, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_te, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_te, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc,
    }

    # SHAP explanation
    try:
        top = shap_top10(clf)
        print(f"\n▶ Top-10 SHAP features for {name}")
        for feat, pct in top:
            print(f"   {feat:40s} : {pct:6.2f}%")
    except RuntimeError:
        print(f"(SHAP skipped for {name}: multi-class GradientBoosting)")
    except Exception as e:
        print(f"(SHAP skipped for {name}: {e})")

# ---------- 8 · Results table ---------------------------------------------
print("\n=== Model Comparison ===")
(pd.DataFrame(results)
   .T.sort_values("F1-Score", ascending=False)
   .round(3)
   .pipe(print))


Using 154 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


(SHAP skipped for LogReg: LinearExplainer.__init__() got multiple values for argument 'masker')
(SHAP skipped for SVC: KernelExplainer.__call__() got an unexpected keyword argument 'check_additivity')

▶ Top-10 SHAP features for RandFor
   cat__WaistHeightRatio_#VALUE!            :  13.66%
   num__Diastolic BP (mmHg)                 :  11.26%
   num__BMI                                 :   8.83%
   num__Systolic BP (mmHg)                  :   5.92%
   num__Heart Rate (bpm)                    :   5.58%
   num__RespiratoryRate(breaths/min)        :   5.34%
   num__PhysicalActivity                    :   4.55%
   num__Socioeconomic                       :   3.87%
   num__Breathlessness                      :   3.72%
   num__Chestdiscomfort                     :   3.54%

▶ Top-10 SHAP features for XGBoost
   num__Diastolic BP (mmHg)                 :  27.91%
   num__RespiratoryRate(breaths/min)        :  10.57%
   num__Heart Rate (bpm)                    :  10.03%
   num__BMI              