In [1]:


import pandas as pd, numpy as np, warnings, sys
from interpret.glassbox import ExplainableBoostingClassifier as EBM
from interpret import show  # optional HTML viz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import StandardScaler, OneHotEncoder
from sklearn.compose        import ColumnTransformer
from sklearn.metrics        import (accuracy_score, precision_score,
                                    recall_score, f1_score, roc_auc_score)
from sklearn.ensemble       import (RandomForestClassifier,
                                    GradientBoostingClassifier,
                                    HistGradientBoostingClassifier)
from sklearn.linear_model   import LogisticRegression
from sklearn.multiclass     import OneVsRestClassifier
from sklearn.svm            import SVC
import xgboost as xgb, lightgbm as lgb
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
TOP_N = 10                     # how many features to print from EBM

# -------------------- Load & clean --------------------
df = pd.read_csv(r"D:\jilo_new.csv", encoding="latin1")
df.drop(columns=[c for c in df.columns if "id" in c.lower()], inplace=True)

gcol = next((c for c in df.columns if "gender" in c.lower()), None)
if gcol:
    df[gcol] = df[gcol].astype(str).str.lower().str.strip() \
                       .map({"male": 1, "m": 1, "female": 0, "f": 0})

targets = ["Overall Risk", "Cardiac Risk", "Diabetic Risk", "Hypertension Risk"]
bad = {"unknown", "not assessed", "na", ""}
df = df.dropna(subset=targets)
for col in targets:
    df = df[~df[col].astype(str).str.lower().isin(bad)]

for c in df.columns:
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].mode(dropna=True)[0])

# -------------------- Preprocessing ------------------
X_full = df.drop(columns=targets)
cat_cols = X_full.select_dtypes(include="object").columns.tolist()
num_cols = X_full.select_dtypes(include=np.number).columns.tolist()

preproc = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])

# -------------------- Models -------------------------
base_models = {
    "LogReg":  LogisticRegression(max_iter=1000, class_weight="balanced"),
    "RandFor": RandomForestClassifier(n_estimators=150, max_depth=15,
                                      class_weight="balanced"),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False,
                                 eval_metric="mlogloss", max_depth=6),
    "SVC":     SVC(probability=True, kernel="rbf", class_weight="balanced"),
    "GradBoost": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1),
    "HistGB":   HistGradientBoostingClassifier(max_iter=150),
    "CatBoost": CatBoostClassifier(verbose=0),
    "LightGBM": lgb.LGBMClassifier()
}

# -------------------- Evaluation + EBM ----------------
results = {}

for tgt in targets:
    print(f"\n===== {tgt} =====")
    y = df[tgt].astype("category").cat.codes
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y, stratify=y, test_size=0.2, random_state=42)

    X_tr_p = preproc.fit_transform(X_train)
    X_te_p = preproc.transform(X_test)
    feat_names = preproc.get_feature_names_out()

    results[tgt] = {}

    # ---- 1. Benchmark your existing models ----------
    for name, base in base_models.items():
        model = OneVsRestClassifier(base)
        model.fit(X_tr_p, y_train)
        y_hat = model.predict(X_te_p)

        try:
            y_prob = model.predict_proba(X_te_p)
            auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
        except Exception:
            auc = np.nan

        results[tgt][name] = {
            "Accuracy":  accuracy_score(y_test, y_hat),
            "Precision": precision_score(y_test, y_hat, average="macro", zero_division=0),
            "Recall":    recall_score(y_test, y_hat, average="macro", zero_division=0),
            "F1-Score":  f1_score(y_test, y_hat, average="macro", zero_division=0),
            "ROC AUC":   auc
        }

    # ---- 2. Fit an Explainable Boosting Machine -----
    ebm = EBM(interactions=10, random_state=42)
    ebm.fit(X_tr_p, y_train)

    # Global importance (EBM scores are already additive)
    ebm_global = ebm.explain_global(name=f"{tgt}-EBM")
    scores = ebm_global.data()["scores"]          # list of feature scores
    idx_top = np.argsort(scores)[-TOP_N:][::-1]

    print(f"\n▶ Top-{TOP_N} EBM features for {tgt}")
    for i in idx_top:
        print(f"   {feat_names[i]}: {scores[i]:.4f}")

    # If you’re in a notebook and want the interactive dashboard:
    # show(ebm_global)

# -------------------- Comparison tables --------------
for tgt in targets:
    print(f"\n=== {tgt} Model Comparison ===")
    (pd.DataFrame(results[tgt]).T
       .sort_values("F1-Score", ascending=False)
       .round(3)
       .pipe(print))



===== Overall Risk =====
[LightGBM] [Info] Number of positive: 91, number of negative: 60
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 151, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.602649 -> initscore=0.416515
[LightGBM] [Info] Start training from score 0.416515
[LightGBM] [Info] Number of positive: 27, number of negative: 124
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 151, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178808 -> initscore=-1.52