In [14]:
# ==============================================================
#  Cardiac Risk – Model Benchmark  (single train/test split)
# ==============================================================

import warnings, pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler, OneHotEncoder
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline
from sklearn.metrics         import (accuracy_score, precision_score,
                                     recall_score, f1_score, roc_auc_score)
from sklearn.linear_model     import LogisticRegression, SGDClassifier
from sklearn.naive_bayes      import GaussianNB
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.tree             import DecisionTreeClassifier
from sklearn.svm              import SVC, LinearSVC
from sklearn.ensemble         import (RandomForestClassifier, ExtraTreesClassifier,
                                      GradientBoostingClassifier, AdaBoostClassifier,
                                      HistGradientBoostingClassifier)
import xgboost  as xgb
import lightgbm as lgb
from catboost   import CatBoostClassifier
warnings.filterwarnings("ignore")

# ------------------------------------------------------------------
# 1.  FILES  ───────────────────────────────────────────────────────
# ------------------------------------------------------------------
# Raw string literal: the path contains a backslash followed by "g", which is
# not a valid escape sequence in a regular string literal. The resulting path
# value is unchanged.
DATA_PATH = Path(r"D:\generated_health_risk_data_100k.csv")   # training CSV
TEST_PATH = None                                # hold-out CSV or None
LABEL_COL = "Cardiac Risk"

# ------------------------------------------------------------------
# 2.  LOAD + BASIC CLEAN  ─────────────────────────────────────────
# ------------------------------------------------------------------
def load_frame(path) -> pd.DataFrame:
    """Read a UTF-8 CSV and strip surrounding whitespace from its column names."""
    return pd.read_csv(path, encoding="utf-8").rename(columns=str.strip)


df = load_frame(DATA_PATH)

# The hold-out frame stays None when no separate test file is configured.
test_df = load_frame(TEST_PATH) if TEST_PATH else None

if LABEL_COL not in df.columns:
    raise ValueError(f"'{LABEL_COL}' not in file. Headers: {list(df.columns)}")

# The hold-out file needs the label too; fail here with a clear message
# instead of a KeyError further down.
if test_df is not None and LABEL_COL not in test_df.columns:
    raise ValueError(
        f"'{LABEL_COL}' not in hold-out file. Headers: {list(test_df.columns)}")

def _is_id_column(name) -> bool:
    """Return True when a column name denotes an identifier.

    A name qualifies when "id" appears as a whole word after splitting on
    non-alphanumeric characters (e.g. "id", "Patient ID", "record_id"), or
    when it ends in a camelCase "Id"/"ID" suffix (e.g. "PatientId").
    A plain substring test is deliberately avoided: it would also match
    feature names such as "Triglycerides" or "Uric Acid".
    """
    text = str(name).strip()
    words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()
    if "id" in words:
        return True
    # camelCase suffix: lower-case letter immediately followed by "Id"/"ID".
    return len(text) > 2 and text[-2:] in ("Id", "ID") and text[-3].islower()


def clean(d: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``d``; the input frame is not modified.

    * Identifier columns (see ``_is_id_column``) are dropped.
    * If a "Gender" column exists it is encoded as 1 for "male"/"m" and
      0 for "female"/"f" (case- and whitespace-insensitive); any other value
      becomes NaN.
    """
    id_cols = [c for c in d.columns if _is_id_column(c)]
    d = d.drop(columns=id_cols)          # drop() returns a new frame
    if "Gender" in d.columns:
        d["Gender"] = (d["Gender"].astype(str).str.lower().str.strip()
                       .map({"male":1,"m":1,"female":0,"f":0}))
    return d

df = clean(df)
if test_df is not None:
    test_df = clean(test_df)

# ------------------------------------------------------------------
# 3.  TRAIN / TEST SPLIT  ─────────────────────────────────────────
# ------------------------------------------------------------------
# Single source of truth for the label encoding (used for train and test).
LABEL_CODES = {"low": 0, "moderate": 1, "high": 2}


def encode_labels(labels: pd.Series) -> pd.Series:
    """Map textual risk labels to integer codes via ``LABEL_CODES``.

    Labels are compared case-insensitively and with surrounding whitespace
    removed.

    Raises:
        ValueError: if any label (including a missing one) has no code, so
            that NaN targets never reach the estimators.
    """
    encoded = labels.str.strip().str.lower().map(LABEL_CODES)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(f"Unmapped '{LABEL_COL}' values: {unknown}")
    return encoded


if test_df is None:
    # No hold-out file: carve a stratified 20 % test set out of the data.
    train_df, test_df = train_test_split(
        df, test_size=0.20,
        stratify=df[LABEL_COL].str.strip().str.lower(), random_state=42)
else:
    train_df = df

y_train = encode_labels(train_df[LABEL_COL])
y_test  = encode_labels(test_df[LABEL_COL])

X_train = train_df.drop(columns=[LABEL_COL])
X_test  = test_df.drop(columns=[LABEL_COL])

# ------------------------------------------------------------------
# 4.  PREPROCESSOR  ───────────────────────────────────────────────
# ------------------------------------------------------------------
num_cols = X_train.select_dtypes("number").columns.tolist()
# Every non-numeric column is treated as categorical. Selecting only "object"
# dtype would leave bool / category / string columns out of both lists, and
# the ColumnTransformer below drops unlisted columns (remainder defaults to
# "drop").
cat_cols = [col for col in X_train.columns if col not in num_cols]

# Scale numeric features; one-hot encode the rest. Categories unseen during
# fit are encoded as all-zero rows instead of raising.
pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])

# ------------------------------------------------------------------
# 5.  MODEL ZOO (15 classifiers)  ─────────────────────────────────
# ------------------------------------------------------------------
# Display name -> unfitted estimator. Every estimator that accepts a seed and
# uses randomness is given random_state=42 so repeated runs of the benchmark
# produce the same table.
models = {
    "LogReg"        : LogisticRegression(max_iter=1500, class_weight="balanced"),
    "GaussianNB"    : GaussianNB(),
    "KNN"           : KNeighborsClassifier(n_neighbors=15),
    "DecisionTree"  : DecisionTreeClassifier(random_state=42),
    "LinearSVC"     : LinearSVC(class_weight="balanced", random_state=42),
    "SVC-RBF"       : SVC(kernel="rbf", probability=True, class_weight="balanced",
                          random_state=42),
    "SGD-Logistic"  : SGDClassifier(loss="log_loss", max_iter=2000,
                                    class_weight="balanced", random_state=42),
    "RandomForest"  : RandomForestClassifier(n_estimators=400, max_depth=18,
                                             class_weight="balanced", random_state=42),
    "ExtraTrees"    : ExtraTreesClassifier(n_estimators=400, random_state=42),
    "GradientBoost" : GradientBoostingClassifier(n_estimators=400, learning_rate=0.06,
                                                 random_state=42),
    "AdaBoost"      : AdaBoostClassifier(n_estimators=400, learning_rate=0.03,
                                         random_state=42),
    "HistGB"        : HistGradientBoostingClassifier(max_iter=400, random_state=42),
    # NOTE(review): use_label_encoder is deprecated in newer XGBoost releases —
    # confirm the installed version and drop the argument if it is unused.
    "XGBoost"       : xgb.XGBClassifier(use_label_encoder=False, eval_metric="mlogloss",
                                        max_depth=6, subsample=0.85,
                                        colsample_bytree=0.9, random_state=42),
    "LightGBM"      : lgb.LGBMClassifier(random_state=42),
    "CatBoost"      : CatBoostClassifier(verbose=0, random_state=42)
}

# ------------------------------------------------------------------
# 6.  FIT & EVALUATE  ─────────────────────────────────────────────
# ------------------------------------------------------------------
# Fit every model inside the shared preprocessing pipeline and collect
# macro-averaged test metrics per model name.
results = {}
for name, clf in models.items():
    pipe = Pipeline([("prep", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)

    # ROC-AUC needs class probabilities. Models without predict_proba
    # (the pipeline only exposes it when its final step does) keep NaN.
    auc = np.nan
    if hasattr(pipe, "predict_proba"):
        try:
            y_prob = pipe.predict_proba(X_test)
            auc    = roc_auc_score(y_test, y_prob, multi_class="ovr")
        except (ValueError, AttributeError) as err:
            # Best effort: report why the score is missing instead of
            # silently swallowing every possible exception.
            print(f"[{name}] ROC-AUC not computed: {err}")

    results[name] = {
        "Accuracy":  accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_test, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_test, y_pred, average="macro", zero_division=0),
        "ROC-AUC":   auc
    }

# ------------------------------------------------------------------
# 7.  RESULTS  ────────────────────────────────────────────────────
# ------------------------------------------------------------------
# One row per model, one column per metric, best macro-F1 first, 3 decimals.
metrics_by_model = pd.DataFrame(results).T
ranked_metrics = metrics_by_model.sort_values(by="F1-Score", ascending=False)
df_res = ranked_metrics.round(3)

print("\n=== Model Comparison (single train/test split) ===")
print(df_res)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 907
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 23
[LightGBM] [Info] Start training from score -0.600614
[LightGBM] [Info] Start training from score -1.198156
[LightGBM] [Info] Start training from score -1.898621

=== Model Comparison (single train/test split) ===
               Accuracy  Precision  Recall  F1-Score  ROC-AUC
CatBoost          0.943      0.938   0.937     0.938    0.992
XGBoost           0.940      0.934   0.934     0.934    0.991
LogReg            0.939      0.928   0.940     0.934    0.991
HistGB            0.940      0.934   0.933     0.934    0.991
GradientBoost     0.940      0.935   0.931     0.933    0.991
LightGBM          0.940      0.935   0.931     0.933    0.991


In [8]:
# --------------------------------------------------------------------------
# 2-bis.  Brute detection: single-feature perfect predictors
# --------------------------------------------------------------------------
# NOTE(review): LABEL, rng and y are not defined in any cell visible here —
# confirm an earlier cell creates them before this one runs.
# NOTE(review): y_train / y_test are reassigned inside the loop; the benchmark
# cell uses the same names, so run order matters.
perfect_cols = []
for col in df.drop(columns=[LABEL]).columns:
    # simple 80/20 split to test leakiness (a fresh random split per column)
    train_idx = rng.choice(df.index, size=int(0.8*len(df)), replace=False)
    test_idx  = df.index.difference(train_idx)
    y_train   = y.loc[train_idx]
    y_test    = y.loc[test_idx]

    x_train = df.loc[train_idx, col]
    x_test  = df.loc[test_idx,  col]

    # Most frequent feature value per class on the training part. The same
    # mode mapping is applied to every dtype (no rounding of numeric values).
    # A class whose training values are all missing has no mode: record NaN
    # for it instead of failing on an empty mode() result.
    mapper = x_train.groupby(y_train).agg(
        lambda s: s.mode().iloc[0] if s.notna().any() else float("nan"))
    # Invert to value -> class; test values without a mapping become -1.
    preds  = x_test.map({v:k for k,v in mapper.dropna().items()}).fillna(-1)

    if (preds == y_test).all():
        perfect_cols.append(col)

# Shuffle each flagged column in place to destroy its link to the label.
# This mutates df, so re-running the cell shuffles again.
for col in perfect_cols:
    df[col] = rng.permutation(df[col].values)

print("🔧  Scrambled single-feature perfect predictors:", perfect_cols)


🔧  Scrambled single-feature perfect predictors: []


In [6]:
# -- Probe B: any numeric column that matches the target codes 0/1/2 -------
# Collect every column whose values lie within {0, 1, 2} and equal y_enc
# element-wise in every row.
dup_cols = []
for c in X_enc.columns:
    if not (set(X_enc[c].unique()) <= {0,1,2}):
        continue                      # values outside the label codes
    if (X_enc[c] == y_enc).all():
        dup_cols.append(c)
print("\nExact numeric duplicates of the label:", dup_cols)



Exact numeric duplicates of the label: []
