In [3]:
import pandas as pd
import numpy as np
import warnings
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from scipy import sparse

warnings.filterwarnings("ignore")

# ------------------------------------------------------------------
# 1.  load and split
# ------------------------------------------------------------------
DATA_PATH = Path("D:\synthetic_diabetes_data_1000.csv")
LABEL     = "DiabeticRisk"

df = pd.read_csv(DATA_PATH)

# map labels
df[LABEL] = df[LABEL].str.lower().map({"low": 0, "moderate": 1, "high": 2})

# map gender
df["Gender"] = df["Gender"].str.lower().map({"male": 1, "female": 0})

# drop ID
df.drop(columns=["PatientID"], inplace=True)

# split
X = df.drop(columns=[LABEL])
y = df[LABEL]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

# ------------------------------------------------------------------
# 2.  preprocessing
# ------------------------------------------------------------------
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(include="object").columns.tolist()

pre = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# fit & transform, then convert to dense if sparse
X_train_p = pre.fit_transform(X_train)
if sparse.issparse(X_train_p):
    X_train_p = X_train_p.toarray()

X_test_p = pre.transform(X_test)
if sparse.issparse(X_test_p):
    X_test_p = X_test_p.toarray()

# ------------------------------------------------------------------
# 3.  model zoo
# ------------------------------------------------------------------
models = {
    "LogisticReg"  : LogisticRegression(max_iter=1000, class_weight="balanced"),
    "RandomForest" : RandomForestClassifier(
                         n_estimators=300, max_depth=18,
                         class_weight="balanced", random_state=42
                      ),
    "GradientBoost": GradientBoostingClassifier(
                         n_estimators=300, learning_rate=0.08
                      ),
    "HistGB"       : HistGradientBoostingClassifier(max_iter=300),
    "SVC-RBF"      : SVC(
                         kernel="rbf", probability=True,
                         class_weight="balanced"
                      ),
    "XGBoost"      : xgb.XGBClassifier(
                         use_label_encoder=False,
                         eval_metric="mlogloss",
                         max_depth=6, subsample=0.8,
                         colsample_bytree=0.8,
                         random_state=42
                      ),
    "LightGBM"     : lgb.LGBMClassifier(random_state=42),
    "CatBoost"     : CatBoostClassifier(verbose=0, random_state=42)
}

# ------------------------------------------------------------------
# 4.  training & evaluation
# ------------------------------------------------------------------
metrics = {}
for name, est in models.items():
    est.fit(X_train_p, y_train)
    y_pred = est.predict(X_test_p)
    try:
        y_prob = est.predict_proba(X_test_p)
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except Exception:
        auc = np.nan

    metrics[name] = {
        "Accuracy":  accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_test, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_test, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc
    }

# ------------------------------------------------------------------
# 5.  print results
# ------------------------------------------------------------------
results = (pd.DataFrame(metrics)
             .T
             .sort_values("F1-Score", ascending=False)
             .round(3))

print("\n=== Diabetic Risk – Train/Test Metrics ===")
print(results)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 596
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 36
[LightGBM] [Info] Start training from score -3.361977
[LightGBM] [Info] Start training from score -1.047919
[LightGBM] [Info] Start training from score -0.486675

=== Diabetic Risk – Train/Test Metrics ===
               Accuracy  Precision  Recall  F1-Score  ROC AUC
SVC-RBF           0.888      0.875   0.853     0.862    0.972
LogisticReg       0.868      0.795   0.895     0.832    0.971
CatBoost          0.864      0.900   0.652     0.694    0.954
XGBoost           0.860      0.897   0.650     0.691    0.929
HistGB            0.832      0.875   0.637     0.673    0.911
LightGBM          0.820      0.867   0.627     0.664    0.918
GradientBo