In [2]:
import pandas as pd
import numpy as np
import warnings
from pathlib import Path
from scipy import sparse

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.svm import SVC

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Silence every warning category for the rest of the run (keeps the output
# readable, but also hides deprecation notices from the ML libraries).
warnings.filterwarnings("ignore")

# ------------------------------------------------------------------
# 1.  Load and prepare data
# ------------------------------------------------------------------
# Raw string: in a normal literal "\s" is an invalid escape sequence, which is
# deprecated and reported as a SyntaxWarning on recent Python versions.
DATA_PATH = Path(r"D:\synthetic_diabetes_data_100000.csv")
LABEL     = "DiabeticRisk"

df = pd.read_csv(DATA_PATH)

# Map target to numeric (case-insensitive). Series.map yields NaN for any
# value missing from the mapping, so check explicitly instead of letting NaN
# labels leak into the split and the estimators.
raw_labels = df[LABEL].str.lower()
df[LABEL] = raw_labels.map({"low": 0, "moderate": 1, "high": 2})
if df[LABEL].isna().any():
    unknown = sorted(raw_labels[df[LABEL].isna()].dropna().unique())
    raise ValueError(
        f"Column {LABEL!r} contains missing or unmapped values: {unknown}"
    )

# Map gender to numeric (case-insensitive); unmapped values become NaN
df["Gender"] = df["Gender"].str.lower().map({"male": 1, "female": 0})

# Drop ID column
df.drop(columns=["PatientID"], inplace=True)

# Split into train/test: 25% held out, stratified on the target so both
# partitions keep the same class proportions; fixed seed for reproducibility.
X = df.drop(columns=[LABEL])
y = df[LABEL]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

# ------------------------------------------------------------------
# 2.  Preprocessing pipeline
# ------------------------------------------------------------------
# Column groups are derived from the training frame's dtypes.
cat_cols = list(X_train.select_dtypes(include="object").columns)
num_cols = list(X_train.select_dtypes(include="number").columns)

# Numeric columns are standardised; object columns are one-hot encoded, with
# categories unseen during fit ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Fit on the training split only, then apply the fitted transformer to test.
X_train_p = preprocessor.fit_transform(X_train)
X_test_p = preprocessor.transform(X_test)

# The transformer may return a scipy sparse matrix; densify when it does.
X_train_p = X_train_p.toarray() if sparse.issparse(X_train_p) else X_train_p
X_test_p = X_test_p.toarray() if sparse.issparse(X_test_p) else X_test_p

# ------------------------------------------------------------------
# 3.  Define models
# ------------------------------------------------------------------
# Registry of candidate classifiers keyed by display name. Entries are added
# in the order they will be trained; every estimator is seeded with 42.
models = {}

# --- scikit-learn estimators ---
models["LogisticRegression"] = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
models["RandomForest"] = RandomForestClassifier(
    n_estimators=300,
    max_depth=18,
    class_weight="balanced",
    random_state=42,
)
models["GradientBoosting"] = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.08,
    random_state=42,
)
models["HistGradientBoosting"] = HistGradientBoostingClassifier(
    max_iter=300,
    random_state=42,
)
# probability=True makes predict_proba available, which the evaluation
# step uses for ROC AUC.
models["SVC-RBF"] = SVC(
    kernel="rbf",
    probability=True,
    class_weight="balanced",
    random_state=42,
)

# --- third-party gradient-boosting libraries ---
models["XGBoost"] = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
models["LightGBM"] = lgb.LGBMClassifier(random_state=42)
models["CatBoost"] = CatBoostClassifier(verbose=0, random_state=42)

# ------------------------------------------------------------------
# 4.  Train & evaluate
# ------------------------------------------------------------------
# For each model: fit on the preprocessed training data, predict on the
# preprocessed test data, and record macro-averaged metrics keyed by name.
metrics = {}
for name, model in models.items():
    model.fit(X_train_p, y_train)
    y_pred = model.predict(X_test_p)

    # ROC AUC (one-vs-rest) needs class probabilities. It is best-effort: a
    # failure must not abort the comparison after other models have already
    # been trained, so the broad catch is kept, but the reason is reported
    # instead of being swallowed, so a NaN in the table can be explained.
    try:
        y_prob = model.predict_proba(X_test_p)
        auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except Exception as exc:
        print(f"[{name}] ROC AUC unavailable ({type(exc).__name__}: {exc})")
        auc = np.nan

    # zero_division=0 scores a class with no predicted/true samples as 0
    # rather than emitting a warning.
    metrics[name] = {
        "Accuracy":  accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "Recall":    recall_score(y_test, y_pred, average="macro", zero_division=0),
        "F1-Score":  f1_score(y_test, y_pred, average="macro", zero_division=0),
        "ROC AUC":   auc
    }

# ------------------------------------------------------------------
# 5.  Display results
# ------------------------------------------------------------------
# One row per model, one column per metric; best macro F1 first, values
# rounded to three decimals.
results = pd.DataFrame(metrics).transpose()
results = results.sort_values(by="F1-Score", ascending=False)
results = results.round(decimals=3)

print("\n=== Diabetic Risk – Model Comparison ===", results, sep="\n")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 745
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 36
[LightGBM] [Info] Start training from score -3.478078
[LightGBM] [Info] Start training from score -1.140101
[LightGBM] [Info] Start training from score -0.431789

=== Diabetic Risk – Model Comparison ===
                      Accuracy  Precision  Recall  F1-Score  ROC AUC
CatBoost                 1.000      1.000   1.000     1.000    1.000
HistGradientBoosting     0.999      0.999   0.996     0.997    1.000
XGBoost                  0.998      0.998   0.984     0.991    1.000
LightGBM                 0.997      0.997   0.978     0.987    1.000
RandomForest             0.955      0.957   0.932     0.944    0.995
GradientBoosting         0.986