In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
from catboost import CatBoostClassifier, Pool

# 1) Load data
DATA_PATH = Path("D:\synthetic_diabetes_data_1000.csv")
df = pd.read_csv(DATA_PATH)

# 2) Prepare target
df = df.dropna(subset=["DiabeticRisk"])
df["DiabeticRisk"] = (
    df["DiabeticRisk"]
      .str.lower()
      .map({"low": 0, "moderate": 1, "high": 2})
      .astype(int)
)

# 3) Identify and clean categoricals
categorical_cols = df.select_dtypes(include="object").columns.tolist()
if "PatientID" in categorical_cols:
    categorical_cols.remove("PatientID")
for col in categorical_cols:
    df[col] = df[col].fillna("Missing").astype(str)

# 4) Split features/label
X = df.drop(columns=["PatientID", "DiabeticRisk"])
y = df["DiabeticRisk"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

# 5) Build CatBoost Pools
train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
test_pool  = Pool(X_test,  y_test,  cat_features=categorical_cols)

# 6) Configure and train the model
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    bagging_temperature=1.0,
    random_strength=1.0,
    border_count=64,
    one_hot_max_size=10,

    eval_metric="MultiClass",        # single‐value multiclass log-loss
    custom_metric=["AUC"],           # still log multiclass AUC each round

    early_stopping_rounds=50,
    verbose=100,
    thread_count=-1,
    random_seed=42,
    grow_policy="SymmetricTree"
)

model.fit(train_pool, eval_set=test_pool)

# 7) Predict on test
y_pred  = model.predict(X_test)
y_proba = model.predict_proba(X_test)

# 8) Compute and print final metrics
print("Accuracy     :", accuracy_score(y_test, y_pred))
print("Precision    :", precision_score(y_test, y_pred, average='macro', zero_division=0))
print("Recall       :", recall_score(y_test, y_pred, average='macro', zero_division=0))
print("F1-Score     :", f1_score(y_test, y_pred, average='macro', zero_division=0))

# Multiclass ROC AUC via one-vs-rest
auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
print("ROC AUC (OVR):", auc)


0:	learn: 1.0702429	test: 1.0699463	best: 1.0699463 (0)	total: 146ms	remaining: 2m 26s
100:	learn: 0.3890227	test: 0.4876237	best: 0.4876237 (100)	total: 653ms	remaining: 5.82s
200:	learn: 0.2395785	test: 0.3840848	best: 0.3840848 (200)	total: 1.16s	remaining: 4.61s
300:	learn: 0.1662490	test: 0.3432867	best: 0.3432867 (300)	total: 1.66s	remaining: 3.86s
400:	learn: 0.1241083	test: 0.3160369	best: 0.3160369 (400)	total: 2.16s	remaining: 3.23s
500:	learn: 0.0969761	test: 0.3028663	best: 0.3028121 (497)	total: 2.67s	remaining: 2.65s
600:	learn: 0.0776396	test: 0.2914331	best: 0.2914331 (600)	total: 3.16s	remaining: 2.1s
700:	learn: 0.0641138	test: 0.2853540	best: 0.2845661 (695)	total: 3.67s	remaining: 1.56s
800:	learn: 0.0537876	test: 0.2779327	best: 0.2778071 (795)	total: 4.18s	remaining: 1.04s
900:	learn: 0.0457068	test: 0.2732112	best: 0.2732112 (900)	total: 4.69s	remaining: 515ms
999:	learn: 0.0396009	test: 0.2717321	best: 0.2715264 (957)	total: 5.18s	remaining: 0us

bestTest = 0.27