# High-ROC AUC Tabular NN — v2.1 Live + Stronger Generalization

Upgrades focused on **higher ROC AUC** and fixing the **MPS `pin_memory` warning**:

**What's new**
- **MPS-safe data loading**: auto-detect Apple Silicon (MPS) and set `pin_memory=False`.
- **Model**: wider MLP, **BatchNorm1d**, **GELU** activation, **Embedding Dropout**, and **Input Dropout** for regularization.
- **Training**: gradient clipping, **CosineAnnealingWarmRestarts** scheduler, and **SWA (Stochastic Weight Averaging)** per fold for a small generalization bump.
- Includes the **live tracker**, fold logs, per-fold/OOF plots, and Kaggle-ready `submission.csv`.

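> Place `train.csv` / `test.csv` under `playground-series-s5e8/` (relative to the working directory), or point `TRAIN_PATH` / `TEST_PATH` in the CONFIG cell at your copies. Adjust the rest of the CONFIG if needed.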


In [None]:
# =========================
# CONFIG
# =========================
# Every tunable of the notebook lives in this cell; the later cells read these
# module-level names directly.
COMPETITION_NAME = "playground-series-s5e8"  # label only (also stored in config.json)
ID_COL = "id"               # identifier column; excluded from the feature list
TARGET_COL = "y"            # target column; must exist in the training CSV

# Files (paths are resolved relative to the working directory)
TRAIN_PATH = "playground-series-s5e8/train.csv"
TEST_PATH = "playground-series-s5e8/test.csv"

# CV
N_SPLITS = 7                # number of StratifiedKFold folds
RANDOM_SEED = 2025          # seeds the RNGs (set_seed) and the fold shuffling

# Model / Training
BATCH_SIZE = 2048           # mini-batch size for the train/val/test loaders
EPOCHS = 150                # maximum number of epochs per fold
PATIENCE = 20               # epochs without val-AUC improvement before early stopping
BASE_LR = 5e-4              # AdamW learning rate
WEIGHT_DECAY = 1e-4         # AdamW weight decay
HIDDEN_LAYERS = [2048, 1024, 512, 256, 128]  # width of each Linear+BatchNorm+GELU+Dropout stage
DROPOUT = 0.3               # dropout applied after every hidden stage
EMB_DROPOUT = 0.1           # dropout applied to the concatenated embeddings
INPUT_DROPOUT = 0.1         # dropout applied to the numeric inputs
USE_CLASS_WEIGHTS = True    # pass pos_weight = (1 - p) / p to BCEWithLogitsLoss

# Scheduler settings
COSINE_T0 = 15              # CosineAnnealingWarmRestarts T_0
COSINE_T_MULT = 2           # CosineAnnealingWarmRestarts T_mult
MIN_LR = 1e-7               # CosineAnnealingWarmRestarts eta_min

# SWA (Stochastic Weight Averaging) settings
USE_SWA = True
SWA_START_EPOCH = 30        # first epoch (1-based) whose weights enter the SWA average
SWA_LR = 2e-5               # swa_lr handed to SWALR

# Live tracking settings
LIVE_PRINT = True           # print live updates
LIVE_WRITE_EVERY = 1        # write to CSV every N epochs

# Output (auto-filled below)
# These placeholders are overwritten with real paths by the
# "IMPORTS & FOLDERS" cell once the run stamp is known.
RUN_STAMP = None
RUN_DIR = None               # root for this run
FIGS_DIR = None
FOLDS_DIR = None
LOGS_DIR = None
MODELS_DIR = None
ARTIFACTS_DIR = None

VERBOSE_EVERY = 1            # NOTE(review): not referenced in the visible cells - confirm it is still needed
SAVE_OOF = True              # write oof_predictions.csv at the end of the run


In [38]:
# =========================
# IMPORTS & FOLDERS
# =========================
import os, gc, math, random, json, time
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torch.optim.swa_utils import AveragedModel, SWALR, update_bn

from datetime import datetime, timezone

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Value handed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(RANDOM_SEED)

# Device selection: prefer CUDA, then Apple-Silicon MPS, otherwise CPU.
has_cuda = torch.cuda.is_available()
use_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
device = torch.device("cuda" if has_cuda else "mps" if use_mps else "cpu")

# Pinned host memory is only requested on CUDA; it stays off on MPS/CPU so
# the DataLoader does not warn.
pin_mem = device.type == "cuda"

print("Device:", device, "| pin_memory:", pin_mem)

# One timestamped folder per run, with a fixed set of sub-folders.
RUN_STAMP = time.strftime("%Y-%m-%d_%H-%M-%S")
RUN_DIR = Path(f"runs/{RUN_STAMP}")
FIGS_DIR = RUN_DIR / "figs"
FOLDS_DIR = RUN_DIR / "folds"
LOGS_DIR = RUN_DIR / "logs"
MODELS_DIR = RUN_DIR / "models"
ARTIFACTS_DIR = RUN_DIR / "artifacts"
for run_subdir in (RUN_DIR, FIGS_DIR, FOLDS_DIR, LOGS_DIR, MODELS_DIR, ARTIFACTS_DIR):
    run_subdir.mkdir(parents=True, exist_ok=True)

# Snapshot the configuration next to the outputs. Iterating over globals()
# (rather than over the key list) keeps the keys in definition order.
_CFG_KEYS = frozenset([
    "COMPETITION_NAME", "ID_COL", "TARGET_COL", "TRAIN_PATH", "TEST_PATH",
    "N_SPLITS", "RANDOM_SEED", "BATCH_SIZE", "EPOCHS", "PATIENCE",
    "BASE_LR", "WEIGHT_DECAY", "HIDDEN_LAYERS", "DROPOUT", "EMB_DROPOUT", "INPUT_DROPOUT",
    "USE_CLASS_WEIGHTS", "COSINE_T0", "COSINE_T_MULT", "MIN_LR", "USE_SWA", "SWA_START_EPOCH", "SWA_LR",
    "LIVE_PRINT", "LIVE_WRITE_EVERY", "RUN_STAMP",
])
cfg = {name: value for name, value in dict(globals()).items() if name in _CFG_KEYS}
with open(RUN_DIR / "config.json", "w") as cfg_file:
    json.dump(cfg, cfg_file, indent=2)

print("Run folder:", RUN_DIR.as_posix())


Device: mps | pin_memory: False
Run folder: runs/2025-08-12_10-17-24


In [39]:
# =========================
# LOAD DATA
# =========================
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
print("Train:", train.shape, "Test:", test.shape)

# Validate the schema with explicit raises instead of `assert`: assertions are
# removed when Python runs with -O, which would silently skip these checks.
if TARGET_COL not in train.columns:
    raise ValueError(f"TARGET_COL '{TARGET_COL}' missing")
if ID_COL not in train.columns:
    raise ValueError(f"ID_COL '{ID_COL}' missing")
if ID_COL not in test.columns:
    raise ValueError(f"ID_COL '{ID_COL}' missing in test")

# Every training column except the id and the target is a model feature, and
# each of them must also be present in the test frame.
feature_cols = [c for c in train.columns if c not in [TARGET_COL, ID_COL]]
missing_in_test = [c for c in feature_cols if c not in test.columns]
if missing_in_test:
    raise ValueError(f"Missing features in test: {missing_in_test}")

y = train[TARGET_COL].values
print("Target positive rate:", y.mean().round(6))


Train: (750000, 18) Test: (250000, 17)
Target positive rate: 0.120651


In [40]:
# =========================
# FEATURE TYPING & PREPROCESS
# =========================
# Split the features into categorical and numeric groups in one pass.
# Categorical = object-dtype columns plus signed-integer columns with at most
# 30 distinct training values; everything else is treated as numeric.
obj_cols, lowcard_int_cols = [], []
for col in feature_cols:
    col_dtype = train[col].dtype
    if col_dtype == 'object':
        obj_cols.append(col)
    elif str(col_dtype).startswith('int') and train[col].nunique() <= 30:
        lowcard_int_cols.append(col)

cat_cols = sorted(set(obj_cols) | set(lowcard_int_cols))
_cat_lookup = set(cat_cols)
num_cols = sorted(name for name in feature_cols if name not in _cat_lookup)
print(f"Categoricals ({len(cat_cols)}):", cat_cols[:20])
print(f"Numerics ({len(num_cols)}):", num_cols[:20])

# Rare category handling
RARE_NAME = "__RARE__"
MIN_CAT_COUNT = 10          # lower threshold to preserve more categories


def apply_rare(series: pd.Series, min_count: int = MIN_CAT_COUNT) -> pd.Series:
    """Collapse infrequent values of ``series`` into the ``RARE_NAME`` bucket.

    Args:
        series: Column whose values are counted with ``value_counts``.
        min_count: Values occurring fewer than this many times are replaced.

    Returns:
        A new Series in which every value seen fewer than ``min_count`` times
        is replaced by ``RARE_NAME``; all other entries are kept as they are.
    """
    frequencies = series.value_counts()
    rare_values = frequencies.index[(frequencies < min_count).to_numpy()]
    is_rare = series.isin(rare_values)
    return series.mask(is_rare, RARE_NAME)

# Encode every categorical column to integer codes with ONE mapping shared by
# train and test.
encoders = {}
n_train = len(train)
for c in cat_cols:
    # Stack train and test so the rare-value bucketing below is decided once,
    # on the same counts, for both frames. Bucketing them separately can send
    # the same raw value to different codes in train and in test.
    combined = pd.concat([train[c], test[c]], axis=0, ignore_index=True).astype(object)
    # Label missing values BEFORE the cast to str: on pandas < 3, astype(str)
    # turns NaN into the literal "nan", so a later fillna never fires.
    combined = combined.where(combined.notna(), "NA").astype(str)
    combined = apply_rare(combined)

    le = LabelEncoder()
    codes = le.fit_transform(combined)
    encoders[c] = le
    train[c] = codes[:n_train]
    test[c] = codes[n_train:]

# Standardise numeric columns with statistics fitted on the training frame only.
scaler = None
if len(num_cols) > 0:
    scaler = StandardScaler()
    train[num_cols] = scaler.fit_transform(train[num_cols])
    test[num_cols] = scaler.transform(test[num_cols])

# Embedding table sizes must cover every code the encoder can emit. Counting
# distinct TRAIN values under-sizes the table whenever a code occurs only in
# test, which would make the embedding lookup index out of range.
cat_cardinalities = [int(len(encoders[c].classes_)) for c in cat_cols]
cat_cardinalities


Categoricals (9): ['contact', 'default', 'education', 'housing', 'job', 'loan', 'marital', 'month', 'poutcome']
Numerics (7): ['age', 'balance', 'campaign', 'day', 'duration', 'pdays', 'previous']


[3, 2, 4, 2, 12, 2, 3, 12, 4]

In [41]:
# =========================
# DATASET
# =========================
class TabDataset(Dataset):
    """In-memory dataset that serves ``(num, cat)`` or ``(num, cat, y)`` rows.

    Args:
        df: Frame holding the feature columns.
        y: Optional target array; when omitted, items carry no label.
        num_cols: Names of the numeric columns (stored as float32).
        cat_cols: Names of the categorical columns (stored as int64).
    """

    def __init__(self, df, y=None, num_cols=None, cat_cols=None):
        n_rows = len(df)

        # A missing/empty column group becomes an (n_rows, 0) array so that
        # indexing and len() keep working.
        if num_cols:
            self.num = df[num_cols].to_numpy().astype(np.float32)
        else:
            self.num = np.zeros((n_rows, 0), np.float32)

        if cat_cols:
            self.cat = df[cat_cols].to_numpy().astype(np.int64)
        else:
            self.cat = np.zeros((n_rows, 0), np.int64)

        self.y = None if y is None else y.astype(np.float32)

    def __len__(self):
        return self.num.shape[0]

    def __getitem__(self, idx):
        features = (self.num[idx], self.cat[idx])
        if self.y is not None:
            return (*features, self.y[idx])
        return features


In [42]:
# =========================
# MODEL (GELU + BatchNorm + Embedding Dropout + Input Dropout)
# =========================
class TabularNN(nn.Module):
    """MLP over numeric inputs concatenated with learned categorical embeddings.

    Args:
        num_dim: Number of numeric input features.
        cat_cardinalities: Number of distinct codes of each categorical column.
        hidden_layers: Output width of each hidden stage, in order.
        dropout: Dropout probability after every hidden stage.
        emb_dropout: Dropout probability on the concatenated embeddings.
        input_dropout: Dropout probability on the numeric inputs.
    """

    def __init__(self, num_dim, cat_cardinalities, hidden_layers, dropout=0.25, emb_dropout=0.05, input_dropout=0.05):
        super().__init__()
        self.has_cat = len(cat_cardinalities) > 0
        self.has_num = num_dim > 0

        use_input_dropout = input_dropout > 0 and self.has_num
        self.input_dropout = nn.Dropout(input_dropout) if use_input_dropout else nn.Identity()

        # Embeddings: one table per categorical column, width between 4 and 64
        # growing with the column's cardinality.
        emb_total = 0
        if not self.has_cat:
            self.emb_layers = None
            self.emb_dropout = nn.Identity()
        else:
            self.emb_layers = nn.ModuleList()
            for cardinality in cat_cardinalities:
                width = int(min(64, max(4, round(1.6 * (cardinality ** 0.56)))))
                self.emb_layers.append(nn.Embedding(cardinality, width))
                emb_total += width
            self.emb_dropout = nn.Dropout(emb_dropout) if emb_dropout > 0 else nn.Identity()

        in_dim = (num_dim if self.has_num else 0) + emb_total

        # Hidden stages (Linear -> BatchNorm -> GELU -> Dropout), then a
        # single-logit head.
        stack = []
        fan_in = in_dim
        for width in hidden_layers:
            stack.extend((
                nn.Linear(fan_in, width),
                nn.BatchNorm1d(width),
                nn.GELU(),
                nn.Dropout(dropout),
            ))
            fan_in = width
        stack.append(nn.Linear(fan_in, 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, x_num, x_cat):
        """Return one logit per row, i.e. a tensor of shape ``(batch,)``."""
        parts = []

        if self.has_cat:
            # Column i of x_cat is looked up in embedding table i.
            looked_up = [table(x_cat[:, col]) for col, table in enumerate(self.emb_layers)]
            parts.append(self.emb_dropout(torch.cat(looked_up, dim=1)))

        if self.has_num:
            parts.append(self.input_dropout(x_num))

        fused = parts[0] if len(parts) <= 1 else torch.cat(parts, dim=1)
        return self.mlp(fused).squeeze(1)


In [43]:
# =========================
# LIVE TRACKER
# =========================
class LiveTracker:
    """Append per-epoch training status to a CSV and mirror the latest row to JSON.

    Rows are buffered in memory and appended to ``live_status.csv`` every
    ``write_every`` updates; ``status.json`` is rewritten on every update and
    always holds the most recent row.

    Args:
        logs_dir: Directory for ``live_status.csv`` and ``status.json``
            (created if missing).
        write_every: Number of buffered updates between CSV flushes (min. 1).
        live_print: When true, print a one-line summary on every update.
    """

    def __init__(self, logs_dir: Path, write_every=1, live_print=True):
        self.logs_dir = Path(logs_dir)
        self.logs_dir.mkdir(parents=True, exist_ok=True)
        self.csv_path = self.logs_dir / "live_status.csv"
        self.json_path = self.logs_dir / "status.json"
        self.write_every = max(1, int(write_every))
        self.live_print = live_print
        self.rows = []
        # Only write the header for a brand-new file so an existing log can be
        # appended to.
        if not self.csv_path.exists():
            with open(self.csv_path, "w") as f:
                f.write("timestamp,fold,epoch,train_loss,val_loss,val_auc,lr,best_val_auc,elapsed_s\n")

    @staticmethod
    def _fmt(value, spec, missing=""):
        """Format ``value`` with ``spec``, or return ``missing`` when it is None."""
        return missing if value is None else format(value, spec)

    def update(self, fold, epoch, train_loss, val_loss, val_auc, lr, best_val_auc, t0):
        """Record one epoch.

        ``train_loss``, ``val_loss``, ``val_auc`` and ``best_val_auc`` may be
        ``None``; they are then stored as null/empty and printed as ``n/a``
        instead of raising while formatting.

        Args:
            fold: Fold index.
            epoch: Epoch number within the fold.
            train_loss: Mean training loss, or None.
            val_loss: Mean validation loss, or None.
            val_auc: Validation ROC AUC, or None.
            lr: Current learning rate (required).
            best_val_auc: Best validation AUC so far, or None.
            t0: ``time.time()`` reference used to compute the elapsed seconds.
        """
        now = time.time()
        row = dict(
            timestamp=datetime.now(timezone.utc).isoformat(),
            fold=fold, epoch=epoch,
            train_loss=float(train_loss) if train_loss is not None else None,
            val_loss=float(val_loss) if val_loss is not None else None,
            val_auc=float(val_auc) if val_auc is not None else None,
            lr=float(lr), best_val_auc=float(best_val_auc) if best_val_auc is not None else None,
            elapsed_s=now - t0
        )
        self.rows.append(row)
        if self.live_print:
            fmt = self._fmt
            print(
                f"[fold {fold} | ep {epoch:03d}] "
                f"tr={fmt(train_loss, '.5f', 'n/a')} "
                f"va={fmt(val_loss, '.5f', 'n/a')} "
                f"AUC={fmt(val_auc, '.6f', 'n/a')} "
                f"best={fmt(best_val_auc, '.6f', 'n/a')} "
                f"lr={lr:.2e}"
            )
        # self.rows is emptied by flush(), so its length doubles as the
        # "updates since last flush" counter.
        if len(self.rows) % self.write_every == 0:
            self.flush()
        with open(self.json_path, "w") as f:
            json.dump(row, f, indent=2)

    def flush(self):
        """Append all buffered rows to the CSV and clear the buffer."""
        if not self.rows:
            return
        fmt = self._fmt
        lines = [
            ",".join([
                r["timestamp"], str(r["fold"]), str(r["epoch"]),
                fmt(r["train_loss"], ".6f"),
                fmt(r["val_loss"], ".6f"),
                fmt(r["val_auc"], ".6f"),
                f"{r['lr']:.6e}",
                fmt(r["best_val_auc"], ".6f"),
                f"{r['elapsed_s']:.2f}",
            ]) + "\n"
            for r in self.rows
        ]
        with open(self.csv_path, "a") as f:
            f.writelines(lines)
        self.rows = []


In [44]:
# =========================
# TRAINING HELPERS
# =========================
def epoch_loop(model, loader, criterion, optimizer=None, clip_grad=None):
    """Run one pass over ``loader``, training when an optimizer is supplied.

    Args:
        model: Module called as ``model(x_num, x_cat)`` and returning logits.
        loader: Iterable of ``(x_num, x_cat, y)`` batches. In evaluation mode
            ``(x_num, x_cat)`` batches without labels are accepted as well.
        criterion: Loss called as ``criterion(logit, y)`` on labeled batches.
        optimizer: If given, the model is put in train mode and updated after
            every batch; otherwise the model is evaluated without gradients.
        clip_grad: Optional max gradient norm applied before each optimizer
            step.

    Returns:
        ``(avg_loss, preds, y_true)``: the mean batch loss (``None`` if no
        batch had labels), the sigmoid probabilities concatenated over all
        batches, and the concatenated labels (``None`` if there were none).

    Raises:
        ValueError: If a batch is not a 3-tuple (or, in evaluation mode, a
            2-tuple).
    """
    is_train = optimizer is not None
    model.train(is_train)
    losses, preds, targs = [], [], []

    # Send batches to wherever the model lives instead of relying on a global;
    # parameterless models fall back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for batch in loader:
        # Decide labeled vs. unlabeled from the batch arity. A bare `except`
        # around the unpacking would also hide unrelated errors.
        n_items = len(batch)
        if n_items == 3:
            x_num, x_cat, y = batch
        elif n_items == 2 and not is_train:
            x_num, x_cat = batch
            y = None
        else:
            raise ValueError(
                f"expected a batch of 3 items (or 2 when evaluating), got {n_items}"
            )

        x_num = x_num.to(run_device)
        x_cat = x_cat.to(run_device)
        if y is not None:
            y = y.to(run_device)

        with torch.set_grad_enabled(is_train):
            logit = model(x_num, x_cat)
            prob = torch.sigmoid(logit)
            loss = criterion(logit, y) if y is not None else None

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            if clip_grad is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()

        if loss is not None:
            losses.append(loss.item())
            targs.append(y.detach().cpu().numpy())
        preds.append(prob.detach().cpu().numpy())

    preds = np.concatenate(preds) if preds else np.array([])
    y_true = np.concatenate(targs) if targs else None
    avg_loss = float(np.mean(losses)) if losses else None
    return avg_loss, preds, y_true

class EarlyStopper:
    """Patience-based early stopping that snapshots the best weights.

    Args:
        patience: Consecutive non-improving steps tolerated before stopping.
        mode: ``"max"`` when larger metrics are better; any other value means
            smaller is better.
        min_delta: Margin by which the metric must beat the best value to
            count as an improvement.
    """

    def __init__(self, patience=10, mode="max", min_delta=1e-6):
        self.patience = patience
        self.mode = mode
        self.min_delta = min_delta
        self.count = 0
        self.best_state = None
        # Start from the worst possible value so the first metric always wins.
        if mode == "max":
            self.best = -np.inf
        else:
            self.best = np.inf

    def step(self, metric, model):
        """Register ``metric``; return True (and snapshot ``model``) on improvement."""
        if self.mode == "max":
            improved = metric > self.best + self.min_delta
        else:
            improved = metric < self.best - self.min_delta

        if not improved:
            self.count += 1
            return False

        self.best = metric
        self.count = 0
        # Keep a detached CPU copy of every state_dict entry.
        self.best_state = {
            name: tensor.cpu().clone() for name, tensor in model.state_dict().items()
        }
        return True

    def should_stop(self):
        """Return True once ``patience`` non-improving steps have accumulated."""
        return self.patience <= self.count


In [45]:
# =========================
# STRATIFIED K-FOLD TRAINING (Cosine + SWA + Live Tracker)
# =========================
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

oof = np.zeros(len(train), dtype=np.float32)
test_preds = np.zeros((len(test), N_SPLITS), dtype=np.float32)

metrics_rows = []
tracker = LiveTracker(LOGS_DIR, write_every=LIVE_WRITE_EVERY, live_print=LIVE_PRINT)

# Optional class re-weighting: positives are weighted by (1 - p) / p, where p
# is the positive rate of the training frame.
pos_weight = None
if USE_CLASS_WEIGHTS:
    pos_ratio = train[TARGET_COL].mean()
    pos_weight_val = max(1e-6, (1.0 - pos_ratio) / max(1e-6, pos_ratio))
    pos_weight = torch.tensor([pos_weight_val], dtype=torch.float32, device=device)

# The test set is identical for every fold, so its dataset/loader are built once.
te_ds = TabDataset(test, None, num_cols, cat_cols)
te_loader = DataLoader(te_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=pin_mem)


def _refresh_bn_stats(averaged_model, loader):
    """Recompute the BatchNorm running statistics of ``averaged_model``.

    Same procedure as ``torch.optim.swa_utils.update_bn`` (reset the running
    stats, then accumulate a cumulative average over one pass in train mode),
    adapted to loaders that yield ``(x_num, x_cat, y)`` batches.
    """
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
    saved_momentum = {}
    for module in averaged_model.modules():
        if isinstance(module, bn_types):
            module.reset_running_stats()
            saved_momentum[module] = module.momentum
    if not saved_momentum:
        return

    was_training = averaged_model.training
    averaged_model.train()
    for module in saved_momentum:
        module.momentum = None  # None => cumulative moving average
    with torch.no_grad():
        for x_num, x_cat, _ in loader:
            averaged_model(x_num.to(device), x_cat.to(device))
    for module, momentum in saved_momentum.items():
        module.momentum = momentum
    averaged_model.train(was_training)


for fold, (tr_idx, va_idx) in enumerate(skf.split(train[feature_cols], train[TARGET_COL])):
    print(f"\n===== Fold {fold} =====")
    t0 = time.time()
    tr_df = train.iloc[tr_idx].reset_index(drop=True)
    va_df = train.iloc[va_idx].reset_index(drop=True)

    tr_ds = TabDataset(tr_df, tr_df[TARGET_COL].values, num_cols, cat_cols)
    va_ds = TabDataset(va_df, va_df[TARGET_COL].values, num_cols, cat_cols)

    tr_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=pin_mem)
    va_loader = DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=pin_mem)

    model = TabularNN(num_dim=len(num_cols), cat_cardinalities=cat_cardinalities,
                      hidden_layers=HIDDEN_LAYERS, dropout=DROPOUT, emb_dropout=EMB_DROPOUT, input_dropout=INPUT_DROPOUT).to(device)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None else nn.BCEWithLogitsLoss()

    optimizer = torch.optim.AdamW(model.parameters(), lr=BASE_LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=COSINE_T0, T_mult=COSINE_T_MULT, eta_min=MIN_LR)

    # SWA setup
    swa_model = AveragedModel(model) if USE_SWA else None
    swa_scheduler = SWALR(optimizer, swa_lr=SWA_LR) if USE_SWA else None

    early = EarlyStopper(patience=PATIENCE, mode="max")
    best_auc = -np.inf
    fold_log = []

    for epoch in range(1, EPOCHS+1):
        tr_loss, _, _ = epoch_loop(model, tr_loader, criterion, optimizer=optimizer, clip_grad=0.5)  # lower gradient clipping
        va_loss, va_pred, va_true = epoch_loop(model, va_loader, criterion, optimizer=None)
        va_auc = roc_auc_score(va_true, va_pred)

        # Exactly one scheduler owns the learning rate at any time: cosine
        # restarts before the SWA phase, SWALR once averaging has started.
        # Stepping both makes them overwrite each other's learning rate.
        if USE_SWA and epoch >= SWA_START_EPOCH:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            scheduler.step(epoch + fold)  # add fold to decorrelate cycles a bit

        fold_log.append(dict(epoch=epoch, train_loss=tr_loss, val_loss=va_loss, val_auc=float(va_auc),
                             lr=float(optimizer.param_groups[0]['lr'])))

        tracker.update(
            fold=fold, epoch=epoch, train_loss=tr_loss, val_loss=va_loss, val_auc=va_auc,
            lr=optimizer.param_groups[0]['lr'], best_val_auc=max(best_auc, va_auc), t0=t0
        )

        improved = early.step(va_auc, model)
        if improved:
            best_auc = va_auc
        if early.should_stop():
            print(f"Early stopping at epoch {epoch}. Best AUC: {best_auc:.6f}")
            break

    # Roll back to the best checkpoint seen during training. Without this the
    # predictions below come from the LAST epoch, not the one that produced
    # best_auc.
    weights_used = "last_epoch"
    if early.best_state is not None:
        model.load_state_dict(early.best_state)
        weights_used = "best_epoch"

    # Consider the SWA weights only if at least one checkpoint was averaged;
    # otherwise swa_model is still a copy of the untrained network.
    if USE_SWA and int(swa_model.n_averaged) > 0:
        _refresh_bn_stats(swa_model, tr_loader)
        _, preds_swa, targs_swa = epoch_loop(swa_model, va_loader, criterion, optimizer=None)
        auc_swa = roc_auc_score(targs_swa, preds_swa)
        if auc_swa >= best_auc:
            best_auc = auc_swa
            # Copy from the wrapped module: AveragedModel's own state_dict adds
            # a "module." prefix and an "n_averaged" entry that the base model
            # cannot load.
            model.load_state_dict(swa_model.module.state_dict())
            weights_used = "swa"
            print(f"SWA improved/kept AUC: {auc_swa:.6f}")

    # Save fold artifacts
    fold_dir = FOLDS_DIR / f"fold_{fold}"
    fold_dir.mkdir(exist_ok=True)

    log_df = pd.DataFrame(fold_log)
    log_df.to_csv(fold_dir / "train_log.csv", index=False)
    torch.save(model.state_dict(), fold_dir / "model.pth")

    # Validation predictions
    _, va_pred, va_true = epoch_loop(model, va_loader, criterion, optimizer=None)
    oof[va_idx] = va_pred.squeeze()

    # Test predictions
    _, te_pred, _ = epoch_loop(model, te_loader, criterion, optimizer=None)
    test_preds[:, fold] = te_pred.squeeze()

    # Per-fold metrics (written to metrics.csv after the loop).
    fold_auc = float(roc_auc_score(va_true, va_pred))
    metrics_rows.append(dict(
        fold=fold,
        best_val_auc=float(best_auc),
        final_val_auc=fold_auc,
        epochs_run=len(fold_log),
        weights_used=weights_used,
    ))
    print(f"[Fold {fold}] val AUC={fold_auc:.6f} | weights={weights_used} | epochs={len(fold_log)}")

    # Plots
    fig_roc, ax = plt.subplots()
    RocCurveDisplay.from_predictions(va_true, va_pred, ax=ax)
    ax.set_title(f"Fold {fold} ROC")
    fig_roc.savefig(fold_dir / "roc_curve.png", bbox_inches="tight")
    plt.close(fig_roc)

    fig_lc, ax = plt.subplots()
    ax.plot(log_df["epoch"], log_df["train_loss"], label="train_loss")
    ax.plot(log_df["epoch"], log_df["val_loss"], label="val_loss")
    ax.set_xlabel("Epoch"); ax.set_ylabel("Loss"); ax.set_title(f"Fold {fold} Loss"); ax.legend()
    fig_lc.savefig(fold_dir / "loss_curve.png", bbox_inches="tight")
    plt.close(fig_lc)

    fig_auc, ax = plt.subplots()
    ax.plot(log_df["epoch"], log_df["val_auc"], label="val_auc")
    ax.set_xlabel("Epoch"); ax.set_ylabel("ROC AUC"); ax.set_title(f"Fold {fold} Val AUC"); ax.legend()
    fig_auc.savefig(fold_dir / "val_auc_curve.png", bbox_inches="tight")
    plt.close(fig_auc)

    print("[Interim] OOF ROC AUC using completed folds only will be printed at the end.")

# Persist tracker rows still buffered (relevant when LIVE_WRITE_EVERY > 1).
tracker.flush()

# Final aggregate
metrics_df = pd.DataFrame(metrics_rows)
overall_auc = roc_auc_score(train[TARGET_COL].values, oof)
print("\nOOF ROC AUC:", overall_auc)

# Save OOF + metrics
if SAVE_OOF:
    oof_df = pd.DataFrame({ID_COL: train[ID_COL].values, "oof": oof, TARGET_COL: train[TARGET_COL].values})
    oof_df.to_csv(RUN_DIR / "oof_predictions.csv", index=False)

# One row per fold with its real scores (keeps the `fold` / `best_val_auc`
# columns and adds detail columns after them).
metrics_df.to_csv(RUN_DIR / "metrics.csv", index=False)

# Ensemble test preds
test_pred_mean = test_preds.mean(axis=1)
sub = pd.DataFrame({ID_COL: test[ID_COL].values, TARGET_COL: test_pred_mean})
sub.to_csv(RUN_DIR / "submission.csv", index=False)
print("Submission saved to:", (RUN_DIR / "submission.csv").as_posix())

# Overall plots
fig_all, ax = plt.subplots()
RocCurveDisplay.from_predictions(train[TARGET_COL].values, oof, ax=ax)
ax.set_title("OOF ROC Curve")
fig_all.savefig(FIGS_DIR / "oof_roc_curve.png", bbox_inches="tight")
plt.close(fig_all)

fig_hist, ax = plt.subplots()
ax.hist(oof, bins=50)
ax.set_title("OOF Prediction Distribution")
ax.set_xlabel("Predicted probability")
fig_hist.savefig(FIGS_DIR / "oof_pred_hist.png", bbox_inches="tight")
plt.close(fig_hist)

with open(RUN_DIR / "README.txt", "w") as f:
    f.write(
        "This folder contains outputs for a single run.\n"
        "- oof_predictions.csv: OOF probabilities with IDs and targets\n"
        "- submission.csv: ready for Kaggle submit\n"
        "- folds/*: per-fold model.pth, training logs, and plots (ROC, losses, AUC)\n"
        "- figs/*: overall figures (OOF ROC, hist)\n"
        "- logs/live_status.csv and logs/status.json: live tracking outputs\n"
    )

print("All artifacts saved under:", RUN_DIR.as_posix())



===== Fold 0 =====
[fold 0 | ep 001] tr=0.55765 va=0.45439 AUC=0.955639 best=0.955639 lr=9.76e-04
[fold 0 | ep 001] tr=0.55765 va=0.45439 AUC=0.955639 best=0.955639 lr=9.76e-04
[fold 0 | ep 002] tr=0.49772 va=0.44086 AUC=0.958382 best=0.958382 lr=9.05e-04
[fold 0 | ep 002] tr=0.49772 va=0.44086 AUC=0.958382 best=0.958382 lr=9.05e-04
[fold 0 | ep 003] tr=0.48458 va=0.43695 AUC=0.959206 best=0.959206 lr=7.94e-04
[fold 0 | ep 003] tr=0.48458 va=0.43695 AUC=0.959206 best=0.959206 lr=7.94e-04
[fold 0 | ep 004] tr=0.47706 va=0.43520 AUC=0.959584 best=0.959584 lr=6.55e-04
[fold 0 | ep 004] tr=0.47706 va=0.43520 AUC=0.959584 best=0.959584 lr=6.55e-04
[fold 0 | ep 005] tr=0.47249 va=0.43641 AUC=0.959260 best=0.959584 lr=5.01e-04
[fold 0 | ep 005] tr=0.47249 va=0.43641 AUC=0.959260 best=0.959584 lr=5.01e-04
[fold 0 | ep 006] tr=0.47041 va=0.42991 AUC=0.960388 best=0.960388 lr=3.46e-04
[fold 0 | ep 006] tr=0.47041 va=0.42991 AUC=0.960388 best=0.960388 lr=3.46e-04
[fold 0 | ep 007] tr=0.46598 va=