# 16_v5_ResMLP_v1 요약

- 모델: v5 실험(ResMLP/FT/앙상블 비교)
- 피처: v5 세트 (TE_ONLY/RAW_ONLY/SUMMARY_ONLY 비교)
- 학습/평가: KFold 5
- 제출파일: (실행 옵션에 따라 생성)


In [None]:
"""
v5 Feature Ablation Runner
- Keep v5 models (MLP + FTTransformer) & training logic
- Toggle feature groups (raw/derived/TE/etc.) per experiment
- Auto-naming submission files
"""

import os
import re
import random
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score


# ============================================================
# CONFIG
# ============================================================
# Seed passed to set_seed() and used as random_state of the StratifiedKFold splitter.
SEED = 42
# Number of stratified cross-validation folds.
N_FOLDS = 5
# Mini-batch size shared by the train / validation / test DataLoaders.
BATCH_SIZE = 512
# Default upper bound on training epochs per model (early stopping may end sooner).
EPOCHS = 50
# Default early-stopping patience: epochs without validation-AUC improvement.
PATIENCE = 8
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üñ•Ô∏è Device: {DEVICE}")

# Path priority: local -> repo default -> /mnt/data fallback
CANDIDATE_TRAIN = ["train.csv", "../../data/raw/train.csv", "/mnt/data/train.csv"]
CANDIDATE_TEST  = ["test_x.csv", "../../data/raw/test_x.csv", "/mnt/data/test_x.csv"]

def pick_existing(paths):
    """Return the first entry of ``paths`` that exists on disk.

    Candidates are probed in the given order with ``os.path.exists``.

    Raises:
        FileNotFoundError: if none of the candidates exists.
    """
    found = next((candidate for candidate in paths if os.path.exists(candidate)), None)
    if found is None:
        raise FileNotFoundError(f"None of these paths exist: {paths}")
    return found

# Resolve the input files at import time; pick_existing raises FileNotFoundError
# when no candidate path exists.
TRAIN_PATH = pick_existing(CANDIDATE_TRAIN)
TEST_PATH  = pick_existing(CANDIDATE_TEST)
print(f"üìÇ TRAIN_PATH: {TRAIN_PATH}")
print(f"üìÇ TEST_PATH : {TEST_PATH}")

# Directory that receives the submission CSVs; created if it does not exist yet.
OUTDIR = "outputs_v5_ablation"
os.makedirs(OUTDIR, exist_ok=True)


def set_seed(seed=SEED):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)


# ============================================================
# Load data
# ============================================================
# Read the raw train / test tables from the resolved paths.
train_raw = pd.read_csv(TRAIN_PATH)
test_raw  = pd.read_csv(TEST_PATH)

# Binary training target: 1 where `voted` equals 2, 0 otherwise.
train_raw["voted_bin"] = (train_raw["voted"] == 2).astype(int)
print(f"Train: {train_raw.shape}, Test: {test_raw.shape}")


# ============================================================
# Cleaning
# ============================================================
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Cleaning rules (each applied only to columns that are present):
      * ``education``, ``engnat``, ``hand``, ``married``, ``urban`` and
        ``tp01``..``tp10``: a value of 0 is replaced by NaN.
      * ``familysize``: values equal to 0 or greater than 15 are replaced by NaN.
      * ``QaE``..``QtE``: values are clipped to the range [100, 60000].

    Args:
        df: pandas DataFrame holding the raw survey columns.

    Returns:
        A new DataFrame with the rules above applied.
    """
    df = df.copy()

    # Series.mask builds a new (upcast where needed) column instead of writing
    # NaN into an integer column in place; the in-place form depends on silent
    # dtype upcasting, which pandas deprecated in 2.1.
    zero_means_missing = ["education", "engnat", "hand", "married", "urban"]
    zero_means_missing += [f"tp{i:02d}" for i in range(1, 11)]
    for col in zero_means_missing:
        if col in df.columns:
            df[col] = df[col].mask(df[col] == 0)

    # familysize: 0 and values above 15 are both treated as missing
    if "familysize" in df.columns:
        size = df["familysize"]
        df["familysize"] = size.mask((size == 0) | (size > 15))

    # Q_E clipping
    for col in [f"Q{c}E" for c in "abcdefghijklmnopqrst"]:
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df


# ============================================================
# Feature Engineering (same spirit as your v5)
# ============================================================
def build_features(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    Added column families, in insertion order: demographic flags, Q_A
    aggregates, per-item Q_E ``log1p`` columns plus aggregates, TP difference
    scores, WR/WF sums, and pairwise interaction products. The input frame is
    left untouched.
    """
    df = df.copy()
    letters = "abcdefghijklmnopqrst"

    # --- demographics -----------------------------------------------------
    age_rank = {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
    df["age_ord"] = df["age_group"].map(age_rank)
    age = df["age_ord"]
    df["is_teenager"] = age.eq(1).astype(int)
    df["is_young"] = age.le(2).astype(int)
    df["is_old"] = age.ge(6).astype(int)
    education = df["education"]
    df["edu_low"] = education.le(2).astype(float)
    df["edu_high"] = education.ge(3).astype(float)
    marital = df["married"]
    df["is_single"] = marital.eq(1).astype(float)
    df["is_married"] = marital.eq(2).astype(float)
    df["is_urban"] = df["urban"].eq(3).astype(float)
    df["is_english_native"] = df["engnat"].eq(1).astype(float)
    df["is_male"] = df["gender"].eq("Male").astype(int)

    # --- Q_A answers: row-wise aggregates -----------------------------------
    answers = df[[f"Q{c}A" for c in letters]]
    df["qa_mean"] = answers.mean(axis=1)
    df["qa_std"] = answers.std(axis=1)
    df["qa_range"] = answers.max(axis=1) - answers.min(axis=1)
    df["qa_extreme_ratio"] = (answers.eq(1) | answers.eq(5)).sum(axis=1) / 20
    df["qa_neutral_ratio"] = answers.eq(3).sum(axis=1) / 20
    # zero row-wise std means every answer in the row is identical
    df["qa_all_same"] = df["qa_std"].eq(0).astype(int)

    # --- Q_E values: log1p per item, then aggregates --------------------------
    qe_cols = [f"Q{c}E" for c in letters]
    qe_log_cols = [f"{name}_log" for name in qe_cols]
    for raw_name, log_name in zip(qe_cols, qe_log_cols):
        df[log_name] = np.log1p(df[raw_name])

    elapsed = df[qe_cols]
    elapsed_log = df[qe_log_cols]
    df["qe_log_mean"] = elapsed_log.mean(axis=1)
    df["qe_log_std"] = elapsed_log.std(axis=1)
    df["qe_fast_ratio"] = elapsed.lt(500).sum(axis=1) / 20
    df["qe_total_log"] = elapsed_log.sum(axis=1)
    fast_on_average = elapsed.mean(axis=1) < 500
    df["is_careless"] = (fast_on_average | df["qa_all_same"].eq(1)).astype(int)

    # --- TP items: paired differences ---------------------------------------
    tp_cols = [f"tp{i:02d}" for i in range(1, 11)]
    df["tp_missing_ratio"] = df[tp_cols].isna().sum(axis=1) / 10
    trait_pairs = {
        "extraversion": ("tp01", "tp06"),
        "agreeableness": ("tp07", "tp02"),
        "conscientiousness": ("tp03", "tp08"),
        "neuroticism": ("tp04", "tp09"),
        "openness": ("tp05", "tp10"),
    }
    for trait, (minuend, subtrahend) in trait_pairs.items():
        df[trait] = df[minuend] - df[subtrahend]
    df["tp_mean"] = df[tp_cols].mean(axis=1)

    # --- WR / WF item sums -----------------------------------------------------
    df["wr_sum"] = df[[f"wr_{i:02d}" for i in range(1, 14)]].sum(axis=1)
    df["wf_sum"] = df[[f"wf_{i:02d}" for i in range(1, 4)]].sum(axis=1)
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]
    df["vocab_high"] = df["wr_sum"].ge(11).astype(int)

    # --- interactions ---------------------------------------------------------
    products = {
        "age_edu": ("age_ord", "education"),
        "young_low_edu": ("is_young", "edu_low"),
        "young_single": ("is_young", "is_single"),
        "old_married": ("is_old", "is_married"),
        "teenager_low_edu": ("is_teenager", "edu_low"),
    }
    for name, (left, right) in products.items():
        df[name] = df[left] * df[right]

    return df


# ============================================================
# Target Encoding (fold-safe: fit on train_fold only)
# ============================================================
def target_encode(train_df, val_df, test_df, col, target_col, smoothing=10):
    """Smoothed target encoding of ``col``, fitted on ``train_df`` only.

    Each category is encoded as
    ``(count * mean + smoothing * global_mean) / (count + smoothing)``
    using the category statistics of ``train_df[target_col]``. Categories
    without a fitted value (unseen or missing) receive the global mean.

    Returns:
        Tuple ``(train, val, test)`` of arrays aligned with the rows of the
        three input frames.
    """
    prior = train_df[target_col].mean()
    stats = train_df.groupby(col)[target_col].agg(["mean", "count"])
    shrunk = (stats["count"] * stats["mean"] + smoothing * prior) / (stats["count"] + smoothing)
    lookup = shrunk.to_dict()

    def _encode(frame):
        # categories absent from the lookup map to NaN, then to the prior
        return frame[col].map(lookup).fillna(prior).values

    return _encode(train_df), _encode(val_df), _encode(test_df)


def create_target_encodings(train_df, val_df, test_df, target_col="voted_bin"):
    """Build every target-encoded feature for one train/val/test triple.

    Side effect: four composite category columns (``age_edu_cat``,
    ``age_married_cat``, ``age_race_cat``, ``age_edu_married_cat``) are added
    to each of the three frames in place.

    Returns:
        ``{"train": {...}, "val": {...}, "test": {...}}`` where each inner dict
        maps ``"<column>_te"`` to the encoded array for that split.
    """
    encoded = {"train": {}, "val": {}, "test": {}}

    def _store(column, smoothing):
        parts = target_encode(train_df, val_df, test_df, column, target_col, smoothing=smoothing)
        for split, values in zip(("train", "val", "test"), parts):
            encoded[split][f"{column}_te"] = values

    # single-column encodings
    for column in ("age_group", "race", "religion"):
        _store(column, 10)

    # composite keys have to exist on all three frames before encoding
    for frame in (train_df, val_df, test_df):
        age = frame["age_group"].astype(str)
        edu = frame["education"].astype(str)
        marital = frame["married"].astype(str)
        frame["age_edu_cat"] = age + "_" + edu
        frame["age_married_cat"] = age + "_" + marital
        frame["age_race_cat"] = age + "_" + frame["race"].astype(str)
        frame["age_edu_married_cat"] = age + "_" + edu + "_" + marital

    composite_smoothing = {
        "age_edu_cat": 5,
        "age_married_cat": 5,
        "age_race_cat": 5,
        "age_edu_married_cat": 3,
    }
    for column, smoothing in composite_smoothing.items():
        _store(column, smoothing)

    return encoded


# ============================================================
# Column pools (raw + derived)
# ============================================================
# Letter suffixes of the 20 questionnaire items (Qa..Qt).
_QUESTION_LETTERS = "abcdefghijklmnopqrst"

QA_RAW = ["Q" + letter + "A" for letter in _QUESTION_LETTERS]
QE_RAW = ["Q" + letter + "E" for letter in _QUESTION_LETTERS]
QE_LOG = [raw + "_log" for raw in QE_RAW]
TP_RAW = ["tp{:02d}".format(i) for i in range(1, 11)]
WR_RAW = ["wr_{:02d}".format(i) for i in range(1, 14)]
WF_RAW = ["wf_{:02d}".format(i) for i in range(1, 4)]

DEMO_BASE = [
    "age_ord",
    "education",
    "married",
    "urban",
    "engnat",
    "familysize",
    "hand",
]
DEMO_DERIVED = [
    "is_teenager",
    "is_young",
    "is_old",
    "edu_low",
    "edu_high",
    "is_single",
    "is_married",
    "is_urban",
    "is_english_native",
    "is_male",
]

QA_DERIVED = [
    "qa_mean",
    "qa_std",
    "qa_range",
    "qa_extreme_ratio",
    "qa_neutral_ratio",
    "qa_all_same",
]
QE_DERIVED = [
    "qe_log_mean",
    "qe_log_std",
    "qe_fast_ratio",
    "qe_total_log",
    "is_careless",
]
TP_DERIVED = [
    "tp_missing_ratio",
    "tp_mean",
    "extraversion",
    "agreeableness",
    "conscientiousness",
    "neuroticism",
    "openness",
]
WR_DERIVED = ["wr_sum", "wf_sum", "word_credibility", "vocab_high"]
INTERACTIONS = [
    "age_edu",
    "young_low_edu",
    "young_single",
    "old_married",
    "teenager_low_edu",
]

# Target-encoded columns, named "<source column>_te".
TE_FEATURES = [
    "age_group_te",
    "race_te",
    "religion_te",
    "age_edu_cat_te",
    "age_married_cat_te",
    "age_race_cat_te",
    "age_edu_married_cat_te",
]

CAT_FEATURES = ["gender", "race", "religion"]


# ============================================================
# Feature Groups (switchable)
# ============================================================
# Numeric groups; "qe_log" holds the log versions only.
FEATURE_GROUPS = dict(
    demo_base=DEMO_BASE,
    demo_derived=DEMO_DERIVED,
    qa_raw=QA_RAW,
    qa_derived=QA_DERIVED,
    qe_log=QE_LOG,
    qe_derived=QE_DERIVED,
    tp_raw=TP_RAW,
    tp_derived=TP_DERIVED,
    wr_raw=WR_RAW,
    wf_raw=WF_RAW,
    wr_derived=WR_DERIVED,
    interactions=INTERACTIONS,
    te=TE_FEATURES,
)

# categorical group (always used by embedding)
CAT_GROUP = dict(cat_basic=CAT_FEATURES)


def resolve_feature_list(include_groups):
    """Expand group names into a flat, de-duplicated list of column names.

    Columns appear in first-seen order, walking ``include_groups`` left to
    right and each group's columns in their declared order.

    Raises:
        KeyError: if a name in ``include_groups`` is not in ``FEATURE_GROUPS``.
    """
    # dict keys keep insertion order, so this de-duplicates without reordering
    ordered = {}
    for group in include_groups:
        for column in FEATURE_GROUPS[group]:
            ordered.setdefault(column, None)
    return list(ordered)


# ============================================================
# Experiments (edit here)
# ============================================================
# Group lists shared by several experiments; every experiment gets its own copy.
_CORE_GROUPS = ["demo_base", "demo_derived", "te"]
_EXTENDED_GROUPS = _CORE_GROUPS + [
    "wr_raw", "wf_raw", "wr_derived",
    "tp_raw", "tp_derived",
    "interactions",
]

EXPERIMENTS = {
    # Baselines
    "S0_demo_te": list(_CORE_GROUPS),
    "S1_demo_te_wr_tp": list(_EXTENDED_GROUPS),

    # Q_A tests
    "S2_add_QA_derived": _EXTENDED_GROUPS + ["qa_derived"],
    "S3_add_QA_raw20": _EXTENDED_GROUPS + ["qa_raw"],

    # Q_E tests
    "S4_add_QE_derived": _EXTENDED_GROUPS + ["qe_derived"],
    "S5_add_QE_log20": _EXTENDED_GROUPS + ["qe_log"],

    # Full-ish (your v5 spirit)
    "S6_full_v5like": [
        "demo_base", "demo_derived", "te",
        "qa_raw", "qa_derived",
        "qe_log", "qe_derived",
        "tp_raw", "tp_derived",
        "wr_raw", "wf_raw", "wr_derived",
        "interactions",
    ],
}


# ============================================================
# Dataset
# ============================================================
class TabDataset(Dataset):
    """Dataset over numeric features, categorical codes and optional labels.

    ``X_num`` is stored as a float32 tensor and ``X_cat`` as a long tensor.
    When ``y`` is given it is stored as float32 with a trailing axis of size 1.
    Items are ``(x_num, x_cat)`` without labels and ``(x_num, x_cat, y)`` with.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        features = (self.X_num[idx], self.X_cat[idx])
        if self.y is not None:
            return features + (self.y[idx],)
        return features


# ============================================================
# Model 1: MLP
# ============================================================
class MLP(nn.Module):
    """Feed-forward network over numeric features and embedded categoricals.

    Each categorical feature gets an ``nn.Embedding`` with ``dim + 1`` rows.
    The embeddings are concatenated with the numeric features and passed
    through ``Linear -> BatchNorm1d -> ReLU -> Dropout`` blocks, followed by a
    single-logit output layer.

    Args:
        num_features: number of numeric input columns.
        cat_dims: cardinality of each categorical feature, in column order.
        embed_dim: embedding width per categorical feature.
        hidden_dims: widths of the hidden blocks; an empty sequence yields a
            plain linear model on the concatenated input.
        dropout: dropout probability used in every hidden block.
    """

    # The default is a tuple so it cannot be mutated across instances.
    def __init__(self, num_features, cat_dims, embed_dim=8, hidden_dims=(256, 128, 64), dropout=0.3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim + 1, embed_dim) for dim in cat_dims])

        prev_dim = num_features + len(cat_dims) * embed_dim
        layers = []
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim

        self.mlp = nn.Sequential(*layers)
        # prev_dim equals the last hidden width, or the input width when
        # hidden_dims is empty (indexing hidden_dims[-1] would fail there).
        self.output = nn.Linear(prev_dim, 1)
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every ``nn.Linear``."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x_num, x_cat):
        """Return one logit per row, shape ``(batch, 1)``.

        ``x_cat[:, i]` holds the integer codes for the i-th embedding.
        """
        # One concatenation also covers the case of no categorical features.
        parts = [x_num]
        parts.extend(emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings))
        x = torch.cat(parts, dim=1)
        x = self.mlp(x)
        return self.output(x)


# ============================================================
# Model 2: FT-Transformer (light)
# ============================================================
class NumericalEmbedding(nn.Module):
    """Turn each numeric feature into a ``d_token``-wide token.

    Every feature has its own learned weight and bias vector; the token for
    feature ``j`` is ``x[..., j] * weight[j] + bias[j]``.
    """

    def __init__(self, num_features, d_token):
        super().__init__()
        initial_weight = torch.randn(num_features, d_token) * 0.02
        initial_bias = torch.zeros(num_features, d_token)
        self.weight = nn.Parameter(initial_weight)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        # add a trailing axis so the per-feature vectors broadcast over it
        expanded = x.unsqueeze(-1)
        return expanded * self.weight + self.bias


class FTTransformer(nn.Module):
    """Light FT-Transformer: one token per feature plus a learned CLS token.

    Numeric features are tokenised by ``NumericalEmbedding`` and categorical
    features by one ``nn.Embedding`` each (``dim + 1`` rows). The sequence
    ``[CLS, numeric tokens, categorical tokens]`` goes through a pre-norm
    Transformer encoder, and the head maps the encoded CLS position to a
    single logit.
    """

    def __init__(self, num_features, cat_dims, d_token=48, n_layers=2, n_heads=4, dropout=0.2):
        super().__init__()
        self.num_embed = NumericalEmbedding(num_features, d_token)
        self.cat_embeds = nn.ModuleList(
            [nn.Embedding(cardinality + 1, d_token) for cardinality in cat_dims]
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_token) * 0.02)

        block = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=d_token * 2,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(block, num_layers=n_layers)

        half_width = d_token // 2
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, half_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, 1),
        )

    def forward(self, x_num, x_cat):
        """Return one logit per row, shape ``(batch, 1)``."""
        numeric = self.num_embed(x_num)  # (B, n_num, d)
        categorical = torch.stack(
            [embed(x_cat[:, i]) for i, embed in enumerate(self.cat_embeds)], dim=1
        )  # (B, n_cat, d)

        # broadcast the single learned CLS token across the batch
        cls = self.cls_token.expand(numeric.size(0), -1, -1)
        sequence = torch.cat([cls, numeric, categorical], dim=1)

        encoded = self.transformer(sequence)
        return self.head(encoded[:, 0])


# ============================================================
# Train / Predict
# ============================================================
def train_model(model, train_loader, val_loader, val_y, device, epochs=EPOCHS, patience=PATIENCE, lr=1e-3):
    """Train ``model`` with early stopping on validation ROC-AUC.

    Args:
        model: module called as ``model(X_num, X_cat)`` and returning logits.
        train_loader: yields ``(X_num, X_cat, y)`` training batches.
        val_loader: yields ``(X_num, X_cat, y)`` validation batches; its labels
            are ignored here, ``val_y`` is used for scoring instead.
        val_y: validation labels, in the order ``val_loader`` yields its rows.
        device: device the model and the batches are moved to.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without improvement.
        lr: AdamW learning rate.

    Returns:
        ``(model, best_auc)``: the model with the best-epoch weights restored
        (when any epoch improved on the initial 0.0) and that epoch's AUC.
    """
    model.to(device)

    # Class weight for the positive label: negatives / positives.
    # NOTE(review): the ratio is computed from the validation labels, not the
    # training labels — confirm this is intended.
    pos_ratio = float(np.mean(val_y))
    pos_weight = torch.tensor([(1 - pos_ratio) / (pos_ratio + 1e-6)], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # mode="max": the learning rate is halved when validation AUC stops rising.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

    best_auc = 0.0
    best_state = None
    no_improve = 0

    for epoch in range(epochs):
        # ---- one training pass ----
        model.train()
        for X_num, X_cat, y in train_loader:
            X_num, X_cat, y = X_num.to(device), X_cat.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_num, X_cat), y)
            loss.backward()
            # clip the global gradient norm to 1.0 before the optimizer step
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # ---- validation pass: sigmoid probabilities for every batch ----
        model.eval()
        val_preds = []
        with torch.no_grad():
            for X_num, X_cat, _ in val_loader:
                X_num, X_cat = X_num.to(device), X_cat.to(device)
                val_preds.append(torch.sigmoid(model(X_num, X_cat)).cpu().numpy())

        val_preds = np.concatenate(val_preds).ravel()
        val_auc = roc_auc_score(val_y, val_preds)
        scheduler.step(val_auc)

        # An epoch counts as an improvement only if AUC rises by more than 1e-5.
        if val_auc > best_auc + 1e-5:
            best_auc = val_auc
            # snapshot on the CPU so later epochs cannot overwrite the best weights
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1

        if no_improve >= patience:
            break

    # Restore the best snapshot; skipped when no epoch ever improved.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_auc


def predict(model, loader, device):
    """Return sigmoid probabilities for every row yielded by ``loader``.

    Only the first two elements of each batch (numeric and categorical
    inputs) are used, so labelled and unlabelled loaders both work. The
    result is a flat 1-D NumPy array in loader order.
    """
    model.eval()
    with torch.no_grad():
        chunks = [
            torch.sigmoid(model(batch[0].to(device), batch[1].to(device))).cpu().numpy()
            for batch in loader
        ]
    return np.concatenate(chunks).ravel()


# ============================================================
# Helpers
# ============================================================
def sanitize(name: str) -> str:
    """Drop every character other than ASCII letters, digits, ``_`` and ``-``."""
    kept_pieces = re.split(r"[^a-zA-Z0-9_\-]+", name)
    return "".join(kept_pieces)

def submission_name(exp_name: str, oof_auc: float, seed: int) -> str:
    """Build the submission file name for one experiment.

    The name combines the sanitized experiment name, the OOF AUC with five
    decimals, the seed, and the current local time to the minute.
    """
    safe_name = sanitize(exp_name)
    stamp = datetime.now().strftime("%Y%m%d-%H%M")
    return f"sub_v5exp-{safe_name}_oof-{oof_auc:.5f}_seed{seed}_{stamp}.csv"

def ensure_columns(df, cols):
    """Raise ``KeyError`` unless every name in ``cols`` is a column of ``df``.

    The message lists at most the first ten missing names plus the total count.
    """
    available = df.columns
    missing = [name for name in cols if name not in available]
    if not missing:
        return
    raise KeyError(f"Missing columns in df: {missing[:10]} ... total {len(missing)}")


# ============================================================
# One experiment runner
# ============================================================
def run_experiment(exp_name: str, include_groups: list[str], save_submission=True):
    """Run one feature-ablation experiment with stratified K-fold CV.

    For every fold the function builds features, fits target encodings on the
    training part only, imputes NaNs with training medians, standardises,
    label-encodes the categorical columns, and trains an MLP and an
    FT-Transformer. Out-of-fold (OOF) predictions are collected for both
    models and test predictions are averaged across folds. Finally a coarse
    grid picks the MLP/FT blend weight with the highest OOF AUC.

    Args:
        exp_name: label used in the log output and in the submission file name.
        include_groups: keys of ``FEATURE_GROUPS`` to use; ``"te"`` switches on
            the per-fold target-encoded features.
        save_submission: when true, write the blended test predictions as a
            CSV into ``OUTDIR``.

    Returns:
        Dict with keys ``exp``, ``oof_mlp``, ``oof_ft``, ``oof_avg``,
        ``oof_best`` and ``best_w``.

    Raises:
        KeyError: if a group name is unknown or a resolved feature column is
            missing after feature building.
    """
    print("\n" + "="*80)
    print(f"üß™ EXP: {exp_name}")
    print(f"   groups: {include_groups}")
    print("="*80)

    # Re-seed per experiment so every experiment starts from the same RNG state.
    set_seed(SEED)
    train_clean = clean_data(train_raw)
    test_clean = clean_data(test_raw)

    # resolved numeric feature columns (excluding TE since TE is computed per fold)
    num_cols = resolve_feature_list([g for g in include_groups if g != "te"])
    use_te = ("te" in include_groups)

    # Accumulators: OOF predictions per training row, fold-averaged test
    # predictions, and the best validation AUC of each fold per model.
    oof_mlp = np.zeros(len(train_clean))
    oof_ft  = np.zeros(len(train_clean))
    test_mlp = np.zeros(len(test_clean))
    test_ft  = np.zeros(len(test_clean))
    fold_auc_mlp, fold_auc_ft = [], []

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train_clean, train_clean["voted_bin"])):
        print(f"\n--- Fold {fold+1}/{N_FOLDS} ---")

        tr_df = train_clean.iloc[tr_idx].copy().reset_index(drop=True)
        va_df = train_clean.iloc[va_idx].copy().reset_index(drop=True)
        te_df = test_clean.copy()

        tr_fe = build_features(tr_df)
        va_fe = build_features(va_df)
        te_fe = build_features(te_df)

        # ensure columns exist
        ensure_columns(tr_fe, num_cols)
        ensure_columns(va_fe, num_cols)
        ensure_columns(te_fe, num_cols)

        X_tr = tr_fe[num_cols].copy()
        X_va = va_fe[num_cols].copy()
        X_te = te_fe[num_cols].copy()

        # add TE (fitted on the training part of this fold only)
        if use_te:
            te_dict = create_target_encodings(tr_fe, va_fe, te_fe, target_col="voted_bin")
            for te_name in TE_FEATURES:
                X_tr[te_name] = te_dict["train"][te_name]
                X_va[te_name] = te_dict["val"][te_name]
                X_te[te_name] = te_dict["test"][te_name]

        all_num_cols = list(X_tr.columns)

        # NaN -> median from train (0 when the training column is entirely NaN)
        for c in all_num_cols:
            med = X_tr[c].median()
            if pd.isna(med):
                med = 0
            X_tr[c] = X_tr[c].fillna(med)
            X_va[c] = X_va[c].fillna(med)
            X_te[c] = X_te[c].fillna(med)

        # scaling (statistics come from the training part only)
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr.values)
        X_va_s = scaler.transform(X_va.values)
        X_te_s = scaler.transform(X_te.values)

        # categorical encoding (always)
        cat_dims = []
        X_cat_tr, X_cat_va, X_cat_te = [], [], []

        for col in CAT_FEATURES:
            le = LabelEncoder()
            # Fit on the union of train/val/test values (missing -> "NaN") plus
            # an extra "UNK" class, so every transform below finds its label.
            all_vals = list(set(tr_fe[col].fillna("NaN").astype(str)) |
                           set(va_fe[col].fillna("NaN").astype(str)) |
                           set(te_fe[col].fillna("NaN").astype(str)))
            le.fit(all_vals + ["UNK"])
            cat_dims.append(len(le.classes_))

            X_cat_tr.append(le.transform(tr_fe[col].fillna("NaN").astype(str)))
            X_cat_va.append(le.transform(va_fe[col].fillna("NaN").astype(str)))
            X_cat_te.append(le.transform(te_fe[col].fillna("NaN").astype(str)))

        X_cat_tr = np.stack(X_cat_tr, axis=1)
        X_cat_va = np.stack(X_cat_va, axis=1)
        X_cat_te = np.stack(X_cat_te, axis=1)

        y_tr = tr_fe["voted_bin"].values.astype(np.float32)
        y_va = va_fe["voted_bin"].values.astype(np.float32)

        # loaders
        train_ds = TabDataset(X_tr_s, X_cat_tr, y_tr)
        val_ds   = TabDataset(X_va_s, X_cat_va, y_va)
        test_ds  = TabDataset(X_te_s, X_cat_te)

        # drop_last=True discards the final incomplete training batch.
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
        test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

        # MLP
        mlp = MLP(num_features=len(all_num_cols), cat_dims=cat_dims, embed_dim=8,
                  hidden_dims=[256, 128, 64], dropout=0.3)
        mlp, auc_m = train_model(mlp, train_loader, val_loader, y_va, DEVICE, lr=1e-3)
        fold_auc_mlp.append(auc_m)

        oof_mlp[va_idx] = predict(mlp, val_loader, DEVICE)
        # each fold contributes 1/N_FOLDS of the averaged test prediction
        test_mlp += predict(mlp, test_loader, DEVICE) / N_FOLDS
        print(f"   MLP fold AUC: {auc_m:.5f}")

        # FT
        ft = FTTransformer(num_features=len(all_num_cols), cat_dims=cat_dims, d_token=48, n_layers=2, n_heads=4, dropout=0.2)
        ft, auc_f = train_model(ft, train_loader, val_loader, y_va, DEVICE, lr=5e-4)
        fold_auc_ft.append(auc_f)

        oof_ft[va_idx] = predict(ft, val_loader, DEVICE)
        test_ft += predict(ft, test_loader, DEVICE) / N_FOLDS
        print(f"   FT  fold AUC: {auc_f:.5f}")

    # OOF scores
    y_true = train_clean["voted_bin"].values
    auc_oof_mlp = roc_auc_score(y_true, oof_mlp)
    auc_oof_ft  = roc_auc_score(y_true, oof_ft)

    oof_avg = (oof_mlp + oof_ft) / 2
    auc_oof_avg = roc_auc_score(y_true, oof_avg)

    # weight search (coarse); w is the MLP share, 1 - w the FT share.
    # NOTE(review): the weight is selected on the same OOF predictions it is
    # scored on, so `best_auc` is an optimistic estimate.
    best_w = 0.5
    best_auc = auc_oof_avg
    for w in [0.3, 0.4, 0.5, 0.6, 0.7]:
        auc_w = roc_auc_score(y_true, w * oof_mlp + (1 - w) * oof_ft)
        if auc_w > best_auc:
            best_auc = auc_w
            best_w = w

    print("\n" + "-"*80)
    print(f"OOF AUC - MLP : {auc_oof_mlp:.5f}  | folds: {[f'{x:.5f}' for x in fold_auc_mlp]}")
    print(f"OOF AUC - FT  : {auc_oof_ft:.5f}  | folds: {[f'{x:.5f}' for x in fold_auc_ft]}")
    print(f"OOF AUC - AVG : {auc_oof_avg:.5f}")
    print(f"OOF AUC - BEST WEIGHT: w={best_w:.1f} (MLP) + {1-best_w:.1f} (FT) => {best_auc:.5f}")
    print("-"*80)

    # final preds: blend the fold-averaged test predictions with the chosen weight
    final_test = best_w * test_mlp + (1 - best_w) * test_ft

    if save_submission:
        fname = submission_name(exp_name, best_auc, SEED)
        outpath = os.path.join(OUTDIR, fname)

        # Use the test table's own "index" column when present, else 0..n-1.
        sub = pd.DataFrame({
            "index": test_raw["index"] if "index" in test_raw.columns else np.arange(len(test_raw)),
            "voted": final_test
        })
        sub.to_csv(outpath, index=False)
        print(f"üíæ saved: {outpath}")
        print(f"   pred range: [{final_test.min():.4f}, {final_test.max():.4f}]")

    return {
        "exp": exp_name,
        "oof_mlp": auc_oof_mlp,
        "oof_ft": auc_oof_ft,
        "oof_avg": auc_oof_avg,
        "oof_best": best_auc,
        "best_w": best_w
    }


# ============================================================
# Main: run selected experiments
# ============================================================
if __name__ == "__main__":
    # Run every configured experiment, writing one submission file per run.
    results = []
    for exp_name, groups in EXPERIMENTS.items():
        results.append(run_experiment(exp_name, groups, save_submission=True))

    # summary: highest blended OOF AUC first
    results.sort(key=lambda x: x["oof_best"], reverse=True)
    rule = "="*80
    print("\n\n" + rule)
    print("üèÅ EXPERIMENT SUMMARY (sorted by oof_best)")
    print(rule)
    for r in results:
        print(f"{r['exp']:<22} | best={r['oof_best']:.5f} | avg={r['oof_avg']:.5f} | mlp={r['oof_mlp']:.5f} | ft={r['oof_ft']:.5f} | w={r['best_w']:.1f}")


üñ•Ô∏è Device: cpu
üìÇ TRAIN_PATH: ../../data/raw/train.csv
üìÇ TEST_PATH : ../../data/raw/test_x.csv
Train: (45532, 79), Test: (11383, 77)

üß™ EXP: S0_demo_te
   groups: ['demo_base', 'demo_derived', 'te']

--- Fold 1/5 ---
   MLP fold AUC: 0.77746
   FT  fold AUC: 0.77706

--- Fold 2/5 ---
   MLP fold AUC: 0.76696
   FT  fold AUC: 0.76664

--- Fold 3/5 ---
   MLP fold AUC: 0.76388
   FT  fold AUC: 0.76215

--- Fold 4/5 ---
   MLP fold AUC: 0.75928
   FT  fold AUC: 0.75786

--- Fold 5/5 ---
   MLP fold AUC: 0.76440
   FT  fold AUC: 0.76223

--------------------------------------------------------------------------------
OOF AUC - MLP : 0.76606  | folds: ['0.77746', '0.76696', '0.76388', '0.75928', '0.76440']
OOF AUC - FT  : 0.76299  | folds: ['0.77706', '0.76664', '0.76215', '0.75786', '0.76223']
OOF AUC - AVG : 0.76600
OOF AUC - BEST WEIGHT: w=0.7 (MLP) + 0.3 (FT) => 0.76641
--------------------------------------------------------------------------------
üíæ saved: outputs_v5_a