In [4]:
# train_vote_v7_optuna.py
# ============================================================
# Vote Prediction v7 - Optuna tuning (FT) -> Full Seed Ensemble
# ============================================================
import os
import json
import random
import argparse
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score

import optuna


# -------------------------
# Paths / Config
# -------------------------
# Root folder holding the raw CSV files; outputs are written to a subfolder of it.
BASE_DIR = "../../data/raw"
OUT_DIR = os.path.join(BASE_DIR, "outputs")
# Import-time side effect: the output folder is created if it does not exist.
os.makedirs(OUT_DIR, exist_ok=True)

TRAIN_PATH = os.path.join(BASE_DIR, "train.csv")
TEST_PATH  = os.path.join(BASE_DIR, "test_x.csv")

# Device string ("cuda" or "cpu"); AMP is only enabled when this starts with "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
USE_AMP = True

# Full training seeds (after tuning)
SEEDS_FULL = [42, 202, 777, 1024, 2048]
N_FOLDS_FULL = 5

# Tuning setting (cheap & fast)
TUNE_SEED = 42
# Tuning evaluates a single fold only.
# NOTE(review): this constant is not referenced in the visible code; `objective`
# hard-codes a 5-way split and takes the first fold - confirm before relying on it.
TUNE_N_FOLDS = 1
TUNE_MAX_EPOCHS = 60       # tuning runs are kept short
TUNE_PATIENCE = 8

# Train setting (final)
EPOCHS = 120
PATIENCE = 12

# Fixed MLP config (only the FT-Transformer is tuned)
MLP_CFG = dict(
    embed_dim=8,
    hidden_dims=[384, 192, 96],
    dropout=0.30,
    lr=1e-3,
    weight_decay=1e-4
)

W_STEP = 0.01  # ensemble weight search step


# -------------------------
# Utils
# -------------------------
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Value passed to every random number generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Seed all CUDA devices only when CUDA is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# -------------------------
# Clean / Feature Eng
# -------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Cleaning rules (each applied only when the column is present):
      * ``education``, ``engnat``, ``hand``, ``married``, ``urban`` and
        ``tp01``..``tp10``: the value 0 is replaced by NaN.
      * ``familysize``: 0 and values above 15 are replaced by NaN.
      * ``QaE``..``QtE``: values are clipped to the range [100, 60000].

    Args:
        df: Raw input frame.

    Returns:
        A new DataFrame with the rules above applied.
    """
    df = df.copy()

    zero_as_missing = ["education", "engnat", "hand", "married", "urban"]
    zero_as_missing += [f"tp{i:02d}" for i in range(1, 11)]
    for col in zero_as_missing:
        if col in df.columns:
            # ``mask`` builds a new (float when needed) column instead of writing
            # NaN into an integer column in place, which pandas deprecates.
            df[col] = df[col].mask(df[col] == 0)

    if "familysize" in df.columns:
        fam = df["familysize"]
        df["familysize"] = fam.mask((fam == 0) | (fam > 15))

    for col in (f"Q{c}E" for c in "abcdefghijklmnopqrst"):
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df


def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with engineered feature columns.

    Requires the columns ``age_group``, ``education``, ``married``, ``urban``,
    ``engnat``, ``gender``, ``QaA``..``QtA``, ``QaE``..``QtE``, ``tp01``..``tp10``,
    ``wr_01``..``wr_13`` and ``wf_01``..``wf_03``. The input frame is left
    untouched.
    """
    out = df.copy()
    letters = "abcdefghijklmnopqrst"

    # ---- demographics -------------------------------------------------
    out["age_ord"] = out["age_group"].map(
        {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
    )
    age = out["age_ord"]
    edu = out["education"]
    marital = out["married"]

    out["is_teenager"] = age.eq(1).astype(int)
    out["is_young"] = age.le(2).astype(int)
    out["is_old"] = age.ge(6).astype(int)

    out["edu_low"] = edu.le(2).astype(float)
    out["edu_high"] = edu.ge(3).astype(float)

    out["is_single"] = marital.eq(1).astype(float)
    out["is_married"] = marital.eq(2).astype(float)

    out["is_urban"] = out["urban"].eq(3).astype(float)
    out["is_english_native"] = out["engnat"].eq(1).astype(float)
    out["is_male"] = out["gender"].eq("Male").astype(int)

    # ---- answer values (Q?A): row-wise summary statistics -------------
    answers = out[[f"Q{ch}A" for ch in letters]]
    out["qa_mean"] = answers.mean(axis=1)
    out["qa_std"] = answers.std(axis=1)
    out["qa_range"] = answers.max(axis=1) - answers.min(axis=1)
    out["qa_extreme_ratio"] = (answers.eq(1) | answers.eq(5)).sum(axis=1) / 20
    out["qa_neutral_ratio"] = answers.eq(3).sum(axis=1) / 20
    # A row standard deviation of zero means all answers are identical.
    out["qa_all_same"] = out["qa_std"].eq(0).astype(int)

    # ---- Q?E columns: log transform + summaries -----------------------
    qe_cols = [f"Q{ch}E" for ch in letters]
    qe_log_cols = [name + "_log" for name in qe_cols]
    for raw_name, log_name in zip(qe_cols, qe_log_cols):
        out[log_name] = np.log1p(out[raw_name])

    timings = out[qe_cols]
    log_timings = out[qe_log_cols]
    out["qe_log_mean"] = log_timings.mean(axis=1)
    out["qe_log_std"] = log_timings.std(axis=1)
    out["qe_fast_ratio"] = timings.lt(500).sum(axis=1) / 20
    out["qe_total_log"] = log_timings.sum(axis=1)
    out["is_careless"] = (
        timings.mean(axis=1).lt(500) | out["qa_all_same"].eq(1)
    ).astype(int)

    # ---- tp01..tp10: missingness, paired differences, mean ------------
    traits = out[[f"tp{i:02d}" for i in range(1, 11)]]
    out["tp_missing_ratio"] = traits.isna().sum(axis=1) / 10
    trait_pairs = (
        ("extraversion", "tp01", "tp06"),
        ("agreeableness", "tp07", "tp02"),
        ("conscientiousness", "tp03", "tp08"),
        ("neuroticism", "tp04", "tp09"),
        ("openness", "tp05", "tp10"),
    )
    for trait, plus_item, minus_item in trait_pairs:
        out[trait] = out[plus_item] - out[minus_item]
    out["tp_mean"] = traits.mean(axis=1)

    # ---- wr_* / wf_* sums ----------------------------------------------
    out["wr_sum"] = out[[f"wr_{i:02d}" for i in range(1, 14)]].sum(axis=1)
    out["wf_sum"] = out[[f"wf_{i:02d}" for i in range(1, 4)]].sum(axis=1)
    out["word_credibility"] = out["wr_sum"] - out["wf_sum"]
    out["vocab_high"] = out["wr_sum"].ge(11).astype(int)

    # ---- interactions -------------------------------------------------
    out["age_edu"] = age * edu
    out["young_low_edu"] = out["is_young"] * out["edu_low"]
    out["young_single"] = out["is_young"] * out["is_single"]
    out["old_married"] = out["is_old"] * out["is_married"]
    out["teenager_low_edu"] = out["is_teenager"] * out["edu_low"]

    return out


def target_encode(train_df, val_df, test_df, col, target_col, smoothing=10):
    """Smoothed target encoding of ``col`` fitted on ``train_df`` only.

    Each category is encoded as
    ``(count * category_mean + smoothing * global_mean) / (count + smoothing)``
    using statistics from ``train_df``. Values without an entry in the fitted
    mapping fall back to the training global mean.

    Returns:
        Tuple ``(train_values, val_values, test_values)`` of NumPy arrays.
    """
    prior = train_df[target_col].mean()
    stats = train_df.groupby(col)[target_col].agg(["mean", "count"])
    numerator = stats["count"] * stats["mean"] + smoothing * prior
    lookup = (numerator / (stats["count"] + smoothing)).to_dict()

    def _encode(frame):
        return frame[col].map(lookup).fillna(prior).values

    return _encode(train_df), _encode(val_df), _encode(test_df)


def create_target_encodings(train_df, val_df, test_df, target_col="voted_bin"):
    """Compute all target-encoded features for one train/val/test split.

    Side effect: adds the helper columns ``age_edu_cat``, ``age_married_cat``,
    ``age_race_cat`` and ``age_edu_married_cat`` to all three input frames.

    Returns:
        ``{"train": {...}, "val": {...}, "test": {...}}`` where each inner dict
        maps ``"<column>_te"`` to the encoded array for that split.
    """
    splits = ("train", "val", "test")
    te_dict = {name: {} for name in splits}

    def _store(column, smoothing):
        # Encodings are fitted on the training frame only (see target_encode).
        encoded = target_encode(train_df, val_df, test_df, column, target_col, smoothing=smoothing)
        for name, values in zip(splits, encoded):
            te_dict[name][f"{column}_te"] = values

    for base_col in ("age_group", "race", "religion"):
        _store(base_col, 10)

    # Build the combined categorical keys in place on every frame.
    for frame in (train_df, val_df, test_df):
        age = frame["age_group"].astype(str)
        edu = frame["education"].astype(str)
        marital = frame["married"].astype(str)
        frame["age_edu_cat"] = age + "_" + edu
        frame["age_married_cat"] = age + "_" + marital
        frame["age_race_cat"] = age + "_" + frame["race"].astype(str)
        frame["age_edu_married_cat"] = age + "_" + edu + "_" + marital

    combined = (
        ("age_edu_cat", 5),
        ("age_married_cat", 5),
        ("age_race_cat", 5),
        ("age_edu_married_cat", 3),
    )
    for combo_col, combo_smoothing in combined:
        _store(combo_col, combo_smoothing)

    return te_dict


# Column-name groups shared by the feature pipeline.
qa_cols = ["Q" + letter + "A" for letter in "abcdefghijklmnopqrst"]
qe_log_cols = ["Q" + letter + "E_log" for letter in "abcdefghijklmnopqrst"]
wr_cols = ["wr_%02d" % idx for idx in range(1, 14)]
wf_cols = ["wf_%02d" % idx for idx in range(1, 4)]
tp_cols = ["tp%02d" % idx for idx in range(1, 11)]

# Numeric model inputs: raw item columns followed by engineered features.
num_features = [
    *qa_cols,
    *qe_log_cols,
    *wr_cols,
    *wf_cols,
    *tp_cols,
    # demographics and flags derived from them
    "age_ord", "education", "married", "urban", "engnat", "familysize", "hand",
    "is_teenager", "is_young", "is_old", "edu_low", "edu_high",
    "is_single", "is_married", "is_urban", "is_english_native", "is_male",
    # Q?A summaries
    "qa_mean", "qa_std", "qa_range", "qa_extreme_ratio", "qa_neutral_ratio", "qa_all_same",
    # Q?E summaries
    "qe_log_mean", "qe_log_std", "qe_fast_ratio", "qe_total_log", "is_careless",
    # tp summaries
    "tp_missing_ratio", "tp_mean",
    "extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness",
    # wr / wf summaries
    "wr_sum", "wf_sum", "word_credibility", "vocab_high",
    # interactions
    "age_edu", "young_low_edu", "young_single", "old_married", "teenager_low_edu",
]

# Target-encoded columns that are added per fold.
te_features = [
    "age_group_te",
    "race_te",
    "religion_te",
    "age_edu_cat_te",
    "age_married_cat_te",
    "age_race_cat_te",
    "age_edu_married_cat_te",
]

# Categorical columns fed to embedding layers.
cat_features = ["gender", "race", "religion"]


# -------------------------
# Dataset
# -------------------------
class TabDataset(Dataset):
    """In-memory dataset of numeric features, categorical codes and optional labels.

    Items are ``(x_num, x_cat)`` when no labels were given, otherwise
    ``(x_num, x_cat, y)`` with ``y`` stored as float32 with a trailing axis
    of size 1 added.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is None:
            self.y = None
        else:
            labels = torch.tensor(y, dtype=torch.float32)
            self.y = labels.unsqueeze(1)

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        features = (self.X_num[idx], self.X_cat[idx])
        if self.y is not None:
            return features + (self.y[idx],)
        return features


# -------------------------
# Models
# -------------------------
class MLP(nn.Module):
    """Feed-forward network over numeric features plus categorical embeddings.

    Args:
        num_features: Number of numeric input columns.
        cat_dims: Number of categories per categorical column; each embedding
            table gets ``dim + 1`` rows.
        embed_dim: Embedding width used for every categorical column.
        hidden_dims: Sizes of the hidden layers (may be empty, in which case
            the output layer is applied directly to the concatenated input).
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, num_features, cat_dims, embed_dim=8, hidden_dims=(256, 128, 64), dropout=0.3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim + 1, embed_dim) for dim in cat_dims])

        # Width of the concatenated [numeric | embeddings] input vector.
        prev_dim = num_features + len(cat_dims) * embed_dim
        layers = []

        for h in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h),
                nn.BatchNorm1d(h),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h

        self.mlp = nn.Sequential(*layers)
        # Size the head from the running width so an empty ``hidden_dims`` works.
        self.out = nn.Linear(prev_dim, 1)
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x_num, x_cat):
        """Return raw logits of shape ``(batch, 1)``.

        ``x_cat[:, i]`` is looked up in the i-th embedding table.
        """
        if len(self.embeddings) > 0:
            cat_emb = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)], dim=1)
            x = torch.cat([x_num, cat_emb], dim=1)
        else:
            # No categorical columns: torch.cat on an empty list would raise.
            x = x_num
        x = self.mlp(x)
        return self.out(x)


class NumericalEmbedding(nn.Module):
    """Per-feature affine tokenizer: each scalar feature becomes a ``d_token`` vector.

    Feature ``j`` is mapped to ``x[..., j] * weight[j] + bias[j]``.
    """

    def __init__(self, num_features, d_token):
        super().__init__()
        shape = (num_features, d_token)
        # Small random scale for the weights, zeros for the bias.
        self.weight = nn.Parameter(0.02 * torch.randn(*shape))
        self.bias = nn.Parameter(torch.zeros(*shape))

    def forward(self, x):
        # Add a trailing axis so the product broadcasts against (features, d_token).
        scaled = x[..., None] * self.weight
        return scaled + self.bias


class FTTransformer(nn.Module):
    """Transformer encoder over per-feature tokens with a learned CLS token.

    Numeric features are tokenized by ``NumericalEmbedding``, categorical
    columns by one ``nn.Embedding`` each (``dim + 1`` rows). A CLS token is
    prepended and its encoded state is passed to the prediction head.
    """

    def __init__(self, num_features, cat_dims, d_token=128, n_layers=4, n_heads=8, dropout=0.15):
        super().__init__()
        # NOTE: modules are created in a fixed order so seeded initialisation
        # stays reproducible.
        self.num_embed = NumericalEmbedding(num_features, d_token)
        self.cat_embeds = nn.ModuleList(
            [nn.Embedding(cardinality + 1, d_token) for cardinality in cat_dims]
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_token) * 0.02)

        layer_kwargs = dict(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=d_token * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        block = nn.TransformerEncoderLayer(**layer_kwargs)
        self.transformer = nn.TransformerEncoder(block, num_layers=n_layers)

        half_width = d_token // 2
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, half_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, 1),
        )

    def forward(self, x_num, x_cat):
        """Return the head output computed from the encoded CLS token."""
        per_column = [embed(x_cat[:, j]) for j, embed in enumerate(self.cat_embeds)]
        feature_tokens = torch.cat(
            [self.num_embed(x_num), torch.stack(per_column, dim=1)], dim=1
        )

        # One CLS token per sample, placed at position 0.
        cls = self.cls_token.expand(feature_tokens.size(0), -1, -1)
        encoded = self.transformer(torch.cat([cls, feature_tokens], dim=1))
        return self.head(encoded[:, 0])


# -------------------------
# Scheduler
# -------------------------
def make_cosine_warmup_scheduler(optimizer, warmup_steps, total_steps, min_lr_ratio=0.05):
    """Build a ``LambdaLR`` with linear warm-up followed by cosine decay.

    The learning-rate multiplier rises linearly from 0 to 1 over
    ``warmup_steps`` and then follows a half cosine from 1 down to
    ``min_lr_ratio`` at ``total_steps``.
    """
    warmup_span = max(1, warmup_steps)
    decay_span = max(1, total_steps - warmup_steps)

    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / warmup_span
        fraction_done = float(step - warmup_steps) / decay_span
        half_cosine = 0.5 * (1.0 + np.cos(np.pi * fraction_done))
        return min_lr_ratio + (1.0 - min_lr_ratio) * half_cosine

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


# -------------------------
# Train / Predict
# -------------------------
def train_model(model, train_loader, val_loader, y_train, y_val, device,
                epochs=120, patience=12, lr=1e-3, weight_decay=1e-4,
                use_amp=True, warmup_ratio=0.08):
    """Train ``model`` with early stopping on validation ROC-AUC.

    Uses a class-weighted ``BCEWithLogitsLoss``, AdamW, a per-batch
    warm-up + cosine learning-rate schedule, gradient-norm clipping at 1.0
    and (on CUDA only) mixed precision.

    Args:
        model: Module called as ``model(x_num, x_cat)`` returning logits.
        train_loader: Yields ``(x_num, x_cat, y)`` batches.
        val_loader: Yields ``(x_num, x_cat, y)`` batches; ``y`` is ignored here.
        y_train: Training labels, used only to derive the positive-class weight.
        y_val: Validation labels in the same order as ``val_loader`` yields rows.
        device: Device string or ``torch.device``.
        epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without improvement.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        use_amp: Request mixed precision; only honoured on CUDA devices.
        warmup_ratio: Fraction of total optimizer steps used for warm-up.

    Returns:
        ``(model, best_auc)`` where ``model`` holds the weights of the best
        epoch (if any epoch completed) and ``best_auc`` is that epoch's AUC.
    """
    model.to(device)

    # Accept both "cuda"/"cpu" strings and torch.device objects, and evaluate
    # the AMP switch once instead of on every batch.
    amp_enabled = bool(use_amp) and str(device).startswith("cuda")

    # Weight positives by the negative/positive ratio of the training labels.
    pos_ratio = float(np.mean(y_train))
    pos_weight = torch.tensor([(1 - pos_ratio) / (pos_ratio + 1e-6)], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # The scheduler is stepped once per batch, so size it in batches.
    total_steps = epochs * max(1, len(train_loader))
    warmup_steps = int(total_steps * warmup_ratio)
    scheduler = make_cosine_warmup_scheduler(optimizer, warmup_steps, total_steps, min_lr_ratio=0.05)

    scaler = torch.cuda.amp.GradScaler(enabled=amp_enabled)
    best_auc, best_state, no_improve = -1.0, None, 0

    for epoch in range(1, epochs + 1):
        model.train()
        for X_num, X_cat, y in train_loader:
            X_num, X_cat, y = X_num.to(device), X_cat.to(device), y.to(device)
            optimizer.zero_grad(set_to_none=True)

            with torch.cuda.amp.autocast(enabled=amp_enabled):
                logits = model(X_num, X_cat)
                loss = criterion(logits, y)

            scaler.scale(loss).backward()
            # Unscale first so clipping operates on the true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        model.eval()
        val_preds = []
        with torch.no_grad():
            for X_num, X_cat, _ in val_loader:
                X_num, X_cat = X_num.to(device), X_cat.to(device)
                with torch.cuda.amp.autocast(enabled=amp_enabled):
                    p = torch.sigmoid(model(X_num, X_cat))
                val_preds.append(p.detach().cpu().numpy())

        val_preds = np.concatenate(val_preds).ravel()
        val_auc = roc_auc_score(y_val, val_preds)

        # Require a minimum gain of 1e-5 to count as an improvement.
        if val_auc > best_auc + 1e-5:
            best_auc = val_auc
            # Snapshot on CPU so the best weights do not occupy device memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1

        if no_improve >= patience:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_auc


@torch.no_grad()
def predict(model, loader, device, use_amp=True):
    """Return sigmoid probabilities for every row yielded by ``loader``.

    Args:
        model: Module called as ``model(x_num, x_cat)`` returning logits.
        loader: Iterable of batches whose first two items are ``x_num`` and
            ``x_cat``; any further items (e.g. labels) are ignored.
        device: Device string or ``torch.device``.
        use_amp: Request mixed precision; only honoured on CUDA devices.

    Returns:
        1-D NumPy array of probabilities in loader order (empty when the
        loader yields no batches).
    """
    model.eval()
    # Accept torch.device objects as well as plain strings.
    amp_enabled = bool(use_amp) and str(device).startswith("cuda")

    preds = []
    for batch in loader:
        X_num, X_cat = batch[0].to(device), batch[1].to(device)
        with torch.cuda.amp.autocast(enabled=amp_enabled):
            p = torch.sigmoid(model(X_num, X_cat))
        preds.append(p.detach().cpu().numpy())

    if not preds:
        # np.concatenate raises on an empty list; return an empty result instead.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(preds).ravel()


# -------------------------
# One fold runner (shared)
# -------------------------
def prepare_fold_data(train_df, val_df, test_df):
    """Turn cleaned train/val/test frames into model-ready arrays for one fold.

    Steps: feature engineering, target encoding, median imputation using
    training medians, standard scaling fitted on the training part, and
    label encoding of the categorical columns.

    Returns:
        ``(X_tr_s, X_va_s, X_te_s, X_cat_tr, X_cat_va, X_cat_te, y_tr, y_va, cat_dims)``
        where the first three are scaled numeric matrices, the next three are
        integer category-code matrices, ``y_*`` are float32 label vectors and
        ``cat_dims`` holds the number of encoder classes per categorical column.
    """
    tr_fe, va_fe, te_fe = (build_features(part) for part in (train_df, val_df, test_df))
    engineered = (tr_fe, va_fe, te_fe)

    te_dict = create_target_encodings(tr_fe, va_fe, te_fe, "voted_bin")

    # Numeric matrices: base features first, then the target-encoded columns.
    X_tr, X_va, X_te = (frame[num_features].copy() for frame in engineered)
    numeric_parts = (("train", X_tr), ("val", X_va), ("test", X_te))
    for split_name, matrix in numeric_parts:
        for te_name in te_features:
            matrix[te_name] = te_dict[split_name][te_name]

    # Impute with the training median (0.0 when the training column is all-NaN).
    for column in num_features + te_features:
        fill_value = X_tr[column].median()
        fill_value = 0.0 if pd.isna(fill_value) else fill_value
        for _, matrix in numeric_parts:
            matrix[column] = matrix[column].fillna(fill_value)

    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr.values)
    X_va_s = scaler.transform(X_va.values)
    X_te_s = scaler.transform(X_te.values)

    def _as_tokens(frame, column):
        # Missing categories become the literal string "NaN".
        return frame[column].fillna("NaN").astype(str)

    cat_dims = []
    codes_tr, codes_va, codes_te = [], [], []
    for column in cat_features:
        tok_tr, tok_va, tok_te = (_as_tokens(frame, column) for frame in engineered)

        # The encoder vocabulary is built from all three splits plus "UNK".
        vocabulary = set(tok_tr) | set(tok_va) | set(tok_te)
        encoder = LabelEncoder()
        encoder.fit(list(vocabulary) + ["UNK"])
        cat_dims.append(len(encoder.classes_))

        codes_tr.append(encoder.transform(tok_tr))
        codes_va.append(encoder.transform(tok_va))
        codes_te.append(encoder.transform(tok_te))

    X_cat_tr = np.stack(codes_tr, axis=1)
    X_cat_va = np.stack(codes_va, axis=1)
    X_cat_te = np.stack(codes_te, axis=1)

    y_tr = tr_fe["voted_bin"].values.astype(np.float32)
    y_va = va_fe["voted_bin"].values.astype(np.float32)

    return X_tr_s, X_va_s, X_te_s, X_cat_tr, X_cat_va, X_cat_te, y_tr, y_va, cat_dims


# -------------------------
# Optuna Objective (FT only)
# -------------------------
def objective(trial):
    """Optuna objective: validation AUC of an FT-Transformer on one fold.

    Reads the module-level ``train_raw`` / ``test_raw`` frames, samples the
    hyper-parameters, trains on the first split of a seeded 5-fold
    stratified split and returns the best validation AUC.
    """
    set_seed(TUNE_SEED)

    # Search space (suggest calls are kept in this fixed order).
    hp = {
        "d_token": trial.suggest_categorical("d_token", [64, 96, 128, 160]),
        "n_layers": trial.suggest_int("n_layers", 2, 6),
        "n_heads": trial.suggest_categorical("n_heads", [4, 8]),
        "dropout": trial.suggest_float("dropout", 0.05, 0.30),
        "lr": trial.suggest_float("lr", 3e-4, 2e-3, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.03, 0.15),
        "batch_size": trial.suggest_categorical("batch_size", [512, 1024]),
    }

    train_clean = clean_data(train_raw)
    test_clean = clean_data(test_raw)

    # Use only the first fold of the stratified split.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=TUNE_SEED)
    tr_idx, va_idx = next(iter(splitter.split(train_clean, train_clean["voted_bin"])))

    fold_train = train_clean.iloc[tr_idx].reset_index(drop=True).copy()
    fold_val = train_clean.iloc[va_idx].reset_index(drop=True).copy()
    fold_test = test_clean.copy()

    prepared = prepare_fold_data(fold_train, fold_val, fold_test)
    X_tr_s, X_va_s, _, X_cat_tr, X_cat_va, _, y_tr, y_va, cat_dims = prepared

    train_loader = DataLoader(
        TabDataset(X_tr_s, X_cat_tr, y_tr),
        batch_size=hp["batch_size"], shuffle=True, drop_last=True, pin_memory=True,
    )
    val_loader = DataLoader(
        TabDataset(X_va_s, X_cat_va, y_va),
        batch_size=hp["batch_size"], shuffle=False, drop_last=False, pin_memory=True,
    )

    ft = FTTransformer(
        num_features=X_tr_s.shape[1],
        cat_dims=cat_dims,
        d_token=hp["d_token"],
        n_layers=hp["n_layers"],
        n_heads=hp["n_heads"],
        dropout=hp["dropout"],
    )

    ft, auc = train_model(
        ft, train_loader, val_loader,
        y_train=y_tr, y_val=y_va,
        device=DEVICE,
        epochs=TUNE_MAX_EPOCHS,
        patience=TUNE_PATIENCE,
        lr=hp["lr"],
        weight_decay=hp["weight_decay"],
        use_amp=USE_AMP,
        warmup_ratio=hp["warmup_ratio"],
    )

    # NOTE(review): the value is reported only after training has finished, so
    # pruning here cannot save compute - confirm this is intended.
    trial.report(auc, step=0)
    if trial.should_prune():
        raise optuna.TrialPruned()

    return auc


# -------------------------
# Full Train (with best params)
# -------------------------
def run_full_training(best_params: dict):
    """Train the MLP + FT-Transformer ensemble over all seeds and write outputs.

    For every seed in ``SEEDS_FULL`` a ``N_FOLDS_FULL``-fold stratified CV is
    run. In each fold a fixed-config MLP (``MLP_CFG``) and an FT-Transformer
    built from ``best_params`` are trained; out-of-fold and fold-averaged test
    predictions are collected. Per seed, the blend weight between the two
    models is grid-searched on the OOF predictions. Finally predictions are
    averaged across seeds, three submission CSVs and a JSON report are written
    to ``OUT_DIR``.

    Reads the module-level ``train_raw`` and ``test_raw`` frames.

    Args:
        best_params: FT-Transformer hyper-parameters. Must contain ``d_token``,
            ``n_layers``, ``n_heads``, ``dropout``, ``lr``, ``weight_decay`` and
            ``warmup_ratio``; ``batch_size`` is optional (defaults to 1024).

    Returns:
        ROC-AUC of the seed-averaged, per-seed-weighted OOF ensemble.
    """
    all_results = []

    for seed in SEEDS_FULL:
        print("\n" + "=" * 80)
        print(f"üå± FULL TRAIN seed={seed}")
        print("=" * 80)
        set_seed(seed)

        train_clean = clean_data(train_raw)
        test_clean  = clean_data(test_raw)

        # Out-of-fold predictions (one slot per training row) and test
        # predictions accumulated as a running fold average.
        oof_mlp = np.zeros(len(train_clean), dtype=np.float32)
        oof_ft  = np.zeros(len(train_clean), dtype=np.float32)
        test_mlp = np.zeros(len(test_clean), dtype=np.float32)
        test_ft  = np.zeros(len(test_clean), dtype=np.float32)

        skf = StratifiedKFold(n_splits=N_FOLDS_FULL, shuffle=True, random_state=seed)

        for fold, (tr_idx, va_idx) in enumerate(skf.split(train_clean, train_clean["voted_bin"]), start=1):
            print(f"\n[Fold {fold}/{N_FOLDS_FULL}]")

            tr_df = train_clean.iloc[tr_idx].reset_index(drop=True).copy()
            va_df = train_clean.iloc[va_idx].reset_index(drop=True).copy()
            te_df = test_clean.copy()

            X_tr_s, X_va_s, X_te_s, X_cat_tr, X_cat_va, X_cat_te, y_tr, y_va, cat_dims = prepare_fold_data(tr_df, va_df, te_df)

            train_ds = TabDataset(X_tr_s, X_cat_tr, y_tr)
            val_ds   = TabDataset(X_va_s, X_cat_va, y_va)
            test_ds  = TabDataset(X_te_s, X_cat_te, None)

            # Both models share the tuned batch size and the same loaders.
            batch_size = int(best_params.get("batch_size", 1024))
            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True, pin_memory=True)
            val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=False, pin_memory=True)
            test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False, drop_last=False, pin_memory=True)

            # ---- MLP (fixed)
            mlp = MLP(
                num_features=X_tr_s.shape[1],
                cat_dims=cat_dims,
                embed_dim=MLP_CFG["embed_dim"],
                hidden_dims=MLP_CFG["hidden_dims"],
                dropout=MLP_CFG["dropout"]
            )
            mlp, _ = train_model(
                mlp, train_loader, val_loader,
                y_train=y_tr, y_val=y_va,
                device=DEVICE,
                epochs=EPOCHS,
                patience=PATIENCE,
                lr=MLP_CFG["lr"],
                weight_decay=MLP_CFG["weight_decay"],
                use_amp=USE_AMP,
                warmup_ratio=0.08
            )
            oof_mlp[va_idx] = predict(mlp, val_loader, DEVICE, use_amp=USE_AMP)
            test_mlp += predict(mlp, test_loader, DEVICE, use_amp=USE_AMP) / N_FOLDS_FULL

            # ---- FT (best params)
            ft = FTTransformer(
                num_features=X_tr_s.shape[1],
                cat_dims=cat_dims,
                d_token=int(best_params["d_token"]),
                n_layers=int(best_params["n_layers"]),
                n_heads=int(best_params["n_heads"]),
                dropout=float(best_params["dropout"]),
            )
            ft, _ = train_model(
                ft, train_loader, val_loader,
                y_train=y_tr, y_val=y_va,
                device=DEVICE,
                epochs=EPOCHS,
                patience=PATIENCE,
                lr=float(best_params["lr"]),
                weight_decay=float(best_params["weight_decay"]),
                use_amp=USE_AMP,
                warmup_ratio=float(best_params["warmup_ratio"])
            )
            oof_ft[va_idx] = predict(ft, val_loader, DEVICE, use_amp=USE_AMP)
            test_ft += predict(ft, test_loader, DEVICE, use_amp=USE_AMP) / N_FOLDS_FULL

            # Drop per-fold objects before the next fold to keep memory down.
            del mlp, ft, train_ds, val_ds, test_ds, train_loader, val_loader, test_loader
            if DEVICE.startswith("cuda"):
                torch.cuda.empty_cache()

        # weight search per seed
        # w is the MLP weight; (1 - w) goes to the FT-Transformer. The grid
        # runs from 0 to 1 inclusive in steps of W_STEP.
        y_all = train_clean["voted_bin"].values.astype(np.float32)
        best_w, best_auc = 0.5, -1.0
        for w in np.arange(0.0, 1.0 + 1e-9, W_STEP):
            oof_w = w * oof_mlp + (1 - w) * oof_ft
            auc = roc_auc_score(y_all, oof_w)
            if auc > best_auc:
                best_auc = auc
                best_w = float(w)

        test_ens = best_w * test_mlp + (1 - best_w) * test_ft
        all_results.append(dict(seed=seed, best_w=best_w, best_auc=best_auc,
                                oof_mlp=oof_mlp, oof_ft=oof_ft,
                                test_mlp=test_mlp, test_ft=test_ft,
                                test_ens=test_ens))

        print(f"seed={seed} | best_auc={best_auc:.5f} | best_w={best_w:.2f}")

    # final seed-avg
    # Each seed's OOF blend uses that seed's own best weight before averaging.
    y_all = train_raw["voted_bin"].values.astype(np.float32)
    oof_ens_all = np.mean([r["best_w"] * r["oof_mlp"] + (1 - r["best_w"]) * r["oof_ft"] for r in all_results], axis=0)
    final_oof_auc = roc_auc_score(y_all, oof_ens_all)

    test_ens_all = np.mean([r["test_ens"] for r in all_results], axis=0)
    test_mlp_all = np.mean([r["test_mlp"] for r in all_results], axis=0)
    test_ft_all  = np.mean([r["test_ft"] for r in all_results], axis=0)

    print("\n" + "#" * 80)
    print(f"üèÅ FINAL OOF AUC: {final_oof_auc:.5f}")
    print("#" * 80)

    # Submission id: the test file's "index" column when present, otherwise
    # a 0-based row number.
    sub = pd.DataFrame({
        "index": test_raw["index"] if "index" in test_raw.columns else np.arange(len(test_raw)),
        "voted": test_ens_all
    })
    sub.to_csv(os.path.join(OUT_DIR, "submission_colab_v7_optuna_ensemble.csv"), index=False)

    sub_mlp = sub.copy()
    sub_mlp["voted"] = test_mlp_all
    sub_mlp.to_csv(os.path.join(OUT_DIR, "submission_colab_v7_optuna_mlp.csv"), index=False)

    sub_ft = sub.copy()
    sub_ft["voted"] = test_ft_all
    sub_ft.to_csv(os.path.join(OUT_DIR, "submission_colab_v7_optuna_ft.csv"), index=False)

    # save final report
    report = {
        "final_oof_auc": float(final_oof_auc),
        "seeds": SEEDS_FULL,
        "best_params": best_params,
        "seed_results": [{"seed": r["seed"], "best_auc": float(r["best_auc"]), "best_w": float(r["best_w"])} for r in all_results]
    }
    with open(os.path.join(OUT_DIR, "report_colab_v7_optuna.json"), "w") as f:
        json.dump(report, f, indent=2)

    print("üíæ Saved to:", OUT_DIR)
    print(" - submission_colab_v7_optuna_ensemble.csv")
    print(" - submission_colab_v7_optuna_mlp.csv")
    print(" - submission_colab_v7_optuna_ft.csv")
    print(" - report_colab_v7_optuna.json")

    return final_oof_auc


# -------------------------
# Entry
# -------------------------
if __name__ == "__main__":
    # Script entry point: "--mode tune" runs the Optuna search and stores the
    # best parameters as JSON; "--mode train" loads that JSON and runs the
    # full seed/fold ensemble training.
    # argparse can get tangled up in Colab/Jupyter, so parse defensively.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("--mode", type=str, choices=["tune", "train"], default="train")  # default mode is "train"
    parser.add_argument("--trials", type=int, default=60)
    args, _ = parser.parse_known_args()  # ignore unknown arguments

    print(f"üñ•Ô∏è Device: {DEVICE}")
    # These frames are module-level globals read by objective() and
    # run_full_training().
    train_raw = pd.read_csv(TRAIN_PATH)
    test_raw  = pd.read_csv(TEST_PATH)
    # Binary target: 1 where the raw "voted" column equals 2, else 0.
    train_raw["voted_bin"] = (train_raw["voted"] == 2).astype(int)

    # The tuned parameters are stored next to the raw data (BASE_DIR), not in OUT_DIR.
    best_path = os.path.join(BASE_DIR, "best_params_ft.json")

    if args.mode == "tune":
        print(f"üîé Optuna tuning... trials={args.trials}")
        sampler = optuna.samplers.TPESampler(seed=TUNE_SEED)
        pruner = optuna.pruners.MedianPruner(n_startup_trials=10)
        study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)
        study.optimize(objective, n_trials=args.trials)

        best_params = study.best_params
        with open(best_path, "w") as f:
            json.dump(best_params, f, indent=2)
        print("‚úÖ Best params saved:", best_path)
        print("Best AUC:", study.best_value)
        print("Best params:", best_params)

    elif args.mode == "train":
        # Training requires a previous tuning run to have written best_path.
        if not os.path.exists(best_path):
            raise FileNotFoundError(f"Run tune first. Missing: {best_path}")
        with open(best_path, "r") as f:
            best_params = json.load(f)
        print("‚úÖ Loaded best params:", best_params)
        run_full_training(best_params)

üñ•Ô∏è Device: cpu
‚úÖ Loaded best params: {'d_token': 64, 'n_layers': 3, 'n_heads': 8, 'dropout': 0.09030532181350111, 'lr': 0.0017502789790582254, 'weight_decay': 0.00026568139241144923, 'warmup_ratio': 0.10600845078125082, 'batch_size': 512}

üå± FULL TRAIN seed=42

[Fold 1/5]

[Fold 2/5]

[Fold 3/5]

[Fold 4/5]

[Fold 5/5]
seed=42 | best_auc=0.77133 | best_w=0.64

üå± FULL TRAIN seed=202

[Fold 1/5]

[Fold 2/5]

[Fold 3/5]

[Fold 4/5]

[Fold 5/5]
seed=202 | best_auc=0.77255 | best_w=0.57

üå± FULL TRAIN seed=777

[Fold 1/5]

[Fold 2/5]

[Fold 3/5]

[Fold 4/5]

[Fold 5/5]
seed=777 | best_auc=0.77202 | best_w=0.54

üå± FULL TRAIN seed=1024

[Fold 1/5]

[Fold 2/5]

[Fold 3/5]

[Fold 4/5]

[Fold 5/5]
seed=1024 | best_auc=0.77196 | best_w=0.55

üå± FULL TRAIN seed=2048

[Fold 1/5]

[Fold 2/5]

[Fold 3/5]

[Fold 4/5]

[Fold 5/5]
seed=2048 | best_auc=0.77227 | best_w=0.59

################################################################################
üèÅ FINAL OOF AUC: 0.77427
##