# 13_FTTransformer_v9_full 요약

- 모델: FT-Transformer
- 피처: FEATURE_SET=full
- 학습/평가: StratifiedKFold 5, seed=42
- 제출파일: submission_13_FTTransformer_v9_full.csv


In [1]:
import os
import random
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.metrics import roc_auc_score

# ============================================================
# CONFIG
# ============================================================
SEED = 42            # seeds the RNGs, the CV split and the quantile scaler
N_FOLDS = 5          # number of stratified CV folds
BATCH_SIZE = 512     # mini-batch size for all data loaders
EPOCHS = 50          # maximum training epochs per fold
PATIENCE = 8         # epochs without validation-AUC improvement before early stopping
LR = 1e-3            # AdamW learning rate
WEIGHT_DECAY = 1e-4  # AdamW weight decay

# Feature set: "full" / "core" / "core+qa"
FEATURE_SET = "full"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# The log prefixes below were stored as mis-decoded bytes; restored to the intended emoji.
print(f"🖥️ Device: {DEVICE}")
print(f"📊 Config: {N_FOLDS}-Fold, epochs={EPOCHS}, batch={BATCH_SIZE}, feature_set={FEATURE_SET}")

# ============================================================
# utils
# ============================================================
def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    When a CUDA device is available, every CUDA generator is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed once up front so everything below starts from a fixed RNG state.
set_seed(SEED)

def smart_read_csv(primary_path: str, fallback_path: str):
    """Read a CSV from ``primary_path`` if that path exists, otherwise from ``fallback_path``.

    Returns the parsed ``pandas.DataFrame``. Any error raised by ``pd.read_csv``
    (for example when the fallback is missing too) propagates to the caller.
    """
    chosen = primary_path if os.path.exists(primary_path) else fallback_path
    return pd.read_csv(chosen)

# ============================================================
# 1. Load
# ============================================================
# Prefer the repository data directory; fall back to files in the working directory.
train_raw = smart_read_csv("../../data/raw/train.csv", "train.csv")
test_raw  = smart_read_csv("../../data/raw/test_x.csv", "test_x.csv")

print(f"Train: {train_raw.shape}, Test: {test_raw.shape}")

# Binary target: 1 where the raw label `voted` equals 2, otherwise 0.
train_raw["voted_bin"] = (train_raw["voted"] == 2).astype(int)
# The label below was stored as mis-decoded bytes; restored to the intended Korean text
# ("target distribution (non-voter = 1)").
print(f"타겟 분포(미투표=1): {train_raw['voted_bin'].mean():.1%}")

# ============================================================
# 2. Clean
# ============================================================
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning rules (columns missing from ``df`` are skipped):
      * ``education``, ``engnat``, ``hand``, ``married``, ``urban``: 0 becomes NaN
        (0 is the "no response" code).
      * ``tp01`` .. ``tp10``: 0 becomes NaN.
      * ``familysize``: 0 and values above 15 become NaN.
      * ``QaE`` .. ``QtE`` (response times, ms): clipped to the range [100, 60000].

    Args:
        df: raw survey DataFrame.

    Returns:
        A new DataFrame with the same columns, cleaned as described above.
    """
    df = df.copy()

    # Series.mask builds a new column (upcast to float when NaN is introduced) instead
    # of writing NaN in place into an integer column, which pandas deprecates.
    zero_is_missing = ["education", "engnat", "hand", "married", "urban"]
    zero_is_missing += [f"tp{i:02d}" for i in range(1, 11)]
    for col in zero_is_missing:
        if col in df.columns:
            df[col] = df[col].mask(df[col] == 0)

    # familysize: 0 and anything above 15 are both treated as invalid.
    if "familysize" in df.columns:
        size = df["familysize"]
        df["familysize"] = size.mask((size == 0) | (size > 15))

    # QE clip (ms)
    for col in (f"Q{c}E" for c in "abcdefghijklmnopqrst"):
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df

# ============================================================
# 3. Feature Engineering
# ============================================================
def build_features(df):
    """Return a copy of ``df`` extended with engineered feature columns.

    Feature groups, each created only when its source columns exist:
      * demographics: ``age_ord`` and binary indicator flags;
      * QA answers (``QaA``..``QtA``): row statistics and answer-pattern ratios;
      * QE timings (``QaE``..``QtE``): ``log1p`` columns, their statistics,
        fast/slow ratios and the ``is_careless`` proxy;
      * TP items (``tp01``..``tp10``): missing ratio, paired differences, mean/std;
      * word items (``wr_*`` / ``wf_*``): sums and derived vocabulary flags;
      * two simple interaction products.

    ``is_male`` and the seven demographic indicators are always created; they take
    a constant fallback (0 for ``is_male``, NaN for the others) when their source
    column is absent.
    """
    out = df.copy()

    def has(*names):
        # Checked against the live column index so freshly added columns are seen.
        return all(name in out.columns for name in names)

    # ---- demographics ----
    if has("age_group"):
        ordinal = {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
        out["age_ord"] = out["age_group"].map(ordinal)

    if has("gender"):
        out["is_male"] = (out["gender"] == "Male").astype(int)
    else:
        out["is_male"] = 0

    # (new column, source column, rule); comparisons against NaN evaluate to False -> 0.0
    indicator_specs = [
        ("edu_low", "education", lambda s: s <= 2),
        ("edu_high", "education", lambda s: s >= 3),
        ("is_single", "married", lambda s: s == 1),
        ("is_married", "married", lambda s: s == 2),
        ("is_urban", "urban", lambda s: s == 3),
        ("is_rural", "urban", lambda s: s == 1),
        ("is_english_native", "engnat", lambda s: s == 1),
    ]
    for flag, source, rule in indicator_specs:
        out[flag] = rule(out[source]).astype(float) if has(source) else np.nan

    # ---- QA answers ----
    answer_cols = [name for name in (f"Q{c}A" for c in "abcdefghijklmnopqrst") if has(name)]
    if answer_cols:
        answers = out[answer_cols]
        n_answers = len(answer_cols)

        out["qa_mean"] = answers.mean(axis=1)
        out["qa_std"] = answers.std(axis=1)
        out["qa_min"] = answers.min(axis=1)
        out["qa_max"] = answers.max(axis=1)
        out["qa_range"] = out["qa_max"] - out["qa_min"]

        at_extremes = (answers == 1) | (answers == 5)
        out["qa_extreme_ratio"] = at_extremes.sum(axis=1) / n_answers
        out["qa_neutral_ratio"] = (answers == 3).sum(axis=1) / n_answers
        out["qa_negative_ratio"] = (answers <= 2).sum(axis=1) / n_answers
        out["qa_positive_ratio"] = (answers >= 4).sum(axis=1) / n_answers
        out["qa_all_same"] = (answers.std(axis=1) == 0).astype(int)

    # ---- QE timings: log transform + statistics ----
    timing_cols = [name for name in (f"Q{c}E" for c in "abcdefghijklmnopqrst") if has(name)]
    log_cols = []
    for name in timing_cols:
        log_name = f"{name}_log"
        out[log_name] = np.log1p(out[name])
        log_cols.append(log_name)

    if timing_cols:
        logs = out[log_cols]
        timings = out[timing_cols]
        n_timings = len(timing_cols)

        out["qe_log_mean"] = logs.mean(axis=1)
        out["qe_log_std"] = logs.std(axis=1)
        out["qe_log_min"] = logs.min(axis=1)
        out["qe_log_max"] = logs.max(axis=1)
        out["qe_fast_ratio"] = (timings < 500).sum(axis=1) / n_timings
        out["qe_slow_ratio"] = (timings > 10000).sum(axis=1) / n_timings
        out["qe_total_log"] = logs.sum(axis=1)

        # careless proxy: very fast on average, or the same answer to every QA item
        answered_quickly = timings.mean(axis=1) < 500
        if has("qa_all_same"):
            answered_uniformly = out["qa_all_same"] == 1
        else:
            answered_uniformly = False
        out["is_careless"] = (answered_quickly | answered_uniformly).astype(int)

    # ---- TP items ----
    tp_cols = [name for name in (f"tp{i:02d}" for i in range(1, 11)) if has(name)]
    if tp_cols:
        out["tp_missing_ratio"] = out[tp_cols].isna().sum(axis=1) / len(tp_cols)
        # trait = first item minus second item, only when both items are present
        trait_pairs = (
            ("extraversion", "tp01", "tp06"),
            ("agreeableness", "tp07", "tp02"),
            ("conscientiousness", "tp03", "tp08"),
            ("neuroticism", "tp04", "tp09"),
            ("openness", "tp05", "tp10"),
        )
        for trait, minuend, subtrahend in trait_pairs:
            if has(minuend, subtrahend):
                out[trait] = out[minuend] - out[subtrahend]
        out["tp_mean"] = out[tp_cols].mean(axis=1)
        out["tp_std"] = out[tp_cols].std(axis=1)

    # ---- WR / WF word items ----
    wr_cols = [name for name in (f"wr_{i:02d}" for i in range(1, 14)) if has(name)]
    wf_cols = [name for name in (f"wf_{i:02d}" for i in range(1, 4)) if has(name)]
    if wr_cols:
        out["wr_sum"] = out[wr_cols].sum(axis=1)
    if wf_cols:
        out["wf_sum"] = out[wf_cols].sum(axis=1)

    if has("wr_sum", "wf_sum"):
        out["word_credibility"] = out["wr_sum"] - out["wf_sum"]
        out["vocab_low"] = (out["wr_sum"] <= 7).astype(int)
        out["vocab_high"] = (out["wr_sum"] >= 11).astype(int)

    # ---- light interactions ----
    if has("age_ord", "education"):
        out["age_edu"] = out["age_ord"] * out["education"]
    if has("wr_sum", "education"):
        out["vocab_edu"] = out["wr_sum"] * out["education"]

    return out

# ============================================================
# 4. Target Encoding (optional but often helps)
# ============================================================
def target_encode(train_df, val_df, test_df, col, target_col, smoothing=100):
    """Smoothed mean target encoding of ``col``, fitted on ``train_df`` only.

    For each category the encoding is
    ``(count * mean + smoothing * global_mean) / (count + smoothing)``,
    where ``global_mean`` is the mean of ``target_col`` over ``train_df``.
    Values without a fitted encoding (unseen categories, NaN) get ``global_mean``.

    Returns:
        Tuple of three NumPy arrays: encodings for the train, validation and
        test rows, in that order.
    """
    prior = train_df[target_col].mean()

    per_category = train_df.groupby(col)[target_col]
    counts = per_category.count()
    means = per_category.mean()
    lookup = ((counts * means + smoothing * prior) / (counts + smoothing)).to_dict()

    def encode(frame):
        return frame[col].map(lookup).fillna(prior).values

    return encode(train_df), encode(val_df), encode(test_df)

def create_all_target_encodings(train_df, val_df, test_df, target_col="voted_bin"):
    """Build every target-encoded feature for one fold.

    Encodings are fitted on ``train_df`` via ``target_encode`` and applied to all
    three frames. Single-column encodings cover ``age_group``, ``race`` and
    ``religion``. Combination encodings join their source columns into one
    string key (``"<a>_<b>"``) before encoding.

    Side effect: for each combination a helper key column (``age_edu_cat``,
    ``age_married_cat``, ``age_race_cat``, ``age_edu_married_cat``) is added to
    ``train_df``, ``val_df`` and ``test_df`` in place.

    An encoding is skipped when any of its source columns is missing from
    ``train_df``.

    Returns:
        ``{"train": {...}, "val": {...}, "test": {...}}`` mapping each feature
        name (``"<name>_te"``) to a NumPy array aligned with the respective frame.
    """
    # Larger smoothing for finer-grained keys, to damp spikes from sparse groups.
    SM_SINGLE = 100
    SM_2WAY = 200
    SM_3WAY = 500

    te_dict = {"train": {}, "val": {}, "test": {}}

    def store(feature, key_col, smoothing):
        # Fit on train and record the encoded arrays for all three splits.
        tr, va, te = target_encode(train_df, val_df, test_df, key_col, target_col, smoothing=smoothing)
        te_dict["train"][feature] = tr
        te_dict["val"][feature] = va
        te_dict["test"][feature] = te

    for col in ("age_group", "race", "religion"):
        if col in train_df.columns:
            store(f"{col}_te", col, SM_SINGLE)

    # (name stem, source columns, smoothing); key column is "<stem>_cat", feature is "<stem>_te"
    combos = (
        ("age_edu", ["age_group", "education"], SM_2WAY),
        ("age_married", ["age_group", "married"], SM_2WAY),
        ("age_race", ["age_group", "race"], SM_2WAY),
        ("age_edu_married", ["age_group", "education", "married"], SM_3WAY),
    )
    for stem, parts, smoothing in combos:
        if not all(c in train_df.columns for c in parts):
            continue
        key_col = f"{stem}_cat"
        for frame in (train_df, val_df, test_df):
            # String-join the parts; NaN becomes the literal "nan" and forms its own key.
            frame[key_col] = frame[parts].astype(str).agg("_".join, axis=1)
        store(f"{stem}_te", key_col, smoothing)

    return te_dict

# ============================================================
# 5. Feature columns by set
# ============================================================
def get_feature_cols(df):
    """Select the model's input columns according to the global ``FEATURE_SET``.

    Returns:
        ``(num_features, te_features, cat_features)`` where
        * ``num_features`` holds the numeric columns present in ``df``: the core
          engineered set, plus raw QA columns for ``"core+qa"``, plus raw
          QA / QE-log / TP / WR / WF columns for any other setting (``"full"``);
        * ``te_features`` is the fixed list of target-encoding feature names, which
          the caller attaches later;
        * ``cat_features`` are the categorical columns present in ``df``.
    """
    def present(candidates):
        return [name for name in candidates if name in df.columns]

    letters = "abcdefghijklmnopqrst"
    qa_cols = present(f"Q{ch}A" for ch in letters)
    # The *_log columns are created by build_features from the raw QE columns.
    qe_log_cols = [f"{name}_log" for name in present(f"Q{ch}E" for ch in letters)]
    tp_cols = present(f"tp{i:02d}" for i in range(1, 11))
    wr_cols = present(f"wr_{i:02d}" for i in range(1, 14))
    wf_cols = present(f"wf_{i:02d}" for i in range(1, 4))

    # Core features shared by every feature set.
    core_num = present([
        "age_ord", "education", "married", "urban", "engnat", "familysize", "hand", "is_male",
        "edu_low", "edu_high", "is_single", "is_married", "is_urban", "is_rural", "is_english_native",
        "age_edu", "vocab_edu",
        "tp_missing_ratio", "tp_mean", "tp_std",
        "extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness",
        "wr_sum", "wf_sum", "word_credibility", "vocab_low", "vocab_high",
        "qe_log_mean", "qe_log_std", "qe_log_min", "qe_log_max", "qe_fast_ratio", "qe_slow_ratio", "qe_total_log",
        "qa_mean", "qa_std", "qa_min", "qa_max", "qa_range", "qa_extreme_ratio", "qa_neutral_ratio",
        "qa_negative_ratio", "qa_positive_ratio", "qa_all_same",
        "is_careless",
    ])

    # Any value other than "core" / "core+qa" falls through to the full raw set.
    extras_by_set = {"core": [], "core+qa": qa_cols}
    full_extras = qa_cols + qe_log_cols + tp_cols + wr_cols + wf_cols
    num_features = core_num + extras_by_set.get(FEATURE_SET, full_extras)

    # Target-encoding features are attached dynamically by the caller.
    te_features = [
        "age_group_te", "race_te", "religion_te",
        "age_edu_te", "age_married_te", "age_race_te", "age_edu_married_te",
    ]
    cat_features = present(["gender", "race", "religion"])
    return num_features, te_features, cat_features

# ============================================================
# Dataset
# ============================================================
class TabDataset(Dataset):
    """Tabular dataset holding numeric features, categorical codes and optional labels.

    Items are ``(x_num, x_cat)`` when no labels were given and
    ``(x_num, x_cat, y)`` otherwise, where ``y`` has shape ``(1,)``.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is not None:
            # Keep labels as a column so they line up with the model's (B, 1) logits.
            self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
        else:
            self.y = None

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        features = (self.X_num[idx], self.X_cat[idx])
        if self.y is not None:
            return features + (self.y[idx],)
        return features

# ============================================================
# FT-Transformer (minimal, strong baseline)
# ============================================================
class FTTransformer(nn.Module):
    """
    FT-Transformer style tabular model.

    - every numerical feature becomes a token through its own Linear(1 -> d_token)
    - every categorical feature becomes a token through its own embedding table
    - a learnable [CLS] token is prepended and the sequence goes through a
      pre-norm Transformer encoder
    - the prediction head (LayerNorm + Linear) reads the [CLS] position and
      returns one logit per row, shape (B, 1)
    """

    def __init__(self, n_num, cat_dims, d_token=64, n_heads=8, n_layers=3, dropout=0.2):
        super().__init__()
        self.n_num = n_num
        self.n_cat = len(cat_dims)
        self.d = d_token

        # One scalar -> d_token projection per numerical feature.
        self.num_tokenizers = nn.ModuleList(nn.Linear(1, d_token) for _ in range(n_num))

        # One embedding table per categorical feature, with one row more than its cardinality.
        self.cat_embeds = nn.ModuleList(nn.Embedding(cardinality + 1, d_token) for cardinality in cat_dims)

        # Learnable [CLS] token; re-initialised in _init_weights.
        self.cls = nn.Parameter(torch.zeros(1, 1, d_token))

        block = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=4 * d_token,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.head = nn.Sequential(nn.LayerNorm(d_token), nn.Linear(d_token, 1))

        self._init_weights()

    def _init_weights(self):
        """Xavier-init every Linear (zero bias); N(0, 0.02) for embeddings and [CLS]."""
        nn.init.normal_(self.cls, std=0.02)
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, std=0.02)

    def forward(self, x_num, x_cat):
        # x_num: (B, n_num) floats, x_cat: (B, n_cat) integer codes
        batch_size = x_num.size(0)

        feature_tokens = [
            tokenizer(x_num[:, i:i + 1])            # (B, d)
            for i, tokenizer in enumerate(self.num_tokenizers)
        ]
        feature_tokens += [
            embed(x_cat[:, j])                      # (B, d)
            for j, embed in enumerate(self.cat_embeds)
        ]

        sequence = torch.stack(feature_tokens, dim=1)            # (B, n_tokens, d)
        cls_tokens = self.cls.expand(batch_size, -1, -1)         # (B, 1, d)
        sequence = torch.cat([cls_tokens, sequence], dim=1)      # (B, 1 + n_tokens, d)

        encoded = self.encoder(sequence)
        return self.head(encoded[:, 0, :])

# ============================================================
# Train / Predict
# ============================================================
def train_fold(model, train_loader, val_loader, y_train, y_val, device):
    """Train ``model`` on one fold with early stopping on the validation ROC AUC.

    Setup: class-weighted ``BCEWithLogitsLoss`` (``pos_weight`` is the
    negative/positive ratio of ``y_train``), AdamW with the global ``LR`` and
    ``WEIGHT_DECAY``, and ``ReduceLROnPlateau`` halving the learning rate after
    3 epochs without a better validation AUC. Gradients are clipped to norm 1.0.

    Training runs for at most ``EPOCHS`` epochs and stops once the validation AUC
    has not improved by more than 1e-5 for ``PATIENCE`` consecutive epochs. The
    best-scoring weights are loaded back into the model before returning.

    Returns:
        ``(model, best_auc)``.
    """
    model.to(device)

    positive_share = float(np.mean(y_train))
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([(1 - positive_share) / (positive_share + 1e-6)], device=device)
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

    def validation_auc():
        # Score the whole validation loader in eval mode; its labels come from y_val.
        model.eval()
        chunks = []
        with torch.no_grad():
            for X_num, X_cat, _ in val_loader:
                probs = torch.sigmoid(model(X_num.to(device), X_cat.to(device)))
                chunks.append(probs.cpu().numpy())
        return roc_auc_score(y_val, np.concatenate(chunks).ravel())

    best_auc, best_state, no_improve = -1, None, 0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0

        for X_num, X_cat, y in train_loader:
            X_num = X_num.to(device)
            X_cat = X_cat.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            loss = criterion(model(X_num, X_cat), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()

        val_auc = validation_auc()
        scheduler.step(val_auc)

        if val_auc > best_auc + 1e-5:
            # Snapshot the weights on CPU so later epochs cannot overwrite them.
            best_auc = val_auc
            best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1

        if (epoch + 1) % 10 == 0:
            print(f"    Epoch {epoch+1}: loss={total_loss/len(train_loader):.4f}, val_auc={val_auc:.5f}, best={best_auc:.5f}")

        if no_improve >= PATIENCE:
            print(f"    Early stopping at epoch {epoch+1}")
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_auc

def predict(model, loader, device):
    """Return sigmoid probabilities for every row served by ``loader``.

    Batches may be ``(x_num, x_cat)`` or ``(x_num, x_cat, y)``; labels are ignored.
    The model is put in eval mode and run without gradient tracking.

    Returns:
        1-D NumPy array of probabilities in loader order.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            if len(batch) != 2:
                X_num, X_cat, _ = batch
            else:
                X_num, X_cat = batch
            logits = model(X_num.to(device), X_cat.to(device))
            chunks.append(torch.sigmoid(logits).cpu().numpy())
    return np.concatenate(chunks).ravel()

# ============================================================
# Main
# ============================================================
def main():
    """Run stratified K-fold training of the FT-Transformer and write the submission CSV.

    For each fold: build features, fit target encodings, median imputation and
    quantile scaling on the fold's training rows, label-encode the categorical
    columns, train with early stopping on the validation AUC, then store the
    out-of-fold predictions and add the fold's test predictions (divided by
    ``N_FOLDS``) to the running average.

    Returns:
        ROC AUC of the out-of-fold predictions over the whole training set.
    """
    train_clean = clean_data(train_raw)
    test_clean = clean_data(test_raw)

    oof = np.zeros(len(train_clean))
    test_pred = np.zeros(len(test_clean))
    fold_aucs = []

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train_clean, train_clean["voted_bin"])):
        print("\n" + "="*50)
        print(f"📂 Fold {fold+1}/{N_FOLDS}")
        print("="*50)

        tr_df = train_clean.iloc[tr_idx].copy().reset_index(drop=True)
        va_df = train_clean.iloc[va_idx].copy().reset_index(drop=True)
        te_df = test_clean.copy()

        tr_fe = build_features(tr_df)
        va_fe = build_features(va_df)
        te_fe = build_features(te_df)

        num_features, te_features, cat_features = get_feature_cols(tr_fe)

        # Target encodings are fitted on this fold's training rows only.
        te_dict = create_all_target_encodings(tr_fe, va_fe, te_fe, target_col="voted_bin")

        # numeric table
        all_num = num_features + te_features

        X_tr = tr_fe[num_features].copy()
        X_va = va_fe[num_features].copy()
        X_te = te_fe[num_features].copy()

        # Append the target-encoded columns.
        for tename in te_features:
            if tename in te_dict["train"]:
                X_tr[tename] = te_dict["train"][tename]
                X_va[tename] = te_dict["val"][tename]
                X_te[tename] = te_dict["test"][tename]
            else:
                # Source column missing: keep the feature slot, filled with a constant 0.
                X_tr[tename] = 0.0
                X_va[tename] = 0.0
                X_te[tename] = 0.0

        # Impute with the training-fold median (0.0 when a column is entirely NaN).
        for c in all_num:
            med = X_tr[c].median()
            if pd.isna(med):
                med = 0.0
            X_tr[c] = X_tr[c].fillna(med)
            X_va[c] = X_va[c].fillna(med)
            X_te[c] = X_te[c].fillna(med)

        # Scale to a normal distribution; n_quantiles may not exceed the sample count.
        scaler = QuantileTransformer(
            n_quantiles=min(2000, len(X_tr)),
            output_distribution="normal",
            random_state=SEED
        )
        X_tr_s = scaler.fit_transform(X_tr.values)
        X_va_s = scaler.transform(X_va.values)
        X_te_s = scaler.transform(X_te.values)

        # categorical -> label encoding (one consistent vocabulary per fold)
        cat_dims = []
        Xc_tr_list, Xc_va_list, Xc_te_list = [], [], []

        for col in cat_features:
            tr_col = tr_fe[col].fillna("__NAN__").astype(str)
            va_col = va_fe[col].fillna("__NAN__").astype(str)
            te_col = te_fe[col].fillna("__NAN__").astype(str)

            # The encoder is fitted on the union of all three splits, so every value is
            # a known class and no per-row unknown check is needed. "__UNK__" stays in
            # the vocabulary only so the embedding table size is unchanged.
            classes = set(tr_col.unique()) | set(va_col.unique()) | set(te_col.unique())
            classes.add("__UNK__")

            le = LabelEncoder()
            le.fit(sorted(classes))
            cat_dims.append(len(le.classes_))

            Xc_tr_list.append(le.transform(tr_col))
            Xc_va_list.append(le.transform(va_col))
            Xc_te_list.append(le.transform(te_col))

        if len(cat_features) == 0:
            # No categorical features: feed a single all-zero dummy column.
            Xc_tr = np.zeros((len(tr_fe), 1), dtype=np.int64)
            Xc_va = np.zeros((len(va_fe), 1), dtype=np.int64)
            Xc_te = np.zeros((len(te_fe), 1), dtype=np.int64)
            cat_dims = [1]
        else:
            Xc_tr = np.stack(Xc_tr_list, axis=1)
            Xc_va = np.stack(Xc_va_list, axis=1)
            Xc_te = np.stack(Xc_te_list, axis=1)

        y_tr = tr_fe["voted_bin"].values.astype(np.float32)
        y_va = va_fe["voted_bin"].values.astype(np.float32)

        train_ds = TabDataset(X_tr_s, Xc_tr, y_tr)
        val_ds = TabDataset(X_va_s, Xc_va, y_va)
        test_ds = TabDataset(X_te_s, Xc_te)

        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
        val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
        test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

        model = FTTransformer(
            n_num=X_tr_s.shape[1],
            cat_dims=cat_dims,
            d_token=64,
            n_heads=8,
            n_layers=3,
            dropout=0.2
        )
        print(f"    모델 파라미터: {sum(p.numel() for p in model.parameters()):,}")

        model, best_auc = train_fold(model, train_loader, val_loader, y_tr, y_va, DEVICE)
        fold_aucs.append(best_auc)

        # The validation loader is not shuffled, so predictions align with va_idx.
        oof[va_idx] = predict(model, val_loader, DEVICE)
        test_pred += predict(model, test_loader, DEVICE) / N_FOLDS

        print(f"  ✅ Fold {fold+1} AUC: {best_auc:.5f}")

    final_auc = roc_auc_score(train_clean["voted_bin"], oof)

    print("\n" + "="*60)
    print("🎉 최종 결과")
    print("="*60)
    print(f"🏆 OOF AUC: {final_auc:.5f}")
    print(f"📊 Fold AUCs: {[f'{x:.5f}' for x in fold_aucs]}")
    print(f"📊 Mean ± Std: {np.mean(fold_aucs):.5f} ± {np.std(fold_aucs):.5f}")

    out_name = "submission_13_FTTransformer_v9_full.csv"
    sub = pd.DataFrame({
        "index": test_raw["index"] if "index" in test_raw.columns else np.arange(len(test_raw)),
        "voted": test_pred
    })
    sub.to_csv(out_name, index=False)
    print(f"\n💾 저장 완료: {out_name}")
    print(f"   예측 범위: [{test_pred.min():.4f}, {test_pred.max():.4f}]")
    print(f"   예측 평균: {test_pred.mean():.4f}")

    return final_auc

if __name__ == "__main__":
    main()



üñ•Ô∏è Device: cpu
üìä Config: 5-Fold, epochs=50, batch=512, feature_set=full
Train: (45532, 78), Test: (11383, 77)
타겟 분포(미투표=1): 54.7%

üìÇ Fold 1/5
    모델 파라미터: 167,425
    Epoch 10: loss=0.5070, val_auc=0.77511, best=0.77595
    Epoch 20: loss=0.4962, val_auc=0.77596, best=0.77699
    Early stopping at epoch 22
  ✅ Fold 1 AUC: 0.77699

📂 Fold 2/5
    모델 파라미터: 167,425
    Epoch 10: loss=0.5050, val_auc=0.76530, best=0.76604
    Early stopping at epoch 16
  ✅ Fold 2 AUC: 0.76604

📂 Fold 3/5
    모델 파라미터: 167,425
    Epoch 10: loss=0.5004, val_auc=0.75807, best=0.76099
    Early stopping at epoch 10
  ✅ Fold 3 AUC: 0.76099

📂 Fold 4/5
    모델 파라미터: 167,425
    Epoch 10: loss=0.5039, val_auc=0.75886, best=0.75890
    Epoch 20: loss=0.4919, val_auc=0.75963, best=0.76055
    Early stopping at epoch 23
  ✅ Fold 4 AUC: 0.76055

📂 Fold 5/5
    모델 파라미터: 167,425
    Epoch 10: loss=0.5016, val_auc=0.7