# 12_FTTransformer_v8_rawcenter 요약

- 모델: FT-Transformer
- 피처: 기존 FE + raw center 버전(run_name=ft_rawcenter)
- 학습/평가: StratifiedKFold 5, seed=42
- 제출파일: submission_12_FTTransformer_v8_rawcenter.csv


In [1]:
import os
import random
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import roc_auc_score

# ============================================================
# CONFIG
# ============================================================
# Random seed shared by set_seed(), StratifiedKFold and QuantileTransformer.
SEED = 42
# Number of StratifiedKFold splits used in main().
N_FOLDS = 5
# Mini-batch size for the train/validation/test DataLoaders.
BATCH_SIZE = 512
# Upper bound on training epochs per fold (early stopping may end sooner).
EPOCHS = 60
# Consecutive non-improving epochs (validation AUC) tolerated before stopping.
PATIENCE = 10
# AdamW learning rate and weight decay used in train_one_fold().
LR = 1e-3
WEIGHT_DECAY = 1e-4
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# FT-Transformer params
# Token width, attention heads, encoder depth, feed-forward width multiplier
# (dim_feedforward = D_MODEL * FFN_MULT) and dropout probability.
D_MODEL = 64
N_HEADS = 8
N_LAYERS = 3
FFN_MULT = 4
DROPOUT = 0.15

# Banner printed once when the cell runs.
print(f"üñ•Ô∏è Device: {DEVICE}")
print(f"üìä Config: {N_FOLDS}-Fold | batch={BATCH_SIZE} | epochs={EPOCHS}")

def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to ``SEED``.

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once at import/cell-execution time so module-level work is reproducible.
set_seed(SEED)

# ============================================================
# 1) Load
# ============================================================
# Raw train/test tables, read from paths relative to the working directory.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in ("../../data/raw/train.csv", "../../data/raw/test_x.csv")
)

# Binary target: 1 where `voted` equals 2, otherwise 0.
train_raw["voted_bin"] = train_raw["voted"].eq(2).astype(int)
y_full = train_raw["voted_bin"].to_numpy().astype(np.float32)

print(f"Train: {train_raw.shape} | Test: {test_raw.shape}")
print(f"Target positive rate (voted==2): {train_raw['voted_bin'].mean():.2%}")

# ============================================================
# 2) Cleaning (outlier / non-response handling)
# ============================================================
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps (each column is processed only if it is present):
      * ``education``, ``engnat``, ``hand``, ``married``, ``urban`` and
        ``tp01``..``tp10``: a value of 0 is treated as "no response" and
        replaced with NaN.
      * ``familysize``: 0 and values above 15 are replaced with NaN.
      * ``QaE``..``QtE`` (response times): clipped to ``[100, 60000]`` to tame
        the heavy tail.

    Args:
        df: Raw survey frame.

    Returns:
        A new DataFrame with the same columns in the same order.
    """
    df = df.copy()

    # 0 is a "no response" code in these columns, not a real value.
    zero_is_missing = ["education", "engnat", "hand", "married", "urban"]
    zero_is_missing += [f"tp{i:02d}" for i in range(1, 11)]
    for col in zero_is_missing:
        if col in df.columns:
            # Series.mask returns a new (upcast) column. Writing NaN into an
            # integer column through .loc relies on silent upcasting, which
            # pandas has deprecated for setitem-style assignment.
            df[col] = df[col].mask(df[col] == 0)

    # familysize: 0 and implausibly large households are treated as missing.
    if "familysize" in df.columns:
        size = df["familysize"]
        df["familysize"] = size.mask((size == 0) | (size > 15))

    # Q_E response times are heavy-tailed -> clip to a fixed range.
    for col in (f"Q{c}E" for c in "abcdefghijklmnopqrst"):
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df

# ============================================================
# 3) Feature build (raw-centred + minimal derived features)
#    - keep the raw QA/QE/TP/WR/WF columns
#    - QE gets a log-transformed copy (the model uses the log, not the raw time)
#    - blocks whose missingness is itself a signal: add a missing ratio
# ============================================================
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with log response times and summary features.

    Added columns, in order: ``Q?E_log`` for each of the 20 ``Q?E`` columns,
    then ``qa_mean``, ``qa_std``, ``qa_all_same``, ``qe_log_mean``,
    ``qe_log_std``, ``qe_fast_ratio``, ``qe_slow_ratio``,
    ``tp_missing_ratio``, ``wr_sum``, ``wf_sum``, ``word_credibility`` and
    ``is_careless``. The input frame is not modified.
    """
    out = df.copy()

    # ---- Categorical base: age_group, gender, race and religion are
    # categorical to begin with; education/married/urban/engnat/hand have few
    # distinct values, so they are also treated as categories (for embeddings).
    # Nothing is derived from them here; numeric derived features live in
    # their own columns below.

    letters = "abcdefghijklmnopqrst"
    # ---- Numeric block: Q_A (1~5 ordinal), the 20 raw columns as they are
    qa_cols = ["Q" + c + "A" for c in letters]
    # ---- Numeric block: Q_E -> log
    qe_cols = ["Q" + c + "E" for c in letters]
    qe_log_cols = [name + "_log" for name in qe_cols]
    # ---- Numeric block: the 10 raw TP columns (NaN allowed)
    tp_cols = ["tp%02d" % i for i in range(1, 11)]
    # ---- Numeric block: raw WR/WF columns
    wr_cols = ["wr_%02d" % i for i in range(1, 14)]
    wf_cols = ["wf_%02d" % i for i in range(1, 4)]

    for raw_name, log_name in zip(qe_cols, qe_log_cols):
        out[log_name] = np.log1p(out[raw_name])

    # ---- Minimal derived features: things the model struggles to learn on
    # its own, plus missingness / response-effort signals injected explicitly.
    answers = out[qa_cols]
    answer_std = answers.std(axis=1)
    out["qa_mean"] = answers.mean(axis=1)
    out["qa_std"] = answer_std
    out["qa_all_same"] = answer_std.eq(0).astype(int)

    log_times = out[qe_log_cols]
    raw_times = out[qe_cols]
    out["qe_log_mean"] = log_times.mean(axis=1)
    out["qe_log_std"] = log_times.std(axis=1)
    out["qe_fast_ratio"] = raw_times.lt(500).sum(axis=1) / 20
    out["qe_slow_ratio"] = raw_times.gt(10000).sum(axis=1) / 20

    out["tp_missing_ratio"] = out[tp_cols].isna().sum(axis=1) / 10

    out["wr_sum"] = out[wr_cols].sum(axis=1)
    out["wf_sum"] = out[wf_cols].sum(axis=1)
    out["word_credibility"] = out["wr_sum"] - out["wf_sum"]

    # Careless-response flag (a very strong signal): mean response time under
    # 500, or every Q_A answer identical.
    too_fast = raw_times.mean(axis=1).lt(500)
    flat_answers = out["qa_all_same"].eq(1)
    out["is_careless"] = (too_fast | flat_answers).astype(int)

    return out

# ============================================================
# 4) Column definitions
# ============================================================
# Raw survey blocks: 20 answers (QaA..QtA), their log response times,
# 10 TP items, 13 WR items and 3 WF items.
QA_COLS = list(map("Q{}A".format, "abcdefghijklmnopqrst"))
QE_LOG_COLS = list(map("Q{}E_log".format, "abcdefghijklmnopqrst"))
TP_COLS = ["tp%02d" % i for i in range(1, 11)]
WR_COLS = ["wr_%02d" % i for i in range(1, 14)]
WF_COLS = ["wf_%02d" % i for i in range(1, 4)]

# Every numeric model input: the raw blocks followed by familysize and the
# summary features added by build_features().
NUM_COLS = [
    *QA_COLS,
    *QE_LOG_COLS,
    *TP_COLS,
    *WR_COLS,
    *WF_COLS,
    "familysize",
    "qa_mean",
    "qa_std",
    "qa_all_same",
    "qe_log_mean",
    "qe_log_std",
    "qe_fast_ratio",
    "qe_slow_ratio",
    "tp_missing_ratio",
    "wr_sum",
    "wf_sum",
    "word_credibility",
    "is_careless",
]

# Columns fed to the model as categorical embedding tokens.
CAT_COLS = [
    "age_group",
    "gender",
    "race",
    "religion",
    "education",
    "married",
    "urban",
    "engnat",
    "hand",
]

# ============================================================
# 5) Simple label encoding with UNK/NAN handling (per fold)
# ============================================================
def _cat_key(value) -> str:
    """Return the canonical string key for one raw categorical value.

    Missing values become ``"__NAN__"``. Integer-valued floats are rendered
    without the trailing ``.0`` so that the same category gets the same key
    whether its column is stored as int (no missing values) or as float
    (after NaN was injected). Without this, ``1`` and ``1.0`` stringify
    differently and a frame with a different dtype maps entirely to UNK.
    """
    if pd.isna(value):
        return "__NAN__"
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        return str(int(value))
    return str(value)


def fit_cat_maps(train_df: pd.DataFrame, cat_cols):
    """Learn a value -> integer code mapping for each categorical column.

    Args:
        train_df: Frame the vocabularies are learned from.
        cat_cols: Names of the categorical columns.

    Returns:
        ``(maps, sizes)`` where ``maps[col]`` maps each canonical key to a
        code in ``1..K`` (first-appearance order; missing values get their
        own ``"__NAN__"`` key) and ``sizes`` lists ``K + 1`` per column, the
        extra slot being code 0 reserved for unseen values.
    """
    maps = {}
    sizes = []
    for c in cat_cols:
        keys = train_df[c].map(_cat_key).unique().tolist()
        # reserve: 0=__UNK__, 1.. = known
        m = {key: i + 1 for i, key in enumerate(keys)}
        maps[c] = m
        sizes.append(len(m) + 1)  # +1 for __UNK__ at 0
    return maps, sizes


def transform_cats(df: pd.DataFrame, cat_cols, maps):
    """Encode ``cat_cols`` of ``df`` with mappings from ``fit_cat_maps``.

    Values absent from a column's mapping are encoded as 0 (UNK).

    Returns:
        An ``int64`` array of shape ``(len(df), len(cat_cols))``; with no
        categorical columns the result has shape ``(len(df), 0)``.
    """
    if len(cat_cols) == 0:
        # np.stack rejects an empty list; return an explicit zero-width block.
        return np.zeros((len(df), 0), dtype=np.int64)

    arrs = []
    for c in cat_cols:
        m = maps[c]
        codes = df[c].map(lambda v, m=m: m.get(_cat_key(v), 0))
        arrs.append(codes.astype(np.int64).to_numpy())
    return np.stack(arrs, axis=1)

# ============================================================
# 6) Dataset
# ============================================================
class TabDataset(Dataset):
    """Tabular dataset holding numeric features, categorical codes and targets.

    Args:
        X_num: Numeric features; stored as a ``float32`` tensor.
        X_cat: Categorical codes; stored as a ``long`` tensor.
        y: Optional targets; stored as ``float32`` with a trailing unit
            dimension added. When omitted, items carry no target.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is None:
            self.y = None
        else:
            targets = torch.tensor(y, dtype=torch.float32)
            self.y = targets.unsqueeze(1)

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        # (numeric, categorical) for unlabeled data, plus the target otherwise.
        sample = (self.X_num[idx], self.X_cat[idx])
        if self.y is not None:
            sample += (self.y[idx],)
        return sample

# ============================================================
# 7) FT-Transformer (feature-token approach)
#   - each numeric feature -> a d_model token via (x * W + b)
#   - each categorical feature -> an embedding token
#   - a [CLS] token is prepended, then a Transformer encoder is applied
# ============================================================
class FTTransformer(nn.Module):
    """Feature-token Transformer that outputs one logit per row.

    Args:
        n_num: Number of numeric features.
        cat_sizes: Vocabulary size of each categorical feature (may be empty).
        d_model: Token width.
        n_heads: Attention heads per encoder layer.
        n_layers: Number of encoder layers.
        ffn_mult: Feed-forward width as a multiple of ``d_model``.
        dropout: Dropout probability in the encoder and the head.
    """

    def __init__(self, n_num, cat_sizes, d_model=64, n_heads=8, n_layers=3, ffn_mult=4, dropout=0.15):
        super().__init__()
        self.n_num, self.n_cat, self.d_model = n_num, len(cat_sizes), d_model

        # Per-feature affine tokenizer for numeric inputs.
        self.num_weight = nn.Parameter(0.02 * torch.randn(n_num, d_model))
        self.num_bias = nn.Parameter(torch.zeros(n_num, d_model))

        # One embedding table (hence one token) per categorical feature.
        self.cat_embeds = nn.ModuleList(nn.Embedding(size, d_model) for size in cat_sizes)

        # Learnable [CLS] token; re-initialised in _init().
        self.cls = nn.Parameter(torch.zeros(1, 1, d_model))

        # Pre-norm GELU encoder stack operating on [batch, tokens, d_model].
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * ffn_mult,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

        # Prediction head applied to the encoded [CLS] token.
        head_layers = [
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        ]
        self.head = nn.Sequential(*head_layers)

        self._init()

    def _init(self):
        """Re-draw the CLS token and embedding weights from N(0, 0.02^2)."""
        for weight in (self.cls, *(emb.weight for emb in self.cat_embeds)):
            nn.init.normal_(weight, std=0.02)

    def forward(self, x_num, x_cat):
        """Return logits of shape [B, 1] for numeric ``x_num`` [B, n_num] and
        categorical codes ``x_cat`` (column ``i`` feeds embedding table ``i``).
        """
        # Token order along dim 1: [CLS], numeric features, categorical features.
        pieces = [
            self.cls.expand(x_num.size(0), -1, -1),         # [B, 1, d_model]
            x_num.unsqueeze(-1) * self.num_weight + self.num_bias,  # [B, n_num, d_model]
        ]
        if len(self.cat_embeds) > 0:
            embedded = [emb(x_cat[:, j]) for j, emb in enumerate(self.cat_embeds)]
            pieces.append(torch.stack(embedded, dim=1))     # [B, n_cat, d_model]

        encoded = self.encoder(torch.cat(pieces, dim=1))
        # Only the [CLS] position feeds the head.
        return self.head(encoded[:, 0])

# ============================================================
# 8) Train / Predict
# ============================================================
def train_one_fold(model, train_loader, val_loader, y_train, y_val, device):
    """Train ``model`` on one fold with early stopping on validation AUC.

    Uses ``BCEWithLogitsLoss`` (with a ``pos_weight`` derived from
    ``y_train``), AdamW (``LR``, ``WEIGHT_DECAY``) and a ``ReduceLROnPlateau``
    scheduler that halves the learning rate after 3 epochs without AUC
    improvement. Runs for at most ``EPOCHS`` epochs and stops once
    ``PATIENCE`` consecutive epochs fail to improve the best AUC by more
    than 1e-5.

    Args:
        model: Network called as ``model(x_num, x_cat)`` and returning logits.
        train_loader: Yields ``(x_num, x_cat, y)`` training batches.
        val_loader: Yields ``(x_num, x_cat, y)`` validation batches; its
            targets are ignored here.
        y_train: Training labels, used only to compute ``pos_weight``.
        y_val: Validation labels passed to ``roc_auc_score``; they must be
            in the same order as the rows produced by ``val_loader``.
        device: Device the model and batches are moved to.

    Returns:
        ``(model, best_auc)`` with the model's weights restored to the epoch
        that achieved ``best_auc``.
    """
    model.to(device)

    # pos_weight is computed from the training fold only
    pos_ratio = float(np.mean(y_train))
    # negatives / positives; the 1e-6 guards against division by zero
    pos_weight = torch.tensor([(1 - pos_ratio) / (pos_ratio + 1e-6)], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    # mode="max" because the monitored quantity (validation AUC) should increase
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

    best_auc = -1.0
    best_state = None
    no_improve = 0

    for epoch in range(1, EPOCHS + 1):
        model.train()
        tr_losses = []

        for Xn, Xc, yb in train_loader:
            Xn, Xc, yb = Xn.to(device), Xc.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(Xn, Xc)
            loss = criterion(logits, yb)
            loss.backward()
            # cap the global gradient norm at 1.0 before the optimizer step
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            tr_losses.append(loss.item())

        # val
        model.eval()
        preds = []
        with torch.no_grad():
            for Xn, Xc, _ in val_loader:
                Xn, Xc = Xn.to(device), Xc.to(device)
                p = torch.sigmoid(model(Xn, Xc)).detach().cpu().numpy().ravel()
                preds.append(p)
        preds = np.concatenate(preds)
        val_auc = roc_auc_score(y_val, preds)
        scheduler.step(val_auc)

        # keep a CPU copy of the weights whenever AUC improves by more than 1e-5
        if val_auc > best_auc + 1e-5:
            best_auc = val_auc
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1

        # progress line every 10th epoch
        if epoch % 10 == 0:
            print(f"    Epoch {epoch:03d} | loss={np.mean(tr_losses):.4f} | val_auc={val_auc:.5f} | best={best_auc:.5f}")

        if no_improve >= PATIENCE:
            print(f"    Early stopping at epoch {epoch}")
            break

    # restore the best-scoring weights before returning
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_auc

def predict(model, loader, device):
    """Return sigmoid probabilities for every row yielded by ``loader``.

    Args:
        model: Network called as ``model(x_num, x_cat)``; switched to eval mode.
        loader: Yields ``(x_num, x_cat)`` or ``(x_num, x_cat, y)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A 1-D NumPy array of probabilities in loader order.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            # Labeled batches carry a trailing target that is not needed here.
            features = batch if len(batch) == 2 else batch[:-1]
            Xn, Xc = features
            probs = torch.sigmoid(model(Xn.to(device), Xc.to(device)))
            chunks.append(probs.detach().cpu().numpy().ravel())
    return np.concatenate(chunks)

# ============================================================
# 9) Main CV Loop
# ============================================================
def main(run_name="ft_base"):
    """Run the full cross-validated FT-Transformer pipeline and write a submission.

    Cleans and feature-engineers the module-level ``train_raw`` / ``test_raw``
    frames, then for each of ``N_FOLDS`` stratified folds: imputes and
    quantile-transforms the numeric columns, label-encodes the categorical
    columns, trains a fresh ``FTTransformer`` and collects out-of-fold and
    test predictions. Test predictions are averaged over folds and written to
    ``submission_12_FTTransformer_v8_rawcenter.csv`` in the working directory.

    Args:
        run_name: Accepted for labelling a run, but not used anywhere in this
            function; the output file name is hard-coded.

    Returns:
        The ROC AUC of the out-of-fold predictions against ``y_full``.
    """
    set_seed(SEED)

    train_clean = clean_data(train_raw)
    test_clean  = clean_data(test_raw)

    train_fe = build_features(train_clean)
    test_fe  = build_features(test_clean)

    # out-of-fold predictions for train rows and fold-averaged test predictions
    oof = np.zeros(len(train_fe), dtype=np.float32)
    test_pred = np.zeros(len(test_fe), dtype=np.float32)
    fold_aucs = []

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train_fe, y_full), 1):
        print("\n" + "="*52)
        print(f"üìÇ Fold {fold}/{N_FOLDS}")
        print("="*52)

        tr_df = train_fe.iloc[tr_idx].copy().reset_index(drop=True)
        va_df = train_fe.iloc[va_idx].copy().reset_index(drop=True)
        te_df = test_fe.copy().reset_index(drop=True)

        y_tr = y_full[tr_idx]
        y_va = y_full[va_idx]

        # --------- numeric: NaN -> train median, then QuantileTransformer
        X_tr_num = tr_df[NUM_COLS].copy()
        X_va_num = va_df[NUM_COLS].copy()
        X_te_num = te_df[NUM_COLS].copy()

        # medians come from the training fold only and are reused for val/test
        for c in NUM_COLS:
            med = X_tr_num[c].median()
            if pd.isna(med):
                # column is entirely NaN in this training fold
                med = 0.0
            X_tr_num[c] = X_tr_num[c].fillna(med)
            X_va_num[c] = X_va_num[c].fillna(med)
            X_te_num[c] = X_te_num[c].fillna(med)

        # fitted on the training fold only, then applied to val/test
        scaler = QuantileTransformer(
            n_quantiles=2000,
            output_distribution="normal",
            random_state=SEED
        )
        X_tr_num = scaler.fit_transform(X_tr_num.values)
        X_va_num = scaler.transform(X_va_num.values)
        X_te_num = scaler.transform(X_te_num.values)

        # --------- categorical: fit on train fold only (UNK=0)
        cat_maps, cat_sizes = fit_cat_maps(tr_df, CAT_COLS)
        X_tr_cat = transform_cats(tr_df, CAT_COLS, cat_maps)
        X_va_cat = transform_cats(va_df, CAT_COLS, cat_maps)
        X_te_cat = transform_cats(te_df, CAT_COLS, cat_maps)

        train_ds = TabDataset(X_tr_num, X_tr_cat, y_tr)
        val_ds   = TabDataset(X_va_num, X_va_cat, y_va)
        test_ds  = TabDataset(X_te_num, X_te_cat)

        # only the training loader shuffles; val/test keep row order so that
        # predictions line up with va_idx and the test frame
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
        val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
        test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

        # a fresh model is built for every fold
        model = FTTransformer(
            n_num=X_tr_num.shape[1],
            cat_sizes=cat_sizes,
            d_model=D_MODEL,
            n_heads=N_HEADS,
            n_layers=N_LAYERS,
            ffn_mult=FFN_MULT,
            dropout=DROPOUT
        )

        n_params = sum(p.numel() for p in model.parameters())
        print(f"    Model params: {n_params:,} | n_num={X_tr_num.shape[1]} | n_cat={len(cat_sizes)}")

        model, best_auc = train_one_fold(model, train_loader, val_loader, y_tr, y_va, DEVICE)
        fold_aucs.append(best_auc)

        oof[va_idx] = predict(model, val_loader, DEVICE)
        # each fold contributes 1/N_FOLDS of the final test prediction
        test_pred += predict(model, test_loader, DEVICE) / N_FOLDS

        print(f"  ‚úÖ Fold {fold} best AUC: {best_auc:.5f}")

    oof_auc = roc_auc_score(y_full, oof)

    print("\n" + "="*60)
    print("üéâ Final Result")
    print("="*60)
    print(f"üèÜ OOF AUC: {oof_auc:.5f}")
    print(f"üìä Fold AUCs: {[f'{x:.5f}' for x in fold_aucs]}")
    print(f"üìä Mean ¬± Std: {np.mean(fold_aucs):.5f} ¬± {np.std(fold_aucs):.5f}")

    # "voted" holds the fold-averaged predicted probability of voted == 2;
    # fall back to a positional index when the test file has no "index" column
    sub = pd.DataFrame({
        "index": test_raw["index"] if "index" in test_raw.columns else np.arange(len(test_raw)),
        "voted": test_pred
    })
    out_path = "submission_12_FTTransformer_v8_rawcenter.csv"
    sub.to_csv(out_path, index=False)
    print(f"\nüíæ Saved: {out_path}")
    print(f"   pred range: [{test_pred.min():.4f}, {test_pred.max():.4f}] | mean={test_pred.mean():.4f}")

    return oof_auc

# Run the pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main(run_name="ft_rawcenter")



üñ•Ô∏è Device: cpu
üìä Config: 5-Fold | batch=512 | epochs=60
Train: (45532, 79) | Test: (11383, 77)
Target positive rate (voted==2): 54.68%

üìÇ Fold 1/5
    Model params: 168,129 | n_num=79 | n_cat=9
    Epoch 010 | loss=0.5024 | val_auc=0.77849 | best=0.77917
    Epoch 020 | loss=0.4929 | val_auc=0.77724 | best=0.77926
    Early stopping at epoch 23
  ‚úÖ Fold 1 best AUC: 0.77926

üìÇ Fold 2/5
    Model params: 168,129 | n_num=79 | n_cat=9
    Epoch 010 | loss=0.5022 | val_auc=0.76960 | best=0.76971
    Epoch 020 | loss=0.4874 | val_auc=0.76697 | best=0.77138
    Early stopping at epoch 23
  ‚úÖ Fold 2 best AUC: 0.77138

üìÇ Fold 3/5
    Model params: 168,129 | n_num=79 | n_cat=9
    Epoch 010 | loss=0.4968 | val_auc=0.75926 | best=0.76046
    Early stopping at epoch 15
  ‚úÖ Fold 3 best AUC: 0.76046

üìÇ Fold 4/5
    Model params: 168,129 | n_num=79 | n_cat=9
    Epoch 010 | loss=0.4985 | val_auc=0.76536 | best=0.76536
    Epoch 020 | loss=0.4834 | val_auc=0.76134 | best=0.76