# 03_FTTransformer_v2_kfold 요약

- 모델: FT-Transformer
- 피처: v1 + missing indicator + qa_missing_ratio + race/religion 원본
- 학습/평가: StratifiedKFold 5 (shuffle, seed 고정), 누수 방지 전처리
- 제출파일: submission_03_FTTransformer_v2_kfold.csv


# 04_third_test

## 사용 피처
- Q_A 태도: neg_att, pos_att, neutral_ratio, confident_ratio
- Big5: diff/strength 10개
- 단어 인지: wr_sum, wf_sum, word_credibility, cred_bin
- 인구통계: age_group_ord, education, urban_ord
- 무응답 indicator: education_is_missing, urban_is_missing, hand_is_missing, married_is_missing
- QA missing: qa_missing_ratio
- 범주형: hand_cat, married_cat, race, religion, cred_bin (코드의 CAT_COLS 기준, race/religion은 원본 값 사용)

## 모델
- FT-Transformer (CLS token 사용)
- NumericalEmbedding + Categorical Embedding

## 학습 세팅
- Loss: BCEWithLogitsLoss(pos_weight)
- Optimizer: AdamW
- Scheduler: ReduceLROnPlateau (mode='max')
- Early Stopping: val AUC 기준
- Metric: ROC-AUC
- 제출: 확률 (sub['voted'] = fold별 test 확률의 평균, test_probs_mean)


In [1]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score


## 0. 설정


In [2]:
# Shared hyperparameters
SEED = 42            # seeds the RNGs and the fold split
N_SPLITS = 5         # number of cross-validation folds
BATCH_SIZE = 256     # batch size for the train, validation and test loaders
EPOCHS = 40          # upper bound on epochs per fold
PATIENCE = 6         # epochs without validation-AUC improvement before stopping
LR = 3e-4            # AdamW learning rate
WEIGHT_DECAY = 3e-4  # AdamW weight decay
D_TOKEN = 64         # default token (embedding) width of the FT-Transformer
N_LAYERS = 2         # default number of transformer encoder layers
N_HEADS = 4          # default number of attention heads
DROPOUT = 0.2        # default dropout rate of the FT-Transformer

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


## 1. 유틸


In [3]:
def seed_everything(seed=SEED):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with *seed*.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    autotuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def make_cfg(train_df):
    """Collect imputation statistics from the training split.

    Returns a dict with ``"education_mean"``: the mean of the ``education``
    column after coercing it to numeric and treating 0 as missing.
    """
    education = pd.to_numeric(train_df["education"], errors="coerce")
    answered = education.replace(0, np.nan)
    return {"education_mean": float(answered.mean())}


def _normalize_cat(s: pd.Series) -> pd.Series:
    """Return *s* as a string series with every missing marker unified.

    Real nulls and the literal texts ``"<NA>"``, ``"nan"`` and ``"NaN"`` all
    become ``"__MISSING__"``.
    """
    missing = "__MISSING__"
    textual_nulls = dict.fromkeys(("<NA>", "nan", "NaN"), missing)
    return s.astype("string").fillna(missing).replace(textual_nulls)


def _fit_cat_maps(df_train, cat_cols):
    """Build a ``{column: {label: code}}`` encoder from the training frame.

    Labels are the normalized string values of each column; codes follow the
    sorted label order. ``"__MISSING__"`` is always part of the vocabulary, so
    it can serve as the fallback for labels unseen at fit time.
    """
    encoders = {}
    for name in cat_cols:
        labels = set(_normalize_cat(df_train[name]).astype(str).unique().tolist())
        labels.add("__MISSING__")
        encoders[name] = {label: code for code, label in enumerate(sorted(labels))}
    return encoders


def _transform_cat(series, mapping):
    """Encode *series* as an integer array using *mapping*.

    Labels absent from *mapping* take the code of ``"__MISSING__"``.
    """
    fallback = mapping.get("__MISSING__")
    labels = _normalize_cat(series)
    return np.array([mapping.get(label, fallback) for label in labels], dtype=int)


## 2. 데이터 로드


In [4]:
# Fix all RNG seeds before anything else runs.
seed_everything(SEED)
# Raw competition files, addressed relative to the notebook's working directory.
train_raw = pd.read_csv("../../data/raw/train.csv")
test_raw  = pd.read_csv("../../data/raw/test_x.csv")
# Used further down as the template that defines the submission's rows.
sample_sub = pd.read_csv("../../data/raw/sample_submission.csv")


## 3. build_features (요구사항 반영)


In [5]:
def _zero_to_nan(values):
    """Coerce *values* to numeric and turn 0 (treated as "no response") into NaN."""
    numeric = pd.to_numeric(values, errors="coerce")
    # mask() upcasts to float by itself; writing NaN into an integer column
    # through .loc relies on silent upcasting, which pandas has deprecated.
    return numeric.mask(numeric == 0)


def build_features(df_raw, cfg=None, is_train=True):
    """Derive the model's feature columns from a raw survey frame.

    The input is copied, never modified. Each feature group is only built when
    its source columns are present.

    Args:
        df_raw: Raw train or test frame.
        cfg: Optional dict with ``"education_mean"`` (see ``make_cfg``). When
            given, missing ``education`` values are filled with it; when
            ``None`` they stay NaN.
        is_train: When true and a ``voted`` column exists, adds ``voted_bin``
            (1 where ``voted == 2``, else 0).

    Returns:
        A new DataFrame holding the original columns plus the derived ones.
    """
    df = df_raw.copy()

    if is_train and "voted" in df.columns:
        df["voted_bin"] = (df["voted"] == 2).astype(int)

    if "age_group" in df.columns:
        age_map = {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
        df["age_group_ord"] = df["age_group"].map(age_map).astype("float32")

    if "education" in df.columns:
        df["education"] = _zero_to_nan(df["education"])
        # The indicator must be taken before imputation erases the information.
        df["education_is_missing"] = df["education"].isna().astype("float32")
        if cfg is not None:
            df["education"] = df["education"].fillna(cfg["education_mean"])
        df["education"] = df["education"].astype("float32")

    for col in ("married", "hand"):
        if col in df.columns:
            df[col] = _zero_to_nan(df[col])
            df[f"{col}_is_missing"] = df[col].isna().astype("float32")
            # Cast to float first so labels are always "1.0", "2.0", ...; otherwise
            # a split without any 0 stays integer and yields "1", "2", which would
            # not match the vocabulary fitted on another split.
            df[f"{col}_cat"] = df[col].astype("float64").astype("string")

    if "urban" in df.columns:
        df["urban"] = _zero_to_nan(df["urban"])
        df["urban_is_missing"] = df["urban"].isna().astype("float32")
        df["urban_ord"] = df["urban"].astype("float32")

    neg_cols = ["QbA", "QcA", "QjA", "QmA", "QoA", "QsA"]
    pos_cols = ["QkA", "QqA"]
    other_cols = ["QeA", "QfA", "QhA", "QrA"]

    # Every column named Q...A is treated as a questionnaire answer.
    qa_cols = [c for c in df.columns if c.startswith("Q") and c.endswith("A")]
    for col in qa_cols:
        df[col] = _zero_to_nan(df[col])

    if all(c in df.columns for c in neg_cols):
        df["neg_att"] = df[neg_cols].mean(axis=1)

    if all(c in df.columns for c in pos_cols):
        df["pos_att"] = df[pos_cols].mean(axis=1)

    if all(c in df.columns for c in other_cols):
        other = df[other_cols]
        # Ratios are taken over answered items only; rows with none answered get NaN.
        denom = other.notna().sum(axis=1)
        df["neutral_ratio"] = np.where(
            denom > 0, (other == 3).sum(axis=1) / denom, np.nan
        ).astype("float32")
        df["confident_ratio"] = np.where(
            denom > 0, other.isin([1, 2, 4, 5]).sum(axis=1) / denom, np.nan
        ).astype("float32")

    if len(qa_cols) > 0:
        df["qa_missing_ratio"] = df[qa_cols].isna().sum(axis=1) / len(qa_cols)

    # Each trait is scored as the difference of its two tp items.
    tp_pairs = {
        "extraversion": ("tp01", "tp06"),
        "agreeableness": ("tp07", "tp02"),
        "conscientiousness": ("tp03", "tp08"),
        "neuroticism": ("tp04", "tp09"),
        "openness": ("tp05", "tp10"),
    }
    for trait, (a, b) in tp_pairs.items():
        if a in df.columns and b in df.columns:
            df[a] = pd.to_numeric(df[a], errors="coerce")
            df[b] = pd.to_numeric(df[b], errors="coerce")
            df[f"{trait}_diff"] = (df[a] - df[b]).astype("float32")
            df[f"{trait}_strength"] = df[f"{trait}_diff"].abs().astype("float32")

    wr_cols = [f"wr_{i:02d}" for i in range(1, 14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1, 4)]

    if all(c in df.columns for c in wr_cols):
        df["wr_sum"] = df[wr_cols].sum(axis=1).astype("float32")

    if all(c in df.columns for c in wf_cols):
        df["wf_sum"] = df[wf_cols].sum(axis=1).astype("float32")

    if "wr_sum" in df.columns and "wf_sum" in df.columns:
        df["word_credibility"] = (df["wr_sum"] - df["wf_sum"]).astype("float32")

    if "word_credibility" in df.columns:
        # Bins are right-closed, so without include_lowest the lowest edge (-3)
        # would belong to no bin and come out as NaN.
        df["cred_bin"] = pd.cut(
            df["word_credibility"],
            bins=[-3, 1, 6, 13],
            labels=["Low", "Mid", "High"],
            include_lowest=True,
        )

    return df


## 4. 컬럼 정의


In [6]:
# Binary label column that build_features adds to training frames.
TARGET = "voted_bin"
# Every model input, numerical and categorical alike.
FEATURE_COLS = [
    # questionnaire attitude aggregates
    "neg_att", "pos_att", "neutral_ratio", "confident_ratio",
    # trait item differences and their absolute values
    "extraversion_diff", "extraversion_strength",
    "agreeableness_diff", "agreeableness_strength",
    "conscientiousness_diff", "conscientiousness_strength",
    "neuroticism_diff", "neuroticism_strength",
    "openness_diff", "openness_strength",
    # word-recognition sums and their binned difference
    "wr_sum", "wf_sum", "word_credibility", "cred_bin",
    # demographics
    "age_group_ord", "education", "urban_ord",
    # no-response indicators
    "education_is_missing", "urban_is_missing", "hand_is_missing", "married_is_missing",
    "qa_missing_ratio",
    "hand_cat", "married_cat",
    # raw columns, used as-is
    "race", "religion",
]

# The subset of FEATURE_COLS that is label-encoded and embedded; every other
# feature is handled as numerical.
CAT_COLS = ["race", "religion", "cred_bin", "hand_cat", "married_cat"]


## 5. 전처리 (누수 방지)


In [7]:
def fit_transform(train_df, val_df, test_df, feature_cols, cat_cols, target):
    """Encode, impute and scale one fold using training-split statistics only.

    Category vocabularies, imputation means and the scaler are all fitted on
    *train_df* and then applied unchanged to *val_df* and *test_df*.

    Args:
        train_df, val_df, test_df: Frames produced by ``build_features``.
        feature_cols: All model input columns.
        cat_cols: Columns of *feature_cols* to label-encode; the remaining
            ones are treated as numerical.
        target: Label column, read from *train_df* and *val_df*.

    Returns:
        A dict with ``X_num_*`` / ``X_cat_*`` arrays for train, val and test,
        ``y_train``, ``y_val``, ``cat_dims`` (vocabulary size per categorical
        column, in *cat_cols* order) and ``num_cols`` (numerical column names
        in the column order of the ``X_num_*`` arrays).
    """
    y_train = train_df[target].values
    y_val = val_df[target].values

    cat_maps = _fit_cat_maps(train_df, cat_cols)

    X_cat_train = np.stack([_transform_cat(train_df[c], cat_maps[c]) for c in cat_cols], axis=1)
    X_cat_val   = np.stack([_transform_cat(val_df[c], cat_maps[c])   for c in cat_cols], axis=1)
    X_cat_test  = np.stack([_transform_cat(test_df[c], cat_maps[c])  for c in cat_cols], axis=1)

    num_cols = [c for c in feature_cols if c not in cat_cols]

    train_num = train_df[num_cols].apply(pd.to_numeric, errors="coerce")
    val_num   = val_df[num_cols].apply(pd.to_numeric, errors="coerce")
    test_num  = test_df[num_cols].apply(pd.to_numeric, errors="coerce")

    # A column that is entirely NaN in the training split has a NaN mean; fall
    # back to 0 so that imputation never leaves NaN in the model inputs.
    train_means = train_num.mean().fillna(0.0)
    train_num = train_num.fillna(train_means)
    val_num   = val_num.fillna(train_means)
    test_num  = test_num.fillna(train_means)

    binary_cols = [
        "education_is_missing", "urban_is_missing", "hand_is_missing", "married_is_missing"
    ]
    ratio_cols = ["qa_missing_ratio", "neutral_ratio", "confident_ratio"]

    # 0/1 indicators and ratios are passed through unscaled; the rest is standardized.
    pass_cols = [c for c in (binary_cols + ratio_cols) if c in num_cols]
    scale_cols = [c for c in num_cols if c not in pass_cols]

    if scale_cols:
        scaler = StandardScaler()
        X_num_train_scaled = scaler.fit_transform(train_num[scale_cols].values)
        X_num_val_scaled   = scaler.transform(val_num[scale_cols].values)
        X_num_test_scaled  = scaler.transform(test_num[scale_cols].values)
    else:
        X_num_train_scaled = np.zeros((len(train_num), 0))
        X_num_val_scaled   = np.zeros((len(val_num), 0))
        X_num_test_scaled  = np.zeros((len(test_num), 0))

    if pass_cols:
        X_num_train_pass = train_num[pass_cols].values
        X_num_val_pass   = val_num[pass_cols].values
        X_num_test_pass  = test_num[pass_cols].values
    else:
        X_num_train_pass = np.zeros((len(train_num), 0))
        X_num_val_pass   = np.zeros((len(val_num), 0))
        X_num_test_pass  = np.zeros((len(test_num), 0))

    X_num_train = np.concatenate([X_num_train_scaled, X_num_train_pass], axis=1)
    X_num_val   = np.concatenate([X_num_val_scaled,   X_num_val_pass], axis=1)
    X_num_test  = np.concatenate([X_num_test_scaled,  X_num_test_pass], axis=1)

    cat_dims = [len(cat_maps[c]) for c in cat_cols]

    return {
        "X_num_train": X_num_train,
        "X_num_val": X_num_val,
        "X_num_test": X_num_test,
        "X_cat_train": X_cat_train,
        "X_cat_val": X_cat_val,
        "X_cat_test": X_cat_test,
        "y_train": y_train,
        "y_val": y_val,
        "cat_dims": cat_dims,
        # Matrix columns are laid out scaled-first, pass-through-last, so the
        # names are reported in that order rather than in feature_cols order.
        "num_cols": scale_cols + pass_cols,
    }


## 6. Dataset


In [8]:
class TabDataset(Dataset):
    """Dataset over numerical features, categorical codes and optional labels.

    Items are ``(x_num, x_cat)`` tuples when no labels were given and
    ``(x_num, x_cat, y)`` tuples otherwise. Labels are stored as float32 with
    a trailing axis of size 1.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        features = (self.X_num[idx], self.X_cat[idx])
        return features if self.y is None else (*features, self.y[idx])


## 7. FT-Transformer (CLS token)


In [9]:
class NumericalEmbedding(nn.Module):
    """Turn each scalar feature into a token with its own weight and bias vector.

    For an input whose last axis has ``num_features`` entries, the output gains
    a trailing axis of size ``d_token``: ``x[..., j, None] * weight[j] + bias[j]``.
    """

    def __init__(self, num_features, d_token):
        super().__init__()
        shape = (num_features, d_token)
        self.weight = nn.Parameter(torch.randn(*shape))
        self.bias = nn.Parameter(torch.zeros(*shape))

    def forward(self, x):
        scaled = x[..., None] * self.weight
        return scaled + self.bias


class FTTransformer(nn.Module):
    """Transformer encoder over per-feature tokens with a CLS-token readout.

    Each categorical column gets its own embedding table and each numerical
    column its own affine embedding. A learned CLS token is prepended, the
    sequence is run through the encoder, and the CLS output is mapped to a
    single logit by the head.

    Args:
        cat_dims: Vocabulary size of each categorical column, in input order.
        num_features: Number of numerical columns.
        d_token: Token width.
        n_layers: Number of encoder layers.
        n_heads: Number of attention heads.
        dropout: Dropout rate for the encoder layers and the head.
        attn_dropout: Accepted for interface compatibility but currently
            unused; ``nn.TransformerEncoderLayer`` takes a single rate.
    """

    def __init__(self, cat_dims, num_features, d_token=D_TOKEN, n_layers=N_LAYERS,
                 n_heads=N_HEADS, dropout=DROPOUT, attn_dropout=DROPOUT):
        super().__init__()
        self.cat_embeds = nn.ModuleList([nn.Embedding(dim, d_token) for dim in cat_dims])
        self.num_embed = NumericalEmbedding(num_features, d_token)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=d_token * 4,
            # Use the constructor argument, not the module-level constant, so a
            # caller-supplied rate actually reaches the encoder.
            dropout=dropout,
            batch_first=True,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, d_token),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_token, 1)
        )

    def forward(self, x_num, x_cat):
        """Return one logit per row, shaped ``(batch, 1)``.

        Column ``i`` of *x_cat* is looked up in the ``i``-th embedding table.
        """
        num_tokens = self.num_embed(x_num)
        if len(self.cat_embeds) > 0:
            cat_tokens = torch.stack(
                [emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embeds)], dim=1
            )
            tokens = torch.cat([cat_tokens, num_tokens], dim=1)
        else:
            tokens = num_tokens

        # Token order is [CLS, categorical..., numerical...].
        cls = self.cls_token.expand(tokens.size(0), -1, -1)
        tokens = torch.cat([cls, tokens], dim=1)

        encoded = self.transformer(tokens)
        return self.head(encoded[:, 0])


## 8. 학습/예측


In [10]:
def train_one_fold(X_num_train, X_cat_train, y_train, X_num_val, X_cat_val, y_val, cat_dims, num_features):
    """Train one FT-Transformer on a fold with early stopping on validation AUC.

    The loss is BCE-with-logits weighted by the negative/positive ratio of
    *y_train*. After each epoch the validation ROC-AUC drives both the
    learning-rate scheduler and early stopping; an epoch counts as an
    improvement only when it beats the best AUC by more than 1e-4.

    Returns:
        ``(model, best_auc)`` where the model carries the weights of the best
        epoch.
    """
    train_loader = DataLoader(
        TabDataset(X_num_train, X_cat_train, y_train), batch_size=BATCH_SIZE, shuffle=True
    )
    val_loader = DataLoader(
        TabDataset(X_num_val, X_cat_val, y_val), batch_size=BATCH_SIZE, shuffle=False
    )

    model = FTTransformer(cat_dims=cat_dims, num_features=num_features).to(DEVICE)

    n_pos = y_train.sum()
    neg_over_pos = (len(y_train) - n_pos) / (n_pos + 1e-6)
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([neg_over_pos], dtype=torch.float32, device=DEVICE)
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3, min_lr=1e-5
    )

    def run_train_epoch():
        # One optimisation pass over the training loader, with gradient clipping.
        model.train()
        for feats_num, feats_cat, labels in train_loader:
            feats_num = feats_num.to(DEVICE)
            feats_cat = feats_cat.to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()
            batch_loss = criterion(model(feats_num, feats_cat), labels)
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    def score_validation():
        # ROC-AUC of the current weights on the validation split.
        model.eval()
        chunks = []
        with torch.no_grad():
            for feats_num, feats_cat, _ in val_loader:
                out = model(feats_num.to(DEVICE), feats_cat.to(DEVICE))
                chunks.append(torch.sigmoid(out).cpu().numpy().ravel())
        return roc_auc_score(y_val, np.concatenate(chunks))

    best_auc, best_state, stale_epochs = -1.0, None, 0

    for epoch in range(EPOCHS):
        run_train_epoch()
        val_auc = score_validation()
        scheduler.step(val_auc)

        if val_auc > best_auc + 1e-4:
            best_auc = val_auc
            # Snapshot on the CPU so that later epochs cannot overwrite it.
            best_state = {
                key: tensor.cpu().clone() for key, tensor in model.state_dict().items()
            }
            stale_epochs = 0
        else:
            stale_epochs += 1

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1} | val AUC {val_auc:.4f}")

        if stale_epochs >= PATIENCE:
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_auc


def predict_proba(model, X_num, X_cat):
    """Return the model's sigmoid outputs for every row as a flat NumPy array.

    Puts the model in eval mode and runs without gradient tracking, batch by
    batch and in row order.
    """
    loader = DataLoader(
        TabDataset(X_num, X_cat, y=None), batch_size=BATCH_SIZE, shuffle=False
    )

    model.eval()
    with torch.no_grad():
        batches = [
            torch.sigmoid(model(feats_num.to(DEVICE), feats_cat.to(DEVICE)))
            .cpu()
            .numpy()
            .ravel()
            for feats_num, feats_cat in loader
        ]
    return np.concatenate(batches)


## 9. K-Fold 학습 및 제출


In [None]:
# Binary labels, needed here only to stratify the folds.
y_all = (train_raw["voted"] == 2).astype(int).values
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

fold_aucs = []
test_probs_list = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(train_raw, y_all), 1):
    print(f"[Fold {fold}]")
    tr_raw = train_raw.iloc[tr_idx].copy()
    va_raw = train_raw.iloc[va_idx].copy()

    # Imputation statistics come from this fold's training rows only.
    cfg = make_cfg(tr_raw)

    tr_feat = build_features(tr_raw, cfg=cfg, is_train=True)
    va_feat = build_features(va_raw, cfg=cfg, is_train=True)
    te_feat = build_features(test_raw, cfg=cfg, is_train=False)

    # Encoders, imputation means and the scaler are refitted for every fold.
    prep = fit_transform(tr_feat, va_feat, te_feat, FEATURE_COLS, CAT_COLS, TARGET)

    num_features = prep["X_num_train"].shape[1]

    model, best_auc = train_one_fold(
        prep["X_num_train"], prep["X_cat_train"], prep["y_train"],
        prep["X_num_val"], prep["X_cat_val"], prep["y_val"],
        prep["cat_dims"], num_features
)


    fold_aucs.append(best_auc)
    # Each fold's model scores the full test set; the folds are averaged below.
    test_probs = predict_proba(model, prep["X_num_test"], prep["X_cat_test"])
    test_probs_list.append(test_probs)

    print(f"Fold {fold} best AUC: {best_auc:.4f}")

mean_auc = float(np.mean(fold_aucs))
print(f"Mean Validation ROC-AUC: {mean_auc:.4f}")

# test probs average
test_probs_mean = np.mean(np.stack(test_probs_list, axis=0), axis=0)

# Build the submission (joined on the "index" column)
test_pred = pd.DataFrame({
    "index": test_raw["index"].values,
    "voted": test_probs_mean
})

# Left-join onto the sample submission so that its set of rows is kept.
sub = sample_sub.drop(columns=["voted"]).merge(test_pred, on="index", how="left")
sub = sub[["index", "voted"]].sort_values("index").reset_index(drop=True)
sub["voted"] = sub["voted"].astype(float)

# Any sample-submission index without a prediction would surface here as NaN.
assert sub["voted"].isna().sum() == 0, "NaN in submission"

# NOTE(review): the recorded cell output names a different file than the one
# written here — confirm which file name is intended.
sub.to_csv("submission_03_FTTransformer_v2_kfold.csv", index=False)
print("saved submission_03_FTTransformer_v2_kfold.csv")



[Fold 1]
Epoch 5 | val AUC 0.7716
Epoch 10 | val AUC 0.7732
Epoch 15 | val AUC 0.7744
Epoch 20 | val AUC 0.7737
Fold 1 best AUC: 0.7744
[Fold 2]
Epoch 5 | val AUC 0.7620
Epoch 10 | val AUC 0.7644
Epoch 15 | val AUC 0.7657
Epoch 20 | val AUC 0.7659
Fold 2 best AUC: 0.7663
[Fold 3]
Epoch 5 | val AUC 0.7573
Epoch 10 | val AUC 0.7585
Epoch 15 | val AUC 0.7595
Fold 3 best AUC: 0.7605
[Fold 4]
Epoch 5 | val AUC 0.7550
Epoch 10 | val AUC 0.7554
Epoch 15 | val AUC 0.7570
Fold 4 best AUC: 0.7572
[Fold 5]
Epoch 5 | val AUC 0.7618
Epoch 10 | val AUC 0.7656
Epoch 15 | val AUC 0.7656
Fold 5 best AUC: 0.7656
Mean Validation ROC-AUC: 0.7648
saved submission_ft_third_test_prob_class2.csv
