# 05_FTTransformer_v4_gemini_feats 요약

- 모델: FT-Transformer
- 피처: v1 베이스 + Gemini 요약(응답시간/문항통계/신뢰도)
- 학습/평가: 단일 split
- 제출파일: submission_05_FTTransformer_v4_gemini_feats.csv


## 1. 라이브러리 로드


In [None]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score



## 2. 설정값 및 시드 고정


In [None]:
# =============================
# CONFIG
# =============================
SEED = 42          # seed shared by the split and all RNGs
BATCH_SIZE = 256   # mini-batch size
EPOCHS = 50        # default upper bound on training epochs
PATIENCE = 6       # default early-stopping patience

# Prefer the GPU when torch can see one; otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"


def set_seed(seed=SEED):
    """Seed the Python, NumPy and torch RNGs and pin cuDNN to deterministic mode."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(SEED)



## 3. 데이터 로드


In [None]:
# Read the raw train / test tables in one pass (same files, same order).
train_raw, test_raw = map(
    pd.read_csv,
    ("../../data/raw/train.csv", "../../data/raw/test_x.csv"),
)



## 4. 피처 생성 함수


In [None]:
# =============================
# Feature Engineering
# =============================
def build_features(df_raw, cfg=None, is_train=True):
    """Derive the engineered feature columns from a raw survey frame.

    Every step is guarded by a column-presence check, so columns missing from
    ``df_raw`` simply skip the features that depend on them.

    Args:
        df_raw: Raw DataFrame. It is copied; the caller's frame is not modified.
        cfg: Optional dict of statistics fitted on the training split. Keys read
            here: ``"education_mean"`` (fill value for missing education),
            ``"race_majors"`` and ``"religion_majors"`` (categories kept as-is;
            everything else becomes ``"Other"``). When ``None``, education is
            left unfilled and ``race_simple`` / ``religion_simple`` are not
            created.
        is_train: When true and a ``voted`` column exists, add the binary
            target ``voted_bin`` (1 where ``voted == 2``).

    Returns:
        A new DataFrame holding the original columns plus the derived ones.
        Note that the reverse-keyed ``Q?A`` items are overwritten with their
        reversed values when all twenty ``Q?A`` columns are present.
    """
    df = df_raw.copy()

    def zero_as_missing(col):
        # Cast to float first: assigning NaN into an integer column via .loc is
        # an incompatible-dtype setitem (deprecated in pandas >= 2.1), and a
        # frame without any 0 would otherwise stay integer and later render as
        # "1" instead of "1.0" in the *_cat string features.
        values = pd.to_numeric(df[col], errors="coerce").astype("float64")
        return values.mask(values == 0)

    # 0) target
    if is_train and "voted" in df.columns:
        df["voted_bin"] = (df["voted"] == 2).astype(int)

    # 1) age_group ordinal
    if "age_group" in df.columns:
        age_map = {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
        df["age_group_ord"] = df["age_group"].map(age_map).astype("float32")

    # 2) education (0 -> NaN -> train mean)
    if "education" in df.columns:
        df["education"] = zero_as_missing("education")
        if cfg is not None:
            df["education"] = df["education"].fillna(cfg["education_mean"])
        df["education"] = df["education"].astype("float32")

    # 3) married_cat (0 is treated as missing)
    if "married" in df.columns:
        df["married"] = zero_as_missing("married")
        df["married_cat"] = df["married"].astype("string")

    # 4) hand_cat (0 is treated as missing)
    if "hand" in df.columns:
        df["hand"] = zero_as_missing("hand")
        df["hand_cat"] = df["hand"].astype("string")

    # 5) urban_ord (0 is treated as missing)
    if "urban" in df.columns:
        df["urban"] = zero_as_missing("urban")
        df["urban_ord"] = df["urban"].astype("float32")

    # 6) race/religion simple: keep the major categories, collapse the rest
    def simplify_major_other(series, majors):
        return series.apply(lambda x: x if x in majors else "Other")

    if "race" in df.columns and cfg is not None:
        df["race_simple"] = simplify_major_other(df["race"], cfg["race_majors"]).astype(str)

    if "religion" in df.columns and cfg is not None:
        df["religion_simple"] = simplify_major_other(df["religion"], cfg["religion_majors"]).astype(str)

    # -----------------------------
    # (BASE) Q_A summaries
    # -----------------------------
    neg_cols = ["QbA", "QcA", "QjA", "QmA", "QoA", "QsA"]
    pos_cols = ["QkA", "QqA"]
    other_cols = ["QeA", "QfA", "QhA", "QrA"]

    for col in neg_cols + pos_cols + other_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    if all(c in df.columns for c in neg_cols):
        df["neg_att"] = df[neg_cols].mean(axis=1)

    if all(c in df.columns for c in pos_cols):
        df["pos_att"] = df[pos_cols].mean(axis=1)

    if all(c in df.columns for c in other_cols):
        other = df[other_cols]
        # Share of answers equal to 3 vs. share of answers away from 3.
        df["neutral_ratio"] = (other == 3).mean(axis=1).astype("float32")
        df["confident_ratio"] = ((other <= 2) | (other >= 4)).mean(axis=1).astype("float32")

    # -----------------------------
    # (BASE) Big5 diff/strength
    # -----------------------------
    tp_pairs = {
        "extraversion": ("tp01", "tp06"),
        "agreeableness": ("tp07", "tp02"),
        "conscientiousness": ("tp03", "tp08"),
        "neuroticism": ("tp04", "tp09"),
        "openness": ("tp05", "tp10"),
    }
    for trait, (a, b) in tp_pairs.items():
        if a in df.columns and b in df.columns:
            df[a] = pd.to_numeric(df[a], errors="coerce")
            df[b] = pd.to_numeric(df[b], errors="coerce")
            df[f"{trait}_diff"] = (df[a] - df[b]).astype("float32")
            df[f"{trait}_strength"] = df[f"{trait}_diff"].abs().astype("float32")

    # -----------------------------
    # (BASE) word sums
    # -----------------------------
    wr_cols = [f"wr_{i:02d}" for i in range(1, 14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1, 4)]

    if all(c in df.columns for c in wr_cols):
        df["wr_sum"] = df[wr_cols].sum(axis=1).astype("float32")
    if all(c in df.columns for c in wf_cols):
        df["wf_sum"] = df[wf_cols].sum(axis=1).astype("float32")

    if "wr_sum" in df.columns and "wf_sum" in df.columns:
        df["word_credibility"] = (df["wr_sum"] - df["wf_sum"]).astype("float32")
        # Gemini variant: wf_sum is penalised twice as hard.
        df["reliability_score"] = (df["wr_sum"] - 2.0 * df["wf_sum"]).astype("float32")

    if "word_credibility" in df.columns:
        # include_lowest=True keeps the left edge (-3) inside the "Low" bin;
        # pd.cut's default half-open first interval would turn it into NaN.
        df["cred_bin"] = pd.cut(
            df["word_credibility"],
            bins=[-3, 1, 6, 13],
            labels=["Low", "Mid", "High"],
            include_lowest=True,
        )

    # =============================
    # (ADD: Gemini summary only)
    # =============================

    # (1) total_time_log: sum of log1p over all Q?E columns
    qe_cols = [f"Q{c}E" for c in "abcdefghijklmnopqrst"]
    if all(c in df.columns for c in qe_cols):
        for c in qe_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce")
        df["total_time_log"] = np.log1p(df[qe_cols]).sum(axis=1).astype("float32")

    # (2) mach_score / mach_std (with reverse scoring)
    qa_cols = [f"Q{c}A" for c in "abcdefghijklmnopqrst"]
    reverse_cols = ["QaA", "QdA", "QgA", "QiA", "QlA", "QnA", "QpA", "QrA", "QtA"]
    if all(c in df.columns for c in qa_cols):
        for c in qa_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce")
        for c in reverse_cols:
            if c in df.columns:
                df[c] = 6 - df[c]  # assumes a 1~5 answer scale
        df["mach_score"] = df[qa_cols].mean(axis=1).astype("float32")
        df["mach_std"] = df[qa_cols].std(axis=1).astype("float32")

    # (3) age_edu interaction (categorical)
    if "age_group" in df.columns and "education" in df.columns:
        df["age_edu"] = df["age_group"].astype("string") + "_" + df["education"].astype("string")

    return df



## 5. 피처 리스트


In [None]:
# =============================
# Feature columns
# =============================
target = "voted_bin"

# Base feature set.
_BASE_FEATURES = [
    "neg_att",
    "pos_att",
    "neutral_ratio",
    "confident_ratio",
    "extraversion_diff",
    "extraversion_strength",
    "agreeableness_diff",
    "agreeableness_strength",
    "conscientiousness_diff",
    "conscientiousness_strength",
    "neuroticism_diff",
    "neuroticism_strength",
    "openness_diff",
    "openness_strength",
    "wr_sum",
    "wf_sum",
    "word_credibility",
    "cred_bin",
    "age_group_ord",
    "education",
    "urban_ord",
    "hand_cat",
    "married_cat",
    "race_simple",
    "religion_simple",
]

# Additions from the Gemini summary only.
_GEMINI_FEATURES = [
    "total_time_log",     # log-sum of all response times
    "mach_score",
    "mach_std",
    "reliability_score",
    "age_edu",            # interaction (categorical)
]

# Base features followed by the Gemini-summary additions.
feature_cols = _BASE_FEATURES + _GEMINI_FEATURES

# Subset of feature_cols treated as categorical.
cat_cols = [
    "race_simple",
    "religion_simple",
    "cred_bin",
    "hand_cat",
    "married_cat",
    "age_edu",
]



## 6. 전처리 함수 (인코딩/스케일링)


In [None]:
def _normalize_cat(s: pd.Series) -> pd.Series:
    """Cast ``s`` to pandas string dtype and map every missing marker to ``"__MISSING__"``.

    Real NA values as well as the literal texts ``"<NA>"``, ``"nan"`` and
    ``"NaN"`` are all replaced by the same sentinel.
    """
    sentinel = "__MISSING__"
    as_text = s.astype("string").fillna(sentinel)
    return as_text.replace(["<NA>", "nan", "NaN"], sentinel)


def _fit_label_encoders(df_train, cat_cols):
    """Fit one ``LabelEncoder`` per categorical column of ``df_train``.

    Each encoder is guaranteed to know the ``"__MISSING__"`` class, even when
    the training column itself has no missing value.

    Returns:
        dict mapping column name -> fitted ``LabelEncoder``.
    """
    sentinel = "__MISSING__"
    fitted = {}
    for name in cat_cols:
        values = _normalize_cat(df_train[name])
        has_sentinel = (values == sentinel).any()
        if not has_sentinel:
            # Append one synthetic row so the sentinel becomes a known class.
            values = pd.concat([values, pd.Series([sentinel])], ignore_index=True)
        fitted[name] = LabelEncoder().fit(values.astype(str))
    return fitted


def _transform_with_encoder(series, le: LabelEncoder):
    """Encode ``series`` with ``le``, sending categories the encoder has not seen to ``"__MISSING__"``."""
    cleaned = _normalize_cat(series)
    seen = cleaned.isin(le.classes_)
    cleaned = cleaned.where(seen, "__MISSING__")
    return le.transform(cleaned.astype(str))


def preprocess_split_fit_transform(df_train, df_val, df_test, feature_cols, cat_cols, target):
    """Fit encoders / imputation / scaling on the train split and apply them to all splits.

    Args:
        df_train, df_val, df_test: Feature frames; train and val must contain
            ``target``.
        feature_cols: All model feature names.
        cat_cols: Names of the categorical features; every other entry of
            ``feature_cols`` is treated as numeric.
        target: Name of the label column.

    Returns:
        dict with the numeric arrays (``X_num_*``), the label-encoded
        categorical arrays (``X_cat_*``), ``y_train`` / ``y_val``, the fitted
        ``encoders`` and ``scaler``, and the numeric column order ``num_cols``.
    """
    splits = (df_train, df_val, df_test)

    y_train = df_train[target].values
    y_val = df_val[target].values

    encoders = _fit_label_encoders(df_train, cat_cols)

    def encode_cats(df):
        # np.stack rejects an empty list, so return an explicit (n_rows, 0)
        # array when there are no categorical features.
        if len(cat_cols) == 0:
            return np.zeros((len(df), 0), dtype=np.int64)
        return np.stack([_transform_with_encoder(df[c], encoders[c]) for c in cat_cols], axis=1)

    X_cat_train, X_cat_val, X_cat_test = (encode_cats(df) for df in splits)

    num_cols = [c for c in feature_cols if c not in cat_cols]

    def to_numeric(df):
        return df[num_cols].apply(pd.to_numeric, errors="coerce")

    train_num, val_num, test_num = (to_numeric(df) for df in splits)

    # Impute with train-split means only. A column that is entirely NaN in
    # train has a NaN mean; fall back to 0.0 so no NaN survives imputation.
    train_means = train_num.mean().fillna(0.0)
    train_num = train_num.fillna(train_means)
    val_num = val_num.fillna(train_means)
    test_num = test_num.fillna(train_means)

    # Scaling statistics are fitted on train only as well.
    scaler = StandardScaler()
    X_num_train = scaler.fit_transform(train_num.values)
    X_num_val = scaler.transform(val_num.values)
    X_num_test = scaler.transform(test_num.values)

    return {
        "X_num_train": X_num_train,
        "X_num_val": X_num_val,
        "X_num_test": X_num_test,
        "X_cat_train": X_cat_train,
        "X_cat_val": X_cat_val,
        "X_cat_test": X_cat_test,
        "y_train": y_train,
        "y_val": y_val,
        "encoders": encoders,
        "scaler": scaler,
        "num_cols": num_cols,
    }



## 7. Train/Val 분리 및 피처 생성


In [None]:
# Stratified 80/20 split on the binarised target.
raw_y = train_raw["voted"].eq(2).astype(int)
train_raw_split, val_raw_split = train_test_split(
    train_raw,
    test_size=0.2,
    random_state=SEED,
    stratify=raw_y,
)

# Statistics consumed by build_features, computed on the train split only.
edu = pd.to_numeric(train_raw_split["education"], errors="coerce").replace(0, np.nan)
cfg = {
    "education_mean": float(edu.mean()),
    "race_majors": set(train_raw_split["race"].value_counts(dropna=True).head(5).index),
    "religion_majors": set(train_raw_split["religion"].value_counts(dropna=True).head(5).index),
}

train_feat = build_features(train_raw_split, cfg=cfg, is_train=True)
val_feat = build_features(val_raw_split, cfg=cfg, is_train=True)
test_feat = build_features(test_raw, cfg=cfg, is_train=False)

prep = preprocess_split_fit_transform(
    train_feat, val_feat, test_feat, feature_cols, cat_cols, target
)

# Unpack the preprocessing outputs into the names the later cells use.
X_num_train, X_num_val, X_num_test = (
    prep[key] for key in ("X_num_train", "X_num_val", "X_num_test")
)
X_cat_train, X_cat_val, X_cat_test = (
    prep[key] for key in ("X_cat_train", "X_cat_val", "X_cat_test")
)
y_train, y_val = prep["y_train"], prep["y_val"]
num_cols = prep["num_cols"]



## 8. Dataset & 모델 정의


In [None]:
class TabDataset(Dataset):
    """Tensor-backed dataset of numeric features, categorical codes and optional labels.

    Items are ``(x_num, x_cat)`` when no labels were given and
    ``(x_num, x_cat, y)`` otherwise.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is None:
            self.y = None
        else:
            # Add a trailing axis so each label is returned as a length-1 tensor.
            self.y = torch.tensor(y, dtype=torch.float32)[:, None]

    def __len__(self):
        return self.X_num.shape[0]

    def __getitem__(self, idx):
        features = (self.X_num[idx], self.X_cat[idx])
        if self.y is not None:
            return (*features, self.y[idx])
        return features


class NumericalEmbedding(nn.Module):
    """Per-feature affine map that lifts each scalar feature to a ``d_token`` vector."""

    def __init__(self, num_features, d_token):
        super().__init__()
        shape = (num_features, d_token)
        self.weight = nn.Parameter(torch.randn(*shape))
        self.bias = nn.Parameter(torch.zeros(*shape))

    def forward(self, x):
        # Append a token axis to x so it broadcasts against the
        # (num_features, d_token) weight and bias.
        scaled = x[..., None] * self.weight
        return scaled + self.bias


class FTTransformer(nn.Module):
    """Transformer encoder over per-feature tokens with a [CLS]-based prediction head.

    Categorical columns are embedded with one ``nn.Embedding`` each, numeric
    columns with ``NumericalEmbedding``. A learnable CLS token is prepended and
    its encoded state is passed to an MLP head that emits one logit per row.
    """

    def __init__(self, cat_dims, num_features, d_token=64, n_layers=2, n_heads=4, dropout=0.2, attn_dropout=0.2):
        super().__init__()
        # Tokenisers (one embedding table per categorical column).
        embeddings = [nn.Embedding(cardinality, d_token) for cardinality in cat_dims]
        self.cat_embeds = nn.ModuleList(embeddings)
        self.num_embed = NumericalEmbedding(num_features, d_token)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_token))

        # Encoder stack; attn_dropout is the dropout rate of the encoder layers.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_token,
                nhead=n_heads,
                dim_feedforward=d_token * 4,
                dropout=attn_dropout,
                batch_first=True,
                activation='gelu',
            ),
            num_layers=n_layers,
        )

        # Prediction head applied to the encoded CLS token.
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, d_token),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_token, 1),
        )

    def forward(self, x_num, x_cat):
        # Token order: [CLS], categorical tokens (if any), numeric tokens.
        num_tokens = self.num_embed(x_num)
        pieces = [self.cls_token.expand(num_tokens.size(0), -1, -1)]
        if len(self.cat_embeds) > 0:
            per_column = [embed(x_cat[:, col]) for col, embed in enumerate(self.cat_embeds)]
            pieces.append(torch.stack(per_column, dim=1))
        pieces.append(num_tokens)

        encoded = self.transformer(torch.cat(pieces, dim=1))
        return self.head(encoded[:, 0])



## 9. 학습/평가 루프


In [None]:
def _pos_weight_from_loader(train_loader):
    """Return the negative/positive label ratio of the dataset behind ``train_loader``."""
    dataset = train_loader.dataset
    labels = getattr(dataset, "y", None)
    if labels is None:
        # Fallback for datasets without a ``y`` attribute: take the last element
        # of every sample as its label. Indexing the dataset (not the loader)
        # avoids touching the shuffling RNG.
        labels = [float(torch.as_tensor(sample[-1]).reshape(-1)[0]) for sample in dataset]
    labels = torch.as_tensor(labels, dtype=torch.float32).reshape(-1)
    n_pos = float(labels.sum())
    return (labels.numel() - n_pos) / (n_pos + 1e-6)


def train_one_model(model, train_loader, val_loader, y_val, device, epochs=EPOCHS, patience=PATIENCE):
    """Train ``model`` with early stopping on validation ROC-AUC.

    The class weight of the loss is derived from the labels of
    ``train_loader``'s own dataset rather than from a module-level variable,
    so the function depends only on its arguments.

    Args:
        model: Module called as ``model(x_num, x_cat)`` and returning logits.
        train_loader: Loader yielding ``(x_num, x_cat, y)`` batches.
        val_loader: Loader yielding ``(x_num, x_cat, y)`` batches, iterated in
            the same row order as ``y_val``.
        y_val: Ground-truth validation labels used for ROC-AUC.
        device: Torch device (or device string) to train on.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without an AUC gain larger than
            1e-4 after which training stops.

    Returns:
        ``(model, best_auc)`` where ``model`` carries the weights of the best
        validation epoch.
    """
    model.to(device)

    pos_weight = _pos_weight_from_loader(train_loader)
    pos_weight_t = torch.tensor([pos_weight], dtype=torch.float32, device=device)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=3e-4)
    # mode='max': the monitored quantity is AUC, where higher is better.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, min_lr=1e-5)

    best_auc = -1.0
    best_state = None
    patience_ctr = 0

    for epoch in range(epochs):
        model.train()
        for xb_num, xb_cat, yb in train_loader:
            xb_num, xb_cat, yb = xb_num.to(device), xb_cat.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb_num, xb_cat)
            loss = criterion(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        model.eval()
        val_probs = []
        with torch.no_grad():
            for xb_num, xb_cat, _ in val_loader:
                xb_num, xb_cat = xb_num.to(device), xb_cat.to(device)
                probs = torch.sigmoid(model(xb_num, xb_cat)).cpu().numpy().ravel()
                val_probs.append(probs)
        val_probs = np.concatenate(val_probs)
        val_auc = roc_auc_score(y_val, val_probs)
        scheduler.step(val_auc)

        if val_auc > best_auc + 1e-4:
            best_auc = val_auc
            # Snapshot on CPU so the best weights survive later updates.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_ctr = 0
        else:
            patience_ctr += 1

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1} | val AUC {val_auc:.4f} (best {best_auc:.4f})")

        if patience_ctr >= patience:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_auc


def predict_proba(model, loader, device):
    """Return sigmoid probabilities for every row of ``loader`` as one flat array.

    ``loader`` must yield ``(x_num, x_cat)`` batches without labels.
    """
    model.eval()
    with torch.no_grad():
        per_batch = [
            torch.sigmoid(model(batch_num.to(device), batch_cat.to(device))).cpu().numpy().ravel()
            for batch_num, batch_cat in loader
        ]
    return np.concatenate(per_batch)



## 10. 학습 실행


In [None]:
# Embedding-table sizes: number of classes known to each categorical encoder.
fitted_encoders = prep["encoders"]
cat_dims = [len(fitted_encoders[name].classes_) for name in cat_cols]

model = FTTransformer(
    cat_dims=cat_dims,
    num_features=len(num_cols),
    d_token=64,
    n_layers=2,
    n_heads=4,
    dropout=0.2,
    attn_dropout=0.2,
)

train_ds = TabDataset(X_num_train, X_cat_train, y_train)
val_ds = TabDataset(X_num_val, X_cat_val, y_val)

# Only the training loader shuffles; validation keeps row order aligned with y_val.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

model, best_auc = train_one_model(model, train_loader, val_loader, y_val, DEVICE)
print(f"Best Validation ROC-AUC: {best_auc:.4f}")



## 11. 테스트 예측 및 제출


In [None]:
# Score the test split in its original row order.
test_ds = TabDataset(X_num_test, X_cat_test, y=None)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)
test_probs = predict_proba(model, test_loader, DEVICE)

# Write the probabilities into the sample submission's "voted" column and save.
sub = pd.read_csv("../../data/raw/sample_submission.csv").assign(voted=test_probs)
sub.to_csv("submission_05_FTTransformer_v4_gemini_feats.csv", index=False)
print("saved submission_05_FTTransformer_v4_gemini_feats.csv")

