# 17_v5_FTLight_v1 ÏöîÏïΩ

- Î™®Îç∏: v5 Ïã§Ìóò(MLP/FT/ENS ÎπÑÍµê)
- ÌîºÏ≤ò: TE_ONLY/RAW_ONLY/SUMMARY_ONLY Î™®Îìú
- ÌïôÏäµ/ÌèâÍ∞Ä: KFold 5
- Ï†úÏ∂úÌååÏùº: (ÏΩîÎìú ÎÇ¥ Ï†ÄÏû• ÏóÜÏùå)


In [None]:
# FT vs MLP vs ensemble test

In [1]:
import os
import random
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# ============================================================
# CONFIG
# ============================================================
SEED = 42
N_FOLDS = 5
BATCH_SIZE = 512
EPOCHS = 25        # <-- Îπ†Î•∏ ÌåêÎã®Ïö© (Ïú†ÎßùÌïòÎ©¥ 50~80)
PATIENCE = 6       # <-- Îπ†Î•∏ ÌåêÎã®Ïö©
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

TRAIN_PATH = "../../data/raw/train.csv"
TEST_PATH  = "../../data/raw/test_x.csv"

# Î°úÏª¨/ÎÖ∏Ìä∏Î∂ÅÏóêÏÑú ÌååÏùºÎ™ÖÎßå ÏûàÎäî Í≤ΩÏö∞ ÎåÄÎπÑ
FALLBACK_TRAIN = "train.csv"
FALLBACK_TEST  = "test_x.csv"

# ============================================================
# Utils
# ============================================================
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

def smart_read_csv(path, fallback):
    if os.path.exists(path):
        return pd.read_csv(path)
    if os.path.exists(fallback):
        return pd.read_csv(fallback)
    raise FileNotFoundError(f"Cannot find csv: {path} or {fallback}")

print(f"üñ•Ô∏è Device: {DEVICE}")
print("üìÇ Loading data...")
train_raw = smart_read_csv(TRAIN_PATH, FALLBACK_TRAIN)
test_raw  = smart_read_csv(TEST_PATH,  FALLBACK_TEST)

train_raw["voted_bin"] = (train_raw["voted"] == 2).astype(int)
print(f"Train: {train_raw.shape}, Test: {test_raw.shape}")

# ============================================================
# Clean / Preprocess (ÎÑà ÏΩîÎìú Í∏∞Ï§Ä)
# ============================================================
def clean_data(df):
    df = df.copy()

    # Î¨¥ÏùëÎãµ (0 -> NaN)
    for col in ['education', 'engnat', 'hand', 'married', 'urban']:
        if col in df.columns:
            df.loc[df[col] == 0, col] = np.nan

    # familysize
    if 'familysize' in df.columns:
        df.loc[df['familysize'] == 0, 'familysize'] = np.nan
        df.loc[df['familysize'] > 15, 'familysize'] = np.nan

    # TP 0 -> NaN
    for col in [f"tp{i:02d}" for i in range(1, 11)]:
        if col in df.columns:
            df.loc[df[col] == 0, col] = np.nan

    # Q_E clipping
    for col in [f"Q{c}E" for c in "abcdefghijklmnopqrst"]:
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df

def build_features(df):
    df = df.copy()

    # Demographic
    age_map = {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
    df["age_ord"] = df["age_group"].map(age_map)
    df["is_teenager"] = (df["age_ord"] == 1).astype(int)
    df["is_young"] = (df["age_ord"] <= 2).astype(int)
    df["is_old"] = (df["age_ord"] >= 6).astype(int)
    df["edu_low"] = (df["education"] <= 2).astype(float)
    df["edu_high"] = (df["education"] >= 3).astype(float)
    df["is_single"] = (df["married"] == 1).astype(float)
    df["is_married"] = (df["married"] == 2).astype(float)
    df["is_urban"] = (df["urban"] == 3).astype(float)
    df["is_english_native"] = (df["engnat"] == 1).astype(float)
    df["is_male"] = (df["gender"] == "Male").astype(int)

    # Q_A original cols + summary
    qa_cols = [f"Q{c}A" for c in "abcdefghijklmnopqrst"]
    df["qa_mean"] = df[qa_cols].mean(axis=1)
    df["qa_std"] = df[qa_cols].std(axis=1)
    df["qa_range"] = df[qa_cols].max(axis=1) - df[qa_cols].min(axis=1)
    df["qa_extreme_ratio"] = ((df[qa_cols] == 1) | (df[qa_cols] == 5)).sum(axis=1) / 20
    df["qa_neutral_ratio"] = (df[qa_cols] == 3).sum(axis=1) / 20
    df["qa_all_same"] = (df[qa_cols].std(axis=1) == 0).astype(int)

    # Q_E log + summary
    qe_cols = [f"Q{c}E" for c in "abcdefghijklmnopqrst"]
    for col in qe_cols:
        df[f"{col}_log"] = np.log1p(df[col])
    qe_log_cols = [f"{col}_log" for col in qe_cols]
    df["qe_log_mean"] = df[qe_log_cols].mean(axis=1)
    df["qe_log_std"] = df[qe_log_cols].std(axis=1)
    df["qe_fast_ratio"] = (df[qe_cols] < 500).sum(axis=1) / 20
    df["qe_total_log"] = df[qe_log_cols].sum(axis=1)
    df["is_careless"] = ((df[qe_cols].mean(axis=1) < 500) | (df["qa_all_same"] == 1)).astype(int)

    # TP + Big5 diff + summary
    tp_cols = [f"tp{i:02d}" for i in range(1, 11)]
    df["tp_missing_ratio"] = df[tp_cols].isna().sum(axis=1) / 10
    df["extraversion"] = df["tp01"] - df["tp06"]
    df["agreeableness"] = df["tp07"] - df["tp02"]
    df["conscientiousness"] = df["tp03"] - df["tp08"]
    df["neuroticism"] = df["tp04"] - df["tp09"]
    df["openness"] = df["tp05"] - df["tp10"]
    df["tp_mean"] = df[tp_cols].mean(axis=1)

    # WR/WF + summary
    wr_cols = [f"wr_{i:02d}" for i in range(1, 14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1, 4)]
    df["wr_sum"] = df[wr_cols].sum(axis=1)
    df["wf_sum"] = df[wf_cols].sum(axis=1)
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]
    df["vocab_high"] = (df["wr_sum"] >= 11).astype(int)

    # Interactions
    df["age_edu"] = df["age_ord"] * df["education"]
    df["young_low_edu"] = df["is_young"] * df["edu_low"]
    df["young_single"] = df["is_young"] * df["is_single"]
    df["old_married"] = df["is_old"] * df["is_married"]
    df["teenager_low_edu"] = df["is_teenager"] * df["edu_low"]

    return df

# ============================================================
# Target Encoding (CV leak Î∞©ÏßÄ: fold-trainÎßåÏúºÎ°ú map)
# ============================================================
def target_encode(train_df, val_df, col, target_col, smoothing=10):
    global_mean = train_df[target_col].mean()
    agg = train_df.groupby(col)[target_col].agg(['mean', 'count'])
    te = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    te_map = te.to_dict()
    tr = train_df[col].map(te_map).fillna(global_mean).values
    va = val_df[col].map(te_map).fillna(global_mean).values
    return tr, va

def make_te_features(tr_df, va_df, target_col="voted_bin"):
    te_names = []
    out_tr = {}
    out_va = {}

    # single
    for col in ['age_group', 'race', 'religion']:
        tr, va = target_encode(tr_df, va_df, col, target_col, smoothing=10)
        name = f"{col}_te"
        out_tr[name], out_va[name] = tr, va
        te_names.append(name)

    # combos
    tr_df = tr_df.copy()
    va_df = va_df.copy()
    tr_df['age_edu_cat'] = tr_df['age_group'].astype(str) + '_' + tr_df['education'].astype(str)
    va_df['age_edu_cat'] = va_df['age_group'].astype(str) + '_' + va_df['education'].astype(str)
    tr_df['age_edu_married_cat'] = tr_df['age_group'].astype(str) + '_' + tr_df['education'].astype(str) + '_' + tr_df['married'].astype(str)
    va_df['age_edu_married_cat'] = va_df['age_group'].astype(str) + '_' + va_df['education'].astype(str) + '_' + va_df['married'].astype(str)

    for col, sm in [('age_edu_cat', 5), ('age_edu_married_cat', 3)]:
        tr, va = target_encode(tr_df, va_df, col, target_col, smoothing=sm)
        name = f"{col}_te"
        out_tr[name], out_va[name] = tr, va
        te_names.append(name)

    return te_names, out_tr, out_va

# ============================================================
# Feature Groups (A/B/C Ïã§Ìóò)
# ============================================================
QA_COLS = [f"Q{c}A" for c in "abcdefghijklmnopqrst"]
QE_LOG_COLS = [f"Q{c}E_log" for c in "abcdefghijklmnopqrst"]
TP_COLS = [f"tp{i:02d}" for i in range(1, 11)]
WR_COLS = [f"wr_{i:02d}" for i in range(1, 14)]
WF_COLS = [f"wf_{i:02d}" for i in range(1, 4)]

SUMMARY_COLS = [
    "age_ord","education","married","urban","engnat","familysize","hand",
    "is_teenager","is_young","is_old","edu_low","edu_high",
    "is_single","is_married","is_urban","is_english_native","is_male",
    "qa_mean","qa_std","qa_range","qa_extreme_ratio","qa_neutral_ratio","qa_all_same",
    "qe_log_mean","qe_log_std","qe_fast_ratio","qe_total_log","is_careless",
    "tp_missing_ratio","tp_mean",
    "extraversion","agreeableness","conscientiousness","neuroticism","openness",
    "wr_sum","wf_sum","word_credibility","vocab_high",
    "age_edu","young_low_edu","young_single","old_married","teenager_low_edu",
]

def get_feature_set(mode):
    """
    mode:
      - "TE_ONLY": TEÎßå
      - "RAW_ONLY": ÏõêÎ≥∏(qa + qe_log + tp + wr/wf + demographic ÏõêÎ≥∏ Î™áÍ∞ú) / TE ÏóÜÏùå
      - "SUMMARY_ONLY": ÏöîÏïΩ/ÌååÏÉùÎßå / TE ÏóÜÏùå
    """
    if mode == "TE_ONLY":
        base = []  # TEÎ•º Îí§ÏóêÏÑú Î∂ôÏùº Í±∞Îùº ÎπÑÏõåÎë†
        use_te = True
    elif mode == "RAW_ONLY":
        base = QA_COLS + QE_LOG_COLS + TP_COLS + WR_COLS + WF_COLS + [
            "age_ord","education","married","urban","engnat","familysize","hand","is_male"
        ]
        use_te = False
    elif mode == "SUMMARY_ONLY":
        base = SUMMARY_COLS
        use_te = False
    else:
        raise ValueError("mode must be one of: TE_ONLY, RAW_ONLY, SUMMARY_ONLY")
    return base, use_te

# ============================================================
# Dataset
# ============================================================
class TabDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32).unsqueeze(1)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

# ============================================================
# Models
# ============================================================
class MLP(nn.Module):
    def __init__(self, input_dim, hidden=[256,128,64], dropout=0.3):
        super().__init__()
        layers=[]
        prev=input_dim
        for h in hidden:
            layers += [nn.Linear(prev,h), nn.BatchNorm1d(h), nn.SiLU(), nn.Dropout(dropout)]
            prev=h
        layers += [nn.Linear(prev,1)]
        self.net=nn.Sequential(*layers)
    def forward(self,x):
        return self.net(x)

class FTTransformer(nn.Module):
    def __init__(self, n_features, d_token=64, n_layers=2, n_heads=4, dropout=0.2):
        super().__init__()
        self.embed = nn.Linear(1, d_token)
        self.cls = nn.Parameter(torch.randn(1,1,d_token)*0.02)
        enc = nn.TransformerEncoderLayer(
            d_model=d_token, nhead=n_heads, dim_feedforward=d_token*4,
            dropout=dropout, activation="gelu", batch_first=True, norm_first=True
        )
        self.tr = nn.TransformerEncoder(enc, num_layers=n_layers)
        self.head = nn.Sequential(nn.LayerNorm(d_token), nn.Linear(d_token,1))
        self.n_features = n_features
    def forward(self,x):
        x = x.unsqueeze(-1)     # (B, F, 1)
        x = self.embed(x)       # (B, F, D)
        cls = self.cls.expand(x.size(0), -1, -1)
        x = torch.cat([cls, x], dim=1)  # (B, 1+F, D)
        x = self.tr(x)
        return self.head(x[:,0])

# ============================================================
# Train / Predict
# ============================================================
def train_one(model, tr_loader, va_loader, y_va, lr=1e-3):
    model.to(DEVICE)
    crit = nn.BCEWithLogitsLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    best_auc=-1
    best_state=None
    bad=0

    for epoch in range(EPOCHS):
        model.train()
        for x,y in tr_loader:
            x,y = x.to(DEVICE), y.to(DEVICE)
            opt.zero_grad()
            loss = crit(model(x), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        # val
        model.eval()
        preds=[]
        with torch.no_grad():
            for x,y in va_loader:
                x = x.to(DEVICE)
                preds.append(torch.sigmoid(model(x)).cpu().numpy())
        preds = np.concatenate(preds).ravel()
        auc = roc_auc_score(y_va, preds)

        if auc > best_auc + 1e-5:
            best_auc = auc
            best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}
            bad=0
        else:
            bad += 1
        if bad >= PATIENCE:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_auc

def predict(model, loader):
    model.eval()
    preds=[]
    with torch.no_grad():
        for x in loader:
            if isinstance(x, (tuple,list)):
                x = x[0]
            x = x.to(DEVICE)
            preds.append(torch.sigmoid(model(x)).cpu().numpy())
    return np.concatenate(preds).ravel()

# ============================================================
# Core Experiment Runner
# ============================================================
def run_experiment(mode="TE_ONLY"):
    print("\n" + "="*80)
    print(f"üß™ EXPERIMENT: {mode}")
    print("="*80)

    set_seed(SEED)

    tr = clean_data(train_raw)
    tr = build_features(tr)

    y = tr["voted_bin"].values.astype(np.float32)

    base_cols, use_te = get_feature_set(mode)

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    oof_mlp = np.zeros(len(tr))
    oof_ft  = np.zeros(len(tr))

    fold_auc_mlp=[]
    fold_auc_ft=[]
    fold_auc_ens=[]

    for fold, (tr_idx, va_idx) in enumerate(skf.split(tr, y), 1):
        tr_df = tr.iloc[tr_idx].reset_index(drop=True)
        va_df = tr.iloc[va_idx].reset_index(drop=True)

        # Base features
        X_tr = tr_df[base_cols].copy() if len(base_cols) else pd.DataFrame(index=range(len(tr_df)))
        X_va = va_df[base_cols].copy() if len(base_cols) else pd.DataFrame(index=range(len(va_df)))

        # TE features (only if mode says so)
        te_cols = []
        if use_te:
            te_cols, te_tr, te_va = make_te_features(tr_df, va_df, target_col="voted_bin")
            for c in te_cols:
                X_tr[c] = te_tr[c]
                X_va[c] = te_va[c]

        # NaN -> median (train fold)
        for c in X_tr.columns:
            med = X_tr[c].median()
            if pd.isna(med):
                med = 0
            X_tr[c] = X_tr[c].fillna(med)
            X_va[c] = X_va[c].fillna(med)

        # scaling
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr.values)
        X_va_s = scaler.transform(X_va.values)

        y_tr = y[tr_idx]
        y_va = y[va_idx]

        tr_loader = DataLoader(TabDataset(X_tr_s, y_tr), batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        va_loader = DataLoader(TabDataset(X_va_s, y_va), batch_size=BATCH_SIZE, shuffle=False)
        va_pred_loader = DataLoader(TabDataset(X_va_s), batch_size=BATCH_SIZE, shuffle=False)

        n_features = X_tr_s.shape[1]

        # MLP
        mlp = MLP(n_features, hidden=[256,128,64], dropout=0.3)
        mlp, auc_mlp = train_one(mlp, tr_loader, va_loader, y_va, lr=1e-3)
        pred_mlp = predict(mlp, va_pred_loader)

        # FT
        ft = FTTransformer(n_features, d_token=64, n_layers=2, n_heads=4, dropout=0.2)
        ft, auc_ft = train_one(ft, tr_loader, va_loader, y_va, lr=5e-4)
        pred_ft = predict(ft, va_pred_loader)

        # Ensemble (simple avg)
        pred_ens = 0.5 * pred_mlp + 0.5 * pred_ft
        auc_ens = roc_auc_score(y_va, pred_ens)

        oof_mlp[va_idx] = pred_mlp
        oof_ft[va_idx]  = pred_ft

        fold_auc_mlp.append(auc_mlp)
        fold_auc_ft.append(auc_ft)
        fold_auc_ens.append(auc_ens)

        print(f"[Fold {fold}]  MLP={auc_mlp:.5f} | FT={auc_ft:.5f} | ENS(0.5)={auc_ens:.5f} | n_feat={n_features}")

    # OOF summary
    oof_auc_mlp = roc_auc_score(y, oof_mlp)
    oof_auc_ft  = roc_auc_score(y, oof_ft)
    oof_auc_ens = roc_auc_score(y, 0.5*oof_mlp + 0.5*oof_ft)

    print("\n--- OOF SUMMARY ---")
    print(f"MLP  OOF AUC: {oof_auc_mlp:.5f} | fold mean: {np.mean(fold_auc_mlp):.5f}")
    print(f"FT   OOF AUC: {oof_auc_ft:.5f} | fold mean: {np.mean(fold_auc_ft):.5f}")
    print(f"ENS  OOF AUC: {oof_auc_ens:.5f} | fold mean: {np.mean(fold_auc_ens):.5f}")
    print("-------------------")

    return {
        "mode": mode,
        "oof_mlp": oof_auc_mlp,
        "oof_ft": oof_auc_ft,
        "oof_ens": oof_auc_ens,
        "fold_mlp": fold_auc_mlp,
        "fold_ft": fold_auc_ft,
        "fold_ens": fold_auc_ens,
    }

def main():
    results = []
    for mode in ["TE_ONLY", "RAW_ONLY", "SUMMARY_ONLY"]:
        results.append(run_experiment(mode))

    print("\n" + "="*80)
    print("üèÅ FINAL COMPARISON (OOF AUC)")
    print("="*80)
    for r in results:
        print(f"{r['mode']:12s} | MLP={r['oof_mlp']:.5f} | FT={r['oof_ft']:.5f} | ENS={r['oof_ens']:.5f}")

if __name__ == "__main__":
    main()


üñ•Ô∏è Device: cpu
üìÇ Loading data...
Train: (45532, 79), Test: (11383, 77)

üß™ EXPERIMENT: TE_ONLY
[Fold 1]  MLP=0.77012 | FT=0.76485 | ENS(0.5)=0.76895 | n_feat=5
[Fold 2]  MLP=0.76065 | FT=0.75947 | ENS(0.5)=0.76119 | n_feat=5
[Fold 3]  MLP=0.75592 | FT=0.75457 | ENS(0.5)=0.75694 | n_feat=5
[Fold 4]  MLP=0.75340 | FT=0.74796 | ENS(0.5)=0.75204 | n_feat=5
[Fold 5]  MLP=0.75609 | FT=0.74911 | ENS(0.5)=0.75367 | n_feat=5

--- OOF SUMMARY ---
MLP  OOF AUC: 0.75888 | fold mean: 0.75924
FT   OOF AUC: 0.75462 | fold mean: 0.75519
ENS  OOF AUC: 0.75823 | fold mean: 0.75856
-------------------

üß™ EXPERIMENT: RAW_ONLY
[Fold 1]  MLP=0.76326 | FT=0.60543 | ENS(0.5)=0.75472 | n_feat=74


KeyboardInterrupt: 

In [None]:
# MLP + optunaÎ°ú Í∞ÄÏïºÍ≤†Ïùå