# S FT-Transformer + Seed Ensemble (voted==2 확률 제출)

## 핵심
- 타깃: `y = (voted==2)` (양성=2)
- 모델 출력(sigmoid) = **P(voted=2)** → 그대로 제출
- FT-Transformer pooling은 **GlobalAveragePooling1D()** 사용(오류 방지)
- 제출 파일: `submission_STEP4_FIXED.csv` (index+voted)


## 0) 설치(필요 시)


In [1]:
# 필요할 때만 실행하세요.
# !pip install --default-timeout=300 tensorflow scikit-learn


## 1) 데이터 로드(공용 경로)


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', 250)
pd.set_option('display.width', 200)

def find_project_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start] + list(start.parents):
        if (p / "requirements.txt").exists() or (p / "README.md").exists():
            return p
    raise FileNotFoundError("프로젝트 루트를 찾지 못했습니다. vote-AI 루트에 requirements.txt 또는 README.md가 있는지 확인하세요.")

PROJECT_ROOT = find_project_root(Path.cwd())
DATA_DIR = PROJECT_ROOT / "data" / "raw"

train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test_x.csv")
sub   = pd.read_csv(DATA_DIR / "sample_submission.csv")

print("train:", train.shape, "test:", test.shape, "sub:", sub.shape)
train.head()


train: (45532, 78) test: (11383, 77) sub: (11383, 2)


Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,QfA,QfE,QgA,QgE,QhA,QhE,QiA,QiE,QjA,QjE,QkA,QkE,QlA,QlE,QmA,QmE,QnA,QnE,QoA,QoE,QpA,QpE,QqA,QqE,QrA,QrE,QsA,QsE,QtA,QtE,age_group,education,engnat,familysize,gender,hand,married,race,religion,tp01,tp02,tp03,tp04,tp05,tp06,tp07,tp08,tp09,tp10,urban,voted,wf_01,wf_02,wf_03,wr_01,wr_02,wr_03,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
0,0,3.0,363,4.0,1370,5.0,997,1.0,1024,2.0,1577,5.0,539,2.0,586,4.0,1095,5.0,1142,4.0,1287,4.0,883,4.0,851,2.0,851,5.0,816,2.0,579,2.0,924,2.0,366,2.0,876,2.0,633,1.0,1115,30s,2,1,4,Female,1,3,White,Other,2,2,2,1,2,1,7,4,4,3,1,2,0,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1
1,1,5.0,647,5.0,1313,3.0,3387,5.0,2969,1.0,4320,3.0,2190,1.0,826,1.0,4082,5.0,1867,3.0,1264,5.0,2943,4.0,3927,1.0,4329,5.0,1828,1.0,1214,5.0,2414,5.0,1356,1.0,3039,4.0,4304,1.0,1346,20s,4,2,3,Female,1,1,Asian,Hindu,1,1,0,0,1,2,3,4,0,4,3,2,0,0,0,0,1,0,1,1,0,1,1,0,1,0,1,1
2,2,4.0,1623,1.0,1480,1.0,1021,4.0,3374,5.0,1333,1.0,531,4.0,1167,1.0,1016,3.0,2653,2.0,1569,5.0,998,5.0,2547,2.0,918,4.0,2153,2.0,1304,1.0,1131,5.0,937,4.0,1327,1.0,1170,1.0,1409,30s,3,1,3,Male,1,2,White,Other,2,3,1,5,3,4,2,6,1,3,2,1,0,0,1,1,1,0,1,1,0,1,1,1,1,0,1,1
3,3,3.0,504,3.0,2311,4.0,992,3.0,3245,1.0,357,2.0,1519,4.0,159,3.0,2275,5.0,2809,4.0,5614,3.0,3219,4.0,1296,4.0,9046,4.0,1216,4.0,1169,4.0,23868,3.0,581,4.0,8830,4.0,2392,5.0,1312,20s,4,2,0,Female,1,1,Asian,Hindu,2,4,1,1,1,3,1,3,1,3,3,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,1
4,4,1.0,927,1.0,707,5.0,556,2.0,1062,1.0,1014,2.0,628,1.0,991,1.0,1259,5.0,1153,5.0,1388,5.0,740,5.0,1181,4.0,547,2.0,575,1.0,754,4.0,1140,5.0,323,5.0,1070,1.0,583,2.0,1889,20s,3,1,2,Male,1,2,White,Agnostic,1,1,1,6,0,2,0,6,2,6,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1


## 2) 타깃 정의 (제출 안전)
- y=1 ⇢ voted==2
- 제출 voted 값 ⇢ P(voted==2)


In [3]:
y = (train['voted'] == 2).astype('int32')
X = train.drop(columns=['voted'])

print("raw voted:", sorted(train['voted'].unique()))
print("y counts (1=voted==2):")
print(y.value_counts())
print("pos ratio:", float(y.mean()))


raw voted: [np.int64(1), np.int64(2)]
y counts (1=voted==2):
voted
1    24898
0    20634
Name: count, dtype: int64
pos ratio: 0.5468242115435298


## 3) 컬럼 그룹 탐지


In [4]:
cols = list(X.columns)
q_like = [c for c in cols if re.match(r"^Q", c)]

QA_cols = sorted([c for c in q_like if re.fullmatch(r"Q[a-z]A", c)])
QE_cols = sorted([c for c in q_like if re.fullmatch(r"Q[a-z]E", c)])

TP_cols = sorted([c for c in cols if re.fullmatch(r"tp\d{2}", c)])
WR_cols = sorted([c for c in cols if re.fullmatch(r"wr_?\d{2}", c)])
WF_cols = sorted([c for c in cols if re.fullmatch(r"wf_?\d{2}", c)])

print("Q_A:", len(QA_cols), "Q_E:", len(QE_cols), "TP:", len(TP_cols), "WR:", len(WR_cols), "WF:", len(WF_cols))


Q_A: 20 Q_E: 20 TP: 10 WR: 13 WF: 3


## 4) Feature Engineering (최소+강한 것)
- Reverse + Mach + delay + TIPI + wr/wf + teenager
- 행 삭제 없음


In [5]:
FLIP_PUBLIC = ["QeA", "QfA", "QkA", "QqA", "QrA"]
FLIP_SECRET = ["QaA", "QdA", "QgA", "QiA", "QnA"]
TPS = ['tp01','tp02','tp03','tp04','tp05','tp06','tp07','tp08','tp09','tp10']

CAT_COLS = ['age_group', 'gender', 'race', 'religion', 'education', 'engnat', 'married', 'urban']

def fe_step4(df: pd.DataFrame, tp_means=None):
    df = df.copy()
    for c in ['index', 'hand']:
        if c in df.columns:
            df = df.drop(columns=[c])

    # Q_A reverse + Mach
    for c in QA_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    for c in FLIP_PUBLIC + FLIP_SECRET:
        if c in df.columns:
            df[c] = 6 - df[c]

    df['T'] = df['QcA'] - df['QfA'] + df['QoA'] - df['QrA'] + df['QsA']
    df['V'] = df['QbA'] - df['QeA'] + df['QhA'] + df['QjA'] + df['QmA'] - df['QqA']
    df['M'] = - df['QkA']
    df['Mach_score'] = df[QA_cols].mean(axis=1)

    # Q_E delay 요약 후 drop
    for c in QE_cols:
        df[c] = pd.to_numeric(df[c], errors='coerce').clip(lower=0)
    delay_sum = df[QE_cols].sum(axis=1)
    df['delay_sum']  = delay_sum
    df['delay_root'] = np.power(delay_sum, 1/10)
    df['delay_log']  = np.log1p(delay_sum)
    df['delay_std']  = df[QE_cols].std(axis=1)
    df = df.drop(columns=QE_cols)

    # wr/wf 요약
    if len(WR_cols) > 0:
        wr = df[WR_cols].apply(pd.to_numeric, errors='coerce')
        df['wr_yes_count'] = wr.sum(axis=1)
        df['wr_yes_ratio'] = wr.mean(axis=1)
    if len(WF_cols) > 0:
        wf = df[WF_cols].apply(pd.to_numeric, errors='coerce')
        df['wf_yes_count'] = wf.sum(axis=1)
        df['wf_yes_ratio'] = wf.mean(axis=1)
    if 'wr_yes_count' in df.columns and 'wf_yes_count' in df.columns:
        df['wr_minus_wf'] = df['wr_yes_count'] - df['wf_yes_count']

    # tp mean impute
    for c in TPS:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df.loc[df[c] == 0, c] = np.nan
    if tp_means is None:
        tp_means = {c: float(df[c].mean()) for c in TPS}
    for c in TPS:
        df[c] = df[c].fillna(tp_means[c])

    df['Ex']  = df['tp01'] - df['tp06']
    df['Ag']  = df['tp07'] - df['tp02']
    df['Con'] = df['tp03'] - df['tp08']
    df['Es']  = df['tp09'] - df['tp04']
    df['Op']  = df['tp05'] - df['tp10']

    df['teenager_ox'] = (df['age_group'].astype(str) == '10s').astype('int32')

    # categorical as string
    for c in CAT_COLS:
        if c in df.columns:
            df[c] = df[c].astype(str)

    # numeric NaN -> 0
    cat_cols = [c for c in CAT_COLS if c in df.columns]
    num_cols = [c for c in df.columns if c not in cat_cols]
    df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

    return df, tp_means

X_fe, tp_means_all = fe_step4(X, tp_means=None)
T_fe, _ = fe_step4(test, tp_means=tp_means_all)

cat_cols = [c for c in CAT_COLS if c in X_fe.columns]
num_cols = [c for c in X_fe.columns if c not in cat_cols]

print("X_fe:", X_fe.shape, "T_fe:", T_fe.shape)
print("#cat:", len(cat_cols), "#num:", len(num_cols))


X_fe: (45532, 74) T_fe: (11383, 74)
#cat: 8 #num: 66


## 5) FT-Transformer
- pooling: GlobalAveragePooling1D 사용


In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

class TransformerBlock(layers.Layer):
    def __init__(self, dim, num_heads, mlp_ratio=4, dropout=0.1):
        super().__init__()
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=dim, dropout=dropout)
        self.drop1 = layers.Dropout(dropout)

        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.mlp = keras.Sequential([
            layers.Dense(dim * mlp_ratio, activation='gelu'),
            layers.Dropout(dropout),
            layers.Dense(dim),
        ])
        self.drop2 = layers.Dropout(dropout)

    def call(self, x, training=False):
        h = self.norm1(x)
        h = self.attn(h, h, training=training)
        x = x + self.drop1(h, training=training)
        h = self.norm2(x)
        h = self.mlp(h, training=training)
        x = x + self.drop2(h, training=training)
        return x


def build_ft_transformer_from_fold_train(X_fold, cat_cols, num_cols,
                                        dim=64, depth=3, heads=4,
                                        dropout=0.15, lr=2e-3, weight_decay=5e-2):
    inputs = {}
    tokens = []  # (batch, n_tokens, dim)

    # categorical tokens
    for c in cat_cols:
        inp = keras.Input(shape=(1,), name=c, dtype=tf.string)
        lookup = layers.StringLookup(output_mode='int')
        lookup.adapt(X_fold[c].astype(str).values)  # fold-train only

        vocab = lookup.vocabulary_size()
        emb = layers.Embedding(vocab, dim)
        t = emb(lookup(inp))  # (batch, 1, dim)

        inputs[c] = inp
        tokens.append(t)

    # numeric tokens
    num_inp = keras.Input(shape=(len(num_cols),), name='num', dtype=tf.float32)
    norm = layers.Normalization()
    norm.adapt(X_fold[num_cols].values.astype('float32'))  # fold-train only
    xnum = norm(num_inp)
    xnum = layers.Reshape((len(num_cols), 1))(xnum)
    xnum = layers.Dense(dim)(xnum)

    inputs['num'] = num_inp
    tokens.append(xnum)

    x = layers.Concatenate(axis=1)(tokens)
    x = layers.Dropout(dropout)(x)

    for _ in range(depth):
        x = TransformerBlock(dim=dim, num_heads=heads, dropout=dropout)(x)

    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.GlobalAveragePooling1D()(x)  # ✅ fixed pooling

    x = layers.Dense(dim, activation='relu')(x)
    x = layers.Dropout(dropout)(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=inputs, outputs=out)
    opt = keras.optimizers.AdamW(learning_rate=lr, weight_decay=weight_decay)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[keras.metrics.AUC(name='auc')])
    return model


## 6) tf.data 변환


In [7]:
def df_to_dataset(df: pd.DataFrame, y=None, cat_cols=None, num_cols=None, batch_size=256, shuffle=False, seed=42):
    feats = {}
    for c in cat_cols:
        feats[c] = df[c].astype(str).values
    feats['num'] = df[num_cols].values.astype('float32')

    if y is None:
        ds = tf.data.Dataset.from_tensor_slices(feats)
    else:
        ds = tf.data.Dataset.from_tensor_slices((feats, y.values.astype('float32')))

    if shuffle:
        ds = ds.shuffle(buffer_size=min(len(df), 10000), seed=seed, reshuffle_each_iteration=True)

    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


## 7) CV + seed 앙상블
시간이 길면 SEEDS를 [0,1,2]로 줄여서 먼저 확인하세요.


In [None]:
# ✅ 빠른 확인 모드 (추천)
EPOCHS = 12          # 30 -> 12
BATCH  = 512         # 256 -> 512 (CPU에서 종종 더 빠름)
SEEDS  = [0]         # 3개 -> 1개 (일단 방향 확인)
N_SPLITS = 3         # 5-fold -> 3-fold
DEPTH = 2            # 3 -> 2 (Transformer block)
HEADS = 4
DIM = 64
DROPOUT = 0.15

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof = np.zeros(len(X_fe), dtype='float32')

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_fe, y)):
    X_tr = X_fe.iloc[tr_idx].reset_index(drop=True)
    y_tr = y.iloc[tr_idx].reset_index(drop=True)
    X_va = X_fe.iloc[va_idx].reset_index(drop=True)
    y_va = y.iloc[va_idx].reset_index(drop=True)

    # class_weight
    pos = float(y_tr.mean()); neg = 1.0 - pos
    class_weight = {0: 1.0, 1: neg / (pos + 1e-9)}

    tf.keras.utils.set_random_seed(SEEDS[0] + fold)

    model = build_ft_transformer_from_fold_train(
        X_tr, cat_cols, num_cols,
        dim=DIM, depth=DEPTH, heads=HEADS, dropout=DROPOUT,
        lr=2e-3, weight_decay=5e-2
    )

    tr_ds = df_to_dataset(X_tr, y_tr, cat_cols, num_cols, batch_size=BATCH, shuffle=True, seed=SEEDS[0]+fold)
    va_ds = df_to_dataset(X_va, y_va, cat_cols, num_cols, batch_size=BATCH, shuffle=False)

    callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=3, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.5, patience=1, min_lr=1e-5),
    ]

    model.fit(tr_ds, validation_data=va_ds, epochs=EPOCHS, verbose=0,
              callbacks=callbacks, class_weight=class_weight)

    pred_va = model.predict(
        df_to_dataset(X_va, None, cat_cols, num_cols, batch_size=BATCH),
        verbose=0
    ).reshape(-1)
    pred_va = np.nan_to_num(pred_va, nan=0.5)

    auc = roc_auc_score(y_va, pred_va)
    oof[va_idx] = pred_va
    print(f"[FAST] fold {fold} AUC={auc:.6f}")

fast_auc = roc_auc_score(y, oof)
print("\n  FAST OOF AUC:", fast_auc)

[FAST] fold 0 AUC=0.769114
[FAST] fold 1 AUC=0.759387


## 8) 최종 학습 → 제출 파일 생성
- 제출 값은 P(voted=2)
- sample_submission 그대로(index+voted)


In [None]:
# 최종 제출은 SEEDS를 7개로 늘려도 됨
FINAL_SEEDS = [0,1,2,3,4,5,6]

def train_full_predict_one_seed(seed: int):
    tf.keras.utils.set_random_seed(seed)

    model = build_ft_transformer_from_fold_train(
        X_fe, cat_cols, num_cols,
        dim=64, depth=3, heads=4, dropout=0.15,
        lr=2e-3, weight_decay=5e-2
    )

    pos = float(y.mean()); neg = 1.0 - pos
    class_weight = {0: 1.0, 1: neg / (pos + 1e-9)}

    tr_ds = df_to_dataset(X_fe, y, cat_cols, num_cols, batch_size=BATCH, shuffle=True, seed=seed)
    model.fit(tr_ds, epochs=max(12, EPOCHS//2), verbose=0, class_weight=class_weight)

    pred = model.predict(df_to_dataset(T_fe, None, cat_cols, num_cols, batch_size=BATCH), verbose=0).reshape(-1)
    return np.nan_to_num(pred, nan=0.5)

preds = []
for seed in FINAL_SEEDS:
    p = train_full_predict_one_seed(seed)
    preds.append(p)
    print(f"seed {seed} done. mean={float(p.mean()):.6f}")

pred_test = np.mean(np.vstack(preds), axis=0).reshape(-1)

submission = sub.copy()
submission['voted'] = pred_test

print("submission shape:", submission.shape)
print("columns:", submission.columns.tolist())
print(submission['voted'].describe())

assert submission.shape == (11383, 2)
assert submission.columns.tolist() == ['index', 'voted']

out_path = PROJECT_ROOT / "submission_STEP4_FIXED.csv"
submission.to_csv(out_path, index=False)

print("Saved:", out_path)
submission.head()
