# 27_mps_stratkey_multiseed_lightft_bestonly.ipynb

MPS(맥 GPU) 가속 + 응답 태도 기반 stratify key + 경량 FT-Transformer(FTT) + OOF 기준 최적 앙상블(best 제출 파일만 저장)


In [1]:
import os, random, warnings
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score

# Use the "mps" device when PyTorch reports the MPS backend as available;
# otherwise fall back to the CPU.
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("DEVICE:", DEVICE)
# Keyword arguments shared by every DataLoader built in this notebook:
# no worker processes (num_workers=0) and no pinned host memory.
DL_KW = dict(num_workers=0, pin_memory=False)


DEVICE: mps


In [2]:
# One full cross-validation run is performed per seed; predictions are
# averaged over the seeds afterwards.
SEEDS = [42, 202, 777]
# Number of StratifiedKFold splits per seed.
N_FOLDS = 5
# Mini-batch size used by the train / validation / test DataLoaders.
BATCH_SIZE = 512
# Upper bound on training epochs per model (early stopping may end sooner).
EPOCHS = 35
# Early stopping: number of consecutive epochs without a validation-AUC
# improvement that train_model tolerates before breaking out.
PATIENCE = 6
# AdamW learning rates for the MLP and for the FT-Transformer.
LR_MLP = 1e-3
LR_FT  = 7e-4
# Default FT-Transformer size: token width, encoder layers, attention heads
# and dropout probability.
FT_D_TOKEN = 32
FT_LAYERS  = 1
FT_HEADS   = 4
FT_DROPOUT = 0.15

def set_seed(seed):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with ``seed``.

    The three generators are seeded in that order.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)


In [3]:
# Load the raw train / test CSV files (paths are relative to the notebook's
# working directory).
train_raw = pd.read_csv("../../data/raw/train.csv")
test_raw  = pd.read_csv("../../data/raw/test_x.csv")
# Binary target used everywhere below: 1 where the raw "voted" column equals 2,
# 0 otherwise.
train_raw["voted_bin"] = (train_raw["voted"] == 2).astype(int)
print("Train:", train_raw.shape, "Test:", test_raw.shape, "pos_ratio:", float(train_raw["voted_bin"].mean()))


Train: (45532, 79) Test: (11383, 77) pos_ratio: 0.5468242115435298


In [4]:
# Column names of the 20 answer (Q?A) and 20 elapsed-time (Q?E) questionnaire items.
QA_cols = ["Q" + letter + "A" for letter in "abcdefghijklmnopqrst"]
QE_cols = ["Q" + letter + "E" for letter in "abcdefghijklmnopqrst"]

def make_strat_key(df: pd.DataFrame):
    """Build a string stratification key ``"<target>_<answer bin>_<time bin>"``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the ``QA_cols`` / ``QE_cols`` columns and ``voted_bin``.

    Returns
    -------
    pd.Series
        One key per row, combining the binary target, the quantile bin of the
        share of extreme answers (1 or 5) and the quantile bin of the summed
        ``log1p`` response times (negative times are clipped to 0 first).
    """
    answers = df[QA_cols]
    timings = df[QE_cols].clip(lower=0)

    extreme_share = (((answers == 1) | (answers == 5)).sum(axis=1) / 20.0).fillna(0)
    log_time_total = np.log1p(timings).sum(axis=1).fillna(0)

    def _quantile_bin(values):
        # Up to 5 quantile bins; duplicated bin edges are dropped.
        return pd.qcut(values, 5, labels=False, duplicates="drop").astype(int)

    parts = [
        df["voted_bin"].astype(int),
        _quantile_bin(extreme_share),
        _quantile_bin(log_time_total),
    ]
    key = parts[0].astype(str)
    for part in parts[1:]:
        key = key + "_" + part.astype(str)
    return key

# Stratification key for every training row; passed to StratifiedKFold.split
# in the training cell instead of the plain target.
strat_key_full = make_strat_key(train_raw)
print("strat_key unique:", strat_key_full.nunique())


strat_key unique: 50


In [5]:
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning rules (each applied only when the column exists):

    * ``education``, ``engnat``, ``hand``, ``married``, ``urban`` and
      ``tp01`` .. ``tp10``: the value 0 is replaced by NaN.
    * ``familysize``: 0 and values above 15 are replaced by NaN.
    * ``QaE`` .. ``QtE``: values are clipped to the range [0, 60000].

    Missing values are introduced with ``Series.mask`` rather than by
    assigning NaN through ``.loc``. The ``.loc`` assignment into an integer
    column relies on silent dtype upcasting, which pandas has deprecated;
    ``mask`` yields the same values (column upcast to float only when a NaN is
    actually introduced) without that dependency.

    Parameters
    ----------
    df : pd.DataFrame
        Raw survey frame.

    Returns
    -------
    pd.DataFrame
        Cleaned copy with the same columns.
    """
    df = df.copy()

    # Columns in which 0 is treated as "missing".
    zero_is_missing = ['education', 'engnat', 'hand', 'married', 'urban']
    zero_is_missing += [f"tp{i:02d}" for i in range(1, 11)]
    for col in zero_is_missing:
        if col in df.columns:
            df[col] = df[col].mask(df[col] == 0)

    if 'familysize' in df.columns:
        family = df['familysize']
        # 0 and implausibly large households (> 15) both become missing.
        df['familysize'] = family.mask((family == 0) | (family > 15))

    for col in [f"Q{c}E" for c in "abcdefghijklmnopqrst"]:
        if col in df.columns:
            df[col] = df[col].clip(lower=0, upper=60000)
    return df

def build_features(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    Added columns, in order: ordinal age and demographic flags, summary
    statistics of the 20 answer items (``Q?A``), ``log1p`` response times and
    their summaries (``Q?E``), differences of paired ``tp`` items, word-list
    sums (``wr_*`` / ``wf_*``) and a few interaction terms.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``age_group``, ``education``, ``married``, ``urban``,
        ``engnat``, ``gender`` and the ``Q?A``, ``Q?E``, ``tp01``..``tp10``,
        ``wr_01``..``wr_13``, ``wf_01``..``wf_03`` columns.

    Returns
    -------
    pd.DataFrame
        New frame; ``df`` itself is not modified.
    """
    out = df.copy()
    letters = "abcdefghijklmnopqrst"

    # --- demographics ---------------------------------------------------
    out["age_ord"] = out["age_group"].map(
        {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
    )
    age = out["age_ord"]
    out["is_teenager"] = age.eq(1).astype(int)
    out["is_young"] = age.le(2).astype(int)
    out["is_old"] = age.ge(6).astype(int)
    out["edu_low"] = out["education"].le(2).astype(float)
    out["edu_high"] = out["education"].ge(3).astype(float)
    out["is_single"] = out["married"].eq(1).astype(float)
    out["is_married"] = out["married"].eq(2).astype(float)
    out["is_urban"] = out["urban"].eq(3).astype(float)
    out["is_english_native"] = out["engnat"].eq(1).astype(float)
    out["is_male"] = out["gender"].eq("Male").astype(int)

    # --- answer items (Q?A) ---------------------------------------------
    qa_cols = [f"Q{c}A" for c in letters]
    answers = out[qa_cols]
    out["qa_mean"] = answers.mean(axis=1)
    out["qa_std"] = answers.std(axis=1)
    out["qa_range"] = answers.max(axis=1) - answers.min(axis=1)
    out["qa_extreme_ratio"] = answers.isin([1, 5]).sum(axis=1) / 20
    out["qa_neutral_ratio"] = answers.eq(3).sum(axis=1) / 20
    # Zero spread across the 20 items means every answer was identical.
    out["qa_all_same"] = out["qa_std"].eq(0).astype(int)

    # --- response times (Q?E) -------------------------------------------
    qe_cols = [f"Q{c}E" for c in letters]
    timings = out[qe_cols]
    log_timings = np.log1p(timings)
    qe_log_cols = []
    for col in qe_cols:
        log_name = f"{col}_log"
        out[log_name] = log_timings[col]
        qe_log_cols.append(log_name)
    logs = out[qe_log_cols]
    out["qe_log_mean"] = logs.mean(axis=1)
    out["qe_log_std"] = logs.std(axis=1)
    out["qe_fast_ratio"] = timings.lt(500).sum(axis=1) / 20
    out["qe_total_log"] = logs.sum(axis=1)
    # Flagged when the mean raw response time is below 500 or all answers match.
    out["is_careless"] = (timings.mean(axis=1).lt(500) | out["qa_all_same"].eq(1)).astype(int)

    # --- tp items ---------------------------------------------------------
    tp_cols = [f"tp{i:02d}" for i in range(1, 11)]
    traits = out[tp_cols]
    out["tp_missing_ratio"] = traits.isna().sum(axis=1) / 10
    trait_pairs = (
        ("extraversion", "tp01", "tp06"),
        ("agreeableness", "tp07", "tp02"),
        ("conscientiousness", "tp03", "tp08"),
        ("neuroticism", "tp04", "tp09"),
        ("openness", "tp05", "tp10"),
    )
    for trait, plus_item, minus_item in trait_pairs:
        out[trait] = out[plus_item] - out[minus_item]
    out["tp_mean"] = traits.mean(axis=1)

    # --- word lists -------------------------------------------------------
    out["wr_sum"] = out[[f"wr_{i:02d}" for i in range(1, 14)]].sum(axis=1)
    out["wf_sum"] = out[[f"wf_{i:02d}" for i in range(1, 4)]].sum(axis=1)
    out["word_credibility"] = out["wr_sum"] - out["wf_sum"]
    out["vocab_high"] = out["wr_sum"].ge(11).astype(int)

    # --- interactions -----------------------------------------------------
    out["age_edu"] = age * out["education"]
    out["young_low_edu"] = out["is_young"] * out["edu_low"]
    out["young_single"] = out["is_young"] * out["is_single"]
    out["old_married"] = out["is_old"] * out["is_married"]
    out["teenager_low_edu"] = out["is_teenager"] * out["edu_low"]
    return out

def target_encode(train_df, val_df, test_df, col, target_col, smoothing=10):
    """Smoothed mean-target encoding of ``col``, fitted on ``train_df`` only.

    For each category the encoding is
    ``(count * mean + smoothing * global_mean) / (count + smoothing)``, where
    the statistics come from ``train_df``. Categories without an encoding
    (unseen or missing) receive the global training mean.

    Returns
    -------
    tuple of np.ndarray
        Encoded values for ``train_df``, ``val_df`` and ``test_df``, in that order.
    """
    prior = train_df[target_col].mean()
    stats = train_df.groupby(col)[target_col].agg(['mean', 'count'])
    smoothed = (stats['count'] * stats['mean'] + smoothing * prior) / (stats['count'] + smoothing)
    lookup = smoothed.to_dict()

    def _encode(frame):
        return frame[col].map(lookup).fillna(prior).values

    return _encode(train_df), _encode(val_df), _encode(test_df)

def create_target_encodings(train_df, val_df, test_df, target_col="voted_bin"):
    """Target-encode single and crossed categorical columns.

    Single columns (``age_group``, ``race``, ``religion``) use smoothing 10.
    Four crossed categories built from ``age_group`` with ``education``,
    ``married`` and ``race`` use smoothing 5 (3 for the three-way cross).

    Side effect: the crossed ``*_cat`` string columns are written into the
    three input frames in place.

    Returns
    -------
    dict
        ``{'train': {...}, 'val': {...}, 'test': {...}}`` mapping
        ``"<column>_te"`` to the encoded value array for that split.
    """
    splits = ('train', 'val', 'test')
    encodings = {split: {} for split in splits}

    def _store(col, smoothing):
        # Encode one column and file the three result arrays under "<col>_te".
        encoded = target_encode(train_df, val_df, test_df, col, target_col, smoothing)
        for split, values in zip(splits, encoded):
            encodings[split][f'{col}_te'] = values

    for col in ('age_group', 'race', 'religion'):
        _store(col, 10)

    for frame in (train_df, val_df, test_df):
        age = frame['age_group'].astype(str)
        edu = frame['education'].astype(str)
        married = frame['married'].astype(str)
        frame['age_edu_cat'] = age + '_' + edu
        frame['age_married_cat'] = age + '_' + married
        frame['age_race_cat'] = age + '_' + frame['race'].astype(str)
        frame['age_edu_married_cat'] = age + '_' + edu + '_' + married

    crossed = (
        ('age_edu_cat', 5),
        ('age_married_cat', 5),
        ('age_race_cat', 5),
        ('age_edu_married_cat', 3),
    )
    for col, smoothing in crossed:
        _store(col, smoothing)
    return encodings


In [6]:
# Names of the raw item columns fed to the models.
qa_cols = ["Q%sA" % letter for letter in "abcdefghijklmnopqrst"]
qe_log_cols = [name[:-1] + "E_log" for name in qa_cols]
wr_cols = ["wr_{:02d}".format(i) for i in range(1, 14)]
wf_cols = ["wf_{:02d}".format(i) for i in range(1, 4)]
tp_cols = ["tp{:02d}".format(i) for i in range(1, 11)]

# Every numeric model input: raw items first, then the engineered features.
num_features = [
    *qa_cols, *qe_log_cols, *wr_cols, *wf_cols, *tp_cols,
    # demographics (raw / ordinal)
    "age_ord", "education", "married", "urban", "engnat", "familysize", "hand",
    # demographic flags
    "is_teenager", "is_young", "is_old", "edu_low", "edu_high",
    "is_single", "is_married", "is_urban", "is_english_native", "is_male",
    # answer-item summaries
    "qa_mean", "qa_std", "qa_range", "qa_extreme_ratio", "qa_neutral_ratio", "qa_all_same",
    # response-time summaries
    "qe_log_mean", "qe_log_std", "qe_fast_ratio", "qe_total_log", "is_careless",
    # tp-item summaries and paired differences
    "tp_missing_ratio", "tp_mean",
    "extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness",
    # word-list features
    "wr_sum", "wf_sum", "word_credibility", "vocab_high",
    # interactions
    "age_edu", "young_low_edu", "young_single", "old_married", "teenager_low_edu",
]

# Target-encoded columns appended to the numeric inputs for each fold.
te_features = [
    'age_group_te', 'race_te', 'religion_te',
    'age_edu_cat_te', 'age_married_cat_te', 'age_race_cat_te', 'age_edu_married_cat_te',
]

# Categorical columns that are label-encoded and passed to embedding layers.
cat_features = ['gender', 'race', 'religion']

class TabDataset(Dataset):
    """In-memory tabular dataset of numeric features, categorical codes and optional labels.

    Numeric inputs are stored as float32, categorical codes as int64 and
    labels (when given) as a float32 column of shape ``(n, 1)``.
    """

    def __init__(self, X_num, X_cat, y=None):
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        # Unlabelled datasets yield (numeric, categorical); labelled ones add the target.
        features = (self.X_num[idx], self.X_cat[idx])
        if self.y is None:
            return features
        return features + (self.y[idx],)

class MLP(nn.Module):
    """Feed-forward network over numeric features plus categorical embeddings.

    Each categorical column gets an ``nn.Embedding`` with ``dim + 1`` rows. The
    embeddings are concatenated to the numeric features and passed through
    ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stages (one per entry of
    ``hidden_dims``) followed by a single-logit output layer. All ``nn.Linear``
    weights are Xavier-uniform initialised and their biases zeroed.
    """

    def __init__(self, num_features, cat_dims, embed_dim=8, hidden_dims=(256,128,64), dropout=0.3):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(cardinality + 1, embed_dim) for cardinality in cat_dims]
        )

        # Layer widths: concatenated input first, then every hidden size.
        widths = [num_features + len(cat_dims) * embed_dim, *hidden_dims]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend(
                (nn.Linear(fan_in, fan_out), nn.BatchNorm1d(fan_out), nn.ReLU(), nn.Dropout(dropout))
            )
        self.mlp = nn.Sequential(*stages)
        self.out = nn.Linear(widths[-1], 1)

        for layer in (m for m in self.modules() if isinstance(m, nn.Linear)):
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x_num, x_cat):
        """Return one logit per row, shape ``(batch, 1)``."""
        embedded = [table(x_cat[:, idx]) for idx, table in enumerate(self.embeddings)]
        features = torch.cat([x_num, torch.cat(embedded, dim=1)], dim=1)
        return self.out(self.mlp(features))

class NumericalEmbedding(nn.Module):
    """Turn each scalar feature into a ``d_token``-wide token.

    Feature ``j`` of an input row is mapped to ``x_j * weight[j] + bias[j]``,
    with a separate learned ``weight`` / ``bias`` vector per feature.
    """

    def __init__(self, num_features, d_token):
        super().__init__()
        initial_weight = torch.randn(num_features, d_token) * 0.02
        self.weight = nn.Parameter(initial_weight)
        self.bias = nn.Parameter(torch.zeros(num_features, d_token))

    def forward(self, x):
        # A trailing axis is added to x so it broadcasts against (num_features, d_token).
        return x[..., None] * self.weight + self.bias

class FTTransformer(nn.Module):
    """Small FT-Transformer style classifier for tabular data.

    Every numeric feature and every categorical column becomes one token of
    width ``d_token``; a learned CLS token is prepended, the sequence goes
    through a pre-norm ``nn.TransformerEncoder`` and the CLS position is fed
    to an MLP head producing one logit per row.
    """

    def __init__(self, num_features, cat_dims, d_token=FT_D_TOKEN, n_layers=FT_LAYERS, n_heads=FT_HEADS, dropout=FT_DROPOUT):
        super().__init__()
        self.num_embed = NumericalEmbedding(num_features, d_token)
        self.cat_embeds = nn.ModuleList(
            [nn.Embedding(cardinality + 1, d_token) for cardinality in cat_dims]
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_token) * 0.02)

        encoder_block = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=d_token*2,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_block, num_layers=n_layers)

        half_width = d_token//2
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, half_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, 1),
        )

    def forward(self, x_num, x_cat):
        """Return one logit per row, shape ``(batch, 1)``."""
        numeric_tokens = self.num_embed(x_num)
        categorical_tokens = torch.stack(
            [table(x_cat[:, idx]) for idx, table in enumerate(self.cat_embeds)], dim=1
        )
        cls = self.cls_token.expand(numeric_tokens.size(0), -1, -1)
        # Sequence layout: [CLS, numeric tokens..., categorical tokens...].
        sequence = torch.cat([cls, numeric_tokens, categorical_tokens], dim=1)
        encoded = self.transformer(sequence)
        return self.head(encoded[:, 0])


In [7]:
def train_model(model, train_loader, val_loader, y_train_np, y_val_np, lr):
    """Train ``model`` with early stopping on validation AUC and return it.

    Uses ``BCEWithLogitsLoss`` with ``pos_weight`` set to the negative/positive
    ratio of ``y_train_np``, AdamW (weight decay 1e-4), gradient-norm clipping
    at 1.0 and ``ReduceLROnPlateau`` driven by the validation AUC. Training
    runs for at most ``EPOCHS`` epochs and stops after ``PATIENCE`` consecutive
    epochs without an AUC gain larger than 1e-5. The weights of the best epoch
    are loaded back before returning.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(x_num, x_cat)`` and expected to return logits.
    train_loader, val_loader : DataLoader
        Must yield ``(x_num, x_cat, y)`` batches.
    y_train_np, y_val_np : array-like
        Training labels (for the class weight) and validation labels, the
        latter in the order produced by ``val_loader``.
    lr : float
        AdamW learning rate.
    """
    model.to(DEVICE)

    positive_share = float(np.mean(y_train_np))
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([(1 - positive_share) / (positive_share + 1e-6)], device=DEVICE)
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    best_auc, best_state, stale_epochs = -1, None, 0

    for _epoch in range(EPOCHS):
        # ---- one pass over the training data ----
        model.train()
        for num_batch, cat_batch, target in train_loader:
            num_batch = num_batch.to(DEVICE)
            cat_batch = cat_batch.to(DEVICE)
            target = target.to(DEVICE)
            optimizer.zero_grad()
            criterion(model(num_batch, cat_batch), target).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # ---- validation AUC ----
        model.eval()
        with torch.no_grad():
            val_chunks = [
                torch.sigmoid(model(num_batch.to(DEVICE), cat_batch.to(DEVICE))).cpu().numpy()
                for num_batch, cat_batch, _ in val_loader
            ]
        val_auc = roc_auc_score(y_val_np, np.concatenate(val_chunks).ravel())
        scheduler.step(val_auc)

        if val_auc > best_auc + 1e-5:
            # New best: snapshot the weights on CPU and reset the patience counter.
            best_auc = val_auc
            best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
            stale_epochs = 0
            continue

        stale_epochs += 1
        if stale_epochs >= PATIENCE:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def predict(model, loader):
    """Return sigmoid probabilities for every row of ``loader`` as a flat array.

    Only the first two elements of each batch (numeric and categorical inputs)
    are used, so both labelled and unlabelled loaders are accepted.
    """
    model.eval()
    with torch.no_grad():
        chunks = [
            torch.sigmoid(model(batch[0].to(DEVICE), batch[1].to(DEVICE))).cpu().numpy()
            for batch in loader
        ]
    return np.concatenate(chunks).ravel()

def best_weight(y_true, p1, p2):
    """Grid-search the blend weight ``w`` maximising AUC of ``w*p1 + (1-w)*p2``.

    201 evenly spaced weights in [0, 1] are tried; on ties the smallest weight
    wins.

    Returns
    -------
    tuple
        ``(best_w, best_auc)``.
    """
    weights = np.linspace(0.0, 1.0, 201)
    scores = [roc_auc_score(y_true, w*p1 + (1-w)*p2) for w in weights]
    # argmax returns the first maximum, i.e. the smallest weight among ties.
    top = int(np.argmax(scores))
    return weights[top], scores[top]

def rank_avg(a, b):
    """Average the ranks of two prediction vectors and rescale to [0, 1].

    Ranks are 0-based positions obtained with a double ``argsort``. The mean
    rank is min-max normalised; a 1e-12 term in the denominator guards against
    division by zero when all mean ranks are equal.
    """
    def _ranks(values):
        return values.argsort().argsort().astype(np.float32)

    mean_rank = (_ranks(a) + _ranks(b)) / 2.0
    lowest = mean_rank.min()
    highest = mean_rank.max()
    return (mean_rank - lowest) / (highest - lowest + 1e-12)


In [9]:
# Clean both raw frames once; the per-fold feature engineering below works on
# slices of these cleaned frames.
train_clean = clean_data(train_raw)
test_clean  = clean_data(test_raw)
# Full-length target vector, used for the OOF AUC computations.
y_all = train_clean["voted_bin"].values.astype(np.float32)

# Per-seed OOF / test predictions of each model; one array is appended per seed.
oof_mlp_all=[]; oof_ft_all=[]; test_mlp_all=[]; test_ft_all=[]

# Multi-seed cross-validation. For every seed: split with StratifiedKFold on
# strat_key_full, then per fold build features, train an MLP and an
# FTTransformer, store their out-of-fold (OOF) predictions and accumulate the
# fold-averaged test predictions.
for seed in SEEDS:
    # RNGs are seeded once per seed, not once per fold or per model.
    set_seed(seed)
    print("\n==============================")
    print("SEED:", seed)
    print("==============================")

    # OOF buffers are filled fold by fold; test buffers accumulate pred / N_FOLDS.
    oof_mlp = np.zeros(len(train_clean), dtype=np.float32)
    oof_ft  = np.zeros(len(train_clean), dtype=np.float32)
    test_mlp = np.zeros(len(test_clean), dtype=np.float32)
    test_ft  = np.zeros(len(test_clean), dtype=np.float32)

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)

    # Stratify on the composite key (target + response-style bins), not on the target alone.
    for fold,(tr_idx, va_idx) in enumerate(skf.split(train_clean, strat_key_full)):
        tr = train_clean.iloc[tr_idx].copy().reset_index(drop=True)
        va = train_clean.iloc[va_idx].copy().reset_index(drop=True)
        te = test_clean.copy()

        tr_fe = build_features(tr)
        va_fe = build_features(va)
        te_fe = build_features(te)

        # Target-encoding statistics come from the training fold only.
        # NOTE(review): training rows are encoded with statistics that include
        # their own label (no inner fold).
        te_dict = create_target_encodings(tr_fe, va_fe, te_fe, "voted_bin")
        all_num = num_features + te_features

        X_tr = tr_fe[num_features].copy()
        X_va = va_fe[num_features].copy()
        X_te = te_fe[num_features].copy()
        for te_name in te_features:
            X_tr[te_name] = te_dict["train"][te_name]
            X_va[te_name] = te_dict["val"][te_name]
            X_te[te_name] = te_dict["test"][te_name]

        # Impute missing values with the training-fold median (0 when the
        # whole training column is missing).
        for col in all_num:
            med = X_tr[col].median()
            if pd.isna(med): med = 0
            X_tr[col] = X_tr[col].fillna(med)
            X_va[col] = X_va[col].fillna(med)
            X_te[col] = X_te[col].fillna(med)

        # Standardise with statistics fitted on the training fold only.
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr.values)
        X_va_s = scaler.transform(X_va.values)
        X_te_s = scaler.transform(X_te.values)

        # Label-encode the categorical columns. The encoder is fitted on the
        # union of train/val/test values (missing -> "NaN") plus an "UNK"
        # class, so transform never meets an unknown label; "UNK" itself is
        # not produced by any of the transforms below.
        cat_dims=[]
        X_cat_tr=[]; X_cat_va=[]; X_cat_te=[]
        for col in cat_features:
            le = LabelEncoder()
            all_vals = list(set(tr_fe[col].fillna("NaN").astype(str)) |
                            set(va_fe[col].fillna("NaN").astype(str)) |
                            set(te_fe[col].fillna("NaN").astype(str)))
            le.fit(all_vals + ["UNK"])
            cat_dims.append(len(le.classes_))
            X_cat_tr.append(le.transform(tr_fe[col].fillna("NaN").astype(str)))
            X_cat_va.append(le.transform(va_fe[col].fillna("NaN").astype(str)))
            X_cat_te.append(le.transform(te_fe[col].fillna("NaN").astype(str)))
        X_cat_tr = np.stack(X_cat_tr, axis=1)
        X_cat_va = np.stack(X_cat_va, axis=1)
        X_cat_te = np.stack(X_cat_te, axis=1)

        y_tr = tr_fe["voted_bin"].values.astype(np.float32)
        y_va = va_fe["voted_bin"].values.astype(np.float32)

        train_ds = TabDataset(X_tr_s, X_cat_tr, y_tr)
        val_ds   = TabDataset(X_va_s, X_cat_va, y_va)
        test_ds  = TabDataset(X_te_s, X_cat_te, None)

        # The training loader shuffles and drops the last incomplete batch;
        # validation and test loaders keep row order so predictions line up
        # with va_idx and with the test frame.
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, **DL_KW)
        val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, **DL_KW)
        test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, **DL_KW)

        # --- MLP ---
        mlp = MLP(num_features=len(all_num), cat_dims=cat_dims)
        mlp = train_model(mlp, train_loader, val_loader, y_tr, y_va, lr=LR_MLP)
        p_mlp_va = predict(mlp, val_loader)
        oof_mlp[va_idx] = p_mlp_va
        test_mlp += predict(mlp, test_loader) / N_FOLDS

        # --- FT-Transformer (same loaders, its own learning rate) ---
        ft = FTTransformer(num_features=len(all_num), cat_dims=cat_dims)
        ft = train_model(ft, train_loader, val_loader, y_tr, y_va, lr=LR_FT)
        p_ft_va = predict(ft, val_loader)
        oof_ft[va_idx] = p_ft_va
        test_ft += predict(ft, test_loader) / N_FOLDS

        print(f" fold {fold+1}/{N_FOLDS} done")

    # NOTE(review): the validation fold also drives early stopping in
    # train_model, so these OOF AUCs are likely somewhat optimistic.
    print(" seed OOF mlp:", roc_auc_score(y_all, oof_mlp))
    print(" seed OOF ft :", roc_auc_score(y_all, oof_ft))

    oof_mlp_all.append(oof_mlp); oof_ft_all.append(oof_ft)
    test_mlp_all.append(test_mlp); test_ft_all.append(test_ft)

# Average every model's OOF and test predictions over the seeds.
oof_mlp, oof_ft, test_mlp, test_ft = (
    np.mean(np.vstack(per_seed), axis=0)
    for per_seed in (oof_mlp_all, oof_ft_all, test_mlp_all, test_ft_all)
)

# Candidate 1: probability blend with the MLP weight that maximises OOF AUC.
w_mlp, auc_prob = best_weight(y_all, oof_mlp, oof_ft)
oof_prob, test_prob = (
    w_mlp*mlp_pred + (1-w_mlp)*ft_pred
    for mlp_pred, ft_pred in ((oof_mlp, oof_ft), (test_mlp, test_ft))
)

# Candidate 2: equal-weight rank average of the two models.
oof_rank, test_rank = (
    rank_avg(mlp_pred, ft_pred)
    for mlp_pred, ft_pred in ((oof_mlp, oof_ft), (test_mlp, test_ft))
)

auc_prob = roc_auc_score(y_all, oof_prob)
auc_rank = roc_auc_score(y_all, oof_rank)

print("\nOOF prob:", auc_prob, "w_mlp:", w_mlp)
print("OOF rank:", auc_rank)

# Keep whichever candidate scores higher on OOF; ties go to the probability blend.
chosen = "rank" if auc_rank > auc_prob else "prob"
final_test = test_rank if chosen == "rank" else test_prob

print("Chosen best:", chosen)

# Build the submission frame: one row per test sample, keyed by the test
# file's own "index" column when present, otherwise by row position.
if "index" in test_raw.columns:
    submission_index = test_raw["index"].values
else:
    submission_index = np.arange(len(test_raw))
submission_best = pd.DataFrame({"index": submission_index, "voted": final_test})

print("submission:", submission_best.shape, submission_best.columns.tolist())
print(submission_best["voted"].describe())

# Sanity checks before writing. The expected row count is taken from the test
# frame instead of a hard-coded number, and NaN predictions are rejected
# explicitly because min()/max() skip NaN and would let them through.
assert submission_best.shape == (len(test_raw), 2)
assert submission_best.columns.tolist() == ["index", "voted"]
assert submission_best["voted"].notna().all()
assert float(submission_best["voted"].min()) >= 0.0 and float(submission_best["voted"].max()) <= 1.0

# Use one variable for the write and the message so they cannot disagree
# (the file used to be written as "submission2_best.csv" while the message
# reported "submission_best2.csv").
SAVE_NAME = "submission_best2.csv"
submission_best.to_csv(SAVE_NAME, index=False)
print("Saved:", SAVE_NAME)



SEED: 42
 fold 1/5 done
 fold 2/5 done
 fold 3/5 done
 fold 4/5 done
 fold 5/5 done
 seed OOF mlp: 0.7691547141882352
 seed OOF ft : 0.7670905523672962

SEED: 202
 fold 1/5 done
 fold 2/5 done
 fold 3/5 done
 fold 4/5 done
 fold 5/5 done
 seed OOF mlp: 0.7699006615986147
 seed OOF ft : 0.7661762919920798

SEED: 777
 fold 1/5 done
 fold 2/5 done
 fold 3/5 done
 fold 4/5 done
 fold 5/5 done
 seed OOF mlp: 0.7697866401246445
 seed OOF ft : 0.767978846569841

OOF prob: 0.7725834246606837 w_mlp: 0.72
OOF rank: 0.7722335937448479
Chosen best: prob
submission: (11383, 2) ['index', 'voted']
count    11383.000000
mean         0.514691
std          0.250436
min          0.108352
25%          0.314292
50%          0.429080
75%          0.725085
max          0.995402
Name: voted, dtype: float64
Saved: submission_best2.csv


In [None]:
# Write the chosen submission to SAVE_NAME (relative to the notebook's working
# directory) and report the file name that was actually used.
SAVE_NAME = "submission_best2.csv"
submission_best.to_csv(SAVE_NAME, index=False)
print("Saved:", SAVE_NAME)


/Users/admin/Downloads/AI 헬스케어 수업/oz코딩 수업/해커톤 (1)/vote-AI/notebooks/taeeun
