해커톤 제출 연습용 (Hackathon submission practice)

In [1]:
import os, json, time, random, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score


# ============================================================
# CONFIG
# ============================================================
# Input CSV locations (relative paths, resolved against the current working directory).
TRAIN_PATH = "../../data/raw/train.csv"
TEST_PATH  = "../../data/raw/test_x.csv"

# Number of stratified cross-validation folds used for every seed run.
N_FOLDS = 5
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# EPOCHS caps the training epochs per fold; PATIENCE is the number of
# consecutive epochs without validation-AUC improvement before early stopping.
EPOCHS = 80
PATIENCE = 10

# Change only the seeds listed here to control the seed ensemble.
SEEDS = [42, 202, 777]  # may be extended to [42, 202, 777, 1024, 2026] if desired

print(f"üñ•Ô∏è Device: {DEVICE}")


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # CUDA generators are only seeded when a CUDA device is usable.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# ============================================================
# Load
# ============================================================
# Read the raw train / test tables from the configured CSV paths.
train_raw = pd.read_csv(TRAIN_PATH)
test_raw  = pd.read_csv(TEST_PATH)
# Binary training target: 1 where the raw "voted" column equals 2, else 0.
# NOTE(review): the meaning of the code 2 is not visible in this file —
# confirm against the dataset description.
train_raw["voted_bin"] = (train_raw["voted"] == 2).astype(int)

print(f"Train: {train_raw.shape}, Test: {test_raw.shape}")


# ============================================================
# Preprocess
# ============================================================
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning rules (each applied only when the column is present):

    * ``education``, ``engnat``, ``hand``, ``married``, ``urban`` and
      ``tp01`` .. ``tp10``: the value 0 is replaced with NaN.
    * ``familysize``: 0 and values above 15 are replaced with NaN.
    * ``QaE`` .. ``QtE``: values are clipped to the range [100, 60000].

    Args:
        df: Raw input DataFrame.

    Returns:
        A new DataFrame with the rules above applied.
    """
    df = df.copy()

    zero_is_missing = ["education", "engnat", "hand", "married", "urban"]
    zero_is_missing += [f"tp{i:02d}" for i in range(1, 11)]
    for col in zero_is_missing:
        if col in df.columns:
            # ``mask`` upcasts integer columns to float when it has to insert
            # NaN. Writing NaN through ``.loc`` into an integer column is a
            # dtype-incompatible setitem, which pandas has deprecated.
            df[col] = df[col].mask(df[col] == 0)

    if "familysize" in df.columns:
        family = df["familysize"]
        df["familysize"] = family.mask((family == 0) | (family > 15))

    for col in [f"Q{c}E" for c in "abcdefghijklmnopqrst"]:
        if col in df.columns:
            df[col] = df[col].clip(lower=100, upper=60000)

    return df


# ============================================================
# Feature Engineering (your code, kept as-is)
# ============================================================
def build_features(df):
    """Return a copy of ``df`` extended with engineered feature columns.

    Added columns, in groups:

    * demographic flags derived from ``age_group``, ``education``,
      ``married``, ``urban``, ``engnat`` and ``gender``;
    * row-wise statistics over the 20 ``Q?A`` answer columns;
    * ``log1p`` transforms of the 20 ``Q?E`` columns plus row-wise
      statistics over them;
    * differences / mean / missing ratio over ``tp01`` .. ``tp10``;
    * sums over the ``wr_*`` and ``wf_*`` columns;
    * numeric interaction terms and string category keys (``*_cat``).

    Comparisons against NaN evaluate to False, so every 0/1 flag is 0.0
    for rows where the source value is missing.

    Args:
        df: DataFrame containing the columns referenced above.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    # --- Demographic flags -------------------------------------------------
    age_map = {"10s": 1, "20s": 2, "30s": 3, "40s": 4, "50s": 5, "60s": 6, "+70s": 7}
    df["age_ord"] = df["age_group"].map(age_map)

    df["is_teenager"] = (df["age_ord"] == 1).astype(float)
    df["is_young"] = (df["age_ord"] <= 2).astype(float)
    df["is_old"] = (df["age_ord"] >= 6).astype(float)

    df["edu_low"] = (df["education"] <= 2).astype(float)
    df["edu_high"] = (df["education"] >= 3).astype(float)

    df["is_single"] = (df["married"] == 1).astype(float)
    df["is_married"] = (df["married"] == 2).astype(float)

    df["is_urban"] = (df["urban"] == 3).astype(float)
    df["is_english_native"] = (df["engnat"] == 1).astype(float)
    df["is_male"] = (df["gender"] == "Male").astype(float)

    # --- Answer (Q?A) statistics -------------------------------------------
    qa_cols = [f"Q{c}A" for c in "abcdefghijklmnopqrst"]
    qa = df[qa_cols]
    df["qa_mean"] = qa.mean(axis=1)
    df["qa_std"] = qa.std(axis=1)
    df["qa_range"] = qa.max(axis=1) - qa.min(axis=1)
    df["qa_extreme_ratio"] = ((qa == 1) | (qa == 5)).sum(axis=1) / len(qa_cols)
    df["qa_neutral_ratio"] = (qa == 3).sum(axis=1) / len(qa_cols)
    # Reuse the standard deviation computed above instead of recomputing it.
    df["qa_all_same"] = (df["qa_std"] == 0).astype(float)

    # --- Q?E statistics -----------------------------------------------------
    qe_cols = [f"Q{c}E" for c in "abcdefghijklmnopqrst"]
    for c in qe_cols:
        df[f"{c}_log"] = np.log1p(df[c])

    qe = df[qe_cols]
    qe_log_cols = [f"{c}_log" for c in qe_cols]
    qe_log = df[qe_log_cols]
    df["qe_log_mean"] = qe_log.mean(axis=1)
    df["qe_log_std"] = qe_log.std(axis=1)
    df["qe_fast_ratio"] = (qe < 500).sum(axis=1) / len(qe_cols)
    df["qe_total_log"] = qe_log.sum(axis=1)
    df["is_careless"] = ((qe.mean(axis=1) < 500) | (df["qa_all_same"] == 1)).astype(float)

    # --- tp01..tp10 features -------------------------------------------------
    tp_cols = [f"tp{i:02d}" for i in range(1, 11)]
    df["tp_missing_ratio"] = df[tp_cols].isna().sum(axis=1) / len(tp_cols)
    df["extraversion"] = df["tp01"] - df["tp06"]
    df["agreeableness"] = df["tp07"] - df["tp02"]
    df["conscientiousness"] = df["tp03"] - df["tp08"]
    df["neuroticism"] = df["tp04"] - df["tp09"]
    df["openness"] = df["tp05"] - df["tp10"]
    df["tp_mean"] = df[tp_cols].mean(axis=1)

    # --- wr_* / wf_* sums ----------------------------------------------------
    wr_cols = [f"wr_{i:02d}" for i in range(1, 14)]
    wf_cols = [f"wf_{i:02d}" for i in range(1, 4)]
    df["wr_sum"] = df[wr_cols].sum(axis=1)
    df["wf_sum"] = df[wf_cols].sum(axis=1)
    df["word_credibility"] = df["wr_sum"] - df["wf_sum"]
    df["vocab_high"] = (df["wr_sum"] >= 11).astype(float)

    # --- Numeric interactions ------------------------------------------------
    df["age_edu"] = df["age_ord"] * df["education"]
    df["young_low_edu"] = df["is_young"] * df["edu_low"]
    df["young_single"] = df["is_young"] * df["is_single"]
    df["old_married"] = df["is_old"] * df["is_married"]
    df["teenager_low_edu"] = df["is_teenager"] * df["edu_low"]

    # --- Categorical combination keys ---------------------------------------
    # ``education`` / ``married`` can be int64 in one frame and float64 in
    # another (they only become float once a NaN has been inserted).  Cast to
    # float before stringifying so both frames produce the same labels
    # ("2.0", never "2"); otherwise keys built from train and test would not
    # match when they are looked up later.
    age_key = df["age_group"].astype(str)
    edu_key = df["education"].astype("float64").astype(str)
    married_key = df["married"].astype("float64").astype(str)
    race_key = df["race"].astype(str)

    df["age_edu_cat"] = age_key + "_" + edu_key
    df["age_married_cat"] = age_key + "_" + married_key
    df["age_race_cat"] = age_key + "_" + race_key
    df["age_edu_married_cat"] = age_key + "_" + edu_key + "_" + married_key

    return df


# ============================================================
# TE (fold only)
# ============================================================
def target_encode(train_df, val_df, test_df, col, target_col="voted_bin", smoothing=10):
    """Smoothed target encoding of ``col`` fitted on ``train_df`` only.

    Each category is encoded as
    ``(count * mean + smoothing * prior) / (count + smoothing)`` where
    ``prior`` is the mean of ``target_col`` over ``train_df``. Categories
    that are absent from the fitted mapping are encoded as ``prior``.

    Args:
        train_df: Frame used to fit the encoding (needs ``col`` and ``target_col``).
        val_df: Validation frame (needs ``col``).
        test_df: Test frame (needs ``col``).
        col: Name of the categorical column to encode.
        target_col: Name of the target column in ``train_df``.
        smoothing: Weight given to the prior.

    Returns:
        Tuple ``(train_values, val_values, test_values)`` of NumPy arrays.
    """
    prior = train_df[target_col].mean()

    stats = train_df.groupby(col)[target_col].agg(["mean", "count"])
    n_rows, cat_mean = stats["count"], stats["mean"]
    smoothed = (n_rows * cat_mean + smoothing * prior) / (n_rows + smoothing)
    lookup = smoothed.to_dict()

    def _encode(frame):
        # Unknown categories map to NaN and are then replaced by the prior.
        return frame[col].map(lookup).fillna(prior).values

    return tuple(_encode(frame) for frame in (train_df, val_df, test_df))


def make_te(train_df, val_df, test_df):
    """Target-encode every configured categorical column.

    Args:
        train_df: Frame the encodings are fitted on.
        val_df: Validation frame to encode.
        test_df: Test frame to encode.

    Returns:
        Dict with keys ``"train"``, ``"val"`` and ``"test"``; each maps
        ``"<column>_te"`` to the encoded array for that split.
    """
    # column name -> smoothing strength (single columns first, then combos)
    smoothing_by_col = {
        "age_group": 10,
        "race": 10,
        "religion": 10,
        "age_edu_cat": 5,
        "age_married_cat": 5,
        "age_race_cat": 5,
        "age_edu_married_cat": 3,
    }
    splits = ("train", "val", "test")

    encoded = {split: {} for split in splits}
    for col, strength in smoothing_by_col.items():
        arrays = target_encode(train_df, val_df, test_df, col, "voted_bin", strength)
        for split, values in zip(splits, arrays):
            encoded[split][f"{col}_te"] = values
    return encoded


# Column-name groups used to assemble the model input matrix.
_ITEM_LETTERS = "abcdefghijklmnopqrst"

QA_RAW = ["Q" + letter + "A" for letter in _ITEM_LETTERS]
QE_RAW = ["Q" + letter + "E_log" for letter in _ITEM_LETTERS]
TP_RAW = ["tp%02d" % i for i in range(1, 11)]
WR_RAW = ["wr_%02d" % i for i in range(1, 14)]
WF_RAW = ["wf_%02d" % i for i in range(1, 4)]

RAW_FEATURES = [
    *QA_RAW,
    *QE_RAW,
    *TP_RAW,
    *WR_RAW,
    *WF_RAW,
    "education", "married", "urban", "engnat", "familysize", "hand", "age_ord",
]

SUMMARY_FEATURES = [
    # demographic flags
    "is_teenager", "is_young", "is_old", "edu_low", "edu_high",
    "is_single", "is_married", "is_urban", "is_english_native", "is_male",
    # Q?A statistics
    "qa_mean", "qa_std", "qa_range", "qa_extreme_ratio", "qa_neutral_ratio", "qa_all_same",
    # Q?E statistics
    "qe_log_mean", "qe_log_std", "qe_fast_ratio", "qe_total_log", "is_careless",
    # tp features
    "tp_missing_ratio", "tp_mean",
    "extraversion", "agreeableness", "conscientiousness", "neuroticism", "openness",
    # wr / wf features
    "wr_sum", "wf_sum", "word_credibility", "vocab_high",
    # interactions
    "age_edu", "young_low_edu", "young_single", "old_married", "teenager_low_edu",
]

TE_FEATURES = [
    "age_group_te",
    "race_te",
    "religion_te",
    "age_edu_cat_te",
    "age_married_cat_te",
    "age_race_cat_te",
    "age_edu_married_cat_te",
]


# ============================================================
# Dataset / Model
# ============================================================
class TabDataset(Dataset):
    """Tabular dataset holding float32 feature rows and optional labels.

    Indexing returns ``(features, label)`` when labels were supplied and
    just ``features`` otherwise. Labels are stored with a trailing
    dimension of size 1.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is not None:
            self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
        else:
            self.y = None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features


class MLP(nn.Module):
    """Feed-forward network producing one logit per input row.

    Every hidden width contributes a ``Linear -> BatchNorm1d -> SiLU ->
    Dropout`` group; a final ``Linear`` maps to a single output. All linear
    weights are re-initialised with Xavier-uniform and biases with zeros.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable of hidden layer widths (may be empty).
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims, dropout):
        super().__init__()

        widths = [input_dim, *hidden_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend(
                (
                    nn.Linear(fan_in, fan_out),
                    nn.BatchNorm1d(fan_out),
                    nn.SiLU(),
                    nn.Dropout(dropout),
                )
            )
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

        # Re-initialise only after the whole stack exists, in module order.
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        return self.net(x)


def train_one_fold(model, tr_loader, va_loader, y_va, lr, weight_decay, device):
    """Train ``model`` on one fold with early stopping on validation ROC AUC.

    The model is moved to ``device``, trained for at most ``EPOCHS`` epochs
    and, when training ends, reloaded with the weights of the epoch that
    produced the best validation AUC.

    Args:
        model: Network to train; modified in place.
        tr_loader: Iterable of ``(x, y)`` training batches.
        va_loader: Iterable of ``(x, y)`` validation batches; it must yield
            rows in the same order as ``y_va`` because predictions are
            concatenated and scored against ``y_va`` directly.
        y_va: Validation labels passed to ``roc_auc_score``.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        device: Device the model and batches are moved to.

    Returns:
        The best validation AUC observed (``-1`` if no epoch was run).
    """
    model.to(device)
    # Weight on the positive class in the loss: (negative share) / (positive share).
    # NOTE(review): the ratio is computed from the *validation* labels, not the
    # training labels — confirm this is intended.
    pos_ratio = float(np.mean(y_va))
    pos_weight = torch.tensor([(1 - pos_ratio) / (pos_ratio + 1e-6)], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # mode="max": the scheduler is stepped with the validation AUC, which should increase.
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", factor=0.5, patience=3)

    best_auc, best_state, no_imp = -1, None, 0
    for _ in range(EPOCHS):
        # ---- one pass over the training data ----
        model.train()
        for x, y in tr_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        # ---- validation: collect sigmoid probabilities and score them ----
        model.eval()
        preds = []
        with torch.no_grad():
            for x, _y in va_loader:
                x = x.to(device)
                preds.append(torch.sigmoid(model(x)).cpu().numpy())
        preds = np.concatenate(preds).ravel()
        auc = roc_auc_score(y_va, preds)
        sched.step(auc)

        # Only an improvement larger than 1e-5 counts; snapshot the weights on CPU.
        if auc > best_auc + 1e-5:
            best_auc = auc
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            no_imp = 0
        else:
            no_imp += 1
        # Stop once PATIENCE consecutive epochs brought no improvement.
        if no_imp >= PATIENCE:
            break

    # Restore the best snapshot so the caller predicts with the best epoch.
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_auc


def predict(model, loader, device):
    """Return sigmoid probabilities of ``model`` for every row in ``loader``.

    Batches may be bare feature tensors or tuples/lists whose first element
    is the feature tensor. The model is switched to eval mode and gradients
    are disabled.

    Args:
        model: Module mapping a feature batch to logits.
        loader: Iterable of batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        1-D NumPy array with the concatenated probabilities.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            features = batch[0] if isinstance(batch, (tuple, list)) else batch
            probabilities = torch.sigmoid(model(features.to(device)))
            chunks.append(probabilities.cpu().numpy())
    return np.concatenate(chunks).ravel()


def make_X(train_df, val_df, test_df, use_raw, use_summary, use_te):
    """Assemble imputed and standardised feature matrices for one fold.

    When ``use_te`` is set, target-encoded columns are fitted on
    ``train_df`` and written into all three input frames (the frames are
    modified in place). Missing values are filled with the training median
    of each column (0.0 when that median is itself missing), and the
    matrices are standardised with a scaler fitted on the training split.

    Args:
        train_df: Training-fold frame.
        val_df: Validation-fold frame.
        test_df: Test frame.
        use_raw: Include ``RAW_FEATURES``.
        use_summary: Include ``SUMMARY_FEATURES``.
        use_te: Include (and compute) ``TE_FEATURES``.

    Returns:
        Tuple ``(X_train, X_val, X_test, n_features)``.
    """
    frames = (train_df, val_df, test_df)

    cols = []
    if use_raw:
        cols.extend(RAW_FEATURES)
    if use_summary:
        cols.extend(SUMMARY_FEATURES)
    if use_te:
        te = make_te(train_df, val_df, test_df)
        for split, frame in zip(("train", "val", "test"), frames):
            for te_name in TE_FEATURES:
                frame[te_name] = te[split][te_name]
        cols.extend(TE_FEATURES)

    X_tr, X_va, X_ts = (frame[cols] for frame in frames)

    # Per-column fill value: training median, or 0.0 if the column is all-NaN.
    fill_values = {}
    for c in cols:
        med = X_tr[c].median()
        fill_values[c] = 0.0 if pd.isna(med) else med
    X_tr, X_va, X_ts = (part.fillna(fill_values) for part in (X_tr, X_va, X_ts))

    # Fit the scaler on the training split only, then apply it everywhere.
    scaler = StandardScaler().fit(X_tr.values)
    scaled = [scaler.transform(part.values) for part in (X_tr, X_va, X_ts)]

    return scaled[0], scaled[1], scaled[2], len(cols)


# ============================================================
# BEST PARAMS (exactly as you provided them)
# ============================================================
# Hyper-parameters used for every seed run: feature-group switches,
# network shape, regularisation and optimiser settings.
BEST = dict(
    # feature groups
    use_te=True,
    use_summary=True,
    use_raw=True,
    # network shape
    n_layers=3,
    hidden_base=256,
    shrink=0.8212081026195875,
    dropout=0.400484176222794,
    # optimisation
    lr=0.0014692944373311206,
    weight_decay=0.001847088696674732,
    batch_size=512,
)


def hidden_dims_from(best):
    """Derive the hidden-layer widths from a hyper-parameter dict.

    The first width is ``hidden_base``; each following width is the previous
    (un-truncated) width times ``shrink``, floored at 32. Widths are
    truncated to ``int`` only when emitted.

    Args:
        best: Mapping with ``"n_layers"``, ``"hidden_base"`` and ``"shrink"``.

    Returns:
        List of ``n_layers`` integer widths.
    """
    depth = int(best["n_layers"])
    decay = float(best["shrink"])

    def _widths():
        width = int(best["hidden_base"])
        for _ in range(depth):
            yield int(width)
            width = max(32, width * decay)

    return list(_widths())


def run_one_seed(seed, train_all, test_all, best):
    """Run the full stratified K-fold training for a single seed.

    For every fold a fresh model is trained on the training part, the
    validation part is predicted into the out-of-fold vector, and the test
    predictions are accumulated as an average over the folds.

    Args:
        seed: Seed used for the RNGs and for the fold split.
        train_all: Feature-engineered training frame (contains ``voted_bin``).
        test_all: Feature-engineered test frame.
        best: Hyper-parameter dict (see ``BEST``).

    Returns:
        Tuple ``(oof_auc, oof_predictions, test_predictions)``.
    """
    set_seed(seed)
    rule = "=" * 80
    print("\n" + rule)
    print(f"üå± SEED RUN: {seed}")
    print(rule)

    # Unpack the hyper-parameters once.
    feature_flags = (bool(best["use_raw"]), bool(best["use_summary"]), bool(best["use_te"]))
    hidden_dims = hidden_dims_from(best)
    dropout = float(best["dropout"])
    lr = float(best["lr"])
    wd = float(best["weight_decay"])
    bs = int(best["batch_size"])

    labels = train_all["voted_bin"].values.astype(np.float32)
    # The seed also drives the fold assignment.
    splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)

    oof = np.zeros(len(train_all), dtype=np.float32)
    test_pred = np.zeros(len(test_all), dtype=np.float32)
    fold_aucs = []

    for fold_no, (fit_rows, hold_rows) in enumerate(splitter.split(train_all, labels), start=1):
        # Work on copies: make_X writes target-encoded columns into the frames.
        fit_df = train_all.iloc[fit_rows].copy().reset_index(drop=True)
        hold_df = train_all.iloc[hold_rows].copy().reset_index(drop=True)
        test_df = test_all.copy()

        X_fit, X_hold, X_test, n_feat = make_X(fit_df, hold_df, test_df, *feature_flags)

        y_fit = fit_df["voted_bin"].values.astype(np.float32)
        y_hold = hold_df["voted_bin"].values.astype(np.float32)

        fit_loader = DataLoader(TabDataset(X_fit, y_fit), batch_size=bs, shuffle=True, drop_last=True)
        hold_loader = DataLoader(TabDataset(X_hold, y_hold), batch_size=bs, shuffle=False)
        test_loader = DataLoader(TabDataset(X_test), batch_size=bs, shuffle=False)

        model = MLP(input_dim=n_feat, hidden_dims=hidden_dims, dropout=dropout)
        fold_auc = train_one_fold(model, fit_loader, hold_loader, y_hold, lr, wd, DEVICE)
        fold_aucs.append(fold_auc)

        oof[hold_rows] = predict(model, hold_loader, DEVICE)
        test_pred += predict(model, test_loader, DEVICE) / N_FOLDS

        print(f"[Seed {seed}][Fold {fold_no}] AUC={fold_auc:.5f} | n_feat={n_feat}")

    oof_auc = roc_auc_score(train_all["voted_bin"], oof)
    print(f"‚úÖ SEED {seed} OOF AUC: {oof_auc:.5f} | fold mean: {np.mean(fold_aucs):.5f}")

    return oof_auc, oof, test_pred


def main():
    """Train the seed ensemble and write the submission CSV.

    Cleans and feature-engineers the module-level ``train_raw`` /
    ``test_raw`` frames, runs one cross-validated training per entry in
    ``SEEDS``, saves each seed's predictions as ``.npy`` files, averages
    them and writes ``submission_seed_ens_<n_seeds>.csv``.
    """
    train_fe = build_features(clean_data(train_raw))
    test_fe = build_features(clean_data(test_raw))

    per_seed_auc = []
    oof_runs = []
    test_runs = []

    for seed in SEEDS:
        seed_auc, seed_oof, seed_test = run_one_seed(seed, train_fe, test_fe, BEST)
        per_seed_auc.append((seed, seed_auc))
        oof_runs.append(seed_oof)
        test_runs.append(seed_test)

        # Intermediate save, in case the run gets interrupted.
        np.save(f"oof_seed{seed}.npy", seed_oof)
        np.save(f"test_seed{seed}.npy", seed_test)

    # Simple average across seeds.
    oof_ens = np.mean(oof_runs, axis=0)
    test_ens = np.mean(test_runs, axis=0)
    oof_auc_ens = roc_auc_score(train_fe["voted_bin"], oof_ens)

    rule = "=" * 80
    print("\n" + rule)
    print("üèÅ SEED ENSEMBLE SUMMARY")
    print(rule)
    for seed, score in per_seed_auc:
        print(f"Seed {seed}: OOF AUC = {score:.5f}")
    print(f"‚úÖ Ensemble OOF AUC = {oof_auc_ens:.5f}")

    # Submission: reuse the test file's own "index" column when it has one.
    if "index" in test_raw.columns:
        row_ids = test_raw["index"]
    else:
        row_ids = np.arange(len(test_raw))
    sub = pd.DataFrame({"index": row_ids, "voted": test_ens})

    out = f"submission_seed_ens_{len(SEEDS)}.csv"
    sub.to_csv(out, index=False)
    print(f"üíæ Saved: {out}")
    print(f"   pred range: [{test_ens.min():.4f}, {test_ens.max():.4f}]")


if __name__ == "__main__":
    main()


üñ•Ô∏è Device: cpu
Train: (45532, 79), Test: (11383, 77)

üå± SEED RUN: 42
[Seed 42][Fold 1] AUC=0.78299 | n_feat=117
[Seed 42][Fold 2] AUC=0.76903 | n_feat=117
[Seed 42][Fold 3] AUC=0.76187 | n_feat=117
[Seed 42][Fold 4] AUC=0.76182 | n_feat=117
[Seed 42][Fold 5] AUC=0.76909 | n_feat=117
‚úÖ SEED 42 OOF AUC: 0.76882 | fold mean: 0.76896

üå± SEED RUN: 202
[Seed 202][Fold 1] AUC=0.76867 | n_feat=117
[Seed 202][Fold 2] AUC=0.77597 | n_feat=117
[Seed 202][Fold 3] AUC=0.76949 | n_feat=117
[Seed 202][Fold 4] AUC=0.76065 | n_feat=117
[Seed 202][Fold 5] AUC=0.77171 | n_feat=117
‚úÖ SEED 202 OOF AUC: 0.76907 | fold mean: 0.76930

üå± SEED RUN: 777
[Seed 777][Fold 1] AUC=0.76767 | n_feat=117
[Seed 777][Fold 2] AUC=0.77591 | n_feat=117
[Seed 777][Fold 3] AUC=0.76289 | n_feat=117
[Seed 777][Fold 4] AUC=0.76551 | n_feat=117
[Seed 777][Fold 5] AUC=0.77357 | n_feat=117
‚úÖ SEED 777 OOF AUC: 0.76875 | fold mean: 0.76911

üèÅ SEED ENSEMBLE SUMMARY
Seed 42: OOF AUC = 0.76882
Seed 202: OOF AUC = 0