In [None]:
# =========================================
# 0. Setup
# =========================================
import os, gc, random, math, time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

TRAIN_PATH = "/content/vote_ai/train.csv"
TEST_PATH  = "/content/vote_ai/test_x.csv"

# Fail fast if an input file is missing. An explicit raise is used instead of
# `assert` because assertions are stripped when Python runs with -O, which
# would let a bad path slip through to a later, less obvious failure.
if not os.path.exists(TRAIN_PATH):
    raise FileNotFoundError(f"Missing: {TRAIN_PATH}")
if not os.path.exists(TEST_PATH):
    raise FileNotFoundError(f"Missing: {TEST_PATH}")

SEED = 42


def seed_everything(seed=SEED):
    """Seed every RNG this notebook relies on and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy's global RNG, PyTorch's CPU RNG and all
    CUDA device RNGs with ``seed``, then sets the cuDNN flags so that
    deterministic kernels are used and auto-tuning (benchmark) is disabled.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): TRAIN_PATH / TEST_PATH in this notebook point to /content/vote_ai,
# not to the Drive mount point — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from string import ascii_lowercase

# =========================
# Config
# =========================
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

# Input CSV locations (paths matched to the Colab layout).
TRAIN_PATH = "/content/vote_ai/train.csv"
TEST_PATH  = "/content/vote_ai/test_x.csv"

# Base random seed.
SEED = 42

# Cross-validation layout: folds per repeat, and number of repeats.
N_FOLDS, N_REPEAT = 7, 3

# Training loop / data loader settings.
EPOCHS = 30
BATCH_SIZE = 512
NUM_WORKERS = 2

def seed_everything(seed=SEED):
    """Seed every RNG this notebook relies on and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy's global RNG, PyTorch's CPU RNG and all
    CUDA device RNGs with ``seed``.

    This definition replaces the earlier ``seed_everything`` in the notebook
    once this cell runs, so it also sets the cuDNN flags that the earlier
    definition sets; without them, re-running this cell would quietly drop
    the deterministic-kernel setting.
    """
    # Local import: this cell's own import list does not include `random`.
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Use deterministic cuDNN kernels and disable the auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(SEED)

# =========================
# Load
# =========================
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Binary target: 1 where `voted` equals 2, otherwise 0.
train["voted_bin"] = train["voted"].eq(2).astype(int)
# Float32 label vector used for stratification and as the training target.
y = train["voted_bin"].to_numpy(dtype=np.float32)

# =========================
# Feature Engineering (Rank1-lite, safe version)
#  - flip only the public items (secret flips are excluded for now because
#    their data/definition is uncertain)
#  - delay
#  - Mach features
#  - categorical dummies
#  - raw QE columns are dropped (summarised by delay)
# =========================
def _fe_single_frame(df):
    """Return a copy of ``df`` with the fit-free engineered features applied.

    Private helper for ``fe_rank1_safe``. Every step here is row-wise and
    needs no statistics from another frame, so train and test can be
    processed independently. The input frame is not modified.
    """
    df = df.copy()

    qs = list(ascii_lowercase)[:20]
    A_cols = [f"Q{q}A" for q in qs]
    E_cols = [f"Q{q}E" for q in qs]

    # 1) Flip the public items (subset the 1st-place solution often uses).
    #    Only values in 1..5 are flipped; 0 / NaN are left untouched.
    for c in ("QeA", "QfA", "QkA", "QqA", "QrA"):
        if c in df.columns:
            df[c] = np.where(df[c].between(1, 5), 6 - df[c], df[c])

    # 2) Mach-like features (computed AFTER the flip above, as in the original).
    df["T"] = df["QcA"] - df["QfA"] + df["QoA"] - df["QrA"] + df["QsA"]
    df["V"] = df["QbA"] - df["QeA"] + df["QhA"] + df["QjA"] + df["QmA"] - df["QqA"]
    df["M"] = -df["QkA"]
    # Zeros are treated as missing when averaging the answer columns.
    df["Mach_score"] = df[A_cols].replace(0, np.nan).mean(axis=1)

    # 3) delay: clip each QE column (same clip range as used previously),
    #    sum across items, then compress with a 10th root.
    e = df[E_cols].clip(lower=100, upper=60000)
    df["delay_sum"] = e.sum(axis=1)
    df["delay"] = np.power(df["delay_sum"].clip(lower=0), 1/10)

    # 4) tp: plain differences only (the 1st-place-style flipping is on hold).
    #    Zeros become NaN. The whole column is replaced via `mask` rather than
    #    writing NaN into it with `.loc`, because writing NaN into an integer
    #    column relies on silent upcasting, which pandas has deprecated.
    for c in [f"tp{i:02d}" for i in range(1, 11)]:
        df[c] = df[c].mask(df[c] == 0)
    df["Ex"]  = df["tp01"] - df["tp06"]
    df["Ag"]  = df["tp07"] - df["tp02"]
    df["Con"] = df["tp03"] - df["tp08"]
    df["Es"]  = df["tp09"] - df["tp04"]
    df["Op"]  = df["tp05"] - df["tp10"]

    # 5) categorical -> string, so that get_dummies one-hot encodes them.
    cat_cols = ["education","engnat","married","urban","age_group","gender","race","religion","hand"]
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype(str)

    # 6) Drop the raw QE columns and the index column (whichever are present).
    return df.drop(columns=E_cols + ["index"], errors="ignore")


def fe_rank1_safe(train_df, test_df):
    """Build scaled float32 feature matrices for the train and test frames.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; the ``voted`` / ``voted_bin`` columns are removed
        from the features when present.
    test_df : pandas.DataFrame
        Test frame with the same raw feature columns.

    Returns
    -------
    (X_train, X_test) : tuple of numpy.ndarray
        Float32 arrays with identical column layout. Test columns are aligned
        to the train columns, inf/NaN are replaced by the train medians, and
        both are standardised with a scaler fitted on train only.

    Neither input frame is modified.
    """
    X_train = _fe_single_frame(train_df).drop(columns=["voted", "voted_bin"], errors="ignore")
    X_test  = _fe_single_frame(test_df)

    # One-hot encode. `dtype=float` keeps the dummy columns numeric so the
    # median / fillna / scaling steps below work on a homogeneous frame.
    X_train = pd.get_dummies(X_train, dtype=float)
    X_test  = pd.get_dummies(X_test, dtype=float)
    # Keep exactly the train columns; categories unseen in test become 0.
    X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

    # Clean up inf / NaN: inf -> NaN, then impute with the TRAIN medians.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_test  = X_test.replace([np.inf, -np.inf], np.nan)

    med = X_train.median()
    X_train = X_train.fillna(med)
    X_test  = X_test.fillna(med)

    # Scaling (required for the NN); fitted on train only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.values).astype(np.float32)
    X_test  = scaler.transform(X_test.values).astype(np.float32)

    return X_train, X_test

X, X_test = fe_rank1_safe(train, test)
print("X:", X.shape, "X_test:", X_test.shape)
print("nan ratio:", np.isnan(X).mean(), "inf ratio:", np.isinf(X).mean())

# =========================
# Model (strong MLP: residual-ish)
# =========================
class MLP(nn.Module):
    """Two hidden-layer MLP with one residual connection, producing one logit per row.

    Each hidden layer is Linear -> BatchNorm1d -> SiLU -> Dropout. The output
    of the second hidden layer is added to the output of the first before the
    final Linear layer. ``forward`` returns raw logits (no sigmoid) with the
    trailing size-1 dimension squeezed away.
    """

    def __init__(self, in_dim, hidden=256, dropout=0.25):
        super().__init__()
        # Layers are created in this order on purpose: it fixes both the
        # state_dict key order and the order in which init RNG is consumed.
        self.fc1 = nn.Linear(in_dim, hidden)
        self.bn1 = nn.BatchNorm1d(hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.bn2 = nn.BatchNorm1d(hidden)
        self.out = nn.Linear(hidden, 1)
        self.act = nn.SiLU()
        self.dp = nn.Dropout(dropout)

    def _hidden(self, linear, norm, inputs):
        """Apply Linear -> BatchNorm -> SiLU -> Dropout to ``inputs``."""
        return self.dp(self.act(norm(linear(inputs))))

    def forward(self, x):
        stem = self._hidden(self.fc1, self.bn1, x)
        branch = self._hidden(self.fc2, self.bn2, stem)
        logits = self.out(stem + branch)  # residual sum feeds the output layer
        return logits.squeeze(1)

def train_fold(model, tr_loader, va_loader, y_tr_np, y_va_np):
    """Train ``model`` for ``EPOCHS`` epochs and restore its best-AUC weights.

    Parameters
    ----------
    model : nn.Module
        Network returning one logit per row; moved to ``DEVICE`` here.
    tr_loader, va_loader : iterable of (features, targets) batches
        Training and validation batches. Validation targets from the loader
        are ignored; ``y_va_np`` is used for scoring instead.
    y_tr_np : numpy.ndarray
        Training labels, used only to derive the positive-class weight.
    y_va_np : numpy.ndarray
        Validation labels, in the same order as ``va_loader`` yields rows.

    Returns
    -------
    (best_auc, model)
        Highest validation ROC AUC seen over the epochs, and the model with
        the weights from that epoch loaded back in.
    """
    model.to(DEVICE)

    # pos_weight is derived from the TRAIN labels only: (1 - p) / p.
    pos_rate = float(y_tr_np.mean())
    weight = torch.tensor([(1-pos_rate)/(pos_rate+1e-6)], device=DEVICE)
    criterion = nn.BCEWithLogitsLoss(pos_weight=weight)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    def _validation_probs():
        """Return sigmoid probabilities for every validation row, in loader order."""
        model.eval()
        chunks = []
        with torch.no_grad():
            for features, _ in va_loader:
                features = features.to(DEVICE)
                chunks.append(torch.sigmoid(model(features)).detach().cpu().numpy())
        return np.concatenate(chunks)

    best_auc = -1
    best_state = None

    for _ in range(EPOCHS):
        # ---- one pass over the training data ----
        model.train()
        for features, targets in tr_loader:
            features = features.to(DEVICE)
            targets = targets.to(DEVICE)
            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        # The cosine schedule advances once per epoch.
        scheduler.step()

        # ---- score on validation and keep a CPU snapshot of the best weights ----
        epoch_auc = roc_auc_score(y_va_np, _validation_probs())
        if epoch_auc > best_auc:
            best_auc = epoch_auc
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

    model.load_state_dict(best_state)
    return best_auc, model

# =========================
# CV + Repeat
# =========================
X_t = torch.tensor(X, dtype=torch.float32)
y_t = torch.tensor(y, dtype=torch.float32)
Xtest_t = torch.tensor(X_test, dtype=torch.float32)

# Accumulators: OOF is averaged over repeats, test over repeats x folds.
oof = np.zeros(len(X), dtype=np.float32)
test_pred = np.zeros(len(X_test), dtype=np.float32)
auc_list = []

# Pinned host memory only helps host->GPU copies; requesting it on a CPU-only
# runtime just makes DataLoader emit a warning, so enable it only with CUDA.
_pin_memory = DEVICE.type == "cuda"

# The test loader does not depend on the fold, so build it once instead of
# re-creating it inside the loop. The zero tensor is a placeholder target so
# that batches have the same (features, target) shape as the other loaders.
te_loader = DataLoader(TensorDataset(Xtest_t, torch.zeros(len(Xtest_t))), batch_size=BATCH_SIZE,
                       shuffle=False, drop_last=False, num_workers=NUM_WORKERS, pin_memory=_pin_memory)

for rep in range(N_REPEAT):
    # A different split seed per repeat gives different fold assignments.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED + rep)
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr, y_tr = X_t[tr_idx], y_t[tr_idx]
        X_va, y_va = X_t[va_idx], y_t[va_idx]

        # drop_last=True on the training loader discards the final partial batch.
        tr_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True,
                               drop_last=True, num_workers=NUM_WORKERS, pin_memory=_pin_memory)
        va_loader = DataLoader(TensorDataset(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False,
                               drop_last=False, num_workers=NUM_WORKERS, pin_memory=_pin_memory)

        model = MLP(in_dim=X.shape[1], hidden=256, dropout=0.25)
        auc, model = train_fold(model, tr_loader, va_loader, y_tr.numpy(), y_va.numpy())
        auc_list.append(auc)
        print(f"[rep {rep+1}/{N_REPEAT}][fold {fold}/{N_FOLDS}] AUC={auc:.5f}")

        # OOF: each row is in exactly one validation fold per repeat, so
        # dividing by N_REPEAT yields the mean over repeats.
        model.eval()
        with torch.no_grad():
            pred_va = torch.sigmoid(model(X_va.to(DEVICE))).detach().cpu().numpy()
            oof[va_idx] += pred_va / N_REPEAT

        # TEST: average the predictions of every (repeat, fold) model.
        pred_ts = []
        with torch.no_grad():
            for xb, _ in te_loader:
                xb = xb.to(DEVICE)
                pred_ts.append(torch.sigmoid(model(xb)).detach().cpu().numpy())
        pred_ts = np.concatenate(pred_ts)
        test_pred += pred_ts / (N_REPEAT * N_FOLDS)

oof_auc = roc_auc_score(y, oof)
print("\n====================")
print("OOF AUC:", float(oof_auc))
print("Fold mean:", float(np.mean(auc_list)))
print("====================\n")

# Submission: use the test file's own `index` column when it has one,
# otherwise fall back to a 0..n-1 range.
sub = pd.DataFrame({
    "index": test["index"] if "index" in test.columns else np.arange(len(test)),
    "voted": test_pred
})
sub.to_csv("submission_nn_rank1_safe.csv", index=False)
print("saved -> submission_nn_rank1_safe.csv")
print("pred range:", float(test_pred.min()), float(test_pred.max()))
