In [4]:
import gc
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset

# Columns excluded from the feature matrix of both the train and test frames.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
# Columns cast to `str` so that `pd.get_dummies` one-hot encodes them; by
# default it only encodes object/string/category columns and passes numeric
# columns through unchanged.
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_data = pd.read_csv('../../data/raw/train.csv')
test_data = pd.read_csv('../../data/raw/test_x.csv')
# Discard training rows whose `familysize` exceeds 50.
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)

train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)

train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# The two frames are encoded independently, so a category that occurs in only
# one of them would give them different (or shifted) columns. Later code
# addresses features by position, so force the test frame onto the training
# layout: missing dummies become all-zero columns, dummies unknown to the
# training set are dropped. This is a no-op when the layouts already agree.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

# Flip the label with `2 - voted`.
# NOTE(review): assumes `voted` takes the values 1 and 2, making the target
# 1/0 — confirm against the data description.
train_y = 2 - train_y.to_numpy()
train_x = train_x.to_numpy().astype(float)
test_x = test_x.to_numpy().astype(float)

# Train on the MPS backend when PyTorch reports it as available, else on CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

train_y_t = torch.tensor(train_y, dtype=torch.float32)
train_x_t = torch.tensor(train_x, dtype=torch.float32)
test_x_t = torch.tensor(test_x, dtype=torch.float32)

# Fixed affine rescaling, `(x - offset) / scale`, applied in place to the same
# column positions of the train and test feature tensors.
# NOTE(review): relies on the first 31 columns being the numeric features in a
# fixed order — confirm against the preprocessing output.
_RESCALE_SPEC = (
    (slice(None, 20), 3., 2.),
    (20, 5., 4.),
    (slice(21, 31), 3.5, 3.5),
)
for _features in (train_x_t, test_x_t):
    for _cols, _offset, _scale in _RESCALE_SPEC:
        _features[:, _cols] = (_features[:, _cols] - _offset) / _scale
# ---------------------------------------

# Training configuration (the original note labels these the "0.78116"
# hyper-parameters — presumably the score they achieved; TODO confirm).
# Number of repeated stratified K-fold rounds; the round number is the split seed.
N_REPEAT = 5
# Number of folds per round.
N_SKFOLD = 7
# Training epochs per fold.
N_EPOCH = 48
# Mini-batch size for the training and validation loaders.
BATCH_SIZE = 72

# Accumulates the averaged test prediction over every (round, fold) model;
# one row per test sample, one column.
final_predictions = np.zeros((len(test_x_t), 1))
# Best validation AUC of each fold, used for the final summary.
total_val_aucs = []

# Repeated stratified K-fold training.
#
# For every (repeat, fold) pair a fresh MLP is trained for N_EPOCH epochs.
# After each epoch the validation AUC is computed; whenever it improves, the
# test-set prediction of that epoch replaces the fold's prediction. Each
# fold's best prediction contributes equally to `final_predictions`.

# Input width comes from the data instead of a hard-coded 91, so the network
# still builds if the one-hot encoding produces a different column count.
n_features = train_x_t.shape[1]

# Loop invariants, built once rather than per fold / per improvement.
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
test_x_dev = test_x_t.to(DEVICE)

for repeat in range(N_REPEAT):
    # The repeat number seeds the shuffle, so each round uses a different split.
    skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True)
    for fold, (t_idx, v_idx) in enumerate(skf.split(train_x_t, train_y_t)):
        train_loader = DataLoader(TensorDataset(train_x_t[t_idx], train_y_t[t_idx]), batch_size=BATCH_SIZE, shuffle=True)
        valid_loader = DataLoader(TensorDataset(train_x_t[v_idx], train_y_t[v_idx]), batch_size=BATCH_SIZE, shuffle=False)
        # The validation loader is not shuffled, so its targets keep the order
        # of `v_idx` and can be taken once per fold instead of once per epoch.
        v_targets = train_y_t[v_idx].numpy()

        model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(n_features, 180, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(180, 32, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(32, 1)
        ).to(DEVICE)

        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        # Cosine schedule that restarts every N_EPOCH // 6 epochs.
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=N_EPOCH // 6, eta_min=4e-4)

        best_auc = 0
        fold_pred = np.zeros((len(test_x_t), 1))

        for epoch in range(N_EPOCH):
            model.train()
            for xx, yy in train_loader:
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                optimizer.zero_grad()
                # Flatten with view(-1) (rather than squeezing) so a final
                # batch holding a single sample still yields a 1-D output
                # matching the shape of `yy`.
                out = model(xx).view(-1)
                loss = criterion(out, yy)
                loss.backward()
                optimizer.step()
                # NOTE(review): stepping per batch with the integer epoch keeps
                # the learning rate constant within an epoch; the documented
                # per-batch form is `epoch + i / len(train_loader)`. Left
                # unchanged to preserve the tuned schedule.
                scheduler.step(epoch)

            model.eval()
            with torch.no_grad():
                v_preds = torch.cat([
                    torch.sigmoid(model(xx.to(DEVICE)).view(-1)).cpu()
                    for xx, _ in valid_loader
                ]).numpy()

            curr_auc = roc_auc_score(v_targets, v_preds)
            if curr_auc > best_auc:
                best_auc = curr_auc
                with torch.no_grad():
                    # Logic carried over from the original successful run:
                    # submit `2 - probability`.
                    fold_pred = 2. - torch.sigmoid(model(test_x_dev).view(-1, 1)).cpu().numpy()

        total_val_aucs.append(best_auc)
        final_predictions += fold_pred / (N_REPEAT * N_SKFOLD)
        print(f"R{repeat+1} F{fold+1} Best AUC: {best_auc:.5f}")
        # Drop the finished model before the next fold allocates a new one.
        del model
        gc.collect()

# Report the mean of the per-fold best validation AUCs and write the averaged
# test predictions into a copy of the sample submission.
mean_val_auc = np.mean(total_val_aucs)
print(f"\n최종 평균 Validation AUC: {mean_val_auc:.5f}")
submission = pd.read_csv('../../data/raw/sample_submission.csv')
# `final_predictions` has shape (n_test, 1); flatten it to a plain column.
submission['voted'] = final_predictions.ravel()
# Create the output directory if needed so a missing folder cannot discard the
# result of the whole training run.
os.makedirs("./submissions", exist_ok=True)
submission.to_csv(f"./submissions/Pure_0781_AUC_{mean_val_auc:.5f}.csv", index=False)

R1 F1 Best AUC: 0.76773
R1 F2 Best AUC: 0.77465
R1 F3 Best AUC: 0.76620
R1 F4 Best AUC: 0.78038
R1 F5 Best AUC: 0.76871
R1 F6 Best AUC: 0.77432
R1 F7 Best AUC: 0.78019
R2 F1 Best AUC: 0.77055
R2 F2 Best AUC: 0.77360
R2 F3 Best AUC: 0.77755
R2 F4 Best AUC: 0.76835
R2 F5 Best AUC: 0.78230
R2 F6 Best AUC: 0.76646
R2 F7 Best AUC: 0.77460
R3 F1 Best AUC: 0.77538
R3 F2 Best AUC: 0.76680
R3 F3 Best AUC: 0.77752
R3 F4 Best AUC: 0.77231
R3 F5 Best AUC: 0.77917
R3 F6 Best AUC: 0.76818
R3 F7 Best AUC: 0.77449
R4 F1 Best AUC: 0.76736
R4 F2 Best AUC: 0.77735
R4 F3 Best AUC: 0.76714
R4 F4 Best AUC: 0.77205
R4 F5 Best AUC: 0.78596
R4 F6 Best AUC: 0.77453
R4 F7 Best AUC: 0.76636
R5 F1 Best AUC: 0.77341
R5 F2 Best AUC: 0.77282
R5 F3 Best AUC: 0.76837
R5 F4 Best AUC: 0.77407
R5 F5 Best AUC: 0.77702
R5 F6 Best AUC: 0.77172
R5 F7 Best AUC: 0.77513

최종 평균 Validation AUC: 0.77322
