In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
import optuna
from tqdm.auto import tqdm
from datetime import datetime
import gc
import os

# 1. Paths and runtime environment
DATA_PATH = "../../data/raw/"
SUB_PATH = "./submissions/"

# Pick the first available backend: CUDA, then Apple-silicon MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(SUB_PATH, exist_ok=True)

def seed_everything(seed=0):
    """Seed the random number generators used by this script.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator and PyTorch.
    """
    import random  # local import: `random` is not among the file-level imports

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # cuDNN flags only matter for CUDA runs; setting them elsewhere is harmless.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(0)

# 2. Data preprocessing (mirrors the user-validated pipeline)
# Columns removed before modelling: the twenty Q?E columns plus 'index' and 'hand'.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
# These columns are cast to str so that pd.get_dummies one-hot encodes them.
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test_x.csv')
# Discard training rows whose familysize exceeds 50.
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1).astype(replace_dict)
test_x = test_data.drop(drop_list, axis=1).astype(replace_dict)

train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
# The two frames are encoded independently, so a category that is absent from
# one of them would give different column sets/order. Align the test columns
# to the training columns (missing dummies become 0, unseen ones are dropped)
# so both matrices share the exact same feature layout.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

# Flip the label with 2 - y (1 -> 1, 2 -> 0).
# NOTE(review): assumes 'voted' is coded as 1/2 — confirm against the dataset.
train_y = (2 - train_y.to_numpy()).astype(np.float32)
train_x = train_x.to_numpy().astype(np.float32)
test_x = test_x.to_numpy().astype(np.float32)

train_x_t = torch.tensor(train_x, dtype=torch.float32)
train_y_t = torch.tensor(train_y, dtype=torch.float32)
test_x_t = torch.tensor(test_x, dtype=torch.float32)

# Manual scaling, applied identically to train and test:
#   columns 0-19  -> (x - 3) / 2
#   column  20    -> (x - 5) / 4
#   columns 21-30 -> (x - 3.5) / 3.5
# NOTE(review): the slices rely on the column order produced above — confirm
# which features occupy these positions before changing drop_list/replace_dict.
train_x_t[:, :20] = (train_x_t[:, :20] - 3.) / 2.
test_x_t[:, :20] = (test_x_t[:, :20] - 3.) / 2.
train_x_t[:, 20] = (train_x_t[:, 20] - 5.) / 4.
test_x_t[:, 20] = (test_x_t[:, 20] - 5.) / 4.
train_x_t[:, 21:31] = (train_x_t[:, 21:31] - 3.5) / 3.5
test_x_t[:, 21:31] = (test_x_t[:, 21:31] - 3.5) / 3.5

# 3. Model definition
class DynamicMLP(nn.Module):
    """Feed-forward network with two hidden layers and a single output unit.

    The output is a raw score (no sigmoid is applied inside the module).

    Args:
        input_dim: Number of input features.
        h1: Width of the first hidden layer.
        h2: Width of the second hidden layer.
        drop_rate: Dropout probability applied after the first activation.
    """

    def __init__(self, input_dim, h1, h2, drop_rate):
        super().__init__()
        # Layer order is kept so that the Sequential indices (and therefore
        # the state_dict keys) stay the same.
        layers = [
            nn.Dropout(p=0.05),
            nn.Linear(input_dim, h1, bias=False),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.Dropout(p=drop_rate),
            nn.Linear(h1, h2, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(h2, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the result."""
        scores = self.net(x)
        return scores

# 4. Optuna objective function
def objective(trial):
    """Score one hyperparameter proposal with 3-fold stratified CV.

    For every fold a fresh ``DynamicMLP`` is trained for 15 epochs and the
    highest validation ROC AUC seen after any epoch is kept.

    Args:
        trial: Optuna trial used to sample ``h1``, ``h2`` and ``drop_rate``.

    Returns:
        The mean of the per-fold best validation AUCs.
    """
    h1 = trial.suggest_int("h1", 128, 512, step=32)
    h2 = trial.suggest_int("h2", 16, 64, step=8)
    drop_rate = trial.suggest_float("drop_rate", 0.3, 0.6)

    # Take the input width from the data rather than hard-coding it, so the
    # model keeps matching the feature matrix if the encoded columns change.
    input_dim = train_x_t.shape[1]

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_aucs = []

    for t_idx, v_idx in skf.split(train_x_t, train_y_t):
        t_loader = DataLoader(TensorDataset(train_x_t[t_idx], train_y_t[t_idx]), batch_size=72, shuffle=True)
        v_loader = DataLoader(TensorDataset(train_x_t[v_idx], train_y_t[v_idx]), batch_size=72, shuffle=False)
        # Validation labels do not change between epochs; extract them once.
        y_val = train_y_t[v_idx].numpy()

        model = DynamicMLP(input_dim, h1, h2, drop_rate).to(DEVICE)
        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))

        best_auc = 0
        for _ in range(15):
            model.train()
            for xx, yy in t_loader:
                optimizer.zero_grad()
                loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                loss.backward()
                optimizer.step()

            model.eval()
            batch_preds = []
            with torch.no_grad():
                for vx, _unused in v_loader:
                    batch_preds.append(torch.sigmoid(model(vx.to(DEVICE)).view(-1)).cpu().numpy())
            # v_loader is not shuffled, so the concatenated predictions line up
            # with y_val row for row.
            auc = roc_auc_score(y_val, np.concatenate(batch_preds))
            best_auc = max(best_auc, auc)
        cv_aucs.append(best_auc)
    return np.mean(cv_aucs)

# 5. Main execution loop
if __name__ == "__main__":
    print(">>> Phase 1: Hyperparameter Optimization (Optuna)")
    # Seed the sampler so the search proposes the same trials on every run;
    # Optuna's sampler keeps its own RNG and is not covered by seed_everything().
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=0),
    )
    study.optimize(objective, n_trials=10)

    best_params = study.best_params
    print(f"\n[Best Parameters]: {best_params}")

    print("\n>>> Phase 2: Final Training with 5-Repeat 7-Fold")
    N_REPEAT = 5
    N_SKFOLD = 7
    # Take the input width from the data rather than hard-coding it.
    input_dim = train_x_t.shape[1]
    final_predictions = np.zeros((len(test_x_t), 1))
    total_val_aucs = []

    for r in range(N_REPEAT):
        # A different random_state per repeat yields a different fold split.
        skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=r, shuffle=True)
        pbar = tqdm(enumerate(skf.split(train_x_t, train_y_t)), total=N_SKFOLD, desc=f"Repeat {r+1}/{N_REPEAT}")

        for f, (t_idx, v_idx) in pbar:
            t_loader = DataLoader(TensorDataset(train_x_t[t_idx], train_y_t[t_idx]), batch_size=72, shuffle=True)
            v_loader = DataLoader(TensorDataset(train_x_t[v_idx], train_y_t[v_idx]), batch_size=72, shuffle=False)
            # Validation labels do not change between epochs; extract them once.
            y_val = train_y_t[v_idx].numpy()

            model = DynamicMLP(input_dim, best_params['h1'], best_params['h2'], best_params['drop_rate']).to(DEVICE)
            optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
            scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=8, eta_min=4e-4)
            criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))

            best_fold_auc = 0
            best_fold_pred = np.zeros((len(test_x_t), 1))

            for epoch in range(48):
                model.train()
                for xx, yy in t_loader:
                    optimizer.zero_grad()
                    loss = criterion(model(xx.to(DEVICE)).view(-1), yy.to(DEVICE))
                    loss.backward()
                    optimizer.step()
                    # NOTE(review): step() gets the integer epoch on every
                    # batch, so the learning rate moves once per epoch, not
                    # per batch (the PyTorch docs example passes
                    # epoch + i / iters). Left unchanged to keep the existing
                    # schedule — confirm this is intended.
                    scheduler.step(epoch)

                model.eval()
                batch_preds = []
                with torch.no_grad():
                    for vx, _unused in v_loader:
                        batch_preds.append(torch.sigmoid(model(vx.to(DEVICE)).view(-1)).cpu().numpy())

                curr_auc = roc_auc_score(y_val, np.concatenate(batch_preds))
                if curr_auc > best_fold_auc:
                    # Keep the test prediction from the best validation epoch.
                    best_fold_auc = curr_auc
                    with torch.no_grad():
                        # 2 - sigmoid(...) mirrors the 2 - y label flip done
                        # during preprocessing.
                        p = (2. - torch.sigmoid(model(test_x_t.to(DEVICE)).view(-1, 1))).cpu().numpy()
                        best_fold_pred = p

            total_val_aucs.append(best_fold_auc)
            # Equal-weight average over all repeats x folds.
            final_predictions += best_fold_pred / (N_REPEAT * N_SKFOLD)
            pbar.set_postfix({'Fold_AUC': f'{best_fold_auc:.4f}'})
            del model; gc.collect()

    mean_auc = np.mean(total_val_aucs)
    print(f"\nFinal Mean Validation AUC: {mean_auc:.5f}")

    # Always write the submission file, regardless of the score.
    print(">>> Saving submission file...")
    submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
    # final_predictions has shape (N, 1); flatten it to a 1-D column.
    submission['voted'] = final_predictions.ravel()

    timestamp = datetime.now().strftime("%m%d_%H%M")
    filename = f"{SUB_PATH}Optuna_Hybrid_{timestamp}_AUC_{mean_auc:.4f}.csv"
    submission.to_csv(filename, index=False)
    print(f"File Saved: {filename}")

[I 2026-01-31 23:10:36,454] A new study created in memory with name: no-name-a285bcc4-cce2-42e7-af8a-4beee1f2c159


>>> Phase 1: Hyperparameter Optimization (Optuna)


[I 2026-01-31 23:11:36,535] Trial 0 finished with value: 0.7696615005139787 and parameters: {'h1': 512, 'h2': 16, 'drop_rate': 0.5760850656580785}. Best is trial 0 with value: 0.7696615005139787.
[I 2026-01-31 23:12:27,649] Trial 1 finished with value: 0.769289650857918 and parameters: {'h1': 224, 'h2': 24, 'drop_rate': 0.5545884692375175}. Best is trial 0 with value: 0.7696615005139787.
[I 2026-01-31 23:13:19,917] Trial 2 finished with value: 0.7693281368839825 and parameters: {'h1': 256, 'h2': 40, 'drop_rate': 0.5228758292886999}. Best is trial 0 with value: 0.7696615005139787.
[I 2026-01-31 23:14:09,848] Trial 3 finished with value: 0.7698969326095025 and parameters: {'h1': 160, 'h2': 64, 'drop_rate': 0.3140080126847166}. Best is trial 3 with value: 0.7698969326095025.
[I 2026-01-31 23:15:00,428] Trial 4 finished with value: 0.7705013786202648 and parameters: {'h1': 320, 'h2': 16, 'drop_rate': 0.4293558393863447}. Best is trial 4 with value: 0.7705013786202648.
[I 2026-01-31 23:15:5


[Best Parameters]: {'h1': 288, 'h2': 64, 'drop_rate': 0.37915667558151395}

>>> Phase 2: Final Training with 5-Repeat 7-Fold


Repeat 1/5:   0%|          | 0/7 [00:00<?, ?it/s]

Repeat 2/5:   0%|          | 0/7 [00:00<?, ?it/s]

Repeat 3/5:   0%|          | 0/7 [00:00<?, ?it/s]

Repeat 4/5:   0%|          | 0/7 [00:00<?, ?it/s]

Repeat 5/5:   0%|          | 0/7 [00:00<?, ?it/s]


Final Mean Validation AUC: 0.77272
>>> Saving submission file...
File Saved: ./submissions/Optuna_Hybrid_0131_2354_AUC_0.7727.csv
