In [1]:
import gc
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# 1. Seed and device setup
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode so repeated runs pick the same kernels.

    Args:
        seed: Value handed unchanged to each generator's seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Reproducibility is preferred over cuDNN autotuning.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNG seeds before any data or model is created.
seed_everything(0)
# Use Apple's Metal backend (MPS) when it is available, otherwise fall back to CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# 2. Data loading and preprocessing
# Columns removed from both train and test before modelling.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
# Columns cast to str so that pd.get_dummies one-hot encodes them.
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_data = pd.read_csv('../../data/raw/train.csv')
test_data = pd.read_csv('../../data/raw/test_x.csv')
# Discard training rows whose familysize exceeds 50.
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)

train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)
# Encoding train and test independently can yield different column sets or
# orders when a category is missing from one of them. Force the test frame
# onto the training layout (missing dummies become 0, unseen ones are dropped)
# so positional scaling and the network input width stay consistent.
test_x = pd.get_dummies(test_x).reindex(columns=train_x.columns, fill_value=0)

# Baseline-specific target encoding: voted 1 -> 1, voted 2 -> 0.
train_y = 2 - train_y.to_numpy()
train_x = train_x.to_numpy().astype(float)
test_x = test_x.to_numpy().astype(float)

# Tensor conversion and manual scaling
train_y_t = torch.tensor(train_y, dtype=torch.float32)
train_x_t = torch.tensor(train_x, dtype=torch.float32)
test_x_t = torch.tensor(test_x, dtype=torch.float32)

# (columns, offset, divisor): each listed column group becomes (x - offset) / divisor.
# NOTE(review): assumes the first 31 columns are the numeric survey columns in
# their original CSV order -- confirm against the DataFrame column layout.
_SCALING_RULES = (
    (slice(0, 20), 3., 2.),
    (20, 5., 4.),
    (slice(21, 31), 3.5, 3.5),
)
# Train and test features receive exactly the same in-place transformation.
for _features in (train_x_t, test_x_t):
    for _cols, _offset, _divisor in _SCALING_RULES:
        _features[:, _cols] = (_features[:, _cols] - _offset) / _divisor

# Run configuration.
test_len = len(test_x_t)
N_REPEAT = 15   # number of differently seeded StratifiedKFold repetitions
N_SKFOLD = 5    # folds per repetition
N_EPOCH = 48    # training epochs per fold
BATCH_SIZE = 72
# num_workers=0 is recommended for stability on M1.
# Pinned host memory only helps CUDA transfers; requesting it on MPS/CPU has no
# effect other than a DataLoader warning, so enable it only for a CUDA device.
LOADER_PARAM = {'batch_size': BATCH_SIZE, 'num_workers': 0,
                'pin_memory': DEVICE.type == 'cuda'}

# Running average of the test predictions over every repeat/fold model.
prediction = np.zeros((test_len, 1), dtype=np.float32)
# Validation AUC recorded for each fold (at its lowest-validation-loss epoch).
total_val_aucs = []

# 3. Training loop
# For every (repeat, fold) pair a fresh network is trained for N_EPOCH epochs.
# The test prediction taken at the epoch with the lowest validation loss is
# averaged into `prediction`, and that epoch's validation AUC is recorded.

# Built once and re-iterated whenever a fold improves; constructing the loader
# is loop-invariant, only iterating it does work.
test_loader = DataLoader(test_x_t, batch_size=BATCH_SIZE, shuffle=False)
# Derive the input width from the data instead of hard-coding it.
n_features = train_x_t.shape[1]

for repeat in range(N_REPEAT):
    skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True)

    for skfold, (train_idx, valid_idx) in enumerate(skf.split(train_x, train_y)):
        train_loader = DataLoader(TensorDataset(train_x_t[train_idx], train_y_t[train_idx]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        valid_loader = DataLoader(TensorDataset(train_x_t[valid_idx], train_y_t[valid_idx]),
                                  shuffle=False, **LOADER_PARAM)

        # Model: hidden width raised from 180 to 256.
        model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(n_features, 256, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 32, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(32, 1)
        ).to(DEVICE)

        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=N_EPOCH // 6, eta_min=4e-4)

        prediction_t = np.zeros((test_len, 1), dtype=np.float32)
        # Start from +inf so the first epoch always yields a test prediction,
        # even if the validation loss never falls below 1.0.
        loss_t = float('inf')
        # Validation AUC at the best-loss epoch (not the maximum AUC seen).
        best_auc_in_fold = 0

        for epoch in tqdm(range(N_EPOCH), desc=f'R{repeat + 1:02d} S{skfold + 1:02d}'):
            model.train()
            for idx, (xx, yy) in enumerate(train_loader):
                optimizer.zero_grad()
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                # squeeze(-1) drops only the output dimension, so a batch of
                # size 1 keeps its batch axis.
                pred = model(xx).squeeze(-1)
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                # Fractional epoch: the schedule advances within each epoch.
                scheduler.step(epoch + idx / len(train_loader))

            # Validation loss and AUC.
            model.eval()
            val_preds, val_targets = [], []
            running_loss, running_count = 0.0, 0
            with torch.no_grad():
                for xx, yy in valid_loader:
                    xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                    logits = model(xx).squeeze(-1)
                    loss = criterion(logits, yy)
                    # Weight by batch size so a short last batch is averaged correctly.
                    running_loss += loss.item() * len(yy)
                    running_count += len(yy)
                    val_preds.append(torch.sigmoid(logits).cpu().numpy())
                    val_targets.append(yy.cpu().numpy())

            avg_val_loss = running_loss / running_count
            current_auc = roc_auc_score(np.concatenate(val_targets), np.concatenate(val_preds))

            # Keep the test prediction from the epoch with the lowest validation loss.
            if avg_val_loss < loss_t:
                loss_t = avg_val_loss
                best_auc_in_fold = current_auc
                test_preds_fold = []
                # Inference only: no autograd graph is needed.
                with torch.no_grad():
                    for xx in test_loader:
                        xx = xx.to(DEVICE)
                        # Baseline convention: store 2 - sigmoid(logit), which
                        # undoes the `2 - voted` target encoding.
                        p = (2. - torch.sigmoid(model(xx).cpu())).numpy()
                        test_preds_fold.append(p)
                prediction_t = np.concatenate(test_preds_fold).reshape(-1, 1)

        total_val_aucs.append(best_auc_in_fold)
        prediction += prediction_t / (N_REPEAT * N_SKFOLD)

        # Memory management: the MPS cache exists only when running on MPS.
        del model
        gc.collect()
        if DEVICE.type == "mps":
            torch.mps.empty_cache()

# 4. Final results
# The mean is reused for the file name below, so compute it once.
mean_val_auc = np.mean(total_val_aucs)
print("\n" + "="*40)
print(f"평균 Validation AUC: {mean_val_auc:.5f}")
print(f"최고 AUC 기록: {np.max(total_val_aucs):.5f}")
print("="*40)

# 5. Save the submission file
sub_df = pd.read_csv('../../data/raw/sample_submission.csv')
# Overwrite every column after the first with the averaged predictions.
sub_df.iloc[:, 1:] = prediction
save_path = f'./submissions/exp09_{datetime.now().strftime("%m%d-%H%M")}_AUC_{mean_val_auc:.4f}.csv'
# Create the output directory if needed so a long run is not lost at the
# final write.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
sub_df.to_csv(save_path, index=False)
print(f"저장 완료: {save_path}")

R01 S01: 100%|██████████| 48/48 [01:02<00:00,  1.30s/it]
R01 S02: 100%|██████████| 48/48 [01:03<00:00,  1.33s/it]
R01 S03: 100%|██████████| 48/48 [01:03<00:00,  1.31s/it]
R01 S04: 100%|██████████| 48/48 [01:02<00:00,  1.30s/it]
R01 S05: 100%|██████████| 48/48 [01:03<00:00,  1.32s/it]
R02 S01: 100%|██████████| 48/48 [01:02<00:00,  1.31s/it]
R02 S02: 100%|██████████| 48/48 [01:02<00:00,  1.30s/it]
R02 S03: 100%|██████████| 48/48 [01:00<00:00,  1.26s/it]
R02 S04: 100%|██████████| 48/48 [01:01<00:00,  1.28s/it]
R02 S05: 100%|██████████| 48/48 [01:00<00:00,  1.27s/it]
R03 S01: 100%|██████████| 48/48 [01:01<00:00,  1.28s/it]
R03 S02: 100%|██████████| 48/48 [01:01<00:00,  1.27s/it]
R03 S03: 100%|██████████| 48/48 [01:01<00:00,  1.28s/it]
R03 S04: 100%|██████████| 48/48 [01:01<00:00,  1.29s/it]
R03 S05: 100%|██████████| 48/48 [01:01<00:00,  1.29s/it]
R04 S01: 100%|██████████| 48/48 [01:01<00:00,  1.28s/it]
R04 S02: 100%|██████████| 48/48 [01:01<00:00,  1.29s/it]
R04 S03: 100%|██████████| 48/48


평균 Validation AUC: 0.77233
최고 AUC 기록: 0.77892
저장 완료: ./submissions/exp09_0131-2057_AUC_0.7723.csv



