In [1]:
import random
from datetime import datetime
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import gc

# 경로 설정
DATA_PATH = "../../data/raw/"
SUB_PATH = "./submissions/"

# 시드 고정
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(0)
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# 데이터 로드 및 전처리 (기존 로직 유지)
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_data = pd.read_csv(DATA_PATH + 'train.csv')
test_data = pd.read_csv(DATA_PATH + 'test_x.csv')
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)

train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

train_y = 2 - train_y.to_numpy() # 1->1, 2->0 변환
train_x = train_x.to_numpy().astype(float) 
test_x = test_x.to_numpy().astype(float)

# 텐서 변환 및 수동 스케일링 (성공했던 로직)
train_y_t = torch.tensor(train_y, dtype=torch.float32)
train_x_t = torch.tensor(train_x, dtype=torch.float32)
test_x_t = torch.tensor(test_x, dtype=torch.float32)

train_x_t[:, :20] = (train_x_t[:, :20] - 3.) / 2.
test_x_t[:, :20] = (test_x_t[:, :20] - 3.) / 2
train_x_t[:, 20] = (train_x_t[:, 20] - 5.) / 4.
test_x_t[:, 20] = (test_x_t[:, 20] - 5.) / 4.
train_x_t[:, 21:31] = (train_x_t[:, 21:31] - 3.5) / 3.5
test_x_t[:, 21:31] = (test_x_t[:, 21:31] - 3.5) / 3.5

test_len = len(test_x_t)
N_REPEAT = 5
N_SKFOLD = 7
N_EPOCH = 48
BATCH_SIZE = 72
LOADER_PARAM = {'batch_size': BATCH_SIZE, 'num_workers': 0, 'pin_memory': True}

prediction = np.zeros((test_len, 1), dtype=np.float32)
total_auc_scores = [] # 전체 AUC 기록용

for repeat in range(N_REPEAT):
    seed_everything(repeat) # 리피트마다 시드 변경으로 다양성 확보
    skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True)
    
    for skfold, (train_idx, valid_idx) in enumerate(skf.split(train_x, train_y)):
        train_loader = DataLoader(TensorDataset(train_x_t[train_idx], train_y_t[train_idx]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        valid_loader = DataLoader(TensorDataset(train_x_t[valid_idx], train_y_t[valid_idx]),
                                  shuffle=False, drop_last=False, **LOADER_PARAM)
        test_loader = DataLoader(TensorDataset(test_x_t, torch.zeros((test_len,), dtype=torch.float32)),
                                 shuffle=False, drop_last=False, **LOADER_PARAM)
        
        # 모델 구조 수정: Capacity 감소 및 Dropout 강화
        model = nn.Sequential(
            nn.Linear(91, 256),    # 512 -> 256으로 축소
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.05),
            nn.Dropout(0.5),       # 0.4 -> 0.5로 상향
            nn.Linear(256, 128),
            nn.LeakyReLU(0.05),
            nn.Dropout(0.3),       # 0.2 -> 0.3으로 상향
            nn.Linear(128, 1)
        ).to(DEVICE)
        
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=N_EPOCH // 6, eta_min=4e-4)
        
        prediction_t = np.zeros((test_len, 1), dtype=np.float32)
        best_loss = 1e10
        best_auc = 0 # 폴드 내 최고 AUC 기록

        for epoch in tqdm(range(N_EPOCH), desc='R{:02d} S{:02d}'.format(repeat + 1, skfold + 1)):
            model.train()
            for xx, yy in train_loader:
                optimizer.zero_grad()
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                pred = model(xx).squeeze()
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                scheduler.step(epoch + (xx.size(0)/len(train_loader)))

            # Validation 및 AUC 계산
            model.eval()
            val_preds = []
            val_targets = []
            val_loss = 0
            with torch.no_grad():
                for xx, yy in valid_loader:
                    xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                    logits = model(xx).squeeze()
                    loss = criterion(logits, yy)
                    val_loss += loss.item() * len(yy)
                    
                    probs = torch.sigmoid(logits)
                    val_preds.extend(probs.cpu().numpy())
                    val_targets.extend(yy.cpu().numpy())
            
            avg_val_loss = val_loss / len(valid_idx)
            current_auc = roc_auc_score(val_targets, val_preds)

            # Best Model 저장 (Loss 기준 혹은 AUC 기준으로 선택 가능, 여기선 Loss 유지)
            if avg_val_loss < best_loss:
                best_loss = avg_val_loss
                best_auc = current_auc # 해당 시점의 AUC 저장
                
                # Test 예측
                temp_test_preds = []
                for xx, _ in test_loader:
                    xx = xx.to(DEVICE)
                    # 기존 2. - sigmoid 로직 유지 (voted=2일 확률)
                    pred = (2. - torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
                    temp_test_preds.extend(pred)
                prediction_t = np.array(temp_test_preds).reshape(-1, 1)

        total_auc_scores.append(best_auc)
        prediction += prediction_t / (N_REPEAT * N_SKFOLD)
        print(f'Fold AUC: {best_auc:.5f}')
        
        # 메모리 정리
        del model; gc.collect(); torch.mps.empty_cache()

# 최종 결과 출력
print("\n" + "="*30)
print(f"최종 평균 Validation AUC: {np.mean(total_auc_scores):.5f} (+/- {np.std(total_auc_scores):.5f})")
print("="*30)

# 파일 저장
df = pd.read_csv(DATA_PATH + 'sample_submission.csv')
df.iloc[:, 1:] = prediction
save_name = f"{SUB_PATH}WideMLP_{datetime.now().strftime('%m%d-%H%M')}_AUC_{np.mean(total_auc_scores):.4f}.csv"
df.to_csv(save_name, index=False)
print(f"\nsaved: {save_name}")

R01 S01: 100%|██████████| 48/48 [01:32<00:00,  1.93s/it]


Fold AUC: 0.76510


R01 S02: 100%|██████████| 48/48 [01:31<00:00,  1.90s/it]


Fold AUC: 0.77448


R01 S03: 100%|██████████| 48/48 [01:29<00:00,  1.87s/it]


Fold AUC: 0.76517


R01 S04: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.78022


R01 S05: 100%|██████████| 48/48 [01:31<00:00,  1.91s/it]


Fold AUC: 0.76689


R01 S06: 100%|██████████| 48/48 [01:31<00:00,  1.91s/it]


Fold AUC: 0.77381


R01 S07: 100%|██████████| 48/48 [01:31<00:00,  1.91s/it]


Fold AUC: 0.78121


R02 S01: 100%|██████████| 48/48 [01:31<00:00,  1.90s/it]


Fold AUC: 0.76915


R02 S02: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.77408


R02 S03: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.77720


R02 S04: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.76844


R02 S05: 100%|██████████| 48/48 [01:30<00:00,  1.88s/it]


Fold AUC: 0.78102


R02 S06: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.76499


R02 S07: 100%|██████████| 48/48 [01:32<00:00,  1.93s/it]


Fold AUC: 0.77468


R03 S01: 100%|██████████| 48/48 [01:32<00:00,  1.92s/it]


Fold AUC: 0.77384


R03 S02: 100%|██████████| 48/48 [01:29<00:00,  1.86s/it]


Fold AUC: 0.76619


R03 S03: 100%|██████████| 48/48 [01:29<00:00,  1.86s/it]


Fold AUC: 0.77565


R03 S04: 100%|██████████| 48/48 [01:31<00:00,  1.90s/it]


Fold AUC: 0.77222


R03 S05: 100%|██████████| 48/48 [01:30<00:00,  1.88s/it]


Fold AUC: 0.77816


R03 S06: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.76705


R03 S07: 100%|██████████| 48/48 [01:30<00:00,  1.90s/it]


Fold AUC: 0.77310


R04 S01: 100%|██████████| 48/48 [01:31<00:00,  1.90s/it]


Fold AUC: 0.76570


R04 S02: 100%|██████████| 48/48 [01:29<00:00,  1.86s/it]


Fold AUC: 0.77732


R04 S03: 100%|██████████| 48/48 [01:28<00:00,  1.85s/it]


Fold AUC: 0.76688


R04 S04: 100%|██████████| 48/48 [01:29<00:00,  1.87s/it]


Fold AUC: 0.76889


R04 S05: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.78429


R04 S06: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.77414


R04 S07: 100%|██████████| 48/48 [01:30<00:00,  1.88s/it]


Fold AUC: 0.76618


R05 S01: 100%|██████████| 48/48 [01:31<00:00,  1.92s/it]


Fold AUC: 0.77430


R05 S02: 100%|██████████| 48/48 [01:31<00:00,  1.90s/it]


Fold AUC: 0.77259


R05 S03: 100%|██████████| 48/48 [01:30<00:00,  1.88s/it]


Fold AUC: 0.76739


R05 S04: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.77194


R05 S05: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]


Fold AUC: 0.77419


R05 S06: 100%|██████████| 48/48 [01:31<00:00,  1.92s/it]


Fold AUC: 0.77017


R05 S07: 100%|██████████| 48/48 [01:31<00:00,  1.91s/it]

Fold AUC: 0.77391

최종 평균 Validation AUC: 0.77230 (+/- 0.00511)

saved: ./submissions/WideMLP_0131-1838_AUC_0.7723.csv



