In [1]:
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from datetime import datetime
import gc

# 1. 환경 설정 및 시드 고정
DATA_PATH = "../../data/raw/"
SUB_PATH = "./submissions/"

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(0)
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# 2. 모델 아키텍처: Deep ResNet-MLP
class ResBlock(nn.Module):
    def __init__(self, in_features, out_features, dropout_rate):
        super().__init__()
        self.ln = nn.LayerNorm(in_features)
        self.fc = nn.Linear(in_features, out_features)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout_rate)
        self.shortcut = nn.Linear(in_features, out_features) if in_features != out_features else nn.Identity()

    def forward(self, x):
        res = self.shortcut(x)
        x = self.ln(x)
        x = self.fc(x)
        x = self.gelu(x)
        x = self.dropout(x)
        return x + res

class DeepResNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.stem = nn.Sequential(nn.Linear(input_dim, 256), nn.GELU())
        self.layer1 = ResBlock(256, 256, 0.2)
        self.layer2 = ResBlock(256, 128, 0.2)
        self.layer3 = ResBlock(128, 64, 0.1)
        self.head = nn.Linear(64, 1)

    def forward(self, x):
        x = self.stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return self.head(x)

# 3. 데이터 로드 및 전처리
train_data = pd.read_csv(DATA_PATH + 'train.csv')
test_data = pd.read_csv(DATA_PATH + 'test_x.csv')

# 이상치 제거
train_data = train_data[train_data.familysize <= 50]

# 피처 엔지니어링 (조언 드린 내용 반영)
qa_cols = [f'Q{c}A' for c in 'abcdefghijklmnopqrst']
for df in [train_data, test_data]:
    df['qa_std'] = df[qa_cols].std(axis=1) # 응답 일관성
    df['tp_sum'] = df[[f'tp{i:02d}' for i in range(1, 11)]].sum(axis=1)

drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE', 'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE', 'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

train_y = 2 - train_data['voted'].to_numpy() # 1(Voted) -> 1, 2(Not) -> 0 변환
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)

train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)

# 원핫 인코딩 합쳐서 수행 (컬럼 일치)
all_x = pd.concat([train_x, test_x])
all_x = pd.get_dummies(all_x)
train_x = all_x[:len(train_x)].to_numpy().astype(float)
test_x = all_x[len(train_x):].to_numpy().astype(float)

# 수동 스케일링 (기존 코드 유지)
train_x_t = torch.tensor(train_x, dtype=torch.float32)
test_x_t = torch.tensor(test_x, dtype=torch.float32)
train_y_t = torch.tensor(train_y, dtype=torch.float32)

# 4. 학습 설정
N_REPEAT = 3  # 시간 관계상 3회 권장 (앙상블 효과 충분)
N_SKFOLD = 5
N_EPOCH = 50
BATCH_SIZE = 128
LOADER_PARAM = {'batch_size': BATCH_SIZE, 'num_workers': 0, 'pin_memory': True}

final_prediction = np.zeros((len(test_x), 1), dtype=np.float32)
total_val_auc = []

for repeat in range(N_REPEAT):
    seed_everything(repeat)
    skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True)
    
    for skfold, (train_idx, valid_idx) in enumerate(skf.split(train_x, train_y)):
        train_loader = DataLoader(TensorDataset(train_x_t[train_idx], train_y_t[train_idx]), 
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        valid_loader = DataLoader(TensorDataset(train_x_t[valid_idx], train_y_t[valid_idx]), 
                                  shuffle=False, **LOADER_PARAM)
        
        model = DeepResNet(input_dim=train_x.shape[1]).to(DEVICE)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-1)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, eta_min=1e-4)
        
        best_auc = 0
        best_pred_test = None

        for epoch in tqdm(range(N_EPOCH), desc=f'R{repeat+1} S{skfold+1}'):
            model.train()
            for xx, yy in train_loader:
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                optimizer.zero_grad()
                pred = model(xx).squeeze()
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                scheduler.step()

            # Validation 체크
            model.eval()
            val_preds, val_targets = [], []
            with torch.no_grad():
                for xx, yy in valid_loader:
                    xx = xx.to(DEVICE)
                    pred = torch.sigmoid(model(xx).squeeze())
                    val_preds.extend(pred.cpu().numpy())
                    val_targets.extend(yy.numpy())
            
            val_auc = roc_auc_score(val_targets, val_preds)
            
            if val_auc > best_auc:
                best_auc = val_auc
                # 최적 모델일 때 Test 예측 기록
                with torch.no_grad():
                    test_pred = torch.sigmoid(model(test_x_t.to(DEVICE))).cpu().numpy()
                    best_pred_test = test_pred

        total_val_auc.append(best_auc)
        final_prediction += best_pred_test / (N_REPEAT * N_SKFOLD)
        
        # 메모리 관리
        del model; gc.collect(); torch.mps.empty_cache()

# 5. 결과 출력 및 저장
print(f"\n[Final Results]")
print(f"Mean Validation AUC: {np.mean(total_val_auc):.5f} (+/- {np.std(total_val_auc):.5f})")

df = pd.read_csv(DATA_PATH + 'sample_submission.csv')
df.iloc[:, 1:] = final_prediction
filename = f"{SUB_PATH}ResNet_DL_{datetime.now().strftime('%m%d-%H%M')}_AUC_{np.mean(total_val_auc):.4f}.csv"
df.to_csv(filename, index=False)
print(f"Submission file saved: {filename}")

R1 S1: 100%|██████████| 50/50 [01:19<00:00,  1.59s/it]
R1 S2: 100%|██████████| 50/50 [01:16<00:00,  1.52s/it]
R1 S3: 100%|██████████| 50/50 [01:15<00:00,  1.52s/it]
R1 S4: 100%|██████████| 50/50 [01:15<00:00,  1.52s/it]
R1 S5: 100%|██████████| 50/50 [01:15<00:00,  1.52s/it]
R2 S1: 100%|██████████| 50/50 [01:16<00:00,  1.53s/it]
R2 S2: 100%|██████████| 50/50 [01:15<00:00,  1.50s/it]
R2 S3: 100%|██████████| 50/50 [01:14<00:00,  1.50s/it]
R2 S4: 100%|██████████| 50/50 [01:14<00:00,  1.49s/it]
R2 S5: 100%|██████████| 50/50 [01:16<00:00,  1.52s/it]
R3 S1: 100%|██████████| 50/50 [01:16<00:00,  1.52s/it]
R3 S2: 100%|██████████| 50/50 [01:16<00:00,  1.52s/it]
R3 S3: 100%|██████████| 50/50 [01:16<00:00,  1.52s/it]
R3 S4: 100%|██████████| 50/50 [01:15<00:00,  1.51s/it]
R3 S5: 100%|██████████| 50/50 [01:15<00:00,  1.52s/it]


[Final Results]
Mean Validation AUC: 0.76620 (+/- 0.00346)
Submission file saved: ./submissions/ResNet_DL_0131-1633_AUC_0.7662.csv



