In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import os
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

# --- [1] 경로 설정 및 환경 구축 ---
DATA_PATH = '../../data/raw/'
OOF_PATH = './oof_data/'
SUB_PATH = './submissions/'

# 폴더가 없으면 생성
os.makedirs(OOF_PATH, exist_ok=True)
os.makedirs(SUB_PATH, exist_ok=True)

def seed_everything(seed=0):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(0)
DEVICE = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

# --- [2] 정밀 Feature Fusion (m1 + m7) ---
def get_m9_features():
    train = pd.read_csv(f'{DATA_PATH}train.csv')
    test = pd.read_csv(f'{DATA_PATH}test_x.csv')
    
    a_cols = [f'Q{i}A' for i in 'abcdefghijklmnopqrst']
    
    # m1 & m7 공통 필터링 적용
    train = train[train.familysize <= 50].copy()
    train = train[train[a_cols].std(axis=1) > 0].reset_index(drop=True)
    
    combined = pd.concat([train.drop('voted', axis=1), test], axis=0).reset_index(drop=True)
    
    # 1. m7 Refined Features
    combined['Q_Var'] = combined[a_cols].var(axis=1)
    combined['mach_score'] = combined[a_cols].mean(axis=1)
    
    # 2. m1 Key Derived Features
    combined['wr_total'] = combined[[f'wr_{i:02d}' for i in range(1, 14)]].sum(axis=1)
    combined['age_encoded'] = combined['age_group'].str.extract('(\d+)').astype(float).fillna(60)
    
    # Categorical Encoding
    cat_cols = ['race', 'religion', 'urban', 'education', 'married', 'gender']
    for col in cat_cols:
        combined[col] = LabelEncoder().fit_transform(combined[col].astype(str))
        
    # Numerical Scaling
    num_cols = a_cols + ['familysize', 'Q_Var', 'mach_score', 'wr_total', 'age_encoded']
    scaler = StandardScaler()
    combined[num_cols] = scaler.fit_transform(combined[num_cols])
    
    features = cat_cols + num_cols
    train_x = combined.iloc[:len(train)][features].values
    test_x = combined.iloc[len(train):][features].values
    train_y = (2 - train['voted'].to_numpy()).astype(np.float32)
    
    return train_x, train_y, test_x

# --- [3] 모델 아키텍처 (DAE + Fusion Classifier) ---
class DenoisingAutoEncoder(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        noise = torch.randn_like(x) * 0.05 # 노이즈 강도 조절
        latent = self.encoder(x + noise)
        return latent, self.decoder(latent)

class M9Classifier(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim + latent_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4), # 과적합 방지 강화
            nn.Linear(256, 1)
        )

    def forward(self, x, latent):
        return self.fc(torch.cat([x, latent], dim=1))

# --- [4] 학습 및 검증 프로세스 ---
X, y, X_test = get_m9_features()
X_tensor = torch.FloatTensor(X).to(DEVICE)
y_tensor = torch.FloatTensor(y).view(-1, 1).to(DEVICE)
X_test_tensor = torch.FloatTensor(X_test).to(DEVICE)

# Phase 1: DAE 학습 (특징량 정제)
dae = DenoisingAutoEncoder(X.shape[1]).to(DEVICE)
optimizer_dae = optim.Adam(dae.parameters(), lr=1e-3)
criterion_dae = nn.MSELoss()

print('\n--- [Step 1] DAE Pre-training ---')
pbar_dae = tqdm(range(60), desc='DAE Training')
for epoch in pbar_dae:
    dae.train()
    optimizer_dae.zero_grad()
    latent, recon = dae(X_tensor)
    loss = criterion_dae(recon, X_tensor)
    loss.backward()
    optimizer_dae.step()
    pbar_dae.set_postfix(Loss=f'{loss.item():.6f}')

# Phase 2: m9 분류기 학습 (Cross-Validation)
dae.eval()
with torch.no_grad():
    latent_full, _ = dae(X_tensor)
    latent_test, _ = dae(X_test_tensor)

skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=0)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

print('\n--- [Step 2] m9 Classifier Training ---')
for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    train_ds = TensorDataset(X_tensor[t_idx], latent_full[t_idx], y_tensor[t_idx])
    train_loader = DataLoader(train_ds, batch_size=512, shuffle=True)
    
    model = M9Classifier(X.shape[1], 64).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    
    best_auc = 0
    fold_test_probs = None
    
    pbar_fold = tqdm(range(40), desc=f'Fold {fold+1}', leave=False)
    for epoch in pbar_fold:
        model.train()
        for bx, bl, by in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(bx, bl), by)
            loss.backward()
            optimizer.step()
            
        model.eval()
        with torch.no_grad():
            v_probs = torch.sigmoid(model(X_tensor[v_idx], latent_full[v_idx])).squeeze().cpu().numpy()
            auc = roc_auc_score(y[v_idx], v_probs)
            if auc > best_auc:
                best_auc = auc
                oof_preds[v_idx] = v_probs
                fold_test_probs = torch.sigmoid(model(X_test_tensor, latent_test)).squeeze().cpu().numpy()
        
        pbar_fold.set_postfix(Best_AUC=f'{best_auc:.5f}')
    
    test_preds += fold_test_probs / 7
    print(f'Fold {fold+1} Best AUC: {best_auc:.5f}')

# --- [5] 결과 저장 및 마무리 ---
total_auc = roc_auc_score(y, oof_preds)
print(f'\n[Final Result] m9 OOF AUC: {total_auc:.5f}')

# 1. OOF 저장 (중요: 추후 검증 및 앙상블용)
np.save(f'{OOF_PATH}exp33_m9_v3_AUC_{total_auc:.5f}.npy', oof_preds)

# 2. Submission 저장
sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sub['voted'] = 2.0 - pd.Series(test_preds).rank(pct=True).values
output_name = f'{SUB_PATH}exp33_m9_v3_sub_{total_auc:.5f}.csv'
sub.to_csv(output_name, index=False)

print(f'OOF and Submission saved successfully.')


--- [Step 1] DAE Pre-training ---


DAE Training:   0%|          | 0/60 [00:00<?, ?it/s]


--- [Step 2] m9 Classifier Training ---


Fold 1:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 1 Best AUC: 0.74637


Fold 2:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 2 Best AUC: 0.76059


Fold 3:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 3 Best AUC: 0.75990


Fold 4:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 4 Best AUC: 0.74847


Fold 5:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 5 Best AUC: 0.76471


Fold 6:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 6 Best AUC: 0.75437


Fold 7:   0%|          | 0/40 [00:00<?, ?it/s]

Fold 7 Best AUC: 0.76029

[Final Result] m9 OOF AUC: 0.75549
OOF and Submission saved successfully.
