In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

# --- [1] Paths and environment setup ---
DATA_PATH = '../../data/raw/'   # input CSVs are read from this folder
OOF_PATH = './oof_data/'        # out-of-fold prediction arrays are written here
SUB_PATH = './submissions/'     # submission CSVs are written here

# Create the output folders up front; an already-existing folder is not an error.
for _out_dir in (OOF_PATH, SUB_PATH):
    os.makedirs(_out_dir, exist_ok=True)

def seed_everything(seed=42):
    """Seed every random number generator this script may draw from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Local import: the file's import block does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # `manual_seed_all` covers every visible GPU, not just the current one;
    # it is safe to call on machines without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so disable it
    # alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before any split or model is created.
seed_everything()

# Prefer a GPU backend when one is available: CUDA first, then Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')
print(f'Using Device: {DEVICE}')

# --- [2] Load the m7-based refined features ---
def get_m7_plus_features():
    """Load the train/test CSVs and build the model's feature matrices.

    Reads ``train.csv`` and ``test_x.csv`` from ``DATA_PATH``, keeps only the
    training rows with ``familysize <= 50`` and a non-zero standard deviation
    across the twenty ``Q?A`` answer columns, then engineers features on the
    concatenated train+test frame so both parts share one scaler and one set
    of category codes.

    Returns:
        tuple: ``(X_train, y, X_test)``. ``X_train`` and ``X_test`` are 2-D
        NumPy arrays with identical columns (scaled numeric features first,
        then label-encoded categoricals). ``y`` is a float32 array computed
        as ``2 - voted`` for the retained training rows.
    """
    train = pd.read_csv(f'{DATA_PATH}train.csv')
    test = pd.read_csv(f'{DATA_PATH}test_x.csv')
    a_cols = [f'Q{i}A' for i in 'abcdefghijklmnopqrst']

    # m7 filtering rule: drop rows with familysize above 50 and rows whose
    # Q?A answers show no variation at all.
    train = train[(train.familysize <= 50) & (train[a_cols].std(axis=1) > 0)].reset_index(drop=True)
    n_train = len(train)
    y = (2 - train['voted'].to_numpy()).astype(np.float32)

    combined = pd.concat([train.drop('voted', axis=1), test], axis=0).reset_index(drop=True)

    # Feature engineering (m7 core + m1 auxiliary features).
    combined['Q_Var'] = combined[a_cols].var(axis=1)
    combined['mach_score'] = combined[a_cols].mean(axis=1)
    combined['wr_total'] = combined[[f'wr_{i:02d}' for i in range(1, 14)]].sum(axis=1)
    # Raw-string pattern avoids the invalid "\d" escape; expand=False yields a
    # Series instead of a one-column DataFrame. Rows without any digit in
    # age_group fall back to 30.
    combined['age_numeric'] = (
        combined['age_group'].str.extract(r'(\d+)', expand=False).astype(float).fillna(30)
    )

    num_cols = a_cols + ['familysize', 'Q_Var', 'mach_score', 'wr_total', 'age_numeric']
    # NOTE(review): the scaler is fitted on train and test rows together, so
    # test statistics influence the training features -- confirm this is intended.
    scaler = StandardScaler()
    combined[num_cols] = scaler.fit_transform(combined[num_cols])

    cat_cols = ['race', 'religion', 'urban', 'education', 'married', 'gender']
    for col in cat_cols:
        # Cast to str first so every value in the column is encoded as a string label.
        combined[col] = LabelEncoder().fit_transform(combined[col].astype(str))

    features = num_cols + cat_cols
    # The first n_train rows of `combined` are the filtered training rows; the rest are test.
    return combined.iloc[:n_train][features].values, y, combined.iloc[n_train:][features].values

# --- [3] Canonical m7 architecture ---
class M7V2Net(nn.Module):
    """Two-hidden-layer MLP that emits one raw logit per input row.

    Layer stack (stored in ``self.net``):
        Linear(input_dim, 256) -> BatchNorm1d -> SiLU -> Dropout(0.3)
        -> Linear(256, 128) -> BatchNorm1d -> SiLU -> Dropout(0.2)
        -> Linear(128, 1)

    No sigmoid is applied inside the module.

    Args:
        input_dim: Number of input features per row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (hidden width, dropout probability) for each hidden stage, in order.
        hidden_stages = ((256, 0.3), (128, 0.2))

        layers = []
        in_width = input_dim
        for out_width, drop_p in hidden_stages:
            layers.extend([
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                nn.SiLU(),
                nn.Dropout(drop_p),
            ])
            in_width = out_width
        layers.append(nn.Linear(in_width, 1))

        # Same attribute name and layer order as before, so parameter names
        # remain net.0.* ... net.8.*.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by the layer stack for batch ``x``."""
        logits = self.net(x)
        return logits

# --- [4] Main training loop (OOF & test prediction accumulation) ---
# Single source of truth for the fold and epoch counts: the test average
# below divides by N_SPLITS, so it can never drift from the splitter.
N_SPLITS = 7
N_EPOCHS = 60

X, y, X_test = get_m7_plus_features()
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

X_test_tensor = torch.FloatTensor(X_test).to(DEVICE)
# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    X_t, X_v = torch.FloatTensor(X[t_idx]).to(DEVICE), torch.FloatTensor(X[v_idx]).to(DEVICE)
    # Training targets become a column tensor to match the (N, 1) logits;
    # validation targets stay in NumPy for roc_auc_score.
    y_t, y_v = torch.FloatTensor(y[t_idx]).view(-1, 1).to(DEVICE), y[v_idx]

    # A fresh model and optimizer for every fold.
    model = M7V2Net(X.shape[1]).to(DEVICE)
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

    # Start below any reachable AUC so the first epoch always records
    # predictions and best_fold_probs is never left as None.
    best_fold_auc = -np.inf
    best_fold_probs = None

    pbar = tqdm(range(N_EPOCHS), desc=f'Fold {fold+1}', leave=True)
    for epoch in pbar:
        # One full-batch gradient step per epoch (the whole training fold at once).
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_t), y_t)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            # squeeze(1) removes only the logit axis, keeping the result 1-D.
            v_probs = torch.sigmoid(model(X_v)).squeeze(1).cpu().numpy()
            auc = roc_auc_score(y_v, v_probs)
            # NOTE(review): the best epoch is chosen on the same validation
            # fold that is written to `oof`, so the reported OOF AUC is optimistic.
            if auc > best_fold_auc:
                best_fold_auc = auc
                oof[v_idx] = v_probs
                best_fold_probs = torch.sigmoid(model(X_test_tensor)).squeeze(1).cpu().numpy()

        pbar.set_postfix(Loss=f'{loss.item():.4f}', AUC=f'{auc:.4f}', Best=f'{best_fold_auc:.4f}')

    # Average the best-epoch test predictions of every fold.
    test_preds += best_fold_probs / N_SPLITS
    print(f'Fold {fold+1} Best AUC: {best_fold_auc:.5f}')

# --- [5] Persist results (OOF .npy & submission .csv) ---
total_auc = roc_auc_score(y, oof)
print(f'\n[Final Result] m7_v2 OOF AUC: {total_auc:.5f}')

# 1. Out-of-fold predictions; the overall AUC is embedded in the file name.
oof_filename = f'{OOF_PATH}m7_v2_fusion_AUC_{total_auc:.5f}.npy'
np.save(oof_filename, oof)
print(f'OOF saved to: {oof_filename}')

# 2. Submission file, built on top of the sample submission's layout.
sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
# Replace the averaged probabilities by their percentile rank, then map back
# with 2 - rank (the training target was built as 2 - voted).
pct_rank = pd.Series(test_preds).rank(pct=True).to_numpy()
sub['voted'] = 2.0 - pct_rank
sub_filename = f'{SUB_PATH}m7_v2_sub_AUC_{total_auc:.5f}.csv'
sub.to_csv(sub_filename, index=False)
print(f'Submission saved to: {sub_filename}')

Using Device: mps


Fold 1:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 1 Best AUC: 0.75182


Fold 2:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 2 Best AUC: 0.75850


Fold 3:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 3 Best AUC: 0.74424


Fold 4:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 4 Best AUC: 0.74633


Fold 5:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 5 Best AUC: 0.75733


Fold 6:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 6 Best AUC: 0.74962


Fold 7:   0%|          | 0/60 [00:00<?, ?it/s]

Fold 7 Best AUC: 0.74815

[Final Result] m7_v2 OOF AUC: 0.75076
OOF saved to: ./oof_data/m7_v2_fusion_AUC_0.75076.npy
Submission saved to: ./submissions/m7_v2_sub_AUC_0.75076.csv
