In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

# --- [1] Paths and environment setup ---
DATA_PATH = '../../data/raw/'
OOF_PATH = './oof_data/'
SUB_PATH = './submissions/'

# Make sure both output directories exist before anything is written to them;
# exist_ok=True turns re-running the cell into a no-op for existing folders.
for _out_dir in (OOF_PATH, SUB_PATH):
    os.makedirs(_out_dir, exist_ok=True)

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator below.
    """
    # Local import: `random` is not among this file's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device, not only the current one; PyTorch documents
    # this call as silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may pick different kernels from run to run, which
    # defeats the deterministic flag above, so switch it off as well.
    torch.backends.cudnn.benchmark = False

seed_everything()

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# --- [2] Pure m7 features (m1 variables removed entirely) ---
def get_original_m7_features():
    """Read the raw CSVs and assemble the m7 feature matrices.

    Reads ``train.csv`` and ``test_x.csv`` from ``DATA_PATH``, drops training
    rows with ``familysize > 50`` or with zero spread across the Q*A answer
    columns, and derives the features jointly on train + test.

    Returns:
        A tuple ``(X_train, y, X_test)``: the training feature array, the
        float32 target computed as ``2 - voted``, and the test feature array.
    """
    answer_cols = ['Q' + letter + 'A' for letter in 'abcdefghijklmnopqrst']

    raw_train = pd.read_csv(f'{DATA_PATH}train.csv')
    raw_test = pd.read_csv(f'{DATA_PATH}test_x.csv')

    # m7's key filter: keep rows with familysize <= 50 whose answers are not
    # all identical (row-wise standard deviation > 0).
    plausible_family = raw_train.familysize <= 50
    varied_answers = raw_train[answer_cols].std(axis=1) > 0
    train = raw_train[plausible_family & varied_answers].reset_index(drop=True)

    target = np.subtract(2, train['voted'].to_numpy()).astype(np.float32)

    # Stack train (without the label) on top of test so the derived columns
    # are computed the same way for both.
    combined = pd.concat(
        [train.drop(columns='voted'), raw_test],
        axis=0,
        ignore_index=True,
    )

    # Core of m7: only psychological indicators are used.
    # NOTE: deliberately no StandardScaler; per the original author's note the
    # Q*A answers stay on their raw 1~5 scale (range not verifiable from here).
    # Only familysize is damped with log1p because its values spike too much.
    combined = combined.assign(
        Q_Var=combined[answer_cols].var(axis=1),
        mach_score=combined[answer_cols].mean(axis=1),
        familysize=np.log1p(combined['familysize']),
    )

    features = answer_cols + ['familysize', 'Q_Var', 'mach_score']
    print(f'Final Features Count: {len(features)}')

    # The first len(train) rows of `combined` are train, the rest are test.
    n_train = len(train)
    feature_frame = combined[features]
    train_matrix = feature_frame.iloc[:n_train].values
    test_matrix = feature_frame.iloc[n_train:].values
    return train_matrix, target, test_matrix

# --- [3] Frozen m7 architecture (256-128-64 ReLU) ---
class M7RestoreNet(nn.Module):
    """Fully connected network with hidden widths 256, 128 and 64.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU; the first two stages
    are followed by Dropout(0.2). The final Linear layer maps to one output
    per row with no activation applied.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs with ``input_dim`` features."""
        super().__init__()
        # (hidden width, dropout probability or None) per stage, in order.
        hidden_plan = ((256, 0.2), (128, 0.2), (64, None))

        stages = []
        fan_in = input_dim
        for width, drop_p in hidden_plan:
            stages.append(nn.Linear(fan_in, width))
            stages.append(nn.BatchNorm1d(width))
            stages.append(nn.ReLU())
            if drop_p is not None:
                stages.append(nn.Dropout(drop_p))
            fan_in = width
        stages.append(nn.Linear(fan_in, 1))

        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stack and return the raw outputs."""
        logits = self.net(x)
        return logits

# --- [4] Training loop ---
# Single source of truth for the fold count: it drives both the splitter and
# the test-prediction average, so the two can no longer drift apart.
N_SPLITS = 7
# Number of full-batch optimisation steps per fold.
N_EPOCHS = 80

X, y, X_test = get_original_m7_features()
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

X_test_tensor = torch.FloatTensor(X_test).to(DEVICE)

for fold, (t_idx, v_idx) in enumerate(skf.split(X, y)):
    X_t, X_v = torch.FloatTensor(X[t_idx]).to(DEVICE), torch.FloatTensor(X[v_idx]).to(DEVICE)
    # Training targets become a column tensor to match the model output shape;
    # validation targets stay as a NumPy array for roc_auc_score.
    y_t, y_v = torch.FloatTensor(y[t_idx]).view(-1, 1).to(DEVICE), y[v_idx]

    model = M7RestoreNet(X.shape[1]).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()

    # Start below any reachable AUC so the first epoch always records
    # predictions, and reset test_probs so a fold can never silently reuse
    # the previous fold's test predictions.
    best_fold_auc = float('-inf')
    test_probs = None
    pbar = tqdm(range(N_EPOCHS), desc=f'Fold {fold+1}')

    for epoch in pbar:
        # One full-batch gradient step per epoch (no mini-batching).
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(X_t), y_t)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            # view(-1) always yields a 1-D array, even for a single-row fold,
            # where a bare squeeze() would collapse to a 0-d scalar.
            v_probs = torch.sigmoid(model(X_v)).view(-1).cpu().numpy()
            auc = roc_auc_score(y_v, v_probs)
            # Keep the OOF and test predictions from the best validation epoch.
            # NOTE: the epoch is chosen on the same fold that is scored, so
            # the resulting OOF AUC is an optimistic estimate.
            if auc > best_fold_auc:
                best_fold_auc = auc
                oof[v_idx] = v_probs
                test_probs = torch.sigmoid(model(X_test_tensor)).view(-1).cpu().numpy()

        pbar.set_postfix(Loss=f'{loss.item():.4f}', AUC=f'{auc:.4f}', Best=f'{best_fold_auc:.4f}')

    if test_probs is None:
        raise RuntimeError(f'Fold {fold+1} produced no predictions; check N_EPOCHS.')

    test_preds += test_probs / N_SPLITS
    print(f'Fold {fold+1} Best: {best_fold_auc:.5f}')

# --- [5] Save ---
# Overall AUC of the out-of-fold predictions; it is also embedded in the
# output file names below.
total_auc = roc_auc_score(y, oof)
print(f'\n[Final Baseline Result] m7_original OOF AUC: {total_auc:.5f}')

oof_file = f'{OOF_PATH}m7_restored_AUC_{total_auc:.5f}.npy'
np.save(oof_file, oof)

# Percentile rank of each averaged test prediction; subtracting it from 2.0
# means a higher predicted score yields a submitted value closer to 1.
test_rank_pct = pd.Series(test_preds).rank(pct=True).to_numpy()

sub = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sub['voted'] = 2.0 - test_rank_pct

sub_file = f'{SUB_PATH}m7_restored_sub_{total_auc:.5f}.csv'
sub.to_csv(sub_file, index=False)

Final Features Count: 23


Fold 1:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 1 Best: 0.60797


Fold 2:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 2 Best: 0.59910


Fold 3:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 3 Best: 0.59980


Fold 4:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 4 Best: 0.60486


Fold 5:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 5 Best: 0.59669


Fold 6:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 6 Best: 0.59679


Fold 7:   0%|          | 0/80 [00:00<?, ?it/s]

Fold 7 Best: 0.59775

[Final Baseline Result] m7_original OOF AUC: 0.60024
