In [1]:
import numpy as np
import pandas as pd
import random
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.isotonic import IsotonicRegression

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# 데이터 로드

BASE_DIR = Path.cwd().parent.parent
DATA_DIR = BASE_DIR / 'data:raw'

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test_x.csv')

TARGET = 'voted'

In [3]:
# 타겟 인코딩
train_df[TARGET] = train_df[TARGET] - 1

In [4]:
# 칼럼 정의 / 전처리 과정
# 불필요 칼럼 제거
if 'index' in train_df.columns:
    train_df.drop(columns=['index'], inplace=True)
    test_df.drop(columns=['index'], inplace=True)

# 수치 / 범주 칼럼 자동 탐색
NUM_COLS = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
NUM_COLS.remove(TARGET)

CAT_COLS = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

In [5]:
# 결측치 처리
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

train_df[NUM_COLS] = num_imputer.fit_transform(train_df[NUM_COLS])
test_df[NUM_COLS] = num_imputer.transform(test_df[NUM_COLS])

train_df[CAT_COLS] = cat_imputer.fit_transform(train_df[CAT_COLS])
test_df[CAT_COLS] = cat_imputer.transform(test_df[CAT_COLS])

In [6]:
# 원 핫 인코딩
train_df = pd.get_dummies(train_df, columns=CAT_COLS, drop_first=True)
test_df = pd.get_dummies(test_df, columns=CAT_COLS, drop_first=True)

train_df, test_df = train_df.align(test_df, axis=1, fill_value=0)

In [8]:
# 스케일링
scaler = StandardScaler()

X = train_df.drop(columns=[TARGET])
y = train_df[TARGET].values

X = scaler.fit_transform(X)
X_test = scaler.transform(test_df.drop(columns=[TARGET]))

In [9]:
# train / validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# 파이토치 데이터셋
class TabularDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

In [11]:
# 딥러닝 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze()

In [12]:
# Seed Ensemble 학습
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device = 'cpu'
seeds = [0, 1, 2, 3, 4]

val_preds_all = []
test_preds_all = []

for seed in seeds:
    print(f'\n Training with seed {seed}')
    set_seed(seed)

    model = MLP(X_train.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss()

    train_loader = DataLoader(
        TabularDataset(X_train, y_train), batch_size=256, shuffle=True
    )
    val_loader = DataLoader(
        TabularDataset(X_val, y_val), batch_size=512, shuffle=False
    )
    test_loader = DataLoader(
        TabularDataset(X_test), batch_size=512, shuffle=False
    )

    best_auc = 0
    best_val_pred = None
    best_test_pred = None

    for epoch in range(30):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

    #Validation
    model.eval()
    val_preds = []
    with torch.no_grad():
        for xb, _ in val_loader:
            xb = xb.to(device)
            val_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())

    auc = roc_auc_score(y_val, val_preds)
    if auc > best_auc:
        best_auc = auc
        best_val_pred = np.array(val_preds)

        # Test prediction
        test_preds = []
        with torch.no_grad():
            for xb in test_loader:
                xb = xb.to(device)
                test_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
            best_test_pred = np.array(test_preds)

        print(f'Epoch {epoch+1} | Val AUC: {auc:.5f}')

    val_preds_all.append(best_val_pred)
    test_preds_all.append(best_test_pred)        


 Training with seed 0
Epoch 30 | Val AUC: 0.75820

 Training with seed 1
Epoch 30 | Val AUC: 0.75739

 Training with seed 2
Epoch 30 | Val AUC: 0.75597

 Training with seed 3
Epoch 30 | Val AUC: 0.75738

 Training with seed 4
Epoch 30 | Val AUC: 0.75665


In [13]:
# Validation 기준 Isotonic Calibration
val_pred_mean = np.mean(val_preds_all, axis=0)
test_pred_mean = np.mean(test_preds_all, axis=0)

iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(val_pred_mean, y_val)

test_pred_cal = iso.transform(test_pred_mean)

In [16]:
# submission 파일 생성
submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

submission['voted'] = test_pred_cal

submission.to_csv('submit_260128_hana02.csv', index=False)
print('❗️ submission.csv 저장 완료')

❗️ submission.csv 저장 완료
