In [1]:
import random
import os
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# --- Reproducibility ---
# One shared seed for Python's `random`, NumPy's global RNG and PyTorch's
# default generator, so repeated runs start from the same random state.
SEED = 0
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Request deterministic cuDNN behaviour and switch cuDNN benchmarking off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Path Configuration ---
# Both values keep their trailing slash: file names are appended to them
# directly when paths are built.
SUB_PATH = "./submissions/"
DATA_PATH = "../../data/raw/"

# --- Device Configuration ---
# Use the MPS backend when PyTorch reports it as available; otherwise
# fall back to the CPU.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# --- Data Configuration ---
# Columns removed from both data sets before modelling: the twenty
# "Q<letter>E" columns (QaE ... QtE) followed by 'index' and 'hand'.
drop_list = [f"Q{letter}E" for letter in "abcdefghijklmnopqrst"]
drop_list += ['index', 'hand']

# Columns that are cast to `str` before one-hot encoding.
replace_dict = dict.fromkeys(('education', 'engnat', 'married', 'urban'), str)

# --- Load Data ---
train_data = pd.read_csv(f"{DATA_PATH}train.csv")
test_data = pd.read_csv(f"{DATA_PATH}test_x.csv")

# --- Preprocessing ---
# Discard training rows whose `familysize` exceeds 50.
train_data = train_data.drop(train_data[train_data.familysize > 50].index)

# Separate the target and remove the unused columns from both splits.
train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)

# Cast the configured columns to `str` so they are one-hot encoded below
# rather than kept as numbers.
train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)

train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

# The two frames are encoded independently, so a category that appears in
# only one split would give them different (or differently ordered) columns.
# Force the test frame onto the training layout: dummy columns missing from
# the test split become all-zero, columns unseen during training are dropped.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

# Flip the target with `2 - voted`; assumes `voted` is coded 1/2 so the
# result is a 1/0 label -- TODO confirm against the data description.
train_y = 2 - train_y.to_numpy()
train_x = train_x.to_numpy().astype(float)
test_x = test_x.to_numpy().astype(float)

# --- Tensor Conversion & Normalization ---
# Features and labels are all converted to float32 tensors.
train_x_t, test_x_t = (
    torch.tensor(array, dtype=torch.float32) for array in (train_x, test_x)
)
train_y_t = torch.tensor(train_y, dtype=torch.float32)

# --- Manual Scaling ---
# Each entry is (column selector, offset, divisor); the selected columns are
# rewritten in place as (value - offset) / divisor.
_SCALING = (
    (slice(None, 20), 3., 2.),    # question answer columns (Q1A ~ Q20A)
    (20, 5., 4.),                 # family size (familysize)
    (slice(21, 31), 3.5, 3.5),    # personality type test columns (tp01 ~ tp10)
)
for features in (train_x_t, test_x_t):
    for columns, offset, divisor in _SCALING:
        features[:, columns] = (features[:, columns] - offset) / divisor

# --- Hyperparameters ---
N_REPEAT, N_SKFOLD = 5, 7      # cross-validation repeats / folds per repeat
N_EPOCH, BATCH_SIZE = 48, 72   # epochs per fold / samples per mini-batch
test_len = test_x_t.shape[0]   # number of test rows

# Keyword arguments shared by every DataLoader.
LOADER_PARAM = dict(batch_size=BATCH_SIZE, num_workers=4, pin_memory=True)

# --- Training Loop ---
# Repeated stratified K-fold training. For every (repeat, fold) pair a fresh
# network is trained for N_EPOCH epochs; the test-set prediction taken at the
# epoch with the lowest validation loss is kept, and those per-fold
# predictions are averaged into `prediction`.
prediction = np.zeros((test_len, 1), dtype=np.float32)

# Take the input width from the data instead of hard-coding it, so the model
# always matches the encoded feature matrix.
n_features = train_x_t.shape[1]

# The test set is identical for every fold, so a single loader is reused.
test_loader = DataLoader(TensorDataset(test_x_t),
                         shuffle=False, drop_last=False, **LOADER_PARAM)

for repeat in range(N_REPEAT):
    # A different shuffle per repeat, reproducible through `random_state`.
    skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True)
    tot = 0.  # sum of the best validation losses of this repeat's folds

    for skfold, (train_idx, valid_idx) in enumerate(skf.split(train_x, train_y)):
        train_idx = torch.as_tensor(train_idx, dtype=torch.long)
        valid_idx = torch.as_tensor(valid_idx, dtype=torch.long)

        # drop_last=True keeps every training batch at exactly BATCH_SIZE rows.
        train_loader = DataLoader(TensorDataset(train_x_t[train_idx, :], train_y_t[train_idx]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        valid_loader = DataLoader(TensorDataset(train_x_t[valid_idx, :], train_y_t[valid_idx]),
                                  shuffle=False, drop_last=False, **LOADER_PARAM)

        # Model Definition
        model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(n_features, 180, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(180, 32, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(32, 1)
        ).to(DEVICE)

        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=N_EPOCH // 6, eta_min=4e-4)

        prediction_t = np.zeros((test_len, 1), dtype=np.float32)
        # Start from +inf so the first finite validation loss always stores a
        # prediction; a fixed threshold could leave `prediction_t` all zeros.
        loss_t = float('inf')
        n_batches = len(train_loader)

        for epoch in tqdm(range(N_EPOCH), desc='Repeat {} Fold {:02d}/{:02d}'.format(repeat + 1, skfold + 1, N_SKFOLD)):
            model.train()
            for idx, (xx, yy) in enumerate(train_loader):
                optimizer.zero_grad()
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                # squeeze(1) removes only the output dimension, so the result
                # stays 1-D even for a batch that holds a single sample.
                pred = model(xx).squeeze(1)
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                # Fractional epoch: the learning rate is updated every batch.
                scheduler.step(epoch + idx / n_batches)

            model.eval()
            with torch.no_grad():
                # Sample-weighted mean of the validation loss.
                running_loss, running_count = 0., 0
                for xx, yy in valid_loader:
                    xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                    pred = model(xx).squeeze(1)
                    loss = criterion(pred, yy)
                    running_loss += loss.item() * len(yy)
                    running_count += len(yy)
                valid_loss = running_loss / running_count

                # Save Best Prediction
                if valid_loss < loss_t:
                    loss_t = valid_loss
                    # `2 - sigmoid(logit)` undoes the `2 - voted` flip that
                    # was applied to the training target.
                    batches = [
                        2. - torch.sigmoid(model(xx.to(DEVICE))).to('cpu')
                        for (xx,) in test_loader
                    ]
                    prediction_t = torch.cat(batches).numpy()

        # Every fold contributes equally to the final averaged prediction.
        prediction += prediction_t / (N_REPEAT * N_SKFOLD)
        tot += loss_t

    print('Repeat {} Average Loss -> {:6.4f}'.format(repeat + 1, tot / N_SKFOLD))

# --- Submission ---
# Fill every column after the first of the sample submission with the
# averaged prediction and write it to a time-stamped CSV file.
df = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
df.iloc[:, 1:] = prediction

# Create the output directory first so that the result of the long training
# run is not lost to a missing folder.
os.makedirs(SUB_PATH, exist_ok=True)
output_name = f"{SUB_PATH}{datetime.now().strftime('%m%d-%H%M')}.csv"
df.to_csv(output_name, index=False)

Repeat 1 Fold 01/07: 100%|██████████| 48/48 [03:39<00:00,  4.57s/it]
Repeat 1 Fold 02/07: 100%|██████████| 48/48 [03:47<00:00,  4.74s/it]
Repeat 1 Fold 03/07: 100%|██████████| 48/48 [03:56<00:00,  4.93s/it]
Repeat 1 Fold 04/07: 100%|██████████| 48/48 [03:35<00:00,  4.48s/it]
Repeat 1 Fold 05/07: 100%|██████████| 48/48 [03:39<00:00,  4.57s/it]
Repeat 1 Fold 06/07: 100%|██████████| 48/48 [03:34<00:00,  4.46s/it]
Repeat 1 Fold 07/07: 100%|██████████| 48/48 [03:39<00:00,  4.57s/it]


Repeat 1 Average Loss -> 0.6054


Repeat 2 Fold 01/07: 100%|██████████| 48/48 [03:36<00:00,  4.51s/it]
Repeat 2 Fold 02/07: 100%|██████████| 48/48 [03:32<00:00,  4.43s/it]
Repeat 2 Fold 03/07: 100%|██████████| 48/48 [03:37<00:00,  4.53s/it]
Repeat 2 Fold 04/07: 100%|██████████| 48/48 [03:40<00:00,  4.59s/it]
Repeat 2 Fold 05/07: 100%|██████████| 48/48 [03:44<00:00,  4.68s/it]
Repeat 2 Fold 06/07: 100%|██████████| 48/48 [03:47<00:00,  4.73s/it]
Repeat 2 Fold 07/07: 100%|██████████| 48/48 [03:35<00:00,  4.48s/it]


Repeat 2 Average Loss -> 0.6051


Repeat 3 Fold 01/07: 100%|██████████| 48/48 [03:46<00:00,  4.72s/it]
Repeat 3 Fold 02/07: 100%|██████████| 48/48 [03:37<00:00,  4.54s/it]
Repeat 3 Fold 03/07: 100%|██████████| 48/48 [03:28<00:00,  4.35s/it]
Repeat 3 Fold 04/07: 100%|██████████| 48/48 [03:25<00:00,  4.28s/it]
Repeat 3 Fold 05/07: 100%|██████████| 48/48 [03:32<00:00,  4.43s/it]
Repeat 3 Fold 06/07: 100%|██████████| 48/48 [03:30<00:00,  4.38s/it]
Repeat 3 Fold 07/07: 100%|██████████| 48/48 [04:04<00:00,  5.10s/it]


Repeat 3 Average Loss -> 0.6053


Repeat 4 Fold 01/07: 100%|██████████| 48/48 [04:02<00:00,  5.06s/it]
Repeat 4 Fold 02/07: 100%|██████████| 48/48 [03:50<00:00,  4.80s/it]
Repeat 4 Fold 03/07: 100%|██████████| 48/48 [03:36<00:00,  4.51s/it]
Repeat 4 Fold 04/07: 100%|██████████| 48/48 [03:40<00:00,  4.59s/it]
Repeat 4 Fold 05/07: 100%|██████████| 48/48 [03:37<00:00,  4.53s/it]
Repeat 4 Fold 06/07: 100%|██████████| 48/48 [03:57<00:00,  4.95s/it]
Repeat 4 Fold 07/07: 100%|██████████| 48/48 [03:51<00:00,  4.83s/it]


Repeat 4 Average Loss -> 0.6054


Repeat 5 Fold 01/07: 100%|██████████| 48/48 [03:27<00:00,  4.32s/it]
Repeat 5 Fold 02/07: 100%|██████████| 48/48 [03:27<00:00,  4.33s/it]
Repeat 5 Fold 03/07: 100%|██████████| 48/48 [03:37<00:00,  4.54s/it]
Repeat 5 Fold 04/07: 100%|██████████| 48/48 [03:29<00:00,  4.36s/it]
Repeat 5 Fold 05/07: 100%|██████████| 48/48 [03:24<00:00,  4.25s/it]
Repeat 5 Fold 06/07: 100%|██████████| 48/48 [03:18<00:00,  4.14s/it]
Repeat 5 Fold 07/07: 100%|██████████| 48/48 [03:19<00:00,  4.16s/it]

Repeat 5 Average Loss -> 0.6055



