In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [2]:
# Fix random seeds
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode so
    that GPU runs do not pick different kernels from one run to the next.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Seeding alone does not make cuDNN reproducible: disable autotuning and
    # request deterministic algorithms. Setting these is harmless on CPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [3]:
# Load data

TARGET = 'voted'

# The data directory is resolved two levels above the current working directory.
BASE_DIR = Path.cwd().parent.parent
# NOTE(review): the directory name contains a colon ('data:raw'). Confirm this
# is the real folder name and not meant to be the nested path 'data/raw'.
DATA_DIR = BASE_DIR / 'data:raw'

train_df = pd.read_csv(DATA_DIR.joinpath('train.csv'))
test_df = pd.read_csv(DATA_DIR.joinpath('test_x.csv'))

# Shift the label down by one.
# Presumably the raw labels are {1, 2} and this maps them to {0, 1} for the
# binary loss used later — TODO confirm against the data description.
train_df[TARGET] = train_df[TARGET].sub(1)

In [4]:
# Core feature group definitions

# Basic demographic columns
BASE_CAT_COLS = [
    'gender', 'education', 'married', 'urban',
    'race', 'religion', 'age_group',
]

# familysize is used through a derived (binned) feature
FAMILY_COL = 'familysize'

# Columns whose name ends in 'A'
# (author's note: politics / disposition related — semantics not verifiable here)
A_COLS = [name for name in train_df.columns if name.endswith('A')]

# Columns whose name ends in 'E'
# (author's note: behaviour / experience indicators — semantics not verifiable here)
E_COLS = [name for name in train_df.columns if name.endswith('E')]

In [5]:
# Missing-value imputation
num_cols = A_COLS + E_COLS + [FAMILY_COL]

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numeric columns get the train median, categorical columns the train mode.
# Each imputer is fitted on train_df only and then re-used on test_df.
# NOTE(review): train_df here still contains the rows that are later held out
# for validation, so the imputation statistics also see the validation rows.
for imputer_, col_group in ((num_imputer, num_cols), (cat_imputer, BASE_CAT_COLS)):
    train_df[col_group] = imputer_.fit_transform(train_df[col_group])
    test_df[col_group] = imputer_.transform(test_df[col_group])

In [6]:
# Derived features that look related to the target

for frame in (train_df, test_df):
    a_values = frame[A_COLS]

    # Row-wise mean of the A columns
    # (author's note: overall political-interest level)
    frame['A_mean'] = a_values.mean(axis=1)

    # Row-wise standard deviation of the A columns
    # (author's note: separates confident respondents from indifferent ones)
    frame['A_std'] = a_values.std(axis=1)

    # Share of E columns that are strictly positive
    # (author's note: social-participation indicator)
    frame['E_active_ratio'] = frame[E_COLS].gt(0).mean(axis=1)

# Derived familysize bucket (author's note: social connectedness)
def family_bin(x):
    """Map a family size to a coarse size label.

    Args:
        x: Family size as a number.

    Returns:
        'single' when ``x == 1``; otherwise 'small' for ``x <= 2``, 'medium'
        for ``x <= 4``, 'large' for ``x <= 6`` and 'very large' for anything
        else (including values for which every comparison is false, e.g. NaN).
    """
    if x == 1:
        return 'single'
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((2, 'small'), (4, 'medium'), (6, 'large')):
        if x <= upper_bound:
            return label
    return 'very large'

# Attach the familysize bucket label to both frames.
for frame in (train_df, test_df):
    frame['family_bin'] = frame[FAMILY_COL].apply(family_bin)


In [7]:
# Scaling

scale_cols = [*A_COLS, *E_COLS, 'A_mean', 'A_std', 'E_active_ratio']

# Fit the scaler on the training frame only, then apply the same fitted
# transform to both frames.
# NOTE(review): this overwrites the columns in place, so re-running this cell
# without re-running the earlier cells scales already-scaled values.
scaler = RobustScaler()
scaler.fit(train_df[scale_cols])
train_df[scale_cols] = scaler.transform(train_df[scale_cols])
test_df[scale_cols] = scaler.transform(test_df[scale_cols])

In [8]:
# Feature selection

USE_NUM_COLS = scale_cols
USE_CAT_COLS = BASE_CAT_COLS + ['family_bin']

# One-hot encoding
# The training frame defines the encoding: its first level per feature is
# dropped as the baseline.
X_train_cat = pd.get_dummies(train_df[USE_CAT_COLS], drop_first=True).astype(float)

# Encode the test frame with ALL of its levels and then project onto the train
# columns. Calling drop_first=True on test independently would drop whichever
# level sorts first *in test*; if that differs from the train baseline, those
# rows would end up all-zero after alignment, i.e. silently encoded as the
# train baseline level. Reindexing instead:
#   - drops the train baseline column and any level unseen in train,
#   - adds (as 0) any train level that does not occur in test.
# The float cast keeps `.values` numeric even when filled columns are mixed in.
X_test_cat = (
    pd.get_dummies(test_df[USE_CAT_COLS])
    .reindex(columns=X_train_cat.columns, fill_value=0)
    .astype(float)
)

X_train = np.hstack([
    train_df[USE_NUM_COLS].values,
    X_train_cat.values
])

X_test = np.hstack([
    test_df[USE_NUM_COLS].values,
    X_test_cat.values
])

# Train and test must expose exactly the same feature columns to the model.
assert X_train.shape[1] == X_test.shape[1], 'train/test feature width mismatch'

y = train_df[TARGET].values

In [9]:
# Train / validation split: stratified hold-out, 20% of rows for validation

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Dataset / Dataloader

class TabDataset(Dataset):
    """Tabular dataset holding features and, optionally, labels as float32 tensors.

    Args:
        X: Array-like of features; converted with ``torch.tensor(..., float32)``.
        y: Optional array-like of labels; converted the same way. When omitted,
            items are returned as features only.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``X[idx]`` alone, or ``(X[idx], y[idx])`` when labels exist."""
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features
    

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(dataset=TabDataset(X_tr, y_tr), shuffle=True, batch_size=512)
val_loader = DataLoader(dataset=TabDataset(X_val, y_val), batch_size=1024)
test_loader = DataLoader(dataset=TabDataset(X_test), batch_size=1024)

In [15]:
# Deep learning model

class MLP(nn.Module):
    """Feed-forward binary classifier that emits one raw logit per row.

    Two hidden stages (256 then 128 units), each Linear -> BatchNorm1d -> ReLU
    -> Dropout(0.3), followed by a single-unit linear output layer.

    Args:
        input_dim: Number of input features per row.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        # Layers are created in forward order and kept in one flat Sequential.
        for hidden_units in (256, 128):
            layers.append(nn.Linear(in_features, hidden_units))
            layers.append(nn.BatchNorm1d(hidden_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.3))
            in_features = hidden_units
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits with the trailing size-1 dimension removed."""
        logits = self.net(x)
        return logits.squeeze(1)

In [16]:
# Training loop

device = 'cpu'
model = MLP(X_train.shape[1]).to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4
)

# The model emits raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()

best_auc = 0
best_state = None

for epoch in range(25):
    # ---- one pass over the training data ----
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

    # ---- validation AUC on predicted probabilities ----
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
            trues.extend(yb.numpy())

    auc = roc_auc_score(trues, preds)

    if auc > best_auc:
        best_auc = auc
        # state_dict() returns references to the live parameter/buffer tensors,
        # which the optimizer keeps updating in place. Without cloning, the
        # "best" snapshot would silently track the latest weights and the
        # restore below would load the final epoch instead of the best one.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    print(f'Epoch {epoch+1} | Val AUC: {auc:.5f}')

# Restore the weights of the best-scoring epoch (skipped if no epoch ever
# improved on the initial best_auc, in which case the model is left as is).
if best_state is not None:
    model.load_state_dict(best_state)
print('Best Val AUC:', best_auc)

Epoch 1 | Val AUC: 0.72754
Epoch 2 | Val AUC: 0.74911
Epoch 3 | Val AUC: 0.75189
Epoch 4 | Val AUC: 0.75510
Epoch 5 | Val AUC: 0.75549
Epoch 6 | Val AUC: 0.75471
Epoch 7 | Val AUC: 0.75640
Epoch 8 | Val AUC: 0.75691
Epoch 9 | Val AUC: 0.76021
Epoch 10 | Val AUC: 0.75878
Epoch 11 | Val AUC: 0.75738
Epoch 12 | Val AUC: 0.75974
Epoch 13 | Val AUC: 0.75902
Epoch 14 | Val AUC: 0.75959
Epoch 15 | Val AUC: 0.75981
Epoch 16 | Val AUC: 0.75919
Epoch 17 | Val AUC: 0.76176
Epoch 18 | Val AUC: 0.76155
Epoch 19 | Val AUC: 0.76067
Epoch 20 | Val AUC: 0.76303
Epoch 21 | Val AUC: 0.76089
Epoch 22 | Val AUC: 0.76163
Epoch 23 | Val AUC: 0.76284
Epoch 24 | Val AUC: 0.76400
Epoch 25 | Val AUC: 0.76314
Best Val AUC: 0.7639974241526318


In [1]:
# Practice Dacon submission score: 0.751 (public leaderboard)