In [17]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [18]:
# Name of the label column in the training frame.
TARGET = 'voted'

# The data directory is resolved relative to the directory two levels above
# the notebook's current working directory.
BASE_DIR = Path.cwd().parent.parent
DATA_DIR = BASE_DIR.joinpath('data:raw')

# Read the labelled training table and the unlabelled test table, in that order.
train_df, test_df = (
    pd.read_csv(DATA_DIR.joinpath(csv_name))
    for csv_name in ('train.csv', 'test_x.csv')
)

In [19]:
# Quick sanity report: frame sizes first, then label counts and the dtype mix
# of the training frame.
print(train_df.shape, test_df.shape)
for summary in (train_df[TARGET].value_counts(), train_df.dtypes.value_counts()):
    print(summary)

(45532, 78) (11383, 77)
voted
2    24898
1    20634
Name: count, dtype: int64
int64      54
float64    20
object      4
Name: count, dtype: int64


In [20]:
# Re-encode the raw labels 1 / 2 as 0 / 1.
label_map = {1: 0, 2: 1}
encoded_target = train_df[TARGET].map(label_map)

# `Series.map` silently produces NaN for every value that is not a key of the
# mapping -- which is exactly what happens if this cell is executed a second
# time on already-encoded labels. Fail loudly rather than carry NaN targets
# into the rest of the notebook.
if encoded_target.isna().any():
    raise ValueError(
        f"{TARGET!r} contains values outside {sorted(label_map)}; "
        "reload the raw data before re-running this cell."
    )

train_df[TARGET] = encoded_target.astype('int64')

In [21]:
# Columns that are removed from both frames before modelling.
DROP_COLS = ['index'] if 'index' in train_df.columns else []

# Numeric feature columns. The target and every column scheduled for removal
# are excluded here, so the list never names a column that is about to be
# dropped from the frames.
_not_features = {TARGET, *DROP_COLS}
NUM_COLS = [
    col
    for col in train_df.select_dtypes(include=['int64', 'float64']).columns
    if col not in _not_features
]

# Non-numeric feature columns (object / category / string dtypes).
CAT_COLS = train_df.select_dtypes(
    include=['object', 'category', 'string']
).columns.tolist()

In [22]:
# Remove the columns listed in DROP_COLS from both frames, modifying each
# frame in place.
for frame in (train_df, test_df):
    frame.drop(columns=DROP_COLS, inplace=True)

In [23]:
# Recompute the drop list against the current training columns and apply it
# to both frames in place (an empty list drops nothing).
DROP_COLS = [name for name in ('index',) if name in train_df.columns]
for frame in (train_df, test_df):
    frame.drop(columns=DROP_COLS, inplace=True)

# Numeric feature columns: every int64/float64 column except the target.
numeric_part = train_df.select_dtypes(include=['int64', 'float64'])
NUM_COLS = list(numeric_part.columns)
NUM_COLS.remove(TARGET)

# Non-numeric feature columns (object / category / string dtypes).
categorical_part = train_df.select_dtypes(include=['object', 'category', 'string'])
CAT_COLS = list(categorical_part.columns)

In [24]:
# One imputer per column family: median for the numeric columns, most frequent
# value for the categorical ones.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training frame only and is then
# applied to both the training and the test frame.
for imputer, cols in ((num_imputer, NUM_COLS), (cat_imputer, CAT_COLS)):
    imputer.fit(train_df[cols])
    train_df[cols] = imputer.transform(train_df[cols])
    test_df[cols] = imputer.transform(test_df[cols])

In [26]:
FAMILY_COL = 'familysize'

# Cap value: the 99th percentile of the column in the training frame. The same
# cap is applied to the test frame.
upper = train_df[FAMILY_COL].quantile(0.99)

# Cap the column from above, then round and store it as integers.
for frame in (train_df, test_df):
    capped = frame[FAMILY_COL].clip(upper=upper)
    frame[FAMILY_COL] = capped.round().astype(int)

In [27]:
def family_bin(x):
    """Map a family-size value to a coarse size label.

    Returns 'single' when ``x`` equals 1; otherwise the label of the first
    upper bound that ``x`` does not exceed ('small' up to 2, 'medium' up to 4,
    'large' up to 6). Anything that matches no bound yields 'very_large'.
    """
    if x == 1:
        return 'single'

    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((2, 'small'), (4, 'medium'), (6, 'large')):
        if x <= upper_bound:
            return label

    return 'very_large'

In [28]:
# Derive the family-size label column for both frames from FAMILY_COL.
train_df['family_bin'] = train_df[FAMILY_COL].apply(family_bin)
test_df['family_bin'] = test_df[FAMILY_COL].apply(family_bin)

# Register the new column as categorical. The membership check keeps the list
# free of duplicate entries when this cell is executed more than once.
if 'family_bin' not in CAT_COLS:
    CAT_COLS.append('family_bin')

In [29]:
# Fit the scaler on the numeric training columns only, then apply the same
# fitted transform to both frames.
scaler = RobustScaler()
scaler.fit(train_df[NUM_COLS])

for frame in (train_df, test_df):
    frame[NUM_COLS] = scaler.transform(frame[NUM_COLS])

In [30]:
# Split the training frame into features and label; the test frame has
# features only.
X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET].values

X_test = test_df.copy()

# Pin every categorical column to the (sorted) set of levels observed in the
# training frame before encoding. `get_dummies(..., drop_first=True)` drops the
# first level of *each frame it is given*; encoding train and test separately
# on raw values would therefore drop a different baseline level whenever the
# test frame lacks the first training level, silently shifting the meaning of
# the encoded columns. With a shared categorical dtype both frames produce the
# same dummy columns and drop the same baseline. Test values never seen in
# training become NaN here and are encoded as all zeros.
for col in CAT_COLS:
    train_levels = sorted(X_train[col].dropna().unique())
    shared_dtype = pd.CategoricalDtype(categories=train_levels)
    X_train[col] = X_train[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

X_train_onehot = pd.get_dummies(X_train, columns=CAT_COLS, drop_first=True)
X_test_onehot = pd.get_dummies(X_test, columns=CAT_COLS, drop_first=True)

# Keep exactly the training columns, in training order, in both frames.
X_train_onehot, X_test_onehot = X_train_onehot.align(
    X_test_onehot, join='left', axis=1, fill_value=0
)

In [31]:
# Convert the encoded frames and the label vector to float32 NumPy arrays.
X_train = X_train_onehot.to_numpy(dtype=np.float32)
X_test = X_test_onehot.to_numpy(dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)

In [32]:
# Hold out 20% of the training rows for validation, stratified on the label,
# with a fixed random state.
split_options = dict(test_size=0.2, stratify=y_train, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, **split_options)

In [33]:
class TabularDataset(Dataset):
    """Dataset over a feature matrix with an optional label vector.

    Features (and labels, when given) are copied into float32 tensors at
    construction time. Indexing returns ``(features, label)`` when labels
    were supplied and the features alone otherwise.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        # Labels are optional so the same class can serve unlabelled data.
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = self.X[idx]
        has_labels = self.y is not None
        return (features, self.y[idx]) if has_labels else features

In [34]:
# Wrap each array split in a dataset; only the test split has no labels.
train_ds = TabularDataset(X_tr, y_tr)
val_ds = TabularDataset(X_val, y_val)
test_ds = TabularDataset(X_test)

# Only the training loader shuffles; validation and test keep their row order.
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=512)
test_loader = DataLoader(test_ds, batch_size=512)

In [38]:
class MLP(nn.Module):
    """Feed-forward network with two hidden stages and a single output unit.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, with widths
    256 and 128 and dropout rates 0.3 and 0.2. A final Linear layer maps the
    128 features to one value per input row; no activation is applied to it.
    """

    @staticmethod
    def _hidden_stage(in_features, out_features, drop_rate):
        """Return the four layers of one hidden stage, in application order."""
        return [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(drop_rate),
        ]

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            *self._hidden_stage(input_dim, 256, 0.3),
            *self._hidden_stage(256, 128, 0.2),
            nn.Linear(128, 1),
        ]
        # A single Sequential called `net` keeps parameter names as `net.<i>.*`.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network and drop the trailing size-1 output dimension."""
        out = self.net(x)
        return out.squeeze(1)

In [40]:
device = 'cpu'

# Seed PyTorch's global RNG before the model is built. Weight initialisation,
# the training loader's shuffling and dropout all draw from it, so without a
# seed every execution of this cell trains a different model and saves a
# different checkpoint.
torch.manual_seed(42)

model = MLP(X_train.shape[1]).to(device)
# The model outputs raw logits; the sigmoid is folded into the loss here and
# applied explicitly wherever probabilities are needed.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Early-stopping state: best validation AUC so far, number of consecutive
# non-improving epochs tolerated, and the current count of such epochs.
best_auc = 0
patience = 5
cnt = 0

for epoch in range(50):
    # --- one pass over the training split ---
    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

    # --- score the validation split (eval mode, no gradients) ---
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device)
            preds.extend(torch.sigmoid(model(x)).cpu().numpy())
            trues.extend(y.numpy())

    auc = roc_auc_score(trues, preds)
    print(f'Epoch {epoch+1} | Val AUC: {auc:.5f}')

    if auc > best_auc:
        # New best: reset the counter and overwrite the checkpoint on disk.
        best_auc = auc
        cnt = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        # No improvement: stop once `patience` such epochs occur in a row.
        cnt += 1
        if cnt >= patience:
            break

Epoch 1 | Val AUC: 0.74357
Epoch 2 | Val AUC: 0.74802
Epoch 3 | Val AUC: 0.75093
Epoch 4 | Val AUC: 0.75298
Epoch 5 | Val AUC: 0.75326
Epoch 6 | Val AUC: 0.75712
Epoch 7 | Val AUC: 0.75308
Epoch 8 | Val AUC: 0.75560
Epoch 9 | Val AUC: 0.75845
Epoch 10 | Val AUC: 0.75501
Epoch 11 | Val AUC: 0.75866
Epoch 12 | Val AUC: 0.75972
Epoch 13 | Val AUC: 0.75734
Epoch 14 | Val AUC: 0.75927
Epoch 15 | Val AUC: 0.76281
Epoch 16 | Val AUC: 0.75938
Epoch 17 | Val AUC: 0.75544
Epoch 18 | Val AUC: 0.76308
Epoch 19 | Val AUC: 0.76006
Epoch 20 | Val AUC: 0.76318
Epoch 21 | Val AUC: 0.76407
Epoch 22 | Val AUC: 0.76083
Epoch 23 | Val AUC: 0.76304
Epoch 24 | Val AUC: 0.76431
Epoch 25 | Val AUC: 0.76324
Epoch 26 | Val AUC: 0.76407
Epoch 27 | Val AUC: 0.75246
Epoch 28 | Val AUC: 0.76141
Epoch 29 | Val AUC: 0.75493


In [41]:
# Restore the weights saved at the best validation epoch and switch to eval mode.
best_state = torch.load('best_model.pt')
model.load_state_dict(best_state)
model.eval()

# Collect one sigmoid probability per test row, batch by batch, without
# tracking gradients.
with torch.no_grad():
    test_probs = [
        prob
        for batch in test_loader
        for prob in torch.sigmoid(model(batch.to(device))).cpu().numpy()
    ]

In [43]:
# Load the submission template, fill its `voted` column with the predicted
# probabilities, and write the result to a CSV file without the index column.
template = pd.read_csv(DATA_DIR.joinpath('sample_submission.csv'))
submission = template.assign(voted=test_probs)
submission.to_csv('submit_260128_hana_01.csv', index=False)

In [None]:
# 0.76378 -> leaderboard score of the practice Dacon submission