In [1]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Hyper-parameters shared by the training and inference cells
CFG = {
    'BATCH_SIZE': 4096,
    'EPOCHS': 10,
    'LEARNING_RATE': 1e-3,
    'SEED': 42,
}

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Value handed to every seeding call; its string form is also
            written to the PYTHONHASHSEED environment variable.
    """
    # Python's own generator first, then the hash-seed environment variable
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # NumPy and PyTorch (CPU and CUDA) generators, in that order
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Deterministic cuDNN kernels, auto-tuner switched off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seeds using the configured SEED

In [3]:
# Load the data; the 'ID' column is dropped from the test frame at load time
all_train = pd.read_parquet("./train.parquet", engine="pyarrow")
test = pd.read_parquet("./test.parquet", engine="pyarrow").drop(columns=['ID'])

# Report the (rows, columns) shape of each frame
print("Train shape:", all_train.shape)
print("Test shape:", test.shape)

Train shape: (10704179, 119)
Test shape: (1527298, 118)


In [17]:
# Label distribution of the full training set (row count per 'clicked' value)
all_train['clicked'].value_counts()

clicked
0    10500000
1      204179
Name: count, dtype: int64

In [4]:
# Number of negatives kept for every positive when down-sampling
NEG_POS_RATIO = 2

# Rows with clicked == 1 (all of them are kept)
clicked_1 = all_train[all_train['clicked'] == 1]

# Randomly draw NEG_POS_RATIO negatives per positive from the clicked == 0 rows
# (down-sampling). The draw is capped at the number of available negatives so
# sample() cannot fail, and it uses the configured seed instead of a literal.
negatives = all_train[all_train['clicked'] == 0]
clicked_0 = negatives.sample(
    n=min(len(negatives), NEG_POS_RATIO * len(clicked_1)),
    random_state=CFG['SEED'],
)
del negatives  # only the sample is needed from here on

# Concatenate both classes, shuffle the rows and rebuild a clean 0..n-1 index
train = (
    pd.concat([clicked_1, clicked_0], axis=0)
    .sample(frac=1, random_state=CFG['SEED'])
    .reset_index(drop=True)
)

In [5]:
# Column roles: the label column and the raw sequence column
target_col = "clicked"
seq_col = "seq"

# Features used for training: every column except ID / seq / target,
# kept in the frame's original column order
FEATURE_EXCLUDE = {target_col, seq_col, "ID"}
feature_cols = list(filter(lambda col: col not in FEATURE_EXCLUDE, train.columns))

print("Num features:", len(feature_cols))
print("Sequence:", seq_col)
print("Target:", target_col)

Num features: 117
Sequence: seq
Target: clicked


In [6]:
class ClickDataset(Dataset):
    """Map-style dataset yielding ``(features, sequence[, label])`` per row.

    Args:
        df: Source frame. Its index is reset, so rows are addressed by position.
        feature_cols: Columns cast to float and used as the dense feature
            vector; missing values are replaced by 0.
        seq_col: Column holding one comma-separated numeric string per row.
        target_col: Label column. Required when ``has_target`` is True.
        has_target: When True, ``__getitem__`` also returns the label.

    Raises:
        ValueError: ``has_target`` is True but ``target_col`` is None.
    """

    def __init__(self, df, feature_cols, seq_col, target_col=None, has_target=True):
        if has_target and target_col is None:
            raise ValueError("target_col must be provided when has_target=True")

        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.seq_col = seq_col
        self.target_col = target_col
        self.has_target = has_target

        # Non-sequence features: all treated as continuous values. They are
        # stored as float32, the dtype __getitem__ emits anyway, which halves
        # the memory of the array compared with float64.
        self.X = (
            self.df[self.feature_cols]
            .astype(float)
            .fillna(0)
            .to_numpy(dtype=np.float32)
        )

        # Sequences are kept as strings and parsed lazily in __getitem__.
        # Missing entries are mapped to "" explicitly: astype(str) alone would
        # turn them into the literal text "nan"/"None", and "nan" parses to a
        # NaN sequence value.
        seq_series = self.df[self.seq_col]
        self.seq_strings = (
            seq_series.astype(str)
            .where(seq_series.notna(), "")
            .to_numpy(dtype=object)
        )

        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x, seq, y)`` when labels are present, else ``(x, seq)``.

        ``x`` is the float32 feature vector and ``seq`` is a 1-D float32 tensor
        with at least one element.
        """
        x = torch.tensor(self.X[idx], dtype=torch.float)

        # Use the whole sequence; only empty / missing sequences are guarded
        s = self.seq_strings[idx]
        if s:
            arr = np.fromstring(s, sep=",", dtype=np.float32)
        else:
            arr = np.empty(0, dtype=np.float32)

        if arr.size == 0:
            # Empty-sequence guard: downstream packing needs length >= 1
            arr = np.zeros(1, dtype=np.float32)

        seq = torch.from_numpy(arr)  # shape (seq_len,)

        if self.has_target:
            y = torch.tensor(self.y[idx], dtype=torch.float)
            return x, seq, y
        return x, seq

In [7]:
def collate_fn_train(batch):
    """Collate ``(features, sequence, label)`` samples into batch tensors.

    Args:
        batch: Iterable of ``(x, seq, y)`` tuples of tensors.

    Returns:
        ``(xs, seqs_padded, seq_lengths, ys)``: stacked features, sequences
        right-padded with 0.0 (batch first), int64 sequence lengths floored
        at 1, and stacked labels.
    """
    feats, seq_list, labels = zip(*batch)
    x_batch = torch.stack(feats)
    y_batch = torch.stack(labels)

    # Pad every sequence up to the longest one in the batch
    padded = nn.utils.rnn.pad_sequence(seq_list, batch_first=True, padding_value=0.0)

    # Original lengths, never below 1 (guards against empty sequences)
    lengths = torch.tensor(list(map(len, seq_list)), dtype=torch.long).clamp(min=1)

    return x_batch, padded, lengths, y_batch

def collate_fn_infer(batch):
    """Collate unlabeled ``(features, sequence)`` samples into batch tensors.

    Args:
        batch: Iterable of ``(x, seq)`` tuples of tensors.

    Returns:
        ``(xs, seqs_padded, seq_lengths)``: stacked features, sequences
        right-padded with 0.0 (batch first), and int64 sequence lengths
        floored at 1.
    """
    feats, seq_list = zip(*batch)
    x_batch = torch.stack(feats)

    # Pad every sequence up to the longest one in the batch
    padded = nn.utils.rnn.pad_sequence(seq_list, batch_first=True, padding_value=0.0)

    # Original lengths, never below 1
    lengths = torch.tensor(list(map(len, seq_list)), dtype=torch.long).clamp(min=1)

    return x_batch, padded, lengths

In [8]:
class TabularSeqModel(nn.Module):
    """Batch-normalised tabular features plus an LSTM over a scalar sequence, fed to an MLP.

    Args:
        d_features: Number of non-sequence input features.
        lstm_hidden: Hidden size of the single-layer LSTM.
        hidden_units: Widths of the MLP hidden layers, in order.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, d_features, lstm_hidden=32, hidden_units=[1024, 512, 256, 128], dropout=0.2):
        super().__init__()
        # Batch norm over all non-sequence features
        self.bn_x = nn.BatchNorm1d(d_features)
        # seq: numeric sequence, one scalar per step, consumed by an LSTM
        self.lstm = nn.LSTM(input_size=1, hidden_size=lstm_hidden, batch_first=True)

        # Final MLP: Linear -> ReLU -> Dropout per hidden width, then a 1-unit head
        widths = [d_features + lstm_hidden, *hidden_units]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)))
        blocks.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*blocks)

    def forward(self, x_feats, x_seq, seq_lengths):
        """Return one logit per sample, shape ``(B,)``.

        Args:
            x_feats: Non-sequence features, ``(B, d_features)``.
            x_seq: Padded sequences, ``(B, L)``.
            seq_lengths: Unpadded length of each sequence, ``(B,)``.
        """
        # Non-sequence features
        dense = self.bn_x(x_feats)

        # Pack the (B, L, 1) sequences so the LSTM skips the padded steps
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            x_seq.unsqueeze(-1),
            seq_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (last_hidden, _) = self.lstm(packed_seq)

        # Final hidden state of the last layer, (B, lstm_hidden), joined with the features
        fused = torch.cat([dense, last_hidden[-1]], dim=1)
        logits = self.mlp(fused)
        return logits.squeeze(1)

In [9]:
def train_model(train_df, feature_cols, seq_col, target_col,
                batch_size=512, epochs=3, lr=1e-3, device="cuda"):
    """Train a TabularSeqModel and return it with its best-validation weights.

    The frame is split 80/20 into train/validation rows, the model is
    optimised with Adam on BCE-with-logits loss, and the per-sample mean train
    and validation losses are printed after every epoch. The weights of the
    epoch with the lowest validation loss are restored before returning, so a
    late epoch that validates worse does not decide the final model.

    Args:
        train_df: Frame containing the feature, sequence and target columns.
        feature_cols: Names of the non-sequence feature columns.
        seq_col: Name of the comma-separated sequence column.
        target_col: Name of the binary label column.
        batch_size: Mini-batch size for both loaders.
        epochs: Number of passes over the training split.
        lr: Adam learning rate.
        device: Device the model and batches are moved to.

    Returns:
        The trained model, left in eval mode after the final validation pass.
    """
    # 1) split
    tr_df, va_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)

    # 2) Dataset / Loader (the l_max argument was removed)
    train_dataset = ClickDataset(tr_df, feature_cols, seq_col, target_col, has_target=True)
    val_dataset   = ClickDataset(va_df, feature_cols, seq_col, target_col, has_target=True)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  collate_fn=collate_fn_train)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn_train)

    # 3) Model
    d_features = len(feature_cols)
    model = TabularSeqModel(d_features=d_features, lstm_hidden=64, hidden_units=[256,128], dropout=0.2).to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Snapshot of the best-validation weights seen so far
    best_val_loss = float("inf")
    best_epoch = None
    best_state = None

    # 4) Loop
    for epoch in range(1, epochs+1):
        model.train()
        train_loss = 0.0
        for xs, seqs, seq_lens, ys in tqdm(train_loader, desc=f"Train Epoch {epoch}"):
            # seq_lens stays on the CPU: the model hands it to
            # pack_padded_sequence via .cpu(), so a device round trip is wasted.
            xs, seqs, ys = xs.to(device), seqs.to(device), ys.to(device)
            optimizer.zero_grad()
            logits = model(xs, seqs, seq_lens)
            loss = criterion(logits, ys)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch figure is a per-sample mean
            train_loss += loss.item() * ys.size(0)
        train_loss /= len(train_dataset)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xs, seqs, seq_lens, ys in tqdm(val_loader, desc=f"Val Epoch {epoch}"):
                xs, seqs, ys = xs.to(device), seqs.to(device), ys.to(device)
                logits = model(xs, seqs, seq_lens)
                loss = criterion(logits, ys)
                val_loss += loss.item() * len(ys)
        val_loss /= len(val_dataset)

        print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Keep a detached copy of the weights whenever validation improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}

    # Restore the best epoch (skipped when no epoch ran or the loss was never finite)
    if best_state is not None:
        model.load_state_dict(best_state)
        print(f"Restored weights from epoch {best_epoch} (Val Loss: {best_val_loss:.4f})")

    return model

In [10]:
# Train on the down-sampled frame with the hyper-parameters from CFG
model = train_model(
    train, feature_cols, seq_col, target_col,
    batch_size=CFG['BATCH_SIZE'],
    epochs=CFG['EPOCHS'],
    lr=CFG['LEARNING_RATE'],
    device=device,
)

Train Epoch 1: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:12<00:00,  1.65it/s]
Val Epoch 1: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.01it/s]


[Epoch 1] Train Loss: 0.5904 | Val Loss: 0.5746


Train Epoch 2: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.72it/s]
Val Epoch 2: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.05it/s]


[Epoch 2] Train Loss: 0.5740 | Val Loss: 0.5706


Train Epoch 3: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.73it/s]
Val Epoch 3: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.03it/s]


[Epoch 3] Train Loss: 0.5702 | Val Loss: 0.5706


Train Epoch 4: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.72it/s]
Val Epoch 4: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.02it/s]


[Epoch 4] Train Loss: 0.5680 | Val Loss: 0.5671


Train Epoch 5: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.72it/s]
Val Epoch 5: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.04it/s]


[Epoch 5] Train Loss: 0.5666 | Val Loss: 0.5664


Train Epoch 6: 100%|██████████████████████████████████████████████████████████████████| 120/120 [00:47<00:00,  2.52it/s]
Val Epoch 6: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.07it/s]


[Epoch 6] Train Loss: 0.5648 | Val Loss: 0.5663


Train Epoch 7: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.72it/s]
Val Epoch 7: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.03it/s]


[Epoch 7] Train Loss: 0.5638 | Val Loss: 0.5656


Train Epoch 8: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.72it/s]
Val Epoch 8: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.07it/s]


[Epoch 8] Train Loss: 0.5626 | Val Loss: 0.5642


Train Epoch 9: 100%|██████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.73it/s]
Val Epoch 9: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.03it/s]


[Epoch 9] Train Loss: 0.5615 | Val Loss: 0.5650


Train Epoch 10: 100%|█████████████████████████████████████████████████████████████████| 120/120 [01:09<00:00,  1.72it/s]
Val Epoch 10: 100%|█████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.03it/s]

[Epoch 10] Train Loss: 0.5610 | Val Loss: 0.5639





In [11]:
# 1) Unlabeled test dataset and an un-shuffled loader, so outputs keep the row order
test_ds = ClickDataset(test, feature_cols, seq_col, has_target=False)
test_ld = DataLoader(
    test_ds,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    collate_fn=collate_fn_infer,
)

# 2) Score each batch without gradients and collect the sigmoid outputs on the CPU
model.eval()
outs = []
with torch.no_grad():
    for xs, seqs, lens in tqdm(test_ld, desc="Inference"):
        xs = xs.to(device)
        seqs = seqs.to(device)
        lens = lens.to(device)
        outs.append(model(xs, seqs, lens).sigmoid().cpu())

# One probability per test row
test_preds = torch.cat(outs).numpy()

Inference: 100%|██████████████████████████████████████████████████████████████████████| 373/373 [02:21<00:00,  2.64it/s]


In [12]:
# Put the predictions in the sample submission's 'clicked' column and write it out
submit = pd.read_csv('./sample_submission.csv').assign(clicked=test_preds)
submit.to_csv('./baseline_submit.csv', index=False)

In [16]:
# Display the down-sampled, shuffled training frame (pandas truncates the rendering)
train

Unnamed: 0,gender,age_group,inventory_id,day_of_week,hour,seq,l_feat_1,l_feat_2,l_feat_3,l_feat_4,...,history_b_22,history_b_23,history_b_24,history_b_25,history_b_26,history_b_27,history_b_28,history_b_29,history_b_30,clicked
0,2.0,7.0,2,1,22,"321,516,57,74,527,77,317,75,269,450,15,75,483,...",2.0,2.0,2.0,22.0,...,0.115092,0.115092,0.019182,0.007673,0.071613,0.081843,0.025576,0.066498,0.126598,0
1,1.0,7.0,2,3,07,"144,57,516,97,165,527,74,318,77,317,480,480,28...",2.0,2.0,2.0,24.0,...,0.125694,0.125694,0.020949,0.008380,0.078210,0.089382,0.027932,0.072623,0.184358,0
2,2.0,7.0,2,6,08,"516,57,408,408,408,408,154,408,269,479,57,408,...",2.0,2.0,2.0,7.0,...,0.075060,0.075060,0.012510,0.005004,0.046704,0.053376,0.016680,0.043368,0.110091,1
3,1.0,7.0,2,3,00,"9,57,516,338,416,516,114,195,27,516,527,74,318...",2.0,2.0,2.0,8.0,...,0.060318,0.060318,0.010053,0.004021,0.037531,0.042893,0.013404,0.034850,0.022117,0
4,1.0,7.0,37,6,22,"138,132,9,101,532,74,77,318,132,101,532,101,13...",2.0,2.0,2.0,7.0,...,0.066816,0.066816,0.011136,0.004454,0.041574,0.047514,0.014848,0.038605,0.048998,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612532,1.0,7.0,36,5,20,"9,18,516,57,97,74,527,318,77,463,212,193,151,1...",1.0,2.0,1.0,7.0,...,0.028881,0.028881,0.004814,0.001925,0.107828,0.041075,0.006418,0.033374,0.328307,1
612533,1.0,8.0,37,2,10,"57,516,74,452,318,207,269,452,245,508,51,508,5...",2.0,2.0,2.0,22.0,...,0.142857,0.142857,0.023810,0.009524,0.088889,0.101587,0.031746,0.082540,0.052381,0
612534,2.0,8.0,2,1,01,"9,57,516,97,74,527,77,132,532,138,101,101,132,...",2.0,2.0,3.0,16.0,...,0.030708,0.030708,0.005118,0.002047,0.019107,0.021837,0.013648,0.017742,0.067554,0
612535,1.0,8.0,2,1,13,"57,516,97,527,74,315,317,269,311,479,57,74,315...",2.0,2.0,2.0,7.0,...,0.103689,0.103689,0.017282,0.006913,0.064518,0.073734,0.023042,0.059909,0.114055,1
