## 1. SETUP

In [None]:
# import zipfile
# import os
# import pandas as pd

# # zip_path = '/content/drive/MyDrive/2025_dacon_toss/data/toss_datasets.zip'
# extract_path = './data' # 데이터 존재 경로

# # os.makedirs(extract_path, exist_ok=True) # 경로 없으면 생성

# # with zipfile.ZipFile(zip_path, 'r') as zip_ref:
# #     zip_ref.extractall(extract_path)

# # print(f"'{zip_path}' 압축 해제 완료. 해제 경로: '{extract_path}'")

# # Load the train.parquet file
# train = pd.read_parquet(os.path.join(extract_path, "train.parquet"), engine="pyarrow")
# test = pd.read_parquet(os.path.join(extract_path, "test.parquet"), engine="pyarrow")

# print(f"Train shape: {train.shape}")
# print(f"Test shape: {test.shape}")
# print("데이터 로드 완료")

Train shape: (10704179, 119)
Test shape: (1527298, 119)
데이터 로드 완료


In [None]:
import pandas as pd
import numpy as np
import os, random
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Global training hyper-parameters, looked up by key in the cells below.
CFG = dict(
    BATCH_SIZE=1024,     # mini-batch size passed to train_model
    EPOCHS=5,            # number of training epochs passed to train_model
    LEARNING_RATE=1e-3,  # learning rate passed to train_model
    SEED=42,             # seed passed to seed_everything
)

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The individual generators are independent, so they can be seeded in a loop.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed,
                       torch.cuda.manual_seed_all):
        apply_seed(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNGs before any model or DataLoader is created.
seed_everything(CFG['SEED'])

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

Device: cuda


## 2. LOAD DATA

In [None]:
# Read the train / test splits from parquet files under ../data.
print("데이터 로드 시작")
train = pd.read_parquet("../data/train.parquet", engine="pyarrow")
test = pd.read_parquet("../data/test.parquet", engine="pyarrow")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print("데이터 로드 완료")

# Column roles: the label, the comma-separated sequence column, and the
# columns that are never fed to the model as plain features.
target_col, seq_col = "clicked", "seq"
FEATURE_EXCLUDE = {target_col, seq_col, "ID"}
feature_cols = list(filter(lambda name: name not in FEATURE_EXCLUDE, train.columns))

# Columns in cat_cols are label-encoded and embedded; every other feature
# column is treated as numeric.
cat_cols = ["gender", "age_group", "inventory_id", "l_feat_14"]
num_cols = [name for name in feature_cols if name not in cat_cols]
print(f"Num features: {len(num_cols)} | Cat features: {len(cat_cols)}")

데이터 로드 시작
Train shape: (10704179, 119)
Test shape: (1527298, 119)
데이터 로드 완료
Num features: 113 | Cat features: 4


## 3. ENCODER

In [None]:
def encode_categoricals(train_df, test_df, cat_cols):
    """Label-encode categorical columns with one encoder per column.

    Each encoder is fitted on the union of the train and test values so that
    every test category has an integer code. Missing values are mapped to the
    literal category ``"UNK"`` *before* the string conversion; the previous
    ``astype(str).fillna("UNK")`` order applied the fill after the values had
    already been stringified, so it never produced ``"UNK"``.

    Both frames are modified in place (the encoded columns overwrite the
    originals) and are also returned. Calling this again on frames that are
    already encoded re-encodes the integer codes as strings, so run it once
    per freshly loaded frame.

    Args:
        train_df: Training frame containing every column in ``cat_cols``.
        test_df: Test frame containing every column in ``cat_cols``.
        cat_cols: Names of the categorical columns to encode.

    Returns:
        Tuple ``(train_df, test_df, encoders)`` where ``encoders`` maps each
        column name to its fitted ``LabelEncoder``.
    """
    def _as_labels(values):
        # Cast to object first so the "UNK" replacement is valid for any
        # source dtype (numeric, categorical, string, ...).
        as_object = values.astype(object)
        return as_object.where(as_object.notna(), "UNK").astype(str)

    encoders = {}
    for col in cat_cols:
        # Stringify each split once and reuse it for both fit and transform.
        train_labels = _as_labels(train_df[col])
        test_labels = _as_labels(test_df[col])

        le = LabelEncoder()
        le.fit(pd.concat([train_labels, test_labels], axis=0))
        train_df[col] = le.transform(train_labels)
        test_df[col] = le.transform(test_labels)
        encoders[col] = le
        print(f"{col} unique categories: {len(le.classes_)}")
    return train_df, test_df, encoders

# Encode the categorical columns of both splits; cat_encoders is reused
# later to derive the embedding cardinalities.
train, test, cat_encoders = encode_categoricals(
    train_df=train,
    test_df=test,
    cat_cols=cat_cols,
)


gender unique categories: 3
age_group unique categories: 9
inventory_id unique categories: 18
l_feat_14 unique categories: 3286


## 4. MODULE DEFINITIONS

In [None]:
class ClickDataset(Dataset):
    """Row-wise dataset yielding numeric, categorical and sequence tensors.

    The sequence column holds comma-separated numbers stored as text; each
    row is parsed on access. A missing or empty sequence (or one that parses
    to no values) is represented by the single-element sequence ``[0.0]`` so
    that every sample has length >= 1 and never injects NaN into the model.

    Args:
        df: Source frame; its index is reset on a copy kept as ``self.df``.
        num_cols: Numeric feature columns (missing values become 0).
        cat_cols: Categorical columns, expected to already hold integer codes.
        seq_col: Name of the comma-separated sequence column.
        target_col: Label column; required when ``has_target`` is True.
        has_target: Whether samples include the label.
    """

    def __init__(self, df, num_cols, cat_cols, seq_col, target_col=None, has_target=True):
        self.df = df.reset_index(drop=True)
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.seq_col = seq_col
        self.target_col = target_col
        self.has_target = has_target

        # Stored as float32: every row is converted to a float32 tensor in
        # __getitem__ anyway, so float64 storage only doubled the memory use.
        self.num_X = self.df[self.num_cols].astype(np.float32).fillna(0).values
        self.cat_X = self.df[self.cat_cols].astype(int).values

        # Replace missing sequences with "" *before* stringifying; otherwise
        # they would turn into the text "nan"/"None" and be parsed as data.
        seq_series = self.df[self.seq_col]
        self.seq_strings = seq_series.where(seq_series.notna(), "").astype(str).values

        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(num_x, cat_x, seq[, y])`` for row ``idx``.

        ``num_x`` is a float32 vector, ``cat_x`` an int64 vector, ``seq`` a
        1-D float32 tensor of variable length (at least 1) and ``y`` a
        float32 scalar tensor (only when ``has_target`` is True).
        """
        num_x = torch.tensor(self.num_X[idx], dtype=torch.float)
        cat_x = torch.tensor(self.cat_X[idx], dtype=torch.long)

        s = self.seq_strings[idx]
        if s:
            arr = np.fromstring(s, sep=",", dtype=np.float32)
        else:
            arr = np.empty(0, dtype=np.float32)
        if arr.size == 0:
            # Keep a one-step placeholder so padding/packing never sees an
            # empty sequence.
            arr = np.zeros(1, dtype=np.float32)
        seq = torch.from_numpy(arr)

        if self.has_target:
            y = torch.tensor(self.y[idx], dtype=torch.float)
            return num_x, cat_x, seq, y
        return num_x, cat_x, seq

def collate_fn_train(batch):
    """Collate ``(num_x, cat_x, seq, y)`` samples into batched training tensors.

    Args:
        batch: List of 4-tuples as produced by ``ClickDataset`` with targets.

    Returns:
        Tuple ``(num_x, cat_x, seqs_padded, seq_lengths, ys)``. Sequences are
        right-padded with 0.0 to the longest one in the batch; the reported
        lengths are clamped to at least 1.
    """
    numeric_rows, categorical_rows, sequences, labels = zip(*batch)

    # Lengths are floored at 1 so a zero-length sequence still counts as one step.
    lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long).clamp(min=1)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)

    return (
        torch.stack(numeric_rows),
        torch.stack(categorical_rows),
        padded,
        lengths,
        torch.stack(labels),
    )

def collate_fn_infer(batch):
    """Collate label-free ``(num_x, cat_x, seq)`` samples for inference.

    Args:
        batch: List of 3-tuples as produced by ``ClickDataset`` without targets.

    Returns:
        Tuple ``(num_x, cat_x, seqs_padded, seq_lengths)``. Sequences are
        right-padded with 0.0 and the lengths are clamped to at least 1.
    """
    numeric_rows, categorical_rows, sequences = zip(*batch)

    stacked_numeric = torch.stack(numeric_rows)
    stacked_categorical = torch.stack(categorical_rows)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)

    # Floor the lengths at 1 so a zero-length sequence still counts as one step.
    raw_lengths = [len(sequence) for sequence in sequences]
    lengths = torch.clamp(torch.tensor(raw_lengths, dtype=torch.long), min=1)

    return stacked_numeric, stacked_categorical, padded, lengths

class CrossNetwork(nn.Module):
    """Stack of cross layers computing ``x_{l+1} = x_0 * (w_l . x_l + b_l) + x_l``.

    Each layer projects the running representation to a single scalar per
    sample, uses it to rescale the original input ``x_0`` and adds the result
    back onto the running representation.

    Args:
        input_dim: Width of the feature vector.
        num_layers: Number of cross layers.
    """

    def __init__(self, input_dim, num_layers=2):
        super().__init__()
        projections = []
        for _ in range(num_layers):
            projections.append(nn.Linear(input_dim, 1, bias=True))
        self.layers = nn.ModuleList(projections)

    def forward(self, x0):
        """Apply every cross layer in order; the output has the shape of ``x0``."""
        crossed = x0
        for projection in self.layers:
            scale = projection(crossed)  # one scalar per sample, broadcast over features
            crossed = x0 * scale + crossed
        return crossed

class WideDeepCTR(nn.Module):
    """CTR model: embeddings + batch-normed numerics + BiLSTM summary -> cross net -> MLP.

    The three feature groups are concatenated, passed through a
    ``CrossNetwork`` and the crossed vector is fed to an MLP that emits one
    logit per sample.

    Args:
        num_features: Number of numeric input features.
        cat_cardinalities: Number of categories of each categorical field.
        emb_dim: Embedding width shared by all categorical fields.
        lstm_hidden: Hidden size of each LSTM direction.
        hidden_units: Widths of the MLP hidden layers.
        dropout: Dropout rate(s) for the MLP. Either a single number applied
            to every hidden layer or a sequence that is cycled over the
            layers. An empty sequence disables dropout.
    """

    def __init__(self, num_features, cat_cardinalities, emb_dim=16, lstm_hidden=64,
                 hidden_units=(512, 256, 128), dropout=(0.1, 0.2, 0.3)):
        super().__init__()
        # Normalise `dropout` to a non-empty tuple so it can be cycled safely.
        if isinstance(dropout, (int, float)):
            dropout_rates = (float(dropout),)
        else:
            dropout_rates = tuple(dropout) or (0.0,)

        self.emb_layers = nn.ModuleList([
            nn.Embedding(cardinality, emb_dim) for cardinality in cat_cardinalities
        ])
        cat_input_dim = emb_dim * len(cat_cardinalities)
        self.bn_num = nn.BatchNorm1d(num_features)
        self.lstm = nn.LSTM(input_size=1, hidden_size=lstm_hidden,
                            num_layers=2, batch_first=True, bidirectional=True)
        seq_out_dim = lstm_hidden * 2  # forward + backward final hidden states

        input_dim = num_features + cat_input_dim + seq_out_dim
        self.cross = CrossNetwork(input_dim, num_layers=2)

        layers = []
        for i, h in enumerate(hidden_units):
            layers += [nn.Linear(input_dim, h), nn.ReLU(),
                       nn.Dropout(dropout_rates[i % len(dropout_rates)])]
            input_dim = h
        layers += [nn.Linear(input_dim, 1)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, num_x, cat_x, seqs, seq_lengths):
        """Return one logit per sample.

        Args:
            num_x: Numeric features, normalised by ``BatchNorm1d``.
            cat_x: Integer category codes, one column per categorical field.
            seqs: Padded sequences; a trailing feature axis of size 1 is
                added here because the LSTM has ``input_size=1``.
            seq_lengths: True length of each sequence (moved to CPU for packing).
        """
        num_x = self.bn_num(num_x)
        cat_embs = [emb(cat_x[:, i]) for i, emb in enumerate(self.emb_layers)]
        cat_feat = torch.cat(cat_embs, dim=1)

        seqs = seqs.unsqueeze(-1)
        packed = nn.utils.rnn.pack_padded_sequence(seqs, seq_lengths.cpu(),
                                                   batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)
        # Last two entries of h_n: the two directions of the top LSTM layer.
        h = torch.cat([h_n[-2], h_n[-1]], dim=1)

        z = torch.cat([num_x, cat_feat, h], dim=1)
        z_cross = self.cross(z)
        out = self.mlp(z_cross)
        return out.squeeze(1)

### GDCN

In [None]:

from basic_layers import FeaturesEmbedding, MultiLayerPerceptron

class GDCNModule(nn.Module):
    """Parallel GDCN over categorical fields: gated cross net + MLP -> one logit.

    Args:
        cat_cardinalities: Number of categories of each categorical field.
        emb_dim: Embedding width; an int is shared by all fields, otherwise
            it is treated as a sequence of per-field widths.
        cn_layers: Number of gated cross layers.
        mlp_layers: Hidden widths of the deep branch.
        dropout: Dropout rate handed to the deep branch.
    """

    def __init__(self, cat_cardinalities, emb_dim=16, cn_layers=3, mlp_layers=(400, 400, 400), dropout=0.5):
        super().__init__()
        # FeaturesEmbedding is imported from the project-local `basic_layers` module.
        self.embedding = FeaturesEmbedding(cat_cardinalities, emb_dim, concat=True)

        # Width of the concatenated field embeddings.
        self.embed_output_dim = (
            len(cat_cardinalities) * emb_dim if isinstance(emb_dim, int) else sum(emb_dim)
        )

        self.cross_net = GateCorssLayer(self.embed_output_dim, cn_layers)
        self.deep = MultiLayerPerceptron(self.embed_output_dim, mlp_layers, output_layer=False, dropout=dropout)
        # The head consumes the cross output concatenated with the deep output.
        head_in = self.embed_output_dim + mlp_layers[-1]
        self.output_layer = nn.Linear(head_in, 1)

    def forward(self, cat_x):
        """Return one logit per sample for the integer-coded fields ``cat_x``."""
        embedded = self.embedding(cat_x)
        cross_branch = self.cross_net(embedded)
        deep_branch = self.deep(embedded)
        merged = torch.cat([cross_branch, deep_branch], dim=1)
        return self.output_layer(merged).squeeze(1)


In [None]:
import torch
import torch.nn as nn


class GDCNP(nn.Module):
    """GDCN, parallel variant: the gated cross net and the MLP both read the embeddings.

    Their outputs are concatenated and mapped to a single value by a linear
    layer. Unlike ``GDCNModule`` the result is returned without squeezing.

    Args:
        field_dims: Number of categories of each field.
        embed_dim: Embedding width; an int is shared by all fields, otherwise
            it is treated as a sequence of per-field widths.
        cn_layers: Number of gated cross layers.
        mlp_layers: Hidden widths of the MLP branch.
        dropout: Dropout rate handed to the MLP branch.
    """

    def __init__(self, field_dims, embed_dim, cn_layers=3, mlp_layers=(400, 400, 400), dropout=0.5):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim, concat=True)

        # Width of the concatenated field embeddings.
        shared_width = isinstance(embed_dim, int)
        self.embed_output_dim = len(field_dims) * embed_dim if shared_width else sum(embed_dim)

        self.cross_net = GateCorssLayer(self.embed_output_dim, cn_layers)
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_layers, output_layer=False, dropout=dropout)
        self.fc = torch.nn.Linear(mlp_layers[-1] + self.embed_output_dim, 1)

    def forward(self, x):
        """Return the linear head's output for the integer-coded fields ``x``."""
        embedded = self.embedding(x)
        cross_branch = self.cross_net(embedded)
        mlp_branch = self.mlp(embedded)
        return self.fc(torch.cat([cross_branch, mlp_branch], dim=1))


class GDCNS(torch.nn.Module):
    """GDCN, stacked variant: embeddings -> gated cross net -> MLP predictor.

    Args:
        field_dims: Number of categories of each field.
        embed_dim: Embedding width; an int is shared by all fields, otherwise
            it is treated as a sequence of per-field widths.
        cn_layers: Number of gated cross layers.
        mlp_layers: Hidden widths of the prediction MLP.
        dropout: Dropout rate handed to the prediction MLP.
    """

    def __init__(self, field_dims, embed_dim, cn_layers=3, mlp_layers=(400, 400, 400), dropout=0.5):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim, concat=True)

        # Width of the concatenated field embeddings.
        shared_width = isinstance(embed_dim, int)
        self.embed_output_dim = len(field_dims) * embed_dim if shared_width else sum(embed_dim)

        self.cross_net = GateCorssLayer(self.embed_output_dim, cn_layers)
        # The MLP is built with output_layer=True and serves as the prediction head.
        self.pred_layer = MultiLayerPerceptron(
            self.embed_output_dim, mlp_layers, output_layer=True, dropout=dropout
        )

    def forward(self, x):
        """Return the prediction MLP's output for the integer-coded fields ``x``."""
        crossed = self.cross_net(self.embedding(x))
        return self.pred_layer(crossed)


class GateCorssNetwork(torch.nn.Module):
    """Gated cross network alone: embeddings -> gated cross layers -> linear head.

    Args:
        field_dims: Number of categories of each field.
        embed_dim: Embedding width; an int is shared by all fields, otherwise
            it is treated as a sequence of per-field widths.
        cn_layers: Number of gated cross layers.
    """

    def __init__(self, field_dims, embed_dim, cn_layers=3):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim, concat=True)

        # Width of the concatenated field embeddings.
        shared_width = isinstance(embed_dim, int)
        self.embed_output_dim = len(field_dims) * embed_dim if shared_width else sum(embed_dim)

        self.cross_net = GateCorssLayer(self.embed_output_dim, cn_layers)
        self.pred_layer = torch.nn.Linear(self.embed_output_dim, 1)

    def forward(self, x):
        """Return the linear head's output for the integer-coded fields ``x``."""
        crossed = self.cross_net(self.embedding(x))
        return self.pred_layer(crossed)


class GateCorssLayer(nn.Module):
    """Gated cross layers, the core structure of GDCN.

    Layer ``l`` computes
    ``x_{l+1} = x_0 * (W_l x_l + b_l) * sigmoid(Wg_l x_l) + x_l``,
    i.e. a feature-crossing term modulated element-wise by an information
    gate, plus a residual connection.

    Args:
        input_dim: Width of the input (and output) vector.
        cn_layers: Number of stacked gated cross layers.
    """

    def __init__(self, input_dim, cn_layers=3):
        super().__init__()
        self.cn_layers = cn_layers

        def square_projections():
            # One bias-free input_dim x input_dim projection per layer.
            return nn.ModuleList(
                [nn.Linear(input_dim, input_dim, bias=False) for _ in range(cn_layers)]
            )

        self.w = square_projections()   # feature-crossing weights
        self.wg = square_projections()  # gate weights

        # Per-layer bias vectors, created as zeros and then drawn from U(0, 1).
        self.b = nn.ParameterList(
            [nn.Parameter(torch.zeros((input_dim,))) for _ in range(cn_layers)]
        )
        for bias in self.b:
            nn.init.uniform_(bias.data)

        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Apply all gated cross layers; the output has the shape of ``x``."""
        x0 = x
        out = x
        for layer_idx in range(self.cn_layers):
            crossed = self.w[layer_idx](out) + self.b[layer_idx]   # feature crossing
            gate = self.activation(self.wg[layer_idx](out))        # information gate
            out = x0 * crossed * gate + out
        return out

In [None]:
class HybridGDCN(nn.Module):
    """GDCN over categorical embeddings, numeric features and a BiLSTM sequence summary.

    The three feature groups are concatenated and fed in parallel to a gated
    cross network and an MLP; both outputs are concatenated and mapped to one
    logit per sample.

    Args:
        num_features: Number of numeric input features.
        cat_cardinalities: Number of categories of each categorical field.
        emb_dim: Embedding width; an int is shared by all fields, otherwise
            it is treated as a sequence of per-field widths.
        lstm_hidden: Hidden size of each LSTM direction.
        cn_layers: Number of gated cross layers.
        mlp_layers: Hidden widths of the deep branch.
        dropout: Dropout rate handed to the deep branch.
    """

    def __init__(self, num_features, cat_cardinalities, emb_dim=16, lstm_hidden=64,
                 cn_layers=3, mlp_layers=(400, 400, 400), dropout=0.5):
        super().__init__()
        self.embedding = FeaturesEmbedding(cat_cardinalities, emb_dim, concat=True)
        self.bn_num = nn.BatchNorm1d(num_features)
        self.lstm = nn.LSTM(input_size=1, hidden_size=lstm_hidden, num_layers=2,
                            batch_first=True, bidirectional=True)

        # Width of the concatenated field embeddings.
        cat_feat_dim = len(cat_cardinalities) * emb_dim if isinstance(emb_dim, int) else sum(emb_dim)

        # Both LSTM directions contribute one hidden vector each.
        self.seq_out_dim = lstm_hidden * 2
        self.input_dim = cat_feat_dim + num_features + self.seq_out_dim

        self.cross_net = GateCorssLayer(self.input_dim, cn_layers)
        self.deep = MultiLayerPerceptron(self.input_dim, mlp_layers, output_layer=False, dropout=dropout)
        self.output_layer = nn.Linear(self.input_dim + mlp_layers[-1], 1)

    def forward(self, num_x, cat_x, seqs, seq_lengths):
        """Return one logit per sample.

        Args:
            num_x: Numeric features, normalised by ``BatchNorm1d``.
            cat_x: Integer category codes, one column per categorical field.
            seqs: Padded sequences; a trailing feature axis of size 1 is
                added here because the LSTM has ``input_size=1``.
            seq_lengths: True length of each sequence (moved to CPU for packing).
        """
        numeric = self.bn_num(num_x)
        cat_feat = self.embedding(cat_x)

        packed = nn.utils.rnn.pack_padded_sequence(
            seqs.unsqueeze(-1), seq_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # Last two entries of h_n: the two directions of the top LSTM layer.
        seq_summary = torch.cat([h_n[-2], h_n[-1]], dim=1)

        fused = torch.cat([cat_feat, numeric, seq_summary], dim=1)
        cross_out = self.cross_net(fused)
        deep_out = self.deep(fused)
        logits = self.output_layer(torch.cat([cross_out, deep_out], dim=1))
        return logits.squeeze(1)


## 5. TRAINING

In [None]:
def train_model(train_df, num_cols, cat_cols, seq_col, target_col, batch_size, epochs, lr, device,
                model_type='wide_deep', cat_cardinalities=None):
    """Train one of the CTR models on ``train_df`` and return it.

    The loss is ``BCEWithLogitsLoss`` with ``pos_weight = #negatives / #positives``,
    optimised with AdamW and a cosine-annealing-with-warm-restarts schedule.

    Args:
        train_df: Training frame with encoded categorical columns.
        num_cols: Numeric feature columns.
        cat_cols: Categorical feature columns (integer codes).
        seq_col: Comma-separated sequence column.
        target_col: Binary label column.
        batch_size: Mini-batch size.
        epochs: Number of passes over the data.
        lr: Initial learning rate.
        device: Torch device (or device string) to train on.
        model_type: ``'gdcn'``, ``'hybrid_gdcn'``; any other value selects
            ``WideDeepCTR``.
        cat_cardinalities: Number of categories per column in ``cat_cols``.
            When omitted it is derived from the notebook-level ``cat_encoders``
            dict, which must then already exist.

    Returns:
        The trained model, left on ``device``.

    Raises:
        ValueError: If ``train_df`` contains no positive labels, because the
            positive-class weight would be undefined.
    """
    device = torch.device(device)

    train_dataset = ClickDataset(train_df, num_cols, cat_cols, seq_col, target_col, True)
    # Pinned host memory only helps (and is only supported without a warning) for CUDA.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_fn_train, pin_memory=(device.type == "cuda"))

    if cat_cardinalities is None:
        # Backward-compatible fallback to the global produced by encode_categoricals.
        cat_cardinalities = [len(cat_encoders[c].classes_) for c in cat_cols]

    if model_type == 'gdcn':
        model = GDCNModule(cat_cardinalities, emb_dim=16).to(device)
    elif model_type == 'hybrid_gdcn':
        model = HybridGDCN(num_features=len(num_cols),
                           cat_cardinalities=cat_cardinalities,
                           emb_dim=16).to(device)
    else:
        model = WideDeepCTR(num_features=len(num_cols),
                            cat_cardinalities=cat_cardinalities,
                            emb_dim=16).to(device)

    # Re-weight positives by the negative/positive ratio to offset class imbalance.
    n_pos = float(train_df[target_col].sum())
    if n_pos <= 0:
        raise ValueError(f"No positive samples in '{target_col}'; cannot compute pos_weight.")
    pos_weight_value = (len(train_df) - n_pos) / n_pos
    pos_weight = torch.tensor([pos_weight_value], dtype=torch.float).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2, T_mult=2)

    num_batches = len(train_loader)

    print("학습 시작")
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        progress = tqdm(train_loader, desc=f"[Train Epoch {epoch}]")
        for step, batch in enumerate(progress, start=1):
            num_x, cat_x, seqs, lens, ys = batch
            cat_x, ys = cat_x.to(device), ys.to(device)
            if model_type == 'gdcn':
                # The categorical-only model ignores numeric and sequence inputs.
                logits = model(cat_x)
            else:
                # `lens` stays on the CPU: the models move it there anyway for packing.
                logits = model(num_x.to(device), cat_x, seqs.to(device), lens)

            loss = criterion(logits, ys)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Step with a fractional epoch so T_0 / T_mult are measured in
            # epochs. A bare per-batch step() made the first restart happen
            # after only two batches.
            scheduler.step(epoch - 1 + step / num_batches)
            total_loss += loss.item() * ys.size(0)

        total_loss /= len(train_dataset)
        print(f"[Epoch {epoch}] Train Loss: {total_loss:.4f}")
    print("학습 완료")
    return model


In [None]:
# model = train_model(
#     train_df=train,
#     num_cols=num_cols,
#     cat_cols=cat_cols,
#     seq_col=seq_col,
#     target_col=target_col,
#     batch_size=CFG['BATCH_SIZE'],
#     epochs=CFG['EPOCHS'],
#     lr=CFG['LEARNING_RATE'],
#     device=device,
#     model_type='gdcn'  # Options: 'gdcn', 'hybrid_gdcn', 'wide_deep'
# )

# public lb : 0.321

학습 시작


[Train Epoch 1]: 100%|██████████| 10454/10454 [21:59<00:00,  7.92it/s]


[Epoch 1] Train Loss: 1.2761


[Train Epoch 2]: 100%|██████████| 10454/10454 [21:35<00:00,  8.07it/s]


[Epoch 2] Train Loss: 1.2606


[Train Epoch 3]: 100%|██████████| 10454/10454 [22:31<00:00,  7.73it/s]


[Epoch 3] Train Loss: 1.2549


[Train Epoch 4]: 100%|██████████| 10454/10454 [22:09<00:00,  7.86it/s]


[Epoch 4] Train Loss: 1.2555


[Train Epoch 5]: 100%|██████████| 10454/10454 [21:46<00:00,  8.00it/s]


[Epoch 5] Train Loss: 1.2515
학습 완료


In [None]:
# Train the hybrid GDCN (embeddings + numeric features + sequence summary).
model = train_model(
    # data and column roles
    train_df=train,
    target_col=target_col,
    seq_col=seq_col,
    num_cols=num_cols,
    cat_cols=cat_cols,
    # optimisation settings
    lr=CFG['LEARNING_RATE'],
    epochs=CFG['EPOCHS'],
    batch_size=CFG['BATCH_SIZE'],
    device=device,
    # architecture; options: 'gdcn', 'hybrid_gdcn', 'wide_deep'
    model_type='hybrid_gdcn',
)


학습 시작


[Train Epoch 1]: 100%|██████████| 10454/10454 [1:24:58<00:00,  2.05it/s]


[Epoch 1] Train Loss: 1.2576


[Train Epoch 2]: 100%|██████████| 10454/10454 [1:25:50<00:00,  2.03it/s]


[Epoch 2] Train Loss: 1.1859


[Train Epoch 3]: 100%|██████████| 10454/10454 [1:25:38<00:00,  2.03it/s]


[Epoch 3] Train Loss: 1.1633


[Train Epoch 4]: 100%|██████████| 10454/10454 [1:25:22<00:00,  2.04it/s]


[Epoch 4] Train Loss: 1.1666


[Train Epoch 5]: 100%|██████████| 10454/10454 [1:26:03<00:00,  2.02it/s]


[Epoch 5] Train Loss: 1.1504
학습 완료


In [None]:
# Save the trained weights inside the run directory. The directory was
# previously created but the checkpoint was written to the working directory.
save_dir = '0926'
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'model_hybrid_gdcn_5epoch.pt')
torch.save(model.state_dict(), save_path)
print(f"모델 저장 완료: {save_path}")

✅ 모델 저장 완료: 0926/model_hybrid_gdcn.pt
