In [None]:
# Mount Google Drive inside the Colab VM so files stored there become reachable under /gdrive.
from google.colab import drive
drive.mount('/gdrive')

# Make the data folder the working directory; later cells read and write with relative paths (e.g. ./train.parquet).
%cd /gdrive/MyDrive/open/

Mounted at /gdrive
/gdrive/MyDrive/open


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.utils.rnn as rnn_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from tqdm import tqdm
import matplotlib.pyplot as plt


# Raw training table, kept untouched so later cells can still see the original category values.
all_train = pd.read_parquet("./train.parquet", engine="pyarrow")

# Test table without its identifier column (the row order is left as stored in the file).
test = pd.read_parquet("./test.parquet", engine="pyarrow")
test = test.drop(columns=['ID'])

# Working copy of the training rows, shuffled deterministically and re-indexed 0..N-1.
train_df = all_train.sample(frac=1, random_state=42)
train_df = train_df.reset_index(drop=True)

In [None]:
# Column roles used throughout the notebook.
target_col = "clicked"
seq_col = "seq"

# Columns fed through embedding layers; everything else matching the prefixes below is numeric.
categorical_cols = ['gender', 'age_group', 'inventory_id', 'day_of_week', 'hour', 'l_feat_14']
numerical_feat_cols = [c for c in train_df.columns if c.startswith('feat_')]
numerical_history_cols = [c for c in train_df.columns if c.startswith('history_')]
numerical_l_feat_cols = [c for c in train_df.columns if c.startswith('l_feat_') and c not in categorical_cols]
numerical_cols = numerical_feat_cols + numerical_history_cols + numerical_l_feat_cols

# Label encoding.
# The encoder is fitted on the union of train and test values (as strings, so missing
# values become their own category) to guarantee every test value has an index.
vocab_sizes = {}
for col in categorical_cols:
    all_categories = pd.concat([all_train[col], test[col]]).astype(str).unique()
    vocab_sizes[col] = len(all_categories) + 1
    le = LabelEncoder()
    le.fit(all_categories)
    train_df[col] = le.transform(train_df[col].astype(str)).astype(int)
    test[col] = le.transform(test[col].astype(str)).astype(int)

# Turn +/-inf into NaN *before* scaling: StandardScaler ignores NaN when fitting and
# passes it through on transform, but it raises on infinite input, so sanitising only
# after scaling (as was done previously) could never take effect.
train_df[numerical_cols] = train_df[numerical_cols].replace([np.inf, -np.inf], np.nan)
test[numerical_cols] = test[numerical_cols].replace([np.inf, -np.inf], np.nan)

# Standard scaling (statistics come from the training rows only).
scaler = StandardScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

# NaN / Inf handling: after scaling, 0 is the column mean, so missing values are mean-imputed.
train_df[numerical_cols] = train_df[numerical_cols].fillna(0).replace([np.inf, -np.inf], 0)
test[numerical_cols] = test[numerical_cols].fillna(0).replace([np.inf, -np.inf], 0)

In [None]:
class ClickDataset(Dataset):
    """Row-wise dataset yielding categorical, numeric and sequence tensors for one sample.

    Each item is a tuple
    ``(cat_features, num_feat, num_history, num_l_feat, seq_tensor[, y])`` where the
    first tensor is ``torch.long``, the others are ``torch.float`` and ``y`` is only
    present when ``has_target`` is true.

    Args:
        df: Source frame; its index is reset so items are addressed positionally.
        categorical_cols: Integer-encoded categorical column names.
        numerical_feat_cols / numerical_history_cols / numerical_l_feat_cols:
            Names of the three numeric column groups.
        seq_col: Column holding a comma-separated string of numbers.
        target_col: Label column name; required when ``has_target`` is true.
        has_target: Whether items include the label.

    The column groups are converted to NumPy arrays once in ``__init__``. Indexing a
    DataFrame with ``.loc`` for every sample is very slow, whereas array row access is
    cheap. The cost is one extra in-memory copy of the selected columns.
    """

    def __init__(self, df, categorical_cols, numerical_feat_cols, numerical_history_cols, numerical_l_feat_cols, seq_col, target_col=None, has_target=True):
        self.df = df.reset_index(drop=True)
        self.categorical_cols = categorical_cols
        self.numerical_feat_cols = numerical_feat_cols
        self.numerical_history_cols = numerical_history_cols
        self.numerical_l_feat_cols = numerical_l_feat_cols
        self.seq_col = seq_col
        self.target_col = target_col
        self.has_target = has_target

        # Materialise each column group once so __getitem__ is a plain array lookup.
        self._cat = self.df[list(self.categorical_cols)].to_numpy().astype(np.int64)
        self._num_feat = self.df[list(self.numerical_feat_cols)].to_numpy().astype(np.float32)
        self._num_history = self.df[list(self.numerical_history_cols)].to_numpy().astype(np.float32)
        self._num_l_feat = self.df[list(self.numerical_l_feat_cols)].to_numpy().astype(np.float32)
        self._seq = self.df[self.seq_col].to_numpy()

        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # torch.tensor copies, so returned tensors never alias the cached arrays.
        cat_features = torch.tensor(self._cat[idx], dtype=torch.long)
        num_feat = torch.tensor(self._num_feat[idx], dtype=torch.float)
        num_history = torch.tensor(self._num_history[idx], dtype=torch.float)
        num_l_feat = torch.tensor(self._num_l_feat[idx], dtype=torch.float)

        # Missing or empty sequences fall back to a single 0.0 so every sample has length >= 1.
        seq_string = self._seq[idx]
        if isinstance(seq_string, str) and seq_string:
            seq_array = np.fromstring(seq_string, sep=",", dtype=np.float32)
        else:
            seq_array = np.array([0.0], dtype=np.float32)
        seq_tensor = torch.from_numpy(seq_array)

        if self.has_target:
            y = torch.tensor(self.y[idx], dtype=torch.float)
            return cat_features, num_feat, num_history, num_l_feat, seq_tensor, y
        return cat_features, num_feat, num_history, num_l_feat, seq_tensor

def _collate_features(cat_features, num_feat, num_history, num_l_feat, seqs):
    """Batch the per-sample feature tensors shared by the train and inference collate functions.

    Fixed-size tensors are stacked along a new batch dimension. The variable-length
    sequences are right-padded with 0.0 to the longest one in the batch, and their true
    lengths are returned alongside (clamped to at least 1 so they remain valid for
    ``pack_padded_sequence``).
    """
    seqs_padded = rnn_utils.pad_sequence(seqs, batch_first=True, padding_value=0.0)
    seq_lengths = torch.tensor([len(s) for s in seqs], dtype=torch.long)
    seq_lengths = torch.clamp(seq_lengths, min=1)
    return (
        torch.stack(cat_features),
        torch.stack(num_feat),
        torch.stack(num_history),
        torch.stack(num_l_feat),
        seqs_padded,
        seq_lengths,
    )


def collate_fn_train(batch):
    """Collate samples that carry a target.

    Returns:
        ``(cat_features, num_feat, num_history, num_l_feat, seqs_padded, seq_lengths, ys)``
        where ``ys`` has shape ``(batch, 1)``.
    """
    cat_features, num_feat, num_history, num_l_feat, seqs, ys = zip(*batch)
    features = _collate_features(cat_features, num_feat, num_history, num_l_feat, seqs)
    return (*features, torch.stack(ys).unsqueeze(1))


def collate_fn_infer(batch):
    """Collate samples without a target.

    Returns:
        ``(cat_features, num_feat, num_history, num_l_feat, seqs_padded, seq_lengths)``.
    """
    cat_features, num_feat, num_history, num_l_feat, seqs = zip(*batch)
    return _collate_features(cat_features, num_feat, num_history, num_l_feat, seqs)

## 모델 구조 정의

### Subtask:
사용자 요구사항에 맞춰 새로운 `ComplexModel` 클래스를 정의합니다. 이 클래스 안에는 다음 서브 모듈이 포함됩니다.
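- 범주형 피처를 위한 임베딩 레이어 (현재 구현에는 Transformer Encoder가 포함되어 있지 않으며, 각 임베딩을 그대로 이어 붙여 사용합니다).
- 시퀀스 데이터 처리를 위한 LSTM.
- `feat_*_*` 수치형 피처를 위한 MLP.
- `history_*_*` 수치형 피처를 위한 MLP.
- `l_feat_*` 수치형 피처(범주형으로 처리하는 `l_feat_14` 제외)를 위한 MLP.
- 모든 서브 모듈의 출력을 결합하고 최종 예측을 수행하는 MLP.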

In [None]:
class ComplexModel(nn.Module):
    """Click model combining embeddings, an LSTM over the sequence and three numeric MLPs.

    Sub-modules:
        * one ``nn.Embedding`` per categorical column (outputs are concatenated),
        * a unidirectional LSTM over the scalar sequence,
        * separate MLPs for the ``feat``, ``history`` and ``l_feat`` numeric groups,
        * a final MLP over the concatenation of all of the above, ending in one logit.

    Args:
        vocab_sizes: Mapping ``column -> number of embedding rows``. Its key order
            defines which column of ``cat_x`` feeds which embedding.
        embedding_dims: Mapping ``column -> embedding width``; must contain every key
            of ``vocab_sizes`` (extra keys are ignored).
        numerical_feat_dim / numerical_history_dim / numerical_l_feat_dim:
            Input widths of the three numeric groups.
        lstm_hidden_dim, lstm_layers: LSTM size.
        mlp_hidden_units_*: Hidden layer widths of the corresponding MLP.
        final_mlp_hidden_units: Hidden layer widths of the output head.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, vocab_sizes, embedding_dims, numerical_feat_dim, numerical_history_dim, numerical_l_feat_dim,
                 lstm_hidden_dim=64, lstm_layers=1,
                 mlp_hidden_units_feat=[160, 80], mlp_hidden_units_history=[64, 32], mlp_hidden_units_l_feat=[32, 16],
                 final_mlp_hidden_units=[512, 256, 128], dropout=0.2):
        super().__init__()

        # Embeddings for categorical features.
        self.embedding_layers = nn.ModuleDict({col: nn.Embedding(vocab_sizes[col], embedding_dims[col]) for col in vocab_sizes})
        # Sum only over the columns that actually got an embedding; summing every value
        # of `embedding_dims` mis-sizes the final MLP if that dict has extra keys.
        total_cat_dim = sum(embedding_dims[col] for col in vocab_sizes)

        # LSTM for sequence data (each time step is a single scalar).
        self.lstm = nn.LSTM(input_size=1, hidden_size=lstm_hidden_dim, num_layers=lstm_layers, batch_first=True)

        def create_mlp(input_dim, hidden_units):
            """Build Linear -> ReLU -> Dropout stacks; return the module and its output width."""
            layers = []
            for h in hidden_units:
                layers += [nn.Linear(input_dim, h), nn.ReLU(), nn.Dropout(dropout)]
                input_dim = h
            return nn.Sequential(*layers), input_dim

        self.feat_mlp, feat_out_dim = create_mlp(numerical_feat_dim, mlp_hidden_units_feat)
        self.history_mlp, history_out_dim = create_mlp(numerical_history_dim, mlp_hidden_units_history)
        self.l_feat_mlp, l_feat_out_dim = create_mlp(numerical_l_feat_dim, mlp_hidden_units_l_feat)

        # Final MLP over the concatenated branch outputs, ending in a single logit.
        final_input_dim = total_cat_dim + lstm_hidden_dim + feat_out_dim + history_out_dim + l_feat_out_dim
        final_layers = []
        for h in final_mlp_hidden_units:
            final_layers += [nn.Linear(final_input_dim, h), nn.ReLU(), nn.Dropout(dropout)]
            final_input_dim = h
        final_layers += [nn.Linear(final_input_dim, 1)]
        self.final_mlp = nn.Sequential(*final_layers)

    def forward(self, cat_x, num_feat_x, num_history_x, num_l_feat_x, seq_x, seq_lengths):
        """Return one logit per sample, shape ``(batch,)``.

        Args:
            cat_x: ``(batch, n_categorical)`` long tensor, columns ordered like ``vocab_sizes``.
            num_feat_x, num_history_x, num_l_feat_x: ``(batch, dim)`` float tensors.
            seq_x: ``(batch, max_len)`` zero-padded sequences.
            seq_lengths: ``(batch,)`` true lengths, each >= 1.
        """
        # Column i of cat_x goes through the i-th embedding (ModuleDict keeps insertion order).
        cat_embs = [emb(cat_x[:, i]) for i, emb in enumerate(self.embedding_layers.values())]
        cat_embedded = torch.cat(cat_embs, dim=1)

        # Pack so the LSTM stops at each sequence's true length and never sees padding.
        packed_seq = rnn_utils.pack_padded_sequence(
            seq_x.unsqueeze(-1),  # add the feature dimension expected by the LSTM
            seq_lengths.cpu(),    # lengths must live on the CPU
            batch_first=True,
            enforce_sorted=False,
        )
        # For packed input, h_n holds the hidden state at each sequence's last valid step
        # (restored to the original batch order); h_n[-1] is the top layer. This equals
        # gathering the last valid output but avoids unpacking the whole sequence.
        _, (h_n, _) = self.lstm(packed_seq)
        lstm_processed = h_n[-1]

        # Numerical features.
        feat_out = self.feat_mlp(num_feat_x)
        history_out = self.history_mlp(num_history_x)
        l_feat_out = self.l_feat_mlp(num_l_feat_x)

        # Combine all branches and predict.
        combined = torch.cat([cat_embedded, lstm_processed, feat_out, history_out, l_feat_out], dim=1)
        logits = self.final_mlp(combined).squeeze(1)
        return logits

def calculate_weighted_logloss(y_true, y_pred, eps=1e-15):
    """Class-balanced log loss: the mean of the per-class average log losses.

    Args:
        y_true: Array-like of 0/1 labels.
        y_pred: Array-like of predicted probabilities for the positive class.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns:
        ``0.5 * logloss(negatives) + 0.5 * logloss(positives)``; a class with no
        samples contributes 0.
    """
    y_true = np.asarray(y_true)
    # Work in float64: in float32, ``1 - 1e-15`` rounds to exactly 1.0, so clipping a
    # float32 array would leave saturated predictions at 1.0 and produce log(0) = -inf.
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)
    mask_0 = (y_true == 0)
    mask_1 = (y_true == 1)
    ll_0 = -np.mean(np.log(1 - y_pred[mask_0])) if mask_0.sum() > 0 else 0
    ll_1 = -np.mean(np.log(y_pred[mask_1])) if mask_1.sum() > 0 else 0
    return 0.5 * ll_0 + 0.5 * ll_1

def calculate_competition_score(y_true, y_pred):
    """Blend average precision with an inverted class-balanced log loss.

    Returns:
        ``(score, ap, wll)`` where ``score = 0.5 * ap + 0.5 * (1 / (1 + wll))``,
        ``ap`` is the average precision and ``wll`` the weighted log loss.
    """
    avg_precision = average_precision_score(y_true, y_pred)
    weighted_ll = calculate_weighted_logloss(y_true, y_pred)
    # Mapping the loss through 1 / (1 + loss) makes lower loss score higher, like AP.
    blended = 0.5 * avg_precision + 0.5 * (1 / (1 + weighted_ll))
    return blended, avg_precision, weighted_ll

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; train when ``optimizer`` is given, otherwise evaluate.

    Returns:
        ``(mean_loss, preds, targets)`` where ``mean_loss`` is the sample-weighted mean
        of the batch losses, ``preds`` are sigmoid probabilities and ``targets`` the
        labels, both as 1-D NumPy arrays in loader order.
    """
    is_train = optimizer is not None
    model.train(is_train)
    loss_sum, n_seen = 0.0, 0
    preds, targets = [], []
    with torch.set_grad_enabled(is_train):
        for batch in loader:
            cat_x, num_feat, num_history, num_l_feat, seq_x, seq_len, y = (t.to(device) for t in batch)
            y = y.squeeze(1)  # (batch, 1) -> (batch,) to match the logits
            if is_train:
                optimizer.zero_grad()
            logits = model(cat_x, num_feat, num_history, num_l_feat, seq_x, seq_len)
            loss = criterion(logits, y)
            if is_train:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item() * y.size(0)
            n_seen += y.size(0)
            preds.append(torch.sigmoid(logits).detach().cpu().numpy())
            targets.append(y.cpu().numpy())
    return loss_sum / max(n_seen, 1), np.concatenate(preds), np.concatenate(targets)


def train_model(train_df, categorical_cols, numerical_feat_cols, numerical_history_cols, numerical_l_feat_cols, seq_col, target_col,
                vocab_sizes, embedding_dims, batch_size=1024, epochs=5, lr=1e-3, device="cuda",
                checkpoint_path='./best_model.pth'):
    """Train a ``ComplexModel`` with an 80/20 hold-out split and return the best checkpoint.

    The loss is ``BCEWithLogitsLoss`` with ``pos_weight = #negatives / #positives``
    computed on the training split. After every epoch the competition score is computed
    on both splits; the weights with the highest validation score are written to
    ``checkpoint_path`` and reloaded before returning.

    Note: the train-split metrics are collected during the training pass itself, i.e.
    with dropout active and while the weights are still being updated.

    Args:
        train_df: Preprocessed training frame containing all listed columns.
        categorical_cols, numerical_feat_cols, numerical_history_cols,
        numerical_l_feat_cols, seq_col, target_col: Column names passed to ``ClickDataset``.
        vocab_sizes, embedding_dims: Passed to ``ComplexModel``.
        batch_size, epochs, lr: Optimisation settings (AdamW).
        device: Torch device string.
        checkpoint_path: Where the best weights are stored.

    Returns:
        ``(model, history)`` where ``history`` has the per-epoch lists
        ``train_losses``, ``val_losses``, ``train_comp`` and ``val_comp``.

    Raises:
        ValueError: If the training split has no positive samples (``pos_weight``
            would be a division by zero).
    """
    tr_df, va_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)
    train_dataset = ClickDataset(tr_df, categorical_cols, numerical_feat_cols, numerical_history_cols, numerical_l_feat_cols, seq_col, target_col)
    val_dataset   = ClickDataset(va_df, categorical_cols, numerical_feat_cols, numerical_history_cols, numerical_l_feat_cols, seq_col, target_col)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_train, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_train, num_workers=2)

    model = ComplexModel(vocab_sizes, embedding_dims,
                         len(numerical_feat_cols), len(numerical_history_cols), len(numerical_l_feat_cols)).to(device)

    pos_count = tr_df[target_col].sum()
    if pos_count == 0:
        raise ValueError("Training split contains no positive samples; cannot compute pos_weight.")
    neg_count = len(tr_df) - pos_count
    pos_weight = torch.tensor(neg_count / pos_count, dtype=torch.float32).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    best_val_score = -float('inf')
    checkpoint_saved = False
    history = {'train_losses': [], 'val_losses': [], 'train_comp': [], 'val_comp': []}

    for epoch in range(1, epochs+1):
        train_loss, train_preds, train_true = _run_epoch(model, tqdm(train_loader), criterion, device, optimizer)
        train_score, _, _ = calculate_competition_score(train_true, train_preds)

        # Validation
        val_loss, val_preds, val_true = _run_epoch(model, val_loader, criterion, device)
        val_score, _, _ = calculate_competition_score(val_true, val_preds)

        history['train_losses'].append(train_loss)
        history['val_losses'].append(val_loss)
        history['train_comp'].append(train_score)
        history['val_comp'].append(val_score)

        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        print(f"[Epoch {epoch}] Train Loss:{train_loss:.4f}, Val Loss:{val_loss:.4f}, Train Score:{train_score:.4f}, Val Score:{val_score:.4f}")

    # Reload only a checkpoint written by this call; otherwise a stale file left by an
    # earlier run could silently replace the weights (e.g. when epochs == 0).
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return model, history

## 모델 학습, 추론 및 제출 파일 생성 (컬럼 목록 정의 포함)

### Subtask:
정의된 `ComplexModel`을 사용하여 모델을 학습하고, 학습된 모델로 테스트 데이터에 대한 추론을 수행하여 제출 파일을 생성합니다. 필요한 컬럼 목록 변수들을 이 셀 내에서 명확히 정의합니다.

In [None]:
# Embedding width per categorical column; the keys must cover every entry of `categorical_cols`.
embedding_dims = {
    'gender': 4,
    'age_group': 4,
    'inventory_id': 4,
    'day_of_week': 4,
    'hour': 4,
    'l_feat_14': 16,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Train with the default batch size, epoch count and learning rate of `train_model`.
model, history = train_model(
    train_df,
    categorical_cols,
    numerical_feat_cols,
    numerical_history_cols,
    numerical_l_feat_cols,
    seq_col,
    target_col,
    vocab_sizes,
    embedding_dims,
    device=device,
)

  1%|          | 74/8363 [01:24<2:37:32,  1.14s/it]


KeyboardInterrupt: 

In [None]:
# Side-by-side learning curves: loss on the left, competition score on the right.
fig, (loss_ax, score_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(history['train_losses'], label='Train Loss')
loss_ax.plot(history['val_losses'], label='Val Loss')
loss_ax.legend()
loss_ax.set_title('Loss')
loss_ax.set_xlabel('Epoch')

score_ax.plot(history['train_comp'], label='Train Score')
score_ax.plot(history['val_comp'], label='Val Score')
score_ax.legend()
score_ax.set_title('Competition Score')
score_ax.set_xlabel('Epoch')

plt.show()

In [None]:
# Batch inference over the test set; shuffle=False keeps predictions aligned with the rows of `test`.
test_dataset = ClickDataset(test, categorical_cols, numerical_feat_cols, numerical_history_cols, numerical_l_feat_cols, seq_col, has_target=False)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False, collate_fn=collate_fn_infer, num_workers=2)
model.eval()
outs = []
with torch.no_grad():
    for cat_x, num_feat, num_history, num_l_feat, seq_x, seq_len in test_loader:
        cat_x, num_feat, num_history, num_l_feat, seq_x, seq_len = cat_x.to(device), num_feat.to(device), num_history.to(device), num_l_feat.to(device), seq_x.to(device), seq_len.to(device)
        logits = model(cat_x, num_feat, num_history, num_l_feat, seq_x, seq_len)
        outs.append(torch.sigmoid(logits).cpu().numpy())
preds = np.concatenate(outs)

# The 'ID' column was dropped from `test` when it was loaded, so `test.index` is only a
# positional 0..N-1 range, not the real identifiers. Recover the IDs from the source
# file; `test` has not been reordered since loading, so rows still line up.
if 'ID' in test.columns:
    test_ids = test['ID'].to_numpy()
else:
    test_ids = pd.read_parquet("./test.parquet", engine="pyarrow", columns=['ID'])['ID'].to_numpy()
if len(test_ids) != len(preds):
    raise ValueError(f"ID count ({len(test_ids)}) does not match prediction count ({len(preds)})")

submission = pd.DataFrame({'ID': test_ids, 'clicked': preds})
submission.to_csv('./submission.csv', index=False)
print("Submission saved.")