In [1]:
# =======================
# BLOCK 1 - IMPORTS
# =======================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
# =======================
# BLOCK 2 - LOAD DATA
# =======================
PATH = "/kaggle/input/microlens/"

train = pd.read_parquet(PATH + "train.parquet")
valid = pd.read_parquet(PATH + "valid.parquet")
test = pd.read_parquet(PATH + "test.parquet")
item_info = pd.read_parquet(PATH + "item_info_fused_multimodal.parquet")

print("Train shape:", train.shape)
print("Valid shape:", valid.shape)
print("Test shape:", test.shape)
print("Item info shape:", item_info.shape)

Train shape: (3600000, 6)
Valid shape: (10000, 6)
Test shape: (379142, 6)
Item info shape: (91718, 3)


In [4]:
# =======================
# BLOCK 3 - PREPARE ITEM EMBEDDINGS
# =======================
print("\nPreparing item embeddings...")
item_info = item_info[~item_info["item_emb_d128"].isna()].reset_index(drop=True)
emb_matrix = np.vstack(item_info["item_emb_d128"].values)

# Simple normalization
emb_matrix = (emb_matrix - emb_matrix.mean(axis=0)) / (emb_matrix.std(axis=0) + 1e-8)

print(f"Item embedding shape: {emb_matrix.shape}")


Preparing item embeddings...
Item embedding shape: (91718, 128)


In [5]:
# =======================
# BLOCK 4 - CREATE ITEM EMBEDDING TENSOR
# =======================
# Create a mapping from item_id to embedding index
item_id_to_idx = {item_id: idx for idx, item_id in enumerate(item_info['item_id'].values)}

# Pad with zeros for unknown items
max_item_id = max(train['item_id'].max(), valid['item_id'].max(), test['item_id'].max())
print(f"Max item_id: {max_item_id}")

# Create full embedding matrix with padding
full_emb_matrix = np.zeros((max_item_id + 1, 128), dtype=np.float32)
for item_id, idx in item_id_to_idx.items():
    full_emb_matrix[item_id] = emb_matrix[idx]

print(f"Full embedding matrix shape: {full_emb_matrix.shape}")

Max item_id: 91717
Full embedding matrix shape: (91718, 128)


In [6]:
# =======================
# BLOCK 5 - DATASET
# =======================
class CTRDataset(Dataset):
    def __init__(self, df, is_test=False):
        self.user_id = df["user_id"].values
        self.item_id = df["item_id"].values
        self.item_seq = df["item_seq"].values
        self.likes = df["likes_level"].values
        self.views = df["views_level"].values
        self.is_test = is_test
        
        if not is_test:
            self.label = df["label"].values
        else:
            self.ID = df["ID"].values

    def __len__(self):
        return len(self.user_id)

    def __getitem__(self, idx):
        data = {
            "user_id": torch.tensor(self.user_id[idx], dtype=torch.long),
            "item_id": torch.tensor(self.item_id[idx], dtype=torch.long),
            "item_seq": torch.tensor(self.item_seq[idx], dtype=torch.long),
            "likes": torch.tensor(self.likes[idx], dtype=torch.long),
            "views": torch.tensor(self.views[idx], dtype=torch.long),
        }
        
        if self.is_test:
            data["ID"] = self.ID[idx]
        else:
            data["label"] = torch.tensor(self.label[idx], dtype=torch.float32)
        
        return data


In [7]:
# =======================
# BLOCK 6 - DATALOADERS
# =======================
train_ds = CTRDataset(train, is_test=False)
valid_ds = CTRDataset(valid, is_test=False)
test_ds = CTRDataset(test, is_test=True)

train_loader = DataLoader(train_ds, batch_size=2048, shuffle=True, num_workers=2)
valid_loader = DataLoader(valid_ds, batch_size=2048, shuffle=False, num_workers=2)
test_loader = DataLoader(test_ds, batch_size=2048, shuffle=False, num_workers=2)

print("Dataloaders ready!")

Dataloaders ready!


In [8]:
# =======================
# BLOCK 7 - ATTENTION POOLING MODULE
# =======================
class AttentionPooling(nn.Module):
    """Attention-based pooling for sequence embeddings"""
    def __init__(self, emb_dim):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(emb_dim, emb_dim // 2),
            nn.Tanh(),
            nn.Linear(emb_dim // 2, 1)
        )
    
    def forward(self, seq_emb, mask):
        # seq_emb: [batch, seq_len, emb_dim]
        # mask: [batch, seq_len, 1]
        attn_scores = self.attention(seq_emb)  # [batch, seq_len, 1]
        attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_weights = F.softmax(attn_scores, dim=1)
        pooled = (seq_emb * attn_weights).sum(dim=1)
        return pooled

In [9]:
# =======================
# BLOCK 8 - IMPROVED MODEL
# =======================
class ImprovedMMCTRModel(nn.Module):
    def __init__(self, num_users, num_items, item_emb_matrix, emb_dim=64):
        super().__init__()
        
        # Increased embedding dimension for better representation
        self.emb_dim = emb_dim
        
        # Learnable embeddings with better initialization
        self.user_emb = nn.Embedding(num_users + 1, emb_dim, padding_idx=0)
        self.item_emb = nn.Embedding(num_items, emb_dim, padding_idx=0)
        
        # Categorical feature embeddings
        self.likes_emb = nn.Embedding(11, emb_dim // 2)
        self.views_emb = nn.Embedding(11, emb_dim // 2)
        
        # Pre-trained multimodal embeddings
        self.register_buffer('item_mm_emb', torch.tensor(item_emb_matrix, dtype=torch.float32))
        
        # Transform multimodal embeddings to match dimension
        self.mm_transform = nn.Sequential(
            nn.Linear(128, emb_dim),
            nn.LayerNorm(emb_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Attention pooling for sequence
        self.seq_attention = AttentionPooling(emb_dim)
        
        # Feature interaction layers
        self.user_item_interaction = nn.Bilinear(emb_dim, emb_dim, emb_dim)
        self.seq_item_interaction = nn.Bilinear(emb_dim, emb_dim, emb_dim)
        
        # Calculate total feature dimension
        # user(64) + item(64) + seq(64) + likes(32) + views(32) + mm_transformed(64)
        # + user_item_interact(64) + seq_item_interact(64) = 448
        total_dim = emb_dim * 6 + (emb_dim // 2) * 2
        
        # Deep MLP with batch normalization and residual connections
        self.bn1 = nn.BatchNorm1d(total_dim)
        self.fc1 = nn.Linear(total_dim, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn4 = nn.BatchNorm1d(128)
        self.fc_out = nn.Linear(128, 1)
        
        self.dropout = nn.Dropout(0.3)
        
        # Xavier initialization
        self._init_weights()
    
    def _init_weights(self):
        nn.init.xavier_normal_(self.user_emb.weight)
        nn.init.xavier_normal_(self.item_emb.weight)
        nn.init.xavier_normal_(self.likes_emb.weight)
        nn.init.xavier_normal_(self.views_emb.weight)
        
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
    
    def forward(self, user_id, item_id, item_seq, likes, views):
        # Basic embeddings
        user_e = self.user_emb(user_id)
        item_e = self.item_emb(item_id)
        
        # Sequence with attention pooling
        seq_e = self.item_emb(item_seq)
        mask = (item_seq != 0).unsqueeze(-1).float()
        seq_e = self.seq_attention(seq_e, mask)
        
        # Categorical features
        likes_e = self.likes_emb(likes)
        views_e = self.views_emb(views)
        
        # Multimodal embeddings with transformation
        item_mm_e = self.item_mm_emb[item_id]
        item_mm_e = self.mm_transform(item_mm_e)
        
        # Feature interactions
        user_item_cross = self.user_item_interaction(user_e, item_e)
        seq_item_cross = self.seq_item_interaction(seq_e, item_e)
        
        # Concatenate all features
        x = torch.cat([
            user_e, item_e, seq_e, likes_e, views_e, item_mm_e,
            user_item_cross, seq_item_cross
        ], dim=1)
        
        # Deep network with residual connections
        x = self.bn1(x)
        x1 = F.relu(self.bn2(self.fc1(x)))
        x1 = self.dropout(x1)
        
        x2 = F.relu(self.bn3(self.fc2(x1)))
        x2 = self.dropout(x2)
        
        x3 = F.relu(self.bn4(self.fc3(x2)))
        x3 = self.dropout(x3)
        
        logits = self.fc_out(x3).squeeze(1)
        
        return logits

In [10]:
# =======================
# BLOCK 9 - LABEL SMOOTHING LOSS
# =======================
class LabelSmoothingBCELoss(nn.Module):
    def __init__(self, smoothing=0.05):
        super().__init__()
        self.smoothing = smoothing
    
    def forward(self, logits, targets):
        # Smooth labels: 0 -> smoothing/2, 1 -> 1-smoothing/2
        targets = targets * (1 - self.smoothing) + 0.5 * self.smoothing
        return F.binary_cross_entropy_with_logits(logits, targets)


In [11]:
# =======================
# BLOCK 10 - TRAINING SETUP
# =======================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")

num_users = train["user_id"].max()
num_items = full_emb_matrix.shape[0]

print(f"Num users: {num_users}, Num items: {num_items}")

model = ImprovedMMCTRModel(
    num_users=num_users,
    num_items=num_items,
    item_emb_matrix=full_emb_matrix,
    emb_dim=64  # Increased from 32
).to(device)

# Label smoothing loss for regularization
criterion = LabelSmoothingBCELoss(smoothing=0.05)

# Separate learning rates for different components
pretrained_params = []
other_params = []

for name, param in model.named_parameters():
    if 'mm_transform' in name:
        pretrained_params.append(param)
    else:
        other_params.append(param)

optimizer = torch.optim.AdamW([
    {'params': other_params, 'lr': 2e-3, 'weight_decay': 1e-4},
    {'params': pretrained_params, 'lr': 5e-4, 'weight_decay': 1e-5}
])

# Cosine annealing with warm restarts
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=5, T_mult=2, eta_min=1e-6
)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")



Using device: cuda
Num users: 1000000, Num items: 91718
Model parameters: 70,802,562


In [12]:
# =======================
# BLOCK 11 - TRAINING LOOP
# =======================
EPOCHS = 30
best_val_auc = 0
patience = 7  # Increased patience
patience_counter = 0

print("\nüöÄ Starting enhanced training...")
for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0
    train_preds = []
    train_labels = []
    
    for batch in train_loader:
        user_id = batch["user_id"].to(device)
        item_id = batch["item_id"].to(device)
        item_seq = batch["item_seq"].to(device)
        likes = batch["likes"].to(device)
        views = batch["views"].to(device)
        label = batch["label"].to(device)
        
        optimizer.zero_grad()
        
        logits = model(user_id, item_id, item_seq, likes, views)
        loss = criterion(logits, label)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_loss += loss.item()
        
        with torch.no_grad():
            probs = torch.sigmoid(logits)
            train_preds.extend(probs.cpu().numpy())
            train_labels.extend(label.cpu().numpy())
    
    # Step scheduler
    scheduler.step()
    
    train_auc = roc_auc_score(train_labels, train_preds)
    avg_loss = total_loss / len(train_loader)
    
    # Validation
    model.eval()
    val_preds = []
    val_labels = []
    
    with torch.no_grad():
        for batch in valid_loader:
            user_id = batch["user_id"].to(device)
            item_id = batch["item_id"].to(device)
            item_seq = batch["item_seq"].to(device)
            likes = batch["likes"].to(device)
            views = batch["views"].to(device)
            label = batch["label"].to(device)
            
            logits = model(user_id, item_id, item_seq, likes, views)
            probs = torch.sigmoid(logits)
            
            val_preds.extend(probs.cpu().numpy())
            val_labels.extend(label.cpu().numpy())
    
    val_auc = roc_auc_score(val_labels, val_preds)
    current_lr = optimizer.param_groups[0]['lr']
    
    print(f"Epoch {epoch+1:02d}/{EPOCHS} | Loss: {avg_loss:.4f} | "
          f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f} | LR: {current_lr:.6f}")
    
    # Save best model
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"‚úÖ New best! Val AUC: {val_auc:.4f}")
    else:
        patience_counter += 1
        print(f"‚è≥ No improvement. Patience {patience_counter}/{patience}")
        
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

model.load_state_dict(torch.load('best_model.pt'))
print(f"\nüéØ Best Validation AUC: {best_val_auc:.4f}")



üöÄ Starting enhanced training...
Epoch 01/30 | Loss: 0.2319 | Train AUC: 0.9744 | Val AUC: 0.8868 | LR: 0.001809
‚úÖ New best! Val AUC: 0.8868
Epoch 02/30 | Loss: 0.1513 | Train AUC: 0.9958 | Val AUC: 0.9003 | LR: 0.001309
‚úÖ New best! Val AUC: 0.9003
Epoch 03/30 | Loss: 0.1287 | Train AUC: 0.9990 | Val AUC: 0.8975 | LR: 0.000692
‚è≥ No improvement. Patience 1/7
Epoch 04/30 | Loss: 0.1208 | Train AUC: 0.9997 | Val AUC: 0.9188 | LR: 0.000192
‚úÖ New best! Val AUC: 0.9188
Epoch 05/30 | Loss: 0.1188 | Train AUC: 0.9998 | Val AUC: 0.9251 | LR: 0.002000
‚úÖ New best! Val AUC: 0.9251
Epoch 06/30 | Loss: 0.1266 | Train AUC: 0.9993 | Val AUC: 0.8923 | LR: 0.001951
‚è≥ No improvement. Patience 1/7
Epoch 07/30 | Loss: 0.1266 | Train AUC: 0.9994 | Val AUC: 0.8904 | LR: 0.001809
‚è≥ No improvement. Patience 2/7
Epoch 08/30 | Loss: 0.1206 | Train AUC: 0.9998 | Val AUC: 0.8911 | LR: 0.001588
‚è≥ No improvement. Patience 3/7
Epoch 09/30 | Loss: 0.1194 | Train AUC: 0.9998 | Val AUC: 0.8652 | LR: 0

In [14]:
# =======================
# BLOCK 12 - GENERATE PREDICTIONS
# =======================
model.eval()
all_ids = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        user_id = batch["user_id"].to(device)
        item_id = batch["item_id"].to(device)
        item_seq = batch["item_seq"].to(device)
        likes = batch["likes"].to(device)
        views = batch["views"].to(device)
        
        logits = model(user_id, item_id, item_seq, likes, views)
        probs = torch.sigmoid(logits)
        
        all_preds.extend(probs.cpu().numpy())
        all_ids.extend(batch["ID"])

# Create submission
submission = pd.DataFrame({
    "ID": all_ids,
    "Task1&2": all_preds
})

submission = submission.sort_values("ID").reset_index(drop=True)
submission.to_csv("submission.csv", index=False)

print("\n‚úÖ Submission saved!")
print(f"Prediction stats - Min: {submission['Task1&2'].min():.6f}, Max: {submission['Task1&2'].max():.6f}, Mean: {submission['Task1&2'].mean():.6f}")


‚úÖ Submission saved!
Prediction stats - Min: 0.013942, Max: 0.980519, Mean: 0.573859
