# Pipeline Tests

## 1. Reader

In [None]:
import pandas as pd
from model.reader import reader

# Load the training data with the project's reader helper. The path is
# relative, so it is resolved against the kernel's working directory.
# NOTE(review): later cells slice the result with .iloc and call len() on it,
# so reader presumably returns a pandas DataFrame — confirm in model.reader.
df = reader('train.csv')

In [None]:
# Positional layout of the loaded frame, as consumed by the tokenizer cell:
# the first column feeds the username tokenizer, the second the browser
# tokenizer, and every remaining column the action-sequence tokenizer.
USERNAME_COL, BROWSER_COL, FIRST_ACTION_COL = 0, 1, 2

TARGET = df.iloc[:, USERNAME_COL]          # prediction target (one column)
browsers = df.iloc[:, BROWSER_COL]         # one column
actions = df.iloc[:, FIRST_ACTION_COL:]    # all trailing columns

## 2. Tokenizer

In [None]:
from model.tokenizer import tokenize_action_sequence, tokenize_browser_data, tokenize_username_data

# Run the three project tokenizers on the columns split out in section 1.
# Each call is unpacked into a pair: the tokenized data and a lookup table
# (named *_to_idx). The lookup tables are used later for their sizes (len(...))
# when the model is created, and username_to_idx is inverted in the inference
# cell to map a predicted index back to a username.
# NOTE(review): the container types of the *_tokens values are defined in
# model.tokenizer and are not visible here — confirm before reshaping them.
username_tokens, username_to_idx = tokenize_username_data(TARGET)
action_tokens, action_to_idx = tokenize_action_sequence(actions)
browser_tokens, browser_to_idx = tokenize_browser_data(browsers)

## 3. Model: Transformer

Select the compute device (CUDA GPU when available, otherwise CPU)

In [None]:
import torch

# Device selection: use CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device_name = 'cuda' if cuda_available else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

if cuda_available:
    # Report the name and total memory (in GB, base 1e9) of GPU index 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # cuDNN tuning flags: benchmark=True lets cuDNN auto-select kernels, and
    # deterministic=False does not force deterministic algorithms. These are
    # speed/reproducibility trade-offs, so runs may not be bit-for-bit repeatable.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

In [None]:
from model.transformer import create_model, train_model

# Hyperparameters forwarded verbatim (as keyword arguments) to create_model.
# The three size arguments come from the lookup tables built by the tokenizer
# cell; the remaining values are fixed architecture settings.
model_config = dict(
    vocab_size=len(action_to_idx),
    n_usernames=len(username_to_idx),
    n_browsers=len(browser_to_idx),
    d_model=128,
    n_heads=4,        # attention heads
    n_layers=4,
    d_ff=512,
    max_seq_len=100,  # the same value is passed to train_model further down
    dropout=0.1,
)
model = create_model(**model_config)

# Place the model on the device chosen in the device cell, then report the
# total number of parameter elements.
model = model.to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Encode usernames

In [None]:
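# NOTE: legacy manual username encoding, kept commented out for reference only.
# Section 2 now builds username_to_idx via tokenize_username_data(TARGET);
# re-enabling the lines below would overwrite that mapping.
#import torch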


#unique_usernames = TARGET.unique()
#username_to_idx = {username: idx for idx, username in enumerate(unique_usernames)}
#TARGET_indices = TARGET.map(username_to_idx).values
#TARGET_tensor = torch.tensor(TARGET_indices, dtype=torch.long)

Train Transformer model

In [None]:
# The tokenizer outputs are handed to train_model as produced (no tensor
# conversion in this cell); per the original author's note this is so that the
# training function can take care of padding itself.
# NOTE(review): the split index is derived from len(df), which assumes every
# token collection has one entry per row of df — confirm in model.tokenizer.

# Ordered (unshuffled) 80/20 split: leading 80% of rows train, the rest validate.
split_idx = int(0.8 * len(df))
token_streams = (action_tokens, username_tokens, browser_tokens)
train_data = tuple(stream[:split_idx] for stream in token_streams)
val_data = tuple(stream[split_idx:] for stream in token_streams)

# Keyword arguments for the training run, passed through unchanged.
training_kwargs = dict(
    train_data=train_data,
    val_data=val_data,
    epochs=100,
    batch_size=64,     # original note: larger batch size chosen for GPU
    max_seq_len=100,   # matches max_seq_len given to create_model
    device=device,
)
train_model(model, **training_kwargs)

## 4. Inference

In [None]:
def get_username_from_prediction(predicted_tensor, username_to_idx):
    """Translate a predicted class index back into the username it encodes.

    Args:
        predicted_tensor: Object exposing ``.item()`` that yields the predicted
            index (e.g. a single-element torch tensor).
        username_to_idx: Mapping of username -> index; it is inverted here.
            If several usernames share an index, the one iterated last wins.

    Returns:
        The username whose index equals ``predicted_tensor.item()``.

    Raises:
        KeyError: If the predicted index is not a value of ``username_to_idx``.
    """
    # Build the reverse lookup (index -> username) on every call.
    reverse_lookup = {}
    for name, position in username_to_idx.items():
        reverse_lookup[position] = name

    class_index = predicted_tensor.item()
    return reverse_lookup[class_index]

# Predict the username for a hand-written action sequence.
# NOTE(review): the values are raw token ids, presumably indices into
# action_to_idx — confirm they are valid for the trained vocabulary.
action_sequence = torch.tensor([5, 32, 1, 52, 89]).to(device)

# Inference hygiene: the model was created with dropout=0.1, so switch it to
# evaluation mode to make the prediction deterministic, and disable autograd
# tracking since no gradients are needed here.
# (model is used as a torch module elsewhere in this notebook: .to(), .parameters().)
model.eval()
with torch.no_grad():
    logits, probs = model.predict_username(action_sequence)

# Index of the highest-scoring class along the last dimension.
predicted_username = torch.argmax(logits, dim=-1)

# Map the predicted index back to the actual username and report it.
predicted_username_name = get_username_from_prediction(predicted_username, username_to_idx)
print(f"Predicted username: {predicted_username_name}")