# Pipeline Tests

## 1. Reader

In [1]:
import pandas as pd
from model.reader import reader

# Load the training set through the project's reader helper. The cells below
# split the resulting frame purely by column position.
# NOTE(review): 'train.csv' is a bare relative path — presumably resolved
# against the notebook's working directory; confirm.
df = reader('train.csv')

In [2]:
# Positional layout of the training frame:
#   column 0 -> label passed to the username tokenizer, column 1 -> browser,
#   column 2 -> sequence length, columns 3+ -> the action sequence.
TARGET, browsers, sequence_lengths = (df.iloc[:, position] for position in range(3))
actions = df.iloc[:, 3:]

## Time Features

In [3]:
from model.time_features import bucketize_time_features, compute_time_features

# Compute the timing features from the action columns and sequence lengths,
# then bucketize them; the 'duration_bucket' and 'speed_bucket' columns are
# consumed later as model inputs.
time_features = bucketize_time_features(
    compute_time_features(actions, sequence_lengths)
)

## 2. Tokenizer

In [4]:
from model.tokenizer import tokenize_action_sequence, tokenize_browser_data, tokenize_username_data

# Each tokenizer returns a pair: the encoded data and the lookup table used to
# encode it. The lookup tables are reused further down: their sizes configure
# the model, the serving cell passes the action/browser tables back in to
# encode test.csv, and username_to_idx is inverted there to map predicted
# indices back to usernames.
username_tokens, username_to_idx = tokenize_username_data(TARGET)
action_tokens, action_to_idx = tokenize_action_sequence(actions)
browser_tokens, browser_to_idx = tokenize_browser_data(browsers)

## Models

Select the compute device (GPU if available, otherwise CPU)

In [5]:
import random

import torch

# Seed the RNGs before any model is created so that weight initialisation
# (and any seeded shuffling/dropout) is repeatable across Restart & Run All.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and all CUDA devices

# GPU setup: probe CUDA once and reuse the answer below.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
print(f"Using device: {device}")

if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # cuDNN autotuning favours speed; with these two flags GPU runs are not
    # guaranteed to be bit-for-bit reproducible even though the seed is fixed.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

Using device: cpu


### Model: Transformer with context token "Browser"

Create new model

In [None]:
from model.transformer import create_model, train_model

# Hyperparameters of the browser-context Transformer, gathered in one place.
# Vocabulary sizes come straight from the tokenizer lookup tables.
# NOTE(review): max_seq_len is 100 here, while the training cell passes
# max_seq_len=500 to train_model — confirm the mismatch is intended.
transformer_config = dict(
    vocab_size=len(action_to_idx),
    n_usernames=len(username_to_idx),
    n_browsers=len(browser_to_idx),
    d_model=256,
    n_heads=8,
    n_layers=6,
    d_ff=512,
    max_seq_len=100,
    dropout=0.1,
)

# Build the network and place it on the selected device in one step.
model = create_model(**transformer_config).to(device)

n_parameters = sum(weights.numel() for weights in model.parameters())
print(f"Model parameters: {n_parameters:,}")

Train Transformer model

In [None]:
# The token sequences are handed over as-is (not converted to tensors), which
# leaves padding to the training function.

# Ordered 80/20 split: the first 80% of rows train, the remainder validates.
split_idx = int(0.8 * len(df))
model_inputs = (action_tokens, username_tokens, browser_tokens)
train_data = tuple(column[:split_idx] for column in model_inputs)
val_data = tuple(column[split_idx:] for column in model_inputs)

# NOTE(review): max_seq_len=500 here, but the model was created with
# max_seq_len=100 — confirm which value is intended.
train_model(
    model,
    train_data=train_data,
    val_data=val_data,
    epochs=100,
    batch_size=32,
    max_seq_len=500,
    device=device,
)

Alternatively, load a previously saved model instead of training

In [None]:
# from utils.save_load_transformer import load_model


# model, metadata = load_model(filepath="saved_models/transformer1.pt", device=device)

### Model: Transformer with context tokens "Browser", "Duration" and "Speed"

In [6]:
from model.transformer_extended_context import create_model, train_model

# Number of distinct values per discrete context feature.
# NOTE(review): these cardinalities are hard-coded; the browser count
# presumably should match len(browser_to_idx) and the bucket counts the
# output of bucketize_time_features — confirm.
discrete_contexts = dict(
    browser=4,
    duration_bucket=8,
    speed_bucket=8,
)

# Build the extended-context Transformer and place it on the selected device.
# This rebinds `model` (and `create_model` / `train_model`) from the
# browser-only variant.
model = create_model(
    vocab_size=len(action_to_idx),
    n_usernames=len(username_to_idx),
    discrete_contexts=discrete_contexts,
    d_model=256,
    n_heads=8,
    n_layers=6,
    d_ff=512,
    max_seq_len=100,
    dropout=0.1,
).to(device)

total_parameters = sum(tensor.numel() for tensor in model.parameters())
print(f"Model parameters: {total_parameters:,}")

Model parameters: 5,175,031


In [7]:
# The bucketized time features provide two extra context inputs.
duration_tokens, speed_tokens = (
    list(time_features[bucket_column])
    for bucket_column in ('duration_bucket', 'speed_bucket')
)

# Ordered 80/20 split: the first 80% of rows train, the remainder validates.
split_idx = int(0.8 * len(df))
model_inputs = (
    action_tokens,
    username_tokens,
    browser_tokens,
    duration_tokens,
    speed_tokens,
)
train_data = tuple(column[:split_idx] for column in model_inputs)
val_data = tuple(column[split_idx:] for column in model_inputs)

# NOTE(review): max_seq_len=500 here, but the model was created with
# max_seq_len=100 — confirm which value is intended.
train_model(
    model,
    train_data=train_data,
    val_data=val_data,
    epochs=100,
    batch_size=32,
    max_seq_len=500,
    device=device,
)

Training on 2623 samples
Validation on 656 samples
Using max sequence length: 500
Using batch size: 32
Epoch   0: Train Loss: 5.6413, Val Loss: 5.5648, Val Acc: 0.0000
  Top predicted usernames (by frequency):
    1. Username 128: 47.41% (311/656)
    2. Username 183: 35.98% (236/656)
    3. Username 143: 8.38% (55/656)
    4. Username 126: 3.66% (24/656)
    5. Username 34: 1.37% (9/656)


KeyboardInterrupt: 

## Serving

Generate predictions for `test.csv` with the trained model

In [None]:
# Load the unlabeled test set. Test-side variables carry a `test_` prefix so the
# training variables (browsers, actions, action_tokens, ...) are not overwritten;
# re-running an earlier cell therefore never picks up test data by accident.
test_df = reader('test.csv', training=False)

test_browsers = test_df.iloc[:, 0]
test_actions = test_df.iloc[:, 2:]  # column 1 (sequence length) is not needed here

# Encode the test data with the lookup tables learned on the training set.
test_action_tokens, _ = tokenize_action_sequence(actions=test_actions, existing_token_to_idx=action_to_idx, training=False)
test_browser_tokens, _ = tokenize_browser_data(browsers=test_browsers, existing_browser_to_idx=browser_to_idx, training=False)
assert len(test_action_tokens) == len(test_browser_tokens), "one browser token per action sequence expected"

# Inverse lookup: predicted class index -> username.
idx_to_username = {idx: username for username, idx in username_to_idx.items()}

# NOTE(review): `model` is whichever model cell ran last. Only the action
# sequence and the browser are passed here; if the extended-context model is
# active it presumably also needs duration/speed buckets for test.csv — confirm.
model.eval()  # switch off dropout so predictions are deterministic

submission = []

with torch.no_grad():  # inference only: no autograd bookkeeping needed
    for i in range(len(test_action_tokens)):
        action_sequence = torch.tensor(test_action_tokens[i]).to(device)
        browser = torch.tensor(test_browser_tokens[i]).to(device)
        logits, _probs = model.predict_username(action_sequence, browser)
        predicted_idx = torch.argmax(logits, dim=-1).item()
        submission.append(idx_to_username[predicted_idx])



In [None]:
# Preview the first predictions instead of dumping the whole list into the output.
submission[:10]