# Pipeline Tests

## 1. Reader

In [1]:
import pandas as pd
from model.reader import reader

# Load the training CSV through the project's reader helper. Later cells
# slice `df` purely by column position, so the column order matters.
df = reader('train.csv')

In [2]:
# Positional layout of the training frame:
#   column 0 -> target (later tokenized as usernames)
#   column 1 -> browser
#   column 2 -> sequence length
#   columns 3+ -> action sequence
TARGET, browsers, sequence_lengths = (df.iloc[:, col] for col in range(3))
actions = df.iloc[:, 3:]

In [3]:
# Summary statistics of the per-session sequence length column.
sequence_lengths.describe()

count     3279.000000
mean       850.238792
std       1212.323701
min          2.000000
25%        192.000000
50%        423.000000
75%       1019.000000
max      14468.000000
Name: sequence_length, dtype: float64

## Time Features

In [4]:
from model.time_features import compute_time_features

# Derive timing features from the action columns and the sequence lengths.
# The rendered output below shows `duration` and `speed` columns.
time_features = compute_time_features(actions, sequence_lengths)
# Bare expression so the notebook renders the resulting table.
time_features

Unnamed: 0,duration,speed
0,2905,0.876764
1,230,0.391304
2,750,0.821333
3,1445,0.629758
4,275,0.647273
...,...,...
3274,135,0.903704
3275,730,0.601370
3276,1730,0.853757
3277,1150,0.547826


## 2. Tokenizer

In [4]:
from model.tokenizer import (
    tokenize_action_sequence,
    tokenize_browser_data,
    tokenize_username_data,
)

# Each tokenizer returns the encoded data together with its lookup mapping.
# The mappings are reused later to size the model and to encode test.csv.
username_tokens, username_to_idx = tokenize_username_data(TARGET)
action_tokens, action_to_idx = tokenize_action_sequence(actions)
browser_tokens, browser_to_idx = tokenize_browser_data(browsers)

In [5]:
# Number of entries in the action mapping; passed as `vocab_size` when the model is created.
len(action_to_idx)

7087

## Models

### Model: Transformer

Select the compute device (GPU when available, otherwise CPU)

In [6]:
import torch

# Pick the accelerator when PyTorch can see one, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# `device` is CUDA exactly when a GPU was detected above.
if device.type == 'cuda':
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {total_gb:.1f} GB")

    # Let cuDNN auto-tune its kernels; this gives up bit-for-bit determinism.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

Using device: cpu


Create new model

In [None]:
from model.transformer import create_model, train_model

# Create the Transformer model. Vocabulary and class counts are taken from the
# tokenizer mappings built earlier so they always match the training data.
model = create_model(
    vocab_size=len(action_to_idx),
    n_usernames=len(username_to_idx),
    n_browsers=len(browser_to_idx),
    d_model=256,        # Increased for better performance
    n_heads=8,          # More attention heads
    n_layers=6,         # Deeper network
    d_ff=512,
    # Must be at least the max_seq_len handed to train_model (500). The
    # previous value of 100 was shorter than the sequences used for training.
    max_seq_len=500,
    dropout=0.1
)

# Move the model to the selected device (stays on CPU when no GPU is found)
model = model.to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Train Transformer model

In [None]:
# Pass the raw token containers (action_tokens and friends) straight through
# rather than converting them to tensors first, so that the training function
# can handle padding itself.

# Simple ordered 80/20 split: the first 80% of rows train, the rest validate.
split_idx = int(0.8 * len(df))
token_streams = (action_tokens, username_tokens, browser_tokens)
train_data = tuple(stream[:split_idx] for stream in token_streams)
val_data = tuple(stream[split_idx:] for stream in token_streams)

# Train with memory-efficient parameters
train_model(
    model,
    train_data=train_data,
    val_data=val_data,
    epochs=100,
    batch_size=32,      # Larger batch size for GPU
    max_seq_len=500,
    device=device
)

Alternatively, load a previously saved model instead of training a new one

In [14]:
# Optional path: uncomment the lines below to restore a saved checkpoint
# instead of running the "Create new model" and training cells above.
# NOTE(review): the recorded "Model loaded" output presumably comes from a run
# in which these lines were active; confirm before relying on it.
# from utils.save_load_transformer import load_model


# model, metadata = load_model(filepath="saved_models/transformer1.pt", device=device)

Model loaded from saved_models/transformer1.pt


## Serving

Predict a username for every session in `test.csv`

In [None]:
test_df = reader('test.csv', training=False)

# The test frame is sliced one position to the left of the training frame:
# column 0 = browser, column 1 = sequence length, columns 2+ = actions.
# NOTE(review): these names overwrite the training-time variables of the same
# name, so re-running the training cell after this one would use test data.
browsers = test_df.iloc[:, 0]
sequence_lengths = test_df.iloc[:, 1]
actions = test_df.iloc[:, 2:]

# Encode with the mappings learned on the training data (training=False) so
# that token indices line up with what the model was trained on.
action_tokens, _ = tokenize_action_sequence(actions=actions, existing_token_to_idx=action_to_idx, training=False)
browser_tokens, _ = tokenize_browser_data(browsers=browsers, existing_browser_to_idx=browser_to_idx, training=False)

# Reverse lookup: predicted class index -> username string.
idx_to_username = {idx: username for username, idx in username_to_idx.items()}

submission = []

# Inference only: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    for i in range(len(action_tokens)):
        action_sequence = torch.tensor(action_tokens[i]).to(device)
        browser = torch.tensor(browser_tokens[i]).to(device)
        # The second return value (probabilities) is not needed for the argmax.
        logits, _probs = model.predict_username(action_sequence, browser)
        predicted_idx = torch.argmax(logits, dim=-1).item()
        submission.append(idx_to_username[predicted_idx])



In [20]:
# Display the predicted usernames, one entry per test session.
# NOTE(review): this renders the whole list; a slice such as submission[:10]
# would keep the saved notebook smaller.
submission

['zbn',
 'bou',
 'snn',
 'ubr',
 'eqo',
 'snn',
 'ycb',
 'azx',
 'yrb',
 'rff',
 'fou',
 'azx',
 'gtj',
 'wjn',
 'wht',
 'nuh',
 'gtj',
 'hrj',
 'mmm',
 'ulu',
 'cxt',
 'lra',
 'tfi',
 'vjw',
 'snn',
 'rmf',
 'yrb',
 'vek',
 'fjb',
 'myl',
 'rff',
 'vjw',
 'eqw',
 'lra',
 'snn',
 'gid',
 'prb',
 'fyg',
 'rvh',
 'kik',
 'dbi',
 'tfi',
 'deb',
 'snn',
 'azx',
 'xwz',
 'uof',
 'ulu',
 'fyg',
 'ycb',
 'snn',
 'qsw',
 'khv',
 'bou',
 'yrb',
 'tfi',
 'ftq',
 'vog',
 'cyd',
 'ycb',
 'ycb',
 'ulu',
 'tfi',
 'cum',
 'hvh',
 'zbn',
 'azx',
 'tzn',
 'qsw',
 'tfi',
 'lra',
 'prb',
 'fyg',
 'uwp',
 'prb',
 'prb',
 'myl',
 'zbn',
 'snn',
 'rmf',
 'kks',
 'snn',
 'dbi',
 'mqo',
 'tfi',
 'eus',
 'qsw',
 'vog',
 'ulu',
 'cxt',
 'opp',
 'tfi',
 'ien',
 'hrj',
 'bgn',
 'dbi',
 'fxg',
 'prb',
 'gid',
 'bgn',
 'azx',
 'tft',
 'wht',
 'prb',
 'azo',
 'muz',
 'ulu',
 'ycb',
 'azx',
 'azx',
 'tzn',
 'wwk',
 'mqo',
 'xxb',
 'azx',
 'ftq',
 'bou',
 'qsw',
 'bou',
 'tfi',
 'ien',
 'azo',
 'ulu',
 'cyd',
 'ulu',
