# Pipeline Tests

## 1. Reader

In [None]:
import pandas as pd
from model.reader import reader

# Load the raw training table through the project's reader helper.
# The path is relative to the notebook's working directory.
TRAIN_CSV_PATH = 'train.csv'
df = reader(TRAIN_CSV_PATH)

In [None]:
# Split the loaded frame by column position:
#   column 0   -> TARGET  (its distinct values are counted as usernames further down)
#   column 1   -> browser (presumably the session's browser; unused in the cells shown here)
#   columns 2+ -> actions (input to the parser and the tokenizer)
TARGET, browser, actions = (
    df.iloc[:, 0],
    df.iloc[:, 1],
    df.iloc[:, 2:],
)

## 2. Parser

In [None]:
from model.parser import parse_action_string

# Apply the parser element-wise to every cell of the actions frame.
# NOTE(review): element-wise `DataFrame.map` only exists in recent pandas
# (older releases call it `applymap`) - confirm the pinned pandas version.
# NOTE(review): `parsed_actions` is not consumed by the later cells shown in
# this notebook; this cell only exercises the parser.
parsed_actions = actions.map(parse_action_string)

# Preview the first rows only instead of rendering the whole frame.
parsed_actions.head()

## 3. Tokenizer

In [None]:
from model.tokenizer import tokenize_action_sequence

# Tokenize the action columns. The helper's result is unpacked into the
# per-session token sequences and the token -> index vocabulary.
# NOTE(review): this receives the raw `actions` frame, not `parsed_actions`
# from the parser section - presumably the tokenizer parses internally; confirm.
tokenizer_output = tokenize_action_sequence(actions)
session_tokens, token_to_idx = tokenizer_output

## 4. Model: Transformer

Select the compute device (GPU if available, otherwise CPU)

In [None]:
import torch

# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

if use_cuda:
    # Report the name and total memory (in GB) of GPU 0.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.1f} GB")

    # cuDNN autotuner: benchmark the available algorithms and keep the fastest.
    # This trades run-to-run determinism for speed, hence deterministic=False.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False

In [None]:
from model.transformer import create_model, train_model
from sklearn.model_selection import train_test_split

# Sizes derived from the data: number of distinct tokens in the vocabulary
# and number of distinct target values (usernames).
vocab_size = len(token_to_idx)
n_usernames = len(TARGET.unique())

# Hyperparameters handed to create_model. The original notes describe this
# configuration as tuned for a T4 GPU (wider d_model, more heads, deeper stack).
model_config = dict(
    vocab_size=vocab_size,
    n_usernames=n_usernames,
    d_model=128,
    n_heads=4,
    n_layers=4,
    d_ff=512,
    max_seq_len=100,
    dropout=0.1,
)
model = create_model(**model_config)

# Place the model on the selected device and report its parameter count.
model = model.to(device)
n_params = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {n_params:,}")

Encode usernames as integer class indices

In [None]:
import torch


# Give every distinct username an integer id, numbered in order of first
# appearance in TARGET.
unique_usernames = TARGET.unique()
username_to_idx = dict(zip(unique_usernames, range(len(unique_usernames))))

# Replace each username by its id and wrap the result as a long tensor,
# one class index per row of TARGET.
encoded_target = TARGET.map(username_to_idx)
TARGET_indices = encoded_target.values
TARGET_tensor = torch.tensor(TARGET_indices, dtype=torch.long)

Train Transformer model

In [None]:
# Train on the raw session_tokens (a list of token lists) rather than a
# pre-padded tensor, so the training function can handle padding itself.

# Hold out 20% of the sessions for validation.
# The split is shuffled with a fixed seed: a plain positional slice would put
# the tail of the file into validation, which is unrepresentative whenever the
# CSV is ordered (e.g. grouped by user) and can leave validation classes that
# never occur in training. The fixed seed keeps the split reproducible.
SPLIT_SEED = 42
n_sessions = len(session_tokens)
assert len(TARGET_tensor) == n_sessions, "tokens and targets must be aligned row by row"

train_size = int(0.8 * n_sessions)
val_size = n_sessions - train_size

split_generator = torch.Generator().manual_seed(SPLIT_SEED)
shuffled_idx = torch.randperm(n_sessions, generator=split_generator)
train_idx, val_idx = shuffled_idx[:train_size], shuffled_idx[train_size:]

# Tokens stay plain Python lists; targets are indexed as a tensor.
train_tokens = [session_tokens[i] for i in train_idx.tolist()]
val_tokens = [session_tokens[i] for i in val_idx.tolist()]

train_targets = TARGET_tensor[train_idx]
val_targets = TARGET_tensor[val_idx]

assert len(train_tokens) == train_size and len(val_tokens) == val_size

# Run training; max_seq_len matches the value the model was created with.
train_model(
    model,
    (train_tokens, train_targets),
    (val_tokens, val_targets),
    epochs=100,
    batch_size=64,      # Larger batch size for GPU
    max_seq_len=100,
    device=device
)

In [None]:
# Smoke-test inference on a hand-written sequence of token ids.
# NOTE(review): the ids are hard-coded; presumably they must be valid indices
# of token_to_idx - confirm against the vocabulary.
sample_token_ids = [5, 32, 1, 52, 89]
action_sequence = torch.tensor(sample_token_ids).to(device)

# predict_username's result is unpacked as (logits, probs).
logits, probs = model.predict_username(action_sequence)

# Position of the highest logit along the last axis. This is an index, not a
# username string (presumably a key into username_to_idx's values - confirm).
predicted_username = logits.argmax(dim=-1)