# End-to-End Demo: GameStats2Text

This notebook will:
1. Load and preprocess the dataset.
2. Split into training and validation sets.
3. Instantiate the GameStats2Text generator model.
4. Train the model for a few epochs.
5. Evaluate the trained model on the validation set.
6. Generate sample responses for a few prompts taken from the dataset.


In [None]:
import os

import torch
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
from transformers import GPT2Tokenizer

from src.models.generator import GameStats2TextGenerator
from src.process.setupData import GameStatsTextDataset, collate_fn

# Seed PyTorch's global RNG up front so that anything drawing from it later
# (e.g. the shuffled training DataLoader) is repeatable on a fresh
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Set up device: use the GPU when CUDA is available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)


Using device: cpu


In [None]:
# Build the dataset from the CSV file on disk
csv_path = 'data/dataset.csv'  # Path to your dataset CSV
dataset = GameStatsTextDataset(csv_file=csv_path)

# Report how many samples were loaded and the stat feature columns it exposes
num_samples = len(dataset)
print(f"Dataset size: {num_samples} samples")
print("Stat feature columns:", dataset.feature_cols)




Dataset size: 1193 samples
Stat feature columns: ['MP', 'PTS', 'FG%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'Result']


In [None]:
# Split into training and validation sets (80/20).
# A dedicated, explicitly seeded generator pins the partition: re-running this
# cell (or the whole notebook) always yields the same train/validation samples,
# independent of how much global RNG state earlier cells have consumed.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size  # remainder, so both sizes sum to len(dataset)
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)
print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")


Train size: 954, Validation size: 239


In [None]:
# Create DataLoaders: both loaders share the batch size and collate function;
# only the training loader shuffles its samples.
batch_size = 8
shared_loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)


In [None]:
# Build the model, sized to the number of stat feature columns, and place it
# on the selected device
stats_input_dim = len(dataset.feature_cols)
model = GameStats2TextGenerator(stats_input_dim=stats_input_dim)
model = model.to(device)

# GPT-2 tokenizer; fall back to the end-of-sequence token for padding when the
# tokenizer has no pad token set
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Adam over all model parameters
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
# Training loop: run a fixed number of passes over the training DataLoader and
# report the mean batch loss after each pass.
# NOTE: `tqdm` is imported in the notebook's import cell at the top.
num_epochs = 3

model.train()  # switch to training mode before optimising
for epoch in range(num_epochs):
    total_loss = 0  # running sum of per-batch losses for this epoch
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Move every tensor the model consumes onto the selected device
        stats = batch['stats'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; the training loss is read from `outputs.loss`
        outputs = model(stats, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Clear stale gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses (len(train_loader) is the number of batches)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_loss:.4f}")


Epoch 1:   0%|          | 0/120 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1:   1%|          | 1/120 [00:03<07:26,  3.76s/it]


KeyboardInterrupt: 

In [None]:
# Validation: average the per-batch loss over the validation DataLoader
model.eval()  # evaluation mode for the forward passes below
val_loss = 0
with torch.no_grad():  # gradients are not needed when only measuring the loss
    for batch in val_loader:
        # Move every tensor the model consumes onto the selected device
        stats = batch['stats'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(stats, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += outputs.loss.item()

# Mean of the per-batch losses (len(val_loader) is the number of batches)
val_loss /= len(val_loader)
print(f"Validation Loss: {val_loss:.4f}")

# Restore training mode. The trailing semicolon stops Jupyter from echoing the
# module repr that `model.train()` returns as the cell's last expression.
model.train();


In [None]:
# Generate sample responses for (up to) the first three dataset entries.
# NOTE: the indices address the full dataset, not the validation split, so
# these prompts may include samples the model was trained on.
model.eval()  # the validation cell leaves the model in training mode; switch back for inference
test_indices = list(range(min(3, len(dataset))))
with torch.no_grad():  # no gradients are needed while generating text
    for idx in test_indices:
        stats = dataset.stats[idx]
        prompt = dataset.questions[idx]
        print("="*20)
        print("Prompt:", prompt)
        response = model.generate(stats, prompt, tokenizer)
        print("Generated Response:", response)
