Transformer with documentation

In [1]:
# --- Imports ---

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import pandas as pd
import numpy as np

from tqdm import tqdm
import time


In [2]:
# --- Global Variables and Hyperparameters ---
base_path = './kaggle/input/nfl-big-data-bowl-2026-prediction/train'
lr = .001
num_epochs = 10
batch_size = 64
num_weeks = 9
load_prev_model = False

In [3]:
# --- ETL Helpers ---

# Input: Array of input dataframes
# Output: Dictionary of dataframes grouped by id
def format_input(input):
    ipt = pd.concat(input, ignore_index=True) # Concatenate all dfs into one
    ipt = ipt[ipt['player_to_predict'] == True].copy(deep=True) # Filter to only players that matter

    # Add velocity component column
    dir_rad = np.deg2rad(ipt['dir'])
    ipt['vx'] = ipt['s'] * np.sin(dir_rad)
    ipt['vy'] = ipt['s'] * np.cos(dir_rad)

    # Add orientation components
    o_rad = np.deg2rad(ipt['o'])
    ipt['ox'] = np.sin(o_rad)
    ipt['oy'] = np.cos(o_rad)

    # Offense going left? 1 or -1
    ipt['go_left'] = np.where(ipt['play_direction'] == 'left', 1, -1)

    # Offensive player? 1 or -1
    ipt['offensive_player'] = np.where(ipt['player_role'] == 'Targeted Receiver', 1, -1)

    # Get useful columns only
    # Constant variables: 'go_left', 'offensive_player', 'ball_land_x', 'ball_land_y', 'absolute_yardline_number', 'num_frames_output'
    # Time variables : 'x', 'y', 'vx', 'vy', 'ox', 'oy', 'a', 'frame_id'
    ipt = ipt[['game_id', 'play_id', 'nfl_id', 'x', 'y', 'vx', 'vy', 'ox', 'oy', 'go_left', 'offensive_player', 'a', 'ball_land_x', 'ball_land_y', 'absolute_yardline_number', 'frame_id', 'num_frames_output']]


    # Create dictionary of dfs
    ipt = {
        f"{gid}_{pid}_{nid}": g
        for (gid, pid, nid), g in ipt.groupby(['game_id', 'play_id', 'nfl_id'])
    }
    return ipt


# Input: 
    # instance: Dataframe specific to game, nfl, and player ids
    # frame: Current frame used for prediction
# Output: List of (x0, x1, x2)
    # x0, First input vector
    # x1: First input array
    # x2: Second input vector
def build_inputs(instance, frame):
    x0 = instance.iloc[-1][['go_left', 'offensive_player', 'ball_land_x', 'ball_land_y','absolute_yardline_number', 'num_frames_output']].values 
    x0 = np.concatenate([x0, [frame]]).astype(np.float32)#np(7,)
    x1 = instance.iloc[-5:][['x','y','vx','vy','ox','oy','a','frame_id']].values.astype(np.float32) #np(5,8)
    x2 = instance.iloc[-1][['x','y']].values.astype(np.float32) #np(2,)
    return x0, x1, x2



# Input: 
    # out: Target dataframe (use only to extract ids and get frame id)
    # ipt: Dictionary of input dfs
# Output: List of (x0, x1, x2, y)
    # x0: First input vector
    # x1: First input array
    # x2: Second input vector
    # y: Target vector
def preprocess(out, ipt):
    data = []
    for _, row in out.iterrows():
        fid = f"{int(row['game_id'])}_{int(row['play_id'])}_{int(row['nfl_id'])}"
        instance = ipt[fid]
        frame = row['frame_id']

        x0, x1, x2 = build_inputs(instance, frame) # Input vectors
        y = row[['x','y']].values.astype(np.float32) # Output vectors

        data.append((x0, x1, x2, y))
    return data



# Custom Dataset Class
class MyDataset(Dataset):
    def __init__(self, dat):
        self.data = dat

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x0, x1, x2, y = self.data[idx]
        return torch.tensor(x0), torch.tensor(x1), torch.tensor(x2), torch.tensor(y)


Train and Test

In [4]:
# --- Train & Test Data ETL ---

print("Started loading training data")
raw_input_train = [pd.read_csv(f'{base_path}/input_2023_w0{i}.csv') for i in range(1,num_weeks+1)]
raw_output_train = [pd.read_csv(f'{base_path}/output_2023_w0{i}.csv') for i in range(1,num_weeks+1)]
in_train = format_input(raw_input_train)
out_train = pd.concat(raw_output_train, ignore_index=True)
train_dataset = MyDataset(preprocess(out_train, in_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
print("Finished loading training data")

print("Started loading validation data")
raw_input_val = [pd.read_csv(f'{base_path}/input_2023_w{i}.csv') for i in range(16,18)]
raw_output_val = [pd.read_csv(f'{base_path}/output_2023_w{i}.csv') for i in range(16,18)]
in_val = format_input(raw_input_val)
out_val = pd.concat(raw_output_val, ignore_index=True)
val_dataset = MyDataset(preprocess(out_val, in_val))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
print("Finished loading validation data")

print("Started loading testing data")
raw_input_test = [pd.read_csv(f'{base_path}/input_2023_w{i}.csv') for i in range(18,19)]
raw_output_test = [pd.read_csv(f'{base_path}/output_2023_w{i}.csv') for i in range(18,19)]
in_test = format_input(raw_input_test)
out_test = pd.concat(raw_output_test, ignore_index=True)
test_dataset = MyDataset(preprocess(out_test, in_test))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
print("Finished loading testing data")

Started loading training data
Finished loading training data
Started loading validation data
Finished loading validation data
Started loading testing data
Finished loading testing data


In [5]:
# --- Model Initialization ---

class Transformer(nn.Module):
    def __init__(self, x0_size=7, transformer_input_size=8, hidden_size=64, num_layers=3, mlp_output_size=2, nhead=8):
        super().__init__()
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=nhead,
            dim_feedforward=hidden_size*4,
            batch_first=True  # allows input shape (batch, seq_len, hidden_size)
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # linear layer to project x1 input to hidden_size
        self.input_proj = nn.Linear(transformer_input_size, hidden_size)
        
        # MLP after concatenating last hidden with x0
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size + x0_size, 128),
            nn.ReLU(),
            nn.Linear(128, mlp_output_size)
        )

    def forward(self, x0, x1, x2):
        # project input to hidden_size
        x1_proj = self.input_proj(x1)  # (batch, seq_len, hidden_size)
        
        # pass through transformer
        out = self.transformer(x1_proj)  # (batch, seq_len, hidden_size)
        last_hidden = out[:, -1, :]      # (batch, hidden_size), take last token
        
        # concatenate with x0
        x0_expanded = x0.view(x0.size(0), -1)
        combined = torch.cat([last_hidden, x0_expanded], dim=1)
        
        output = self.mlp(combined)
        
        # add x2
        output = output + x2
        
        return output

    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.load("V5.pth", weights_only=False, map_location=device) if load_prev_model else torch.compile(Transformer().to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss(reduction="mean")

In [6]:
# --- Training Loop ---

def val_loss():
    model.eval()
    total_se = 0.0 
    total_count = 0

    with torch.no_grad():
        for x0, x1, x2, y in val_loader:
            x0, x1, x2, y = x0.to(device), x1.to(device), x2.to(device), y.to(device)
            preds = model(x0, x1, x2)
            se = (preds - y) ** 2
            total_se += se.sum().item() 
            total_count += y.numel()

    rmse = ((total_se / total_count) ** 0.5)
    return rmse

for epoch in range(num_epochs):
    total_loss = 0.0
    
    model.train()
    for x0, x1, x2, y in train_loader:
        x0, x1, x2, y = x0.to(device, non_blocking=True), x1.to(device, non_blocking=True), x2.to(device, non_blocking=True), y.to(device, non_blocking=True)
        preds = model(x0, x1, x2)
        loss = torch.sqrt(criterion(preds, y))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    v_loss = val_loss()
    print(f"Epoch {epoch+1}: Approx Loss: {total_loss / len(train_loader):.4f} Val Loss: {v_loss:.4f}")

Epoch 1: Approx Loss: 1.2490 Val Loss: 1.0876
Epoch 2: Approx Loss: 0.9796 Val Loss: 1.0540
Epoch 3: Approx Loss: 0.9222 Val Loss: 1.0005
Epoch 4: Approx Loss: 0.8799 Val Loss: 0.9850
Epoch 5: Approx Loss: 0.8531 Val Loss: 0.9743
Epoch 6: Approx Loss: 0.8292 Val Loss: 1.0010
Epoch 7: Approx Loss: 0.8123 Val Loss: 1.0457
Epoch 8: Approx Loss: 0.7954 Val Loss: 0.9891
Epoch 9: Approx Loss: 0.7856 Val Loss: 0.9964
Epoch 10: Approx Loss: 0.7755 Val Loss: 1.0792


In [7]:
# --- Test model ---
total_se = 0.0
total_count = 0
model.eval()

with torch.no_grad():
    for x0, x1, x2, y in tqdm(test_loader, leave=False):
        x0, x1, x2, y = x0.to(device), x1.to(device), x2.to(device), y.to(device)
        preds = model(x0, x1, x2)
        se = (preds - y) ** 2
        total_se += se.sum().item()
        total_count += y.numel()

rmse = ((total_se / total_count) ** 0.5)
print(f"RMSE = {rmse}")


                                                  

RMSE = 1.0244069435616274




Submission

In [8]:
# --- Submission data preprocessing ---

# Submission inputs
print("Started importing submission inputs")
instances = pd.read_csv('./kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv')
eval_in = pd.read_csv('./kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv')
eval_in = format_input([eval_in])
print("Finished importing submission inputs")

# Process inputs for submission
def prepare_inputs(row, ipt):
    gid, pid, nid, fid = int(row['game_id']), int(row['play_id']), int(row['nfl_id']), int(row['frame_id'])
    instance = ipt[f"{gid}_{pid}_{nid}"]

    x0, x1, x2 = build_inputs(instance, fid)
    return torch.tensor(x0).unsqueeze(0), torch.tensor(x1).unsqueeze(0), torch.tensor(x2).unsqueeze(0)

# Predict positions for submission
def compute_pos(row):
    gid, pid, nid, fid = row['game_id'], row['play_id'], row['nfl_id'], row['frame_id']
    full_id = f"{gid}_{pid}_{nid}_{fid}"

    x0, x1, x2 = prepare_inputs(row, eval_in)
    x0, x1, x2 = x0.to(device, non_blocking=True), x1.to(device, non_blocking=True), x2.to(device, non_blocking=True)

    pred_x, pred_y = model(x0, x1, x2).squeeze(0).tolist()
    return pd.Series([full_id, pred_x, pred_y])

Started importing submission inputs
Finished importing submission inputs


In [9]:
# --- Final submission df ---
model.eval()
submission = instances.copy(deep=True)

with torch.no_grad():
    print("Started computing submission")
    submission[['id', 'x', 'y']] = submission.apply(compute_pos, axis=1)
    print("Finished computing submission")
submission = submission[['id', 'x', 'y']]

submission.head(10)

Started computing submission
Finished computing submission


Unnamed: 0,id,x,y
0,2024120805_74_54586_1,87.977875,34.147022
1,2024120805_74_54586_2,88.441956,34.300583
2,2024120805_74_54586_3,88.906029,34.454144
3,2024120805_74_54586_4,89.332329,34.495369
4,2024120805_74_54586_5,89.620613,34.601234
5,2024120805_74_54586_6,89.971954,34.755505
6,2024120805_74_54586_7,90.341591,34.92382
7,2024120805_74_54586_8,90.696617,35.105747
8,2024120805_74_54586_9,90.960915,35.441547
9,2024120805_74_54586_10,91.230774,35.780663


In [10]:
# --- Submission to csv ---
submission.to_csv('submission.csv', index=False)
print("Submitted!!!")

Submitted!!!


In [11]:
# --- Export for metric testing ---
out_csv = out_test.copy(deep=True)
out_csv['id'] = out_csv.apply(lambda r: f"{int(r.game_id)}_{int(r.play_id)}_{int(r.nfl_id)}_{int(r.frame_id)}", axis=1)
out_csv = out_csv[['id', 'x', 'y']]
out_csv.to_csv('testSolution.csv', index=False)

# Process inputs for testing export
def prepare_inputs_1(row, ipt):
    gid, pid, nid, fid = int(row['game_id']), int(row['play_id']), int(row['nfl_id']), int(row['frame_id'])
    out_test = ipt[f"{gid}_{pid}_{nid}"]

    x0, x1, x2 = build_inputs(out_test, fid)
    return torch.tensor(x0).unsqueeze(0), torch.tensor(x1).unsqueeze(0), torch.tensor(x2).unsqueeze(0)

# Predict positions for submission
def compute_pos_1(row):
    gid, pid, nid, fid = row['game_id'], row['play_id'], row['nfl_id'], row['frame_id']
    full_id = f"{gid}_{pid}_{nid}_{fid}"

    x0, x1, x2 = prepare_inputs_1(row, in_test)
    x0, x1, x2 = x0.to(device, non_blocking=True), x1.to(device, non_blocking=True), x2.to(device, non_blocking=True)

    pred_x, pred_y = model(x0, x1, x2).squeeze(0).tolist()
    return pd.Series([full_id, pred_x, pred_y])

model.eval()
pred_csv = out_test[['game_id', 'play_id', 'nfl_id', 'frame_id']].copy(deep=True)

with torch.no_grad():
    print("Started computing test")
    pred_csv[['id', 'x', 'y']] = pred_csv.apply(compute_pos_1, axis=1)
    print("Finished computing test")
pred_csv = pred_csv[['id', 'x', 'y']]

pred_csv.to_csv('testSubmission.csv', index=False)


Started computing test
Finished computing test


In [12]:
torch.save(model, "V5.pth")