In [1]:
# --- Imports ---

import torch
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

from tqdm import tqdm
import time
import math

In [2]:
# --- Configurations ---

class CONFIG:
    """Static settings for the notebook, read through the module-level ``config`` object."""

    # Root folder of the competition CSV files (the train/ subfolder and the
    # test files are resolved relative to it).
    DATA_PATH = "./kaggle/input/nfl-big-data-bowl-2026-prediction/"

    # Hyperparameters; presumably consumed by training cells that are not
    # part of the visible source -- TODO confirm where each one is used.
    N_FOLDS = 5
    EPOCHS = 20
    PATIENCE = 5
    FACTOR = 0.9
    LEARNING_RATE = 5e-4
    DROPOUT = 0.01

    # NOTE(review): upper/lower bound pair whose meaning is not visible here.
    FORCE_MAX = 0.8
    FORCE_MIN = 0.2


config = CONFIG()

In [3]:
# --- Data import ---

# One CSV per week for weeks 1-18; the week number is zero-padded in the file name.
train_input_files = [f"{config.DATA_PATH}train/input_2023_w{w:02d}.csv" for w in range(1, 19)]
train_output_files = [f"{config.DATA_PATH}train/output_2023_w{w:02d}.csv" for w in range(1, 19)]

# Stack the weekly files into a single frame each. The per-file row labels are
# kept, so index values repeat across weeks.
train_input = pd.concat(list(map(pd.read_csv, train_input_files)))
train_output = pd.concat(list(map(pd.read_csv, train_output_files)))

# Look-up table: (game_id, play_id) -> all input rows of that play.
train_input_dict = {
    play_key: play_rows
    for play_key, play_rows in train_input.groupby(['game_id', 'play_id'])
}

test_input = pd.read_csv(f"{config.DATA_PATH}test_input.csv")
test_instances = pd.read_csv(f"{config.DATA_PATH}test.csv")

# Same look-up table for the test inputs.
test_input_dict = {
    play_key: play_rows
    for play_key, play_rows in test_input.groupby(['game_id', 'play_id'])
}

In [4]:
# -- Feature engineering helpers --
def height_to_feet(height_str, default=6.0):
    """Convert a ``"<feet>-<inches>"`` height string (e.g. ``"6-2"``) to decimal feet.

    Args:
        height_str: Height value. It is passed through ``str()`` before
            parsing, so non-string inputs (e.g. NaN, None) are accepted.
        default: Value returned when the input cannot be parsed. Defaults to
            6.0, the fallback this helper has always used.

    Returns:
        ``feet + inches / 12`` as a float, or ``default`` when the value does
        not split into exactly two integer parts.
    """
    try:
        ft, inches = map(int, str(height_str).split('-'))
    except (ValueError, TypeError):
        # ValueError covers both non-integer parts and a wrong number of
        # parts; TypeError covers objects whose str() conversion misbehaves.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return default
    return ft + inches / 12

def encode_play_dir(height_str):
    """Binary-encode a play direction: 1 for ``"Left"``, 0 for anything else.

    The parameter is named ``height_str`` for historical reasons; it actually
    receives the play-direction value.
    """
    if height_str == "Left":
        return 1
    return 0

def encode_player_role(height_str):
    """Binary-encode a player role: 1 for ``"Targeted Receiver"``, 0 otherwise.

    The parameter is named ``height_str`` for historical reasons; it actually
    receives the player-role value.
    """
    if height_str == "Targeted Receiver":
        return 1
    return 0


In [5]:
# -- Input constructors --

def build_x0a(player_df):
    """Assemble the static feature vector of the player being predicted.

    Args:
        player_df: Rows of a single player. The row labelled 0 is read as the
            first frame and the last positional row as the final frame, so
            the frame must carry a 0-based index.

    Returns:
        tuple: ``(features, duration)`` where ``features`` is a 1-D array of
        16 values (height, weight, ball landing x/y, first x/y, last x/y,
        last acceleration, last velocity x/y, last orientation x/y, play
        direction flag, role flag, duration) and ``duration`` is the first
        row's ``num_frames_output``.
    """
    def _first(column):
        return player_df.loc[0, column]

    final = player_df.iloc[-1]

    # Angles are stored in degrees; the x component uses sin and the y
    # component uses cos.
    heading = math.radians(final['dir'])
    facing = math.radians(final['o'])
    speed = final['s']
    duration = _first('num_frames_output')

    features = [
        height_to_feet(_first('player_height')),
        _first('player_weight'),
        _first('ball_land_x'),
        _first('ball_land_y'),
        _first('x'),
        _first('y'),
        final['x'],
        final['y'],
        final['a'],
        speed * math.sin(heading),
        speed * math.cos(heading),
        math.sin(facing),
        math.cos(facing),
        encode_play_dir(_first('play_direction')),
        encode_player_role(_first('player_role')),
        duration,
    ]
    return (np.array(features), duration)

def build_x0b(qb_df):
    """Assemble the static feature vector of the passer.

    Args:
        qb_df: Rows of the passer with a 0-based index; may be empty.

    Returns:
        1-D array of 11 values: height, weight, first x/y, last x/y, last
        acceleration, last velocity x/y, last orientation x/y. When
        ``qb_df`` is empty a fixed placeholder vector of the same length is
        returned instead.
    """
    if qb_df.empty:
        # No passer rows available: fall back to hard-coded placeholder values.
        return np.array([6.25, 220, 60, 26.6, 60, 26.6, 0, 0, 0, 0, 0])

    final = qb_df.iloc[-1]

    # Angles are stored in degrees; x component uses sin, y component uses cos.
    heading = math.radians(final['dir'])
    facing = math.radians(final['o'])
    speed = final['s']

    return np.array([
        height_to_feet(qb_df.loc[0, 'player_height']),
        qb_df.loc[0, 'player_weight'],
        qb_df.loc[0, 'x'],
        qb_df.loc[0, 'y'],
        final['x'],
        final['y'],
        final['a'],
        speed * math.sin(heading),
        speed * math.cos(heading),
        math.sin(facing),
        math.cos(facing),
    ])


def build_x1(player_df):
    """Build the per-frame feature matrix for one player.

    The input frame is NOT modified: derived columns are added to a new frame
    rather than written back onto ``player_df``.

    Args:
        player_df: Rows of a single player with columns ``x``, ``y``, ``s``,
            ``a``, ``dir`` and ``o`` (angles in degrees). May be empty.

    Returns:
        2-D array of shape ``(len(player_df), 10)`` with columns
        ``x, y, vx, vy, ax, ay, s, a, ox, oy``. ``vx``/``vy`` are the speed
        projected with sin/cos of ``dir``; ``ax``/``ay`` are the
        frame-to-frame differences of ``vx``/``vy`` (0 for the first frame);
        ``ox``/``oy`` are sin/cos of ``o``.
    """
    dir_radians = np.deg2rad(player_df['dir'])
    o_radians = np.deg2rad(player_df['o'])
    vx = player_df['s'] * np.sin(dir_radians)
    vy = player_df['s'] * np.cos(dir_radians)

    # Selecting columns and calling assign() both return new frames, so the
    # caller's DataFrame keeps its original columns.
    features = player_df[['x', 'y', 's', 'a']].assign(
        vx=vx,
        vy=vy,
        ax=vx.diff().fillna(0),  # first frame has no predecessor -> 0
        ay=vy.diff().fillna(0),
        ox=np.sin(o_radians),
        oy=np.cos(o_radians),
    )
    return features[['x', 'y', 'vx', 'vy', 'ax', 'ay', 's', 'a', 'ox', 'oy']].to_numpy()

def build_x2(player_df):
    """Return the player's last recorded position as a 2-element array ``[x, y]``."""
    final = player_df.iloc[-1]
    return np.array([final['x'], final['y']])


def build_inputs(gid, pid, nid, train=True):
    """Build every model input for one (game, play, player) triple.

    Args:
        gid: Game id.
        pid: Play id.
        nid: ``nfl_id`` of the player to build inputs for.
        train: When True the play is looked up in ``train_input_dict``,
            otherwise in ``test_input_dict``.

    Returns:
        tuple ``(x0, x1a, x1b, x2, dur)``:
            x0  -- float tensor: player static features followed by passer
                   static features.
            x1a -- array of per-frame features of the player.
            x1b -- array of per-frame features of the passer, or None when
                   the play has no row with role ``'Passer'``.
            x2  -- float tensor with the player's last (x, y).
            dur -- the duration value returned by ``build_x0a``.
    """
    plays = train_input_dict if train else test_input_dict
    in_df = plays[(gid, pid)]

    # Work on re-indexed copies: the builders read the row labelled 0.
    player_df = in_df[in_df['nfl_id'] == nid].copy().reset_index(drop=True)
    qb_df = in_df[in_df['player_role'] == 'Passer'].copy().reset_index(drop=True)

    player_static, dur = build_x0a(player_df)
    passer_static = build_x0b(qb_df)
    x0 = torch.from_numpy(np.concatenate((player_static, passer_static), axis=0)).float()

    x1a = build_x1(player_df)
    passer_frames = build_x1(qb_df)
    # An empty passer sequence is signalled with None rather than an empty array.
    x1b = None if passer_frames.size == 0 else passer_frames

    x2 = torch.from_numpy(build_x2(player_df)).float()

    return (x0, x1a, x1b, x2, dur)


In [6]:
# -- Train Dataset -- 

class TrainDataset(Dataset):
    """Training examples: model inputs plus the target trajectory of each player.

    Every container is indexed by example position. ``ids`` may be a pandas
    Series (with any index) or a plain sequence; it is always read by
    position so it stays aligned with the other containers.
    """

    def __init__(self, ids, x0, x1a, x1b, x2, duration, targets):
        # N: Number of training examples, L1: Length of input sequence, L2: Length of output sequence
        self.ids = ids                  # str list / Series  # shape: N
        self.x0 = x0                    # np.arr(N)     # tensor(27)
        self.x1a = x1a                  # np.arr(N)     # np.arr(L1, 10)
        self.x1b = x1b                  # np.arr(N)     # np.arr(L1, 10) or None
        self.x2 = x2                    # np.arr(N)     # tensor(2)
        self.targets = targets          # np.arr(N)     # np.arr(L2, 2)
        self.duration = duration        # int list      # shape: N

    def __len__(self):
        """Number of examples."""
        return len(self.x0)

    def _id_at(self, idx):
        """Return the id stored at position ``idx``.

        Plain ``ids[idx]`` on a pandas Series is label-based and raises
        KeyError when the index is not 0..N-1, so use ``.iloc`` when present.
        """
        ids = self.ids
        return ids.iloc[idx] if hasattr(ids, 'iloc') else ids[idx]

    def __getitem__(self, idx):
        """Return ``(id, x0, x1a, x1b, x2, targets, dur)`` for one example.

        ``x1a``, ``x1b`` and ``targets`` are converted to float32 tensors; a
        missing passer sequence (None) becomes an empty ``(0, features)``
        tensor so its second dimension matches ``x1a``.
        """
        example_id = self._id_at(idx)
        x0 = self.x0[idx]
        x1a = torch.tensor(self.x1a[idx], dtype=torch.float32)
        passer_frames = self.x1b[idx]
        if passer_frames is None:
            x1b = torch.empty(0, x1a.shape[1], dtype=torch.float32)
        else:
            x1b = torch.tensor(passer_frames, dtype=torch.float32)
        x2 = self.x2[idx]
        dur = self.duration[idx]
        targets = torch.tensor(self.targets[idx], dtype=torch.float32)
        return (example_id, x0, x1a, x1b, x2, targets, dur)

def build_train_ds(train_output):
    """Turn the output (target) frames into a ``TrainDataset``.

    Args:
        train_output: Frame with one row per (game_id, play_id, nfl_id,
            frame) holding the ``x``/``y`` positions to predict.

    Returns:
        TrainDataset with one example per (game_id, play_id, nfl_id); each
        example id is ``"<game_id>_<play_id>_<nfl_id>"``.
    """
    def _xy_array(frames):
        # All (x, y) rows of one player in one play, as a 2-D array.
        return frames.to_numpy()

    def _inputs_for(row):
        # Expand the 5-tuple from build_inputs into one column per element.
        return pd.Series(build_inputs(row['game_id'], row['play_id'], row['nfl_id']))

    examples = (
        train_output
        .groupby(['game_id', 'play_id', 'nfl_id'])[['x', 'y']]
        .apply(_xy_array)
        .reset_index(name='targets')
    )
    examples[['x0', 'x1a', 'x1b', 'x2', 'dur']] = examples.apply(_inputs_for, axis=1)

    game_part = examples["game_id"].astype(str)
    play_part = examples["play_id"].astype(str)
    player_part = examples["nfl_id"].astype(str)
    examples["id"] = game_part + "_" + play_part + "_" + player_part

    return TrainDataset(
        examples['id'],
        examples['x0'].to_numpy(),
        examples['x1a'].to_numpy(),
        examples['x1b'].to_numpy(),
        examples['x2'].to_numpy(),
        examples['dur'].to_numpy(),
        targets=examples['targets'].to_numpy(),
    )

# Build the training dataset from the concatenated output frames and persist it.
# NOTE(review): torch.save serializes the whole TrainDataset object, so loading the
# file back presumably needs this class definition in scope and, on recent PyTorch
# releases, torch.load(..., weights_only=False) -- confirm in the loading cell.
train_set = build_train_ds(train_output)
torch.save(train_set, "train_set.pt")

In [7]:
# -- Test Dataset -- 

class TestDataset(Dataset):
    """Test examples: the same model inputs as the training set, without targets.

    Every container is indexed by example position. ``ids`` may be a pandas
    Series (with any index) or a plain sequence; it is always read by
    position so it stays aligned with the other containers.
    """

    def __init__(self, ids, x0, x1a, x1b, x2, duration):
        # N: Number of test examples, L1: Length of input sequence
        self.ids = ids                  # str list / Series  # shape: N
        self.x0 = x0                    # np.arr(N)     # tensor(27)
        self.x1a = x1a                  # np.arr(N)     # np.arr(L1, 10)
        self.x1b = x1b                  # np.arr(N)     # np.arr(L1, 10) or None
        self.x2 = x2                    # np.arr(N)     # tensor(2)
        self.duration = duration        # int list      # shape: N

    def __len__(self):
        """Number of examples."""
        return len(self.x0)

    def _id_at(self, idx):
        """Return the id stored at position ``idx``.

        Uses ``.iloc`` for pandas objects (whose index may not be 0..N-1) and
        plain indexing for lists / arrays, which have no ``.iloc``.
        """
        ids = self.ids
        return ids.iloc[idx] if hasattr(ids, 'iloc') else ids[idx]

    def __getitem__(self, idx):
        """Return ``(id, x0, x1a, x1b, x2, dur)`` for one example.

        ``x1a`` and ``x1b`` are converted to float32 tensors; a missing passer
        sequence (None) becomes an empty ``(0, features)`` tensor so its
        second dimension matches ``x1a``.
        """
        example_id = self._id_at(idx)
        x0 = self.x0[idx]
        x1a = torch.tensor(self.x1a[idx], dtype=torch.float32)
        passer_frames = self.x1b[idx]
        if passer_frames is None:
            x1b = torch.empty(0, x1a.shape[1], dtype=torch.float32)
        else:
            x1b = torch.tensor(passer_frames, dtype=torch.float32)
        x2 = self.x2[idx]
        dur = self.duration[idx]
        return (example_id, x0, x1a, x1b, x2, dur)
  
def build_test_ds(test_instances):
    """Turn the test instance table into a ``TestDataset``.

    Args:
        test_instances: Frame with at least ``game_id``, ``play_id`` and
            ``nfl_id`` columns; repeated (game, play, player) rows are
            collapsed to their first occurrence.

    Returns:
        TestDataset with one example per unique (game_id, play_id, nfl_id);
        each example id is ``"<game_id>_<play_id>_<nfl_id>"``.
    """
    def _inputs_for(row):
        # train=False: look the play up in the test inputs.
        return pd.Series(build_inputs(row['game_id'], row['play_id'], row['nfl_id'], False))

    # drop_duplicates keeps the original row labels, so the index of
    # ``examples`` is generally not 0..N-1.
    examples = test_instances.drop_duplicates(subset=['game_id', 'play_id', 'nfl_id']).copy()
    examples[['x0', 'x1a', 'x1b', 'x2', 'dur']] = examples.apply(_inputs_for, axis=1)

    game_part = examples["game_id"].astype(str)
    play_part = examples["play_id"].astype(str)
    player_part = examples["nfl_id"].astype(str)
    examples["id"] = game_part + "_" + play_part + "_" + player_part

    return TestDataset(
        examples['id'],
        examples['x0'].to_numpy(),
        examples['x1a'].to_numpy(),
        examples['x1b'].to_numpy(),
        examples['x2'].to_numpy(),
        examples['dur'].to_numpy(),
    )


# Build the test dataset from the test instance table and persist it.
# NOTE(review): torch.save serializes the whole TestDataset object, so loading the
# file back presumably needs this class definition in scope and, on recent PyTorch
# releases, torch.load(..., weights_only=False) -- confirm in the loading cell.
test_set = build_test_ds(test_instances)
torch.save(test_set, "test_set.pt")