# Step: Downloading the dataset

To begin, connect to a runtime and make sure your Kaggle API token (`kaggle.json`) is saved in the top-level directory of your Google Drive (`MyDrive/`), so that the dataset can be downloaded directly through the Kaggle CLI.

In [2]:
#@title Run this cell to mount Google Drive and get `kaggle.json` from personal directory

# Colab-only: mounts Google Drive at /content/drive so that later cells can
# read files stored there (the download cell copies kaggle.json from MyDrive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#@title Run this cell to download the competition dataset to notebook directory

! pip install kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download nfl-big-data-bowl-2025

Downloading nfl-big-data-bowl-2025.zip to /content
100% 1.13G/1.14G [00:55<00:00, 17.9MB/s]
100% 1.14G/1.14G [00:55<00:00, 21.8MB/s]


### Install SportsTransformers_utils provided by SumerSports

In [4]:
# Path of the competition archive downloaded to /content by the Kaggle CLI cell above.
file_location = '/content/nfl-big-data-bowl-2025.zip'

In [5]:
!unzip {file_location}

Archive:  /content/nfl-big-data-bowl-2025.zip
  inflating: games.csv               
  inflating: player_play.csv         
  inflating: players.csv             
  inflating: plays.csv               
  inflating: tracking_week_1.csv     
  inflating: tracking_week_2.csv     
  inflating: tracking_week_3.csv     
  inflating: tracking_week_4.csv     
  inflating: tracking_week_5.csv     
  inflating: tracking_week_6.csv     
  inflating: tracking_week_7.csv     
  inflating: tracking_week_8.csv     
  inflating: tracking_week_9.csv     


# Stage: Prep Data

## Load in the data files from the data set

In [7]:
from argparse import ArgumentParser
from pathlib import Path

import polars as pl
import os

import random


# TEMPORARY CHANGE
# Raw CSVs are read from the notebook's working directory; prepared outputs
# are written under the mounted Google Drive folder.
INPUT_DATA_DIR = Path("./")
SPLIT_OUT_DIR = Path("./drive/My Drive/bdb-2025/prepped_data/split")
TRACKING_OUT_DIR = Path("./drive/My Drive/bdb-2025/prepped_data/tracking")

# Create both output directories (and any missing parents) up front.
for _out_dir in (SPLIT_OUT_DIR, TRACKING_OUT_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [8]:
def get_players_df() -> pl.DataFrame:
    """
    Load player-level data and derive height/weight features.

    Reads ``players.csv`` from ``INPUT_DATA_DIR`` and adds:
      * ``height_inches``: the ``height`` string, split on "-", converted as
        ``first_part * 12 + second_part``.
      * ``weight_Z`` / ``height_Z``: ``weight`` and ``height_inches``
        standardized by their column mean and standard deviation.

    Returns:
        pl.DataFrame: Player data with the additional feature columns.
    """
    # Native list/cast expressions replace a per-row Python lambda
    # (`map_elements`), keeping the conversion inside the polars engine.
    height_parts = pl.col("height").str.split("-")
    height_inches = (
        height_parts.list.get(0).cast(pl.Int64) * 12
        + height_parts.list.get(1).cast(pl.Int64)
    )

    return (
        pl.read_csv(INPUT_DATA_DIR / "players.csv", null_values=["NA", "nan", "N/A", "NaN", ""])
        .with_columns(height_inches=height_inches)
        # Second with_columns: the z-scores need `height_inches` to exist already.
        .with_columns(
            weight_Z=(pl.col("weight") - pl.col("weight").mean()) / pl.col("weight").std(),
            height_Z=(pl.col("height_inches") - pl.col("height_inches").mean()) / pl.col("height_inches").std(),
        )
    )

def get_plays_df() -> pl.DataFrame:
    """
    Read ``plays.csv`` from ``INPUT_DATA_DIR`` and add a ``distanceToGoal`` column.

    ``distanceToGoal`` is ``100 - yardlineNumber`` when ``possessionTeam``
    equals ``yardlineSide`` and ``yardlineNumber`` otherwise.

    Returns:
        pl.DataFrame: Play data with the ``distanceToGoal`` column appended.
    """
    plays = pl.read_csv(INPUT_DATA_DIR / "plays.csv", null_values=["NA", "nan", "N/A", "NaN", ""])

    # presumably this means the offense is still on its own side of the field — verify
    ball_on_possession_side = pl.col("possessionTeam") == pl.col("yardlineSide")
    distance_to_goal = (
        pl.when(ball_on_possession_side)
        .then(100 - pl.col("yardlineNumber"))
        .otherwise(pl.col("yardlineNumber"))
    )

    return plays.with_columns(distanceToGoal=distance_to_goal)

def get_tracking_df() -> pl.DataFrame:
    """
    Read the week-1 tracking CSV from ``INPUT_DATA_DIR`` without the football rows.

    Returns:
        pl.DataFrame: Tracking rows whose ``displayName`` is not "football".
    """
    # NOTE: Only processing week 1 for the sake of time.  Change "1" to "*" to process all weeks
    tracking_path = INPUT_DATA_DIR / "tracking_week_1.csv"
    raw_tracking = pl.read_csv(tracking_path, null_values=["NA", "nan", "N/A", "NaN", ""])

    # The football's own rows are not used in this project.
    is_player_row = pl.col("displayName") != "football"
    return raw_tracking.filter(is_player_row)

def add_features_to_tracking_df(
    tracking_df: pl.DataFrame,
    players_df: pl.DataFrame,
    plays_df: pl.DataFrame,
) -> pl.DataFrame:
    """
    Attach play- and player-level columns to the tracking rows.

    Adds ``position`` (from ``players_df``, matched on ``nflId`` and
    ``displayName``) and ``isDefense`` (1 when the row's ``club`` equals the
    play's ``defensiveTeam``, -1 otherwise).

    Args:
        tracking_df (pl.DataFrame): Tracking data
        players_df (pl.DataFrame): Player data
        plays_df (pl.DataFrame): Play data

    Returns:
        pl.DataFrame: Tracking data with the ``position`` and ``isDefense`` columns.

    Raises:
        AssertionError: If the number of rows changes during the joins.
    """
    n_rows_before = len(tracking_df)

    play_lookup = plays_df.select("gameId", "playId", "defensiveTeam")
    position_lookup = players_df.select(["nflId", "displayName", "position"]).unique()

    # 1 for rows of the defending club, -1 for everyone else.
    defense_flag = (
        pl.when(pl.col("club") == pl.col("defensiveTeam"))
        .then(pl.lit(1))
        .otherwise(pl.lit(-1))
    )

    enriched_df = tracking_df.join(play_lookup, on=["gameId", "playId"], how="inner")
    # Left join: rows without a matching player keep a null `position`.
    enriched_df = enriched_df.join(position_lookup, on=["nflId", "displayName"], how="left")
    enriched_df = enriched_df.with_columns(isDefense=defense_flag)
    # `defensiveTeam` was only needed to derive `isDefense`.
    enriched_df = enriched_df.drop(["defensiveTeam"])

    assert len(enriched_df) == n_rows_before, "Lost rows when joining tracking data with play/player data"

    return enriched_df

def remove_null_positions(tracking_df: pl.DataFrame) -> pl.DataFrame:
    """
    Drop every play that contains at least one row with a null ``position``.

    Note that the whole play (all rows sharing its ``gameId``/``playId``) is
    removed, not just the offending rows.

    Args:
        tracking_df (pl.DataFrame): Tracking data

    Returns:
        pl.DataFrame: Tracking data without the affected plays.
    """
    # Distinct (gameId, playId) pairs that have any null-position row.
    plays_with_null_position = (
        tracking_df.filter(pl.col("position").is_null())
        .select(["gameId", "playId"])
        .unique()
    )

    # Anti-join keeps only the rows whose play is NOT in that set.
    return tracking_df.join(plays_with_null_position, on=["gameId", "playId"], how="anti")


# Load in raw data
print("Load players")
players_df = get_players_df()
print("Load plays")
plays_df = get_plays_df()
print("Load tracking")
tracking_df = get_tracking_df()
# Row count before any play is dropped, for comparison with the count printed at the end.
print("tracking_df rows:", len(tracking_df))
print("Add features to tracking")
# Adds the `position` and `isDefense` columns; `tracking_df` is rebound to the enriched frame.
tracking_df = add_features_to_tracking_df(tracking_df, players_df, plays_df)
# The player table is not used again in this cell once it has been joined in.
del players_df
print("Remove null positions")
# Drops whole plays that contain any row with a null position.
tracking_df = remove_null_positions(tracking_df)
print("tracking_df rows:", len(tracking_df))

Load players
Load plays
Load tracking
tracking_df rows: 6795800
Add features to tracking
Remove null positions
tracking_df rows: 6233458


In [9]:
# Map each position label present in tracking_df to an integer index,
# assigned in alphabetical order of the labels.

unique_positions = {
    label: index
    for index, label in enumerate(sorted(tracking_df['position'].unique().to_list()))
}
unique_positions

{'C': 0,
 'CB': 1,
 'DE': 2,
 'DT': 3,
 'FB': 4,
 'FS': 5,
 'G': 6,
 'ILB': 7,
 'LB': 8,
 'MLB': 9,
 'NT': 10,
 'OLB': 11,
 'QB': 12,
 'RB': 13,
 'SS': 14,
 'T': 15,
 'TE': 16,
 'WR': 17}

NOTE: there are currently two `tracking` dataframes. One is the consolidated one; the other augments the direction of all players so that they are all assumed to be moving in the same direction.

## Explore model target --> Position of Masked player

* Target dim.: (3, )
  - first dim.: `displayName` of masked player
  - second dim.: `x` coordinate of masked player on the field at the frame
  - third dim.: `y` coordinate of masked player on the field at the frame

## Goal: Randomly select a player and predict their x-y coordinates by removing them from the input sequence and having the model estimate their motion based on the locations of the 21 other players.



In [10]:
def augment_data(tracking_df):
    """
    Build the masked-player variants of every play and save them to disk.

    For each (gameId, playId) and each defensive player (``isDefense == 1``)
    in that play, a copy of the play is created in which that player's
    ``position`` is replaced by "MASKED" and their ``x``/``y`` by -1. All
    copies of a play are concatenated (one block of rows per masked player)
    and written to ``TRACKING_OUT_DIR / "game_{gameId}_play_{playId}.parquet"``.

    Plays without any defensive player produce no file.

    Args:
        tracking_df (pl.DataFrame): Tracking data with at least the columns
            gameId, playId, frameId, displayName, position, x, y, isDefense.

    Returns:
        None: The results are written to parquet files only.
    """
    out_columns = ["gameId", "playId", "frameId", "displayName", "position", "x", "y"]

    # Keep only the columns that are actually used, then split the frame into
    # plays ONCE instead of re-filtering the full table for every play and
    # again for every masked player.
    plays = tracking_df.select(out_columns + ["isDefense"]).partition_by(
        ["gameId", "playId"], as_dict=True
    )

    for (game_id, play_id), play_df in plays.items():
        # maintain_order makes the order of the masked variants reproducible.
        unique_names = (
            play_df.filter(pl.col("isDefense") == 1)["displayName"]
            .unique(maintain_order=True)
            .to_list()
        )
        if not unique_names:
            # Nothing to mask; pl.concat would fail on an empty list.
            continue

        masked_variants = []
        for player_name in unique_names:
            # NOTE(review): masking is by displayName, so two players sharing
            # a name within one play would be masked together.
            is_masked = pl.col("displayName") == player_name
            variant_df = play_df.with_columns(
                pl.when(is_masked)
                .then(pl.lit("MASKED"))
                .otherwise(pl.col("position"))
                .alias("position"),
                pl.when(is_masked)
                .then(pl.lit(-1))
                .otherwise(pl.col("x"))
                .alias("x"),
                pl.when(is_masked)
                .then(pl.lit(-1))
                .otherwise(pl.col("y"))
                .alias("y"),
            )

            # Assert that the masked player's position is now "MASKED"
            masked_player_df = variant_df.filter(is_masked)
            assert masked_player_df["position"].unique().to_list() == ["MASKED"], f"Masked player {player_name} position is not 'MASKED'"
            masked_variants.append(variant_df.select(out_columns))

        # One file per play, holding all of its masked variants back to back.
        pl.concat(masked_variants).write_parquet(
            TRACKING_OUT_DIR / f"game_{game_id}_play_{play_id}.parquet"
        )
# augment_data saves one parquet file per play to TRACKING_OUT_DIR and has no
# return statement, so `rel_tracking_df` is bound to None here.
print(f"Augment tracking data")
rel_tracking_df = augment_data(tracking_df)

Augment tracking data


In [11]:
def get_target_variable(tracking_df):
    """
    Extract the model targets: the unmasked rows of all defensive players.

    Args:
        tracking_df (pl.DataFrame): Tracking data with an ``isDefense`` column

    Returns:
        pl.DataFrame: Rows with ``isDefense == 1``, restricted to the columns
        gameId, playId, frameId, displayName, position, x and y.
    """
    target_columns = ["gameId", "playId", "frameId", "displayName", "position", "x", "y"]
    defenders_df = tracking_df.filter(pl.col("isDefense") == 1)
    return defenders_df.select(target_columns)

# Build the target table from the prepared tracking data and show it as the cell output.
print("Generate target - maskedPlayers")
maskedPlayers_df = get_target_variable(tracking_df)
maskedPlayers_df

Generate target - maskedPlayers


gameId,playId,frameId,displayName,position,x,y
i64,i64,i64,str,str,f64,f64
2022091200,64,1,"""Kareem Jackson""","""SS""",51.06,28.55
2022091200,64,2,"""Kareem Jackson""","""SS""",51.13,28.57
2022091200,64,3,"""Kareem Jackson""","""SS""",51.2,28.59
2022091200,64,4,"""Kareem Jackson""","""SS""",51.26,28.62
2022091200,64,5,"""Kareem Jackson""","""SS""",51.32,28.65
…,…,…,…,…,…,…
2022090800,3696,171,"""Christian Benford""","""CB""",3.31,2.22
2022090800,3696,172,"""Christian Benford""","""CB""",3.08,2.18
2022090800,3696,173,"""Christian Benford""","""CB""",2.86,2.15
2022090800,3696,174,"""Christian Benford""","""CB""",2.65,2.14


## Splitting data into train, validation, and test sets.

In [22]:
def _load_play_features(play_ids: pl.DataFrame) -> pl.DataFrame:
    """Read and concatenate the per-play feature files for each (gameId, playId) row of `play_ids`."""
    per_play_dfs = [
        pl.read_parquet(TRACKING_OUT_DIR / f"game_{game_id}_play_{play_id}.parquet")
        for game_id, play_id in play_ids.rows()
    ]
    return pl.concat(per_play_dfs)


def _report_split(label: str, features_df: pl.DataFrame) -> tuple[int, int]:
    """Print the number of distinct plays and frames in a split and return both counts."""
    n_plays = features_df.n_unique(["gameId", "playId"])
    n_frames = features_df.n_unique(["gameId", "playId", "frameId"])
    print(
        f"{label} set: {n_plays} plays,",
        f"{n_frames} frames",
    )
    return n_plays, n_frames


def split_train_test_val(target_df: pl.DataFrame) -> dict[str, pl.DataFrame]:
    """
    Split data into train, validation, and test sets.
    Split is 70-15-15 for train-test-val respectively. Notably, we split at the play level and not frame level.
    This ensures no target contamination between splits.

    The features of each split are read from the per-play parquet files in
    ``TRACKING_OUT_DIR`` (``game_{gameId}_play_{playId}.parquet``), so those
    files must exist for every play in ``target_df``.

    Args:
        target_df (pl.DataFrame): Target data

    Returns:
        dict: Dictionary containing the feature and target dataframes of the
        train, test, and validation splits.
    """
    play_key = ["gameId", "playId"]

    # Sorting first makes the seeded sampling below independent of input row order.
    target_df = target_df.sort(play_key)

    # 30% of the plays are held out, then halved into test and validation.
    test_val_ids = target_df.select(play_key).unique(maintain_order=True).sample(fraction=0.3, seed=42)

    train_tgt_df = target_df.join(test_val_ids, on=play_key, how="anti")
    train_ids = train_tgt_df.select(play_key).unique(maintain_order=True)
    train_tracking_df = _load_play_features(train_ids)
    train_plays, train_frames = _report_split("Train", train_tracking_df)

    test_ids = test_val_ids.sample(fraction=0.5, seed=42)  # 70-15-15 split
    test_tgt_df = target_df.join(test_ids, on=play_key, how="inner")
    test_tracking_df = _load_play_features(test_ids)
    test_plays, test_frames = _report_split("Test", test_tracking_df)

    # Validation = held-out plays that were not drawn for the test set.
    val_ids = test_val_ids.join(test_ids, on=play_key, how="anti")
    val_tgt_df = target_df.join(val_ids, on=play_key, how="inner")
    val_tracking_df = _load_play_features(val_ids)
    val_plays, val_frames = _report_split("Validation", val_tracking_df)

    print(
        f"Total set: {train_plays + test_plays + val_plays} plays,",
        f"{train_frames + test_frames + val_frames} frames",
    )

    return {
        "train_features": train_tracking_df,
        "train_targets": train_tgt_df,
        "test_features": test_tracking_df,
        "test_targets": test_tgt_df,
        "val_features": val_tracking_df,
        "val_targets": val_tgt_df,
    }


# The split reads the per-play parquet files from TRACKING_OUT_DIR, so the
# augmentation cell must have been run before this one.
print("Split train/test/val")
split_dfs = split_train_test_val(maskedPlayers_df)

Split train/test/val
Train set: 1258 plays, 198599 frames
Test set: 269 plays, 42793 frames
Validation set: 269 plays, 41947 frames
Total set: 1796 plays, 283339 frames


In [23]:
# Export splits to SPLIT_OUT_DIR, one parquet file per entry of split_dfs.

sort_keys = ["gameId", "playId", "frameId"]
for key, df in split_dfs.items():
    # Many rows share the same (gameId, playId, frameId): one per player and,
    # in the feature frames, one per masked variant. A stable sort keeps those
    # tied rows in their original relative order instead of an arbitrary one,
    # so the exported files are reproducible.
    df.sort(sort_keys, maintain_order=True).write_parquet(SPLIT_OUT_DIR / f"{key}.parquet")