In [1]:
# import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

import tqdm
import xgboost
from xgboost import XGBClassifier

import plotly.graph_objects as go

In [2]:
# Base directory for every data path built below: the kernel's current working
# directory at the moment this cell runs (so the notebook must be launched from,
# or chdir'd to, the folder that contains "BigDataBowl2025").
filepath = os.getcwd()

In [3]:
# Single source of truth for the dataset folder, so relocating the data means
# editing one line instead of seven.
DATA_DIR = os.path.join(filepath, "BigDataBowl2025/nfl-big-data-bowl-2025")


def _tracking_files(first_week, last_week):
    """Return the tracking CSV paths for weeks first_week..last_week (inclusive)."""
    return [
        os.path.join(DATA_DIR, f"tracking_week_{week}.csv")
        for week in range(first_week, last_week + 1)
    ]


# Lookup tables.
player_play_file = os.path.join(DATA_DIR, "player_play.csv")
games_file = os.path.join(DATA_DIR, "games.csv")
players_file = os.path.join(DATA_DIR, "players.csv")
plays_file = os.path.join(DATA_DIR, "plays.csv")

# Tracking data is processed in three batches of three weeks each.
tracking_files_train1 = _tracking_files(1, 3)
tracking_files_train2 = _tracking_files(4, 6)
tracking_files_train3 = _tracking_files(7, 9)

In [4]:
# Load the four lookup CSVs into notebook-level DataFrames.
# NOTE(review): combine_train_data() reads the CSV files it needs from their
# paths on its own; it does not take these frames as input.
players_play_df = pd.read_csv(player_play_file)
play_df = pd.read_csv(plays_file)
player_df = pd.read_csv(players_file)
games_df = pd.read_csv(games_file)

In [6]:
def combine_train_data(player_play_file, players_file, plays_file, tracking_files):
    """
    Build one frame-level table from the raw CSV files.

    The tracking files are stacked row-wise and then left-joined, in order, to
    the player-play table on (gameId, playId, nflId), to the plays table on
    (gameId, playId) and to the players table on nflId. Because every join is a
    left join, each tracking row is kept even when it has no match.

    Only the files passed as arguments are read; the function does not depend
    on any notebook-level variable.

    Args:
        player_play_file (str): Path to the player-play CSV.
        players_file (str): Path to the players CSV. Its ``displayName`` column
            is dropped before joining (presumably because the tracking files
            already carry one -- TODO confirm).
        plays_file (str): Path to the plays CSV.
        tracking_files (list[str]): Paths to one or more tracking CSVs.

    Returns:
        pd.DataFrame: One row per tracking row, with the joined columns added.

    Raises:
        ValueError: If ``tracking_files`` is empty (raised by ``pd.concat``).
    """
    plays_df = pd.read_csv(plays_file)
    player_play_df = pd.read_csv(player_play_file)
    players_df = pd.read_csv(players_file).drop(columns="displayName")

    tracking_df = pd.concat(
        [pd.read_csv(tracking_file) for tracking_file in tracking_files]
    )

    # Rebind a single name so each intermediate join can be released once the
    # next one has been built.
    merged = pd.merge(tracking_df, player_play_df, on=["gameId", "playId", "nflId"], how="left")
    merged = pd.merge(merged, plays_df, on=["gameId", "playId"], how="left")
    merged = pd.merge(merged, players_df, on=["nflId"], how="left")

    return merged

In [7]:
def keep_motion_cols(df):
    """
    Return a copy of ``df`` restricted to the columns used for motion analysis.

    Columns of ``df`` that are not in the allow-list below are dropped; names
    in the allow-list that ``df`` does not have are simply ignored. The
    original column order is preserved.

    Args:
        df (pd.DataFrame): Any frame.

    Returns:
        pd.DataFrame: New frame holding only the allowed columns.
    """
    wanted = {
        # identifiers
        "gameId", "playId", "nflId", "frameId", "frameType",
        # who is who
        "club", "possessionTeam", "defensiveTeam", "position", "jerseyNumber",
        # standardised tracking values
        "o_std", "a_std", "s_std", "x_std", "y_std", "dis_std", "dir_std", "los_std",
        # formation / coverage
        "offensiveFormation", "receiverAlignment", "pff_manZone",
        # game situation
        "down", "yardsToGo", "yardlineNumber", "absoluteYardlineNumber",
        # motion flags
        "inMotionAtBallSnap", "shiftSinceLineset", "motionSinceLineset",
        # route info and frame event
        "wasRunningRoute", "routeRan", "event",
    }

    unwanted = []
    for name in df.columns:
        if name not in wanted:
            unwanted.append(name)

    return df.drop(columns=unwanted)

In [8]:
def get_lineset_frame(df):
    """
    Extract the frame id of every row whose event is 'line_set'.

    Args:
        df (pd.DataFrame): Frame with gameId, playId, nflId, frameId and event.

    Returns:
        pd.DataFrame: Columns gameId, playId, nflId and linesetFrameId (the
        renamed frameId), one row per matching input row.
    """
    is_line_set = df["event"] == "line_set"
    key_cols = ["gameId", "playId", "nflId", "frameId"]
    return df.loc[is_line_set, key_cols].rename(columns={"frameId": "linesetFrameId"})

In [9]:
def get_motion_frame(df):
    """
    Extract the frame id of every row whose event is 'man_in_motion'.

    Args:
        df (pd.DataFrame): Frame with gameId, playId, nflId, frameId and event.

    Returns:
        pd.DataFrame: Columns gameId, playId, nflId and motionFrameId (the
        renamed frameId), one row per matching input row.
        NOTE(review): nothing here de-duplicates; if one (gameId, playId,
        nflId) has several 'man_in_motion' rows, a merge on those keys will
        repeat rows -- confirm against the data.
    """
    is_motion_event = df["event"] == "man_in_motion"
    key_cols = ["gameId", "playId", "nflId", "frameId"]
    return df.loc[is_motion_event, key_cols].rename(columns={"frameId": "motionFrameId"})

In [10]:
def get_snap_frame(df):
    """
    Extract the frame id of every row whose frameType is 'SNAP'.

    Args:
        df (pd.DataFrame): Frame with gameId, playId, nflId, frameId and
            frameType.

    Returns:
        pd.DataFrame: Columns gameId, playId, nflId and snapFrameId (the
        renamed frameId), one row per matching input row.
    """
    is_snap = df["frameType"] == "SNAP"
    key_cols = ["gameId", "playId", "nflId", "frameId"]
    return df.loc[is_snap, key_cols].rename(columns={"frameId": "snapFrameId"})

In [11]:
def get_clean_frames(df):
    """
    Keep the rows whose frameId lies in [motionFrameId, snapFrameId].

    Both bounds are inclusive. Rows where either bound is missing compare as
    False and are therefore dropped.

    Args:
        df (pd.DataFrame): Frame with frameId, motionFrameId and snapFrameId.

    Returns:
        pd.DataFrame: The selected rows, original index preserved.
    """
    frame_id = df["frameId"]
    at_or_after_motion = frame_id >= df["motionFrameId"]
    at_or_before_snap = frame_id <= df["snapFrameId"]
    return df[at_or_after_motion & at_or_before_snap]

In [12]:
def clean_data(df):
    """
    Standardise tracking values and keep only the pre-snap motion window of
    non-nullified pass plays.

    Standardisation makes every play read as if the offense moved to the
    right. For plays whose ``playDirection`` is 'left' the position is
    reflected through the field centre (x -> 120 - x, y -> 160/3 - y), which is
    a 180 degree rotation, so the angles are rotated by 180 degrees as well.

    Angles are first converted with ``(90 - angle) % 360``.
    NOTE(review): that conversion assumes the raw ``o`` / ``dir`` values are
    measured clockwise from the +y axis, giving counter-clockwise-from-+x
    output -- confirm against the data dictionary.

    Side effect: the ``*_std``, ``los_std`` and ``isRun`` columns are written
    onto the frame that is passed in.

    Args:
        df (pd.DataFrame): Frame with playDirection, o, dir, x, y, s, a, dis,
            absoluteYardlineNumber, passResult, frameId, motionFrameId,
            snapFrameId and playNullifiedByPenalty.

    Returns:
        pd.DataFrame: Filtered rows, reduced by ``keep_motion_cols``.
    """
    moving_left = df["playDirection"] == "left"

    ## STANDARDIZE THE DATA
    for raw_col, std_col in (("o", "o_std"), ("dir", "dir_std")):
        # Convert first, then rotate the *converted* angle for left-moving plays.
        angle = (90 - df[raw_col]) % 360
        df[std_col] = np.where(moving_left, (angle + 180) % 360, angle)

    df["x_std"] = np.where(moving_left, 120 - df["x"], df["x"])
    df["y_std"] = np.where(moving_left, 160 / 3 - df["y"], df["y"])

    # Speed, acceleration and per-frame distance do not depend on direction.
    df["s_std"] = df["s"]
    df["a_std"] = df["a"]
    df["dis_std"] = df["dis"]

    df["los_std"] = np.where(moving_left,
                             120 - df["absoluteYardlineNumber"],
                             df["absoluteYardlineNumber"])

    ## FILTER RUN/PASS
    # A play counts as a run when passResult is missing or equals "R".
    df["isRun"] = df["passResult"].isna() | (df["passResult"] == "R")

    ## REMOVE RUN PLAYS, FRAMES OUTSIDE [MOTION, SNAP], AND PENALTY-NULLIFIED PLAYS
    # One combined mask avoids materialising three intermediate frames.
    keep = (
        ~df["isRun"]
        & (df["frameId"] >= df["motionFrameId"])
        & (df["frameId"] <= df["snapFrameId"])
        & (df["playNullifiedByPenalty"] != "Y")
    )

    return keep_motion_cols(df[keep])

In [13]:
def get_qb_coords(df):
    """
    Pull the standardised position of every row whose position is 'QB'.

    Args:
        df (pd.DataFrame): Frame with position, gameId, playId, frameId,
            x_std and y_std.

    Returns:
        pd.DataFrame: Columns gameId, playId, frameId, qb_x, qb_y -- one row
        per QB row in ``df``.
    """
    is_qb = df["position"] == "QB"
    coord_cols = ["gameId", "playId", "frameId", "x_std", "y_std"]
    renames = {"x_std": "qb_x", "y_std": "qb_y"}
    return df.loc[is_qb, coord_cols].rename(columns=renames)

In [14]:
def get_ball_coords(df):
    """
    Pull the standardised position of every row whose club is 'football'.

    Args:
        df (pd.DataFrame): Frame with club, gameId, playId, frameId, x_std
            and y_std.

    Returns:
        pd.DataFrame: Columns gameId, playId, frameId, ball_x, ball_y -- one
        row per football row in ``df``.
    """
    is_ball = df["club"] == "football"
    coord_cols = ["gameId", "playId", "frameId", "x_std", "y_std"]
    renames = {"x_std": "ball_x", "y_std": "ball_y"}
    return df.loc[is_ball, coord_cols].rename(columns=renames)

In [15]:
def get_tackle_coords(group):
    """
    Summarise one frame's tackle rows as a single row of outer-tackle positions.

    Rows are ordered by ``y_std`` descending; the row with the largest
    ``y_std`` is reported as ``lt_*`` and the row with the smallest as
    ``rt_*``. The gameId / playId / frameId values are taken from the
    largest-``y_std`` row.

    Args:
        group (pd.DataFrame): Rows with gameId, playId, frameId, x_std, y_std.

    Returns:
        pd.DataFrame: One row with gameId, playId, frameId, lt_x, lt_y, rt_x,
        rt_y -- or an empty DataFrame when ``group`` has fewer than two rows.
    """
    ordered = group.sort_values(by='y_std', ascending=False).reset_index(drop=True)

    # Guard clause: two rows are needed to define both edges.
    if len(ordered) < 2:
        return pd.DataFrame()

    top_row = ordered.iloc[0]
    bottom_row = ordered.iloc[-1]

    values = {
        'gameId': top_row['gameId'],
        'playId': top_row['playId'],
        'frameId': top_row['frameId'],
        'lt_x': top_row['x_std'],
        'lt_y': top_row['y_std'],
        'rt_x': bottom_row['x_std'],
        'rt_y': bottom_row['y_std'],
    }
    # Wrap each scalar in a one-element list to get a single-row frame.
    return pd.DataFrame({name: [value] for name, value in values.items()})

In [16]:
def assign_tackle_coords(df):
    """
    Compute outer-tackle coordinates for every (gameId, playId, frameId).

    Applies ``get_tackle_coords`` to each frame's rows and stacks the per-frame
    results under a fresh 0..n-1 index.

    Args:
        df (pd.DataFrame): Tackle rows with gameId, playId, frameId, x_std
            and y_std.

    Returns:
        pd.DataFrame: The stacked ``get_tackle_coords`` outputs.
    """
    frame_keys = ['gameId', 'playId', 'frameId']
    per_frame = df.groupby(frame_keys).apply(get_tackle_coords)
    return per_frame.reset_index(drop=True)

In [27]:
def get_motion_dataset(full_df):
    """
    Build the frame-level table that motion classification runs on.

    Steps:
      1. Attach each player's motionFrameId and snapFrameId to every row.
      2. Run ``clean_data`` on the result.
      3. Keep rows whose position is QB / WR / C / G / T, plus football rows.
      4. Select WR rows that ran a route, moved or shifted since the line was
         set, and whose play has pff_manZone equal to 'Man'.
      5. Left-join QB, ball and outer-tackle coordinates onto those WR rows
         by (gameId, playId, frameId).

    Args:
        full_df (pd.DataFrame): Output of ``combine_train_data``.

    Returns:
        pd.DataFrame: The selected WR rows with qb_*, ball_*, lt_* and rt_*
        columns added.
    """
    player_keys = ['gameId', 'playId', 'nflId']
    frame_keys = ['gameId', 'playId', 'frameId']

    motion_frames = get_motion_frame(full_df)
    snap_frames = get_snap_frame(full_df)

    # Reuse one name across the successive joins.
    merged = pd.merge(full_df, motion_frames, on=player_keys, how='left')
    merged = pd.merge(merged, snap_frames, on=player_keys, how='left')
    cleaned = clean_data(merged)

    is_offense_or_ball = (
        np.isin(cleaned['position'], ['QB', 'WR', 'C', 'G', 'T'])
        | (cleaned['club'] == 'football')
    )
    offense = cleaned[is_offense_or_ball]

    qb_coords = get_qb_coords(offense)
    ball_coords = get_ball_coords(offense)

    # All receiver conditions combined into a single row mask.
    is_target_receiver = (
        (offense['wasRunningRoute'] == True)
        & ((offense['motionSinceLineset'] == True) | (offense['shiftSinceLineset'] == True))
        & (offense['position'] == 'WR')
        & (offense['pff_manZone'] == 'Man')
    )
    receivers = offense[is_target_receiver]

    tackle_coords = assign_tackle_coords(offense[offense['position'] == 'T'])

    result = receivers
    for extra in (qb_coords, ball_coords, tackle_coords):
        result = pd.merge(result, extra, on=frame_keys, how='left')

    return result
    

In [32]:
def classify_motion_type(group, one_way_tol=0.25, min_return_displacement=3.0):
    """
    Label one player's pre-snap motion on one play.

    Args:
        group (pd.DataFrame): Rows for a single player during a single play,
            in frame order (first row = start of the window, last row = end).
            Needs x_std, y_std, ball_y, qb_x, lt_y, rt_y, event and
            inMotionAtBallSnap. The frame is not modified.
        one_way_tol (float): Largest allowed gap between the farthest distance
            reached from the start and the distance at the last row for the
            motion to count as one-way (fly / jet / short).
        min_return_displacement (float): Farthest distance from the start that
            must be exceeded for a 'return' label.

    Returns:
        str: One of 'orbit', 'fly', 'jet', 'short', 'return' or 'other'.
    """
    x = group['x_std']
    y = group['y_std']
    first_row = group.iloc[0]
    last_row = group.iloc[-1]

    start_x, start_y = first_row['x_std'], first_row['y_std']
    ball_y = first_row['ball_y']
    qb_start_x = first_row['qb_x']
    end_y = last_row['y_std']
    end_lt_y, end_rt_y = last_row['lt_y'], last_row['rt_y']

    # Distance from the starting spot at every frame. Taking the final value
    # from the same series guarantees final_displacement <= max_displacement.
    distance_from_start = np.sqrt((x - start_x) ** 2 + (y - start_y) ** 2)
    final_displacement = distance_from_start.iloc[-1]
    max_displacement = distance_from_start.max()
    displacement_diff = max_displacement - final_displacement

    # Positional (not label-based) location of the snap row. Rows passed in by
    # groupby keep their original index labels, so labels must not be used
    # with .iloc. If no row is tagged 'ball_snap', treat the last row as the snap.
    snap_hits = np.flatnonzero((group['event'] == 'ball_snap').to_numpy())
    snap_pos = int(snap_hits[0]) if snap_hits.size else len(group) - 1
    y_by_snap = y.iloc[:snap_pos + 1]

    # Has the player been on both sides of the ball's starting y?
    crosses_center = bool((y < ball_y).any() and (y > ball_y).any())
    crosses_center_by_snap = bool((y_by_snap < ball_y).any() and (y_by_snap > ball_y).any())

    # Is the player laterally between the two outer tackles at the last row?
    # Missing tackle coordinates compare as False.
    behind_line_at_snap = bool((end_y < end_lt_y) and (end_y > end_rt_y))

    # Position relative to the QB.
    start_ahead_qb = bool(start_x > qb_start_x)
    go_behind_qb = bool((x < group['qb_x']).any())

    # A missing flag must count as "not in motion"; bool(NaN) would be True.
    in_motion_flag = last_row['inMotionAtBallSnap']
    in_motion_at_snap = bool(in_motion_flag) if pd.notna(in_motion_flag) else False

    one_way = displacement_diff < one_way_tol
    significant_displacement = max_displacement > min_return_displacement

    # Classify motions (first matching rule wins).
    if start_ahead_qb and go_behind_qb and in_motion_at_snap and behind_line_at_snap:
        motion_type = 'orbit'
    elif crosses_center_by_snap and not behind_line_at_snap and one_way:
        motion_type = 'fly'
    elif behind_line_at_snap and one_way:
        motion_type = 'jet'
    elif not crosses_center and one_way:
        motion_type = 'short'
    elif significant_displacement and final_displacement < max_displacement:
        motion_type = 'return'
    else:
        motion_type = 'other'

    return motion_type


def classify_motions(data):
    """
    Label the motion of every player on every play.

    Rows are grouped by (gameId, playId, nflId) and each group is passed to
    ``classify_motion_type``.

    Args:
        data (pd.DataFrame): Frame-level motion data, e.g. the output of
            ``get_motion_dataset``.

    Returns:
        pd.DataFrame: Columns gameId, playId, nflId and motionType, one row
        per group.
    """
    player_play_keys = ['gameId', 'playId', 'nflId']

    # One label per group, indexed by the three grouping keys.
    labels = data.groupby(player_play_keys).apply(classify_motion_type)

    # Move the keys out of the index and name the label column.
    label_table = labels.reset_index()
    label_table.columns = player_play_keys + ['motionType']
    return label_table

In [17]:
# Build the merged frame-level table once per batch of tracking files.
full_df1 = combine_train_data(
    player_play_file=player_play_file,
    players_file=players_file,
    plays_file=plays_file,
    tracking_files=tracking_files_train1,
)
full_df2 = combine_train_data(
    player_play_file=player_play_file,
    players_file=players_file,
    plays_file=plays_file,
    tracking_files=tracking_files_train2,
)
full_df3 = combine_train_data(
    player_play_file=player_play_file,
    players_file=players_file,
    plays_file=plays_file,
    tracking_files=tracking_files_train3,
)

In [22]:
# Stack the three batch frames row-wise into one frame. No ignore_index is
# passed, so each batch keeps its own row labels and labels repeat across batches.
# NOTE(review): the cells visible below work on full_df1..3, not on full_df;
# confirm this combined copy is still needed before paying its memory cost.
full_df = pd.concat([full_df1, full_df2, full_df3])
# Optional CSV export of the combined frame (disabled):
# full_path = os.path.join(filepath, "BigDataBowl2025/nfl-big-data-bowl-2025/full_df.csv")
# full_df.to_csv(full_path)

In [28]:
# Derive the receiver-motion table for each batch independently.
motion_df1 = get_motion_dataset(full_df=full_df1)
motion_df2 = get_motion_dataset(full_df=full_df2)
motion_df3 = get_motion_dataset(full_df=full_df3)

In [None]:
# Label every (game, play, player) group in each batch, then stack the results.
motion_classification_df1 = classify_motions(data=motion_df1)
motion_classification_df2 = classify_motions(data=motion_df2)
motion_classification_df3 = classify_motions(data=motion_df3)

motion_classification_df = pd.concat(
    [
        motion_classification_df1,
        motion_classification_df2,
        motion_classification_df3,
    ]
)

# Optional CSV export of the labels (disabled):
# final_path = os.path.join(filepath, "BigDataBowl2025/nfl-big-data-bowl-2025/motion_classes.csv")
# motion_classification_df.to_csv(final_path)