In [1]:
"""
01_data_exploration.ipynb
Purpose: explore with initial NFL tracking data
Key Notes:
setup
    - input: training features (metadata, tracking data, context, frames)
    - output: target variables (player stats, play outcomes)
    - no nulls in input data
initial data exploration
    - play_id is not globally unique
    - ~12-13 players are tracked per frame
    - ~3-4 players are marked for prediction
    - all players tracked are solely players involved with pass (dbs, wrs, etc.)
    - 70% of predicted players are defensive
October 2025
"""

'\n01_data_exploration.ipynb\nPurpose: explore with initial NFL tracking data\nKey Notes:\nsetup\n    - input: training features (metadata, tracking data, context, frames)\n    - output: target variables (player stats, play outcomes)\n    - no nulls in input data\ninitial data exploration\n    - play_id is not globally unique\n    - ~12-13 players are tracked per frame\n    - ~3-4 players are marked for prediction\n    - all players tracked are solely players involved with pass (dbs, wrs, etc.)\n    - 70% of predicted players are defensive\nOctober 2025\n'

In [2]:
# Set up
import pandas as pd 
import numpy as np

input_df = pd.read_csv('../data/train/input_2023_w01.csv')
print(input_df.head())

output_df = pd.read_csv('../data/train/output_2023_w01.csv')
print(output_df.head())

supplemental_df = pd.read_csv('../data/supplementary_data.csv')

      game_id  play_id  player_to_predict  nfl_id  frame_id play_direction  \
0  2023090700      101              False   54527         1          right   
1  2023090700      101              False   54527         2          right   
2  2023090700      101              False   54527         3          right   
3  2023090700      101              False   54527         4          right   
4  2023090700      101              False   54527         5          right   

   absolute_yardline_number player_name player_height  player_weight  ...  \
0                        42  Bryan Cook           6-1            210  ...   
1                        42  Bryan Cook           6-1            210  ...   
2                        42  Bryan Cook           6-1            210  ...   
3                        42  Bryan Cook           6-1            210  ...   
4                        42  Bryan Cook           6-1            210  ...   

          player_role      x      y     s     a     dir       o  \
0

  supplemental_df = pd.read_csv('../data/supplementary_data.csv')


In [3]:
"""
Notes: 
- Column names: 'game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id',
       'play_direction', 'absolute_yardline_number', 'player_name',
       'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a',
       'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y'
- No nulls
737 unique players
"""
print (input_df.shape)
print (input_df.columns)
print (input_df['nfl_id'].nunique())
print (input_df['play_id'].describe())
print (input_df.isnull().sum())

(285714, 23)
Index(['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id',
       'play_direction', 'absolute_yardline_number', 'player_name',
       'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a',
       'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y'],
      dtype='object')
737
count    285714.000000
mean       2284.758118
std        1239.145629
min          55.000000
25%        1232.000000
50%        2263.000000
75%        3302.000000
max        4699.000000
Name: play_id, dtype: float64
game_id                     0
play_id                     0
player_to_predict           0
nfl_id                      0
frame_id                    0
play_direction              0
absolute_yardline_number    0
player_name                 0
player_height               0
player_weight               0
player_birth_date           0
player_position             0
player_side                 0
player_r

In [4]:
""" 
GOAL: Counting highest number of players tracked in a single play

Notes:
- found out play_id values are not globally unique
- each game starts play_id from zero
- changed ['play_id','frame_id'] to ['play_id','frame_id', 'game_id']
- originally outputted 39; examined the specific play_id further; saw player_names from 3+ teams
- 14 is still low
"""
highest_num_players = input_df.groupby(['play_id','frame_id', 'game_id'])['nfl_id'].nunique().max()
highest_num_players

14

In [5]:
"""
GOAL: Get player counts for all frames for prev analysis

Notes: 
- Only 12-13 players are tracked per play instead of the full 22
- Focuses more on the relevant players
"""
players_per_frame = input_df.groupby(['game_id', 'play_id', 'frame_id'])['nfl_id'].nunique()
print(players_per_frame.describe())

print(players_per_frame.value_counts().head(20))

count    23151.000000
mean        12.341324
std          1.050945
min          6.000000
25%         12.000000
50%         13.000000
75%         13.000000
max         14.000000
Name: nfl_id, dtype: float64
13    12969
12     5105
11     2827
10     1135
14      710
9       314
8        33
7        32
6        26
Name: nfl_id, dtype: int64


In [6]:
""""
GOAL: How many players marked for prediction per frame?

Notes: 
- typically 3-4 players marked for prediction for each frame
- leaves 8-10 context players
- most likely dbs/lbs and receivers involved with a pass
"""
predict_per_frame = input_df.groupby(['game_id', 'play_id', 'frame_id'])['player_to_predict'].sum()

print(predict_per_frame.value_counts())


3    7104
4    4950
2    4712
5    2743
1    2007
6    1330
7     279
8      26
Name: player_to_predict, dtype: int64


In [7]:
"""
GOAL: Find specific position

Notes:
- as expected, all key players involved with a pass play
- ~70% of predicted players are defensive (CBs, Safeties, LBs)
- other ~30% are WRs, TEs, RBs
"""
predicted_players = input_df[input_df['player_to_predict'] == True]
print(predicted_players['player_position'].value_counts())

CB     20456
WR     14074
FS      9592
SS      7190
ILB     6174
TE      5035
MLB     4672
OLB     4442
RB      3831
DE       537
FB       211
S        158
DT        27
Name: player_position, dtype: int64
