In [1]:
import numpy as np
import pandas as pd
import nflreadpy as nfl

In [37]:
# Competition tracking data: one CSV per week (weeks 1-18), stacked into a
# single frame holding per-frame player positions and movement.
input_files = ['Data/input_2023_w{:02}.csv'.format(week) for week in range(1, 19)]
inputs = pd.concat([pd.read_csv(path) for path in input_files], ignore_index=True)

# Every distinct (name, position, nfl_id) combination seen in the tracking data.
input_players = inputs.loc[:, ['player_name', 'player_position', 'nfl_id']].drop_duplicates()

# Normalise direction: plays recorded as moving left are mirrored so that all
# plays move right. Positions are reflected about the constants used below
# (120 for x-like columns, 53.3 for y-like columns) and angles are rotated
# by 180 degrees.
going_left = inputs['play_direction'].eq('left')

for col, extent in (('x', 120), ('y', 53.3), ('ball_land_x', 120), ('ball_land_y', 53.3)):
    inputs.loc[going_left, col] = extent - inputs.loc[going_left, col]

for col in ('o', 'dir'):
    inputs.loc[going_left, col] = (180 + inputs.loc[going_left, col]) % 360

In [38]:
# NFL player registry from nflreadpy: GSIS id, display name and nfl_id.
# NOTE(review): nflreadpy is understood to return Polars frames; confirm that
# wrapping the selection in pd.DataFrame keeps the intended dtypes.
players = pd.DataFrame(nfl.load_players()[['gsis_id', 'display_name', 'nfl_id']])
players.columns = ['player_id', 'player_name', 'nfl_id']

# Keep only players that have an nfl_id. The explicit .copy() makes the result
# an independent frame, so the column assignment below does not write into a
# filtered slice (which raises pandas' SettingWithCopyWarning).
players = players[players['nfl_id'].notna()].copy()
players['nfl_id'] = [int(x) for x in players['nfl_id']]

# Crosswalk between GSIS id (player_id) and competition id (nfl_id), carrying
# player name and position. Built from the distinct (position, nfl_id) pairs in
# the tracking data, so a player listed under more than one position would
# appear once per position -- de-duplicate on the id columns before using it
# as a lookup table.
player_crosswalk = pd.merge(
    players,
    inputs[['player_position', 'nfl_id']].drop_duplicates(),
    on='nfl_id',
    how='inner',
)

In [39]:
# 2023 play-by-play data. Only the identifiers, the defensive-event player
# columns and a few play-level metrics (epa, qb_hit, cp, xpass) are kept.
# 'old_game_id' is renamed to 'game_id', and both ids are cast to int so they
# can be used as join keys against the competition files.
pbp_data = (
    pd.read_parquet('Data/play_by_play_2023.parquet')
    .loc[:, [
        'play_id', 'old_game_id',
        'epa', 'qb_hit',
        'pass_defense_1_player_id', 'pass_defense_1_player_name',
        'pass_defense_2_player_id', 'pass_defense_2_player_name',
        'interception_player_id', 'interception_player_name',
        'cp', 'xpass',
    ]]
    .rename(columns={"old_game_id": "game_id"})
    .astype({'play_id': int, 'game_id': int})
)

In [40]:
# Interceptions: one row per intercepted play with the game, play and nfl_id of
# the intercepting player, flagged with disruption = 1.
#
# The crosswalk is reduced to unique (player_id, nfl_id) pairs before joining:
# it is built per (position, nfl_id), so duplicated ids would otherwise fan out
# the join and duplicate interception rows (and, later, lurker rows).
# Players without a crosswalk match keep a missing nfl_id (left join).
gsis_to_nfl_id = player_crosswalk[['player_id', 'nfl_id']].drop_duplicates()

interceptions = (
    pbp_data.loc[
        pbp_data['interception_player_id'].notna(),
        ['play_id', 'game_id', 'interception_player_id'],
    ]
    .merge(gsis_to_nfl_id, left_on='interception_player_id', right_on='player_id', how='left')
    [['game_id', 'play_id', 'nfl_id']]
    # assign() returns a new frame, avoiding a write into a column slice
    .assign(disruption=1)
)

# Pass deflections: one row per (game, play, defender) credited with a pass
# defense, flagged with disruption = 1. Up to two defenders are recorded per
# play (pass_defense_1 / pass_defense_2), so the two id columns are melted into
# one and the empty slots dropped.
#
# The crosswalk is reduced to unique (player_id, nfl_id) pairs before joining:
# it is built per (position, nfl_id), so duplicated ids would otherwise fan out
# the join and duplicate deflection rows (and, later, lurker rows).
# Players without a crosswalk match keep a missing nfl_id (left join).
gsis_to_nfl_id = player_crosswalk[['player_id', 'nfl_id']].drop_duplicates()

pass_deflections = (
    pbp_data.loc[
        pbp_data['pass_defense_1_player_id'].notna() | pbp_data['pass_defense_2_player_id'].notna(),
        ['game_id', 'play_id', 'pass_defense_1_player_id', 'pass_defense_2_player_id'],
    ]
    .melt(
        id_vars=['game_id', 'play_id'],
        value_vars=['pass_defense_1_player_id', 'pass_defense_2_player_id'],
        value_name='passdefense_player_id',
    )
    .drop(columns='variable')
    .dropna(subset=['passdefense_player_id'])
    .reset_index(drop=True)
    .merge(gsis_to_nfl_id, left_on='passdefense_player_id', right_on='player_id', how='left')
    [['game_id', 'play_id', 'nfl_id']]
    # assign() returns a new frame, avoiding a write into a column slice
    .assign(disruption=1)
)

# Only the play keys plus cp and xpass are joined onto the supplementary data.
# NOTE: this narrows pbp_data in place, so statements that read its player-id
# columns cannot be re-run afterwards without reloading pbp_data first.
pbp_data = pbp_data.loc[:, ['game_id', 'play_id', 'cp', 'xpass']]

# Additional play-level data supplied for the Big Data Bowl.
supplementary_columns = [
    'game_id', 'week', 'home_team_abbr', 'visitor_team_abbr', 'play_id', 'play_description',
    'quarter', 'game_clock', 'down', 'yards_to_go', 'possession_team',
    'defensive_team', 'pre_snap_home_score', 'pre_snap_visitor_score', 'pass_result', 'pass_length',
    'offense_formation', 'route_of_targeted_receiver',
    'play_action', 'dropback_type', 'dropback_distance', 'defenders_in_the_box', 'team_coverage_man_zone',
    'team_coverage_type', 'expected_points_added',
]
supplementary_data = pd.read_csv('Data/supplementary_data.csv')[supplementary_columns]
supplementary_data = pd.merge(supplementary_data, pbp_data, on=['game_id', 'play_id'], how='left')

# Plays of interest: zone coverage, targeted route is not a GO route, and
# pass length between 5 and 30 (both ends included).
is_zone = supplementary_data['team_coverage_man_zone'].eq('ZONE_COVERAGE')
not_go_route = supplementary_data['route_of_targeted_receiver'].ne('GO')
mid_length = supplementary_data['pass_length'].between(5, 30)
valid_plays = supplementary_data[is_zone & not_go_route & mid_length]
# 6601 valid plays

In [70]:
# Starting point for finding lurkers: for every play keep only the rows whose
# frame_id equals that play's highest frame_id.
final_frame_id = inputs.groupby(['game_id', 'play_id'])['frame_id'].transform('max')
last_frame = inputs[inputs['frame_id'] == final_frame_id].reset_index(drop=True)

# Position of the targeted receiver in that frame, one row per play.
targets = (
    last_frame.loc[
        last_frame['player_role'] == 'Targeted Receiver',
        ['game_id', 'play_id', 'x', 'y'],
    ]
    .rename(columns={'x': 'target_x', 'y': 'target_y'})
)

# Attach the target position to every player row of the same play; the inner
# join drops plays that have no targeted-receiver row.
last_frame = pd.merge(last_frame, targets, on=['game_id', 'play_id'], how='inner')

In [111]:
# Radius used to call a shallow coverage defender a potential lurker: the
# defender must be within this distance of the targeted receiver or of the
# ball landing spot (same units as the x / y tracking columns).
LURKER_RADIUS = 8

# Coverage defenders whose x is less than the targeted receiver's x.
# .copy() gives an independent frame so the derived columns below are not
# written into a filtered slice (avoids SettingWithCopyWarning).
lurkers = last_frame[
    (last_frame['player_role'] == 'Defensive Coverage')
    & (last_frame['x'] < last_frame['target_x'])
].copy()

# distance to target
lurkers['distance_to_target'] = np.sqrt(
    (lurkers['target_x'] - lurkers['x'])**2 + (lurkers['target_y'] - lurkers['y'])**2
)
# distance to ball
lurkers['distance_to_ball'] = np.sqrt(
    (lurkers['ball_land_x'] - lurkers['x'])**2 + (lurkers['ball_land_y'] - lurkers['y'])**2
)
# depth difference (always > 0 here because of the x < target_x filter above)
lurkers['depth_difference'] = lurkers['target_x'] - lurkers['x']

# A potential lurker is close to either the target or the ball landing spot.
near_target_or_ball = (
    (lurkers['distance_to_target'] < LURKER_RADIUS)
    | (lurkers['distance_to_ball'] < LURKER_RADIUS)
)
lurkers = lurkers[near_target_or_ball].copy()
lurkers['player_role'] = 'Potential Lurker'

# Disruption flag: 1 when the defender is credited with an interception or a
# pass deflection on the play, else 0. The two event tables are stacked and
# reduced to unique (game, play, player) keys first, so the left join can never
# duplicate lurker rows; validate= enforces that guarantee.
disruption_keys = ['game_id', 'play_id', 'nfl_id']
disruptions = (
    pd.concat(
        [interceptions[disruption_keys], pass_deflections[disruption_keys]],
        ignore_index=True,
    )
    .dropna(subset=['nfl_id'])  # events whose player has no nfl_id cannot match a lurker
    .astype({'nfl_id': lurkers['nfl_id'].dtype})
    .drop_duplicates()
    .assign(disruption=1)
)
lurkers = lurkers.merge(disruptions, on=disruption_keys, how='left', validate='many_to_one')
lurkers['disruption'] = lurkers['disruption'].fillna(0).astype(int)

In [112]:
# Display the potential-lurker table (one row per flagged defender on a play,
# with the distance features and the disruption flag); pandas truncates the view.
lurkers

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,o,num_frames_output,ball_land_x,ball_land_y,target_x,target_y,distance_to_target,distance_to_ball,depth_difference,disruption
0,2023090700,101,True,52546,26,right,42,L'Jarius Sneed,6-1,193,...,309.47,21,63.259998,-0.220000,52.43,14.14,4.735652,19.820143,4.42,0
1,2023090700,194,True,54653,32,left,89,Malcolm Rodriguez,5-11,225,...,57.08,9,35.059998,31.550000,31.02,31.07,1.883215,5.934852,1.81,0
2,2023090700,436,True,53487,20,right,31,Nick Bolton,6-0,232,...,277.06,7,34.889999,34.820000,33.67,37.80,8.933695,6.578822,2.03,0
3,2023090700,461,True,52546,23,right,44,L'Jarius Sneed,6-1,193,...,355.54,10,55.910000,18.830000,51.82,13.98,3.877422,10.179047,1.90,0
4,2023090700,736,True,44906,31,left,83,Cameron Sutton,5-11,188,...,255.73,15,51.529999,12.849999,46.65,19.88,6.727258,8.071238,3.12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7025,2024010713,3351,True,52478,28,right,79,Brandon Jones,6-0,205,...,172.74,20,98.589996,2.950000,83.45,8.39,2.659060,18.543098,1.91,0
7026,2024010713,3995,True,54650,28,left,50,Christian Benford,6-1,205,...,4.34,17,90.660000,5.640000,82.20,12.97,5.704069,14.069033,4.90,0
7027,2024010713,3995,True,40166,28,left,50,Jordan Poyer,6-0,191,...,282.19,17,90.660000,5.640000,82.20,12.97,4.725124,14.850754,0.22,0
7028,2024010713,4018,True,52647,27,left,50,Dane Jackson,5-11,185,...,322.21,18,87.860001,46.590000,81.55,36.06,7.254309,13.820710,5.95,0


In [3]:
variable list:

game related:
absolute_yardline_number, quarter, down, pre_snap_home_score, pre_snap_visitor_score, game_clock, yards_to_go

pass related:
pass_length, qb_hit, play_action, dropback_type, dropback_distance, team_coverage_type, team_coverage_man_zone, qb x, qb y, defenders_in_the_box,
ball_x, ball_y (?)

target receiver related:
x, y, s, a, o, dir, route_of_targeted_receiver
same tracking values over the past 0.5 seconds
same tracking values over the past 1 second

lurker related:
x, y, s, a, o, dir, height, position (db vs lb)
same tracking values over the past 0.5 seconds
same tracking values over the past 1 second

outcomes:
complete, incomplete, lurk (deflection or interception)

data split: train / validation / test

SyntaxError: invalid syntax (2550563028.py, line 1)

In [4]:
signals that model works:

predicted completion percentage is similar to the AWS completion percentage
predicted probabilities of a pass defended and of an interception are higher on plays where they actually happened
expected pass deflections/interceptions are close to the actual totals for players with enough volume (over 20)

SyntaxError: invalid syntax (425448153.py, line 1)