Creates training data used for building model

In [1]:
import numpy as np
import pandas as pd
import nflreadpy as nfl
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
input_files = [f'Data/input_2023_w{i:02}.csv' for i in range(1, 19)]
inputs = pd.concat([pd.read_csv(file) for file in input_files], ignore_index=True) # competition data, has frames of player positions and movement
input_players = inputs[['player_name', 'player_position', 'nfl_id']].drop_duplicates() # all player names and their positions

# make plays that are going left go right instead
going_left = inputs['play_direction'] == 'left'
inputs.loc[going_left, 'x'] = 120 - inputs.loc[going_left, 'x']
inputs.loc[going_left, 'y'] = 53.3 - inputs.loc[going_left, 'y']
inputs.loc[going_left, 'ball_land_x'] = 120 - inputs.loc[going_left, 'ball_land_x']
inputs.loc[going_left, 'ball_land_y'] = 53.3 - inputs.loc[going_left, 'ball_land_y']
inputs.loc[going_left, 'o'] = (180 + inputs.loc[going_left, 'o']) % 360
inputs.loc[going_left, 'dir'] = (180 + inputs.loc[going_left, 'dir']) % 360

In [3]:
players = pd.DataFrame(nfl.load_players()[['gsis_id', 'display_name', 'nfl_id']]) # NFL player names, GSIS ID, nfl id
players.columns = ['player_id', 'player_name', 'nfl_id']
players = players[pd.notna(players['nfl_id'])]
players['nfl_id'] = [int(x) for x  in players['nfl_id']]

#contains player name, position, GSIS ID, and competition ID
player_crosswalk = pd.merge(players, inputs[['player_position', 'nfl_id']].drop_duplicates(), on = 'nfl_id', how = 'inner') 

In [4]:
pbp_data = pd.read_parquet('Data/play_by_play_2023.parquet') # 2023 play by play data, to get Next Gen Stats data
pbp_data = pbp_data[['play_id', 'old_game_id', 
                 'epa', 'qb_hit', 'pass_defense_1_player_id', 'pass_defense_1_player_name', 
                 'pass_defense_2_player_id', 'pass_defense_2_player_name',
                 'interception_player_id', 'interception_player_name',
                 'cp', 'xpass']]
pbp_data = pbp_data.rename(columns={"old_game_id": "game_id"})
pbp_data['play_id'] = pbp_data['play_id'].astype(int)
pbp_data['game_id'] = pbp_data['game_id'].astype(int)

In [5]:
# game, play, and player nfl id of interception
interceptions = pbp_data[pd.notna(pbp_data['interception_player_id'])][['play_id', 'game_id', 'interception_player_id']]
interceptions = pd.merge(interceptions, player_crosswalk[['player_id', 'nfl_id']], left_on = 'interception_player_id', right_on = 'player_id', how = 'left')
interceptions = interceptions[['game_id', 'play_id', 'nfl_id']]
interceptions['disruption'] = 1

# game, play, and player nfl id of pass deflections
pass_deflections = pbp_data[pd.notna(pbp_data['pass_defense_1_player_id']) | pd.notna(pbp_data['pass_defense_2_player_id'])][['game_id', 'play_id', 'pass_defense_1_player_id', 'pass_defense_2_player_id']]
pass_deflections = (
    pass_deflections.melt(
        id_vars=['game_id', 'play_id'],
        value_vars=['pass_defense_1_player_id', 'pass_defense_2_player_id'],
        value_name='passdefense_player_id'
    )
    .drop(columns='variable')
    .dropna(subset=['passdefense_player_id'])
    .reset_index(drop=True)
)
pass_deflections = pd.merge(pass_deflections, player_crosswalk[['player_id', 'nfl_id']], left_on = 'passdefense_player_id', right_on = 'player_id', how = 'left')
pass_deflections = pass_deflections[['game_id', 'play_id', 'nfl_id']]
pass_deflections['disruption'] = 1

pbp_data = pbp_data[['game_id', 'play_id', 'cp', 'xpass', 'qb_hit']]

# additional play data given for big data bowl
supplementary_data = pd.read_csv('Data/supplementary_data.csv')
supplementary_data['possession_score'] = np.where(
    supplementary_data['possession_team'] == supplementary_data['home_team_abbr'],
    supplementary_data['pre_snap_home_score'],
    supplementary_data['pre_snap_visitor_score']
)

supplementary_data['defensive_score'] = np.where(
    supplementary_data['defensive_team'] == supplementary_data['home_team_abbr'],
    supplementary_data['pre_snap_home_score'],
    supplementary_data['pre_snap_visitor_score']
)
supplementary_data['defensive_point_diff'] = supplementary_data['defensive_score'] - supplementary_data['possession_score']
supplementary_data = supplementary_data[['game_id', 'week', 'home_team_abbr', 'visitor_team_abbr', 'play_id', 'play_description',
                                        'quarter', 'game_clock', 'down', 'yards_to_go', 'defensive_point_diff', 'pass_result', 'pass_length',
                                        'offense_formation', 'route_of_targeted_receiver', 'play_action', 'dropback_type', 'dropback_distance',
                                        'defenders_in_the_box', 'team_coverage_man_zone', 'team_coverage_type', 'expected_points_added']]
supplementary_data = supplementary_data.merge(pbp_data, on = ['game_id', 'play_id'], how = 'left')
valid_plays = supplementary_data[supplementary_data.team_coverage_man_zone == 'ZONE_COVERAGE']
valid_plays = valid_plays[~valid_plays.route_of_targeted_receiver.isin(['GO', 'SCREEN'])]
valid_plays = valid_plays[valid_plays.pass_length >= 5]
valid_plays = valid_plays[valid_plays.pass_length <= 30]
game_context_data = valid_plays[['game_id', 'play_id', 'quarter', 'game_clock', 'down', 'yards_to_go', 'week',
                     'defensive_point_diff', 'pass_result', 'pass_length', 'play_action',
                     'dropback_type', 'dropback_distance', 'team_coverage_type', 'defenders_in_the_box',
                     'route_of_targeted_receiver', 'qb_hit']]

  supplementary_data = pd.read_csv('Data/supplementary_data.csv')


In [9]:
valid_inputs = inputs.merge(valid_plays[['game_id', 'play_id']], on=['game_id', 'play_id'], how='inner')

# get all lurkers
last_frame = valid_inputs.loc[valid_inputs.groupby(['game_id', 'play_id'])['frame_id'].transform('max') == valid_inputs['frame_id']].reset_index(drop=True)

targets = last_frame[last_frame['player_role'] == 'Targeted Receiver'][['game_id', 'play_id', 'x', 'y', 's', 'a', 'dir', 'o', 'ball_land_x', 'ball_land_y']]
targets['target_distance_to_ball'] = np.sqrt(
    (targets['ball_land_x'] - targets['x'])**2 + (targets['ball_land_y'] - targets['y'])**2
)
targets = targets.drop(['ball_land_x', 'ball_land_y'], axis = 1)
passers = last_frame[last_frame['player_role'] == 'Passer'][['game_id', 'play_id', 'x', 'y', 's', 'a', 'dir', 'o']]
last_frame = last_frame.merge(targets.rename(columns={'x': 'target_x', 'y': 'target_y'})[['game_id', 'play_id', 'target_x', 'target_y']], 
                              on=['game_id', 'play_id'], how='inner')
targets = targets.rename(columns={c: f"receiver_{c}" for c in ['x', 'y', 's', 'a', 'dir', 'o']})
passers = passers.rename(columns={c: f"qb_{c}" for c in ['x', 'y', 's', 'a', 'dir', 'o']})

In [10]:
# keep defenders whose x is less than target
lurkers = last_frame[last_frame['player_role'] == 'Defensive Coverage']
lurkers = lurkers[lurkers['x'] < lurkers['target_x']]

lurkers['distance_to_target'] = np.sqrt(
    (lurkers['target_x'] - lurkers['x'])**2 + (lurkers['target_y'] - lurkers['y'])**2
)
lurkers['distance_to_ball'] = np.sqrt(
    (lurkers['ball_land_x'] - lurkers['x'])**2 + (lurkers['ball_land_y'] - lurkers['y'])**2
)
lurkers['depth_difference'] = lurkers['target_x'] - lurkers['x']
lurkers.loc[(lurkers['depth_difference'] > 0) & (lurkers['distance_to_target'] < 8) & (lurkers['player_role'] == 'Defensive Coverage'), 'player_role'] = 'Potential Lurker'
lurkers.loc[(lurkers['depth_difference'] > 0) & (lurkers['distance_to_ball'] < 8) & (lurkers['player_role'] == 'Defensive Coverage'), 'player_role'] = 'Potential Lurker'

lurkers = lurkers[lurkers.player_role == 'Potential Lurker']
lurkers = lurkers.merge(interceptions, on = ['game_id', 'play_id', 'nfl_id'], how = 'left')
lurkers = lurkers.merge(pass_deflections, on = ['game_id', 'play_id', 'nfl_id'], how = 'left')
lurkers['disruption'] = (
    ((lurkers['disruption_x'].fillna(0) == 1) | 
     (lurkers['disruption_y'].fillna(0) == 1))
    .astype(int)
)
lurkers['time_to_throw'] = lurkers['frame_id']/10
lurkers = lurkers.rename(columns = {'disruption_x' : 'int', 'disruption_y' : 'pd'})
lurkers = lurkers[['game_id', 'play_id', 'absolute_yardline_number', 'player_name', 'player_height', 'player_position', 'x', 'y', 's', 'a', 'dir', 'o',
        'ball_land_x', 'ball_land_y', 'distance_to_target', 'distance_to_ball', 'depth_difference', 'disruption', 'pd', 'int', 'time_to_throw']]
lurkers = lurkers.rename(columns={c: f"lurker_{c}" for c in ['x', 'y', 's', 'a', 'dir', 'o']})
lurkers['player_position'] = lurkers['player_position'].replace({
    'CB': 'DB',
    'SS': 'DB',
    'FS': 'DB',
    'S': 'DB',
    'ILB': 'LB',
    'OLB': 'LB',
    'MLB': 'LB',
    'DE': 'DL',
    'DT': 'DL',
    'NT': 'DL'
})

In [18]:
input_data = lurkers.merge(targets, on = ['game_id', 'play_id'], how = 'inner')
input_data = input_data.merge(passers, on = ['game_id', 'play_id'], how = 'inner')
input_data = input_data.merge(game_context_data, on = ['game_id', 'play_id'], how = 'inner')
input_data['outcome'] = np.where(input_data['pass_result'] == 'C', 'Complete', 'Incomplete')
input_data['outcome'] = np.where(input_data['disruption'] == 1, 'Disruption', input_data['outcome'])

In [9]:
input_data.to_csv('Data/processed/data_for_model.csv', index = False) # save training data

# Get frame data from past half second and past full second

In [39]:
selected_plays = input_data[['game_id', 'play_id']].apply(tuple, axis=1)
selected_plays_and_name = input_data[['game_id', 'play_id', 'player_name']].apply(tuple, axis=1)
selected_inputs = inputs[inputs[['game_id', 'play_id']].apply(tuple, axis=1).isin(selected_plays)]
selected_inputs.loc[selected_inputs[['game_id', 'play_id', 'player_name']].apply(tuple, axis=1).isin(selected_plays_and_name), 'player_role'] = 'Lurker'
selected_inputs = selected_inputs[selected_inputs.player_role.isin(['Lurker', 'Passer', 'Targeted Receiver'])]

In [55]:
past_ten_frames = (selected_inputs
                   .sort_values('frame_id', ascending=False)
                   .groupby(['game_id', 'play_id', 'player_name'], group_keys=False)
                   .apply(lambda g: g.iloc[1:11]))

past_ten_frames = past_ten_frames[['game_id', 'play_id', 'player_name', 'frame_id', 'player_role', 'x', 'y', 's', 'a', 'dir', 'o']]

  .apply(lambda g: g.iloc[1:11])


In [64]:
order_labels = ['one','two','three','four','five','six','seven','eight','nine','ten']
past_ten_frames_sorted = past_ten_frames.sort_values(['game_id', 'play_id', 'player_name', 'frame_id'])
past_ten_frames_sorted['frame_rank'] = (past_ten_frames_sorted.groupby(['game_id', 'play_id', 'player_name']).cumcount())
static_cols = ['game_id', 'play_id', 'player_name', 'player_role']
frame_cols  = ['x', 'y', 's', 'a', 'dir', 'o']

collapsed_past_ten = (past_ten_frames_sorted
                      .set_index(['game_id', 'play_id', 'player_name', 'player_role', 'frame_rank'])[frame_cols]
                      .unstack('frame_rank'))
collapsed_past_ten.columns = [f"{col[0]}_{order_labels[col[1]]}" if col[1] < len(order_labels) else f"{col[0]}_{col[1]+1}"
                              for col in collapsed_past_ten.columns]
collapsed_past_ten = collapsed_past_ten.reset_index()

In [70]:
collapsed_past_ten.to_csv('Data/processed/last_ten_frames_of_input.csv', index = False) # save last 10 frames