In [2]:
import pandas as pd 
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Root folder for every CSV this notebook reads; change it here if the data moves.
DATA_DIR = '../data'

# Week-1 model input; the printed head shows one row per frame_id for a player.
input_df = pd.read_csv(f'{DATA_DIR}/train/input_2023_w01.csv')
print(input_df.head())

# Week-1 file from the same train folder (schema not documented here; see the printed head).
output_df = pd.read_csv(f'{DATA_DIR}/train/output_2023_w01.csv')
print(output_df.head())

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is what produces the mixed-type
# DtypeWarning and leaves a column holding a mix of Python types.
# NOTE(review): if the offending column is known, an explicit dtype= is stricter.
supplemental_df = pd.read_csv(f'{DATA_DIR}/supplementary_data.csv', low_memory=False)

      game_id  play_id  player_to_predict  nfl_id  frame_id play_direction  \
0  2023090700      101              False   54527         1          right   
1  2023090700      101              False   54527         2          right   
2  2023090700      101              False   54527         3          right   
3  2023090700      101              False   54527         4          right   
4  2023090700      101              False   54527         5          right   

   absolute_yardline_number player_name player_height  player_weight  ...  \
0                        42  Bryan Cook           6-1            210  ...   
1                        42  Bryan Cook           6-1            210  ...   
2                        42  Bryan Cook           6-1            210  ...   
3                        42  Bryan Cook           6-1            210  ...   
4                        42  Bryan Cook           6-1            210  ...   

          player_role      x      y     s     a     dir       o  \
0

  supplemental_df = pd.read_csv('../data/supplementary_data.csv')


In [4]:
# Attach play-level supplemental columns to the tracking rows, matching on
# (game_id, play_id).
#
# An inner join is used on purpose: supplemental_df carries season/week columns
# and presumably spans more plays than the single week file loaded into
# input_df. A left join from supplemental_df would keep each of those unmatched
# plays as a row whose tracking columns are all NaN.
# Column order (supplemental columns first, then input columns) is unchanged.
#
# NOTE(review): final_df is a snapshot. Columns added to supplemental_df after
# this cell has run do not appear in final_df unless the merge is re-run.
final_df = pd.merge(supplemental_df, input_df, on=['game_id', 'play_id'], how='inner')

print(f"Final shape: {final_df.shape}")
print(final_df.head())

Final shape: (302904, 62)
      game_id  season  week   game_date game_time_eastern home_team_abbr  \
0  2023090700    2023     1  09/07/2023          20:20:00             KC   
1  2023090700    2023     1  09/07/2023          20:20:00             KC   
2  2023090700    2023     1  09/07/2023          20:20:00             KC   
3  2023090700    2023     1  09/07/2023          20:20:00             KC   
4  2023090700    2023     1  09/07/2023          20:20:00             KC   

  visitor_team_abbr  play_id  \
0               DET     3461   
1               DET     3461   
2               DET     3461   
3               DET     3461   
4               DET     3461   

                                    play_description  quarter  ...  \
0  (10:46) (Shotgun) J.Goff pass deep left to J.R...        4  ...   
1  (10:46) (Shotgun) J.Goff pass deep left to J.R...        4  ...   
2  (10:46) (Shotgun) J.Goff pass deep left to J.R...        4  ...   
3  (10:46) (Shotgun) J.Goff pass deep left t

In [5]:
# Order rows by game, then play, so that "previous" below means the preceding
# row of the same game in play_id order.
# NOTE(review): "previous play" is the previous play present in supplemental_df;
# confirm the file lists every play if a true play-to-play delta is required.
supplemental_df = supplemental_df.sort_values(['game_id', 'play_id'])

# EPA of the preceding row within each game; the first play of a game gets NaN.
epa_by_game = supplemental_df.groupby('game_id')['expected_points_added']
supplemental_df['prev_expected_points_added'] = epa_by_game.shift()

# Difference between this play's EPA and the previous play's EPA (NaN where
# there is no previous play).
current_epa = supplemental_df['expected_points_added']
previous_epa = supplemental_df['prev_expected_points_added']
supplemental_df['epa_change_from_last_play'] = current_epa - previous_epa

# Eyeball the first rows to confirm the shift lines up with the sort order.
columns_to_inspect = [
    'game_id',
    'play_id',
    'expected_points_added',
    'prev_expected_points_added',
    'epa_change_from_last_play',
]
print(supplemental_df[columns_to_inspect].head(20))

       game_id  play_id  expected_points_added  prev_expected_points_added  \
5   2023090700      101              -2.145443                         NaN   
41  2023090700      194               1.702563                   -2.145443   
52  2023090700      219               0.089352                    1.702563   
16  2023090700      361              -0.862062                    0.089352   
36  2023090700      436               1.613927                   -0.862062   
1   2023090700      461               1.345633                    1.613927   
56  2023090700      530               0.031384                    1.345633   
55  2023090700      621               2.850031                    0.031384   
12  2023090700      713              -0.873874                    2.850031   
18  2023090700      736               2.717539                   -0.873874   
35  2023090700      877               1.140134                    2.717539   
7   2023090700      902              -0.407670                  