In [122]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import os
from typing import List

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.cluster import KMeans
from multiprocessing import Pool as MultiprocessingPool, cpu_count


from src.kinematics import calculate_speed_and_direction
# from kinematics import add_kinematics

pd.set_option("display.max_columns", None)

In [123]:
# ============================================================================
# CONFIG
# ============================================================================

class Config:
    DATA_DIR = Path("./data")
    OUTPUT_DIR = Path("./outputs")
    OUTPUT_DIR.mkdir(exist_ok=True)
    
    SEED = 42
    N_FOLDS = 5
    BATCH_SIZE = 256
    EPOCHS = 200
    PATIENCE = 30
    LEARNING_RATE = 1e-4
    
    WINDOW_SIZE = 10
    HIDDEN_DIM = 128
    MAX_FUTURE_HORIZON = 94
    
    FIELD_X_MIN, FIELD_X_MAX = 0.0, 120.0
    FIELD_Y_MIN, FIELD_Y_MAX = 0.0, 53.3
    
    K_NEIGH = 6
    RADIUS = 30.0
    TAU = 8.0
    N_ROUTE_CLUSTERS = 7
    
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed=42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(Config.SEED)



In [22]:
config = Config()
config

# Load
print("\n[1/4] Loading data...")
train_input_files = [config.DATA_DIR / f"train/input_2023_w{w:02d}.csv" for w in range(1, 19)]
train_output_files = [config.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]
train_input = pd.concat([pd.read_csv(f) for f in train_input_files if f.exists()])
train_output = pd.concat([pd.read_csv(f) for f in train_output_files if f.exists()])
# test_input = pd.read_csv(config.DATA_DIR / "test_input.csv")
test_template = pd.read_csv(config.DATA_DIR / "test.csv")
supplementary_data = pd.read_csv(config.DATA_DIR / "supplementary_data.csv")

print(f"✓ Train input: {train_input.shape}, Train output: {train_output.shape}")
print(f"✓ Train output: {train_output.shape}, unique plays: {train_output[['game_id','play_id']].drop_duplicates().shape[0]}")
print(f"✓ Test template: {test_template.shape}, unique plays: {test_template[['game_id','play_id']].drop_duplicates().shape[0]}")
# print(f"✓ Test input: {test_input.shape}, Test template: {test_template.shape}")
print(f"✓ Supplementary data: {supplementary_data.shape}")


[1/4] Loading data...
✓ Train input: (4880579, 23), Train output: (562936, 6)
✓ Train output: (562936, 6), unique plays: 14108
✓ Test template: (5837, 5), unique plays: 143
✓ Supplementary data: (18009, 41)


  supplementary_data = pd.read_csv(config.DATA_DIR / "supplementary_data.csv")


In [4]:
play_results = supplementary_data[['game_id','play_id','pass_result']].drop_duplicates()
play_results.loc[play_results['pass_result'] == 'IN', 'pass_result'] = 'I'

In [99]:
train_input[(train_input['game_id'] == 2023092411)
             & (train_input['play_id'] == 3742)
             ]

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y
237574,2023092411,3742,True,54527,1,right,66,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,81.27,30.18,2.12,0.96,137.19,233.06,21,89.25,51.349998
237575,2023092411,3742,True,54527,2,right,66,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,81.43,30.02,2.29,0.97,135.13,238.74,21,89.25,51.349998
237576,2023092411,3742,True,54527,3,right,66,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,81.61,29.86,2.43,1.03,132.26,243.29,21,89.25,51.349998
237577,2023092411,3742,True,54527,4,right,66,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,81.80,29.70,2.54,1.00,130.07,246.37,21,89.25,51.349998
237578,2023092411,3742,True,54527,5,right,66,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,82.01,29.53,2.68,0.99,128.20,250.10,21,89.25,51.349998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237829,2023092411,3742,True,46093,16,right,66,DJ Moore,5-11,215,1997-04-14,WR,Offense,Targeted Receiver,69.28,47.20,6.24,2.23,61.56,72.66,21,89.25,51.349998
237830,2023092411,3742,True,46093,17,right,66,DJ Moore,5-11,215,1997-04-14,WR,Offense,Targeted Receiver,69.85,47.50,6.57,2.14,62.41,71.46,21,89.25,51.349998
237831,2023092411,3742,True,46093,18,right,66,DJ Moore,5-11,215,1997-04-14,WR,Offense,Targeted Receiver,70.46,47.80,6.85,1.93,63.92,72.10,21,89.25,51.349998
237832,2023092411,3742,True,46093,19,right,66,DJ Moore,5-11,215,1997-04-14,WR,Offense,Targeted Receiver,71.10,48.08,7.08,1.95,66.20,72.10,21,89.25,51.349998


In [None]:
train_output[(train_output['game_id'] == 2023100807)
             & (train_output['play_id'] == 3900)
             & (train_output['nfl_id'] == 48026)]

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
19210,2023100807,3900,48026,1,66.74,40.79
19211,2023100807,3900,48026,2,66.51,41.31
19212,2023100807,3900,48026,3,66.34,41.84
19213,2023100807,3900,48026,4,66.25,42.37
19214,2023100807,3900,48026,5,66.18,42.9
19215,2023100807,3900,48026,6,66.16,43.43
19216,2023100807,3900,48026,7,66.18,43.93
19217,2023100807,3900,48026,8,66.24,44.42
19218,2023100807,3900,48026,9,66.32,44.89
19219,2023100807,3900,48026,10,66.43,45.33


In [130]:
c = pd.read_csv('./outputs/defender_impact_log.csv')
c = c.merge(supplementary_data[['game_id','play_id','pass_result', 'yards_gained', 'season','week','home_team_abbr','visitor_team_abbr','play_description', 'quarter','game_clock']],
        on=['game_id','play_id'], how='left')
c.sort_values('delta', ascending = False).head(20)

Unnamed: 0,game_id,play_id,nfl_id,baseline_prob,real_prob,delta,fold,pass_result,yards_gained,season,week,home_team_abbr,visitor_team_abbr,play_description,quarter,game_clock
13589,2023100113,1796,52566,0.416263,0.824197,0.407933,3,C,15,2023,4,NYJ,KC,(3:47) (Shotgun) P.Mahomes pass short right to...,2,03:47
13865,2023100807,3900,48026,0.226236,0.588904,0.362668,3,IN,0,2023,5,ARI,CIN,"(5:01) (No Huddle, Shotgun) J.Dobbs pass short...",4,05:01
31219,2023122407,1171,54500,0.156081,0.51634,0.36026,5,C,26,2023,16,TEN,SEA,"(13:03) (No Huddle, Shotgun) G.Smith pass shor...",2,13:03
25580,2023091011,2388,43351,0.113261,0.464019,0.350759,5,I,0,2023,1,NE,PHI,(14:04) (Shotgun) M.Jones pass incomplete deep...,3,14:04
2198,2023102201,2765,42361,0.178235,0.495883,0.317649,1,I,0,2023,7,CHI,LV,(4:14) T.Bagent pass incomplete short right to...,3,04:14
14420,2023102210,2653,46097,0.468586,0.783989,0.315403,3,I,0,2023,7,PHI,MIA,(4:20) (Shotgun) T.Tagovailoa pass incomplete ...,3,04:20
22043,2023102901,3281,54677,0.090516,0.400905,0.310389,4,C,24,2023,8,DAL,LA,(13:19) (Shotgun) D.Prescott pass deep left to...,4,13:19
10541,2023112603,174,56086,0.110954,0.421207,0.310253,2,I,0,2023,12,IND,TB,(11:29) B.Mayfield pass incomplete deep left t...,1,11:29
19769,2023092403,3198,54514,0.2153,0.52261,0.30731,4,C,22,2023,3,GB,NO,"(12:30) (No Huddle, Shotgun) J.Love pass short...",4,12:30
17653,2023122411,2070,54969,0.119438,0.422529,0.303091,3,C,41,2023,16,DEN,NE,(12:44) B.Zappe pass deep left to D.Douglas to...,3,12:44


In [131]:
c[c['play_id'] == 1082]

Unnamed: 0,game_id,play_id,nfl_id,baseline_prob,real_prob,delta,fold,pass_result,yards_gained,season,week,home_team_abbr,visitor_team_abbr,play_description,quarter,game_clock
10325,2023112301,1082,44827,0.493925,0.497229,0.003304,2,C,30,2023,12,DAL,WAS,(:01) (Shotgun) S.Howell pass deep left to C.S...,1,00:01
10326,2023112301,1082,54632,0.493925,0.455996,-0.037928,2,C,30,2023,12,DAL,WAS,(:01) (Shotgun) S.Howell pass deep left to C.S...,1,00:01
10759,2023112609,1082,43351,0.691522,0.692315,0.000793,2,C,8,2023,12,PHI,BUF,(15:00) (Shotgun) J.Allen pass short right to ...,2,15:00
10760,2023112609,1082,44869,0.691522,0.690369,-0.001153,2,C,8,2023,12,PHI,BUF,(15:00) (Shotgun) J.Allen pass short right to ...,2,15:00
15365,2023111210,1082,52435,0.900951,0.899612,-0.001339,3,C,3,2023,10,SEA,WAS,(13:32) S.Howell pass short right to A.Armah t...,2,13:32
20452,2023100110,1082,53509,0.913754,0.914665,0.000911,4,C,5,2023,4,LAC,LV,(14:33) (Shotgun) J.Herbert pass short right t...,2,14:33
26568,2023091711,1082,55880,0.123415,0.025208,-0.098208,5,C,53,2023,2,DEN,WAS,(13:15) (Shotgun) R.Wilson pass deep left to M...,2,13:15
28590,2023102911,1082,46137,0.083381,0.111845,0.028464,5,C,39,2023,8,DEN,KC,(13:03) (Shotgun) R.Wilson pass deep middle to...,2,13:03
28591,2023102911,1082,52546,0.083381,0.083438,5.7e-05,5,C,39,2023,8,DEN,KC,(13:03) (Shotgun) R.Wilson pass deep middle to...,2,13:03
28592,2023102911,1082,54527,0.083381,0.083784,0.000403,5,C,39,2023,8,DEN,KC,(13:03) (Shotgun) R.Wilson pass deep middle to...,2,13:03


In [124]:
a = pd.read_csv('./outputs/catch_probabilities_log.csv')
a['real_less_proj'] = a['pred_catch_prob_by_real_traj'] - a['pred_catch_prob_by_proj_traj']
a.sort_values('real_less_proj', ascending=False, inplace=True)
a.merge(supplementary_data[['game_id','play_id','pass_result', 'yards_gained', 'season','week','home_team_abbr','visitor_team_abbr','play_description', 'quarter','game_clock']],
        on=['game_id','play_id'], how='left').head(30)
# a.sort_values('real_less_proj', ascending=False).head(30)
# b = a.sort_values('delta')
# b.shape
# b.merge(supplementary_data[['game_id','play_id','pass_result', 'yards_gained', 'season','week','home_team_abbr','visitor_team_abbr','play_description', 'quarter','game_clock']],
#         on=['game_id','play_id'], how='left').head(30)

Unnamed: 0,game_id,play_id,pred_catch_prob_by_proj_traj,pred_catch_prob_by_real_traj,real_less_proj,pass_result,yards_gained,season,week,home_team_abbr,visitor_team_abbr,play_description,quarter,game_clock
0,2023100805,2735,0.093919,0.741877,0.647958,C,3,2023,5,NE,NO,(9:12) (Shotgun) D.Carr pass short right to R....,3,09:12
1,2023091711,1082,0.123415,0.744086,0.620671,C,53,2023,2,DEN,WAS,(13:15) (Shotgun) R.Wilson pass deep left to M...,2,13:15
2,2023110200,2677,0.215321,0.819346,0.604025,I,0,2023,9,PIT,TEN,(5:49) (Shotgun) K.Pickett pass incomplete sho...,3,05:49
3,2023101508,2360,0.224503,0.823885,0.599383,I,0,2023,6,TB,DET,(7:52) (Shotgun) B.Mayfield pass incomplete sh...,3,07:52
4,2023101600,4288,0.179383,0.77526,0.595877,I,0,2023,6,LAC,DAL,(2:28) (Shotgun) D.Prescott pass incomplete de...,4,02:28
5,2023091701,3025,0.061843,0.649082,0.587239,I,0,2023,2,BUF,LV,(12:57) (Shotgun) J.Garoppolo pass incomplete ...,4,12:57
6,2023100111,205,0.249542,0.823836,0.574294,I,0,2023,4,DAL,NE,"(12:16) (No Huddle, Shotgun) D.Prescott pass i...",1,12:16
7,2023091707,1259,0.326438,0.887386,0.560948,I,0,2023,2,TEN,LAC,(8:29) (Shotgun) J.Herbert pass incomplete sho...,2,08:29
8,2023101505,3419,0.048402,0.606116,0.557713,I,0,2023,6,HOU,NO,(9:40) (Shotgun) C.Stroud pass incomplete deep...,4,09:40
9,2023102210,1994,0.08882,0.639325,0.550504,C,27,2023,7,PHI,MIA,(:45) (Shotgun) T.Tagovailoa pass deep left to...,2,00:45


In [None]:
# 2023100112	1977
# 2023091707	1259
# 2023101506	3181
# 2023121007	3552
# 2024010708	2810
# 2023102201	2765
# 2023112604	656
# 2023091005	3483
# 2023121007	2564
# 2023112605	1823
supplementary_data[(supplementary_data['gamae_id']== 2023112605)
                   & (supplementary_data['play_id']== 1823)]

Unnamed: 0,game_id,season,week,game_date,game_time_eastern,home_team_abbr,visitor_team_abbr,play_id,play_description,quarter,game_clock,down,yards_to_go,possession_team,defensive_team,yardline_side,yardline_number,pre_snap_home_score,pre_snap_visitor_score,play_nullified_by_penalty,pass_result,pass_length,offense_formation,receiver_alignment,route_of_targeted_receiver,play_action,dropback_type,dropback_distance,pass_location_type,defenders_in_the_box,team_coverage_man_zone,team_coverage_type,penalty_yards,pre_penalty_yards_gained,yards_gained,expected_points,expected_points_added,pre_snap_home_team_win_probability,pre_snap_visitor_team_win_probability,home_team_win_probability_added,visitor_team_win_probility_added
9063,2023112605,2023,12,11/26/2023,13:00:00,TEN,CAR,1823,(1:05) (Shotgun) W.Levis pass incomplete short...,2,01:05,3,4,TEN,CAR,TEN,39,14,3,N,I,15,SHOTGUN,3x1,WHEEL,False,TRADITIONAL,3.23,INSIDE_BOX,6,MAN_COVERAGE,COVER_1_MAN,,0,0,1.135092,-0.823373,0.884547,0.115453,-0.031048,0.031048


In [71]:
train_input[(train_input['game_id'] == 2023102209)
             & (train_input['play_id'] == 46456)]

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y


In [61]:
train_output[(train_output['game_id'] == 2023102209)
             & (train_output['play_id'] == 4016)]

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
22508,2023102209,4016,54527,1,30.74,16.82
22509,2023102209,4016,54527,2,30.49,16.89
22510,2023102209,4016,54527,3,30.27,16.96
22511,2023102209,4016,54527,4,30.14,17.06
22512,2023102209,4016,54527,5,30.05,17.15
...,...,...,...,...,...,...
22629,2023102209,4016,40024,14,35.37,20.10
22630,2023102209,4016,40024,15,34.99,20.49
22631,2023102209,4016,40024,16,34.65,20.84
22632,2023102209,4016,40024,17,34.33,21.16


In [106]:
train_input[train_input['nfl_id'] == 46700]

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y
66743,2023091003,81,False,46700,1,left,83,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,79.37,20.15,0.80,0.88,206.12,74.24,12,70.470001,44.689999
66744,2023091003,81,False,46700,2,left,83,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,79.34,20.08,0.80,0.89,198.31,74.24,12,70.470001,44.689999
66745,2023091003,81,False,46700,3,left,83,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,79.31,20.00,0.84,0.55,198.95,72.29,12,70.470001,44.689999
66746,2023091003,81,False,46700,4,left,83,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,79.30,19.92,0.86,0.61,191.04,70.74,12,70.470001,44.689999
66747,2023091003,81,False,46700,5,left,83,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,79.28,19.84,0.83,0.29,192.11,72.86,12,70.470001,44.689999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69126,2024010702,2873,False,46700,25,right,42,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,45.79,18.95,4.51,1.31,65.59,331.18,8,42.410000,35.560001
69127,2024010702,2873,False,46700,26,right,42,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,46.20,19.16,4.65,1.20,63.42,335.56,8,42.410000,35.560001
69128,2024010702,2873,False,46700,27,right,42,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,46.63,19.38,4.84,1.39,61.28,341.45,8,42.410000,35.560001
69129,2024010702,2873,False,46700,28,right,42,Tre Herndon,5-11,185,1996-03-05,CB,Defense,Defensive Coverage,47.06,19.63,4.97,1.77,58.31,342.54,8,42.410000,35.560001


In [107]:
supplementary_data.head()

Unnamed: 0,game_id,season,week,game_date,game_time_eastern,home_team_abbr,visitor_team_abbr,play_id,play_description,quarter,game_clock,down,yards_to_go,possession_team,defensive_team,yardline_side,yardline_number,pre_snap_home_score,pre_snap_visitor_score,play_nullified_by_penalty,pass_result,pass_length,offense_formation,receiver_alignment,route_of_targeted_receiver,play_action,dropback_type,dropback_distance,pass_location_type,defenders_in_the_box,team_coverage_man_zone,team_coverage_type,penalty_yards,pre_penalty_yards_gained,yards_gained,expected_points,expected_points_added,pre_snap_home_team_win_probability,pre_snap_visitor_team_win_probability,home_team_win_probability_added,visitor_team_win_probility_added
0,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,3461,(10:46) (Shotgun) J.Goff pass deep left to J.R...,4,10:46,3,12,DET,KC,DET,23,20,14,N,C,18,EMPTY,3x2,IN,False,TRADITIONAL,5.3,INSIDE_BOX,6,ZONE_COVERAGE,COVER_2_ZONE,,18,18,-0.664416,2.945847,0.834296,0.165704,-0.081149,0.081149
1,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,461,(7:30) J.Goff pass short right to J.Reynolds t...,1,07:30,1,10,DET,KC,DET,34,0,0,N,C,13,SINGLEBACK,3x1,POST,True,TRADITIONAL,4.72,INSIDE_BOX,7,ZONE_COVERAGE,COVER_6_ZONE,,21,21,1.926131,1.345633,0.544618,0.455382,-0.029415,0.029415
2,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1940,(:09) (Shotgun) J.Goff pass incomplete deep ri...,2,00:09,2,10,DET,KC,DET,42,14,7,N,I,18,SHOTGUN,3x1,OUT,False,TRADITIONAL,4.44,INSIDE_BOX,6,ZONE_COVERAGE,COVER_2_ZONE,,0,0,0.281891,-0.081964,0.771994,0.228006,0.000791,-0.000791
3,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1711,"(:45) (No Huddle, Shotgun) P.Mahomes pass deep...",2,00:45,1,10,KC,DET,DET,30,7,7,N,C,23,SHOTGUN,3x1,CORNER,False,TRADITIONAL,5.36,INSIDE_BOX,5,ZONE_COVERAGE,COVER_2_ZONE,,26,26,3.452352,2.342947,0.663187,0.336813,0.041843,-0.041843
4,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1588,(1:54) (Shotgun) P.Mahomes pass incomplete dee...,2,01:54,1,10,KC,DET,KC,43,7,7,N,I,38,SHOTGUN,2x2,POST,False,TRADITIONAL,4.59,INSIDE_BOX,6,ZONE_COVERAGE,COVER_4_ZONE,,0,0,1.921525,-0.324035,0.615035,0.384965,6.1e-05,-6.1e-05


In [5]:
# ===================
# Conduct kinematics test - NOTE: move this later
# ===================
ids = ["game_id", "play_id", "nfl_id", "frame_id"]
unique_plays = train_input[['game_id', 'play_id']].drop_duplicates()
sampled_plays = unique_plays.sample(n=2000, random_state=42)
test = train_input.merge(sampled_plays, on=['game_id', 'play_id']).reset_index(drop=True)

kinematics_test = calculate_speed_and_direction(test[['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y']])
test_with_kinematics = test.merge(kinematics_test[ids + ['s','dir']], on=['game_id', 'play_id', 'nfl_id', "frame_id"], suffixes = ('', '_kinematics'))
correlations = test_with_kinematics[['s', 's_kinematics', 'dir', 'dir_kinematics']].corr()
correlations

Unnamed: 0,s,s_kinematics,dir,dir_kinematics
s,1.0,0.998408,-0.002528,0.034965
s_kinematics,0.998408,1.0,-0.002918,0.034257
dir,-0.002528,-0.002918,1.0,0.931304
dir_kinematics,0.034965,0.034257,0.931304,1.0


In [6]:
# ==========================================
# Prepare Sequences Geometric - Base
# ==========================================

# TODO: implement kinematics based on what's below
need_to_calculate = ["s", "dir"]
train_output = calculate_speed_and_direction(train_output)

# TODO: Re-attach play features from input_df onto output_df
keys = ["game_id", "play_id", "nfl_id"]
play_features = ["player_height", "player_weight","player_side","player_role",
                 "player_position",
                 "play_direction", "absolute_yardline_number",
                 "ball_land_x","ball_land_y"]

# NOTE: Stuff to add later on:
# How far was the throw
# QB orientation
# Defender angles collapsed

# TODO: Actually create the features
input = train_input[keys + play_features].drop_duplicates()

train_output = train_output.merge(input, on=keys, how='inner')
train_output = train_output.merge(play_results, on=['game_id','play_id'], how='left', indicator= True)
assert all(train_output['_merge'] == 'both')
train_output = train_output.drop(columns=['_merge'])


In [7]:
train_output.shape

(562936, 18)

In [8]:
def prepare_completions(suppl_df: pd.DataFrame) -> pd.DataFrame:
  play_results = suppl_df[['game_id','play_id','pass_result']].drop_duplicates()
  play_results.loc[play_results['pass_result'] == 'IN', 'pass_result'] = 'I'
  return play_results

def attach_and_prepare_play_level_features(input_df: pd.DataFrame,
                                           output_df:pd.DataFrame,
                                           supplementary_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attaches play-level features from input_df and supplementary_df onto output_df

    Args:
        input_df (pd.DataFrame): Input DataFrame containing pre-throw tracking data
        output_df (pd.DataFrame): Output DataFrame containing post-throw tracking data
        supplementary_df (pd.DataFrame): Supplementary DataFrame containing supplementary play-level information
    """
    play_results = prepare_completions(supplementary_df)
    
    df = calculate_speed_and_direction(df)

    player_level_keys = ["game_id", "play_id", "nfl_id"]
    play_features = ["player_height", "player_weight","player_side","player_role",
                 "player_position",
                 "play_direction", "absolute_yardline_number",
                 "ball_land_x","ball_land_y"]

    input = input_df[player_level_keys + play_features].drop_duplicates()

    output = output_df.merge(input, on=player_level_keys, how='inner')
    output = output.merge(play_results, on=['game_id','play_id'], how='left', indicator= True)
    assert all(output_df['_merge'] == 'both')
    output_df = output_df.drop(columns=['_merge'])

    return df

In [9]:
# ============================================================================
# GEOMETRIC BASELINE - THE BREAKTHROUGH
# ============================================================================

def compute_geometric_endpoint(df):
    """
    Compute where each player SHOULD end up based on geometry.
    This is the deterministic part - no learning needed.
    """
    df = df.copy()
    
    # Time to play end
    if 'num_frames_output' in df.columns:
        t_total = df['num_frames_output'] / 10.0
    else:
        t_total = 3.0
    
    df['time_to_endpoint'] = t_total
    
    # Initialize with momentum (default rule)
    df['geo_endpoint_x'] = df['x'] + df['velocity_x'] * t_total
    df['geo_endpoint_y'] = df['y'] + df['velocity_y'] * t_total
    
    # Rule 1: Targeted Receivers converge to ball
    if 'ball_land_x' in df.columns:
        receiver_mask = df['player_role'] == 'Targeted Receiver'
        df.loc[receiver_mask, 'geo_endpoint_x'] = df.loc[receiver_mask, 'ball_land_x']
        df.loc[receiver_mask, 'geo_endpoint_y'] = df.loc[receiver_mask, 'ball_land_y']
        
        # Rule 2: Defenders mirror receivers (maintain offset)
        defender_mask = df['player_role'] == 'Defensive Coverage'
        has_mirror = df.get('mirror_offset_x', 0).notna() & (df.get('mirror_wr_dist', 50) < 15)
        coverage_mask = defender_mask & has_mirror
        
        df.loc[coverage_mask, 'geo_endpoint_x'] = (
            df.loc[coverage_mask, 'ball_land_x'] + 
            df.loc[coverage_mask, 'mirror_offset_x'].fillna(0)
        )
        df.loc[coverage_mask, 'geo_endpoint_y'] = (
            df.loc[coverage_mask, 'ball_land_y'] + 
            df.loc[coverage_mask, 'mirror_offset_y'].fillna(0)
        )
    
    # Clip to field
    df['geo_endpoint_x'] = df['geo_endpoint_x'].clip(Config.FIELD_X_MIN, Config.FIELD_X_MAX)
    df['geo_endpoint_y'] = df['geo_endpoint_y'].clip(Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
    
    return df

def add_geometric_features(df):
    """Add features that describe the geometric solution"""
    df = compute_geometric_endpoint(df)
    
    # Vector to geometric endpoint
    df['geo_vector_x'] = df['geo_endpoint_x'] - df['x']
    df['geo_vector_y'] = df['geo_endpoint_y'] - df['y']
    df['geo_distance'] = np.sqrt(df['geo_vector_x']**2 + df['geo_vector_y']**2)
    
    # Required velocity to reach geometric endpoint
    t = df['time_to_endpoint'] + 0.1
    df['geo_required_vx'] = df['geo_vector_x'] / t
    df['geo_required_vy'] = df['geo_vector_y'] / t
    
    # Current velocity vs required
    df['geo_velocity_error_x'] = df['geo_required_vx'] - df['velocity_x']
    df['geo_velocity_error_y'] = df['geo_required_vy'] - df['velocity_y']
    df['geo_velocity_error'] = np.sqrt(
        df['geo_velocity_error_x']**2 + df['geo_velocity_error_y']**2
    )
    
    # Required constant acceleration (a = 2*Δx/t²)
    t_sq = t * t
    df['geo_required_ax'] = 2 * df['geo_vector_x'] / t_sq
    df['geo_required_ay'] = 2 * df['geo_vector_y'] / t_sq
    df['geo_required_ax'] = df['geo_required_ax'].clip(-10, 10)
    df['geo_required_ay'] = df['geo_required_ay'].clip(-10, 10)
    
    # Alignment with geometric path
    velocity_mag = np.sqrt(df['velocity_x']**2 + df['velocity_y']**2)
    geo_unit_x = df['geo_vector_x'] / (df['geo_distance'] + 0.1)
    geo_unit_y = df['geo_vector_y'] / (df['geo_distance'] + 0.1)
    df['geo_alignment'] = (
        df['velocity_x'] * geo_unit_x + df['velocity_y'] * geo_unit_y
    ) / (velocity_mag + 0.1)
    
    # Role-specific geometric quality
    df['geo_receiver_urgency'] = df['is_receiver'] * df['geo_distance'] / (t + 0.1)
    df['geo_defender_coupling'] = df['is_coverage'] * (1.0 / (df.get('mirror_wr_dist', 50) + 1.0))
    
    return df

In [10]:
def get_velocity(speed, direction_deg):
    theta = np.deg2rad(direction_deg)
    return speed * np.sin(theta), speed * np.cos(theta)


def height_to_feet(height_str):
    try:
        ft, inches = map(int, str(height_str).split('-'))
        return ft + inches/12
    except:
        return 6.0

def get_opponent_features(input_df: pd.DataFrame) -> pd.DataFrame:
    """Enhanced opponent interaction with MIRROR WR tracking"""
    features = []
    
    for (gid, pid), group in tqdm(input_df.groupby(['game_id', 'play_id']), 
                                   desc="🏈 Opponents", leave=False):
        last = group.sort_values('frame_id').groupby('nfl_id').last()
        
        if len(last) < 2:
            continue
            
        positions = last[['x', 'y']].values
        sides = last['player_side'].values
        speeds = last['s'].values
        directions = last['dir'].values
        roles = last['player_role'].values
        
        receiver_mask = np.isin(roles, ['Targeted Receiver', 'Other Route Runner'])
        
        for i, (nid, side, role) in enumerate(zip(last.index, sides, roles)):
            opp_mask = sides != side
            
            feat = {
                'game_id': gid, 'play_id': pid, 'nfl_id': nid,
                'nearest_opp_dist': 50.0, 'closing_speed': 0.0,
                'num_nearby_opp_3': 0, 'num_nearby_opp_5': 0,
                'mirror_wr_vx': 0.0, 'mirror_wr_vy': 0.0,
                'mirror_offset_x': 0.0, 'mirror_offset_y': 0.0,
                'mirror_wr_dist': 50.0,
            }
            
            if not opp_mask.any():
                features.append(feat)
                continue
            
            opp_positions = positions[opp_mask]
            distances = np.sqrt(((positions[i] - opp_positions)**2).sum(axis=1))
            
            if len(distances) == 0:
                features.append(feat)
                continue
                
            nearest_idx = distances.argmin()
            feat['nearest_opp_dist'] = distances[nearest_idx]
            feat['num_nearby_opp_3'] = (distances < 3.0).sum()
            feat['num_nearby_opp_5'] = (distances < 5.0).sum()
            
            my_vx, my_vy = get_velocity(speeds[i], directions[i])
            opp_speeds = speeds[opp_mask]
            opp_dirs = directions[opp_mask]
            opp_vx, opp_vy = get_velocity(opp_speeds[nearest_idx], opp_dirs[nearest_idx])
            
            rel_vx = my_vx - opp_vx
            rel_vy = my_vy - opp_vy
            to_me = positions[i] - opp_positions[nearest_idx]
            to_me_norm = to_me / (np.linalg.norm(to_me) + 0.1)
            feat['closing_speed'] = -(rel_vx * to_me_norm[0] + rel_vy * to_me_norm[1])
            
            if role == 'Defensive Coverage' and receiver_mask.any():
                rec_positions = positions[receiver_mask]
                rec_distances = np.sqrt(((positions[i] - rec_positions)**2).sum(axis=1))
                
                if len(rec_distances) > 0:
                    closest_rec_idx = rec_distances.argmin()
                    rec_indices = np.where(receiver_mask)[0]
                    actual_rec_idx = rec_indices[closest_rec_idx]
                    
                    rec_vx, rec_vy = get_velocity(speeds[actual_rec_idx], directions[actual_rec_idx])
                    
                    feat['mirror_wr_vx'] = rec_vx
                    feat['mirror_wr_vy'] = rec_vy
                    feat['mirror_wr_dist'] = rec_distances[closest_rec_idx]
                    feat['mirror_offset_x'] = positions[i][0] - rec_positions[closest_rec_idx][0]
                    feat['mirror_offset_y'] = positions[i][1] - rec_positions[closest_rec_idx][1]
            
            features.append(feat)
    
    return pd.DataFrame(features)

def extract_route_patterns(input_df, kmeans=None, scaler=None, fit=True):
    """Route clustering"""
    route_features = []
    
    for (gid, pid, nid), group in tqdm(input_df.groupby(['game_id', 'play_id', 'nfl_id']), 
                                        desc="🛣️  Routes", leave=False):
        traj = group.sort_values('frame_id').tail(5)
        
        if len(traj) < 3:
            continue
        
        positions = traj[['x', 'y']].values
        speeds = traj['s'].values
        
        total_dist = np.sum(np.sqrt(np.diff(positions[:, 0])**2 + np.diff(positions[:, 1])**2))
        displacement = np.sqrt((positions[-1, 0] - positions[0, 0])**2 + 
                              (positions[-1, 1] - positions[0, 1])**2)
        straightness = displacement / (total_dist + 0.1)
        
        angles = np.arctan2(np.diff(positions[:, 1]), np.diff(positions[:, 0]))
        if len(angles) > 1:
            angle_changes = np.abs(np.diff(angles))
            max_turn = np.max(angle_changes)
            mean_turn = np.mean(angle_changes)
        else:
            max_turn = mean_turn = 0
        
        speed_mean = speeds.mean()
        speed_change = speeds[-1] - speeds[0] if len(speeds) > 1 else 0
        dx = positions[-1, 0] - positions[0, 0]
        dy = positions[-1, 1] - positions[0, 1]
        
        route_features.append({
            'game_id': gid, 'play_id': pid, 'nfl_id': nid,
            'traj_straightness': straightness,
            'traj_max_turn': max_turn,
            'traj_mean_turn': mean_turn,
            'traj_depth': abs(dx),
            'traj_width': abs(dy),
            'speed_mean': speed_mean,
            'speed_change': speed_change,
        })
    
    route_df = pd.DataFrame(route_features)
    feat_cols = ['traj_straightness', 'traj_max_turn', 'traj_mean_turn',
                 'traj_depth', 'traj_width', 'speed_mean', 'speed_change']
    X = route_df[feat_cols].fillna(0)
    
    if fit:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        kmeans = KMeans(n_clusters=Config.N_ROUTE_CLUSTERS, random_state=Config.SEED, n_init=10)
        route_df['route_pattern'] = kmeans.fit_predict(X_scaled)
        return route_df, kmeans, scaler
    else:
        X_scaled = scaler.transform(X)
        route_df['route_pattern'] = kmeans.predict(X_scaled)
        return route_df
    
def compute_neighbor_embeddings(input_df, k_neigh=Config.K_NEIGH, 
                                radius=Config.RADIUS, tau=Config.TAU):
    """GNN-lite embeddings"""
    print("🕸️  GNN embeddings...")
    
    cols_needed = ["game_id", "play_id", "nfl_id", "frame_id", "x", "y", 
                   "velocity_x", "velocity_y", "player_side"]
    src = input_df[cols_needed].copy()
    
    last = (src.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
               .groupby(["game_id", "play_id", "nfl_id"], as_index=False)
               .tail(1)
               .rename(columns={"frame_id": "last_frame_id"})
               .reset_index(drop=True))
    
    tmp = last.merge(
        src.rename(columns={
            "frame_id": "nb_frame_id", "nfl_id": "nfl_id_nb",
            "x": "x_nb", "y": "y_nb", 
            "velocity_x": "vx_nb", "velocity_y": "vy_nb", 
            "player_side": "player_side_nb"
        }),
        left_on=["game_id", "play_id", "last_frame_id"],
        right_on=["game_id", "play_id", "nb_frame_id"],
        how="left"
    )
    
    tmp = tmp[tmp["nfl_id_nb"] != tmp["nfl_id"]]
    tmp["dx"] = tmp["x_nb"] - tmp["x"]
    tmp["dy"] = tmp["y_nb"] - tmp["y"]
    tmp["dvx"] = tmp["vx_nb"] - tmp["velocity_x"]
    tmp["dvy"] = tmp["vy_nb"] - tmp["velocity_y"]
    tmp["dist"] = np.sqrt(tmp["dx"]**2 + tmp["dy"]**2)
    
    tmp = tmp[np.isfinite(tmp["dist"]) & (tmp["dist"] > 1e-6)]
    if radius is not None:
        tmp = tmp[tmp["dist"] <= radius]
    
    tmp["is_ally"] = (tmp["player_side_nb"] == tmp["player_side"]).astype(np.float32)
    
    keys = ["game_id", "play_id", "nfl_id"]
    tmp["rnk"] = tmp.groupby(keys)["dist"].rank(method="first")
    if k_neigh is not None:
        tmp = tmp[tmp["rnk"] <= float(k_neigh)]
    
    tmp["w"] = np.exp(-tmp["dist"] / float(tau))
    sum_w = tmp.groupby(keys)["w"].transform("sum")
    tmp["wn"] = np.where(sum_w > 0, tmp["w"] / sum_w, 0.0)
    
    tmp["wn_ally"] = tmp["wn"] * tmp["is_ally"]
    tmp["wn_opp"] = tmp["wn"] * (1.0 - tmp["is_ally"])
    
    for col in ["dx", "dy", "dvx", "dvy"]:
        tmp[f"{col}_ally_w"] = tmp[col] * tmp["wn_ally"]
        tmp[f"{col}_opp_w"] = tmp[col] * tmp["wn_opp"]
    
    tmp["dist_ally"] = np.where(tmp["is_ally"] > 0.5, tmp["dist"], np.nan)
    tmp["dist_opp"] = np.where(tmp["is_ally"] < 0.5, tmp["dist"], np.nan)
    
    ag = tmp.groupby(keys).agg(
        gnn_ally_dx_mean=("dx_ally_w", "sum"),
        gnn_ally_dy_mean=("dy_ally_w", "sum"),
        gnn_ally_dvx_mean=("dvx_ally_w", "sum"),
        gnn_ally_dvy_mean=("dvy_ally_w", "sum"),
        gnn_opp_dx_mean=("dx_opp_w", "sum"),
        gnn_opp_dy_mean=("dy_opp_w", "sum"),
        gnn_opp_dvx_mean=("dvx_opp_w", "sum"),
        gnn_opp_dvy_mean=("dvy_opp_w", "sum"),
        gnn_ally_cnt=("is_ally", "sum"),
        gnn_opp_cnt=("is_ally", lambda s: float(len(s) - s.sum())),
        gnn_ally_dmin=("dist_ally", "min"),
        gnn_ally_dmean=("dist_ally", "mean"),
        gnn_opp_dmin=("dist_opp", "min"),
        gnn_opp_dmean=("dist_opp", "mean"),
    ).reset_index()
    
    near = tmp.loc[tmp["rnk"] <= 3, keys + ["rnk", "dist"]].copy()
    if len(near) > 0:
        near["rnk"] = near["rnk"].astype(int)
        dwide = near.pivot_table(index=keys, columns="rnk", values="dist", aggfunc="first")
        dwide = dwide.rename(columns={1: "gnn_d1", 2: "gnn_d2", 3: "gnn_d3"}).reset_index()
        ag = ag.merge(dwide, on=keys, how="left")
    
    for c in ["gnn_ally_dx_mean", "gnn_ally_dy_mean", "gnn_ally_dvx_mean", "gnn_ally_dvy_mean",
              "gnn_opp_dx_mean", "gnn_opp_dy_mean", "gnn_opp_dvx_mean", "gnn_opp_dvy_mean"]:
        ag[c] = ag[c].fillna(0.0)
    for c in ["gnn_ally_cnt", "gnn_opp_cnt"]:
        ag[c] = ag[c].fillna(0.0)
    for c in ["gnn_ally_dmin", "gnn_opp_dmin", "gnn_ally_dmean", "gnn_opp_dmean", 
              "gnn_d1", "gnn_d2", "gnn_d3"]:
        ag[c] = ag[c].fillna(radius if radius is not None else 30.0)
    
    return ag



In [None]:
def prepare_sequences_geometric(input_df, 
                                output_df=None, 
                                test_template=None, 
                                is_training=True, 
                                window_size=10,
                                route_kmeans=None, 
                                route_scaler=None):
    """
    YOUR 154 features + 13 geometric features = 167 total
    
    
    Returns:
        If Training:

        If Test:

    
    """
    
    print(f"\n{'='*80}")
    print(f"PREPARING GEOMETRIC SEQUENCES")
    print(f"{'='*80}")
    
    input_df = input_df.copy()
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    
    print("Step 1: Base features...")
    
    input_df['player_height_feet'] = input_df['player_height'].apply(height_to_feet)
    height_parts = input_df['player_height'].str.split('-', expand=True)
    input_df['height_inches'] = height_parts[0].astype(float) * 12 + height_parts[1].astype(float)
    input_df['bmi'] = (input_df['player_weight'] / (input_df['height_inches']**2)) * 703
    
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    input_df['velocity_x'] = input_df['s'] * np.sin(dir_rad)
    input_df['velocity_y'] = input_df['s'] * np.cos(dir_rad)
    # input_df['acceleration_x'] = input_df['a'] * np.cos(dir_rad)
    # input_df['acceleration_y'] = input_df['a'] * np.sin(dir_rad)
    
    input_df['speed_squared'] = input_df['s'] ** 2
    # input_df['accel_magnitude'] = np.sqrt(input_df['acceleration_x']**2 + input_df['acceleration_y']**2)
    input_df['momentum_x'] = input_df['velocity_x'] * input_df['player_weight']
    input_df['momentum_y'] = input_df['velocity_y'] * input_df['player_weight']
    input_df['kinetic_energy'] = 0.5 * input_df['player_weight'] * input_df['speed_squared']
    
    # input_df['orientation_diff'] = np.abs(input_df['o'] - input_df['dir'])
    # input_df['orientation_diff'] = np.minimum(input_df['orientation_diff'], 360 - input_df['orientation_diff'])
    
    input_df['is_offense'] = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense'] = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Targeted Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer'] = (input_df['player_role'] == 'Passer').astype(int)
    input_df['role_targeted_receiver'] = input_df['is_receiver']
    input_df['role_defensive_coverage'] = input_df['is_coverage']
    input_df['role_passer'] = input_df['is_passer']
    input_df['side_offense'] = input_df['is_offense']
    
    if 'ball_land_x' in input_df.columns:
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball'] = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['dist_to_ball'] = input_df['distance_to_ball']
        input_df['dist_squared'] = input_df['distance_to_ball'] ** 2
        input_df['angle_to_ball'] = np.arctan2(ball_dy, ball_dx)
        input_df['ball_direction_x'] = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y'] = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        input_df['closing_speed_ball'] = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )
        input_df['velocity_toward_ball'] = (
            input_df['velocity_x'] * np.cos(input_df['angle_to_ball']) + 
            input_df['velocity_y'] * np.sin(input_df['angle_to_ball'])
        )
        input_df['velocity_alignment'] = np.cos(input_df['angle_to_ball'] - dir_rad)
        # input_df['angle_diff'] = np.abs(input_df['o'] - np.degrees(input_df['angle_to_ball']))
        # input_df['angle_diff'] = np.minimum(input_df['angle_diff'], 360 - input_df['angle_diff'])
    
    print("Step 2: Advanced features...")
    
    opp_features = get_opponent_features(input_df)
    input_df = input_df.merge(opp_features, on=['game_id', 'play_id', 'nfl_id'], how='left')
    
    if is_training:
        route_features, route_kmeans, route_scaler = extract_route_patterns(input_df)
    else:
        route_features = extract_route_patterns(input_df, route_kmeans, route_scaler, fit=False)
    input_df = input_df.merge(route_features, on=['game_id', 'play_id', 'nfl_id'], how='left')
    
    gnn_features = compute_neighbor_embeddings(input_df)
    input_df = input_df.merge(gnn_features, on=['game_id', 'play_id', 'nfl_id'], how='left')
    
    if 'nearest_opp_dist' in input_df.columns:
        input_df['pressure'] = 1 / np.maximum(input_df['nearest_opp_dist'], 0.5)
        input_df['under_pressure'] = (input_df['nearest_opp_dist'] < 3).astype(int)
        input_df['pressure_x_speed'] = input_df['pressure'] * input_df['s']
    
    if 'mirror_wr_vx' in input_df.columns:
        s_safe = np.maximum(input_df['s'], 0.1)
        input_df['mirror_similarity'] = (
            input_df['velocity_x'] * input_df['mirror_wr_vx'] + 
            input_df['velocity_y'] * input_df['mirror_wr_vy']
        ) / s_safe
        input_df['mirror_offset_dist'] = np.sqrt(
            input_df['mirror_offset_x']**2 + input_df['mirror_offset_y']**2
        )
        input_df['mirror_alignment'] = input_df['mirror_similarity'] * input_df['role_defensive_coverage']
    
    print("Step 3: Temporal features...")
    
    gcols = ['game_id', 'play_id', 'nfl_id']
    
    for lag in [1, 2, 3, 4, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's', 'a']:
            if col in input_df.columns:
                input_df[f'{col}_lag{lag}'] = input_df.groupby(gcols)[col].shift(lag)
    
    for window in [3, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's']:
            if col in input_df.columns:
                input_df[f'{col}_rolling_mean_{window}'] = (
                    input_df.groupby(gcols)[col]
                      .rolling(window, min_periods=1).mean()
                      .reset_index(level=[0,1,2], drop=True)
                )
                input_df[f'{col}_rolling_std_{window}'] = (
                    input_df.groupby(gcols)[col]
                      .rolling(window, min_periods=1).std()
                      .reset_index(level=[0,1,2], drop=True)
                )
    
    for col in ['velocity_x', 'velocity_y']:
        if col in input_df.columns:
            input_df[f'{col}_delta'] = input_df.groupby(gcols)[col].diff()
    
    input_df['velocity_x_ema'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['velocity_y_ema'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['speed_ema'] = input_df.groupby(gcols)['s'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    
    print("Step 4: Time features...")
    
    if 'num_frames_output' in input_df.columns:
        max_frames = input_df['num_frames_output']
        
        input_df['max_play_duration'] = max_frames / 10.0
        input_df['frame_time'] = input_df['frame_id'] / 10.0
        input_df['progress_ratio'] = input_df['frame_id'] / np.maximum(max_frames, 1)
        input_df['time_remaining'] = (max_frames - input_df['frame_id']) / 10.0
        input_df['frames_remaining'] = max_frames - input_df['frame_id']
        
        input_df['expected_x_at_ball'] = input_df['x'] + input_df['velocity_x'] * input_df['frame_time']
        input_df['expected_y_at_ball'] = input_df['y'] + input_df['velocity_y'] * input_df['frame_time']
        
        if 'ball_land_x' in input_df.columns:
            input_df['error_from_ball_x'] = input_df['expected_x_at_ball'] - input_df['ball_land_x']
            input_df['error_from_ball_y'] = input_df['expected_y_at_ball'] - input_df['ball_land_y']
            input_df['error_from_ball'] = np.sqrt(
                input_df['error_from_ball_x']**2 + input_df['error_from_ball_y']**2
            )
            
            input_df['weighted_dist_by_time'] = input_df['dist_to_ball'] / (input_df['frame_time'] + 0.1)
            input_df['dist_scaled_by_progress'] = input_df['dist_to_ball'] * (1 - input_df['progress_ratio'])
        
        input_df['time_squared'] = input_df['frame_time'] ** 2
        input_df['velocity_x_progress'] = input_df['velocity_x'] * input_df['progress_ratio']
        input_df['velocity_y_progress'] = input_df['velocity_y'] * input_df['progress_ratio']
        input_df['speed_scaled_by_time_left'] = input_df['s'] * input_df['time_remaining']
        
        input_df['actual_play_length'] = max_frames
        input_df['length_ratio'] = max_frames / 30.0
    
    # 🎯 THE BREAKTHROUGH: Add geometric features
    print("Step 5: 🎯 Geometric endpoint features...")
    input_df = add_geometric_features(input_df)
    
    print("Step 6: Building feature list...")
    
    # Your 154 proven features
    feature_cols = [
        'x', 'y', 's', 
        # 'a', 'o', 
        'dir', 'frame_id', 'ball_land_x', 'ball_land_y',
        'player_height_feet', 'player_weight', 'height_inches', 'bmi',
        'velocity_x', 'velocity_y', 
        # 'acceleration_x', 'acceleration_y',
        'momentum_x', 'momentum_y', 'kinetic_energy',
        'speed_squared', 'accel_magnitude', 
        # 'orientation_diff',
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',
        'role_targeted_receiver', 'role_defensive_coverage', 'role_passer', 'side_offense',
        'distance_to_ball', 'dist_to_ball', 'dist_squared', 'angle_to_ball', 
        'ball_direction_x', 'ball_direction_y', 'closing_speed_ball',
        'velocity_toward_ball', 'velocity_alignment', 
        # 'angle_diff',
        'nearest_opp_dist', 'closing_speed', 'num_nearby_opp_3', 'num_nearby_opp_5',
        'mirror_wr_vx', 'mirror_wr_vy', 'mirror_offset_x', 'mirror_offset_y',
        'pressure', 'under_pressure', 'pressure_x_speed', 
        'mirror_similarity', 'mirror_offset_dist', 'mirror_alignment',
        'route_pattern', 'traj_straightness', 'traj_max_turn', 'traj_mean_turn',
        'traj_depth', 'traj_width', 'speed_mean', 'speed_change',
        'gnn_ally_dx_mean', 'gnn_ally_dy_mean', 'gnn_ally_dvx_mean', 'gnn_ally_dvy_mean',
        'gnn_opp_dx_mean', 'gnn_opp_dy_mean', 'gnn_opp_dvx_mean', 'gnn_opp_dvy_mean',
        'gnn_ally_cnt', 'gnn_opp_cnt',
        'gnn_ally_dmin', 'gnn_ally_dmean', 'gnn_opp_dmin', 'gnn_opp_dmean',
        'gnn_d1', 'gnn_d2', 'gnn_d3',
    ]
    
    for lag in [1, 2, 3, 4, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's', 'a']:
            feature_cols.append(f'{col}_lag{lag}')
    
    for window in [3, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y', 's']:
            feature_cols.append(f'{col}_rolling_mean_{window}')
            feature_cols.append(f'{col}_rolling_std_{window}')
    
    feature_cols.extend(['velocity_x_delta', 'velocity_y_delta'])
    feature_cols.extend(['velocity_x_ema', 'velocity_y_ema', 'speed_ema'])
    
    feature_cols.extend([
        'max_play_duration', 'frame_time', 'progress_ratio', 'time_remaining', 'frames_remaining',
        'expected_x_at_ball', 'expected_y_at_ball', 
        'error_from_ball_x', 'error_from_ball_y', 'error_from_ball',
        'time_squared', 'weighted_dist_by_time', 
        'velocity_x_progress', 'velocity_y_progress', 'dist_scaled_by_progress',
        'speed_scaled_by_time_left', 'actual_play_length', 'length_ratio',
    ])
    
    # 🎯 Add 13 geometric features
    feature_cols.extend([
        'geo_endpoint_x', 'geo_endpoint_y',
        'geo_vector_x', 'geo_vector_y', 'geo_distance',
        'geo_required_vx', 'geo_required_vy',
        'geo_velocity_error_x', 'geo_velocity_error_y', 'geo_velocity_error',
        'geo_required_ax', 'geo_required_ay',
        'geo_alignment',
    ])
    
    feature_cols = [c for c in feature_cols if c in input_df.columns]
    print(f"✓ Using {len(feature_cols)} features (154 proven + 13 geometric)")
    
    print("Step 7: Creating sequences...")
    
    target_rows = input_df.copy() # Instantiate before we mess with input_df
    target_groups = target_rows[['game_id', 'play_id']].drop_duplicates()
    print(f"The number of target groups is {len(target_groups)}")

    sequences, targets_catch, sequence_ids = [], [], []

    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups), desc="Creating sequences"):
        # key = (row['game_id'], row['play_id'], row['nfl_id']) 
        key = (row['game_id'], row['play_id'])
        
        try:
            group_df = input_df[(input_df['game_id']==row['game_id']) &
                                 (input_df['play_id']==row['play_id'])]
        except KeyError:
            continue
        
        group_df = group_df[group_df['player_role']=='Targeted Receiver']
        if len(group_df) == 0:
            print("AAAAAAAAAAAH - why this zero?")
            continue
        input_window = group_df.tail(window_size)
        
        if len(input_window) < window_size:
            if is_training:
                continue
            pad_len = window_size - len(input_window)
            pad_df = pd.DataFrame(np.nan, index=range(pad_len), columns=input_window.columns)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)
        
        input_window = input_window.fillna(group_df.mean(numeric_only=True))
        seq = input_window[feature_cols].values
        
        if np.isnan(seq).any():
            if is_training:
                print("BBBBBBBBBBBBB - what this?")
                continue
            seq = np.nan_to_num(seq, nan=0.0)
        
        sequences.append(seq)
        
        # Store geometric endpoint for this player
        geo_x = input_window.iloc[-1]['geo_endpoint_x']
        geo_y = input_window.iloc[-1]['geo_endpoint_y']
        
        if is_training:
            out_grp = output_df[
                (output_df['game_id']==group_df.iloc[0]['game_id']) &
                (output_df['play_id']==group_df.iloc[0]['play_id']) &
                (output_df['nfl_id']==group_df.iloc[0]['nfl_id'])
            ].sort_values('frame_id')

            was_catch = out_grp['pass_result'].values[0] == 'C'
            targets_catch.append(1 if was_catch else 0)
            
        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'frame_id': input_window.iloc[-1]['frame_id']
        })

    print(f"✓ Created {len(sequences)} sequences")
    
    if is_training:
        return (sequences, 
                targets_catch,
                # targets_dx,
                # targets_dy, 
                # targets_frame_ids, 
                sequence_ids, 
                # geo_endpoints_x, 
                # geo_endpoints_y, 
                route_kmeans, 
                route_scaler,
                feature_cols)
    return sequences, sequence_ids#, geo_endpoints_x, geo_endpoints_y
    # return input_df

In [18]:
# ============================================================================
# MODEL ARCHITECTURE (YOUR PROVEN GRU + ATTENTION)
# ============================================================================
class JointSeqModel(nn.Module):
    """Your proven architecture - unchanged"""
    
    def __init__(self, input_dim: int):
        super().__init__()
        self.gru = nn.GRU(input_dim, 128, num_layers=2, batch_first=True, dropout=0.1)
        self.pool_ln = nn.LayerNorm(128)
        self.pool_attn = nn.MultiheadAttention(128, num_heads=4, batch_first=True)
        self.pool_query = nn.Parameter(torch.randn(1, 1, 128))
        
        self.head = nn.Sequential(
            nn.Linear(128, 256), 
            nn.GELU(), 
            nn.Dropout(0.2), 
            nn.Linear(256, 1)
        )
    
    def forward(self, x):
        h, _ = self.gru(x)
        B = h.size(0)
        q = self.pool_query.expand(B, -1, -1)
        ctx, _ = self.pool_attn(q, self.pool_ln(h), self.pool_ln(h))
        out = self.head(ctx.squeeze(1))
        return out.squeeze(-1)

In [19]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# ============================================================================
# TRAINING
# ============================================================================

def prepare_targets(batch_dx, batch_dy, max_h):
    tensors_x, tensors_y, masks = [], [], []
    
    for dx, dy in zip(batch_dx, batch_dy):
        L = len(dx)
        padded_x = np.pad(dx, (0, max_h - L), constant_values=0).astype(np.float32)
        padded_y = np.pad(dy, (0, max_h - L), constant_values=0).astype(np.float32)
        mask = np.zeros(max_h, dtype=np.float32)
        mask[:L] = 1.0
        
        tensors_x.append(torch.tensor(padded_x))
        tensors_y.append(torch.tensor(padded_y))
        masks.append(torch.tensor(mask))
    
    targets = torch.stack([torch.stack(tensors_x), torch.stack(tensors_y)], dim=-1)
    return targets, torch.stack(masks)

def train_model(X_train: List[np.ndarray], 
                y_train: List[int], 
                X_val: List[np.ndarray], 
                y_val: List[int], 
                input_dim: int, 
                config: Config):
    device = config.DEVICE
    model = JointSeqModel(input_dim).to(device)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
    
    train_batches = []
    for i in range(0, len(X_train), config.BATCH_SIZE):
        end = min(i + config.BATCH_SIZE, len(X_train))
        bx = torch.tensor(np.stack(X_train[i:end]).astype(np.float32))
        by = torch.tensor(np.stack(y_train[i:end]).astype(np.float32))
        train_batches.append((bx, by))
    
    val_batches = []
    for i in range(0, len(X_val), config.BATCH_SIZE):
        end = min(i + config.BATCH_SIZE, len(X_val))
        bx = torch.tensor(np.stack(X_val[i:end]).astype(np.float32))
        by = torch.tensor(np.stack(y_val[i:end]).astype(np.float32))
        val_batches.append((bx, by))
    
    best_loss, best_state, bad = float('inf'), None, 0
    
    for epoch in range(1, config.EPOCHS + 1):
        model.train()
        train_losses = []
        for bx, by in train_batches:
            bx, by = bx.to(device), by.to(device)
            pred = model(bx)
            loss = criterion(pred, by)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(loss.item())
        
        model.eval()
        val_losses = []
        all_preds = []
        all_targets = []
        
        with torch.no_grad():
            for bx, by in val_batches:
                bx, by = bx.to(device), by.to(device)
                pred = model(bx)
                val_losses.append(criterion(pred, by).item())
                # ADD THESE 2 LINES:
                all_preds.append(torch.sigmoid(pred).cpu().numpy())
                all_targets.append(by.cpu().numpy())
              
        train_loss, val_loss = np.mean(train_losses), np.mean(val_losses)
        # ADD THESE LINES:
        y_pred_proba = np.concatenate(all_preds)
        y_true = np.concatenate(all_targets)
        y_pred = (y_pred_proba > 0.5).astype(int)
        
        auc = roc_auc_score(y_true, y_pred_proba)
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        
        scheduler.step(val_loss)
        
        if epoch % 10 == 0:
            print(f"  Epoch {epoch}: train={train_loss:.4f}, val={val_loss:.4f} | "
                f"AUC={auc:.3f}, Acc={acc:.3f}, Prec={precision:.3f}, Rec={recall:.3f}, F1={f1:.3f}")
      
        
        if val_loss < best_loss:
            best_loss = val_loss
            train_loss_at_best = train_loss
            auc_at_best = auc
            accuracy_at_best = acc
            precision_at_best = precision
            recall_at_best = recall
            f1_at_best = f1
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= config.PATIENCE:
                print(f"  Early stop at epoch {epoch}")
                break
    
    if best_state:
        model.load_state_dict(best_state)
    
    return model, best_loss, train_loss_at_best, auc_at_best, accuracy_at_best, precision_at_best, recall_at_best, f1_at_best



In [20]:
# test = train_output.merge(sampled_plays, on=['game_id', 'play_id'], how='left')
test = train_output.copy()
result = prepare_sequences_geometric(test, output_df = test)
sequences,targets_catch, sequence_ids, route_kmeans, route_scale, feature_cols = result

sequences = list(sequences)
targets_catch = list(targets_catch)


PREPARING GEOMETRIC SEQUENCES
Step 1: Base features...
Step 2: Advanced features...


                                                                   ]

🕸️  GNN embeddings...
Step 3: Temporal features...
Step 4: Time features...
Step 5: 🎯 Geometric endpoint features...
Step 6: Building feature list...
✓ Using 137 features (154 proven + 13 geometric)
Step 7: Creating sequences...
The number of target groups is 14108


Creating sequences:   0%|          | 19/14108 [00:00<01:17, 182.43it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   0%|          | 61/14108 [00:00<01:22, 170.37it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   1%|          | 151/14108 [00:00<01:18, 178.87it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   3%|▎         | 476/14108 [00:02<01:11, 190.15it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   4%|▍         | 625/14108 [00:03<01:17, 173.82it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   6%|▌         | 837/14108 [00:04<01:10, 187.24it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   6%|▋         | 903/14108 [00:04<01:06, 197.26it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   7%|▋         | 982/14108 [00:05<01:24, 155.36it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   8%|▊         | 1083/14108 [00:05<01:11, 182.83it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   8%|▊         | 1192/14108 [00:06<01:08, 189.27it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   9%|▉         | 1280/14108 [00:06<01:02, 204.04it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:   9%|▉         | 1339/14108 [00:07<01:16, 166.58it/s]

BBBBBBBBBBBBB - what this?
BBBBBBBBBBBBB - what this?
BBBBBBBBBBBBB - what this?


Creating sequences:  10%|█         | 1470/14108 [00:08<01:16, 166.09it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:  12%|█▏        | 1635/14108 [00:08<01:03, 195.12it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:  12%|█▏        | 1728/14108 [00:09<01:00, 203.32it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:  13%|█▎        | 1795/14108 [00:09<00:59, 206.31it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:  15%|█▍        | 2083/14108 [00:11<01:11, 167.82it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:  16%|█▌        | 2266/14108 [00:12<01:17, 153.20it/s]

BBBBBBBBBBBBB - what this?


Creating sequences:  17%|█▋        | 2388/14108 [00:13<01:13, 158.49it/s]

BBBBBBBBBBBBB - what this?
BBBBBBBBBBBBB - what this?


Creating sequences:  18%|█▊        | 2548/14108 [00:14<01:05, 177.72it/s]


KeyboardInterrupt: 

In [126]:
def compute_feature_importance(model, X_val, y_val, feature_names, device, n_repeats=3):
    """Permutation-based feature importance"""
    from sklearn.metrics import roc_auc_score
    
    model.eval()
    X_val_tensor = torch.tensor(np.stack(X_val).astype(np.float32)).to(device)
    y_val_array = np.array(y_val)
    
    # Baseline score
    with torch.no_grad():
        baseline_pred = torch.sigmoid(model(X_val_tensor)).cpu().numpy()
        baseline_score = roc_auc_score(y_val_array, baseline_pred)
    
    importances = []
    
    for feat_idx in tqdm(range(len(feature_names)), desc="Computing importances"):
        scores = []
        for _ in range(n_repeats):
            X_permuted = [x.copy() for x in X_val]
            # Permute this feature across all sequences
            perm_values = np.random.permutation([x[:, feat_idx] for x in X_val])
            for i, x in enumerate(X_permuted):
                x[:, feat_idx] = perm_values[i]
            
            X_perm_tensor = torch.tensor(np.stack(X_permuted).astype(np.float32)).to(device)
            with torch.no_grad():
                perm_pred = torch.sigmoid(model(X_perm_tensor)).cpu().numpy()
                perm_score = roc_auc_score(y_val_array, perm_pred)
            scores.append(baseline_score - perm_score)  # Drop in performance
        
        importances.append(np.mean(scores))
    
    # Create DataFrame
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    return importance_df


In [21]:
X_traj.shape

NameError: name 'X_traj' is not defined

In [172]:
print("\n[3/4] Training geometric models...")
groups = np.array([d['game_id'] for d in sequence_ids])
gkf = GroupKFold(n_splits=config.N_FOLDS)

models, scalers = [], []

calibration_r2s = []  # ADD THIS BEFORE LOOP
importance_dfs = []  # STORE IMPORTANCE DFS

for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), 1):
    print(f"\n{'='*60}")
    print(f"Fold {fold}/{config.N_FOLDS}")
    print(f"{'='*60}")

    X_tr = [sequences[i] for i in tr]
    X_va = [sequences[i] for i in va]
    y_tr = [targets_catch[i] for i in tr]
    y_va = [targets_catch[i] for i in va]

    scaler = StandardScaler()
    scaler.fit(np.vstack([s for s in X_tr]))

    X_tr_sc = [scaler.transform(s) for s in X_tr]
    X_va_sc = [scaler.transform(s) for s in X_va]
        
    model, loss, train_loss, auc, acc, prec, rec, f1 = train_model(
        X_tr_sc, y_tr,
        X_va_sc, y_va,
        X_tr[0].shape[-1], config = config
    )
    
    print(f"\n✓ Fold {fold} validation outcomes - BCE Loss: {loss:.5f}, Train Loss: {train_loss:.5f}, AUC: {auc:.3f}, Acc: {acc:.3f}, Prec: {prec:.3f}, Rec: {rec:.3f}, F1: {f1:.3f}")
    
    # 🔥 CALIBRATION CHECK
    model.eval()
    X_va_tensor = torch.tensor(np.stack(X_va_sc).astype(np.float32)).to(config.DEVICE)
    with torch.no_grad():
        y_va_pred_proba = torch.sigmoid(model(X_va_tensor)).cpu().numpy()
    
    bins = np.linspace(0, 1, 11)
    bin_indices = np.digitize(y_va_pred_proba, bins) - 1
    
    predicted_probs, actual_rates = [], []
    for i in range(10):
        mask = (bin_indices == i)
        if mask.sum() > 0:
            predicted_probs.append(y_va_pred_proba[mask].mean())
            actual_rates.append(np.array(y_va)[mask].mean())
    
    from sklearn.metrics import r2_score
    if len(predicted_probs) > 1:
        r2 = r2_score(actual_rates, predicted_probs)
        calibration_r2s.append(r2)  # 🔥 STORE IT
        print(f"📈 Calibration R² = {r2:.3f}")
    
    models.append(model)
    scalers.append(scaler)
     
    # Feature importance code...
    print(f"\n📊 Computing feature importance for fold {fold}...")
    importance_df = compute_feature_importance(
        model, X_va_sc, y_va, feature_cols, config.DEVICE, n_repeats=3
    )
    importance_dfs.append(importance_df)
    
# 🔥 FINAL SUMMARY
print("\n" + "="*80)
print("FINAL MODEL SUMMARY")
print("="*80)
print(f"Mean Calibration R²: {np.mean(calibration_r2s):.3f} ± {np.std(calibration_r2s):.3f}")
print(f"🏆 NFL Benchmark: 0.98 | Your Gap: {0.98 - np.mean(calibration_r2s):.3f}")

all_importances = []
for fold in range(1, config.N_FOLDS + 1):
    df = importance_dfs[fold-1]
    all_importances.append(df)

avg_importance = pd.concat(all_importances).groupby('feature')['importance'].mean()
avg_importance = avg_importance.sort_values(ascending=False).reset_index()

print(avg_importance.head(30))
avg_importance.to_csv(config.OUTPUT_DIR / 'feature_importance_avg.csv', index=False)



[3/4] Training geometric models...

Fold 1/5
  Epoch 10: train=0.4342, val=0.4253 | AUC=0.873, Acc=0.805, Prec=0.828, Rec=0.861, F1=0.844
  Epoch 20: train=0.4029, val=0.4189 | AUC=0.879, Acc=0.822, Prec=0.842, Rec=0.871, F1=0.857
  Epoch 30: train=0.3745, val=0.4241 | AUC=0.877, Acc=0.817, Prec=0.833, Rec=0.877, F1=0.854
  Epoch 40: train=0.3639, val=0.4262 | AUC=0.877, Acc=0.815, Prec=0.829, Rec=0.879, F1=0.853
  Epoch 50: train=0.3609, val=0.4277 | AUC=0.876, Acc=0.811, Prec=0.826, Rec=0.876, F1=0.850
  Early stop at epoch 50

✓ Fold 1 validation outcomes - BCE Loss: 0.41893, Train Loss: 0.40287, AUC: 0.879, Acc: 0.822, Prec: 0.842, Rec: 0.871, F1: 0.857
📈 Calibration R² = 0.972

📊 Computing feature importance for fold 1...


Computing importances: 100%|██████████| 137/137 [00:21<00:00,  6.48it/s]



Fold 2/5
  Epoch 10: train=0.4281, val=0.4536 | AUC=0.857, Acc=0.790, Prec=0.818, Rec=0.843, F1=0.830
  Epoch 20: train=0.3934, val=0.4452 | AUC=0.865, Acc=0.798, Prec=0.822, Rec=0.852, F1=0.837
  Epoch 30: train=0.3730, val=0.4456 | AUC=0.867, Acc=0.802, Prec=0.820, Rec=0.863, F1=0.841
  Epoch 40: train=0.3644, val=0.4504 | AUC=0.867, Acc=0.798, Prec=0.817, Rec=0.861, F1=0.839
  Epoch 50: train=0.3602, val=0.4508 | AUC=0.866, Acc=0.799, Prec=0.818, Rec=0.862, F1=0.839
  Early stop at epoch 54

✓ Fold 2 validation outcomes - BCE Loss: 0.44324, Train Loss: 0.38214, AUC: 0.867, Acc: 0.800, Prec: 0.819, Rec: 0.862, F1: 0.840
📈 Calibration R² = 0.931

📊 Computing feature importance for fold 2...


Computing importances: 100%|██████████| 137/137 [00:21<00:00,  6.27it/s]



Fold 3/5
  Epoch 10: train=0.4211, val=0.4410 | AUC=0.866, Acc=0.787, Prec=0.798, Rec=0.854, F1=0.825
  Epoch 20: train=0.3859, val=0.4341 | AUC=0.871, Acc=0.797, Prec=0.810, Rec=0.856, F1=0.832
  Epoch 30: train=0.3601, val=0.4397 | AUC=0.871, Acc=0.799, Prec=0.815, Rec=0.852, F1=0.833
  Epoch 40: train=0.3487, val=0.4435 | AUC=0.871, Acc=0.808, Prec=0.822, Rec=0.860, F1=0.840
  Epoch 50: train=0.3496, val=0.4441 | AUC=0.870, Acc=0.808, Prec=0.820, Rec=0.863, F1=0.841
  Early stop at epoch 52

✓ Fold 3 validation outcomes - BCE Loss: 0.43227, Train Loss: 0.38242, AUC: 0.872, Acc: 0.796, Prec: 0.808, Rec: 0.859, F1: 0.832
📈 Calibration R² = 0.958

📊 Computing feature importance for fold 3...


Computing importances: 100%|██████████| 137/137 [00:20<00:00,  6.65it/s]



Fold 4/5
  Epoch 10: train=0.4249, val=0.4304 | AUC=0.868, Acc=0.788, Prec=0.789, Rec=0.879, F1=0.832
  Epoch 20: train=0.3910, val=0.4270 | AUC=0.873, Acc=0.795, Prec=0.802, Rec=0.870, F1=0.835
  Epoch 30: train=0.3679, val=0.4246 | AUC=0.875, Acc=0.806, Prec=0.807, Rec=0.886, F1=0.845
  Epoch 40: train=0.3523, val=0.4279 | AUC=0.875, Acc=0.807, Prec=0.804, Rec=0.894, F1=0.847
  Epoch 50: train=0.3455, val=0.4296 | AUC=0.875, Acc=0.808, Prec=0.805, Rec=0.893, F1=0.847
  Epoch 60: train=0.3464, val=0.4298 | AUC=0.875, Acc=0.809, Prec=0.806, Rec=0.895, F1=0.848
  Early stop at epoch 60

✓ Fold 4 validation outcomes - BCE Loss: 0.42463, Train Loss: 0.36794, AUC: 0.875, Acc: 0.806, Prec: 0.807, Rec: 0.886, F1: 0.845
📈 Calibration R² = 0.980

📊 Computing feature importance for fold 4...


Computing importances: 100%|██████████| 137/137 [00:23<00:00,  5.74it/s]



Fold 5/5
  Epoch 10: train=0.4283, val=0.4632 | AUC=0.853, Acc=0.780, Prec=0.775, Rec=0.868, F1=0.819
  Epoch 20: train=0.3952, val=0.4517 | AUC=0.861, Acc=0.786, Prec=0.785, Rec=0.864, F1=0.823
  Epoch 30: train=0.3702, val=0.4587 | AUC=0.860, Acc=0.786, Prec=0.781, Rec=0.871, F1=0.824
  Epoch 40: train=0.3641, val=0.4619 | AUC=0.860, Acc=0.788, Prec=0.779, Rec=0.879, F1=0.826
  Early stop at epoch 48

✓ Fold 5 validation outcomes - BCE Loss: 0.45148, Train Loss: 0.39952, AUC: 0.861, Acc: 0.786, Prec: 0.782, Rec: 0.869, F1: 0.823
📈 Calibration R² = 0.974

📊 Computing feature importance for fold 5...


Computing importances: 100%|██████████| 137/137 [00:20<00:00,  6.68it/s]


FINAL MODEL SUMMARY
Mean Calibration R²: 0.963 ± 0.018
🏆 NFL Benchmark: 0.98 | Your Gap: 0.017
                      feature  importance
0                dist_squared    0.023812
1          geo_velocity_error    0.022503
2                    pressure    0.019162
3                dist_to_ball    0.017296
4            distance_to_ball    0.016912
5               closing_speed    0.014184
6             x_rolling_std_5    0.010068
7                geo_distance    0.008182
8             velocity_y_lag4    0.007649
9                      s_lag5    0.007609
10           pressure_x_speed    0.007221
11         closing_speed_ball    0.007185
12           nearest_opp_dist    0.007082
13       velocity_toward_ball    0.006404
14           x_rolling_mean_3    0.005603
15               gnn_opp_dmin    0.005551
16                          s    0.005547
17            velocity_y_lag2    0.005166
18  velocity_x_rolling_mean_5    0.004205
19  velocity_y_rolling_mean_3    0.004099
20             kinetic




In [154]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss


for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), 1):
    print(f"\n{'='*60}")
    print(f"Fold {fold}/{config.N_FOLDS}")
    print(f"{'='*60}")

    X_tr = [sequences[i] for i in tr]
    X_va = [sequences[i] for i in va]
    y_tr = [targets_catch[i] for i in tr]
    y_va = [targets_catch[i] for i in va]

    # Flatten sequences: (num_samples, 10 timesteps, 153 features) -> (num_samples, 1530)
    X_tr_flat = np.vstack([s.flatten() for s in X_tr])
    X_va_flat = np.vstack([s.flatten() for s in X_va])
    
    scaler = StandardScaler()
    X_tr_sc = scaler.fit_transform(X_tr_flat)
    X_va_sc = scaler.transform(X_va_flat)
    
    # Grid search for best C (inverse regularization strength)
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],  # L2 penalty strength
        'max_iter': [1000]
    }
    
    base_model = LogisticRegression(penalty='l2', solver='lbfgs', random_state=config.SEED)
    
    grid_search = GridSearchCV(
        base_model, 
        param_grid, 
        cv=3,  # 3-fold CV within training data
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    print(f"  Running GridSearchCV...")
    grid_search.fit(X_tr_sc, y_tr)
    
    print(f"  Best params: {grid_search.best_params_}")
    print(f"  Best CV score: {grid_search.best_score_:.4f}")
    
    # Use best model
    model = grid_search.best_estimator_
    
    # Evaluate on validation set
    y_va_pred_proba = model.predict_proba(X_va_sc)[:, 1]
    y_va_pred = (y_va_pred_proba > 0.5).astype(int)
    
    # 🔥 ADD BCE COMPUTATION
    bce_loss = log_loss(y_va, y_va_pred_proba)
    
    val_auc = roc_auc_score(y_va, y_va_pred_proba)
    acc = accuracy_score(y_va, y_va_pred)
    precision = precision_score(y_va, y_va_pred, zero_division=0)
    recall = recall_score(y_va, y_va_pred, zero_division=0)
    f1 = f1_score(y_va, y_va_pred, zero_division=0)
    
    # 🔥 UPDATE PRINT TO INCLUDE BCE
    print(f"  Validation: BCE={bce_loss:.4f}, AUC={val_auc:.4f}, Acc={acc:.3f}, "
          f"Prec={precision:.3f}, Rec={recall:.3f}, F1={f1:.3f}")
    
    models.append(model)
    scalers.append(scaler)
    
    # Feature importance from coefficients
    coef = model.coef_[0]  # Shape: (1530,) for flattened features
    
    # Aggregate across timesteps: (10 timesteps, 153 features) -> (153,)
    coef_reshaped = coef.reshape(10, len(feature_cols))
    feature_importance = np.abs(coef_reshaped).mean(axis=0)  # Average across time
    
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)
    
    print("\n🏆 Top 20 Features:")
    print(importance_df.head(20).to_string(index=False))
    
    importance_df.to_csv(config.OUTPUT_DIR / f'importance_fold{fold}_logreg.csv', index=False)

# After all folds, aggregate
print("\n" + "="*80)
print("AGGREGATED FEATURE IMPORTANCE ACROSS FOLDS")
print("="*80)

all_importances = []
for fold in range(1, config.N_FOLDS + 1):
    df = pd.read_csv(config.OUTPUT_DIR / f'importance_fold{fold}_logreg.csv')
    all_importances.append(df)

avg_importance = pd.concat(all_importances).groupby('feature')['importance'].mean()
avg_importance = avg_importance.sort_values(ascending=False).reset_index()
print(avg_importance.head(30))



Fold 1/5
  Running GridSearchCV...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

  Best params: {'C': 0.01, 'max_iter': 1000}
  Best CV score: 0.8549
  Validation: BCE=0.4289, AUC=0.8752, Acc=0.807, Prec=0.815, Rec=0.885, F1=0.849

🏆 Top 20 Features:
                 feature  importance
      geo_velocity_error    0.161527
            dist_squared    0.091769
        distance_to_ball    0.078398
            dist_to_ball    0.078398
                pressure    0.054051
           geo_alignment    0.052004
        velocity_y_delta    0.050838
      closing_speed_ball    0.050226
    velocity_toward_ball    0.050226
velocity_y_rolling_std_3    0.047483
           angle_to_ball    0.044558
      velocity_alignment    0.038788
velocity_x_rolling_std_5    0.038099
velocity_x_rolling_std_3    0.036613
                     dir    0.031111
         s_rolling_std_3    0.030815
        velocity_x_delta    0.030257
           closing_speed    0.028640
        ball_direction_x    0.026713
               speed_ema    0.025558

Fold 2/5
  Running GridSearchCV...
Fitting 3 folds f

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

  Best params: {'C': 0.01, 'max_iter': 1000}
  Best CV score: 0.8572
  Validation: BCE=0.4539, AUC=0.8566, Acc=0.794, Prec=0.811, Rec=0.862, F1=0.836

🏆 Top 20 Features:
                 feature  importance
      geo_velocity_error    0.152999
            dist_squared    0.091729
            dist_to_ball    0.080284
        distance_to_ball    0.080284
           geo_alignment    0.066035
        velocity_y_delta    0.058949
                pressure    0.053304
    velocity_toward_ball    0.051462
      closing_speed_ball    0.051445
velocity_y_rolling_std_3    0.047797
                     dir    0.047758
velocity_x_rolling_std_3    0.046267
velocity_x_rolling_std_5    0.042611
        ball_direction_x    0.033158
        velocity_x_delta    0.032192
         s_rolling_std_5    0.030883
           closing_speed    0.028223
         s_rolling_std_3    0.026015
               speed_ema    0.023431
        ball_direction_y    0.023008

Fold 3/5
  Running GridSearchCV...
Fitting 3 folds f

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

  Best params: {'C': 0.01, 'max_iter': 1000}
  Best CV score: 0.8567
  Validation: BCE=0.4555, AUC=0.8601, Acc=0.791, Prec=0.798, Rec=0.864, F1=0.830

🏆 Top 20 Features:
                 feature  importance
      geo_velocity_error    0.129769
            dist_squared    0.072823
            dist_to_ball    0.064109
        distance_to_ball    0.064109
           geo_alignment    0.062733
                pressure    0.054229
    velocity_toward_ball    0.054039
      closing_speed_ball    0.054023
        velocity_y_delta    0.049656
velocity_x_rolling_std_5    0.045430
velocity_y_rolling_std_3    0.043827
velocity_x_rolling_std_3    0.042451
                     dir    0.042138
        ball_direction_x    0.036255
           angle_to_ball    0.031069
        velocity_x_delta    0.030009
         s_rolling_std_5    0.027597
           closing_speed    0.026854
          kinetic_energy    0.023024
         y_rolling_std_5    0.022890

Fold 4/5
  Running GridSearchCV...
Fitting 3 folds f

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

  Best params: {'C': 0.01, 'max_iter': 1000}
  Best CV score: 0.8571
  Validation: BCE=0.4328, AUC=0.8682, Acc=0.799, Prec=0.800, Rec=0.884, F1=0.840

🏆 Top 20 Features:
                 feature  importance
      geo_velocity_error    0.154190
            dist_squared    0.089915
            dist_to_ball    0.081308
        distance_to_ball    0.081308
           geo_alignment    0.064723
    velocity_toward_ball    0.054884
      closing_speed_ball    0.054867
velocity_y_rolling_std_3    0.054152
                pressure    0.053361
                     dir    0.047603
        velocity_y_delta    0.047120
velocity_x_rolling_std_3    0.042131
velocity_x_rolling_std_5    0.037840
         s_rolling_std_5    0.035507
        ball_direction_x    0.033255
        velocity_x_delta    0.029533
           closing_speed    0.027499
           angle_to_ball    0.026777
               speed_ema    0.026022
velocity_y_rolling_std_5    0.025001

Fold 5/5
  Running GridSearchCV...
Fitting 3 folds f

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

  Best params: {'C': 0.01, 'max_iter': 1000}
  Best CV score: 0.8641
  Validation: BCE=0.4714, AUC=0.8473, Acc=0.786, Prec=0.777, Rec=0.880, F1=0.825

🏆 Top 20 Features:
                 feature  importance
      geo_velocity_error    0.139883
            dist_squared    0.088948
            dist_to_ball    0.085317
        distance_to_ball    0.085317
           geo_alignment    0.067200
                     dir    0.053841
    velocity_toward_ball    0.053504
      closing_speed_ball    0.053486
                pressure    0.051957
velocity_y_rolling_std_3    0.048394
        velocity_y_delta    0.047085
velocity_x_rolling_std_5    0.044914
        ball_direction_x    0.039706
           angle_to_ball    0.039562
velocity_x_rolling_std_3    0.032820
        velocity_x_delta    0.031347
         s_rolling_std_3    0.030433
         s_rolling_std_5    0.029248
           closing_speed    0.028073
        pressure_x_speed    0.022307

AGGREGATED FEATURE IMPORTANCE ACROSS FOLDS
         

In [None]:
HORIZON, config
    )
    
    models.append(model)
    scalers.append(scaler)
    
    print(f"\n✓ Fold {fold} - Loss: {loss:.5f}")

In [None]:

for fold, (tr, va) in enumerate(gkf.split(sequences, groups=groups), 1):
    print(f"\n{'='*60}")
    print(f"Fold {fold}/{config.N_FOLDS}")
    print(f"{'='*60}")a
    
    X_tr = [sequences[i] for i in tr]
    X_va = [sequences[i] for i in va]
    y_tr_dx = [targets_dx[i] for i in tr]
    y_va_dx = [targets_dx[i] for i in va]
    y_tr_dy = [targets_dy[i] for i in tr]
    y_va_dy = [targets_dy[i] for i in va]
    
    scaler = StandardScaler()
    scaler.fit(np.vstack([s for s in X_tr]))
    
    X_tr_sc = [scaler.transform(s) for s in X_tr]
    X_va_sc = [scaler.transform(s) for s in X_va]
    
    model, loss = train_model(
        X_tr_sc, y_tr_dx, y_tr_dy,
        X_va_sc, y_va_dx, y_va_dy,
        X_tr[0].shape[-1], config.MAX_FUTURE_HORIZON, config
    )
    
    models.append(model)
    scalers.append(scaler)
    
    print(f"\n✓ Fold {fold} - Loss: {loss:.5f}")

In [66]:
a = prepare_sequences_geometric(train_input, train_output, is_training=True, window_size=10)


PREPARING GEOMETRIC SEQUENCES
Step 1: Base features...
Step 2: Advanced features...


                                                                     ]

In [60]:
a.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,player_height_feet,height_inches,bmi,velocity_x,velocity_y,acceleration_x,acceleration_y,speed_squared,accel_magnitude,momentum_x,momentum_y,kinetic_energy,orientation_diff,is_offense,is_defense,is_receiver,is_coverage,is_passer,role_targeted_receiver,role_defensive_coverage,role_passer,side_offense,distance_to_ball,dist_to_ball,dist_squared,angle_to_ball,ball_direction_x,ball_direction_y,closing_speed_ball,velocity_toward_ball,velocity_alignment,angle_diff
182,2023090700,101,False,43290,1,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,37.36,30.07,0.0,0.0,65.42,95.98,21,63.259998,-0.22,6.333333,76.0,27.141447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.56,1,0,0,0,1,0,0,1,1,39.853407,39.853407,1588.294013,-0.863368,0.649882,-0.760035,0.0,0.0,-0.420835,145.447319
183,2023090700,101,False,43290,2,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,37.36,30.07,0.0,0.0,63.91,95.98,21,63.259998,-0.22,6.333333,76.0,27.141447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.07,1,0,0,0,1,0,0,1,1,39.853407,39.853407,1588.294013,-0.863368,0.649882,-0.760035,0.0,0.0,-0.396785,145.447319
184,2023090700,101,False,43290,3,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,37.35,30.07,0.0,0.0,53.83,95.98,21,63.259998,-0.22,6.333333,76.0,27.141447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.15,1,0,0,0,1,0,0,1,1,39.859906,39.859906,1588.812113,-0.863177,0.650027,-0.759911,0.0,0.0,-0.229819,145.436394
185,2023090700,101,False,43290,4,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,37.34,30.07,0.0,0.0,310.79,95.98,21,63.259998,-0.22,6.333333,76.0,27.141447,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,145.19,1,0,0,0,1,0,0,1,1,39.866407,39.866407,1589.330413,-0.862986,0.650171,-0.759788,-0.0,-0.0,0.999992,145.425473
186,2023090700,101,False,43290,5,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,37.33,30.07,0.06,1.37,271.81,96.8,21,63.259998,-0.22,6.333333,76.0,27.141447,-0.05997,0.001895,0.043272,-1.369316,0.0036,1.37,-13.373324,0.42261,0.4014,175.01,1,0,0,0,1,0,0,1,1,39.87291,39.87291,1589.848913,-0.862796,0.650316,-0.759664,-0.040439,-0.040439,0.779825,146.234555


In [None]:
# ==========================================
# Get Opponent Features
# ==========================================
# This includes mirror / wr tracking
# Seems like wecan include because it only has speed and positions?

# TODO: Implement opponent features
# Hopefully we just need to update some variable names after getting kinematics in


In [None]:
# ==========================================
# Extract route patterns
# ==========================================



In [None]:
# ==========================================
# Compute neighbor embeddings
# ==========================================
# This is the GNN

In [None]:
# ==========================================
# Create temporal features (rolling mean)
# ==========================================

In [None]:
# ==========================================
# Time features
# ==========================================

In [None]:
# ==========================================
# Geometric features
# ==========================================