In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from src.prepData import load_train_data, normalize_input_fields, normalize_output_fields

## Data Prep

#### Load Data

In [2]:
print("Loading data...")
input_df, output_df = load_train_data()
# input_df.to_pickle("data/personal/input_df.pkl")
# output_df.to_pickle("data/personal/output_df.pkl")

print(f"Loaded {len(input_df)} input rows, {len(output_df)} output rows")
print(
  f"Unique plays: {input_df[['game_id', 'play_id']].drop_duplicates().shape[0]}"
)

Loading data...
Loaded 4880579 input rows, 562936 output rows
Unique plays: 14108


#### Normalize fields

In [3]:
input_df = normalize_input_fields(input_df)
norm_helper = input_df[['game_id','play_id','play_direction','absolute_yardline_number']].drop_duplicates()
output_df = normalize_output_fields(output_df, norm_helper)

#### Create play-level features

In [4]:
distinct_plays = input_df[['game_id', 'play_id']].drop_duplicates()
distinct_plays.sort_values(['game_id','play_id']).head(3)

Unnamed: 0,game_id,play_id
0,2023090700,101
234,2023090700,194
650,2023090700,219


In [5]:
# Get max frame_id from input_df for each play (throw_frame_id baseline)
input_max_frames = (
    input_df
    .groupby(['game_id', 'play_id'])[['frame_id', "ball_land_x_std","ball_land_y_std"]]
    .max()
    .reset_index()
    .rename(columns={'frame_id': 'throw_frame_id'})
)

# Get max frame_id from output_df for each play (throw_land_frame_id baseline)
output_max_frames = (
    output_df
    .groupby(['game_id', 'play_id'])[['frame_id']]
    .max()
    .reset_index()
    .rename(columns={'frame_id': 'throw_land_frame_id'})
)

# Combine both into baseline frame info
baseline_frame_info = input_max_frames.merge(
    output_max_frames,
    on=['game_id', 'play_id'],
    how='outer'
)

print(f"Baseline frame info shape: {baseline_frame_info.shape}")
print(f"Unique plays: {baseline_frame_info.shape[0]}")
baseline_frame_info.head(2)

Baseline frame info shape: (14108, 6)
Unique plays: 14108


Unnamed: 0,game_id,play_id,throw_frame_id,ball_land_x_std,ball_land_y_std,throw_land_frame_id
0,2023090700,101,26,21.259998,-0.22,21
1,2023090700,194,32,4.059998,31.55,9


In [6]:
# Create all play-level features
qb_frame = input_df[input_df['player_role'] == 'Passer']
if qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0] < len(distinct_plays):
    print(f"Warning: fewer plays with QB ({qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0]}) than original plays ({len(distinct_plays)})")

# Get QB max frame for plays with a passer
qb_max_frame = (
    qb_frame
    .groupby(['game_id', 'play_id', 'nfl_id', 'player_role'])['frame_id']
    .max()
    .reset_index()
)

# Find plays without a passer
plays_with_qb = qb_max_frame[['game_id', 'play_id']].drop_duplicates()
plays_without_qb = (
    distinct_plays
    .merge(plays_with_qb, on=['game_id', 'play_id'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)

# For plays without a passer, use the overall max frame_id
if len(plays_without_qb) > 0:
    print(f"Found {len(plays_without_qb)} plays without a Passer. Using overall max frame_id.")
    
    missing_max_frames = (
        input_df
        .merge(plays_without_qb, on=['game_id', 'play_id'])
        .groupby(['game_id', 'play_id'])['frame_id']
        .max()
        .reset_index()
    )
    
    # Add placeholder columns for nfl_id and player_role
    missing_max_frames['nfl_id'] = None
    missing_max_frames['player_role'] = None
    
    # Combine with QB frames
    qb_max_frame = pd.concat([qb_max_frame, missing_max_frames], ignore_index=True)

# Join back to input_df to get the full row data
qb_rows = pd.merge(
    input_df, 
    qb_max_frame, 
    on=['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_role'], 
    how='inner'
)

# Start with qb_rows
qb_sub = qb_rows.copy()

# Calculate derived features
qb_sub['qb_throw_distance'] = np.sqrt((qb_sub['ball_land_x_std'] - qb_sub['x_std'])**2 + (qb_sub['ball_land_y_std'] - qb_sub['y_std'])**2)
qb_sub['qb_ball_dir'] = (90 - np.degrees(np.arctan2(
    qb_sub['ball_land_y_std'] - qb_sub['y_std'],
    qb_sub['ball_land_x_std'] - qb_sub['x_std']
))) % 360
qb_sub['qb_direction_diff'] = (qb_sub['o_std'] - qb_sub['qb_ball_dir'] + 180) % 360 - 180  # difference between -180 and 180

# Rename frame_id to be QB-specific
qb_sub.rename(columns={'frame_id':'throw_frame_id'}, inplace=True)

# Drop player_to_predict column (not needed for QB)
qb_sub = qb_sub.drop(columns=['player_to_predict'])

# Rename QB kinematic fields to have qb_ prefix
qb_kinematic_fields_rename = {
    "x_std": "qb_x_std",
    "y_std": "qb_y_std",
    "o_std": "qb_o_std",
    "dir_std": "qb_dir_std",
    "s": "qb_s",
    "a": "qb_a"
}
qb_sub = qb_sub.rename(columns=qb_kinematic_fields_rename)

qb_sub = qb_sub.drop(columns=["ball_land_x_std","ball_land_y_std"])

qb_sub.head(3)

Found 3 plays without a Passer. Using overall max frame_id.


Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,qb_s,qb_a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,absolute_yardline_number_std,qb_x_std,qb_y_std,qb_o_std,qb_dir_std,dx_ball,dy_ball,dist_ball,angle_to_ball,angle_to_ball_minus_dir,angle_to_ball_minus_o,s_x_std,s_y_std,s_parallel,s_perp,dir_std_sin,dir_std_cos,o_std_sin,o_std_cos,angle_to_ball_sin,angle_to_ball_cos,angle_to_ball_minus_dir_sin,angle_to_ball_minus_dir_cos,angle_to_ball_minus_o_sin,angle_to_ball_minus_o_cos,height_in,birth_year,qb_throw_distance,qb_ball_dir,qb_direction_diff
0,2023090700,101,43290,26,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,35.41,29.99,0.64,0.47,108.83,212.25,21,63.259998,-0.22,1,42,-6.59,29.99,212.25,108.83,27.849998,-30.21,41.088521,137.327657,28.497657,-74.922343,0.605747,-0.206567,0.562455,0.305359,0.94648,-0.322761,-0.533615,-0.845728,0.677805,-0.735242,0.477123,0.878837,-0.965574,0.260128,76.0,1994,41.08852,137.327657,74.922343
1,2023090700,194,44822,32,left,89,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,97.62,29.67,0.96,1.64,185.14,285.7,9,84.940002,21.75,1,31,-8.62,23.63,105.7,5.14,12.679998,7.92,14.95021,58.010861,52.870861,-47.689139,0.086006,0.95614,0.579469,0.765386,0.08959,0.995979,0.962692,-0.2706,0.848149,0.529758,0.797277,0.603614,-0.739504,0.673153,75.0,1995,14.950209,58.010861,47.689139
2,2023090700,219,44822,17,left,79,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,85.87,22.97,1.49,2.76,133.64,245.38,8,75.849998,11.49,1,41,-6.87,30.33,65.38,313.64,10.020002,11.48,15.23781,41.115185,87.475185,-24.264815,-1.078298,1.028286,0.065638,1.488553,-0.72369,0.690125,0.909091,0.416598,0.657575,0.753389,0.999029,0.044052,-0.410955,0.911656,75.0,1995,15.237809,41.115185,24.264815


In [7]:
# Just prove only one player per output
input_unique_players = input_df[['game_id', 'play_id', 'nfl_id', 'player_role', 'player_side']].drop_duplicates()
output_unique_players = output_df[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

a = output_unique_players.merge(input_unique_players[['game_id','play_id','nfl_id','player_role','player_side']], on=['game_id', 'play_id', 'nfl_id'], how='inner', indicator=True)
b = a.loc[a['player_side'] == 'Offense', ['game_id','play_id','nfl_id']].groupby(['game_id','play_id']).nunique().reset_index()
b['nfl_id'].value_counts()

nfl_id
1    14108
Name: count, dtype: int64

In [8]:
# Create all play-level features
qb_frame = input_df[input_df['player_role'] == 'Passer']
if qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0] < len(distinct_plays):
    print(f"Warning: fewer plays with QB ({qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0]}) than original plays ({len(distinct_plays)})")

# Get QB max frame for plays with a passer
qb_max_frame = (
    qb_frame
    .groupby(['game_id', 'play_id', 'nfl_id', 'player_role'])['frame_id']
    .max()
    .reset_index()
)

# Find plays without a passer
plays_with_qb = qb_max_frame[['game_id', 'play_id']].drop_duplicates()
plays_without_qb = (
    distinct_plays
    .merge(plays_with_qb, on=['game_id', 'play_id'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)

# For plays without a passer, use the overall max frame_id
if len(plays_without_qb) > 0:
    print(f"Found {len(plays_without_qb)} plays without a Passer. Using overall max frame_id.")
    
    missing_max_frames = (
        input_df
        .merge(plays_without_qb, on=['game_id', 'play_id'])
        .groupby(['game_id', 'play_id'])['frame_id']
        .max()
        .reset_index()
    )
    
    # Add placeholder columns for nfl_id and player_role
    missing_max_frames['nfl_id'] = None
    missing_max_frames['player_role'] = None
    
    # Combine with QB frames
    qb_max_frame = pd.concat([qb_max_frame, missing_max_frames], ignore_index=True)

# Join back to input_df to get the full row data
qb_rows = pd.merge(
    input_df, 
    qb_max_frame, 
    on=['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_role'], 
    how='inner'
)

# Start with qb_rows
qb_sub = qb_rows.copy()

# Calculate derived features
qb_sub['qb_throw_distance'] = np.sqrt((qb_sub['ball_land_x_std'] - qb_sub['x_std'])**2 + (qb_sub['ball_land_y_std'] - qb_sub['y_std'])**2)
qb_sub['qb_ball_dir'] = (90 - np.degrees(np.arctan2(
    qb_sub['ball_land_y_std'] - qb_sub['y_std'],
    qb_sub['ball_land_x_std'] - qb_sub['x_std']
))) % 360
qb_sub['qb_direction_diff'] = (qb_sub['o_std'] - qb_sub['qb_ball_dir'] + 180) % 360 - 180  # difference between -180 and 180

# Rename frame_id to be QB-specific
qb_sub.rename(columns={'frame_id':'throw_frame_id'}, inplace=True)

# Drop player_to_predict column (not needed for QB)
qb_sub = qb_sub.drop(columns=['player_to_predict'])

# Rename QB kinematic fields to have qb_ prefix
qb_kinematic_fields_rename = {
    "x_std": "qb_x_std",
    "y_std": "qb_y_std",
    "o_std": "qb_o_std",
    "dir_std": "qb_dir_std",
    "s": "qb_s",
    "a": "qb_a"
}
qb_sub = qb_sub.rename(columns=qb_kinematic_fields_rename)

qb_sub = qb_sub.drop(columns=["ball_land_x_std","ball_land_y_std"])

qb_sub.head(3)

Found 3 plays without a Passer. Using overall max frame_id.


Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,qb_s,qb_a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,absolute_yardline_number_std,qb_x_std,qb_y_std,qb_o_std,qb_dir_std,dx_ball,dy_ball,dist_ball,angle_to_ball,angle_to_ball_minus_dir,angle_to_ball_minus_o,s_x_std,s_y_std,s_parallel,s_perp,dir_std_sin,dir_std_cos,o_std_sin,o_std_cos,angle_to_ball_sin,angle_to_ball_cos,angle_to_ball_minus_dir_sin,angle_to_ball_minus_dir_cos,angle_to_ball_minus_o_sin,angle_to_ball_minus_o_cos,height_in,birth_year,qb_throw_distance,qb_ball_dir,qb_direction_diff
0,2023090700,101,43290,26,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,35.41,29.99,0.64,0.47,108.83,212.25,21,63.259998,-0.22,1,42,-6.59,29.99,212.25,108.83,27.849998,-30.21,41.088521,137.327657,28.497657,-74.922343,0.605747,-0.206567,0.562455,0.305359,0.94648,-0.322761,-0.533615,-0.845728,0.677805,-0.735242,0.477123,0.878837,-0.965574,0.260128,76.0,1994,41.08852,137.327657,74.922343
1,2023090700,194,44822,32,left,89,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,97.62,29.67,0.96,1.64,185.14,285.7,9,84.940002,21.75,1,31,-8.62,23.63,105.7,5.14,12.679998,7.92,14.95021,58.010861,52.870861,-47.689139,0.086006,0.95614,0.579469,0.765386,0.08959,0.995979,0.962692,-0.2706,0.848149,0.529758,0.797277,0.603614,-0.739504,0.673153,75.0,1995,14.950209,58.010861,47.689139
2,2023090700,219,44822,17,left,79,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,85.87,22.97,1.49,2.76,133.64,245.38,8,75.849998,11.49,1,41,-6.87,30.33,65.38,313.64,10.020002,11.48,15.23781,41.115185,87.475185,-24.264815,-1.078298,1.028286,0.065638,1.488553,-0.72369,0.690125,0.909091,0.416598,0.657575,0.753389,0.999029,0.044052,-0.410955,0.911656,75.0,1995,15.237809,41.115185,24.264815


In [9]:
qb_features = ["qb_x_std", 
               "qb_y_std", 
               "qb_s", 
               "qb_a", 
               "qb_dir_std", 
               "qb_o_std", 
               "qb_throw_distance", 
               "qb_ball_dir"]

play_level_features = baseline_frame_info.merge(
  qb_sub[['game_id','play_id'] + qb_features], 
  how = 'left', 
  on = ['game_id','play_id'])

def impute_qb_features_safe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing QB features using ball trajectory (always available)
    This is 'safe' because ball_land_x/y are inputs, not targets
    """
    mask = df['qb_x_std'].isnull()
    
    if mask.sum() > 0:
        # Proxy: assume QB was ~10 yards behind ball landing
        df.loc[mask, 'qb_x_std'] = df.loc[mask, 'ball_land_x_std'] - 10
        df.loc[mask, 'qb_y_std'] = 26.7  # assume center of field
        
        # Proxy: assume QB was stationary (conservative)
        df.loc[mask, 'qb_s'] = 0.0
        df.loc[mask, 'qb_a'] = 0.0
      
        # Throw distance from imputed position
        df.loc[mask, 'qb_throw_distance'] = np.sqrt(
            (df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std'])**2 +
            (df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'])**2
        )

        # Proxy: QB facing ball direction
        df.loc[mask, 'qb_o_std'] = (90 - np.degrees(np.arctan2(
            df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'],
            df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std']
        ))) % 360
        df.loc[mask, 'qb_dir_std'] = df.loc[mask, 'qb_o_std']

        df.loc[mask, 'qb_ball_dir'] = (90 - np.degrees(np.arctan2(
            df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'],
            df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std']
        ))) % 360
    
    return df

# Apply BEFORE split
play_level_features = impute_qb_features_safe(play_level_features)


# Encode angles as sin/cos
for col in ["qb_o_std", "qb_dir_std",
             "qb_ball_dir",
             ]:
    rad = np.deg2rad(play_level_features[col])
    play_level_features[col + "_sin"] = np.sin(rad)
    play_level_features[col + "_cos"] = np.cos(rad)


In [10]:
play_level_features.head(3)

Unnamed: 0,game_id,play_id,throw_frame_id,ball_land_x_std,ball_land_y_std,throw_land_frame_id,qb_x_std,qb_y_std,qb_s,qb_a,qb_dir_std,qb_o_std,qb_throw_distance,qb_ball_dir,qb_o_std_sin,qb_o_std_cos,qb_dir_std_sin,qb_dir_std_cos,qb_ball_dir_sin,qb_ball_dir_cos
0,2023090700,101,26,21.259998,-0.22,21,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242
1,2023090700,194,32,4.059998,31.55,9,-8.62,23.63,0.96,1.64,5.14,105.7,14.950209,58.010861,0.962692,-0.2706,0.08959,0.995979,0.848149,0.529758
2,2023090700,219,17,3.150002,41.81,8,-6.87,30.33,1.49,2.76,313.64,65.38,15.237809,41.115185,0.909091,0.416598,-0.72369,0.690125,0.657575,0.753389


In [11]:
x_data = baseline_frame_info.merge(
    input_df[input_df['player_to_predict'] == True],
    left_on = ['game_id','play_id'],
    right_on = ['game_id','play_id'],
    how = 'inner'
)

baseline_play_features =[
    'throw_frame_id',
    'throw_land_frame_id'
]

player_level_features = [
    'frame_id',
    'height_in',
    'player_weight',
    'birth_year',
    'player_position',
    'player_side',
    'player_role',
    'x_std',
    'y_std',
    'o_std',
    'o_std_sin',
    'o_std_cos',
    'dir_std',
    'dir_std_sin',
    'dir_std_cos',
    's',
    'a',
    "dx_ball",
    "dy_ball",
    "dist_ball",
    "angle_to_ball",
    "angle_to_ball_sin",
    "angle_to_ball_cos",
    "angle_to_ball_minus_dir",
    "angle_to_ball_minus_dir_sin",
    "angle_to_ball_minus_dir_cos",
    "angle_to_ball_minus_o",
    "angle_to_ball_minus_o_sin",
    "angle_to_ball_minus_o_cos",
    "s_x_std",
    "s_y_std",
    "s_parallel",
    "s_perp",]

x_data = x_data[['game_id','play_id','nfl_id'] + baseline_play_features + player_level_features].copy()

In [12]:
x_data_last = x_data[x_data['frame_id'] == x_data['throw_frame_id']].copy()
play_level_features_cols = [i for i in list(play_level_features) if i not in ['throw_frame_id','throw_land_frame_id']]
x_data_last = x_data_last.merge(play_level_features[play_level_features_cols], on = ['game_id','play_id'])
x_data_last.sort_values(['game_id','play_id','nfl_id'], inplace=True)
x_data_last.head(3)

Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,throw_land_frame_id,frame_id,height_in,player_weight,birth_year,player_position,player_side,player_role,x_std,y_std,o_std,o_std_sin,o_std_cos,dir_std,dir_std_sin,dir_std_cos,s,a,dx_ball,dy_ball,dist_ball,angle_to_ball,angle_to_ball_sin,angle_to_ball_cos,angle_to_ball_minus_dir,angle_to_ball_minus_dir_sin,angle_to_ball_minus_dir_cos,angle_to_ball_minus_o,angle_to_ball_minus_o_sin,angle_to_ball_minus_o_cos,s_x_std,s_y_std,s_parallel,s_perp,ball_land_x_std,ball_land_y_std,qb_x_std,qb_y_std,qb_s,qb_a,qb_dir_std,qb_o_std,qb_throw_distance,qb_ball_dir,qb_o_std_sin,qb_o_std_cos,qb_dir_std_sin,qb_dir_std_cos,qb_ball_dir_sin,qb_ball_dir_cos
2,2023090700,101,44930,26,21,26,75.0,196,1995,WR,Offense,Targeted Receiver,10.43,14.14,106.8,0.957319,-0.289032,99.25,0.986996,-0.160743,7.9,2.68,10.829998,-14.36,17.986064,142.977199,0.602133,-0.798396,43.727199,0.691226,0.722639,36.177199,0.590284,0.807195,7.797271,-1.269866,5.708848,5.460681,21.259998,-0.22,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242
0,2023090700,101,46137,26,21,26,73.0,204,1997,SS,Defense,Defensive Coverage,13.82,17.67,184.99,-0.086982,-0.99621,134.17,0.717276,-0.69679,5.34,1.8,7.439998,-17.89,19.375389,157.41881,0.383992,-0.923336,23.24881,0.394725,0.918799,-27.57119,-0.46285,0.886436,3.830251,-3.720857,4.906389,2.10783,21.259998,-0.22,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242
1,2023090700,101,52546,26,21,26,73.0,193,1997,CB,Defense,Defensive Coverage,6.01,12.44,309.47,-0.771958,0.635674,192.18,-0.210984,-0.97749,2.93,4.75,15.249998,-12.66,19.820144,129.698237,0.769419,-0.638744,-62.481763,-0.886864,0.462031,-179.771763,-0.003983,-0.999992,-0.618182,-2.864045,1.353751,-2.598511,21.259998,-0.22,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242


In [13]:
y_data = output_df.merge(
    baseline_frame_info[['game_id','play_id']], 
    on=['game_id','play_id']
)

y_data.sort_values(['game_id','play_id','nfl_id', 'frame_id'], inplace=True)

In [14]:
def hybrid_trajectory_interpolation(x_data, y_data, frame_rate=10, blend_factor=0.5):
    """
    Hybrid: blend velocity projection (early) with ball-directed (late)
    blend_factor: 0 = pure velocity, 1 = pure ball-directed
    """
    results = []
    
    for idx, row in x_data.iterrows():
        if idx % 10000 == 0:
            print(f"Processing row {idx}/{len(x_data)}")
        gid = row['game_id']
        pid = row['play_id']
        nid = row['nfl_id']
        
        x_throw = row['x_std']
        y_throw = row['y_std']
        vx = row['s_x_std']
        vy = row['s_y_std']
        x_land = row['ball_land_x_std']
        y_land = row['ball_land_y_std']
        throw_frame = row['throw_frame_id']
        
        traj_frames = y_data[
            (y_data['game_id'] == gid) &
            (y_data['play_id'] == pid) &
            (y_data['nfl_id'] == nid)
        ].sort_values('frame_id')
        
        if traj_frames.empty:
            continue
        
        frame_ids = traj_frames['frame_id'].values
        n_frames = len(frame_ids)
        
        for i, fid in enumerate(frame_ids):
            dt = (fid) / frame_rate
            t_norm = i / max(n_frames - 1, 1)  # 0 to 1
            
            # Velocity projection
            x_vel = x_throw + vx * dt
            y_vel = y_throw + vy * dt
            
            # Ball-directed interpolation
            x_ball = x_throw + t_norm * (x_land - x_throw)
            y_ball = y_throw + t_norm * (y_land - y_throw)
            
            # Blend: early frames favor velocity, late frames favor ball
            alpha = t_norm * blend_factor
            x_hybrid = (1 - alpha) * x_vel + alpha * x_ball
            y_hybrid = (1 - alpha) * y_vel + alpha * y_ball
            
            results.append({
                'game_id': gid,
                'play_id': pid,
                'nfl_id': nid,
                'frame_id': fid,
                'x_std_hybrid': x_hybrid,
                'y_std_hybrid': y_hybrid,
            })
    
    return pd.DataFrame(results)

# Generate hybrid trajectories
hybrid_traj = hybrid_trajectory_interpolation(x_data_last, y_data, blend_factor=0.7)
y_with_hybrid = y_data.merge(hybrid_traj, on=['game_id', 'play_id', 'nfl_id', 'frame_id'])

y_with_hybrid.shape

Processing row 0/46045
Processing row 10000/46045
Processing row 20000/46045
Processing row 30000/46045
Processing row 40000/46045


(562936, 14)

In [15]:
import numpy as np

def calculate_kaggle_rmse(df):
    """
    Calculate RMSE per Kaggle's formula
    df should have: x_std, y_std (actual), x_std_hybrid, y_std_hybrid (predicted)
    """
    # Calculate squared errors per frame
    squared_errors = (
        (df['x_std'] - df['x_std_hybrid'])**2 + 
        (df['y_std'] - df['y_std_hybrid'])**2
    )
    
    # RMSE = sqrt(mean of squared distances)
    rmse = np.sqrt(squared_errors.mean())
    
    return rmse

# Calculate overall RMSE
overall_rmse = calculate_kaggle_rmse(y_with_hybrid)
print(f"\n{'='*50}")
print(f"üèà Hybrid Baseline RMSE: {overall_rmse:.4f} yards")
print(f"{'='*50}\n")

# # Calculate per-frame RMSE (to see if error grows over time)
# frame_rmse = y_with_hybrid.groupby('frame_id').apply(
#     lambda g: np.sqrt(((g['x_std'] - g['x_std_hybrid'])**2 + 
#                        (g['y_std'] - g['y_std_hybrid'])**2).mean())
# ).reset_index(name='rmse')

# print("RMSE by frame:")
# print(frame_rmse.head(15))

# # Calculate per-play RMSE (to identify hardest plays)
# play_rmse = y_with_hybrid.groupby(['game_id', 'play_id']).apply(
#     lambda g: np.sqrt(((g['x_std'] - g['x_std_hybrid'])**2 + 
#                        (g['y_std'] - g['y_std_hybrid'])**2).mean())
# ).reset_index(name='rmse')


# print(f"\nPlay-level RMSE statistics:")
# print(play_rmse['rmse'].describe())
# print(f"\nWorst 5 plays:")
# print(play_rmse.nlargest(5, 'rmse'))


# print(f"\nPlay-level RMSE statistics:")
# print(play_rmse['rmse'].describe())
# print(f"\nBest 5 plays:")
# print(play_rmse.nsmallest(5, 'rmse'))


üèà Hybrid Baseline RMSE: 2.8302 yards



In [16]:
y_with_hybrid['target_dx'] = y_with_hybrid['x_std'] - y_with_hybrid['x_std_hybrid']
y_with_hybrid['target_dy'] = y_with_hybrid['y_std'] - y_with_hybrid['y_std_hybrid']

y_with_hybrid.head(10)

y_data = y_with_hybrid[['game_id','play_id','nfl_id','frame_id','target_dx','target_dy', 'x_std','y_std', 'x_std_hybrid', 'y_std_hybrid']].copy()

In [17]:
interaction_features = ['x_std',
                        'y_std',
                        's_x_std',
                        's_y_std',
                        'height_in',
                        'dist_ball',
                        's_parallel',
                        's_perp']

inv_numeric_features = [
    # Predicted player features
    "height_in", "player_weight", "birth_year",
    # Predicted player kinematics
    "x_std", "y_std",
    "s_x_std", "s_y_std",
    "a",  # if present
    "dir_std_sin", "dir_std_cos",
    "o_std_sin", "o_std_cos",
    
    # QB kinematics
    "qb_x_std", "qb_y_std", "qb_s", "qb_a",
    "qb_o_std_sin", "qb_o_std_cos",
    "qb_dir_std_sin", "qb_dir_std_cos",
    
    # Throw features - global
    "throw_frame_id", "throw_land_frame_id",
    "ball_land_x_std", "ball_land_y_std",
    # Time of throw - needs QB kinematics
    "qb_throw_distance", 
    "qb_ball_dir_sin", "qb_ball_dir_cos",

    # Ball-related features
    "dx_ball", "dy_ball", "dist_ball",
    "angle_to_ball_sin", "angle_to_ball_cos",
    "angle_to_ball_minus_dir_sin", "angle_to_ball_minus_dir_cos",
    "angle_to_ball_minus_o_sin", "angle_to_ball_minus_o_cos",
    "s_parallel", "s_perp",
]

motion_features= ["x_std","y_std","s_x_std","s_y_std","dir_std_sin","dir_std_cos","o_std_sin","o_std_cos",
                  "dx_ball","dy_ball","dist_ball","angle_to_ball_sin","angle_to_ball_cos",
                  "angle_to_ball_minus_dir_sin","angle_to_ball_minus_dir_cos",
                  "angle_to_ball_minus_o_sin","angle_to_ball_minus_o_cos",
                  "s_parallel","s_perp"]

inv_categorical_features = [
    "player_position",
    "player_side",
    "player_role",
]

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


preproc_invariant = ColumnTransformer(
    transformers=[
        ("num", "passthrough", inv_numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), inv_categorical_features),
    ]
)

preproc_invariant.fit(x_data_last[inv_numeric_features + inv_categorical_features])

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [32]:
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import numpy as np

class PlayDataset(Dataset):
    def __init__(
        self,
        x_data_last,
        x_data_all,
        y_data,
        interaction_features,
        inv_numeric_features,
        inv_categorical_features,
        motion_features,
        preproc_invariant,
        T_pre=10,
        device="cpu",
    ):
        self.device = device
        self.interaction_features = interaction_features
        self.inv_numeric_features = inv_numeric_features
        self.inv_categorical_features = inv_categorical_features
        self.motion_features = motion_features
        self.T_pre = T_pre
        self.preproc_invariant = preproc_invariant

        self.samples = []

        # ‚úÖ OPTIMIZATION 1: Pre-group data once outside the loop
        y_grouped = {
            (gid, pid, nid): group.sort_values("frame_id")
            for (gid, pid, nid), group in y_data.groupby(["game_id", "play_id", "nfl_id"])
        }
        
        x_all_grouped = {
            (gid, pid, nid): group.sort_values("frame_id")
            for (gid, pid, nid), group in x_data_all.groupby(["game_id", "play_id", "nfl_id"])
        }

        for (gid, pid), play_df_all in tqdm(x_data_last.groupby(["game_id", "play_id"])):
            play_df = play_df_all.sort_values("nfl_id").reset_index(drop=True)
            nfl_ids = play_df["nfl_id"].tolist()

            # ‚úÖ OPTIMIZATION 2: Collect all player data in one pass
            frames_per_player = []
            targets_per_player = []
            x_true_per_player = []
            y_true_per_player = []
            x_hyb_per_player = []
            y_hyb_per_player = []
            motion_list = []
            T_max = 0

            for nid in nfl_ids:
                # Use pre-grouped dict instead of .query()
                out_rows = y_grouped.get((gid, pid, nid))
                if out_rows is None or out_rows.empty:
                    continue

                frames = out_rows["frame_id"].to_numpy()
                targets = out_rows[['target_dx','target_dy']].to_numpy(dtype='float32')
                x_true = out_rows["x_std"].to_numpy(dtype="float32")
                y_true = out_rows["y_std"].to_numpy(dtype="float32")
                x_hyb  = out_rows["x_std_hybrid"].to_numpy(dtype="float32")
                y_hyb  = out_rows["y_std_hybrid"].to_numpy(dtype="float32")

                frames_per_player.append(frames)
                targets_per_player.append(targets)
                x_true_per_player.append(x_true)
                y_true_per_player.append(y_true)
                x_hyb_per_player.append(x_hyb)
                y_hyb_per_player.append(y_hyb)
                T_max = max(T_max, len(frames))

                # ‚úÖ OPTIMIZATION 3: Motion history lookup
                hist_rows = x_all_grouped.get((gid, pid, nid))
                
                if hist_rows is None or hist_rows.empty:
                    motion_seq = np.zeros((self.T_pre, len(self.motion_features)),
                                          dtype=np.float32)
                else:
                    # Filter by throw_frame_id (vectorized)
                    throw_frame = play_df[play_df["nfl_id"] == nid]["throw_frame_id"].iloc[0]
                    hist_rows = hist_rows[hist_rows["frame_id"] <= throw_frame]
                    
                    vals = hist_rows[self.motion_features].to_numpy(np.float32)
                    if len(vals) >= self.T_pre:
                        motion_seq = vals[-self.T_pre:]
                    else:
                        pad = np.repeat(vals[:1], self.T_pre - len(vals), axis=0)
                        motion_seq = np.concatenate([pad, vals], axis=0)

                motion_list.append(motion_seq)

            if len(frames_per_player) == 0:
                continue

            # ‚úÖ OPTIMIZATION 4: Vectorized tensor creation
            t_norm = torch.linspace(0.0, 1.0, steps=T_max, dtype=torch.float32)
            N = len(targets_per_player)

            # Pre-allocate all tensors at once
            targets_tensor    = torch.zeros(N, T_max, 2, dtype=torch.float32)
            mask              = torch.zeros(N, T_max,    dtype=torch.bool)
            x_true_tensor     = torch.zeros(N, T_max,    dtype=torch.float32)
            y_true_tensor     = torch.zeros(N, T_max,    dtype=torch.float32)
            x_hybrid_tensor   = torch.zeros(N, T_max,    dtype=torch.float32)
            y_hybrid_tensor   = torch.zeros(N, T_max,    dtype=torch.float32)

            # Fill in one loop
            for i, (targ, x_t, y_t, x_h, y_h) in enumerate(
                zip(targets_per_player, x_true_per_player, y_true_per_player,
                    x_hyb_per_player, y_hyb_per_player)
            ):
                Ti = targ.shape[0]
                targets_tensor[i, :Ti, :] = torch.from_numpy(targ)
                mask[i, :Ti] = True
                x_true_tensor[i, :Ti] = torch.from_numpy(x_t)
                y_true_tensor[i, :Ti] = torch.from_numpy(y_t)
                x_hybrid_tensor[i, :Ti] = torch.from_numpy(x_h)
                y_hybrid_tensor[i, :Ti] = torch.from_numpy(y_h)

            X_pair, X_inv = self._build_pairwise_and_invariant(play_df)
            motion_tensor = torch.from_numpy(np.stack(motion_list))

            self.samples.append((
                X_pair, X_inv, motion_tensor, t_norm,
                targets_tensor, mask,
                x_true_tensor, y_true_tensor,
                x_hybrid_tensor, y_hybrid_tensor,
            ))

    def __len__(self):
        return len(self.samples)

    def _build_pairwise_and_invariant(self, play_df):
        X_int = play_df[self.interaction_features].to_numpy(dtype=np.float32)
        N, F_int = X_int.shape

        feat_i = X_int[:, None, :]
        feat_j = X_int[None, :, :]
        pair_diff = feat_j - feat_i
        X_pair = np.transpose(pair_diff, (2, 0, 1))

        X_inv = self.preproc_invariant.transform(
            play_df[self.inv_numeric_features + self.inv_categorical_features]
        ).astype("float32")

        return torch.from_numpy(X_pair), torch.from_numpy(X_inv)

    def __getitem__(self, idx):
        return self.samples[idx]
    

full_dataset = PlayDataset(
    x_data_last=x_data_last,
    x_data_all = x_data,
    y_data=y_data,  # with proper residual targets!
    interaction_features=interaction_features,
    inv_numeric_features=inv_numeric_features,
    inv_categorical_features=inv_categorical_features,
    motion_features=motion_features,
    preproc_invariant=preproc_invariant,
)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14108/14108 [00:41<00:00, 337.64it/s]


In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MotionEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim=32, out_dim=32):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=in_dim,
                              out_channels=hidden_dim,
                              kernel_size=3,
                              padding=1)
        self.relu = nn.ReLU()
        self.fc   = nn.Linear(hidden_dim, out_dim)

    def forward(self, motion):  # motion: (B, N, T_pre, F_motion)
        B, N, T, F = motion.shape
        x = motion.view(B * N, T, F).transpose(1, 2)  # (B*N, F, T)
        h = self.relu(self.conv(x))                   # (B*N, H, T)
        h = h.mean(dim=-1)                            # (B*N, H) - avg over time
        h = self.fc(h)                                # (B*N, out_dim)
        h = h.view(B, N, -1)                          # (B, N, out_dim)
        return h
    
class PairwiseInteractionEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=64):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=1)
        self.conv3 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (B, F_int, N, N)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))   # (B, C, N, N)
        x = x.mean(dim=3)           # pool over "other player" j ‚Üí (B, C, N)
        x = x.permute(0, 2, 1)      # ‚Üí (B, N, C)
        return x

class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        # x: (..., in_dim)
        return self.net(x)

class FullModel(nn.Module):
    def __init__(self, in_channels, in_motion, inv_dim, hidden_dim=128, enc_hidden=64, enc_out=64):
        super().__init__()
        self.encoder = PairwiseInteractionEncoder(
            in_channels=in_channels,
            hidden_channels=enc_hidden,
            out_channels=enc_out,
        )
        self.motion_encoder = MotionEncoder(
            in_dim=in_motion,
            hidden_dim=enc_hidden,
            out_dim=enc_out,
        )
        self.mlp = TimeConditionedMLP(
            in_dim=enc_out + enc_out + inv_dim + 1,  # enc_out (interaction) + enc_out (motion) + inv_dim + time
            hidden_dim=hidden_dim,
            out_dim=2,
        )

    def forward(self, X_pair, X_inv, X_motion, t_norm, mask):
        """
        X_pair: (B, F_int, N, N)
        X_inv:  (B, N, F_inv)
        X_motion: (B, N, T_pre, F_motion)
        t_norm: (B, T_max)
        mask:   (B, N, T_max)  (bool) ‚Äì True where target is valid
        """
        B, F_int, N, _ = X_pair.shape
        _, N_inv, F_inv = X_inv.shape
        _, T_max = t_norm.shape

        assert N == N_inv, "Mismatch in N between pairwise and inv features"

        # --- Encode interactions ---
        z_int = self.encoder(X_pair)    # (B, N, C)
        z_motion = self.motion_encoder(X_motion)  # (B, N, C)

        # --- Prepare features over time ---
        # z_int:     (B, N, C)     ‚Üí (B, N, T, C)
        # z_motion:  (B, N, C)     ‚Üí (B, N, T, C)
        # X_inv:     (B, N, F_inv) ‚Üí (B, N, T, F_inv)
        # t_norm:    (B, T)        ‚Üí (B, 1, T, 1) broadcast to (B, N, T, 1)
        C = z_int.shape[-1]
        z_int_exp = z_int.unsqueeze(2).expand(B, N, T_max, C)          # (B, N, T, C)
        z_motion_exp = z_motion.unsqueeze(2).expand(B, N, T_max, C)    # (B, N, T, C)
        X_inv_exp = X_inv.unsqueeze(2).expand(B, N, T_max, F_inv)      # (B, N, T, F_inv)
        t_exp     = t_norm.unsqueeze(1).unsqueeze(-1).expand(
            B, N, T_max, 1
        )  # (B, N, T, 1)

        feat = torch.cat([z_int_exp, z_motion_exp, X_inv_exp, t_exp], dim=-1)  # (B, N, T, 2*C+F_inv+1)

        # Flatten players and time to feed MLP
        feat_flat = feat.view(B * N * T_max, -1)       # (B*N*T, in_dim)
        out_flat  = self.mlp(feat_flat)                # (B*N*T, 2)
        out       = out_flat.view(B, N, T_max, 2)      # (B, N, T, 2)

        # Apply mask in loss outside (we return full out)
        return out


In [34]:
from torch.utils.data import DataLoader
import numpy as np

# For now, simple random split by index (you can do group splits by game_id if you like)
# Dataset is already at the play level, so this way of splitting is fine
n = len(full_dataset)
idxs = np.arange(n)
np.random.seed(42)
np.random.shuffle(idxs)

n_train = int(0.7 * n)
n_val   = int(0.15 * n)
train_idx = idxs[:n_train]
val_idx   = idxs[n_train:n_train+n_val]
test_idx  = idxs[n_train+n_val:]

from torch.utils.data import Subset

train_ds = Subset(full_dataset, train_idx)
val_ds   = Subset(full_dataset, val_idx)
test_ds  = Subset(full_dataset, test_idx)

train_loader = DataLoader(train_ds, batch_size=1, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=1, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=1, shuffle=False)

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Infer dims
F_int = len(interaction_features)
# Get one batch to determine inv_dim and in_motion
X_pair0, X_inv0, motion0, t_norm0, targets0, mask0, x_true0, y_true0, x_hybrid0, y_hybrid0 = next(iter(train_loader))
inv_dim = X_inv0.shape[-1]

model = FullModel(
    in_channels=F_int,
    in_motion=len(motion_features),
    inv_dim=inv_dim,
    hidden_dim=128,
    enc_hidden=64,
    enc_out=64,
).to(device)

print(f"Model initialized with:")
print(f"  F_int (interaction features): {F_int}")
print(f"  in_motion (motion features): {len(motion_features)}")
print(f"  inv_dim (invariant features): {inv_dim}")
print(f"  MLP input dim: {64 + 64 + inv_dim + 1} (enc_out + enc_out + inv_dim + 1)")


Model initialized with:
  F_int (interaction features): 8
  in_motion (motion features): 19
  inv_dim (invariant features): 59
  MLP input dim: 188 (enc_out + enc_out + inv_dim + 1)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Infer dims
F_int = len(interaction_features)
X_pair0, X_inv0, motion0, t_norm0, targets0, mask0, x_true0, y_true0, x_hybrid0, y_hybrid0 = next(iter(train_loader))
inv_dim = X_inv0.shape[-1]

model = FullModel(
    in_channels=F_int,
    in_motion=len(motion_features),
    inv_dim=inv_dim,
    hidden_dim=256,
    enc_hidden=128,
    enc_out=64,
).to(device)

criterion = nn.MSELoss(reduction="sum")  # we'll divide by #valid later
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

def run_epoch(loader, train=True):
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0              # for residual MSE (training objective)
    total_squared_distance = 0.0  # for *position* error (Kaggle metric)
    total_samples = 0

    for X_pair, X_inv, motion, t_norm, targets, mask, x_true, y_true, x_hybrid, y_hybrid in tqdm(loader):
        X_pair  = X_pair.to(device).float()      # (B, F_int, N, N)
        X_inv   = X_inv.to(device).float()       # (B, N, F_inv)
        motion  = motion.to(device).float()      # (B, N, T_pre, F_motion)
        t_norm  = t_norm.to(device).float()      # (B, T)
        targets = targets.to(device).float()     # (B, N, T, 2)  -> residuals
        mask    = mask.to(device)                # (B, N, T)

        x_true    = x_true.to(device).float()    # (B, N, T)
        y_true    = y_true.to(device).float()    # (B, N, T)
        x_hybrid  = x_hybrid.to(device).float()  # (B, N, T)
        y_hybrid  = y_hybrid.to(device).float()  # (B, N, T)

        if train:
            optimizer.zero_grad()

        preds = model(X_pair, X_inv, motion, t_norm, mask)   # (B, N, T, 2)

        # # ---- residual MSE loss (training objective) ----
        mask_expanded = mask.unsqueeze(-1).expand_as(preds)  # (B, N, T, 2)
        diff_res = (preds - targets) * mask_expanded         # (B, N, T, 2)
        # sum of squared residual errors
        loss = criterion(diff_res, torch.zeros_like(diff_res))

        # ---- absolute position MSE loss ----
        # mask_expanded = mask.unsqueeze(-1).expand_as(preds)
        # diff = (preds - targets) * mask_expanded  # targets are absolute positions
        # loss = criterion(diff, torch.zeros_like(diff))
        # Loss is the same computation, just different interpretation!

        valid_count = mask.sum().item() * 2  # *2 for x and y
        if valid_count == 0:
            continue

        loss = loss / valid_count  # mean residual MSE over valid coords

        if train:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()

        # ---- true Kaggle-style RMSE on *positions* ----
        with torch.no_grad():
            dx_pred = preds[..., 0]
            dy_pred = preds[..., 1]

            x_pred = x_hybrid + dx_pred
            y_pred = y_hybrid + dy_pred
            # x_pred = preds[..., 0]
            # y_pred = preds[..., 1]

            diff_x = (x_pred - x_true) * mask
            diff_y = (y_pred - y_true) * mask

            squared_distances = diff_x**2 + diff_y**2   # (B, N, T)
            total_squared_distance += squared_distances.sum().item()
            total_samples += mask.sum().item()

    avg_loss = total_loss / max(len(loader), 1)  # avg residual MSE (for logging)
    kaggle_rmse = np.sqrt(total_squared_distance / max(total_samples, 1))

    return avg_loss, kaggle_rmse

num_epochs = 90
best_val = float("inf")
best_state = None
current_ts_abbreviated = __import__('datetime').datetime.now().strftime("%Y%m%d_%H%M%S")

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss, train_kaggle_rmse = run_epoch(train_loader, train=True)
    val_loss, val_kaggle_rmse = run_epoch(val_loader, train=False)
    print(f"Epoch {epoch+1}: train={train_loss:.4f},, val={val_loss:.4f}, Kaggle RMSE val={val_kaggle_rmse:.4f}")
    if val_loss < best_val:
        best_val = val_loss
        best_state = model.state_dict().copy()
        torch.save(best_state, f"best_model_{current_ts_abbreviated}.pth")
        print(f"  New best model saved with val loss {best_val:.4f}")

# 20251130-9:15pm run shows nothing to be gained from predictin all way from scratch
# RMSE stil like 1.2-1.3 just takes longer to get there

Epoch 1/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 558.32it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2527.33it/s]


Epoch 1: train=2.1530,, val=1.4507, Kaggle RMSE val=2.1500
  New best model saved with val loss 1.4507
Epoch 2/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 565.18it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2479.73it/s]


Epoch 2: train=1.4957,, val=1.3245, Kaggle RMSE val=2.0706
  New best model saved with val loss 1.3245
Epoch 3/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 585.40it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2515.40it/s]


Epoch 3: train=1.1802,, val=0.7633, Kaggle RMSE val=1.6137
  New best model saved with val loss 0.7633
Epoch 4/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 576.65it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2546.37it/s]


Epoch 4: train=0.8019,, val=0.8702, Kaggle RMSE val=1.7669
Epoch 5/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 590.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2462.00it/s]


Epoch 5: train=0.7337,, val=0.4766, Kaggle RMSE val=1.3150
  New best model saved with val loss 0.4766
Epoch 6/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 575.81it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2534.98it/s]


Epoch 6: train=0.7646,, val=0.4840, Kaggle RMSE val=1.3436
Epoch 7/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 599.04it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1884.53it/s]


Epoch 7: train=0.6460,, val=0.5648, Kaggle RMSE val=1.4161
Epoch 8/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 606.66it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2565.33it/s]


Epoch 8: train=0.6347,, val=0.4333, Kaggle RMSE val=1.2682
  New best model saved with val loss 0.4333
Epoch 9/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 581.96it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2550.94it/s]


Epoch 9: train=0.5973,, val=0.4298, Kaggle RMSE val=1.2508
  New best model saved with val loss 0.4298
Epoch 10/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 607.78it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2558.66it/s]


Epoch 10: train=0.5828,, val=0.5958, Kaggle RMSE val=1.4058
Epoch 11/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 580.84it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2546.41it/s]


Epoch 11: train=0.5715,, val=0.4808, Kaggle RMSE val=1.3024
Epoch 12/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 609.44it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2571.58it/s]


Epoch 12: train=0.5754,, val=0.4161, Kaggle RMSE val=1.2347
  New best model saved with val loss 0.4161
Epoch 13/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 588.42it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2545.25it/s]


Epoch 13: train=0.5345,, val=0.4999, Kaggle RMSE val=1.3163
Epoch 14/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 594.32it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2442.23it/s]


Epoch 14: train=0.5198,, val=0.4617, Kaggle RMSE val=1.2748
Epoch 15/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 591.05it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2553.82it/s]


Epoch 15: train=0.4984,, val=0.3829, Kaggle RMSE val=1.1944
  New best model saved with val loss 0.3829
Epoch 16/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 583.68it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2526.24it/s]


Epoch 16: train=0.5028,, val=0.4947, Kaggle RMSE val=1.3452
Epoch 17/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 601.94it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2564.05it/s]


Epoch 17: train=0.4852,, val=0.4211, Kaggle RMSE val=1.2319
Epoch 18/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 579.38it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2523.17it/s]


Epoch 18: train=0.4824,, val=0.4258, Kaggle RMSE val=1.2396
Epoch 19/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 588.42it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2178.21it/s]


Epoch 19: train=0.5226,, val=0.4067, Kaggle RMSE val=1.2138
Epoch 20/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 595.72it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2525.42it/s]


Epoch 20: train=0.4778,, val=0.5160, Kaggle RMSE val=1.3645
Epoch 21/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 592.79it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2498.72it/s]


Epoch 21: train=0.4565,, val=0.3815, Kaggle RMSE val=1.1826
  New best model saved with val loss 0.3815
Epoch 22/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 602.86it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2527.85it/s]


Epoch 22: train=0.5465,, val=0.5281, Kaggle RMSE val=1.3404
Epoch 23/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 586.09it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2568.16it/s]


Epoch 23: train=0.4637,, val=0.3613, Kaggle RMSE val=1.1650
  New best model saved with val loss 0.3613
Epoch 24/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 609.36it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2375.16it/s]


Epoch 24: train=0.4501,, val=0.4431, Kaggle RMSE val=1.2960
Epoch 25/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 589.41it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2512.77it/s]


Epoch 25: train=0.4582,, val=0.4744, Kaggle RMSE val=1.2964
Epoch 26/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 588.89it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2141.08it/s]


Epoch 26: train=0.4513,, val=0.3663, Kaggle RMSE val=1.1677
Epoch 27/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 608.61it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2567.50it/s]


Epoch 27: train=0.4403,, val=0.4546, Kaggle RMSE val=1.3153
Epoch 28/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 590.10it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2483.15it/s]


Epoch 28: train=0.4380,, val=0.4954, Kaggle RMSE val=1.2956
Epoch 29/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 587.70it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2368.05it/s]


Epoch 29: train=0.4213,, val=0.3612, Kaggle RMSE val=1.1636
  New best model saved with val loss 0.3612
Epoch 30/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [03:39<00:00, 45.00it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2564.37it/s]


Epoch 30: train=0.4101,, val=0.3423, Kaggle RMSE val=1.1367
  New best model saved with val loss 0.3423
Epoch 31/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 601.53it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2314.84it/s]


Epoch 31: train=0.4156,, val=0.4077, Kaggle RMSE val=1.2117
Epoch 32/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [02:31<00:00, 64.98it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2347.74it/s]


Epoch 32: train=0.4065,, val=0.3673, Kaggle RMSE val=1.1663
Epoch 33/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 559.72it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2556.40it/s]


Epoch 33: train=0.4109,, val=0.4490, Kaggle RMSE val=1.2676
Epoch 34/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 540.18it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2284.65it/s]


Epoch 34: train=0.3960,, val=0.4499, Kaggle RMSE val=1.2722
Epoch 35/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:20<00:00, 483.39it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2505.61it/s]


Epoch 35: train=0.3927,, val=0.3753, Kaggle RMSE val=1.1693
Epoch 36/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 539.64it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2110.46it/s]


Epoch 36: train=0.3834,, val=0.3320, Kaggle RMSE val=1.1065
  New best model saved with val loss 0.3320
Epoch 37/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 590.82it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2645.52it/s]


Epoch 37: train=0.3866,, val=0.3276, Kaggle RMSE val=1.1152
  New best model saved with val loss 0.3276
Epoch 38/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 551.48it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2243.08it/s]


Epoch 38: train=0.3812,, val=0.4012, Kaggle RMSE val=1.1794
Epoch 39/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 539.48it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2384.88it/s]


Epoch 39: train=0.3843,, val=0.3426, Kaggle RMSE val=1.1171
Epoch 40/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 577.81it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2667.55it/s]


Epoch 40: train=0.3727,, val=0.3320, Kaggle RMSE val=1.1302
Epoch 41/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 543.48it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2653.59it/s]


Epoch 41: train=0.3637,, val=0.5780, Kaggle RMSE val=1.4276
Epoch 42/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 551.05it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2444.51it/s]


Epoch 42: train=0.3803,, val=0.3476, Kaggle RMSE val=1.1705
Epoch 43/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 571.56it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2509.42it/s]


Epoch 43: train=0.3607,, val=0.3189, Kaggle RMSE val=1.0971
  New best model saved with val loss 0.3189
Epoch 44/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:15<00:00, 623.87it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2566.80it/s]


Epoch 44: train=0.3517,, val=0.3492, Kaggle RMSE val=1.1381
Epoch 45/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:15<00:00, 632.35it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2686.59it/s]


Epoch 45: train=0.3571,, val=0.3864, Kaggle RMSE val=1.1763
Epoch 46/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 589.57it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2267.60it/s]


Epoch 46: train=0.3481,, val=0.3574, Kaggle RMSE val=1.1472
Epoch 47/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 599.73it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2697.99it/s]


Epoch 47: train=0.3438,, val=0.4045, Kaggle RMSE val=1.2109
Epoch 48/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 611.64it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2710.47it/s]


Epoch 48: train=0.3595,, val=0.4421, Kaggle RMSE val=1.2548
Epoch 49/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 578.64it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1652.27it/s]


Epoch 49: train=0.3332,, val=0.3322, Kaggle RMSE val=1.1197
Epoch 50/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 545.38it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2343.56it/s]


Epoch 50: train=0.3347,, val=0.3255, Kaggle RMSE val=1.1035
Epoch 51/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 542.77it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2191.78it/s]


Epoch 51: train=0.3377,, val=0.3327, Kaggle RMSE val=1.1256
Epoch 52/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 534.87it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2337.67it/s]


Epoch 52: train=0.3896,, val=0.3377, Kaggle RMSE val=1.1454
Epoch 53/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 565.82it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2517.99it/s]


Epoch 53: train=0.3495,, val=0.3192, Kaggle RMSE val=1.1189
Epoch 54/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 585.55it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2713.94it/s]


Epoch 54: train=0.3873,, val=0.3269, Kaggle RMSE val=1.1043
Epoch 55/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:15<00:00, 638.10it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2751.74it/s]


Epoch 55: train=0.3245,, val=0.3186, Kaggle RMSE val=1.1097
  New best model saved with val loss 0.3186
Epoch 56/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 566.19it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2087.92it/s]


Epoch 56: train=0.3280,, val=0.3458, Kaggle RMSE val=1.1457
Epoch 57/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:20<00:00, 491.12it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1884.46it/s]


Epoch 57: train=0.3233,, val=0.3012, Kaggle RMSE val=1.0690
  New best model saved with val loss 0.3012
Epoch 58/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 560.23it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2290.81it/s]


Epoch 58: train=0.3139,, val=0.3154, Kaggle RMSE val=1.0915
Epoch 59/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 590.43it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2210.00it/s]


Epoch 59: train=0.3217,, val=0.3128, Kaggle RMSE val=1.1056
Epoch 60/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:19<00:00, 508.78it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2043.58it/s]


Epoch 60: train=0.3536,, val=0.4490, Kaggle RMSE val=1.3521
Epoch 61/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:20<00:00, 482.32it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2440.89it/s]


Epoch 61: train=0.3072,, val=0.2977, Kaggle RMSE val=1.0710
  New best model saved with val loss 0.2977
Epoch 62/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 573.38it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2552.34it/s]


Epoch 62: train=0.3063,, val=0.3814, Kaggle RMSE val=1.1733
Epoch 63/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 587.73it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2267.66it/s]


Epoch 63: train=0.2969,, val=0.3706, Kaggle RMSE val=1.1416
Epoch 64/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 580.27it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2556.81it/s]


Epoch 64: train=0.2958,, val=0.5632, Kaggle RMSE val=1.4137
Epoch 65/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 559.27it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2116.43it/s]


Epoch 65: train=0.2993,, val=0.4295, Kaggle RMSE val=1.2756
Epoch 66/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 584.51it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2529.78it/s]


Epoch 66: train=0.2870,, val=0.3630, Kaggle RMSE val=1.1459
Epoch 67/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 581.83it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2532.02it/s]


Epoch 67: train=0.2867,, val=0.6460, Kaggle RMSE val=1.6165
Epoch 68/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 582.91it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2522.32it/s]


Epoch 68: train=0.3298,, val=0.3447, Kaggle RMSE val=1.1306
Epoch 69/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 572.76it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2310.89it/s]


Epoch 69: train=0.2834,, val=0.3844, Kaggle RMSE val=1.1803
Epoch 70/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 597.19it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2330.85it/s]


Epoch 70: train=0.2868,, val=0.3343, Kaggle RMSE val=1.1424
Epoch 71/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 545.52it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2535.35it/s]


Epoch 71: train=0.2864,, val=0.3025, Kaggle RMSE val=1.0745
Epoch 72/90


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 7389/9875 [00:14<00:04, 568.13it/s]

In [None]:
x_data.head()

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import numpy as np

# X_flat: all your per-(player,frame) features (invariant + motion + ball/QB),
# y_flat: [target_dx, target_dy],
# groups: something like f"{game_id}_{play_id}" so we group by play

gkf = GroupKFold(n_splits=5)
rmse_list = []

for train_idx, val_idx in gkf.split(X_flat, y_flat, groups=groups):
    X_tr, X_val = X_flat[train_idx], X_flat[val_idx]
    y_tr, y_val = y_flat[train_idx], y_flat[val_idx]

    # Separate models for dx and dy or MultiOutputRegressor ‚Äì either is fine
    reg_x = HistGradientBoostingRegressor(max_depth=6, learning_rate=0.05, max_iter=300)
    reg_y = HistGradientBoostingRegressor(max_depth=6, learning_rate=0.05, max_iter=300)

    reg_x.fit(X_tr, y_tr[:, 0])
    reg_y.fit(X_tr, y_tr[:, 1])

    dx_pred = reg_x.predict(X_val)
    dy_pred = reg_y.predict(X_val)

    # Kaggle-style error on residuals
    sq = (dx_pred - y_val[:, 0])**2 + (dy_pred - y_val[:, 1])**2
    rmse = np.sqrt(sq.mean())
    rmse_list.append(rmse)
    print("Fold RMSE:", rmse)

print("Mean RMSE:", np.mean(rmse_list), "¬±", np.std(rmse_list))

In [None]:
@torch.no_grad()
def compute_baseline_rmse(loader):
    total_sq = 0.0
    total_points = 0

    for (
        X_pair, X_inv, t_norm, targets, mask,
        x_true, y_true, x_hyb, y_hyb
    ) in loader:
        mask = mask.to(device)
        x_true   = x_true.to(device)
        y_true   = y_true.to(device)
        x_hyb    = x_hyb.to(device)
        y_hyb    = y_hyb.to(device)

        diff_x = (x_hyb - x_true) * mask
        diff_y = (y_hyb - y_true) * mask
        sq = diff_x**2 + diff_y**2
        total_sq += sq.sum().item()
        total_points += mask.sum().item()

    return np.sqrt(total_sq / max(total_points, 1))

baseline_train_rmse = compute_baseline_rmse(train_loader)
baseline_val_rmse   = compute_baseline_rmse(val_loader)
print("Hybrid baseline RMSE  - train:", baseline_train_rmse)
print("Hybrid baseline RMSE  - val  :", baseline_val_rmse)

Hybrid baseline RMSE  - train: 2.844986673890057
Hybrid baseline RMSE  - val  : 2.6904142974101704


In [27]:
import math
import copy
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Infer dims
F_int = len(interaction_features)
# Get one batch to determine inv_dim
X_pair0, X_inv0, t_norm0, targets0, mask0 = next(iter(train_loader))
inv_dim = X_inv0.shape[-1]

model = FullModel(
    in_channels=F_int,
    inv_dim=inv_dim,
    hidden_dim=128,
    enc_hidden=64,
    enc_out=64,
).to(device)


# criterion = nn.MSELoss(reduction="sum")  # we'll divide by #valid later
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

def run_epoch(loader, train=True):
    if train:
        model.train()
    else:
        model.eval()

    # For training loss (per-coordinate MSE)
    total_se = 0.0              # sum of squared errors over all coords
    total_valid_coords = 0      # number of valid coords (x + y) across all batches

    # For Kaggle-style RMSE (2D Euclidean per frame)
    total_squared_distance = 0.0  # sum of (dx^2 + dy^2) over valid frames
    total_valid_frames = 0        # number of valid frames

    with torch.set_grad_enabled(train):
        for X_pair, X_inv, t_norm, targets, mask in tqdm(loader):
            X_pair  = X_pair.to(device).float()    # (B, F_int, N, N)
            X_inv   = X_inv.to(device).float()     # (B, N, F_inv)
            t_norm  = t_norm.to(device).float()    # (B, T)
            targets = targets.to(device).float()   # (B, N, T, 2)
            mask    = mask.to(device)              # (B, N, T), bool or 0/1

            # Forward
            preds = model(X_pair, X_inv, t_norm, mask)  # (B, N, T, 2)

            # ---- Training loss (per-coordinate MSE over valid coords) ----
            # Expand mask to match preds/targets shape
            
            mask_expanded = mask.unsqueeze(-1).expand_as(preds)  # (B, N, T, 2)
            mask_expanded_f = mask_expanded.float()

            diff = (preds - targets) * mask_expanded_f          # zero where invalid

            # Sum of squared errors over all coords
            # You can either use criterion, or just square+sum:
            se = (diff ** 2).sum()    # scalar tensor

            # #valid coords = (#valid frames) * 2 (x and y)
            valid_coords = mask.sum().item() * 2

            if valid_coords == 0:
                continue

            batch_loss = se / valid_coords  # mean MSE over coords for this batch

            if train:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

            # Accumulate for global MSE
            total_se += se.item()
            total_valid_coords += valid_coords

            # ---- Kaggle-style RMSE metric (Euclidean error per frame) ----
            # Note: here we only use mask (B, N, T) once per frame
            mask_f = mask.float()
            diff_x = (preds[..., 0] - targets[..., 0]) * mask_f
            diff_y = (preds[..., 1] - targets[..., 1]) * mask_f
            squared_distances = diff_x**2 + diff_y**2  # (B, N, T)

            total_squared_distance += squared_distances.sum().item()
            total_valid_frames += mask.sum().item()

    # Global mean per-coordinate MSE across dataset
    avg_loss = total_se / max(total_valid_coords, 1)

    # Global Euclidean RMSE across all valid frames
    kaggle_rmse = math.sqrt(
        total_squared_distance / max(total_valid_frames, 1)
    )

    return avg_loss, kaggle_rmse


# ---------- Training loop with model checkpointing on val RMSE ----------

num_epochs = 90
best_val_rmse = float("inf")
best_state = None
current_ts_abbreviated = __import__('datetime').datetime.now().strftime("%Y%m%d_%H%M%S")

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    train_loss, train_kaggle_rmse = run_epoch(train_loader, train=True)
    val_loss,   val_kaggle_rmse   = run_epoch(val_loader,   train=False)

    print(
        f"Epoch {epoch+1}: "
        f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, "
        f"train_RMSE={train_kaggle_rmse:.4f}, val_RMSE={val_kaggle_rmse:.4f}"
    )

    # Use validation RMSE as model-selection metric
    if val_kaggle_rmse < best_val_rmse:
        best_val_rmse = val_kaggle_rmse
        best_state = copy.deepcopy(model.state_dict())
        save_path = f"best_model_{current_ts_abbreviated}.pth"
        torch.save(best_state, save_path)
        print(f"  New best model saved: {save_path} (val_RMSE={best_val_rmse:.4f})")

NameError: name 'train_loader' is not defined

In [None]:
import time

def train_and_eval(model, train_loader, val_loader, num_epochs=10, patience=20, lr=5e-4):
    device = next(model.parameters()).device
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss(reduction="sum")

    best_val = float("inf")
    best_state = None
    bad_epochs = 0

    for epoch in range(num_epochs):
        # ----- train -----
        model.train()
        for X_pair, X_inv, t_norm, targets, mask in tqdm(train_loader):
            X_pair  = X_pair.to(device).float()
            X_inv   = X_inv.to(device).float()
            t_norm  = t_norm.to(device).float()
            targets = targets.to(device).float()
            mask    = mask.to(device)

            optimizer.zero_grad()
            preds = model(X_pair, X_inv, t_norm, mask)

            mask_expanded = mask.unsqueeze(-1).expand_as(preds)
            diff = (preds - targets) * mask_expanded
            loss = criterion(diff, torch.zeros_like(diff))
            valid_count = mask.sum().item() * 2
            if valid_count == 0:
                continue
            loss = loss / valid_count

            loss.backward()
            optimizer.step()

        # ----- validate -----
        model.eval()
        val_loss = 0.0
        n_batches = 0
        with torch.no_grad():
            for X_pair, X_inv, t_norm, targets, mask in tqdm(val_loader):
                X_pair  = X_pair.to(device).float()   
                X_inv   = X_inv.to(device).float()
                t_norm  = t_norm.to(device).float()
                targets = targets.to(device).float()
                mask    = mask.to(device)

                preds = model(X_pair, X_inv, t_norm, mask)
                mask_expanded = mask.unsqueeze(-1).expand_as(preds)
                diff = (preds - targets) * mask_expanded
                loss = criterion(diff, torch.zeros_like(diff))
                valid_count = mask.sum().item() * 2
                if valid_count == 0:
                    continue
                loss = loss / valid_count

                val_loss += loss.item()
                n_batches += 1

        val_loss /= max(n_batches, 1)
        print(f"Epoch {epoch+1}: val={val_loss:.4f}")

        # early stopping
        if val_loss < best_val:
            best_val = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    current_ts_abbreviated = time.time().__str__().replace('.', '')[-6:]
    torch.save(model.state_dict(), f"best_model_{current_ts_abbreviated}.pth")
    return best_val

In [None]:
all_idxs = np.arange(len(full_dataset))
np.random.shuffle(all_idxs)

subset_size = int(0.3 * len(all_idxs))
subset_idxs = all_idxs[:subset_size]
subset_train_idxs = subset_idxs[:int(0.7 * subset_size)]
subset_val_idxs   = subset_idxs[int(0.7 * subset_size):]

subset_train_ds = Subset(full_dataset, subset_train_idxs)
subset_val_ds   = Subset(full_dataset, subset_val_idxs)

subset_train_loader = DataLoader(subset_train_ds, batch_size=1, shuffle=True)
subset_val_loader   = DataLoader(subset_val_ds, batch_size=1, shuffle=False)


lrs = [1e-3, 5e-4, 2e-4]
hidden_dims = [64, 128]
enc_hidden = [32, 64]

results = []
for lr in lrs:
    for hd in hidden_dims:
        for eh in enc_hidden:
            model = FullModel(
                in_channels=len(interaction_features),
                inv_dim=inv_dim,
                hidden_dim=hd,
                enc_hidden=eh,
                enc_out=64,
        ).to(device)

        print(f"Testing lr={lr}, hidden_dim={hd}")
        val_loss = train_and_eval(
            model,
            subset_train_loader,
            subset_val_loader,
            num_epochs=8,
            patience=3,
            lr=lr,
        )
        results.append((lr, hd, val_loss))

print(sorted(results, key=lambda x: x[2]))

NameError: name 'full_dataset' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PairwiseInteractionEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=64):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=1)
        self.conv3 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (B, F_int, N, N)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))   # (B, C, N, N)
        x = x.mean(dim=3)           # pool over "other player" j ‚Üí (B, C, N)
        x = x.permute(0, 2, 1)      # ‚Üí (B, N, C)
        return x

class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        # x: (..., in_dim)
        return self.net(x)

class FullModel(nn.Module):
    def __init__(self, in_channels, inv_dim, hidden_dim=128, enc_hidden=64, enc_out=64):
        super().__init__()
        self.encoder = PairwiseInteractionEncoder(
            in_channels=in_channels,
            hidden_channels=enc_hidden,
            out_channels=enc_out,
        )
        self.mlp = TimeConditionedMLP(
            in_dim=enc_out + inv_dim + 1,  # +1 for time feature
            hidden_dim=hidden_dim,
            out_dim=2,
        )

    def forward(self, X_pair, X_inv, t_norm, mask):
        """
        X_pair: (B, F_int, N, N)
        X_inv:  (B, N, F_inv)
        t_norm: (B, T_max)
        mask:   (B, N, T_max)  (bool) ‚Äì True where target is valid
        """
        B, F_int, N, _ = X_pair.shape
        _, N_inv, F_inv = X_inv.shape
        _, T_max = t_norm.shape

        assert N == N_inv, "Mismatch in N between pairwise and inv features"

        # --- Encode interactions ---
        z_int = self.encoder(X_pair)    # (B, N, C)

        # --- Prepare features over time ---
        # z_int:     (B, N, C)     ‚Üí (B, N, T, C)
        # X_inv:     (B, N, F_inv) ‚Üí (B, N, T, F_inv)
        # t_norm:    (B, T)        ‚Üí (B, 1, T, 1) broadcast to (B, N, T, 1)
        C = z_int.shape[-1]
        z_int_exp = z_int.unsqueeze(2).expand(B, N, T_max, C)          # (B, N, T, C)
        X_inv_exp = X_inv.unsqueeze(2).expand(B, N, T_max, F_inv)      # (B, N, T, F_inv)
        t_exp     = t_norm.unsqueeze(1).unsqueeze(-1).expand(
            B, N, T_max, 1
        )  # (B, N, T, 1)

        feat = torch.cat([z_int_exp, X_inv_exp, t_exp], dim=-1)        # (B, N, T, C+F_inv+1)

        # Flatten players and time to feed MLP
        feat_flat = feat.view(B * N * T_max, -1)       # (B*N*T, in_dim)
        out_flat  = self.mlp(feat_flat)                # (B*N*T, 2)
        out       = out_flat.view(B, N, T_max, 2)      # (B, N, T, 2)

        # Apply mask in loss outside (we return full out)
        return out

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preproc_invariant = ColumnTransformer(
    transformers=[
        ("num", "passthrough", inv_numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), inv_categorical_features),
    ]
)

preproc_invariant.fit(x_data_last[inv_numeric_features + inv_categorical_features])

# merge in play_targets to get y per play
x_with_y = x_data_last.merge(
    y_data[['game_id','play_id','nfl_id','target_dx','target_dy']],
    on=['game_id','play_id','nfl_id'],
    how='inner',
    indicator=True
).query('_merge == "both"').drop(columns=['_merge'])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PairwiseInteractionEncoder(nn.Module):
    """
    Input:  (B, F_int, N, N)  pairwise features
    Output: (B, N, C)         per-player interaction embedding
    """
    def __init__(self, in_channels, hidden_channels=64, out_channels=64):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=1)
        self.conv3 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (B, F_int, N, N)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))  # (B, C, N, N)

        # pool over "other player" axis (j), keep i:
        # assume dim 2 = i (row player), dim 3 = j (other)
        x = x.mean(dim=3)          # (B, C, N)

        # reshape to per-player embeddings (B, N, C)
        x = x.permute(0, 2, 1)     # (B, N, C)
        return x

In [None]:
def build_play_embeddings(play_df, encoder: PairwiseInteractionEncoder, device="cpu"):
    play_df = play_df.sort_values("nfl_id").reset_index(drop=True)
    N = len(play_df)

    # Interaction features -> pairwise grid
    X_int = play_df[interaction_features].to_numpy(dtype=np.float32)  # (N, F_int)
    N, F_int = X_int.shape

    feat_i = X_int[:, None, :]                # (N, 1, F_int)
    feat_j = X_int[None, :, :]                # (1, N, F_int)
    pair_diff = feat_j - feat_i               # (N, N, F_int)
    X_pair = np.transpose(pair_diff, (2, 0, 1)).astype(np.float32)  # (F_int, N, N)
    X_pair_t = torch.from_numpy(X_pair).unsqueeze(0).to(device)     # (1, F_int, N, N)

    # Invariant features
    X_inv = preproc_invariant.transform(
        play_df[inv_numeric_features + inv_categorical_features]
    )
    X_inv = np.asarray(X_inv, dtype=np.float32)                      # (N, F_inv)
    X_inv_t = torch.from_numpy(X_inv).to(device).unsqueeze(0)       # (1, N, F_inv)

    # Encode interactions
    # with torch.no_grad():  # (for now: treat encoder as fixed)
    z_int = encoder(X_pair_t)            # (1, N, C)

    # Concatenate interaction + invariant per player
    Z_play = torch.cat([z_int, X_inv_t], dim=-1)  # (1, N, D)

    return Z_play, play_df  # return df so we know which row is which

In [None]:
from tqdm import tqdm

device = "cpu"  # or "cuda" if available
encoder = PairwiseInteractionEncoder(
    in_channels=len(interaction_features),
    hidden_channels=128,
    out_channels=128,
).to(device)

X_list = []  # will hold [z_player || time_features]
y_list = []
play_ids = []  # ‚úÖ NEW: Track which play each sample belongs to


for (gid, pid), play_df_all in tqdm(x_data_last.groupby(["game_id", "play_id"])):
    play_df = play_df_all.copy()

    # Build embeddings for all players in this play
    Z_play, play_df_sorted = build_play_embeddings(play_df, encoder, device=device)
    Z_play = Z_play.squeeze(0)   # (N, D)

    play_df_sorted = play_df_sorted.reset_index(drop=True)
    N, D = Z_play.shape

    for i in range(N):
        row = play_df_sorted.iloc[i]
        nid = row["nfl_id"]

        # Get this player's future frames
        out_rows = (
            y_data
            .query(
                "game_id == @gid and play_id == @pid and nfl_id == @nid"
            )
            .sort_values("frame_id")
        )

        if out_rows.empty:
            continue
        
        T_i = len(out_rows)

        # Example time feature: normalized time 0..1
        t_norm = (np.arange(T_i, dtype=np.float32) / max(T_i - 1, 1)).reshape(-1, 1)  # (T_i, 1)
        # print(Z_play)
        # Player embedding (D,) -> repeat over T_i frames
        z_i = Z_play[i].detach().cpu().numpy()         # (D,)
        z_rep = np.repeat(z_i[None, :], T_i, axis=0)   # (T_i, D)

        # Concatenate [z_i || t_features]
        X_i_t = np.concatenate([z_rep, t_norm], axis=1)  # (T_i, D+1)

        # Targets: x_t, y_t for each frame
        y_i_t = out_rows[["target_dx", "target_dy"]].to_numpy(dtype=np.float32)  # (T_i, 2)

        X_list.append(X_i_t)
        y_list.append(y_i_t)

        play_ids.extend([f"{gid}_{pid}"] * T_i)

# Stack all (T_i, ‚Ä¶) chunks into one big (num_samples, ‚Ä¶)
X_all = np.concatenate(X_list, axis=0)  # (num_samples, D+1)
Y_all = np.concatenate(y_list, axis=0)  # (num_samples, 2)
play_ids = np.array(play_ids)  # ‚úÖ (num_samples,)

print(X_all.shape, Y_all.shape, play_ids.shape)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14108/14108 [02:20<00:00, 100.66it/s]


(562936, 177) (562936, 2) (562936,)


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit

# Create train/test split grouped by play
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X_all, Y_all, groups=play_ids))

X_train_full = X_all[train_idx]
y_train_full = Y_all[train_idx]
play_ids_train = play_ids[train_idx]

X_test = X_all[test_idx]
y_test = Y_all[test_idx]
play_ids_test = play_ids[test_idx]

print(f"Train: {len(X_train_full)} samples from {len(np.unique(play_ids_train))} plays")
print(f"Test: {len(X_test)} samples from {len(np.unique(play_ids_test))} plays")

# ‚úÖ Verify no overlap
assert len(set(play_ids_train) & set(play_ids_test)) == 0, "Data leakage detected!"

# Second split: train -> train + val (also grouped)
splitter_val = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx2, val_idx = next(splitter_val.split(
    X_train_full, y_train_full, groups=play_ids_train
))

X_train = X_train_full[train_idx2]
y_train = y_train_full[train_idx2]

X_val = X_train_full[val_idx]
y_val = y_train_full[val_idx]

print(f"Final split:")
print(f"  Train: {len(X_train)} samples")
print(f"  Val:   {len(X_val)} samples")
print(f"  Test:  {len(X_test)} samples")

Train: 451172 samples from 11286 plays
Test: 111764 samples from 2822 plays
Final split:
  Train: 360785 samples
  Val:   90387 samples
  Test:  111764 samples


In [None]:
from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(
    torch.from_numpy(X_train),
    torch.from_numpy(y_train),
)
val_ds = TensorDataset(
    torch.from_numpy(X_val),
    torch.from_numpy(y_val),
)
test_ds = TensorDataset(
    torch.from_numpy(X_test),
    torch.from_numpy(y_test),
)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=256, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=256, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy

class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )
    def forward(self, x):
        return self.net(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_dim = X_all.shape[1]

def train_one_config(hidden_dim, lr, num_epochs=30, patience=5):
    model = TimeConditionedMLP(in_dim=in_dim, hidden_dim=hidden_dim, out_dim=2).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val = float("inf")
    best_state = None
    bad_epochs = 0

    for epoch in range(num_epochs):
        # ---- Train ----
        model.train()
        train_loss = 0.0
        n_train = 0

        for xb, yb in train_loader:
            xb = xb.to(device).float()
            yb = yb.to(device).float()

            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * xb.size(0)
            n_train += xb.size(0)

        train_loss /= n_train

        # ---- Validate ----
        model.eval()
        val_loss = 0.0
        n_val = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device).float()
                yb = yb.to(device).float()
                preds = model(xb)
                loss = criterion(preds, yb)
                val_loss += loss.item() * xb.size(0)
                n_val += xb.size(0)
        val_loss /= n_val

        print(f"[hd={hidden_dim}, lr={lr}] Epoch {epoch+1}: train={train_loss:.4f}, val={val_loss:.4f}")

        # ---- Early stopping tracking ----
        if val_loss < best_val:
            best_val = val_loss
            best_state = copy.deepcopy(model.state_dict())
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print(f"Early stopping (no val improvement for {patience} epochs).")
                break

    # Load best weights before returning
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val

In [None]:
hidden_dims = [128, 256]
lrs = [1e-3, 5e-4]

best_cfg = None
best_val = float("inf")
best_model = None

# hidden_dims = [256]
# lrs = [5e-4]

for hd in hidden_dims:
    for lr in lrs:
        print(f"\n=== Training config: hidden_dim={hd}, lr={lr} ===")
        model, val_loss = train_one_config(hidden_dim=hd, lr=lr, num_epochs=75, patience=10)

        print(f"Config (hd={hd}, lr={lr}) finished with best val MSE={val_loss:.4f}")
        if val_loss < best_val:
            best_val = val_loss
            best_cfg = (hd, lr)
            best_model = model

print("\nBest config:", best_cfg, "with val MSE=", best_val)


=== Training config: hidden_dim=128, lr=0.001 ===
[hd=128, lr=0.001] Epoch 1: train=5.9389, val=3.7616
[hd=128, lr=0.001] Epoch 2: train=4.0389, val=3.5694
[hd=128, lr=0.001] Epoch 3: train=3.7739, val=3.8000
[hd=128, lr=0.001] Epoch 4: train=3.5165, val=3.1176
[hd=128, lr=0.001] Epoch 5: train=3.2068, val=2.8053
[hd=128, lr=0.001] Epoch 6: train=2.7851, val=2.3730
[hd=128, lr=0.001] Epoch 7: train=2.1014, val=1.8786
[hd=128, lr=0.001] Epoch 8: train=1.7370, val=1.5704
[hd=128, lr=0.001] Epoch 9: train=1.5280, val=1.4568
[hd=128, lr=0.001] Epoch 10: train=1.4522, val=1.5396
[hd=128, lr=0.001] Epoch 11: train=1.3617, val=1.3525
[hd=128, lr=0.001] Epoch 12: train=1.3262, val=1.4631
[hd=128, lr=0.001] Epoch 13: train=1.2690, val=1.4267
[hd=128, lr=0.001] Epoch 14: train=1.2444, val=1.2320
[hd=128, lr=0.001] Epoch 15: train=1.2066, val=1.2118
[hd=128, lr=0.001] Epoch 16: train=1.1723, val=1.2130
[hd=128, lr=0.001] Epoch 17: train=1.1372, val=1.1835
[hd=128, lr=0.001] Epoch 18: train=1.130

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

X_train, X_test, y_train, y_test = train_test_split(
    X_all, Y_all, test_size=0.2, random_state=42
)

train_ds = TensorDataset(
    torch.from_numpy(X_train),  # (N_samples, D+1)
    torch.from_numpy(y_train),  # (N_samples, 2)
)
test_ds = TensorDataset(
    torch.from_numpy(X_test),
    torch.from_numpy(y_test),
)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=256, shuffle=False)


class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )
    def forward(self, x):
        return self.net(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_dim = X_all.shape[1]

model = TimeConditionedMLP(in_dim=in_dim, hidden_dim=128, out_dim=2).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.0025)

for epoch in range(75):
    model.train()
    total_loss = 0.0
    n = 0
    for xb, yb in train_loader:
        xb = xb.to(device).float()
        yb = yb.to(device).float()

        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
        n += xb.size(0)
    print(f"Epoch {epoch+1}: train MSE={total_loss/n:.4f}")

Epoch 1: train MSE=13.8809
Epoch 2: train MSE=5.3395
Epoch 3: train MSE=3.3607
Epoch 4: train MSE=2.4103
Epoch 5: train MSE=2.0325
Epoch 6: train MSE=1.8106
Epoch 7: train MSE=1.6705
Epoch 8: train MSE=1.5075
Epoch 9: train MSE=1.4319
Epoch 10: train MSE=1.3599
Epoch 11: train MSE=1.3028
Epoch 12: train MSE=1.2703
Epoch 13: train MSE=1.2543
Epoch 14: train MSE=1.2273
Epoch 15: train MSE=1.2061
Epoch 16: train MSE=1.1722
Epoch 17: train MSE=1.1794
Epoch 18: train MSE=1.1544
Epoch 19: train MSE=1.1563
Epoch 20: train MSE=1.1255
Epoch 21: train MSE=1.1455
Epoch 22: train MSE=1.1133
Epoch 23: train MSE=1.1113
Epoch 24: train MSE=1.0993
Epoch 25: train MSE=1.1084
Epoch 26: train MSE=1.0819
Epoch 27: train MSE=1.0758
Epoch 28: train MSE=1.0801
Epoch 29: train MSE=1.0694
Epoch 30: train MSE=1.0679
Epoch 31: train MSE=1.0719
Epoch 32: train MSE=1.0625
Epoch 33: train MSE=1.0572
Epoch 34: train MSE=1.0481
Epoch 35: train MSE=1.0404
Epoch 36: train MSE=1.0338
Epoch 37: train MSE=1.0303
Epoch 38: