In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from src.prepData import load_train_data, normalize_input_fields, normalize_output_fields

## Data Prep

#### Load Data

In [2]:
print("Loading data...")
input_df, output_df = load_train_data()
# input_df.to_pickle("data/personal/input_df.pkl")
# output_df.to_pickle("data/personal/output_df.pkl")

print(f"Loaded {len(input_df)} input rows, {len(output_df)} output rows")
print(
  f"Unique plays: {input_df[['game_id', 'play_id']].drop_duplicates().shape[0]}"
)

Loading data...
Loaded 4880579 input rows, 562936 output rows
Unique plays: 14108


#### Normalize fields

In [3]:
input_df = normalize_input_fields(input_df)
norm_helper = input_df[['game_id','play_id','play_direction','absolute_yardline_number']].drop_duplicates()
output_df = normalize_output_fields(output_df, norm_helper)

#### Create play-level features

In [4]:
distinct_plays = input_df[['game_id', 'play_id']].drop_duplicates()
distinct_plays.sort_values(['game_id','play_id']).head(3)

Unnamed: 0,game_id,play_id
0,2023090700,101
234,2023090700,194
650,2023090700,219


In [5]:
# Get max frame_id from input_df for each play (throw_frame_id baseline)
input_max_frames = (
    input_df
    .groupby(['game_id', 'play_id'])[['frame_id', "ball_land_x_std","ball_land_y_std"]]
    .max()
    .reset_index()
    .rename(columns={'frame_id': 'throw_frame_id'})
)

# Get max frame_id from output_df for each play (throw_land_frame_id baseline)
output_max_frames = (
    output_df
    .groupby(['game_id', 'play_id'])[['frame_id']]
    .max()
    .reset_index()
    .rename(columns={'frame_id': 'throw_land_frame_id'})
)

# Combine both into baseline frame info
baseline_frame_info = input_max_frames.merge(
    output_max_frames,
    on=['game_id', 'play_id'],
    how='outer'
)

print(f"Baseline frame info shape: {baseline_frame_info.shape}")
print(f"Unique plays: {baseline_frame_info.shape[0]}")
baseline_frame_info.head(2)

Baseline frame info shape: (14108, 6)
Unique plays: 14108


Unnamed: 0,game_id,play_id,throw_frame_id,ball_land_x_std,ball_land_y_std,throw_land_frame_id
0,2023090700,101,26,21.259998,-0.22,21
1,2023090700,194,32,4.059998,31.55,9


In [6]:
K_NEIGH = 6
RADIUS = 30.0
TAU = 8

def compute_neighbor_embeddings(input_df: pd.DataFrame,
                                k_neigh: int = K_NEIGH,
                                radius: float = RADIUS,
                                tau: float = TAU,
                                frames_back: int = 0) -> pd.DataFrame:
    cols_needed = ["game_id","play_id","nfl_id","frame_id","x_std","y_std",
                   "s_x_std","s_y_std","player_side", "player_role", "o_std"]
    src = input_df[cols_needed].copy()

    # NEW: Take Nth frame from the end
    last = (src.sort_values(["game_id","play_id","nfl_id","frame_id"])
           .groupby(["game_id","play_id","nfl_id"], as_index=False)
           .nth(-1 - frames_back)  # -1 = last, -2 = second-to-last, etc.
           .rename(columns={"frame_id":"obs_frame_id"})
           .reset_index(drop=True)).copy()

    tmp = last.merge(
        src.rename(columns={
            "frame_id":"nb_frame_id",
            "nfl_id":"nfl_id_nb",
            "x_std":"x_nb",
            "y_std":"y_nb",
            "s_x_std":"vx_nb","s_y_std":"vy_nb","player_side":"player_side_nb",
            "player_role":"player_role_nb",
            "o_std":"o_nb",
        }),
        left_on=["game_id","play_id","obs_frame_id"],
        right_on=["game_id","play_id","nb_frame_id"],
        how="left",
    ).copy()
    tmp = tmp[tmp["nfl_id_nb"] != tmp["nfl_id"]]

    tmp["dx"]  = tmp["x_nb"] - tmp["x_std"]
    tmp["dy"]  = tmp["y_nb"] - tmp["y_std"]
    tmp["dvx"] = tmp["vx_nb"] - tmp["s_x_std"]
    tmp["dvy"] = tmp["vy_nb"] - tmp["s_y_std"]
    tmp["dist"] = np.sqrt(tmp["dx"]**2 + tmp["dy"]**2)
    tmp = tmp[np.isfinite(tmp["dist"]) & (tmp["dist"] > 1e-6)]
    if radius is not None: tmp = tmp[tmp["dist"] <= radius]

    tmp["is_ally"] = (tmp["player_side_nb"].fillna("") == tmp["player_side"].fillna("")).astype(np.float32)
    keys = ["game_id","play_id","nfl_id"]
    tmp["rnk"] = tmp.groupby(keys)["dist"].rank(method="first")
    if k_neigh is not None: tmp = tmp[tmp["rnk"] <= float(k_neigh)]

    tmp["w"] = np.exp(-tmp["dist"] / float(tau))
    sum_w = tmp.groupby(keys)["w"].transform("sum"); tmp["wn"] = np.where(sum_w>0, tmp["w"]/sum_w, 0.0)
    tmp["wn_ally"] = tmp["wn"] * tmp["is_ally"]; tmp["wn_opp"]  = tmp["wn"] * (1.0 - tmp["is_ally"])
    for col in ["dx","dy","dvx","dvy"]:
        tmp[f"{col}_ally_w"] = tmp[col] * tmp["wn_ally"]
        tmp[f"{col}_opp_w"]  = tmp[col] * tmp["wn_opp"]
    tmp["dist_ally"] = np.where(tmp["is_ally"] > 0.5, tmp["dist"], np.nan)
    tmp["dist_opp"]  = np.where(tmp["is_ally"] < 0.5, tmp["dist"], np.nan)

    ag = tmp.groupby(keys).agg(
        gnn_ally_dx_mean=("dx_ally_w","sum"),
        gnn_ally_dy_mean=("dy_ally_w","sum"),
        gnn_ally_dvx_mean=("dvx_ally_w","sum"),
        gnn_ally_dvy_mean=("dvy_ally_w","sum"),
        gnn_opp_dx_mean=("dx_opp_w","sum"),
        gnn_opp_dy_mean=("dy_opp_w","sum"),
        gnn_opp_dvx_mean=("dvx_opp_w","sum"),
        gnn_opp_dvy_mean=("dvy_opp_w","sum"),
        gnn_ally_cnt=("is_ally","sum"),
        gnn_opp_cnt=("is_ally", lambda s: float(len(s) - s.sum())),
        gnn_ally_dmin=("dist_ally","min"),
        gnn_ally_dmean=("dist_ally","mean"),
        gnn_opp_dmin=("dist_opp","min"),
        gnn_opp_dmean=("dist_opp","mean"),
    ).reset_index()


    near = tmp.loc[tmp["rnk"]<=3, keys+["rnk","dist"]].copy()
    near["rnk"] = near["rnk"].astype(int)
    dwide = near.pivot_table(index=keys, columns="rnk", values="dist", aggfunc="first")
    dwide = dwide.rename(columns={1:"gnn_d1",2:"gnn_d2",3:"gnn_d3"}).reset_index()
    ag = ag.merge(dwide, on=keys, how="left")

    print(ag.shape)


    # ========== ‚úÖ QB Features (computed separately from tmp) ========== #
    # Get QB positions at observation frame
    qb_pos = (src[src["player_role"] == "Passer"]
              .rename(columns={"nfl_id": "nfl_id_qb", "x_std": "x_qb", "y_std": "y_qb", "o_std": "o_qb"})
              [["game_id", "play_id", "frame_id", "nfl_id_qb", "x_qb", "y_qb", "o_qb"]])
    
    # Merge with last (all players at throw frame)
    qb_data = last.merge(
        qb_pos,
        left_on=["game_id", "play_id", "obs_frame_id"],
        right_on=["game_id", "play_id", "frame_id"],
        how="left"
    ).copy()
    
    # Compute QB features
    qb_data["dx_qb"] = qb_data["x_qb"] - qb_data["x_std"]
    qb_data["dy_qb"] = qb_data["y_qb"] - qb_data["y_std"]
    qb_data["gnn_qb_dist"] = np.sqrt(qb_data["dx_qb"]**2 + qb_data["dy_qb"]**2)
    
    qb_data["gnn_qb_angle"] = (90 - np.degrees(np.arctan2(qb_data["dy_qb"], qb_data["dx_qb"]))) % 360
    qb_data["gnn_qb_angle_sin"] = np.sin(np.deg2rad(qb_data["gnn_qb_angle"]))
    qb_data["gnn_qb_angle_cos"] = np.cos(np.deg2rad(qb_data["gnn_qb_angle"]))
    
    # Facing QB
    qb_data["gnn_facing_qb"] = (qb_data["o_std"] - qb_data["gnn_qb_angle"] + 180) % 360 - 180
    qb_data["gnn_facing_qb_abs"] = np.abs(qb_data["gnn_facing_qb"])
    qb_data["gnn_facing_qb_sin"] = np.sin(np.deg2rad(qb_data["gnn_facing_qb"]))
    qb_data["gnn_facing_qb_cos"] = np.cos(np.deg2rad(qb_data["gnn_facing_qb"]))
    
    # Merge into ag
    ag = ag.merge(
        qb_data[keys + ["gnn_qb_dist", "gnn_qb_angle_sin", "gnn_qb_angle_cos",
                        "gnn_facing_qb", "gnn_facing_qb_abs",
                        "gnn_facing_qb_sin", "gnn_facing_qb_cos"]],
        on=keys,
        how="left"
    )

    print(ag.shape)


     # ========== ‚úÖ NEW FEATURES: Nearest Opponent Orientation ========== #
    opp_data = tmp[tmp["is_ally"] < 0.5][keys + ["rnk","dx","dy","dist", "o_std"]].copy()
    opp_data = opp_data.sort_values(keys + ["dist"])
    
    # First nearest opponent
    nearest_opp = opp_data.groupby(keys, as_index=False).first()
    nearest_opp["gnn_nearest_opp_dist"] = nearest_opp["dist"]
    nearest_opp["gnn_nearest_opp_angle"] = (90 - np.degrees(np.arctan2(
        nearest_opp["dy"], nearest_opp["dx"]
    ))) % 360
    nearest_opp["gnn_nearest_opp_angle_sin"] = np.sin(np.deg2rad(nearest_opp["gnn_nearest_opp_angle"]))
    nearest_opp["gnn_nearest_opp_angle_cos"] = np.cos(np.deg2rad(nearest_opp["gnn_nearest_opp_angle"]))
    
     # ‚úÖ NEW: How directly player faces nearest opponent
    nearest_opp["gnn_facing_nearest_opp"] = (
        nearest_opp["o_std"] - nearest_opp["gnn_nearest_opp_angle"] + 180
    ) % 360 - 180
    nearest_opp["gnn_facing_nearest_opp_abs"] = np.abs(nearest_opp["gnn_facing_nearest_opp"])
    nearest_opp["gnn_facing_nearest_opp_sin"] = np.sin(np.deg2rad(nearest_opp["gnn_facing_nearest_opp"]))
    nearest_opp["gnn_facing_nearest_opp_cos"] = np.cos(np.deg2rad(nearest_opp["gnn_facing_nearest_opp"]))
    
    ag = ag.merge(
        nearest_opp[keys + ["gnn_nearest_opp_dist","gnn_nearest_opp_angle_sin","gnn_nearest_opp_angle_cos",
                           "gnn_facing_nearest_opp","gnn_facing_nearest_opp_abs",
                           "gnn_facing_nearest_opp_sin","gnn_facing_nearest_opp_cos"]], 
        on=keys, 
        how="left"
    )

    print(ag.shape)

    # ========== ‚úÖ NEW FEATURES: Second Nearest Opponent ========== #
    second_opp = opp_data.groupby(keys, as_index=False).nth(1).copy()  # 0-indexed, so nth(1) = 2nd row
    second_opp["gnn_second_opp_dist"] = second_opp["dist"]
    second_opp["gnn_second_opp_angle"] = (90 - np.degrees(np.arctan2(
        second_opp["dy"], second_opp["dx"]
    ))) % 360
    second_opp["gnn_second_opp_angle_sin"] = np.sin(np.deg2rad(second_opp["gnn_second_opp_angle"]))
    second_opp["gnn_second_opp_angle_cos"] = np.cos(np.deg2rad(second_opp["gnn_second_opp_angle"]))
    
    # ‚úÖ NEW: How directly player faces 2nd opponent
    second_opp["gnn_facing_second_opp"] = (
        second_opp["o_std"] - second_opp["gnn_second_opp_angle"] + 180
    ) % 360 - 180
    second_opp["gnn_facing_second_opp_abs"] = np.abs(second_opp["gnn_facing_second_opp"])
    second_opp["gnn_facing_second_opp_sin"] = np.sin(np.deg2rad(second_opp["gnn_facing_second_opp"]))
    second_opp["gnn_facing_second_opp_cos"] = np.cos(np.deg2rad(second_opp["gnn_facing_second_opp"]))
    
    ag = ag.merge(
        second_opp[keys + ["gnn_second_opp_dist","gnn_second_opp_angle_sin","gnn_second_opp_angle_cos",
                          "gnn_facing_second_opp","gnn_facing_second_opp_abs",
                          "gnn_facing_second_opp_sin","gnn_facing_second_opp_cos"]], 
        on=keys, 
        how="left"
    )
    
    print(ag.shape)

    # ========== Fill NaNs ========== #
    for c in ["gnn_ally_dx_mean","gnn_ally_dy_mean","gnn_ally_dvx_mean","gnn_ally_dvy_mean",
              "gnn_opp_dx_mean","gnn_opp_dy_mean","gnn_opp_dvx_mean","gnn_opp_dvy_mean"]:
        ag[c] = ag[c].fillna(0.0)
    
    for c in ["gnn_ally_cnt","gnn_opp_cnt"]:
        ag[c] = ag[c].fillna(0.0)
    
    # Distance features
    dist_cols = ["gnn_ally_dmin","gnn_opp_dmin","gnn_ally_dmean","gnn_opp_dmean",
                 "gnn_d1","gnn_d2","gnn_d3",
                 "gnn_qb_dist","gnn_nearest_opp_dist","gnn_second_opp_dist"]
    for c in dist_cols:
        if c in ag.columns:
            ag[c] = ag[c].fillna(radius if radius is not None else 30.0)
    
    # Angle features (sin/cos)
    angle_cols = ["gnn_qb_angle_sin","gnn_qb_angle_cos",
                  "gnn_nearest_opp_angle_sin","gnn_nearest_opp_angle_cos",
                  "gnn_second_opp_angle_sin","gnn_second_opp_angle_cos",
                  "gnn_facing_qb_sin","gnn_facing_qb_cos",
                  "gnn_facing_nearest_opp_sin","gnn_facing_nearest_opp_cos",
                  "gnn_facing_second_opp_sin","gnn_facing_second_opp_cos"]
    for c in angle_cols:
        if c in ag.columns:
            ag[c] = ag[c].fillna(0.0)
    
    # Raw angle differences (for interpretability, not modeling)
    raw_angle_cols = ["gnn_facing_qb","gnn_facing_qb_abs",
                      "gnn_facing_nearest_opp","gnn_facing_nearest_opp_abs",
                      "gnn_facing_second_opp","gnn_facing_second_opp_abs"]
    for c in raw_angle_cols:
        if c in ag.columns:
            ag[c] = ag[c].fillna(0.0)

    # ‚úÖ NEW: Add lag suffix if frames_back > 0
    if frames_back > 0:
        lag_suffix = f"_lag{frames_back}"
        ag = ag.rename(columns={
            col: col + lag_suffix 
            for col in ag.columns 
            if col not in ["game_id", "play_id", "nfl_id"]
        })

    print(ag.shape)
    return ag


In [7]:
# Create all play-level features
qb_frame = input_df[input_df['player_role'] == 'Passer']
if qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0] < len(distinct_plays):
    print(f"Warning: fewer plays with QB ({qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0]}) than original plays ({len(distinct_plays)})")

# Get QB max frame for plays with a passer
qb_max_frame = (
    qb_frame
    .groupby(['game_id', 'play_id', 'nfl_id', 'player_role'])['frame_id']
    .max()
    .reset_index()
)

# Find plays without a passer
plays_with_qb = qb_max_frame[['game_id', 'play_id']].drop_duplicates()
plays_without_qb = (
    distinct_plays
    .merge(plays_with_qb, on=['game_id', 'play_id'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)

# For plays without a passer, use the overall max frame_id
if len(plays_without_qb) > 0:
    print(f"Found {len(plays_without_qb)} plays without a Passer. Using overall max frame_id.")
    
    missing_max_frames = (
        input_df
        .merge(plays_without_qb, on=['game_id', 'play_id'])
        .groupby(['game_id', 'play_id'])['frame_id']
        .max()
        .reset_index()
    )
    
    # Add placeholder columns for nfl_id and player_role
    missing_max_frames['nfl_id'] = None
    missing_max_frames['player_role'] = None
    
    # Combine with QB frames
    qb_max_frame = pd.concat([qb_max_frame, missing_max_frames], ignore_index=True)

# Join back to input_df to get the full row data
qb_rows = pd.merge(
    input_df, 
    qb_max_frame, 
    on=['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_role'], 
    how='inner'
)

# Start with qb_rows
qb_sub = qb_rows.copy()

# Calculate derived features
qb_sub['qb_throw_distance'] = np.sqrt((qb_sub['ball_land_x_std'] - qb_sub['x_std'])**2 + (qb_sub['ball_land_y_std'] - qb_sub['y_std'])**2)
qb_sub['qb_ball_dir'] = (90 - np.degrees(np.arctan2(
    qb_sub['ball_land_y_std'] - qb_sub['y_std'],
    qb_sub['ball_land_x_std'] - qb_sub['x_std']
))) % 360
qb_sub['qb_direction_diff'] = (qb_sub['o_std'] - qb_sub['qb_ball_dir'] + 180) % 360 - 180  # difference between -180 and 180

# Rename frame_id to be QB-specific
qb_sub.rename(columns={'frame_id':'throw_frame_id'}, inplace=True)

# Drop player_to_predict column (not needed for QB)
qb_sub = qb_sub.drop(columns=['player_to_predict'])

# Rename QB kinematic fields to have qb_ prefix
qb_kinematic_fields_rename = {
    "x_std": "qb_x_std",
    "y_std": "qb_y_std",
    "o_std": "qb_o_std",
    "dir_std": "qb_dir_std",
    "s": "qb_s",
    "a": "qb_a"
}
qb_sub = qb_sub.rename(columns=qb_kinematic_fields_rename)

qb_sub = qb_sub.drop(columns=["ball_land_x_std","ball_land_y_std"])

qb_sub.head(3)

Found 3 plays without a Passer. Using overall max frame_id.


Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,qb_s,qb_a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,absolute_yardline_number_std,qb_x_std,qb_y_std,qb_o_std,qb_dir_std,dx_ball,dy_ball,dist_ball,angle_to_ball,angle_to_ball_minus_dir,angle_to_ball_minus_o,s_x_std,s_y_std,s_parallel,s_perp,dir_std_sin,dir_std_cos,o_std_sin,o_std_cos,angle_to_ball_sin,angle_to_ball_cos,angle_to_ball_minus_dir_sin,angle_to_ball_minus_dir_cos,angle_to_ball_minus_o_sin,angle_to_ball_minus_o_cos,height_in,birth_year,qb_throw_distance,qb_ball_dir,qb_direction_diff
0,2023090700,101,43290,26,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,35.41,29.99,0.64,0.47,108.83,212.25,21,63.259998,-0.22,1,42,-6.59,29.99,212.25,108.83,27.849998,-30.21,41.088521,137.327657,28.497657,-74.922343,0.605747,-0.206567,0.562455,0.305359,0.94648,-0.322761,-0.533615,-0.845728,0.677805,-0.735242,0.477123,0.878837,-0.965574,0.260128,76.0,1994,41.08852,137.327657,74.922343
1,2023090700,194,44822,32,left,89,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,97.62,29.67,0.96,1.64,185.14,285.7,9,84.940002,21.75,1,31,-8.62,23.63,105.7,5.14,12.679998,7.92,14.95021,58.010861,52.870861,-47.689139,0.086006,0.95614,0.579469,0.765386,0.08959,0.995979,0.962692,-0.2706,0.848149,0.529758,0.797277,0.603614,-0.739504,0.673153,75.0,1995,14.950209,58.010861,47.689139
2,2023090700,219,44822,17,left,79,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,85.87,22.97,1.49,2.76,133.64,245.38,8,75.849998,11.49,1,41,-6.87,30.33,65.38,313.64,10.020002,11.48,15.23781,41.115185,87.475185,-24.264815,-1.078298,1.028286,0.065638,1.488553,-0.72369,0.690125,0.909091,0.416598,0.657575,0.753389,0.999029,0.044052,-0.410955,0.911656,75.0,1995,15.237809,41.115185,24.264815


In [8]:
# Just prove only one player per output
input_unique_players = input_df[['game_id', 'play_id', 'nfl_id', 'player_role', 'player_side']].drop_duplicates()
output_unique_players = output_df[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

a = output_unique_players.merge(input_unique_players[['game_id','play_id','nfl_id','player_role','player_side']], on=['game_id', 'play_id', 'nfl_id'], how='inner', indicator=True)
b = a.loc[a['player_side'] == 'Offense', ['game_id','play_id','nfl_id']].groupby(['game_id','play_id']).nunique().reset_index()
b['nfl_id'].value_counts()

nfl_id
1    14108
Name: count, dtype: int64

In [9]:
# Create all play-level features
qb_frame = input_df[input_df['player_role'] == 'Passer']
if qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0] < len(distinct_plays):
    print(f"Warning: fewer plays with QB ({qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0]}) than original plays ({len(distinct_plays)})")

# Get QB max frame for plays with a passer
qb_max_frame = (
    qb_frame
    .groupby(['game_id', 'play_id', 'nfl_id', 'player_role'])['frame_id']
    .max()
    .reset_index()
)

# Find plays without a passer
plays_with_qb = qb_max_frame[['game_id', 'play_id']].drop_duplicates()
plays_without_qb = (
    distinct_plays
    .merge(plays_with_qb, on=['game_id', 'play_id'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)

# For plays without a passer, use the overall max frame_id
if len(plays_without_qb) > 0:
    print(f"Found {len(plays_without_qb)} plays without a Passer. Using overall max frame_id.")
    
    missing_max_frames = (
        input_df
        .merge(plays_without_qb, on=['game_id', 'play_id'])
        .groupby(['game_id', 'play_id'])['frame_id']
        .max()
        .reset_index()
    )
    
    # Add placeholder columns for nfl_id and player_role
    missing_max_frames['nfl_id'] = None
    missing_max_frames['player_role'] = None
    
    # Combine with QB frames
    qb_max_frame = pd.concat([qb_max_frame, missing_max_frames], ignore_index=True)

# Join back to input_df to get the full row data
qb_rows = pd.merge(
    input_df, 
    qb_max_frame, 
    on=['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_role'], 
    how='inner'
)

# Start with qb_rows
qb_sub = qb_rows.copy()

# Calculate derived features
qb_sub['qb_throw_distance'] = np.sqrt((qb_sub['ball_land_x_std'] - qb_sub['x_std'])**2 + (qb_sub['ball_land_y_std'] - qb_sub['y_std'])**2)
qb_sub['qb_ball_dir'] = (90 - np.degrees(np.arctan2(
    qb_sub['ball_land_y_std'] - qb_sub['y_std'],
    qb_sub['ball_land_x_std'] - qb_sub['x_std']
))) % 360
qb_sub['qb_direction_diff'] = (qb_sub['o_std'] - qb_sub['qb_ball_dir'] + 180) % 360 - 180  # difference between -180 and 180

# Rename frame_id to be QB-specific
qb_sub.rename(columns={'frame_id':'throw_frame_id'}, inplace=True)

# Drop player_to_predict column (not needed for QB)
qb_sub = qb_sub.drop(columns=['player_to_predict'])

# Rename QB kinematic fields to have qb_ prefix
qb_kinematic_fields_rename = {
    "x_std": "qb_x_std",
    "y_std": "qb_y_std",
    "o_std": "qb_o_std",
    "dir_std": "qb_dir_std",
    "s": "qb_s",
    "a": "qb_a"
}
qb_sub = qb_sub.rename(columns=qb_kinematic_fields_rename)

qb_sub = qb_sub.drop(columns=["ball_land_x_std","ball_land_y_std"])

qb_sub.head(3)

Found 3 plays without a Passer. Using overall max frame_id.


Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,qb_s,qb_a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,absolute_yardline_number_std,qb_x_std,qb_y_std,qb_o_std,qb_dir_std,dx_ball,dy_ball,dist_ball,angle_to_ball,angle_to_ball_minus_dir,angle_to_ball_minus_o,s_x_std,s_y_std,s_parallel,s_perp,dir_std_sin,dir_std_cos,o_std_sin,o_std_cos,angle_to_ball_sin,angle_to_ball_cos,angle_to_ball_minus_dir_sin,angle_to_ball_minus_dir_cos,angle_to_ball_minus_o_sin,angle_to_ball_minus_o_cos,height_in,birth_year,qb_throw_distance,qb_ball_dir,qb_direction_diff
0,2023090700,101,43290,26,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,35.41,29.99,0.64,0.47,108.83,212.25,21,63.259998,-0.22,1,42,-6.59,29.99,212.25,108.83,27.849998,-30.21,41.088521,137.327657,28.497657,-74.922343,0.605747,-0.206567,0.562455,0.305359,0.94648,-0.322761,-0.533615,-0.845728,0.677805,-0.735242,0.477123,0.878837,-0.965574,0.260128,76.0,1994,41.08852,137.327657,74.922343
1,2023090700,194,44822,32,left,89,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,97.62,29.67,0.96,1.64,185.14,285.7,9,84.940002,21.75,1,31,-8.62,23.63,105.7,5.14,12.679998,7.92,14.95021,58.010861,52.870861,-47.689139,0.086006,0.95614,0.579469,0.765386,0.08959,0.995979,0.962692,-0.2706,0.848149,0.529758,0.797277,0.603614,-0.739504,0.673153,75.0,1995,14.950209,58.010861,47.689139
2,2023090700,219,44822,17,left,79,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,85.87,22.97,1.49,2.76,133.64,245.38,8,75.849998,11.49,1,41,-6.87,30.33,65.38,313.64,10.020002,11.48,15.23781,41.115185,87.475185,-24.264815,-1.078298,1.028286,0.065638,1.488553,-0.72369,0.690125,0.909091,0.416598,0.657575,0.753389,0.999029,0.044052,-0.410955,0.911656,75.0,1995,15.237809,41.115185,24.264815


In [10]:
qb_features = ["qb_x_std", 
               "qb_y_std", 
               "qb_s", 
               "qb_a", 
               "qb_dir_std", 
               "qb_o_std", 
               "qb_throw_distance", 
               "qb_ball_dir"]

play_level_features = baseline_frame_info.merge(
  qb_sub[['game_id','play_id'] + qb_features], 
  how = 'left', 
  on = ['game_id','play_id'])

def impute_qb_features_safe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing QB features using ball trajectory (always available)
    This is 'safe' because ball_land_x/y are inputs, not targets
    """
    mask = df['qb_x_std'].isnull()
    
    if mask.sum() > 0:
        # Proxy: assume QB was ~10 yards behind ball landing
        df.loc[mask, 'qb_x_std'] = df.loc[mask, 'ball_land_x_std'] - 10
        df.loc[mask, 'qb_y_std'] = 26.7  # assume center of field
        
        # Proxy: assume QB was stationary (conservative)
        df.loc[mask, 'qb_s'] = 0.0
        df.loc[mask, 'qb_a'] = 0.0
      
        # Throw distance from imputed position
        df.loc[mask, 'qb_throw_distance'] = np.sqrt(
            (df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std'])**2 +
            (df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'])**2
        )

        # Proxy: QB facing ball direction
        df.loc[mask, 'qb_o_std'] = (90 - np.degrees(np.arctan2(
            df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'],
            df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std']
        ))) % 360
        df.loc[mask, 'qb_dir_std'] = df.loc[mask, 'qb_o_std']

        df.loc[mask, 'qb_ball_dir'] = (90 - np.degrees(np.arctan2(
            df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'],
            df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std']
        ))) % 360
    
    return df

# Apply BEFORE split
play_level_features = impute_qb_features_safe(play_level_features)


# Encode angles as sin/cos
for col in ["qb_o_std", "qb_dir_std",
             "qb_ball_dir",
             ]:
    rad = np.deg2rad(play_level_features[col])
    play_level_features[col + "_sin"] = np.sin(rad)
    play_level_features[col + "_cos"] = np.cos(rad)


In [11]:
x_data = baseline_frame_info.merge(
    input_df[input_df['player_to_predict'] == True],
    left_on = ['game_id','play_id'],
    right_on = ['game_id','play_id'],
    how = 'inner'
)

baseline_play_features =[
    'throw_frame_id',
    'throw_land_frame_id'
]

player_level_features = [
    'frame_id',
    'height_in',
    'player_weight',
    'birth_year',
    'player_position',
    'player_side',
    'player_role',
    'x_std',
    'y_std',
    'o_std',
    'o_std_sin',
    'o_std_cos',
    'dir_std',
    'dir_std_sin',
    'dir_std_cos',
    's',
    'a',
    "dx_ball",
    "dy_ball",
    "dist_ball",
    "angle_to_ball",
    "angle_to_ball_sin",
    "angle_to_ball_cos",
    "angle_to_ball_minus_dir",
    "angle_to_ball_minus_dir_sin",
    "angle_to_ball_minus_dir_cos",
    "angle_to_ball_minus_o",
    "angle_to_ball_minus_o_sin",
    "angle_to_ball_minus_o_cos",
    "s_x_std",
    "s_y_std",
    "s_parallel",
    "s_perp",]

x_data = x_data[['game_id','play_id','nfl_id'] + baseline_play_features + player_level_features].copy()

In [12]:
x_data_last = x_data[x_data['frame_id'] == x_data['throw_frame_id']].copy()
play_level_features_cols = [i for i in list(play_level_features) if i not in ['throw_frame_id','throw_land_frame_id']]
x_data_last = x_data_last.merge(play_level_features[play_level_features_cols], on = ['game_id','play_id'])


gnn_features = compute_neighbor_embeddings(input_df)  # Returns per (game, play, nfl_id)
x_data_last = x_data_last.merge(
    gnn_features, 
    on=['game_id', 'play_id', 'nfl_id'], 
    how='left'
)

# gnn_features_lag5 = compute_neighbor_embeddings(input_df, frames_back=5)
# x_data_last = x_data_last.merge(
#     gnn_features_lag5,
#     on=['game_id', 'play_id', 'nfl_id'], 
#     how='left'
# )

# gnn_features_lag10 = compute_neighbor_embeddings(input_df, frames_back=10)
# x_data_last = x_data_last.merge(
#     gnn_features_lag10,
#     on=['game_id', 'play_id', 'nfl_id'], 
#     how='left'
# )

# ‚úÖ UPDATED: Fill NaNs for ALL GNN columns (including lag variants)
gnn_base_cols = [
    'gnn_ally_dx_mean', 'gnn_ally_dy_mean', 'gnn_ally_dvx_mean', 'gnn_ally_dvy_mean',
    'gnn_opp_dx_mean', 'gnn_opp_dy_mean', 'gnn_opp_dvx_mean', 'gnn_opp_dvy_mean',
    'gnn_ally_cnt', 'gnn_opp_cnt',
    'gnn_ally_dmin', 'gnn_ally_dmean', 'gnn_opp_dmin', 'gnn_opp_dmean',
    'gnn_d1', 'gnn_d2', 'gnn_d3',
    'gnn_qb_dist', 'gnn_qb_angle_sin', 'gnn_qb_angle_cos',
    'gnn_facing_qb', 'gnn_facing_qb_abs', 'gnn_facing_qb_sin', 'gnn_facing_qb_cos',
    'gnn_nearest_opp_dist', 'gnn_nearest_opp_angle_sin', 'gnn_nearest_opp_angle_cos',
    'gnn_facing_nearest_opp', 'gnn_facing_nearest_opp_abs',
    'gnn_facing_nearest_opp_sin', 'gnn_facing_nearest_opp_cos',
    'gnn_second_opp_dist', 'gnn_second_opp_angle_sin', 'gnn_second_opp_angle_cos',
    'gnn_facing_second_opp', 'gnn_facing_second_opp_abs',
    'gnn_facing_second_opp_sin', 'gnn_facing_second_opp_cos',
]

# Generate all possible column names (base + lag5 + lag10)
all_gnn_cols = gnn_base_cols.copy()
for lag in [5, 10]:
    all_gnn_cols.extend([f"{col}_lag{lag}" for col in gnn_base_cols])

# Fill NaNs based on column type
for c in all_gnn_cols:
    if c not in x_data_last.columns:
        continue  # Skip if column doesn't exist
    
    # Count columns
    if 'cnt' in c:
        x_data_last[c] = x_data_last[c].fillna(0.0)
    
    # Distance columns
    elif any(dist in c for dist in ['_dmin', '_dmean', '_d1', '_d2', '_d3', '_dist']):
        x_data_last[c] = x_data_last[c].fillna(RADIUS)
    
    # Angle sin/cos columns
    elif c.endswith('_sin') or c.endswith('_cos'):
        x_data_last[c] = x_data_last[c].fillna(0.0)
    
    # Raw angle columns (facing_*, *_abs)
    elif 'facing_' in c or c.endswith('_abs'):
        x_data_last[c] = x_data_last[c].fillna(0.0)
    
    # Weighted mean columns (dx_mean, dy_mean, dvx_mean, dvy_mean)
    elif any(suffix in c for suffix in ['dx_mean', 'dy_mean', 'dvx_mean', 'dvy_mean']):
        x_data_last[c] = x_data_last[c].fillna(0.0)
    
    else:
        # Default: fill with 0 for any unmatched GNN column
        x_data_last[c] = x_data_last[c].fillna(0.0)

x_data_last.sort_values(['game_id','play_id','nfl_id'], inplace=True)
x_data_last.head(3)

(173147, 20)
(173147, 27)
(173147, 34)
(173147, 41)
(173147, 41)


Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,throw_land_frame_id,frame_id,height_in,player_weight,birth_year,player_position,player_side,player_role,x_std,y_std,o_std,o_std_sin,o_std_cos,dir_std,dir_std_sin,dir_std_cos,s,a,dx_ball,dy_ball,dist_ball,angle_to_ball,angle_to_ball_sin,angle_to_ball_cos,angle_to_ball_minus_dir,angle_to_ball_minus_dir_sin,angle_to_ball_minus_dir_cos,angle_to_ball_minus_o,angle_to_ball_minus_o_sin,angle_to_ball_minus_o_cos,s_x_std,s_y_std,s_parallel,s_perp,ball_land_x_std,ball_land_y_std,qb_x_std,qb_y_std,qb_s,qb_a,qb_dir_std,qb_o_std,qb_throw_distance,qb_ball_dir,qb_o_std_sin,qb_o_std_cos,qb_dir_std_sin,qb_dir_std_cos,qb_ball_dir_sin,qb_ball_dir_cos,gnn_ally_dx_mean,gnn_ally_dy_mean,gnn_ally_dvx_mean,gnn_ally_dvy_mean,gnn_opp_dx_mean,gnn_opp_dy_mean,gnn_opp_dvx_mean,gnn_opp_dvy_mean,gnn_ally_cnt,gnn_opp_cnt,gnn_ally_dmin,gnn_ally_dmean,gnn_opp_dmin,gnn_opp_dmean,gnn_d1,gnn_d2,gnn_d3,gnn_qb_dist,gnn_qb_angle_sin,gnn_qb_angle_cos,gnn_facing_qb,gnn_facing_qb_abs,gnn_facing_qb_sin,gnn_facing_qb_cos,gnn_nearest_opp_dist,gnn_nearest_opp_angle_sin,gnn_nearest_opp_angle_cos,gnn_facing_nearest_opp,gnn_facing_nearest_opp_abs,gnn_facing_nearest_opp_sin,gnn_facing_nearest_opp_cos,gnn_second_opp_dist,gnn_second_opp_angle_sin,gnn_second_opp_angle_cos,gnn_facing_second_opp,gnn_facing_second_opp_abs,gnn_facing_second_opp_sin,gnn_facing_second_opp_cos
2,2023090700,101,44930,26,21,26,75.0,196,1995,WR,Offense,Targeted Receiver,10.43,14.14,106.8,0.957319,-0.289032,99.25,0.986996,-0.160743,7.9,2.68,10.829998,-14.36,17.986064,142.977199,0.602133,-0.798396,43.727199,0.691226,0.722639,36.177199,0.590284,0.807195,7.797271,-1.269866,5.708848,5.460681,21.259998,-0.22,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242,-1.392319,0.461787,-1.808016,-1.259673,-0.689965,2.457668,-4.179921,-1.46985,2.0,4.0,5.641392,14.003193,4.735652,10.028938,4.735652,4.89418,5.641392,23.257319,-0.731813,0.681506,153.838569,153.838569,0.440902,-0.897555,4.735652,-0.933346,-0.358979,-142.162489,142.162489,-0.613424,-0.789754,4.89418,0.692659,0.721265,62.959005,62.959005,0.890681,0.454628
0,2023090700,101,46137,26,21,26,73.0,204,1997,SS,Defense,Defensive Coverage,13.82,17.67,184.99,-0.086982,-0.99621,134.17,0.717276,-0.69679,5.34,1.8,7.439998,-17.89,19.375389,157.41881,0.383992,-0.923336,23.24881,0.394725,0.918799,-27.57119,-0.46285,0.886436,3.830251,-3.720857,4.906389,2.10783,21.259998,-0.22,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242,-2.775771,1.107731,-0.955995,0.30966,-2.861945,-0.894189,0.759544,0.431643,3.0,3.0,7.927074,11.767292,4.89418,11.44679,4.89418,7.927074,9.399415,23.840103,-0.85612,0.516776,-116.126259,116.126259,-0.897826,-0.440351,4.89418,-0.692659,-0.721265,-38.850995,38.850995,-0.627297,0.77878,9.97349,-0.899384,-0.437159,-59.087251,59.087251,-0.857951,0.513732
1,2023090700,101,52546,26,21,26,73.0,193,1997,CB,Defense,Defensive Coverage,6.01,12.44,309.47,-0.771958,0.635674,192.18,-0.210984,-0.97749,2.93,4.75,15.249998,-12.66,19.820144,129.698237,0.769419,-0.638744,-62.481763,-0.886864,0.462031,-179.771763,-0.003983,-0.999992,-0.618182,-2.864045,1.353751,-2.598511,21.259998,-0.22,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242,1.307974,2.122338,1.252193,-0.214571,0.358924,1.903108,2.871676,-0.904191,2.0,4.0,9.399415,9.515372,1.45,12.979936,1.45,4.735652,9.399415,21.604687,-0.583207,0.812324,-14.853592,14.853592,-0.25635,0.966584,1.45,-0.8,0.6,2.600102,2.600102,0.045365,0.99897,4.735652,0.933346,0.358979,-119.492489,119.492489,-0.87042,-0.492309


In [13]:
y_data = output_df.merge(
    baseline_frame_info[['game_id','play_id']], 
    on=['game_id','play_id']
)

y_data.sort_values(['game_id','play_id','nfl_id', 'frame_id'], inplace=True)

In [14]:
def hybrid_trajectory_interpolation(x_data, y_data, frame_rate=10, blend_factor=0.5):
    """
    Hybrid: blend velocity projection (early) with ball-directed (late)
    blend_factor: 0 = pure velocity, 1 = pure ball-directed
    """
    results = []
    
    for idx, row in x_data.iterrows():
        if idx % 10000 == 0:
            print(f"Processing row {idx}/{len(x_data)}")
        gid = row['game_id']
        pid = row['play_id']
        nid = row['nfl_id']
        
        x_throw = row['x_std']
        y_throw = row['y_std']
        vx = row['s_x_std']
        vy = row['s_y_std']
        x_land = row['ball_land_x_std']
        y_land = row['ball_land_y_std']
        throw_frame = row['throw_frame_id']
        
        traj_frames = y_data[
            (y_data['game_id'] == gid) &
            (y_data['play_id'] == pid) &
            (y_data['nfl_id'] == nid)
        ].sort_values('frame_id')
        
        if traj_frames.empty:
            continue
        
        frame_ids = traj_frames['frame_id'].values
        n_frames = len(frame_ids)
        
        for i, fid in enumerate(frame_ids):
            dt = (fid) / frame_rate
            t_norm = i / max(n_frames - 1, 1)  # 0 to 1
            
            # Velocity projection
            x_vel = x_throw + vx * dt
            y_vel = y_throw + vy * dt
            
            # Ball-directed interpolation
            x_ball = x_throw + t_norm * (x_land - x_throw)
            y_ball = y_throw + t_norm * (y_land - y_throw)
            
            # Blend: early frames favor velocity, late frames favor ball
            alpha = t_norm * blend_factor
            x_hybrid = (1 - alpha) * x_vel + alpha * x_ball
            y_hybrid = (1 - alpha) * y_vel + alpha * y_ball
            
            results.append({
                'game_id': gid,
                'play_id': pid,
                'nfl_id': nid,
                'frame_id': fid,
                'x_std_hybrid': x_hybrid,
                'y_std_hybrid': y_hybrid,
            })
    
    return pd.DataFrame(results)

# Generate hybrid trajectories
hybrid_traj = hybrid_trajectory_interpolation(x_data_last, y_data, blend_factor=0.7)
y_with_hybrid = y_data.merge(hybrid_traj, on=['game_id', 'play_id', 'nfl_id', 'frame_id'])

y_with_hybrid.shape

Processing row 0/46045
Processing row 10000/46045
Processing row 20000/46045
Processing row 30000/46045
Processing row 40000/46045


(562936, 14)

In [15]:
import numpy as np

def calculate_kaggle_rmse(df):
    """
    Calculate RMSE per Kaggle's formula
    df should have: x_std, y_std (actual), x_std_hybrid, y_std_hybrid (predicted)
    """
    # Calculate squared errors per frame
    squared_errors = (
        (df['x_std'] - df['x_std_hybrid'])**2 + 
        (df['y_std'] - df['y_std_hybrid'])**2
    )
    
    # RMSE = sqrt(mean of squared distances)
    rmse = np.sqrt(squared_errors.mean())
    
    return rmse

# Calculate overall RMSE
overall_rmse = calculate_kaggle_rmse(y_with_hybrid)
print(f"\n{'='*50}")
print(f"üèà Hybrid Baseline RMSE: {overall_rmse:.4f} yards")
print(f"{'='*50}\n")

# # Calculate per-frame RMSE (to see if error grows over time)
# frame_rmse = y_with_hybrid.groupby('frame_id').apply(
#     lambda g: np.sqrt(((g['x_std'] - g['x_std_hybrid'])**2 + 
#                        (g['y_std'] - g['y_std_hybrid'])**2).mean())
# ).reset_index(name='rmse')

# print("RMSE by frame:")
# print(frame_rmse.head(15))

# # Calculate per-play RMSE (to identify hardest plays)
# play_rmse = y_with_hybrid.groupby(['game_id', 'play_id']).apply(
#     lambda g: np.sqrt(((g['x_std'] - g['x_std_hybrid'])**2 + 
#                        (g['y_std'] - g['y_std_hybrid'])**2).mean())
# ).reset_index(name='rmse')


# print(f"\nPlay-level RMSE statistics:")
# print(play_rmse['rmse'].describe())
# print(f"\nWorst 5 plays:")
# print(play_rmse.nlargest(5, 'rmse'))


# print(f"\nPlay-level RMSE statistics:")
# print(play_rmse['rmse'].describe())
# print(f"\nBest 5 plays:")
# print(play_rmse.nsmallest(5, 'rmse'))


üèà Hybrid Baseline RMSE: 2.8302 yards



In [16]:
y_with_hybrid['target_dx'] = y_with_hybrid['x_std'] - y_with_hybrid['x_std_hybrid']
y_with_hybrid['target_dy'] = y_with_hybrid['y_std'] - y_with_hybrid['y_std_hybrid']

y_with_hybrid.head(10)

y_data = y_with_hybrid[['game_id','play_id','nfl_id','frame_id','target_dx','target_dy', 'x_std','y_std', 'x_std_hybrid', 'y_std_hybrid']].copy()

In [19]:
gnn_base_cols

['gnn_ally_dx_mean',
 'gnn_ally_dy_mean',
 'gnn_ally_dvx_mean',
 'gnn_ally_dvy_mean',
 'gnn_opp_dx_mean',
 'gnn_opp_dy_mean',
 'gnn_opp_dvx_mean',
 'gnn_opp_dvy_mean',
 'gnn_ally_cnt',
 'gnn_opp_cnt',
 'gnn_ally_dmin',
 'gnn_ally_dmean',
 'gnn_opp_dmin',
 'gnn_opp_dmean',
 'gnn_d1',
 'gnn_d2',
 'gnn_d3',
 'gnn_qb_dist',
 'gnn_qb_angle_sin',
 'gnn_qb_angle_cos',
 'gnn_facing_qb',
 'gnn_facing_qb_abs',
 'gnn_facing_qb_sin',
 'gnn_facing_qb_cos',
 'gnn_nearest_opp_dist',
 'gnn_nearest_opp_angle_sin',
 'gnn_nearest_opp_angle_cos',
 'gnn_facing_nearest_opp',
 'gnn_facing_nearest_opp_abs',
 'gnn_facing_nearest_opp_sin',
 'gnn_facing_nearest_opp_cos',
 'gnn_second_opp_dist',
 'gnn_second_opp_angle_sin',
 'gnn_second_opp_angle_cos',
 'gnn_facing_second_opp',
 'gnn_facing_second_opp_abs',
 'gnn_facing_second_opp_sin',
 'gnn_facing_second_opp_cos']

In [20]:
interaction_features = ['x_std',
                        'y_std',
                        's_x_std',
                        's_y_std',
                        "o_std",
                        "dir_std",
                        'height_in',
                        'dist_ball',
                        's_parallel',
                        's_perp',
                        "dx_ball",
                        "dy_ball",
                        "angle_to_ball_sin",
                        "angle_to_ball_cos",
                        "angle_to_ball_minus_dir_sin",
                        "angle_to_ball_minus_dir_cos",
                        "angle_to_ball_minus_o_sin",
                        "angle_to_ball_minus_o_cos",
                       ]

inv_numeric_features = [
    # Predicted player features
    "height_in", "player_weight", "birth_year",
    # Predicted player kinematics
    "x_std", "y_std",
    "s_x_std", "s_y_std",
    "a",  # if present
    "dir_std_sin", "dir_std_cos",
    "o_std_sin", "o_std_cos",
    
    # QB kinematics
    "qb_x_std", "qb_y_std", "qb_s", "qb_a",
    "qb_o_std_sin", "qb_o_std_cos",
    "qb_dir_std_sin", "qb_dir_std_cos",
    
    # Throw features - global
    "throw_frame_id", "throw_land_frame_id",
    "ball_land_x_std", "ball_land_y_std",
    # Time of throw - needs QB kinematics
    "qb_throw_distance", 
    "qb_ball_dir_sin", "qb_ball_dir_cos",

    # Ball-related features
    "dx_ball", "dy_ball", "dist_ball",
    "angle_to_ball_sin", "angle_to_ball_cos",
    "angle_to_ball_minus_dir_sin", "angle_to_ball_minus_dir_cos",
    "angle_to_ball_minus_o_sin", "angle_to_ball_minus_o_cos",
    "s_parallel", "s_perp",
]

inv_numeric_features = inv_numeric_features + gnn_base_cols

motion_features= ["x_std","y_std","s_x_std","s_y_std","dir_std_sin","dir_std_cos","o_std_sin","o_std_cos",
                  "dx_ball","dy_ball","dist_ball","angle_to_ball_sin","angle_to_ball_cos",
                  "angle_to_ball_minus_dir_sin","angle_to_ball_minus_dir_cos",
                  "angle_to_ball_minus_o_sin","angle_to_ball_minus_o_cos",
                  "s_parallel","s_perp"]

inv_categorical_features = [
    "player_position",
    "player_side",
    "player_role",
]

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


preproc_invariant = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), inv_numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), inv_categorical_features),
    ]
)

preproc_invariant.fit(x_data_last[inv_numeric_features + inv_categorical_features])

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [21]:
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import numpy as np

class PlayDataset(Dataset):
    def __init__(
        self,
        x_data_last,
        x_data_all,
        y_data,
        interaction_features,
        inv_numeric_features,
        inv_categorical_features,
        motion_features,
        preproc_invariant,
        T_pre=10,
        device="cpu",
    ):
        self.device = device
        self.interaction_features = interaction_features
        self.inv_numeric_features = inv_numeric_features
        self.inv_categorical_features = inv_categorical_features
        self.motion_features = motion_features
        self.T_pre = T_pre
        self.preproc_invariant = preproc_invariant

        self.samples = []

        # ‚úÖ OPTIMIZATION 1: Pre-group data once outside the loop
        y_grouped = {
            (gid, pid, nid): group.sort_values("frame_id")
            for (gid, pid, nid), group in y_data.groupby(["game_id", "play_id", "nfl_id"])
        }
        
        x_all_grouped = {
            (gid, pid, nid): group.sort_values("frame_id")
            for (gid, pid, nid), group in x_data_all.groupby(["game_id", "play_id", "nfl_id"])
        }

        for (gid, pid), play_df_all in tqdm(x_data_last.groupby(["game_id", "play_id"])):
            play_df = play_df_all.sort_values("nfl_id").reset_index(drop=True)
            nfl_ids = play_df["nfl_id"].tolist()

            # ‚úÖ OPTIMIZATION 2: Collect all player data in one pass
            frames_per_player = []
            targets_per_player = []
            x_true_per_player = []
            y_true_per_player = []
            x_hyb_per_player = []
            y_hyb_per_player = []
            motion_list = []
            T_max = 0

            for nid in nfl_ids:
                # Use pre-grouped dict instead of .query()
                out_rows = y_grouped.get((gid, pid, nid))
                if out_rows is None or out_rows.empty:
                    continue

                frames = out_rows["frame_id"].to_numpy()
                targets = out_rows[['target_dx','target_dy']].to_numpy(dtype='float32')
                x_true = out_rows["x_std"].to_numpy(dtype="float32")
                y_true = out_rows["y_std"].to_numpy(dtype="float32")
                x_hyb  = out_rows["x_std_hybrid"].to_numpy(dtype="float32")
                y_hyb  = out_rows["y_std_hybrid"].to_numpy(dtype="float32")

                frames_per_player.append(frames)
                targets_per_player.append(targets)
                x_true_per_player.append(x_true)
                y_true_per_player.append(y_true)
                x_hyb_per_player.append(x_hyb)
                y_hyb_per_player.append(y_hyb)
                T_max = max(T_max, len(frames))

                # ‚úÖ OPTIMIZATION 3: Motion history lookup
                hist_rows = x_all_grouped.get((gid, pid, nid))
                
                if hist_rows is None or hist_rows.empty:
                    motion_seq = np.zeros((self.T_pre, len(self.motion_features)),
                                          dtype=np.float32)
                else:
                    # Filter by throw_frame_id (vectorized)
                    throw_frame = play_df[play_df["nfl_id"] == nid]["throw_frame_id"].iloc[0]
                    hist_rows = hist_rows[hist_rows["frame_id"] <= throw_frame]
                    
                    vals = hist_rows[self.motion_features].to_numpy(np.float32)
                    if len(vals) >= self.T_pre:
                        motion_seq = vals[-self.T_pre:]
                    else:
                        pad = np.repeat(vals[:1], self.T_pre - len(vals), axis=0)
                        motion_seq = np.concatenate([pad, vals], axis=0)

                motion_list.append(motion_seq)

            if len(frames_per_player) == 0:
                continue

            # ‚úÖ OPTIMIZATION 4: Vectorized tensor creation
            t_norm = torch.linspace(0.0, 1.0, steps=T_max, dtype=torch.float32)
            N = len(targets_per_player)

            # Pre-allocate all tensors at once
            targets_tensor    = torch.zeros(N, T_max, 2, dtype=torch.float32)
            mask              = torch.zeros(N, T_max,    dtype=torch.bool)
            x_true_tensor     = torch.zeros(N, T_max,    dtype=torch.float32)
            y_true_tensor     = torch.zeros(N, T_max,    dtype=torch.float32)
            x_hybrid_tensor   = torch.zeros(N, T_max,    dtype=torch.float32)
            y_hybrid_tensor   = torch.zeros(N, T_max,    dtype=torch.float32)

            # Fill in one loop
            for i, (targ, x_t, y_t, x_h, y_h) in enumerate(
                zip(targets_per_player, x_true_per_player, y_true_per_player,
                    x_hyb_per_player, y_hyb_per_player)
            ):
                Ti = targ.shape[0]
                targets_tensor[i, :Ti, :] = torch.from_numpy(targ)
                mask[i, :Ti] = True
                x_true_tensor[i, :Ti] = torch.from_numpy(x_t)
                y_true_tensor[i, :Ti] = torch.from_numpy(y_t)
                x_hybrid_tensor[i, :Ti] = torch.from_numpy(x_h)
                y_hybrid_tensor[i, :Ti] = torch.from_numpy(y_h)

            X_pair, X_inv = self._build_pairwise_and_invariant(play_df)
            motion_tensor = torch.from_numpy(np.stack(motion_list))

            # Map roles to IDs
            role_series = play_df["player_role"].fillna("Other").astype(str)
            def map_role(r):
                if r == "Targeted Receiver":
                    return 0
                else:  
                    # Defense
                    return 1
                
            role_ids = np.array([map_role(r) for r in role_series], dtype=np.int64)
            role_tensor = torch.from_numpy(role_ids)  # (N,)

            self.samples.append((
                X_pair, X_inv, motion_tensor, t_norm,
                targets_tensor, mask,
                x_true_tensor, y_true_tensor,
                x_hybrid_tensor, y_hybrid_tensor,
                role_tensor,
            ))

    def __len__(self):
        return len(self.samples)

    def _build_pairwise_and_invariant(self, play_df):
        X_int = play_df[self.interaction_features].to_numpy(dtype=np.float32)
        N, F_int = X_int.shape

        feat_i = X_int[:, None, :]
        feat_j = X_int[None, :, :]
        pair_diff = feat_j - feat_i
        X_pair = np.transpose(pair_diff, (2, 0, 1))

        X_inv = self.preproc_invariant.transform(
            play_df[self.inv_numeric_features + self.inv_categorical_features]
        ).astype("float32")

        return torch.from_numpy(X_pair), torch.from_numpy(X_inv)

    def __getitem__(self, idx):
        return self.samples[idx]
    

full_dataset = PlayDataset(
    x_data_last=x_data_last,
    x_data_all = x_data,
    y_data=y_data,  # with proper residual targets!
    interaction_features=interaction_features,
    inv_numeric_features=inv_numeric_features,
    inv_categorical_features=inv_categorical_features,
    motion_features=motion_features,
    preproc_invariant=preproc_invariant,
)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14108/14108 [00:50<00:00, 279.61it/s]


In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MotionEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim=32, out_dim=32):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=in_dim,
                              out_channels=hidden_dim,
                              kernel_size=3,
                              padding=1)
        # self.conv2 = nn.Conv1d(in_channels=hidden_dim,
        #                        out_channels=hidden_dim,
        #                        kernel_size=1,
        #                        padding=0)
        # self.conv3 = nn.Conv1d(in_channels=hidden_dim,
        #                        out_channels=hidden_dim,
        #                        kernel_size=1,
        #                        padding=0)
        self.relu = nn.ReLU()
        self.fc   = nn.Linear(hidden_dim, out_dim)

    def forward(self, motion):  # motion: (B, N, T_pre, F_motion)
        B, N, T, F = motion.shape
        x = motion.view(B * N, T, F).transpose(1, 2)  # (B*N, F, T)
        h = self.relu(self.conv(x))                   # (B*N, H, T)
        # h = self.relu(self.conv2(h))                  # (B*N, H, T)
        # h = self.relu(self.conv3(h))                  # (B*N, H, T
        h = h.mean(dim=-1)                            # (B*N, H) - avg over time
        h = self.fc(h)                                # (B*N, out_dim)
        h = h.view(B, N, -1)                          # (B, N, out_dim)
        return h
    
class PairwiseInteractionEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=64):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=1)
        self.conv3 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (B, F_int, N, N)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))   # (B, C, N, N)
        x = x.mean(dim=3)           # pool over "other player" j ‚Üí (B, C, N)
        x = x.permute(0, 2, 1)      # ‚Üí (B, N, C)
        return x

class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        # x: (..., in_dim)
        return self.net(x)

class FullModel(nn.Module):
    def __init__(self, in_channels, in_motion, inv_dim, hidden_dim=128, enc_hidden=64, enc_out=64):
        super().__init__()
        self.encoder = PairwiseInteractionEncoder(
            in_channels=in_channels,
            hidden_channels=enc_hidden,
            out_channels=enc_out,
        )
        self.motion_encoder = MotionEncoder(
            in_dim=in_motion,
            hidden_dim=enc_hidden,
            out_dim=enc_out,
        )
        
        in_dim = enc_out + enc_out + inv_dim + 1 # enc_out (interaction) + enc_out (motion) + inv_dim + time
        self.head_tr = TimeConditionedMLP(
            in_dim=in_dim,
            hidden_dim=hidden_dim,
            out_dim=2,
        )
        self.head_def = TimeConditionedMLP(
            in_dim=in_dim,
            hidden_dim=hidden_dim,
            out_dim=2,
        )

    def forward(self, X_pair, X_inv, X_motion, t_norm, mask, role_ids):
        """
        X_pair: (B, F_int, N, N)
        X_inv:  (B, N, F_inv)
        X_motion: (B, N, T_pre, F_motion)
        t_norm: (B, T_max)
        mask:   (B, N, T_max)  (bool) ‚Äì True where target is valid
        """
        B, F_int, N, _ = X_pair.shape
        _, N_inv, F_inv = X_inv.shape
        _, T_max = t_norm.shape

        assert N == N_inv, "Mismatch in N between pairwise and inv features"

        # --- Encode interactions ---
        z_int = self.encoder(X_pair)    # (B, N, C)
        z_motion = self.motion_encoder(X_motion)  # (B, N, C)

        # --- Prepare features over time ---
        # z_int:     (B, N, C)     ‚Üí (B, N, T, C)
        # z_motion:  (B, N, C)     ‚Üí (B, N, T, C)
        # X_inv:     (B, N, F_inv) ‚Üí (B, N, T, F_inv)
        # t_norm:    (B, T)        ‚Üí (B, 1, T, 1) broadcast to (B, N, T, 1)
        C = z_int.shape[-1]
        z_int_exp = z_int.unsqueeze(2).expand(B, N, T_max, C)          # (B, N, T, C)
        z_motion_exp = z_motion.unsqueeze(2).expand(B, N, T_max, C)    # (B, N, T, C)
        X_inv_exp = X_inv.unsqueeze(2).expand(B, N, T_max, F_inv)      # (B, N, T, F_inv)
        t_exp     = t_norm.unsqueeze(1).unsqueeze(-1).expand(
            B, N, T_max, 1
        )  # (B, N, T, 1)

        feat = torch.cat([z_int_exp, z_motion_exp, X_inv_exp, t_exp], dim=-1)  # (B, N, T, 2*C+F_inv+1)

        # Flatten players and time to feed MLP
        feat_flat = feat.view(B * N * T_max, -1)       # (B*N*T, in_dim)
        role_flat = role_ids.unsqueeze(-1).expand(B, N, T_max).reshape(-1)  # (B*N*T,)
        out_flat = torch.zeros(B * N * T_max, 2, device=feat.device)

        # Masks for each role
        mask_tr  = (role_flat == 0)
        mask_def = (role_flat == 1)

        if mask_tr.any():
            out_flat[mask_tr] = self.head_tr(feat_flat[mask_tr])
        if mask_def.any():
            out_flat[mask_def] = self.head_def(feat_flat[mask_def])

        out = out_flat.view(B, N, T_max, 2)
        return out


In [26]:
from torch.utils.data import DataLoader
import numpy as np

# For now, simple random split by index (you can do group splits by game_id if you like)
# Dataset is already at the play level, so this way of splitting is fine
n = len(full_dataset)
idxs = np.arange(n)
np.random.seed(42)
np.random.shuffle(idxs)

n_train = int(0.7 * n)
n_val   = int(0.15 * n)
train_idx = idxs[:n_train]
val_idx   = idxs[n_train:n_train+n_val]
test_idx  = idxs[n_train+n_val:]

from torch.utils.data import Subset

train_ds = Subset(full_dataset, train_idx)
val_ds   = Subset(full_dataset, val_idx)
test_ds  = Subset(full_dataset, test_idx)

train_loader = DataLoader(train_ds, batch_size=1, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=1, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=1, shuffle=False)

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Infer dims
F_int = len(interaction_features)
# Get one batch to determine inv_dim and in_motion
X_pair0, X_inv0, motion0, t_norm0, targets0, mask0, x_true0, y_true0, x_hybrid0, y_hybrid0, role_ids0 = next(iter(train_loader))
inv_dim = X_inv0.shape[-1]

model = FullModel(
    in_channels=F_int,
    in_motion=len(motion_features),
    inv_dim=inv_dim,
    hidden_dim=128,
    enc_hidden=64,
    enc_out=64,
).to(device)

print(f"Model initialized with:")
print(f"  F_int (interaction features): {F_int}")
print(f"  in_motion (motion features): {len(motion_features)}")
print(f"  inv_dim (invariant features): {inv_dim}")
print(f"  MLP input dim: {  64 + 64 + inv_dim + 1} (enc_out + enc_out + inv_dim + 1)")


Model initialized with:
  F_int (interaction features): 18
  in_motion (motion features): 19
  inv_dim (invariant features): 97
  MLP input dim: 226 (enc_out + enc_out + inv_dim + 1)


In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Infer dims
F_int = len(interaction_features)
X_pair0, X_inv0, motion0, t_norm0, targets0, mask0, x_true0, y_true0, x_hybrid0, y_hybrid0, role_ids0 = next(iter(train_loader))
inv_dim = X_inv0.shape[-1]

# model = FullModel(
#     in_channels=F_int,
#     in_motion=len(motion_features),
#     inv_dim=inv_dim,
#     hidden_dim=128,
#     enc_hidden=64,
#     enc_out=64,
# ).to(device)

criterion = nn.MSELoss(reduction="sum")  # we'll divide by #valid later
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

def run_epoch(loader, train=True):
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0              # for residual MSE (training objective)
    # total_squared_distance = 0.0  # for *position* error (Kaggle metric)
    total_squared_error_x = 0.0
    total_squared_error_y = 0.0
    
    total_samples = 0

    for X_pair, X_inv, motion, t_norm, targets, mask, x_true, y_true, x_hybrid, y_hybrid, role_ids in tqdm(loader):
        X_pair  = X_pair.to(device).float()      # (B, F_int, N, N)
        X_inv   = X_inv.to(device).float()       # (B, N, F_inv)
        motion  = motion.to(device).float()      # (B, N, T_pre, F_motion)
        t_norm  = t_norm.to(device).float()      # (B, T)
        targets = targets.to(device).float()     # (B, N, T, 2)  -> residuals
        mask    = mask.to(device)                # (B, N, T)

        x_true    = x_true.to(device).float()    # (B, N, T)
        y_true    = y_true.to(device).float()    # (B, N, T)
        x_hybrid  = x_hybrid.to(device).float()  # (B, N, T)
        y_hybrid  = y_hybrid.to(device).float()  # (B, N, T)

        if train:
            optimizer.zero_grad()

        preds = model(X_pair, X_inv, motion, t_norm, mask, role_ids)   # (B, N, T, 2)

        # # ---- residual MSE loss (training objective) ----
        mask_expanded = mask.unsqueeze(-1).expand_as(preds)  # (B, N, T, 2)
        diff_res = (preds - targets) * mask_expanded         # (B, N, T, 2)
        # sum of squared residual errors
        loss = criterion(diff_res, torch.zeros_like(diff_res))

        # ---- absolute position MSE loss ----
        # mask_expanded = mask.unsqueeze(-1).expand_as(preds)
        # diff = (preds - targets) * mask_expanded  # targets are absolute positions
        # loss = criterion(diff, torch.zeros_like(diff))
        # Loss is the same computation, just different interpretation!

        valid_count = mask.sum().item() * 2  # *2 for x and y
        if valid_count == 0:
            continue

        loss = loss / valid_count  # mean residual MSE over valid coords

        if train:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()

        # ---- true Kaggle-style RMSE on *positions* ----
        with torch.no_grad():
            dx_pred = preds[..., 0]
            dy_pred = preds[..., 1]

            x_pred = x_hybrid + dx_pred
            y_pred = y_hybrid + dy_pred
            # x_pred = preds[..., 0]
            # y_pred = preds[..., 1]

            diff_x = (x_pred - x_true) * mask
            diff_y = (y_pred - y_true) * mask

            # squared_distances = diff_x**2 + diff_y**2   # (B, N, T)
            # total_squared_distance += squared_distances.sum().item()

            # ‚úÖ Sum squared errors separately
            total_squared_error_x += (diff_x ** 2).sum().item()
            total_squared_error_y += (diff_y ** 2).sum().item()
            total_samples += mask.sum().item()

            
    avg_loss = total_loss / max(len(loader), 1)  # avg residual MSE (for logging)
      # ‚úÖ CORRECT formula: sqrt(0.5 * (MSE_x + MSE_y))
    mse_x = total_squared_error_x / max(total_samples, 1)
    mse_y = total_squared_error_y / max(total_samples, 1)
    kaggle_rmse = np.sqrt(0.5 * (mse_x + mse_y))

    # kaggle_rmse = np.sqrt(total_squared_distance / max(total_samples, 1))

    return avg_loss, kaggle_rmse

num_epochs = 90
best_val = float("inf")
best_state = None
current_ts_abbreviated = __import__('datetime').datetime.now().strftime("%Y%m%d_%H%M%S")

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss, train_kaggle_rmse = run_epoch(train_loader, train=True)
    val_loss, val_kaggle_rmse = run_epoch(val_loader, train=False)
    print(f"Epoch {epoch+1}: train={train_loss:.4f},, val={val_loss:.4f}, Kaggle RMSE val={val_kaggle_rmse:.4f}")
    if val_loss < best_val:
        best_val = val_loss
        best_state = model.state_dict().copy()
        torch.save(best_state, f"best_model_{current_ts_abbreviated}.pth")
        print(f"  New best model saved with val loss {best_val:.4f}")

# 20251130-9:15pm run shows nothing to be gained from predictin all way from scratch
# RMSE stil like 1.2-1.3 just takes longer to get there

Epoch 1/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:19<00:00, 519.34it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2037.17it/s]


Epoch 1: train=0.8960,, val=0.5578, Kaggle RMSE val=1.0139
  New best model saved with val loss 0.5578
Epoch 2/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 532.09it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1885.25it/s]


Epoch 2: train=0.4880,, val=0.4254, Kaggle RMSE val=0.8847
  New best model saved with val loss 0.4254
Epoch 3/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:20<00:00, 491.35it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2072.22it/s]


Epoch 3: train=0.4268,, val=0.3712, Kaggle RMSE val=0.8346
  New best model saved with val loss 0.3712
Epoch 4/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 527.68it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1869.81it/s]


Epoch 4: train=0.3858,, val=0.3589, Kaggle RMSE val=0.8303
  New best model saved with val loss 0.3589
Epoch 5/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:19<00:00, 504.65it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2016.49it/s]


Epoch 5: train=0.3664,, val=0.3334, Kaggle RMSE val=0.7952
  New best model saved with val loss 0.3334
Epoch 6/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 550.57it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1889.32it/s]


Epoch 6: train=0.3459,, val=0.3159, Kaggle RMSE val=0.7754
  New best model saved with val loss 0.3159
Epoch 7/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 559.43it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2040.10it/s]


Epoch 7: train=0.3299,, val=0.3335, Kaggle RMSE val=0.7990
Epoch 8/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 555.37it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2085.92it/s]


Epoch 8: train=0.3105,, val=0.3147, Kaggle RMSE val=0.7801
  New best model saved with val loss 0.3147
Epoch 9/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 560.17it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1711.29it/s]


Epoch 9: train=0.3436,, val=0.3127, Kaggle RMSE val=0.7685
  New best model saved with val loss 0.3127
Epoch 10/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 575.85it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2105.05it/s]


Epoch 10: train=0.2927,, val=0.3277, Kaggle RMSE val=0.7858
Epoch 11/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:19<00:00, 519.33it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2027.32it/s]


Epoch 11: train=0.2822,, val=0.3055, Kaggle RMSE val=0.7673
  New best model saved with val loss 0.3055
Epoch 12/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 548.86it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2072.35it/s]


Epoch 12: train=0.2695,, val=0.3056, Kaggle RMSE val=0.7721
Epoch 13/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 562.26it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2079.55it/s]


Epoch 13: train=0.2647,, val=0.2996, Kaggle RMSE val=0.7737
  New best model saved with val loss 0.2996
Epoch 14/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 563.58it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2113.60it/s]


Epoch 14: train=0.2553,, val=0.3136, Kaggle RMSE val=0.7839
Epoch 15/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 582.71it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1811.43it/s]


Epoch 15: train=0.2436,, val=0.3249, Kaggle RMSE val=0.7861
Epoch 16/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 552.01it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2039.83it/s]


Epoch 16: train=0.2416,, val=0.3120, Kaggle RMSE val=0.7810
Epoch 17/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:19<00:00, 502.52it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2031.70it/s]


Epoch 17: train=0.2319,, val=0.3010, Kaggle RMSE val=0.7608
Epoch 18/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 578.34it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2088.41it/s]


Epoch 18: train=0.2237,, val=0.3047, Kaggle RMSE val=0.7777
Epoch 19/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 562.26it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2093.31it/s]


Epoch 19: train=0.2203,, val=0.2919, Kaggle RMSE val=0.7617
  New best model saved with val loss 0.2919
Epoch 20/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 581.79it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2121.39it/s]


Epoch 20: train=0.2173,, val=0.3634, Kaggle RMSE val=0.8794
Epoch 21/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 568.95it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2117.53it/s]


Epoch 21: train=0.2061,, val=0.3236, Kaggle RMSE val=0.8006
Epoch 22/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 575.00it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1891.81it/s]


Epoch 22: train=0.2020,, val=0.3130, Kaggle RMSE val=0.7853
Epoch 23/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 579.53it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2099.60it/s]


Epoch 23: train=0.2003,, val=0.3049, Kaggle RMSE val=0.7793
Epoch 24/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 566.34it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2099.20it/s]


Epoch 24: train=0.1951,, val=0.3123, Kaggle RMSE val=0.7860
Epoch 25/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 580.94it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2085.08it/s]


Epoch 25: train=0.1857,, val=0.3027, Kaggle RMSE val=0.7816
Epoch 26/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 552.67it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1877.27it/s]


Epoch 26: train=0.1877,, val=0.3150, Kaggle RMSE val=0.7974
Epoch 27/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 561.11it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2081.07it/s]


Epoch 27: train=0.1808,, val=0.3010, Kaggle RMSE val=0.7757
Epoch 28/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:19<00:00, 519.30it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1943.61it/s]


Epoch 28: train=0.1781,, val=0.2996, Kaggle RMSE val=0.7767
Epoch 29/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 560.78it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2033.75it/s]


Epoch 29: train=0.1722,, val=0.3105, Kaggle RMSE val=0.7928
Epoch 30/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 528.34it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2073.44it/s]


Epoch 30: train=0.1681,, val=0.3089, Kaggle RMSE val=0.7835
Epoch 31/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 530.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1933.42it/s]


Epoch 31: train=0.1631,, val=0.3392, Kaggle RMSE val=0.8305
Epoch 32/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 532.54it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2100.81it/s]


Epoch 32: train=0.1632,, val=0.3016, Kaggle RMSE val=0.7853
Epoch 33/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 569.83it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2101.25it/s]


Epoch 33: train=0.1584,, val=0.3112, Kaggle RMSE val=0.8000
Epoch 34/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 566.49it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2002.27it/s]


Epoch 34: train=0.1564,, val=0.3037, Kaggle RMSE val=0.7805
Epoch 35/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 562.48it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:00<00:00, 2130.58it/s]


Epoch 35: train=0.1606,, val=0.2981, Kaggle RMSE val=0.7735
Epoch 36/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 553.91it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2058.51it/s]


Epoch 36: train=0.1523,, val=0.3164, Kaggle RMSE val=0.7951
Epoch 37/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 550.51it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2066.34it/s]


Epoch 37: train=0.1490,, val=0.2875, Kaggle RMSE val=0.7595
  New best model saved with val loss 0.2875
Epoch 38/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 570.22it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2096.83it/s]


Epoch 38: train=0.1467,, val=0.3068, Kaggle RMSE val=0.7880
Epoch 39/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 585.20it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2094.68it/s]


Epoch 39: train=0.1436,, val=0.2908, Kaggle RMSE val=0.7689
Epoch 40/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 571.08it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2079.78it/s]


Epoch 40: train=0.1415,, val=0.2948, Kaggle RMSE val=0.7769
Epoch 41/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:16<00:00, 583.97it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2021.88it/s]


Epoch 41: train=0.1423,, val=0.3083, Kaggle RMSE val=0.7894
Epoch 42/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 572.10it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2099.14it/s]


Epoch 42: train=0.1362,, val=0.3008, Kaggle RMSE val=0.7753
Epoch 43/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 573.97it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2021.73it/s]


Epoch 43: train=0.1342,, val=0.2917, Kaggle RMSE val=0.7634
Epoch 44/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 572.19it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2093.03it/s]


Epoch 44: train=0.1370,, val=0.2977, Kaggle RMSE val=0.7758
Epoch 45/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 551.44it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1977.13it/s]


Epoch 45: train=0.1299,, val=0.2987, Kaggle RMSE val=0.7803
Epoch 46/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 559.49it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 1950.83it/s]


Epoch 46: train=0.1302,, val=0.3193, Kaggle RMSE val=0.8128
Epoch 47/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 562.18it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2026.86it/s]


Epoch 47: train=0.1286,, val=0.3014, Kaggle RMSE val=0.7822
Epoch 48/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 553.29it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2070.28it/s]


Epoch 48: train=0.1255,, val=0.3242, Kaggle RMSE val=0.8324
Epoch 49/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:17<00:00, 559.65it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2011.47it/s]


Epoch 49: train=0.1262,, val=0.2949, Kaggle RMSE val=0.7706
Epoch 50/90


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9875/9875 [00:18<00:00, 536.74it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2116/2116 [00:01<00:00, 2014.15it/s]


Epoch 50: train=0.1218,, val=0.2967, Kaggle RMSE val=0.7719
Epoch 51/90


 31%|‚ñà‚ñà‚ñà       | 3061/9875 [00:05<00:12, 561.43it/s]


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GroupKFold

# 1) Make sure we have one invariant row per (game, play, player)
inv_cols = ["game_id", "play_id", "nfl_id"] + inv_numeric_features + inv_categorical_features

players_inv = (
    x_data_last[inv_cols]
    .drop_duplicates(subset=["game_id", "play_id", "nfl_id"])
    .reset_index(drop=True)
)

# 2) Join invariants onto every frame in y_data
df = y_data[['game_id','play_id','nfl_id','frame_id','target_dx','target_dy']].merge(
    players_inv,
    on=["game_id", "play_id", "nfl_id"],
    how="left",
    validate="many_to_one"  # each (g,p,n) in y_data maps to at most one row in players_inv
)

# 3) Add a simple time feature per (game, play, player)
df = df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"]).reset_index(drop=True)

def add_t_norm(group: pd.DataFrame) -> pd.DataFrame:
    T = len(group)
    if T == 1:
        t = np.array([0.0], dtype=np.float32)
    else:
        t = np.arange(T, dtype=np.float32) / (T - 1)
    group["t_norm"] = t
    return group

df = df.groupby(["game_id", "play_id", "nfl_id"], group_keys=False).apply(add_t_norm)

# 4) Build X_flat and Y_flat
#    - transform invariants through your existing ColumnTransformer
X_inv = preproc_invariant.transform(df[inv_numeric_features + inv_categorical_features])
X_inv = np.asarray(X_inv, dtype=np.float32)

#    - append t_norm as an extra numeric feature
t_norm = df["t_norm"].to_numpy(dtype=np.float32).reshape(-1, 1)
X_flat = np.hstack([X_inv, t_norm])   # shape: (num_rows, F_inv + 1)

#    - targets from residuals
Y_flat = df[["target_dx", "target_dy"]].to_numpy(dtype=np.float32)

#    - groups by play (so we don't leak plays across folds)
groups = (df["game_id"].astype(str) + "_" + df["play_id"].astype(str)).to_numpy()

  df = df.groupby(["game_id", "play_id", "nfl_id"], group_keys=False).apply(add_t_norm)


      game_id  play_id  nfl_id  frame_id  target_dx  target_dy  height_in  \
0  2023090700      101   44930         1  -0.009727  -0.033013       75.0   
1  2023090700      101   44930         2   0.006174  -0.089786       75.0   
2  2023090700      101   44930         3   0.018751  -0.145187       75.0   
3  2023090700      101   44930         4   0.018004  -0.189218       75.0   
4  2023090700      101   44930         5   0.003933  -0.241878       75.0   

   player_weight  birth_year  x_std  y_std   s_x_std   s_y_std     a  \
0            196        1995  10.43  14.14  7.797271 -1.269866  2.68   
1            196        1995  10.43  14.14  7.797271 -1.269866  2.68   
2            196        1995  10.43  14.14  7.797271 -1.269866  2.68   
3            196        1995  10.43  14.14  7.797271 -1.269866  2.68   
4            196        1995  10.43  14.14  7.797271 -1.269866  2.68   

   dir_std_sin  dir_std_cos  o_std_sin  o_std_cos  qb_x_std  qb_y_std  qb_s  \
0     0.986996    -0.1607

In [34]:
from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=5)
rmse_list = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X_flat, Y_flat, groups=groups), start=1):
    print(fold)
    X_tr, X_val = X_flat[train_idx], X_flat[val_idx]
    y_tr, y_val = Y_flat[train_idx], Y_flat[val_idx]

    reg_x = HistGradientBoostingRegressor(max_depth=6, learning_rate=0.05, max_iter=300)
    reg_y = HistGradientBoostingRegressor(max_depth=6, learning_rate=0.05, max_iter=300)

    reg_x.fit(X_tr, y_tr[:, 0])
    reg_y.fit(X_tr, y_tr[:, 1])

    dx_pred = reg_x.predict(X_val)
    dy_pred = reg_y.predict(X_val)

    sq = (dx_pred - y_val[:, 0])**2 + (dy_pred - y_val[:, 1])**2
    rmse = np.sqrt(sq.mean())
    rmse_list.append(rmse)

    print(f"Fold {fold} RMSE: {rmse:.4f}")

print("Mean RMSE:", np.mean(rmse_list), "¬±", np.std(rmse_list))

1
Fold 1 RMSE: 1.6108
2
Fold 2 RMSE: 1.4684
3
Fold 3 RMSE: 1.3748
4
Fold 4 RMSE: 1.3845
5
Fold 5 RMSE: 1.4019
Mean RMSE: 1.4480793346450294 ¬± 0.08766161971944728


In [None]:
@torch.no_grad()
def compute_baseline_rmse(loader):
    total_sq = 0.0
    total_points = 0

    for (
        X_pair, X_inv, t_norm, targets, mask,
        x_true, y_true, x_hyb, y_hyb, role_ids,
    ) in loader:
        mask = mask.to(device)
        x_true   = x_true.to(device)
        y_true   = y_true.to(device)
        x_hyb    = x_hyb.to(device)
        y_hyb    = y_hyb.to(device)

        diff_x = (x_hyb - x_true) * mask
        diff_y = (y_hyb - y_true) * mask
        sq = diff_x**2 + diff_y**2
        total_sq += sq.sum().item()
        total_points += mask.sum().item()

    return np.sqrt(total_sq / max(total_points, 1))

baseline_train_rmse = compute_baseline_rmse(train_loader)
baseline_val_rmse   = compute_baseline_rmse(val_loader)
print("Hybrid baseline RMSE  - train:", baseline_train_rmse)
print("Hybrid baseline RMSE  - val  :", baseline_val_rmse)

Hybrid baseline RMSE  - train: 2.844986673890057
Hybrid baseline RMSE  - val  : 2.6904142974101704


In [27]:
import math
import copy
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Infer dims
F_int = len(interaction_features)
# Get one batch to determine inv_dim
X_pair0, X_inv0, t_norm0, targets0, mask0 = next(iter(train_loader))
inv_dim = X_inv0.shape[-1]

model = FullModel(
    in_channels=F_int,
    inv_dim=inv_dim,
    hidden_dim=128,
    enc_hidden=64,
    enc_out=64,
).to(device)


# criterion = nn.MSELoss(reduction="sum")  # we'll divide by #valid later
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

def run_epoch(loader, train=True):
    if train:
        model.train()
    else:
        model.eval()

    # For training loss (per-coordinate MSE)
    total_se = 0.0              # sum of squared errors over all coords
    total_valid_coords = 0      # number of valid coords (x + y) across all batches

    # For Kaggle-style RMSE (2D Euclidean per frame)
    total_squared_distance = 0.0  # sum of (dx^2 + dy^2) over valid frames
    total_valid_frames = 0        # number of valid frames

    with torch.set_grad_enabled(train):
        for X_pair, X_inv, t_norm, targets, mask in tqdm(loader):
            X_pair  = X_pair.to(device).float()    # (B, F_int, N, N)
            X_inv   = X_inv.to(device).float()     # (B, N, F_inv)
            t_norm  = t_norm.to(device).float()    # (B, T)
            targets = targets.to(device).float()   # (B, N, T, 2)
            mask    = mask.to(device)              # (B, N, T), bool or 0/1

            # Forward
            preds = model(X_pair, X_inv, t_norm, mask)  # (B, N, T, 2)

            # ---- Training loss (per-coordinate MSE over valid coords) ----
            # Expand mask to match preds/targets shape
            
            mask_expanded = mask.unsqueeze(-1).expand_as(preds)  # (B, N, T, 2)
            mask_expanded_f = mask_expanded.float()

            diff = (preds - targets) * mask_expanded_f          # zero where invalid

            # Sum of squared errors over all coords
            # You can either use criterion, or just square+sum:
            se = (diff ** 2).sum()    # scalar tensor

            # #valid coords = (#valid frames) * 2 (x and y)
            valid_coords = mask.sum().item() * 2

            if valid_coords == 0:
                continue

            batch_loss = se / valid_coords  # mean MSE over coords for this batch

            if train:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

            # Accumulate for global MSE
            total_se += se.item()
            total_valid_coords += valid_coords

            # ---- Kaggle-style RMSE metric (Euclidean error per frame) ----
            # Note: here we only use mask (B, N, T) once per frame
            mask_f = mask.float()
            diff_x = (preds[..., 0] - targets[..., 0]) * mask_f
            diff_y = (preds[..., 1] - targets[..., 1]) * mask_f
            squared_distances = diff_x**2 + diff_y**2  # (B, N, T)

            total_squared_distance += squared_distances.sum().item()
            total_valid_frames += mask.sum().item()

    # Global mean per-coordinate MSE across dataset
    avg_loss = total_se / max(total_valid_coords, 1)

    # Global Euclidean RMSE across all valid frames
    kaggle_rmse = math.sqrt(
        total_squared_distance / max(total_valid_frames, 1)
    )

    return avg_loss, kaggle_rmse


# ---------- Training loop with model checkpointing on val RMSE ----------

num_epochs = 90
best_val_rmse = float("inf")
best_state = None
current_ts_abbreviated = __import__('datetime').datetime.now().strftime("%Y%m%d_%H%M%S")

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    train_loss, train_kaggle_rmse = run_epoch(train_loader, train=True)
    val_loss,   val_kaggle_rmse   = run_epoch(val_loader,   train=False)

    print(
        f"Epoch {epoch+1}: "
        f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, "
        f"train_RMSE={train_kaggle_rmse:.4f}, val_RMSE={val_kaggle_rmse:.4f}"
    )

    # Use validation RMSE as model-selection metric
    if val_kaggle_rmse < best_val_rmse:
        best_val_rmse = val_kaggle_rmse
        best_state = copy.deepcopy(model.state_dict())
        save_path = f"best_model_{current_ts_abbreviated}.pth"
        torch.save(best_state, save_path)
        print(f"  New best model saved: {save_path} (val_RMSE={best_val_rmse:.4f})")

NameError: name 'train_loader' is not defined

In [None]:
import time

def train_and_eval(model, train_loader, val_loader, num_epochs=10, patience=20, lr=5e-4):
    device = next(model.parameters()).device
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss(reduction="sum")

    best_val = float("inf")
    best_state = None
    bad_epochs = 0

    for epoch in range(num_epochs):
        # ----- train -----
        model.train()
        for X_pair, X_inv, t_norm, targets, mask in tqdm(train_loader):
            X_pair  = X_pair.to(device).float()
            X_inv   = X_inv.to(device).float()
            t_norm  = t_norm.to(device).float()
            targets = targets.to(device).float()
            mask    = mask.to(device)

            optimizer.zero_grad()
            preds = model(X_pair, X_inv, t_norm, mask)

            mask_expanded = mask.unsqueeze(-1).expand_as(preds)
            diff = (preds - targets) * mask_expanded
            loss = criterion(diff, torch.zeros_like(diff))
            valid_count = mask.sum().item() * 2
            if valid_count == 0:
                continue
            loss = loss / valid_count

            loss.backward()
            optimizer.step()

        # ----- validate -----
        model.eval()
        val_loss = 0.0
        n_batches = 0
        with torch.no_grad():
            for X_pair, X_inv, t_norm, targets, mask in tqdm(val_loader):
                X_pair  = X_pair.to(device).float()   
                X_inv   = X_inv.to(device).float()
                t_norm  = t_norm.to(device).float()
                targets = targets.to(device).float()
                mask    = mask.to(device)

                preds = model(X_pair, X_inv, t_norm, mask)
                mask_expanded = mask.unsqueeze(-1).expand_as(preds)
                diff = (preds - targets) * mask_expanded
                loss = criterion(diff, torch.zeros_like(diff))
                valid_count = mask.sum().item() * 2
                if valid_count == 0:
                    continue
                loss = loss / valid_count

                val_loss += loss.item()
                n_batches += 1

        val_loss /= max(n_batches, 1)
        print(f"Epoch {epoch+1}: val={val_loss:.4f}")

        # early stopping
        if val_loss < best_val:
            best_val = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    current_ts_abbreviated = time.time().__str__().replace('.', '')[-6:]
    torch.save(model.state_dict(), f"best_model_{current_ts_abbreviated}.pth")
    return best_val

In [None]:
all_idxs = np.arange(len(full_dataset))
np.random.shuffle(all_idxs)

subset_size = int(0.3 * len(all_idxs))
subset_idxs = all_idxs[:subset_size]
subset_train_idxs = subset_idxs[:int(0.7 * subset_size)]
subset_val_idxs   = subset_idxs[int(0.7 * subset_size):]

subset_train_ds = Subset(full_dataset, subset_train_idxs)
subset_val_ds   = Subset(full_dataset, subset_val_idxs)

subset_train_loader = DataLoader(subset_train_ds, batch_size=1, shuffle=True)
subset_val_loader   = DataLoader(subset_val_ds, batch_size=1, shuffle=False)


lrs = [1e-3, 5e-4, 2e-4]
hidden_dims = [64, 128]
enc_hidden = [32, 64]

results = []
for lr in lrs:
    for hd in hidden_dims:
        for eh in enc_hidden:
            model = FullModel(
                in_channels=len(interaction_features),
                inv_dim=inv_dim,
                hidden_dim=hd,
                enc_hidden=eh,
                enc_out=64,
        ).to(device)

        print(f"Testing lr={lr}, hidden_dim={hd}")
        val_loss = train_and_eval(
            model,
            subset_train_loader,
            subset_val_loader,
            num_epochs=8,
            patience=3,
            lr=lr,
        )
        results.append((lr, hd, val_loss))

print(sorted(results, key=lambda x: x[2]))

NameError: name 'full_dataset' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PairwiseInteractionEncoder(nn.Module):
    def __init__(self, in_channels, hidden_channels=64, out_channels=64):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=1)
        self.conv3 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (B, F_int, N, N)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))   # (B, C, N, N)
        x = x.mean(dim=3)           # pool over "other player" j ‚Üí (B, C, N)
        x = x.permute(0, 2, 1)      # ‚Üí (B, N, C)
        return x

class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x):
        # x: (..., in_dim)
        return self.net(x)

class FullModel(nn.Module):
    def __init__(self, in_channels, inv_dim, hidden_dim=128, enc_hidden=64, enc_out=64):
        super().__init__()
        self.encoder = PairwiseInteractionEncoder(
            in_channels=in_channels,
            hidden_channels=enc_hidden,
            out_channels=enc_out,
        )
        self.mlp = TimeConditionedMLP(
            in_dim=enc_out + inv_dim + 1,  # +1 for time feature
            hidden_dim=hidden_dim,
            out_dim=2,
        )

    def forward(self, X_pair, X_inv, t_norm, mask):
        """
        X_pair: (B, F_int, N, N)
        X_inv:  (B, N, F_inv)
        t_norm: (B, T_max)
        mask:   (B, N, T_max)  (bool) ‚Äì True where target is valid
        """
        B, F_int, N, _ = X_pair.shape
        _, N_inv, F_inv = X_inv.shape
        _, T_max = t_norm.shape

        assert N == N_inv, "Mismatch in N between pairwise and inv features"

        # --- Encode interactions ---
        z_int = self.encoder(X_pair)    # (B, N, C)

        # --- Prepare features over time ---
        # z_int:     (B, N, C)     ‚Üí (B, N, T, C)
        # X_inv:     (B, N, F_inv) ‚Üí (B, N, T, F_inv)
        # t_norm:    (B, T)        ‚Üí (B, 1, T, 1) broadcast to (B, N, T, 1)
        C = z_int.shape[-1]
        z_int_exp = z_int.unsqueeze(2).expand(B, N, T_max, C)          # (B, N, T, C)
        X_inv_exp = X_inv.unsqueeze(2).expand(B, N, T_max, F_inv)      # (B, N, T, F_inv)
        t_exp     = t_norm.unsqueeze(1).unsqueeze(-1).expand(
            B, N, T_max, 1
        )  # (B, N, T, 1)

        feat = torch.cat([z_int_exp, X_inv_exp, t_exp], dim=-1)        # (B, N, T, C+F_inv+1)

        # Flatten players and time to feed MLP
        feat_flat = feat.view(B * N * T_max, -1)       # (B*N*T, in_dim)
        out_flat  = self.mlp(feat_flat)                # (B*N*T, 2)
        out       = out_flat.view(B, N, T_max, 2)      # (B, N, T, 2)

        # Apply mask in loss outside (we return full out)
        return out

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preproc_invariant = ColumnTransformer(
    transformers=[
        ("num", "passthrough", inv_numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), inv_categorical_features),
    ]
)

preproc_invariant.fit(x_data_last[inv_numeric_features + inv_categorical_features])

# merge in play_targets to get y per play
x_with_y = x_data_last.merge(
    y_data[['game_id','play_id','nfl_id','target_dx','target_dy']],
    on=['game_id','play_id','nfl_id'],
    how='inner',
    indicator=True
).query('_merge == "both"').drop(columns=['_merge'])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PairwiseInteractionEncoder(nn.Module):
    """
    Input:  (B, F_int, N, N)  pairwise features
    Output: (B, N, C)         per-player interaction embedding
    """
    def __init__(self, in_channels, hidden_channels=64, out_channels=64):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(hidden_channels, hidden_channels, kernel_size=1)
        self.conv3 = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

    def forward(self, x):
        # x: (B, F_int, N, N)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))  # (B, C, N, N)

        # pool over "other player" axis (j), keep i:
        # assume dim 2 = i (row player), dim 3 = j (other)
        x = x.mean(dim=3)          # (B, C, N)

        # reshape to per-player embeddings (B, N, C)
        x = x.permute(0, 2, 1)     # (B, N, C)
        return x

In [None]:
def build_play_embeddings(play_df, encoder: PairwiseInteractionEncoder, device="cpu"):
    play_df = play_df.sort_values("nfl_id").reset_index(drop=True)
    N = len(play_df)

    # Interaction features -> pairwise grid
    X_int = play_df[interaction_features].to_numpy(dtype=np.float32)  # (N, F_int)
    N, F_int = X_int.shape

    feat_i = X_int[:, None, :]                # (N, 1, F_int)
    feat_j = X_int[None, :, :]                # (1, N, F_int)
    pair_diff = feat_j - feat_i               # (N, N, F_int)
    X_pair = np.transpose(pair_diff, (2, 0, 1)).astype(np.float32)  # (F_int, N, N)
    X_pair_t = torch.from_numpy(X_pair).unsqueeze(0).to(device)     # (1, F_int, N, N)

    # Invariant features
    X_inv = preproc_invariant.transform(
        play_df[inv_numeric_features + inv_categorical_features]
    )
    X_inv = np.asarray(X_inv, dtype=np.float32)                      # (N, F_inv)
    X_inv_t = torch.from_numpy(X_inv).to(device).unsqueeze(0)       # (1, N, F_inv)

    # Encode interactions
    # with torch.no_grad():  # (for now: treat encoder as fixed)
    z_int = encoder(X_pair_t)            # (1, N, C)

    # Concatenate interaction + invariant per player
    Z_play = torch.cat([z_int, X_inv_t], dim=-1)  # (1, N, D)

    return Z_play, play_df  # return df so we know which row is which

In [None]:
from tqdm import tqdm

device = "cpu"  # or "cuda" if available
encoder = PairwiseInteractionEncoder(
    in_channels=len(interaction_features),
    hidden_channels=128,
    out_channels=128,
).to(device)

X_list = []  # will hold [z_player || time_features]
y_list = []
play_ids = []  # ‚úÖ NEW: Track which play each sample belongs to


for (gid, pid), play_df_all in tqdm(x_data_last.groupby(["game_id", "play_id"])):
    play_df = play_df_all.copy()

    # Build embeddings for all players in this play
    Z_play, play_df_sorted = build_play_embeddings(play_df, encoder, device=device)
    Z_play = Z_play.squeeze(0)   # (N, D)

    play_df_sorted = play_df_sorted.reset_index(drop=True)
    N, D = Z_play.shape

    for i in range(N):
        row = play_df_sorted.iloc[i]
        nid = row["nfl_id"]

        # Get this player's future frames
        out_rows = (
            y_data
            .query(
                "game_id == @gid and play_id == @pid and nfl_id == @nid"
            )
            .sort_values("frame_id")
        )

        if out_rows.empty:
            continue
        
        T_i = len(out_rows)

        # Example time feature: normalized time 0..1
        t_norm = (np.arange(T_i, dtype=np.float32) / max(T_i - 1, 1)).reshape(-1, 1)  # (T_i, 1)
        # print(Z_play)
        # Player embedding (D,) -> repeat over T_i frames
        z_i = Z_play[i].detach().cpu().numpy()         # (D,)
        z_rep = np.repeat(z_i[None, :], T_i, axis=0)   # (T_i, D)

        # Concatenate [z_i || t_features]
        X_i_t = np.concatenate([z_rep, t_norm], axis=1)  # (T_i, D+1)

        # Targets: x_t, y_t for each frame
        y_i_t = out_rows[["target_dx", "target_dy"]].to_numpy(dtype=np.float32)  # (T_i, 2)

        X_list.append(X_i_t)
        y_list.append(y_i_t)

        play_ids.extend([f"{gid}_{pid}"] * T_i)

# Stack all (T_i, ‚Ä¶) chunks into one big (num_samples, ‚Ä¶)
X_all = np.concatenate(X_list, axis=0)  # (num_samples, D+1)
Y_all = np.concatenate(y_list, axis=0)  # (num_samples, 2)
play_ids = np.array(play_ids)  # ‚úÖ (num_samples,)

print(X_all.shape, Y_all.shape, play_ids.shape)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14108/14108 [02:20<00:00, 100.66it/s]


(562936, 177) (562936, 2) (562936,)


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit

# Create train/test split grouped by play
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X_all, Y_all, groups=play_ids))

X_train_full = X_all[train_idx]
y_train_full = Y_all[train_idx]
play_ids_train = play_ids[train_idx]

X_test = X_all[test_idx]
y_test = Y_all[test_idx]
play_ids_test = play_ids[test_idx]

print(f"Train: {len(X_train_full)} samples from {len(np.unique(play_ids_train))} plays")
print(f"Test: {len(X_test)} samples from {len(np.unique(play_ids_test))} plays")

# ‚úÖ Verify no overlap
assert len(set(play_ids_train) & set(play_ids_test)) == 0, "Data leakage detected!"

# Second split: train -> train + val (also grouped)
splitter_val = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx2, val_idx = next(splitter_val.split(
    X_train_full, y_train_full, groups=play_ids_train
))

X_train = X_train_full[train_idx2]
y_train = y_train_full[train_idx2]

X_val = X_train_full[val_idx]
y_val = y_train_full[val_idx]

print(f"Final split:")
print(f"  Train: {len(X_train)} samples")
print(f"  Val:   {len(X_val)} samples")
print(f"  Test:  {len(X_test)} samples")

Train: 451172 samples from 11286 plays
Test: 111764 samples from 2822 plays
Final split:
  Train: 360785 samples
  Val:   90387 samples
  Test:  111764 samples


In [None]:
from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(
    torch.from_numpy(X_train),
    torch.from_numpy(y_train),
)
val_ds = TensorDataset(
    torch.from_numpy(X_val),
    torch.from_numpy(y_val),
)
test_ds = TensorDataset(
    torch.from_numpy(X_test),
    torch.from_numpy(y_test),
)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=256, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=256, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy

class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )
    def forward(self, x):
        return self.net(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_dim = X_all.shape[1]

def train_one_config(hidden_dim, lr, num_epochs=30, patience=5):
    model = TimeConditionedMLP(in_dim=in_dim, hidden_dim=hidden_dim, out_dim=2).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val = float("inf")
    best_state = None
    bad_epochs = 0

    for epoch in range(num_epochs):
        # ---- Train ----
        model.train()
        train_loss = 0.0
        n_train = 0

        for xb, yb in train_loader:
            xb = xb.to(device).float()
            yb = yb.to(device).float()

            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * xb.size(0)
            n_train += xb.size(0)

        train_loss /= n_train

        # ---- Validate ----
        model.eval()
        val_loss = 0.0
        n_val = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device).float()
                yb = yb.to(device).float()
                preds = model(xb)
                loss = criterion(preds, yb)
                val_loss += loss.item() * xb.size(0)
                n_val += xb.size(0)
        val_loss /= n_val

        print(f"[hd={hidden_dim}, lr={lr}] Epoch {epoch+1}: train={train_loss:.4f}, val={val_loss:.4f}")

        # ---- Early stopping tracking ----
        if val_loss < best_val:
            best_val = val_loss
            best_state = copy.deepcopy(model.state_dict())
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print(f"Early stopping (no val improvement for {patience} epochs).")
                break

    # Load best weights before returning
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val

In [None]:
hidden_dims = [128, 256]
lrs = [1e-3, 5e-4]

best_cfg = None
best_val = float("inf")
best_model = None

# hidden_dims = [256]
# lrs = [5e-4]

for hd in hidden_dims:
    for lr in lrs:
        print(f"\n=== Training config: hidden_dim={hd}, lr={lr} ===")
        model, val_loss = train_one_config(hidden_dim=hd, lr=lr, num_epochs=75, patience=10)

        print(f"Config (hd={hd}, lr={lr}) finished with best val MSE={val_loss:.4f}")
        if val_loss < best_val:
            best_val = val_loss
            best_cfg = (hd, lr)
            best_model = model

print("\nBest config:", best_cfg, "with val MSE=", best_val)


=== Training config: hidden_dim=128, lr=0.001 ===
[hd=128, lr=0.001] Epoch 1: train=5.9389, val=3.7616
[hd=128, lr=0.001] Epoch 2: train=4.0389, val=3.5694
[hd=128, lr=0.001] Epoch 3: train=3.7739, val=3.8000
[hd=128, lr=0.001] Epoch 4: train=3.5165, val=3.1176
[hd=128, lr=0.001] Epoch 5: train=3.2068, val=2.8053
[hd=128, lr=0.001] Epoch 6: train=2.7851, val=2.3730
[hd=128, lr=0.001] Epoch 7: train=2.1014, val=1.8786
[hd=128, lr=0.001] Epoch 8: train=1.7370, val=1.5704
[hd=128, lr=0.001] Epoch 9: train=1.5280, val=1.4568
[hd=128, lr=0.001] Epoch 10: train=1.4522, val=1.5396
[hd=128, lr=0.001] Epoch 11: train=1.3617, val=1.3525
[hd=128, lr=0.001] Epoch 12: train=1.3262, val=1.4631
[hd=128, lr=0.001] Epoch 13: train=1.2690, val=1.4267
[hd=128, lr=0.001] Epoch 14: train=1.2444, val=1.2320
[hd=128, lr=0.001] Epoch 15: train=1.2066, val=1.2118
[hd=128, lr=0.001] Epoch 16: train=1.1723, val=1.2130
[hd=128, lr=0.001] Epoch 17: train=1.1372, val=1.1835
[hd=128, lr=0.001] Epoch 18: train=1.130

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

X_train, X_test, y_train, y_test = train_test_split(
    X_all, Y_all, test_size=0.2, random_state=42
)

train_ds = TensorDataset(
    torch.from_numpy(X_train),  # (N_samples, D+1)
    torch.from_numpy(y_train),  # (N_samples, 2)
)
test_ds = TensorDataset(
    torch.from_numpy(X_test),
    torch.from_numpy(y_test),
)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=256, shuffle=False)


class TimeConditionedMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        )
    def forward(self, x):
        return self.net(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
in_dim = X_all.shape[1]

model = TimeConditionedMLP(in_dim=in_dim, hidden_dim=128, out_dim=2).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.0025)

for epoch in range(75):
    model.train()
    total_loss = 0.0
    n = 0
    for xb, yb in train_loader:
        xb = xb.to(device).float()
        yb = yb.to(device).float()

        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
        n += xb.size(0)
    print(f"Epoch {epoch+1}: train MSE={total_loss/n:.4f}")

Epoch 1: train MSE=13.8809
Epoch 2: train MSE=5.3395
Epoch 3: train MSE=3.3607
Epoch 4: train MSE=2.4103
Epoch 5: train MSE=2.0325
Epoch 6: train MSE=1.8106
Epoch 7: train MSE=1.6705
Epoch 8: train MSE=1.5075
Epoch 9: train MSE=1.4319
Epoch 10: train MSE=1.3599
Epoch 11: train MSE=1.3028
Epoch 12: train MSE=1.2703
Epoch 13: train MSE=1.2543
Epoch 14: train MSE=1.2273
Epoch 15: train MSE=1.2061
Epoch 16: train MSE=1.1722
Epoch 17: train MSE=1.1794
Epoch 18: train MSE=1.1544
Epoch 19: train MSE=1.1563
Epoch 20: train MSE=1.1255
Epoch 21: train MSE=1.1455
Epoch 22: train MSE=1.1133
Epoch 23: train MSE=1.1113
Epoch 24: train MSE=1.0993
Epoch 25: train MSE=1.1084
Epoch 26: train MSE=1.0819
Epoch 27: train MSE=1.0758
Epoch 28: train MSE=1.0801
Epoch 29: train MSE=1.0694
Epoch 30: train MSE=1.0679
Epoch 31: train MSE=1.0719
Epoch 32: train MSE=1.0625
Epoch 33: train MSE=1.0572
Epoch 34: train MSE=1.0481
Epoch 35: train MSE=1.0404
Epoch 36: train MSE=1.0338
Epoch 37: train MSE=1.0303
Epoch 38: