In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from src.prepData import load_train_data, normalize_input_fields, normalize_output_fields

## Data Prep

#### Load Data

In [2]:
print("Loading data...")
input_df, output_df = load_train_data()
# input_df.to_pickle("data/personal/input_df.pkl")
# output_df.to_pickle("data/personal/output_df.pkl")

print(f"Loaded {len(input_df)} input rows, {len(output_df)} output rows")
print(
  f"Unique plays: {input_df[['game_id', 'play_id']].drop_duplicates().shape[0]}"
)

Loading data...
Loaded 4880579 input rows, 562936 output rows
Unique plays: 14108


#### Normalize fields

In [3]:
input_df = normalize_input_fields(input_df)
norm_helper = input_df[['game_id','play_id','play_direction','absolute_yardline_number']].drop_duplicates()
output_df = normalize_output_fields(output_df, norm_helper)

#### Create play-level features

In [4]:
distinct_plays = input_df[['game_id', 'play_id']].drop_duplicates()
distinct_plays.sort_values(['game_id','play_id']).head(3)

Unnamed: 0,game_id,play_id
0,2023090700,101
234,2023090700,194
650,2023090700,219


In [5]:
input_min_frames = (
    input_df.groupby(['game_id', 'play_id'])['frame_id'].min().reset_index().rename(columns={'frame_id':'throw_frame_id'})
)

In [6]:
# Get max frame_id from input_df for each play (throw_frame_id baseline)
input_max_frames = (
    input_df
    .groupby(['game_id', 'play_id'])[['frame_id', "ball_land_x_std","ball_land_y_std"]]
    .max()
    .reset_index()
    .rename(columns={'frame_id': 'throw_frame_id'})
)

# Get max frame_id from output_df for each play (throw_land_frame_id baseline)
output_max_frames = (
    output_df
    .groupby(['game_id', 'play_id'])[['frame_id']]
    .max()
    .reset_index()
    .rename(columns={'frame_id': 'throw_land_frame_id'})
)

# Combine both into baseline frame info
baseline_frame_info = input_max_frames.merge(
    output_max_frames,
    on=['game_id', 'play_id'],
    how='outer'
)

print(f"Baseline frame info shape: {baseline_frame_info.shape}")
print(f"Unique plays: {baseline_frame_info.shape[0]}")
baseline_frame_info.head(2)

Baseline frame info shape: (14108, 6)
Unique plays: 14108


Unnamed: 0,game_id,play_id,throw_frame_id,ball_land_x_std,ball_land_y_std,throw_land_frame_id
0,2023090700,101,26,21.259998,-0.22,21
1,2023090700,194,32,4.059998,31.55,9


In [7]:
# Create all play-level features
qb_frame = input_df[input_df['player_role'] == 'Passer']
if qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0] < len(distinct_plays):
    print(f"Warning: fewer plays with QB ({qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0]}) than original plays ({len(distinct_plays)})")

# Get QB max frame for plays with a passer
qb_max_frame = (
    qb_frame
    .groupby(['game_id', 'play_id', 'nfl_id', 'player_role'])['frame_id']
    .max()
    .reset_index()
)

# Find plays without a passer
plays_with_qb = qb_max_frame[['game_id', 'play_id']].drop_duplicates()
plays_without_qb = (
    distinct_plays
    .merge(plays_with_qb, on=['game_id', 'play_id'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)

# For plays without a passer, use the overall max frame_id
if len(plays_without_qb) > 0:
    print(f"Found {len(plays_without_qb)} plays without a Passer. Using overall max frame_id.")
    
    missing_max_frames = (
        input_df
        .merge(plays_without_qb, on=['game_id', 'play_id'])
        .groupby(['game_id', 'play_id'])['frame_id']
        .max()
        .reset_index()
    )
    
    # Add placeholder columns for nfl_id and player_role
    missing_max_frames['nfl_id'] = None
    missing_max_frames['player_role'] = None
    
    # Combine with QB frames
    qb_max_frame = pd.concat([qb_max_frame, missing_max_frames], ignore_index=True)

# Join back to input_df to get the full row data
qb_rows = pd.merge(
    input_df, 
    qb_max_frame, 
    on=['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_role'], 
    how='inner'
)

# Start with qb_rows
qb_sub = qb_rows.copy()

# Calculate derived features
qb_sub['qb_throw_distance'] = np.sqrt((qb_sub['ball_land_x_std'] - qb_sub['x_std'])**2 + (qb_sub['ball_land_y_std'] - qb_sub['y_std'])**2)
qb_sub['qb_ball_dir'] = (90 - np.degrees(np.arctan2(
    qb_sub['ball_land_y_std'] - qb_sub['y_std'],
    qb_sub['ball_land_x_std'] - qb_sub['x_std']
))) % 360
qb_sub['qb_direction_diff'] = (qb_sub['o_std'] - qb_sub['qb_ball_dir'] + 180) % 360 - 180  # difference between -180 and 180

# Rename frame_id to be QB-specific
qb_sub.rename(columns={'frame_id':'throw_frame_id'}, inplace=True)

# Drop player_to_predict column (not needed for QB)
qb_sub = qb_sub.drop(columns=['player_to_predict'])

# Rename QB kinematic fields to have qb_ prefix
qb_kinematic_fields_rename = {
    "x_std": "qb_x_std",
    "y_std": "qb_y_std",
    "o_std": "qb_o_std",
    "dir_std": "qb_dir_std",
    "s": "qb_s",
    "a": "qb_a"
}
qb_sub = qb_sub.rename(columns=qb_kinematic_fields_rename)

qb_sub = qb_sub.drop(columns=["ball_land_x_std","ball_land_y_std"])

qb_sub.head(3)

Found 3 plays without a Passer. Using overall max frame_id.


Unnamed: 0,game_id,play_id,nfl_id,throw_frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,qb_s,qb_a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,absolute_yardline_number_std,qb_x_std,qb_y_std,qb_o_std,qb_dir_std,qb_throw_distance,qb_ball_dir,qb_direction_diff
0,2023090700,101,43290,26,right,42,Jared Goff,6-4,223,1994-10-14,QB,Offense,Passer,35.41,29.99,0.64,0.47,108.83,212.25,21,63.259998,-0.22,1,42,-6.59,29.99,212.25,108.83,41.08852,137.327657,74.922343
1,2023090700,194,44822,32,left,89,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,97.62,29.67,0.96,1.64,185.14,285.7,9,84.940002,21.75,1,31,-8.62,23.63,105.7,5.14,14.950209,58.010861,47.689139
2,2023090700,219,44822,17,left,79,Patrick Mahomes,6-3,230,1995-09-17,QB,Offense,Passer,85.87,22.97,1.49,2.76,133.64,245.38,8,75.849998,11.49,1,41,-6.87,30.33,65.38,313.64,15.237809,41.115185,24.264815


In [8]:
qb_features = ["qb_x_std", 
               "qb_y_std", 
               "qb_s", 
               "qb_a", 
               "qb_dir_std", 
               "qb_o_std", 
               "qb_throw_distance", 
               "qb_ball_dir"]

play_level_features = baseline_frame_info.merge(
  qb_sub[['game_id','play_id'] + qb_features], 
  how = 'left', 
  on = ['game_id','play_id'])

def impute_qb_features_safe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing QB features using ball trajectory (always available)
    This is 'safe' because ball_land_x/y are inputs, not targets
    """
    mask = df['qb_x_std'].isnull()
    
    if mask.sum() > 0:
        # Proxy: assume QB was ~10 yards behind ball landing
        df.loc[mask, 'qb_x_std'] = df.loc[mask, 'ball_land_x_std'] - 10
        df.loc[mask, 'qb_y_std'] = 26.7  # assume center of field
        
        # Proxy: assume QB was stationary (conservative)
        df.loc[mask, 'qb_s'] = 0.0
        df.loc[mask, 'qb_a'] = 0.0
      
        # Throw distance from imputed position
        df.loc[mask, 'qb_throw_distance'] = np.sqrt(
            (df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std'])**2 +
            (df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'])**2
        )

        # Proxy: QB facing ball direction
        df.loc[mask, 'qb_o_std'] = (90 - np.degrees(np.arctan2(
            df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'],
            df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std']
        ))) % 360
        df.loc[mask, 'qb_dir_std'] = df.loc[mask, 'qb_o_std']

        df.loc[mask, 'qb_ball_dir'] = (90 - np.degrees(np.arctan2(
            df.loc[mask, 'ball_land_y_std'] - df.loc[mask, 'qb_y_std'],
            df.loc[mask, 'ball_land_x_std'] - df.loc[mask, 'qb_x_std']
        ))) % 360
    
    return df

# Apply BEFORE split
play_level_features = impute_qb_features_safe(play_level_features)


In [9]:
x_data = baseline_frame_info[['game_id','play_id','throw_frame_id']].merge(
                                 input_df[input_df['player_to_predict'] == True],
                                 left_on = ['game_id','play_id','throw_frame_id'],
                                 right_on = ['game_id','play_id','frame_id'],
                                 how = 'inner')
player_level_features = ['player_height',
                         'player_weight',
                         'player_birth_date',
                         'player_position',
                         'player_side',
                         'player_role',
                         'x_std',
                         'y_std',
                         'o_std',
                         'dir_std',
                         's',
                         'a']
x_data = x_data[['game_id','play_id','nfl_id'] + player_level_features].copy()
x_data = x_data.merge(play_level_features, on = ['game_id','play_id'])


def height_to_inches(col):
    # col: pandas Series of "6-1" strings
    split_vals = col.str.split("-", expand=True)
    feet = split_vals[0].astype(float)
    inches = split_vals[1].astype(float)
    return feet * 12 + inches

x_data["height_in"] = height_to_inches(x_data["player_height"])
# Age in years (super rough)
x_data["birth_year"] = pd.to_datetime(x_data["player_birth_date"]).dt.year


# Encode angles as sin/cos
for col in ["dir_std", "o_std", "qb_o_std", "qb_dir_std", "qb_ball_dir"]:
    rad = np.deg2rad(x_data[col])
    x_data[col + "_sin"] = np.sin(rad)
    x_data[col + "_cos"] = np.cos(rad)


x_data.sort_values(['game_id','play_id','nfl_id'], inplace=True)
x_data.head(3)


Unnamed: 0,game_id,play_id,nfl_id,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x_std,y_std,o_std,dir_std,s,a,throw_frame_id,ball_land_x_std,ball_land_y_std,throw_land_frame_id,qb_x_std,qb_y_std,qb_s,qb_a,qb_dir_std,qb_o_std,qb_throw_distance,qb_ball_dir,height_in,birth_year,dir_std_sin,dir_std_cos,o_std_sin,o_std_cos,qb_o_std_sin,qb_o_std_cos,qb_dir_std_sin,qb_dir_std_cos,qb_ball_dir_sin,qb_ball_dir_cos
2,2023090700,101,44930,6-3,196,1995-02-16,WR,Offense,Targeted Receiver,10.43,14.14,106.8,99.25,7.9,2.68,26,21.259998,-0.22,21,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,75.0,1995,0.986996,-0.160743,0.957319,-0.289032,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242
0,2023090700,101,46137,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,13.82,17.67,184.99,134.17,5.34,1.8,26,21.259998,-0.22,21,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,73.0,1997,0.717276,-0.69679,-0.086982,-0.99621,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242
1,2023090700,101,52546,6-1,193,1997-01-21,CB,Defense,Defensive Coverage,6.01,12.44,309.47,192.18,2.93,4.75,26,21.259998,-0.22,21,-6.59,29.99,0.64,0.47,108.83,212.25,41.08852,137.327657,73.0,1997,-0.210984,-0.97749,-0.771958,0.635674,-0.533615,-0.845728,0.94648,-0.322761,0.677805,-0.735242


In [10]:
numeric_features = [
    # Predicted player features
    "height_in", "player_weight", "birth_year",
    # Predicted player kinematics
    "x_std", "y_std",
    "s", "a",  # if present
    "dir_std_sin", "dir_std_cos",
    "o_std_sin", "o_std_cos",
    
    # QB kinematics
    "qb_x_std", "qb_y_std", "qb_s", "qb_a",
    "qb_o_std_sin", "qb_o_std_cos",
    "qb_dir_std_sin", "qb_dir_std_cos",
    
    # Throw features - global
    "throw_frame_id", "throw_land_frame_id",
    "ball_land_x_std", "ball_land_y_std",
    # Time of throw - needs QB kinematics
    "qb_throw_distance", 
    "qb_ball_dir_sin", "qb_ball_dir_cos",
]

categorical_features = [
    "player_position",
    "player_role",
    "player_side",
]

X = x_data[numeric_features + categorical_features].copy()
print(f"Input features shape: {X.shape}")

Input features shape: (46045, 29)


In [11]:
y_data = (
    output_df
    .sort_values('frame_id')
    .groupby(['game_id', 'play_id', 'nfl_id'])[['x_std', 'y_std']]
    .last()
    .reset_index()
)

y_data = y_data.merge(baseline_frame_info[['game_id','play_id','ball_land_x_std','ball_land_y_std']], 
             on=['game_id','play_id'], 
             how='inner')

y_data.head(3)

Unnamed: 0,game_id,play_id,nfl_id,x_std,y_std,ball_land_x_std,ball_land_y_std
0,2023090700,101,44930,20.49,2.83,21.259998,-0.22
1,2023090700,101,46137,20.87,4.63,21.259998,-0.22
2,2023090700,101,52546,10.48,5.38,21.259998,-0.22


In [12]:
print(f"Output data shape: {y_data.shape}")
y_data.head()

y_data.sort_values(['game_id','play_id','nfl_id'], inplace=True)

y_data["target_dx"] = y_data["x_std"] - y_data["ball_land_x_std"]
y_data["target_dy"] = y_data["y_std"] - y_data["ball_land_y_std"]
y = y_data[['target_dx','target_dy']].copy()

Output data shape: (46045, 7)


In [13]:
from sklearn.model_selection import train_test_split

# Simple random split of rows (no game grouping needed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 36836, Test size: 9209


In [14]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preproc = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [15]:
from sklearn.model_selection import KFold, GridSearchCV
from tqdm.auto import tqdm

class TqdmGridSearchCV(GridSearchCV):
    def _run_search(self, evaluate_candidates):
        with tqdm(total=len(self.param_grid)) as pbar:
            def callback(*args, **kwargs):
                pbar.update(1)
                return evaluate_candidates(*args, **kwargs)
            super()._run_search(callback)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np


# --- 1. Base regressor & pipeline (what you already have) ---
base_reg = HistGradientBoostingRegressor(
    max_depth=6,
    learning_rate=0.05,
    max_iter=300,
)

reg = MultiOutputRegressor(base_reg)

model = Pipeline(steps=[
    ("preproc", preproc),   # your ColumnTransformer
    ("reg", reg),
])

# --- 2. Param grid for HistGB inside MultiOutputRegressor ---

param_grid = {
    "reg__estimator__max_depth": [3, 5, 7],
    "reg__estimator__learning_rate": [0.03, 0.05, 0.1],
    "reg__estimator__max_iter": [200, 400],
    "reg__estimator__min_samples_leaf": [20, 50],
    # Optionally:
    # "reg__estimator__l2_regularization": [0.0, 1e-3, 1e-2],
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid = TqdmGridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",   # MSE over both outputs
    cv=kf,
    n_jobs=-1,
    verbose=1,
)

# --- 3. Run grid search ---

grid.fit(X, y)

print("Best params:", grid.best_params_)
print("Best CV RMSE:", (-grid.best_score_) ** 0.5)

best_hgb_model = grid.best_estimator_

# --- 4. Fit best model on all data (optional) ---

best_hgb_model.fit(X, y)

# Predict:
# y_pred = best_hgb_model.predict(X_new)   # shape: (n_samples, 2) for [dx_hat, dy_hat]

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best params: {'reg__estimator__learning_rate': 0.1, 'reg__estimator__max_depth': 7, 'reg__estimator__max_iter': 400, 'reg__estimator__min_samples_leaf': 50}
Best CV RMSE: 1.5294171586879408


0,1,2
,steps,"[('preproc', ...), ('reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,HistGradientB...mples_leaf=50)
,n_jobs,

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,400
,max_leaf_nodes,31
,max_depth,7
,min_samples_leaf,50
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [19]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV

# Base regressor
xgb_base = xgb.XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",  # fast, CPU-friendly
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

xgb_pipe = Pipeline(steps=[
    ("preproc", preproc),               # your ColumnTransformer
    ("reg", MultiOutputRegressor(xgb_base)),
])

param_grid = {
    # these get passed to the inner estimator with 'reg__estimator__' prefix
    "reg__estimator__max_depth":   [3, 5, 7],
    "reg__estimator__learning_rate": [0.03, 0.1, 0.15],
    "reg__estimator__subsample":   [ 0.8, 1.0,],
    "reg__estimator__colsample_bytree": [0.8, 1.0],
}

kf = KFold(n_splits=5)

grid = TqdmGridSearchCV(
    xgb_pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=kf.split(X, y),
    n_jobs=-1,
    verbose=1,
)

grid.fit(X, y)

print("Best params:", grid.best_params_)
print("Best RMSE:", (-grid.best_score_) ** 0.5)

best_xgb_model = grid.best_estimator_

  0%|          | 0/4 [00:00<?, ?it/s]

Fitting 5 folds for each of 36 candidates, totalling 180 fits


 25%|██▌       | 1/4 [03:08<09:26, 188.94s/it]


Best params: {'reg__estimator__colsample_bytree': 1.0, 'reg__estimator__learning_rate': 0.1, 'reg__estimator__max_depth': 7, 'reg__estimator__subsample': 0.8}
Best RMSE: 1.5464355769387312


In [16]:
X.isnull().sum()

height_in              0
player_weight          0
birth_year             0
x_std                  0
y_std                  0
s                      0
a                      0
dir_std_sin            0
dir_std_cos            0
o_std_sin              0
o_std_cos              0
qb_x_std               0
qb_y_std               0
qb_s                   0
qb_a                   0
qb_o_std_sin           0
qb_o_std_cos           0
qb_dir_std_sin         0
qb_dir_std_cos         0
throw_frame_id         0
throw_land_frame_id    0
ball_land_x_std        0
ball_land_y_std        0
qb_throw_distance      0
qb_ball_dir_sin        0
qb_ball_dir_cos        0
player_position        0
player_role            0
player_side            0
dtype: int64

In [19]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline 
from sklearn.multioutput import MultiOutputRegressor

mlp_base = MLPRegressor(
    hidden_layer_sizes=(64, 64),
    activation="relu",
    solver="adam",
    learning_rate_init=1e-3,
    max_iter=500,
    random_state=42,
)

mlp_pipe = Pipeline(steps=[
    ("preproc", preproc),  # numeric + one-hot
    ("reg", MultiOutputRegressor(mlp_base)),
])

param_grid_mlp = {
    "reg__estimator__hidden_layer_sizes": [(64, 64), (128, 64)],
    "reg__estimator__alpha": [1e-4, 1e-3],     # L2
    "reg__estimator__learning_rate_init": [1e-3, 3e-4],
}

kf = KFold(n_splits=5)

grid_mlp = GridSearchCV(
    mlp_pipe,
    param_grid=param_grid_mlp,
    scoring="neg_mean_squared_error",
    cv=kf.split(X, y),
    n_jobs=-1,
    verbose=1,
)

grid_mlp.fit(X, y)

print("Best MLP params:", grid_mlp.best_params_)
print("Best MLP RMSE:", (-grid_mlp.best_score_) ** 0.5)

best_mlp_model = grid_mlp.best_estimator_

Fitting 5 folds for each of 8 candidates, totalling 40 fits


Best MLP params: {'reg__estimator__alpha': 0.001, 'reg__estimator__hidden_layer_sizes': (128, 64), 'reg__estimator__learning_rate_init': 0.0003}
Best MLP RMSE: 2.1262114193622694


## What was done
Created entire concatenated dataset and made it easy to retrieve
Corrected input variables
Did missing quarterback imputations
Tried different model types for single player-singlevector simple model
    Best RMSE of around 1.5 or so, and this is only for last frame before catch

## Still need to do
Expand out input set according to output frames, adding T as input variable
    Then see how simple model holds up using T
Move to CNN
    Try Zoo implementation
    Try Horton implementation
Move to Transformer / TranSportmer

In [None]:
"""
# TODO: 
# 1. Run below, and make sure it works and that you have method of train/val split to test it
# 2. Reserve some portion of the total/all data for real testing later
#     a. So also need to compile final test set
#     b. Save this out to pickle probably
# 3. Figure out how to interpolate other non-final frames in predictions
#     a. How will other models work knowing t the number of frames to predict?
# 4. How to turn this into one-shot model for all frames?
# 5. Think about other models to use for one-shot
#     a. Zoo-like solution
#     b. Horton feature representation
#     c. TranSportmer - like model?
"""
base_reg = HistGradientBoostingRegressor(
    max_depth=6,
    learning_rate=0.05,``
    max_iter=300,
)

reg = MultiOutputRegressor(base_reg)

model = Pipeline(steps=[
    ("preproc", preproc),
    ("reg", reg),
])

# --- 5. Train/val split (e.g. by game_id or week) ---

from sklearn.model_selection import GroupKFold, cross_val_score

groups = input_df["game_id"]  # to avoid leaking the same game into train & val

gkf = GroupKFold(n_splits=5)

scores = cross_val_score(
    model, X, y,
    cv=gkf,
    scoring="neg_mean_squared_error",
    groups=groups
)

print("CV RMSE per fold:", (-scores) ** 0.5)

# Finally fit on all data
model.fit(X, y)

ValueError: Found input variables with inconsistent numbers of samples: [2679, 2679, 285714]

In [None]:
y_data = output_df.merge(x_data[['game_id','play_id']], on=['game_id','play_id'], how='inner')

In [45]:
x_data.head(3)

Unnamed: 0,game_id,play_id,nfl_id_x,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x_std_x,y_std_x,ball_land_x_std_x,ball_land_y_std_x,o_std_x,dir_std_x,nfl_id_y,x_std_y,y_std_y,s,a,o_std_y,dir_std_y,ball_land_x_std_y,ball_land_y_std_y,throw_distance,ball_dir,direction_diff,ball_flight_time,frame_id,throw_frame_id,down,yards_to_go,quarter,game_clock,possession_team,defensive_team,pre_snap_possession_score,pre_snap_defensive_score,dropback_distance
0,2023090700,101,46137,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,12.13,19.25,21.259998,-0.22,206.88,130.9,43290,-6.59,29.99,0.64,0.47,212.25,108.83,21.259998,-0.22,41.08852,137.327657,74.922343,2.1,26,21,3,3,1,14:25,DET,KC,0,0,2.13
1,2023090700,101,52546,6-1,193,1997-01-21,CB,Defense,Defensive Coverage,5.89,13.23,21.259998,-0.22,337.56,96.81,43290,-6.59,29.99,0.64,0.47,212.25,108.83,21.259998,-0.22,41.08852,137.327657,74.922343,2.1,26,21,3,3,1,14:25,DET,KC,0,0,2.13
2,2023090700,101,44930,6-3,196,1995-02-16,WR,Offense,Targeted Receiver,6.59,14.35,21.259998,-0.22,95.8,86.6,43290,-6.59,29.99,0.64,0.47,212.25,108.83,21.259998,-0.22,41.08852,137.327657,74.922343,2.1,26,21,3,3,1,14:25,DET,KC,0,0,2.13


## Consolidated Play-Level Features

**Feature Checklist:**

### ✅ QB Trajectory Data (at time of throw)
- `x_std`, `y_std` - QB position
- `s` - QB speed
- `o_std` - QB orientation
- `dir_std` - QB direction
- `frame_id` - When thrown (frame number)
- `throw_distance` - Distance to ball landing point ✅ CALCULATED
- `ball_dir` - Direction of ball ✅ CALCULATED
- `direction_diff` - QB dir vs ball dir ✅ CALCULATED
- `ball_land_x_std`, `ball_land_y_std` - Ball landing location

### ✅ Supplemental Data
- `down` - Down number ✅
- `yards_to_go` - Distance to go ✅
- `possession_team` - Possession team ✅
- `defensive_team` - Defense team ✅
- `dropback_distance` - Dropback distance ✅
- `pre_snap_possession_score` - Possession team score ✅
- `pre_snap_defensive_score` - Defense team score ✅
- `pass_result` - Pass result ✅
- `quarter` - Quarter ✅
- `game_clock` - Game clock ✅

### ⚠️ MISSING Feature
- **"Throw time to reach"** - Time from throw to ball arrival (not yet calculated)
  - Can calculate as: `(throw_frame_id - frame_id) * 0.1` seconds

In [None]:
# Check for missing values
print("Missing value summary:")
print(play_level_features.isnull().sum()[play_level_features.isnull().sum() > 0])
print(f"\nFeature summary:")
print(f"  Total features: {len(play_level_features.columns)}")
print(f"  QB trajectory features: 10")
print(f"  Game situation features: 10")
print(f"  Identifiers: 3")

In [None]:
'''
# =====================
# A. Play-level features
# =====================
# 1. QB trajectory data 
QB x, y, speed, orientation at time of throw
When thrown by frame #
Distance to ball landing point at time of throw
Direction of ball
QB dir vs ball dir at time of throw

Throw time to reach by frame #


2. Supplemental data
Down and distance
Possession team
Defense team
Dropback distance
Score
Pass result

# =====================
# B. Player-level features
# =====================
Player role
Player side
Player position
Player height
Player weight
Player birthdate

# =====================
# C. Player-time-level features
# =====================
x, y
speed
direction
orientation
Models to try:
Baselines
Encode play-level features only
Encode play-level + player-level features
Encode play-level + player-level + player-time-level features frozen at throw time
'''

suppl_data['pre_snap_possession_score'] = np.where(
    suppl_data['possession_team'] == suppl_data['home_team_abbr'],
    suppl_data['pre_snap_home_score'], suppl_data['pre_snap_visitor_score']
    )
suppl_data['pre_snap_defensive_score'] = np.where(
    suppl_data['defensive_team'] == suppl_data['home_team_abbr'],
    suppl_data['pre_snap_home_score'], suppl_data['pre_snap_visitor_score']
    )
suppl_data_sub = suppl_data[['game_id','play_id','play_description','down','yards_to_go','quarter','game_clock','possession_team','defensive_team', 'pre_snap_possession_score', 'pre_snap_defensive_score', 'dropback_distance', 'pass_result']]

throw_frames = output_df.groupby(['game_id','play_id'])['frame_id'].max().reset_index().rename(columns={'frame_id':'throw_frame_id'})
throw_frames.sort_values(['game_id','play_id']).head(3)


# Create all play-level features
qb_frame = input_df[input_df['player_role'] == 'Passer']
if qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0] < len(distinct_plays):
    print(f"Warning: fewer plays with QB ({qb_frame[['game_id', 'play_id']].drop_duplicates().shape[0]}) than original plays ({len(distinct_plays)})")

qb_max_frame = qb_frame.groupby(['game_id', 'play_id', 'nfl_id', 'player_role'])['frame_id'].max()
qb_max_frame = qb_max_frame.reset_index()
qb_max_frame

qb_rows = pd.merge(input_df, qb_max_frame, on=['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_role'], how='inner')
qb_rows.head(3)

# qb_sub = qb_rows[['game_id','play_id','nfl_id','frame_id','x_std','y_std','s', 'a', 'o_std', 'dir_std', 'ball_land_x_std', 'ball_land_y_std', 'frame_id']].copy()
qb_sub = qb_rows
qb_sub['throw_distance'] = np.sqrt((qb_sub['ball_land_x_std'] - qb_sub['x_std'])**2 + (qb_sub['ball_land_y_std'] - qb_sub['y_std'])**2)
qb_sub['ball_dir'] = (90 - np.degrees(np.arctan2(
    qb_sub['ball_land_y_std'] - qb_sub['y_std'],
    qb_sub['ball_land_x_std'] - qb_sub['x_std']
))) % 360
qb_sub['direction_diff'] = (qb_sub['o_std'] - qb_sub['ball_dir'] + 180) % 360 - 180  # difference between -180 and 180

# qb_sub.sort_values(['game_id','play_id']).head(3)
qb_sub.head(5)

In [None]:
input_df[(input_df['game_id'] == 2023090700)
         &(input_df['play_id'] == 101)
         &(input_df['player_side'] == 'Offense')
         &(input_df['player_to_predict'] == True)].tail(3)
# Josh Reynolds at 52,14

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,absolute_yardline_number_std,x_std,y_std,ball_land_x_std,ball_land_y_std,o_std,dir_std
231,2023090700,101,True,44930,24,right,42,Josh Reynolds,6-3,196,1995-02-16,WR,Offense,Targeted Receiver,50.87,14.32,7.73,2.62,94.02,104.15,21,63.259998,-0.22,42,8.87,14.32,21.259998,-0.22,104.15,94.02
232,2023090700,101,True,44930,25,right,42,Josh Reynolds,6-3,196,1995-02-16,WR,Offense,Targeted Receiver,51.65,14.25,7.86,2.74,96.68,101.98,21,63.259998,-0.22,42,9.65,14.25,21.259998,-0.22,101.98,96.68
233,2023090700,101,True,44930,26,right,42,Josh Reynolds,6-3,196,1995-02-16,WR,Offense,Targeted Receiver,52.43,14.14,7.9,2.68,99.25,106.8,21,63.259998,-0.22,42,10.43,14.14,21.259998,-0.22,106.8,99.25


In [None]:
output_df[(output_df['game_id'] == 2023090700)
         &(output_df['play_id'] == 101)
         &(output_df['nfl_id'] == 44930)].tail(3)
# Josh Reynolds at 52,14

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y,play_direction,absolute_yardline_number,absolute_yardline_number_std,x_std,y_std
60,2023090700,101,44930,19,62.04,4.25,right,42,42,20.04,4.25
61,2023090700,101,44930,20,62.28,3.52,right,42,42,20.28,3.52
62,2023090700,101,44930,21,62.49,2.83,right,42,42,20.49,2.83
