Imports

In [116]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import torch

In [117]:
pd.set_option('display.max_columns', None)

In [118]:
input_df01 = pd.read_csv("nfl-big-data-bowl-2026-prediction/train/input_2023_w01.csv")
output_df01 = pd.read_csv("nfl-big-data-bowl-2026-prediction/train/output_2023_w01.csv")

now taking first 100 rows data of both file and map it acoording to game_id,play_id,nfl_id to a single df

Data Preprocessing

In [119]:
# ==============================================
# 🧠 STEP 1: Define numeric and categorical columns
# ==============================================
numeric_cols = [
    "game_id", "play_id", "nfl_id", "frame_id",
    "absolute_yardline_number", "player_weight",
    "x", "y", "s", "a", "o", "dir",
    "num_frames_output", "ball_land_x", "ball_land_y",
     # ✅ added your output targets here
]

cat_cols = [
    "player_position", "player_side", 
    "player_role", "play_direction"
]

# ==============================================
# 🧩 STEP 2: Convert numeric columns safely
# ==============================================
existing_numeric_cols = [c for c in numeric_cols if c in input_df01.columns]
for col in existing_numeric_cols:
    input_df01[col] = pd.to_numeric(input_df01[col], errors='coerce')

# ==============================================
# 🧩 STEP 3: One-hot encode categorical columns
# ==============================================
# Filter only columns that exist in the current DataFrame
cat_cols = [c for c in cat_cols if c in input_df01.columns]

# Fill NaNs temporarily (so encoder doesn’t skip)
input_df01[cat_cols] = input_df01[cat_cols].fillna("Unknown")

# One-hot encode all categorical columns together
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = ohe.fit_transform(input_df01[cat_cols])

# Get encoded column names
encoded_cols = ohe.get_feature_names_out(cat_cols)

# Create encoded DataFrame
encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=input_df01.index)

# Drop original categorical columns and merge encoded ones
input_df01 = pd.concat([input_df01.drop(columns=cat_cols), encoded_df], axis=1)


# ==============================================
# 🧩 STEP 4: Boolean and special conversions
# ==============================================
if "player_to_predict" in input_df01.columns:
    input_df01["player_to_predict"] = (
        input_df01["player_to_predict"].astype(str)
        .map({"True": 1, "False": 0, "1": 1, "0": 0})
        .fillna(0)
        .astype(int)
    )

if "player_height" in input_df01.columns:
    def height_to_inches(h):
        try:
            ft, inch = h.split('-')
            return int(ft) * 12 + int(inch)
        except Exception:
            return np.nan
    input_df01["player_height_in"] = input_df01["player_height"].apply(height_to_inches)

# ==============================================
# 🧩 STEP 5: Keep only numeric columns
# ==============================================
input_df01 = input_df01.select_dtypes(include=[np.number])
input_df01 = input_df01.dropna()

if input_df01.empty:
    print("⚠️ Skipped file (no usable numeric data).")
else:
    print(f"✅ Processed shape: {input_df01.shape}")

# ==============================================
# 🧩 STEP 6: Convert to PyTorch Tensor
# ==============================================
# data_tensor = torch.tensor(final_df.values, dtype=torch.float32)
# print("✅ Tensor shape:", data_tensor.shape)


✅ Processed shape: (285714, 40)


In [120]:
input_df01.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,absolute_yardline_number,player_weight,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,player_position_CB,player_position_DE,player_position_DT,player_position_FB,player_position_FS,player_position_ILB,player_position_MLB,player_position_NT,player_position_OLB,player_position_QB,player_position_RB,player_position_S,player_position_SS,player_position_TE,player_position_WR,player_side_Defense,player_side_Offense,player_role_Defensive Coverage,player_role_Other Route Runner,player_role_Passer,player_role_Targeted Receiver,play_direction_left,play_direction_right,player_height_in
0,2023090700,101,0,54527,1,42,210,52.33,36.94,0.09,0.39,322.4,238.24,21,63.259998,-0.22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,73
1,2023090700,101,0,54527,2,42,210,52.33,36.94,0.04,0.61,200.89,236.05,21,63.259998,-0.22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,73
2,2023090700,101,0,54527,3,42,210,52.33,36.93,0.12,0.73,147.55,240.6,21,63.259998,-0.22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,73
3,2023090700,101,0,54527,4,42,210,52.35,36.92,0.23,0.81,131.4,244.25,21,63.259998,-0.22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,73
4,2023090700,101,0,54527,5,42,210,52.37,36.9,0.35,0.82,123.26,244.25,21,63.259998,-0.22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,73


In [121]:
output_df01.shape

(32088, 6)

In [134]:
input_df01.player_to_predict.value_counts()

player_to_predict
0    209315
1     76399
Name: count, dtype: int64

In [135]:
input_df01.frame_id.value_counts()

frame_id
1     10089
2     10089
3     10089
4     10089
5     10089
      ...  
70       12
71       12
72       12
73       12
74       12
Name: count, Length: 74, dtype: int64

In [133]:
output_df01.frame_id.value_counts()

frame_id
1     2679
2     2679
3     2679
4     2679
5     2679
      ... 
90       8
91       8
92       8
93       8
94       8
Name: count, Length: 94, dtype: int64

In [130]:
# Load the files
input_df = input_df01.head(100)
output_df = output_df01.head(100)

# Merge input and output on keys
train_df = input_df.merge(
    output_df,
    on=["game_id", "play_id", "nfl_id", "frame_id"],
    how="inner"   # only rows that exist in output (targets)
)

print(train_df.shape)
print(train_df.head())

(42, 42)
      game_id  play_id  player_to_predict  nfl_id  frame_id  \
0  2023090700      101                  1   46137         1   
1  2023090700      101                  1   46137         2   
2  2023090700      101                  1   46137         3   
3  2023090700      101                  1   46137         4   
4  2023090700      101                  1   46137         5   

   absolute_yardline_number  player_weight    x_x    y_x     s     a     dir  \
0                        42            204  51.32  20.69  0.31  0.49   79.43   
1                        42            204  51.35  20.66  0.36  0.74  118.07   
2                        42            204  51.39  20.63  0.44  0.76  130.89   
3                        42            204  51.43  20.61  0.48  0.62  134.50   
4                        42            204  51.48  20.58  0.54  0.44  129.79   

        o  num_frames_output  ball_land_x  ball_land_y  player_position_CB  \
0  267.68                 21    63.259998        -0.2

In [123]:
# Take first 100 rows
input_sample = input_df01
output_sample = output_df01

# Keep only merge keys + x, y (renamed)
output_sample = output_sample[['game_id', 'play_id', 'nfl_id', 'frame_id', 'x', 'y']].rename(columns={'x': 'tx', 'y': 'ty'})

# Merge based on shared identifiers
final_df = pd.merge(
    input_sample,
    output_sample,
    on=['game_id', 'play_id', 'nfl_id','frame_id'],
    how='left'   # 'left' keeps all input rows even if no match in output
)

final_df = final_df.dropna(subset=["tx", "ty"])
# Save final combined file (optional)
final_df.to_csv("merged_first_w01.csv", index=False)

print("✅ Final merged DataFrame shape:", final_df.shape)
print(final_df.head())

✅ Final merged DataFrame shape: (31446, 42)
       game_id  play_id  player_to_predict  nfl_id  frame_id  \
26  2023090700      101                  1   46137         1   
27  2023090700      101                  1   46137         2   
28  2023090700      101                  1   46137         3   
29  2023090700      101                  1   46137         4   
30  2023090700      101                  1   46137         5   

    absolute_yardline_number  player_weight      x      y     s     a     dir  \
26                        42            204  51.32  20.69  0.31  0.49   79.43   
27                        42            204  51.35  20.66  0.36  0.74  118.07   
28                        42            204  51.39  20.63  0.44  0.76  130.89   
29                        42            204  51.43  20.61  0.48  0.62  134.50   
30                        42            204  51.48  20.58  0.54  0.44  129.79   

         o  num_frames_output  ball_land_x  ball_land_y  player_position_CB  \
26  2

In [124]:
final_df = pd.read_csv("merged_first_w01.csv")

In [125]:
# ==============================================
# 🧩 STEP: Split into inputs (X) and targets (y)
# ==============================================
target_cols = ["tx", "ty"]
input_cols = [c for c in final_df.columns if c not in target_cols]

X = final_df[input_cols].copy()
y = final_df[target_cols].copy()

print(f"✅ X shape: {X.shape}, y shape: {y.shape}")
print("X sample:\n", X.head())
print("y sample:\n", y.head())

✅ X shape: (31446, 40), y shape: (31446, 2)
X sample:
       game_id  play_id  player_to_predict  nfl_id  frame_id  \
0  2023090700      101                  1   46137         1   
1  2023090700      101                  1   46137         2   
2  2023090700      101                  1   46137         3   
3  2023090700      101                  1   46137         4   
4  2023090700      101                  1   46137         5   

   absolute_yardline_number  player_weight      x      y     s     a     dir  \
0                        42            204  51.32  20.69  0.31  0.49   79.43   
1                        42            204  51.35  20.66  0.36  0.74  118.07   
2                        42            204  51.39  20.63  0.44  0.76  130.89   
3                        42            204  51.43  20.61  0.48  0.62  134.50   
4                        42            204  51.48  20.58  0.54  0.44  129.79   

        o  num_frames_output  ball_land_x  ball_land_y  player_position_CB  \
0  267.

Simple Regressiion MOdel Building

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [127]:
# ==============================================
# 🏈 NFL Coordinate Regression Model Comparison
# ==============================================


# Optional: include XGBoost if available
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

# ==============================================
# 1️⃣ Load cleaned data
# ==============================================
df = pd.read_csv("merged_first_w01.csv")

# Ensure no NaN
df = df.dropna()

# Split into features (X) and targets (y)
target_cols = ["tx", "ty"]
X = df.drop(columns=target_cols)
y = df[target_cols]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"✅ Data split -> Train: {X_train.shape}, Test: {X_test.shape}")

# ==============================================
# 2️⃣ Define models to test
# ==============================================
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

if HAS_XGB:
    models["XGBoost"] = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)

# ==============================================
# 3️⃣ Train and Evaluate
# ==============================================
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append({
        "Model": name,
        "RMSE": rmse,
        "R2 Score": r2
    })

    print(f"\n📊 {name} Results:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²:   {r2:.4f}")

# ==============================================
# 4️⃣ Compare model performance
# ==============================================
results_df = pd.DataFrame(results)
print("\n===============================")
print("🏁 Model Performance Comparison")
print("===============================")
print(results_df)

# Save results (optional)
results_df.to_csv("model_comparison_results_for_w01.csv", index=False)
print("\n✅ Results saved to 'model_comparison_results.csv'")

✅ Data split -> Train: (25156, 40), Test: (6290, 40)

📊 Linear Regression Results:
  RMSE: 4.5545
  R²:   0.9225

📊 Random Forest Results:
  RMSE: 0.6686
  R²:   0.9983

📊 XGBoost Results:
  RMSE: 1.7281
  R²:   0.9893

🏁 Model Performance Comparison
               Model      RMSE  R2 Score
0  Linear Regression  4.554519  0.922539
1      Random Forest  0.668562  0.998306
2            XGBoost  1.728111  0.989251

✅ Results saved to 'model_comparison_results.csv'
