In [16]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, r2_score, confusion_matrix, classification_report
)


from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier

First, we need to read in our data, clean it, and select the appropriate columns for our analysis.

In [17]:
# Load the raw 2025 play-by-play table (one row per play).
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# pandas' DtypeWarning does for mixed-type columns. low_memory=False makes pandas
# read each column in one pass, so dtype inference is consistent across the file.
football_data = pd.read_csv("play_by_play_2025.csv", low_memory=False)
football_data.head()

  football_data = pd.read_csv("play_by_play_2025.csv")


Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,1,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,,,,...,0,0,0.0,,,,,,,
1,40,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,0,0,-0.3527,,,,,,,
2,63,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,0,0,-0.190052,,,,,,0.511128,-51.112807
3,85,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,1,0,1.31734,0.939998,4.750889,3.0,0.666726,0.43911,0.66894,33.105969
4,115,2025_01_ARI_NO,2025090705,NO,ARI,REG,1,ARI,away,NO,...,0,0,-1.69436,,,,,,0.492038,50.796208


In this dataset, each row is a single play. The features of this dataset are a wide variety of information about that particular play. There are string columns that identify which teams are playing, like `home_team`; there are integer columns that describe where the game falls in the NFL calendar, like `week`; and there are binary indicator columns like `out_of_bounds` that contain a 1 if the play ended with the ball going out of bounds and a 0 otherwise.

For our analysis, it is important to only include features that are available before the play we are trying to predict actually happens. For example, we cannot include a feature like `yards_thrown`, because it is only known after the play and therefore implicitly reveals information about the target that we are trying to predict. Including it would be cheating (target leakage).

Now we need to select relevant columns and separate our data into features and targets.

In [19]:
# Columns pulled from the play-by-play table for this analysis.
# The order here is the column order of every frame selected with `features`.
# NOTE(review): `play_type` is the column later used as the prediction target,
# and `first_down` looks like a play outcome rather than pre-snap information;
# neither should be fed to a model as an input.
features = [
    "down",
    "ydstogo",
    "qtr",
    "goal_to_go",
    "yardline_100",
    "time",
    "game_seconds_remaining",
    "score_differential",
    "play_type",
    "half_seconds_remaining",
    "first_down",
]

In [20]:
# Preview every row of the raw table, restricted to the selected columns.
football_data.loc[:, features]

Unnamed: 0,down,ydstogo,qtr,goal_to_go,yardline_100,time,game_seconds_remaining,score_differential,play_type,half_seconds_remaining,first_down
0,,0,1,0,,15:00,3600.0,,,1800.0,
1,,0,1,0,35.0,15:00,3600.0,0.0,kickoff,1800.0,0.0
2,1.0,10,1,0,78.0,14:56,3596.0,0.0,run,1796.0,0.0
3,2.0,7,1,0,75.0,14:18,3558.0,0.0,pass,1758.0,1.0
4,1.0,10,1,0,64.0,13:40,3520.0,0.0,pass,1720.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
48233,4.0,1,4,0,77.0,02:10,130.0,35.0,punt,130.0,0.0
48234,1.0,10,4,0,79.0,01:54,114.0,-35.0,run,114.0,0.0
48235,2.0,3,4,0,72.0,01:21,81.0,-35.0,run,81.0,0.0
48236,3.0,1,4,0,70.0,00:39,39.0,-35.0,run,39.0,1.0


Let's clean this data to get rid of all the non-relevant play types. Field goals, kickoffs, punts (TBD), 2-pt conversions, and game row dividers all need to go.

In [21]:
# Keep only the plays whose call (run vs. pass) we want to predict.
# isin() is False for missing values, so rows with no play_type (game/quarter
# divider rows) are removed here as well; no separate dropna is needed.
is_run_or_pass = football_data["play_type"].isin(["run", "pass"])

# The play-type filter alone does not remove two-point conversion attempts.
# NOTE(review): in nflverse-style play-by-play, two-point tries are labelled
# "run"/"pass" but carry no `down`; confirm against `two_point_attempt` if that
# column is available. Requiring a down also removes NaNs from a model input.
has_down = football_data["down"].notna()

football_data_cleaned = (
    football_data
    .loc[is_run_or_pass & has_down]
    .reset_index(drop=True)
)

Now we have gotten rid of all the plays that are irrelevant to our analysis. We have decided that we are going to train, validate, and test on the 2025 data. We've chosen to do a 60%, 20%, 20% split between train, validation, and testing, respectively.

In [None]:
# Columns that must never be model inputs:
#   - "play_type" is the label itself (it is assigned to `y` below), so leaving
#     it in X lets any model read the answer directly.
#   - "first_down" is only known once the play is over.
#     NOTE(review): the preview output shows it set to 1.0 on a 2nd-and-7 pass,
#     i.e. it looks like the play's result; confirm with the data dictionary.
TARGET_COL = "play_type"
LEAKY_COLS = [TARGET_COL, "first_down"]

# errors="ignore" keeps this cell working if `features` stops listing these columns.
X = football_data_cleaned[features].drop(columns=LEAKY_COLS, errors="ignore")
y = football_data_cleaned[TARGET_COL]

# Step 1: hold out 40% of the plays; the remaining 60% is the training set.
# Stratifying keeps the run/pass ratio the same in every split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    random_state=7,
    stratify=y
)

# Step 2: cut the 40% hold-out in half -> 20% validation, 20% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)