In [1]:
import pandas as pd
import os

In [2]:
file_path = '../data/processed/play_by_play_2023_smote_processed.csv'
pbp_data = pd.read_csv(file_path)

print(f"Shape of the dataset: {pbp_data.shape}")
pbp_data.head()

Shape of the dataset: (93602, 5204)


Unnamed: 0,play_id,old_game_id,week,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,...,fantasy_id_00-0037746,fantasy_id_00-0037840,fantasy_id_00-0038120,fantasy_id_00-0038134,fantasy_id_00-0038542,fantasy_id_00-0039032,fantasy_id_00-0039075,fantasy_id_00-0039139,fantasy_id_Other,play_success
0,1.0,2023091000.0,1.0,0.510204,900.0,1800.0,1.0,0.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,39.0,2023091000.0,1.0,0.346939,900.0,1800.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,55.0,2023091000.0,1.0,0.755102,900.0,1800.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,77.0,2023091000.0,1.0,0.72449,870.0,1770.0,0.991667,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,102.0,2023091000.0,1.0,0.663265,835.0,1735.0,0.981944,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [3]:
columns = pbp_data.columns
print(columns)

Index(['play_id', 'old_game_id', 'week', 'yardline_100',
       'quarter_seconds_remaining', 'half_seconds_remaining',
       'game_seconds_remaining', 'quarter_end', 'drive', 'sp',
       ...
       'fantasy_id_00-0037746', 'fantasy_id_00-0037840',
       'fantasy_id_00-0038120', 'fantasy_id_00-0038134',
       'fantasy_id_00-0038542', 'fantasy_id_00-0039032',
       'fantasy_id_00-0039075', 'fantasy_id_00-0039139', 'fantasy_id_Other',
       'play_success'],
      dtype='object', length=5204)


In [4]:
# Group features relevant to game, play, and players
game_context_features = ['week', 'yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 
                         'game_seconds_remaining', 'quarter_end']

play_features = ['drive', 'sp', 'play_id', 'old_game_id', 'play_success']

player_features = [col for col in columns if 'fantasy_id' in col]

exclude_features = ['play_id', 'old_game_id']

print("Game Context Features:", game_context_features)
print("Play Features:", play_features)
print("Player Features (showing first 10):", player_features[:10])
print("Columns marked for exclusion:", exclude_features)

Game Context Features: ['week', 'yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'quarter_end']
Play Features: ['drive', 'sp', 'play_id', 'old_game_id', 'play_success']
Player Features (showing first 10): ['fantasy_id_00-0031381', 'fantasy_id_00-0031588', 'fantasy_id_00-0031687', 'fantasy_id_00-0032764', 'fantasy_id_00-0033040', 'fantasy_id_00-0033045', 'fantasy_id_00-0033280', 'fantasy_id_00-0033293', 'fantasy_id_00-0033553', 'fantasy_id_00-0033699']
Columns marked for exclusion: ['play_id', 'old_game_id']


In [5]:
processed_data_path = "../data/processed/play_by_play_2023_smote_processed.csv"
pbp_data_cleaned = pd.read_csv(processed_data_path)

pbp_data_cleaned = pbp_data_cleaned.drop(columns=exclude_features, errors='ignore')

print("Remaining columns after excluding irrelevant features:")
print(pbp_data_cleaned.columns)

Remaining columns after excluding irrelevant features:
Index(['week', 'yardline_100', 'quarter_seconds_remaining',
       'half_seconds_remaining', 'game_seconds_remaining', 'quarter_end',
       'drive', 'sp', 'qtr', 'down',
       ...
       'fantasy_id_00-0037746', 'fantasy_id_00-0037840',
       'fantasy_id_00-0038120', 'fantasy_id_00-0038134',
       'fantasy_id_00-0038542', 'fantasy_id_00-0039032',
       'fantasy_id_00-0039075', 'fantasy_id_00-0039139', 'fantasy_id_Other',
       'play_success'],
      dtype='object', length=5202)


In [6]:
print("Dataset shape:", pbp_data_cleaned.shape)
print("Columns in the dataset:", pbp_data_cleaned.columns)

missing_values = pbp_data_cleaned[['play_success']].isnull().sum()
print("Missing values in 'play_success':\n", missing_values)

Dataset shape: (93602, 5202)
Columns in the dataset: Index(['week', 'yardline_100', 'quarter_seconds_remaining',
       'half_seconds_remaining', 'game_seconds_remaining', 'quarter_end',
       'drive', 'sp', 'qtr', 'down',
       ...
       'fantasy_id_00-0037746', 'fantasy_id_00-0037840',
       'fantasy_id_00-0038120', 'fantasy_id_00-0038134',
       'fantasy_id_00-0038542', 'fantasy_id_00-0039032',
       'fantasy_id_00-0039075', 'fantasy_id_00-0039139', 'fantasy_id_Other',
       'play_success'],
      dtype='object', length=5202)
Missing values in 'play_success':
 play_success    0
dtype: int64


In [7]:
exclude_features = ['play_id', 'old_game_id'] 
pbp_data_cleaned = pbp_data_cleaned.drop(columns=exclude_features, errors='ignore')
print(f"Remaining columns after excluding irrelevant features: {len(pbp_data_cleaned.columns)}")

Remaining columns after excluding irrelevant features: 5202


In [8]:
game_context_features = ['week', 'yardline_100', 'quarter_seconds_remaining', 
                         'half_seconds_remaining', 'game_seconds_remaining', 
                         'quarter_end', 'score_differential']

play_features = ['drive', 'sp', 'down', 'yards_gained', 'pass_oe']

player_features = [col for col in pbp_data_cleaned.columns if 'fantasy_id' in col]

custom_features = ['qb_epa', 'xyac_epa', 'xyac_mean_yardage', 'xyac_median_yardage']

print("Game Context Features:", game_context_features)
print("Play Features:", play_features)
print("Player Features (first 10):", player_features[:10])
print("Custom Features:", custom_features)

Game Context Features: ['week', 'yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'quarter_end', 'score_differential']
Play Features: ['drive', 'sp', 'down', 'yards_gained', 'pass_oe']
Player Features (first 10): ['fantasy_id_00-0031381', 'fantasy_id_00-0031588', 'fantasy_id_00-0031687', 'fantasy_id_00-0032764', 'fantasy_id_00-0033040', 'fantasy_id_00-0033045', 'fantasy_id_00-0033280', 'fantasy_id_00-0033293', 'fantasy_id_00-0033553', 'fantasy_id_00-0033699']
Custom Features: ['qb_epa', 'xyac_epa', 'xyac_mean_yardage', 'xyac_median_yardage']


In [9]:
from sklearn.impute import SimpleImputer

# Finding and imputing missing numerical feature values with median category value
numerical_features_to_impute = ['yardline_100', 'score_differential', 
                                 'qb_epa', 'xyac_epa', 
                                 'xyac_mean_yardage', 'xyac_median_yardage']
for col in numerical_features_to_impute:
    pbp_data_cleaned[col].fillna(pbp_data_cleaned[col].median(), inplace=True)

# Finding and imputing missing vlaues in 'play_type' and 'side_of_field' with placeholder value 0
side_of_field_columns = [col for col in pbp_data_cleaned.columns if 'side_of_field_' in col]
play_type_columns = [col for col in pbp_data_cleaned.columns if 'play_type_' in col]
for col in side_of_field_columns + play_type_columns:
    pbp_data_cleaned[col].fillna(0, inplace=True)

print(f"Remaining missing values:\n{pbp_data_cleaned.isnull().sum().sum()}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pbp_data_cleaned[col].fillna(pbp_data_cleaned[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pbp_data_cleaned[col].fillna(pbp_data_cleaned[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work becaus

Remaining missing values:
0


In [10]:
for col in pbp_data_cleaned.columns:
    print(col)

week
yardline_100
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
quarter_end
drive
sp
qtr
down
goal_to_go
ydstogo
ydsnet
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
air_yards
yards_after_catch
kick_distance
home_timeouts_remaining
away_timeouts_remaining
timeout
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa
total_away_rush_epa
total_home_pass_epa
total_away_pass_epa
air_epa
yac_epa
comp_air_epa
comp_yac_epa
total_home_comp_air_epa
total_away_comp_air_epa
total_home_comp_yac_epa
total_away_comp_yac_epa
total_home_raw_air_epa
total_away_raw_air_epa
total_home_raw_yac_epa
total_away_raw_yac_epa
wp
def_wp
home

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Define numerical features
numerical_features_to_scale = ['yardline_100', 'quarter_seconds_remaining', 
                               'half_seconds_remaining', 'game_seconds_remaining', 
                               'score_differential', 'qb_epa', 
                               'xyac_epa', 'xyac_mean_yardage', 'xyac_median_yardage']

# Noramlize numerical features using MinMaxScaler
scaler = MinMaxScaler()
pbp_data_cleaned[numerical_features_to_scale] = scaler.fit_transform(pbp_data_cleaned[numerical_features_to_scale])

print(pbp_data_cleaned[numerical_features_to_scale].describe())

       yardline_100  quarter_seconds_remaining  half_seconds_remaining  \
count  93602.000000               93602.000000            93602.000000   
mean       0.526161                   0.471211                0.460911   
std        0.213956                   0.307656                0.305174   
min        0.000000                   0.000000                0.000000   
25%        0.367347                   0.183632                0.182549   
50%        0.544525                   0.455556                0.452222   
75%        0.702725                   0.742345                0.718889   
max        1.000000                   1.000000                1.000000   

       game_seconds_remaining  score_differential        qb_epa      xyac_epa  \
count            93602.000000        93602.000000  93602.000000  93602.000000   
mean                 0.484578            0.477624      0.610921      0.389482   
std                  0.288685            0.146783      0.057355      0.031113   
min      

In [12]:
output_path = "../data/processed/play_by_play_2023_final_features.csv"
pbp_data_cleaned.to_csv(output_path, index=False)

In [13]:
target_variable = ['play_success']
time_pressure_encoded = [col for col in pbp_data_cleaned.columns if col.startswith('time_pressure_')]
play_type_encoded = [col for col in pbp_data_cleaned.columns if col.startswith('play_type_')]
side_of_field_encoded = [col for col in pbp_data_cleaned.columns if col.startswith('side_of_field_')]

# Grouped tuple of all features relevant to model
relevant_features = (
    game_context_features 
    + play_features 
    + target_variable 
    + custom_features 
    + time_pressure_encoded 
    + play_type_encoded 
    + side_of_field_encoded)

# Define subset of dataset containing relevant features only
pbp_final_features = pbp_data_cleaned[relevant_features]
print(f"Filtered dataset shape: {pbp_final_features.shape}")
print(f"Filtered columns: {pbp_final_features.columns.tolist()}")

Filtered dataset shape: (93602, 76)
Filtered columns: ['week', 'yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'quarter_end', 'score_differential', 'drive', 'sp', 'down', 'yards_gained', 'pass_oe', 'play_success', 'qb_epa', 'xyac_epa', 'xyac_mean_yardage', 'xyac_median_yardage', 'time_pressure_end_game', 'time_pressure_mid_game', 'play_type_field_goal', 'play_type_kickoff', 'play_type_no_play', 'play_type_pass', 'play_type_punt', 'play_type_qb_kneel', 'play_type_qb_spike', 'play_type_run', 'play_type_nfl_END_GAME', 'play_type_nfl_END_QUARTER', 'play_type_nfl_FIELD_GOAL', 'play_type_nfl_FUMBLE_RECOVERED_BY_OPPONENT', 'play_type_nfl_GAME_START', 'play_type_nfl_INTERCEPTION', 'play_type_nfl_KICK_OFF', 'play_type_nfl_PASS', 'play_type_nfl_PAT2', 'play_type_nfl_PENALTY', 'play_type_nfl_PUNT', 'play_type_nfl_RUSH', 'play_type_nfl_SACK', 'play_type_nfl_TIMEOUT', 'play_type_nfl_UNSPECIFIED', 'play_type_nfl_XP_KICK', 'side_of_field_ARI', 'side_of

In [14]:
missing_values = pbp_final_features.isnull().sum().sum()
print(f"Total missing values in filtered dataset: {missing_values}")

Total missing values in filtered dataset: 0


In [18]:
output_path = "../data/processed/play_by_play_2023_filtered.csv"
pbp_final_features.to_csv(output_path, index=False)
print(f"Filtered dataset saved successfully at: {output_path}")

Filtered dataset saved successfully at: ../data/processed/play_by_play_2023_filtered.csv


In [16]:
for col in pbp_final_features.columns:
    print(col)

week
yardline_100
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
quarter_end
score_differential
drive
sp
down
yards_gained
pass_oe
play_success
qb_epa
xyac_epa
xyac_mean_yardage
xyac_median_yardage
time_pressure_end_game
time_pressure_mid_game
play_type_field_goal
play_type_kickoff
play_type_no_play
play_type_pass
play_type_punt
play_type_qb_kneel
play_type_qb_spike
play_type_run
play_type_nfl_END_GAME
play_type_nfl_END_QUARTER
play_type_nfl_FIELD_GOAL
play_type_nfl_FUMBLE_RECOVERED_BY_OPPONENT
play_type_nfl_GAME_START
play_type_nfl_INTERCEPTION
play_type_nfl_KICK_OFF
play_type_nfl_PASS
play_type_nfl_PAT2
play_type_nfl_PENALTY
play_type_nfl_PUNT
play_type_nfl_RUSH
play_type_nfl_SACK
play_type_nfl_TIMEOUT
play_type_nfl_UNSPECIFIED
play_type_nfl_XP_KICK
side_of_field_ARI
side_of_field_ATL
side_of_field_BAL
side_of_field_BUF
side_of_field_CAR
side_of_field_CHI
side_of_field_CIN
side_of_field_CLE
side_of_field_DAL
side_of_field_DEN
side_of_field_DET
side_of_field_G

In [19]:
pbp_final_features.head()

Unnamed: 0,week,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,score_differential,drive,sp,down,yards_gained,pass_oe,play_success,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,time_pressure_end_game,time_pressure_mid_game,play_type_field_goal,play_type_kickoff,play_type_no_play,play_type_pass,play_type_punt,play_type_qb_kneel,play_type_qb_spike,play_type_run,play_type_nfl_END_GAME,play_type_nfl_END_QUARTER,play_type_nfl_FIELD_GOAL,play_type_nfl_FUMBLE_RECOVERED_BY_OPPONENT,play_type_nfl_GAME_START,play_type_nfl_INTERCEPTION,play_type_nfl_KICK_OFF,play_type_nfl_PASS,play_type_nfl_PAT2,play_type_nfl_PENALTY,play_type_nfl_PUNT,play_type_nfl_RUSH,play_type_nfl_SACK,play_type_nfl_TIMEOUT,play_type_nfl_UNSPECIFIED,play_type_nfl_XP_KICK,side_of_field_ARI,side_of_field_ATL,side_of_field_BAL,side_of_field_BUF,side_of_field_CAR,side_of_field_CHI,side_of_field_CIN,side_of_field_CLE,side_of_field_DAL,side_of_field_DEN,side_of_field_DET,side_of_field_GB,side_of_field_HOU,side_of_field_IND,side_of_field_JAX,side_of_field_KC,side_of_field_LA,side_of_field_LAC,side_of_field_LV,side_of_field_MIA,side_of_field_MIN,side_of_field_NE,side_of_field_NO,side_of_field_NYG,side_of_field_NYJ,side_of_field_Other,side_of_field_PHI,side_of_field_PIT,side_of_field_SEA,side_of_field_SF,side_of_field_TB,side_of_field_TEN,side_of_field_WAS
0,1.0,0.510204,1.0,1.0,1.0,0.0,0.488889,11.0,0.0,2.0,0.0,4.06515,0,0.5835,0.390446,0.204605,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.346939,1.0,1.0,1.0,0.0,0.488889,1.0,0.0,2.0,0.0,4.06515,0,0.5835,0.390446,0.204605,0.1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.755102,1.0,1.0,1.0,0.0,0.488889,1.0,0.0,1.0,3.0,-51.505846,0,0.56774,0.390446,0.204605,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.72449,0.966667,0.983333,0.991667,0.0,0.488889,1.0,0.0,2.0,6.0,33.889407,0,0.616479,0.362442,0.139399,0.033333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.663265,0.927778,0.963889,0.981944,0.0,0.488889,1.0,0.0,3.0,2.0,-19.606467,0,0.605529,0.390446,0.204605,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
