In [14]:
import os
import nfl_data_py as nfl
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Calculate Expected Pass Rate

In [3]:
# Set CONSTANTS
SEASONS = [2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
# SEASONS = [2016,2017,2018,2019,2020,2021]

In [4]:
# get play by play for season
cols = ['game_id',
        'play_id',
        'epa',
        'season_type',
        'season','week',
        'qtr',
        'drive','series',
        'down',
        'ydstogo',
        'yards_gained',
        'wp',
        'vegas_wp',
        'ep',
        'yardline_100',
        'home_team',
        'away_team',
        'posteam','posteam_score',
        'defteam','defteam_score',
        'posteam_timeouts_remaining',
        'defteam_timeouts_remaining',
        'half_seconds_remaining',
        'game_half',
        'fixed_drive_result',
        'play_type',
        'penalty_team',
        'penalty_type',
        'penalty_yards']

df_pbp = nfl.import_pbp_data(years=SEASONS, columns=cols, downcast=True, cache=False)

2006 done.
2007 done.
2008 done.
2009 done.
2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


In [5]:
df_schd = nfl.import_schedules(years=SEASONS)
df_schd

Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,away_team,away_score,home_team,...,wind,away_qb_id,home_qb_id,away_qb_name,home_qb_name,away_coach,home_coach,referee,stadium_id,stadium
1845,2006_01_MIA_PIT,2006,REG,1,2006-09-07,Thursday,20:30,MIA,17.0,PIT,...,1.0,00-0003739,00-0000865,Daunte Culpepper,Charlie Batch,Nick Saban,Bill Cowher,Walt Coleman,PIT00,Heinz Field
1846,2006_01_ATL_CAR,2006,REG,1,2006-09-10,Sunday,13:00,ATL,20.0,CAR,...,5.0,00-0020245,00-0004161,Michael Vick,Jake Delhomme,Jim Mora,John Fox,Bill Carollo,CAR00,Bank of America Stadium
1847,2006_01_NO_CLE,2006,REG,1,2006-09-10,Sunday,13:00,NO,19.0,CLE,...,8.0,00-0020531,00-0023502,Drew Brees,Charlie Frye,Sean Payton,Romeo Crennel,Jerome Boger,CLE00,Cleveland Browns Stadium
1848,2006_01_SEA_DET,2006,REG,1,2006-09-10,Sunday,13:00,SEA,9.0,DET,...,,00-0007091,00-0009311,Matt Hasselbeck,Jon Kitna,Mike Holmgren,Rod Marinelli,Jeff Triplette,DET00,Ford Field
1849,2006_01_PHI_HOU,2006,REG,1,2006-09-10,Sunday,13:00,PHI,24.0,HOU,...,,00-0011022,00-0020608,Donovan McNabb,David Carr,Andy Reid,Gary Kubiak,Peter Morelli,HOU00,Reliant Stadium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6404,2022_18_NYG_PHI,2022,REG,18,2023-01-08,Sunday,13:00,NYG,,PHI,...,,,,,,Brian Daboll,Nick Sirianni,,PHI00,Lincoln Financial Field
6405,2022_18_CLE_PIT,2022,REG,18,2023-01-08,Sunday,13:00,CLE,,PIT,...,,,,,,Kevin Stefanski,Mike Tomlin,,PIT00,Acrisure Stadium
6406,2022_18_LA_SEA,2022,REG,18,2023-01-08,Sunday,13:00,LA,,SEA,...,,,,,,Sean McVay,Pete Carroll,,SEA00,Lumen Field
6407,2022_18_ARI_SF,2022,REG,18,2023-01-08,Sunday,13:00,ARI,,SF,...,,,,,,Kliff Kingsbury,Kyle Shanahan,,SFO01,Levi's Stadium


In [6]:
# create added features
df_pbp['score_diff'] = df_pbp.posteam_score - df_pbp.defteam_score
df_pbp['home'] = np.select([df_pbp.posteam == df_pbp.home_team],[1], default=0) 

era_conditions = [df_pbp.season.isin([2006,2007,2008,2009,2010,2011,2013]),
                  df_pbp.season.isin([2014,2015,2016,2017]),
                  df_pbp.season >= 2018]
era_choices = [2,3,4]
df_pbp['era'] = np.select(era_conditions, era_choices, default=np.NaN)

In [7]:
is_down = df_pbp['down'].isin([1,2,3,4])
is_runpass = df_pbp['play_type'].isin(['run','pass'])
df_pbp = df_pbp.merge(df_schd[['game_id','roof']], on=['game_id'])

df = df_pbp.loc[(is_runpass & is_down), ['play_type',
                                         'roof',
                                         'era',
                                         'score_diff',
                                         'home',
                                         'down',
                                         'ydstogo',
                                         'yardline_100',
                                         'game_half',
                                         'qtr',
                                         'half_seconds_remaining',
                                         'posteam_timeouts_remaining',
                                         'defteam_timeouts_remaining',
                                         'wp',
                                         'vegas_wp',]].dropna().copy()

print(df.play_type.value_counts())
df.sample(25)

pass    303062
run     217461
Name: play_type, dtype: int64


Unnamed: 0,play_type,roof,era,score_diff,home,down,ydstogo,yardline_100,game_half,qtr,half_seconds_remaining,posteam_timeouts_remaining,defteam_timeouts_remaining,wp,vegas_wp
691612,run,outdoors,4.0,0.0,1,2.0,5.0,40.0,Half2,3.0,1050.0,3.0,3.0,0.625071,0.680444
41149,pass,outdoors,2.0,-6.0,1,2.0,4.0,74.0,Half2,3.0,1576.0,3.0,3.0,0.310067,0.340662
695303,run,outdoors,4.0,0.0,1,2.0,3.0,63.0,Half1,1.0,1660.0,3.0,3.0,0.652432,0.314966
135944,run,outdoors,2.0,-10.0,1,2.0,1.0,39.0,Half2,3.0,1570.0,3.0,3.0,0.285878,0.351962
776593,pass,outdoors,4.0,0.0,0,1.0,10.0,37.0,Half2,3.0,1395.0,3.0,3.0,0.573317,0.706665
426943,run,closed,3.0,-4.0,0,1.0,10.0,93.0,Half2,4.0,392.0,3.0,2.0,0.231271,0.188022
525848,pass,outdoors,3.0,11.0,1,1.0,10.0,80.0,Half2,4.0,873.0,2.0,3.0,0.9224,0.952646
588122,pass,outdoors,4.0,0.0,1,1.0,15.0,23.0,Half1,1.0,1570.0,3.0,3.0,0.652965,0.831006
24986,run,outdoors,2.0,0.0,1,1.0,10.0,72.0,Half1,1.0,1795.0,3.0,3.0,0.553101,0.59696
405063,run,outdoors,3.0,-14.0,0,1.0,10.0,52.0,Half1,2.0,638.0,3.0,3.0,0.129087,0.032154


In [8]:
df.dtypes

play_type                      object
roof                           object
era                           float64
score_diff                    float32
home                            int64
down                          float32
ydstogo                       float32
yardline_100                  float32
game_half                      object
qtr                           float32
half_seconds_remaining        float32
posteam_timeouts_remaining    float32
defteam_timeouts_remaining    float32
wp                            float32
vegas_wp                      float32
dtype: object

In [58]:
cat_features = ['roof',
             'era',
             'home',
             'down',
             'qtr',
             'game_half']

cont_features = ['score_diff',
              'wp',
              'vegas_wp',
              'ydstogo',
              'yardline_100',
              'half_seconds_remaining',
              'posteam_timeouts_remaining',
              'defteam_timeouts_remaining']

In [59]:
categosrical_transformer = Pipeline(steps=[("one_hot", OneHotEncoder())])
preprocessor = ColumnTransformer(transformers=[("cat", categorical_transformer, cat_features)])

In [60]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor)]
)

#test pipeline
clf.fit(df[cat_features])

In [63]:
clf.transform(df[cat_features])

<520523x21 sparse matrix of type '<class 'numpy.float64'>'
	with 3123138 stored elements in Compressed Sparse Row format>