In [1]:
import nflreadpy as nfl
import polars as pl

In [2]:
raw_nfl_data = nfl.load_schedules(range(1999, 2025))

## GAME DATA
Read in all data relating to nfl games. For each game played since 1999, get the following: 

1. **game_id**: unique identifier for the specific game
2. **season**: year of the season the game belongs to. Note that, for example, all games of the 2024 season have a season value of 2024 even if they were played in 2025 (eg Eagles vs Chiefs in the Super Bowl belongs to the 2024 season, even though the game was in 2025). 
3. **week**: week number
4. **gameday**: date of game being played in YYYY-MM-DD format
5. **home_team**: abbreviation for home team
6. **away_team**: abbreviation for away team
7. **stadium**: name of stadium the game was played at
8. **roof**: status of the roof. Most teams are either "outdoors" or "dome," but for teams with retractable roofs it shows the status of the roof for the given game
9. **is_dome**: true/false for if game was a dome or not
10. **is_soldier_field**: true/false for if game was played at Soldier Field

In [3]:
game_data = (
    raw_nfl_data
    .select([
        "game_id",
        "season",
        "week",
        "gameday",
        "home_team",
        "away_team",
        "stadium",
        "roof",
    ])
    .with_columns([
        # Dome / roof closed flag
        pl.col("roof")
        .cast(pl.Utf8)
        .str.to_lowercase()
        .str.contains("dome|closed")
        .alias("is_dome"),

        # Soldier Field flag
        pl.col("stadium")
        .cast(pl.Utf8)
        .str.to_lowercase()
        .str.contains("soldier", literal=False)
        .alias("is_soldier_field"),
    ])
    .sort(["season", "week", "game_id"])
)

In [10]:
for c in game_data.columns:
    assert game_data.select(pl.col(c).null_count()).item() == 0

## WEATHER DATA

This is the weather data for each game played since 1999. Each game will have the following: 

1. **game_id**: unique identifier for the specific game
2. **temp**: temerature of the game in fahrenheit
3. **wind**: wind levels measured in mph
4. **precip**: true/false value indicating if the game had any precipitation

This dataset does not have official precipitation levels. For now, precip will be a boolean value, where it will be set to true when one of the following keywords is found in the game's first play's description: rain, snow, sleet, shower. Also note that the weather description comes from a separate dataset, which will be joined in using the game_id

In [29]:
weather_data = (
    raw_nfl_data
    .select([
        "game_id",
        "temp",
        "wind"
    ])
)

In [41]:
weather_data.shape

(6991, 5)

In [30]:
raw_pbp_data = nfl.load_pbp(range(1999, 2025))

In [31]:
precip_data = (
    raw_pbp_data
    .select(["game_id", "weather"])
    .unique(subset=["game_id"])
    .with_columns(
        pl.col("weather")
        .cast(pl.Utf8)
        .str.to_lowercase()
        # look for the following precipitation keywords
        .str.contains("rain|snow|sleet|shower")
        .alias("precip")
    )
)

In [42]:
precip_data.shape

(6988, 3)

In [39]:
weather_data.shape

(6991, 5)

We see that we have 6,991 rows in our weather df, and only 6,988 rows in our precip df. Lost 3 rows
<br>
<br>
Now we can join the two dataframes on game_id to a single dataframe with temp, wind, and precip

In [32]:
weather_data = weather_data.join(precip_data, on="game_id", how="left")

In [43]:
weather_data.select(pl.col("temp").null_count())

temp
u32
1975


In [44]:
weather_data.select(pl.col("wind").null_count())

wind
u32
1975


In [40]:
weather_data.select(pl.col("precip").null_count())

precip
u32
561


We see we have quite a few nulls in our data. There is no way to accurately fill in these nulls with the data available from these datasets, so dropping them for now. 

In [48]:
weather_data = weather_data.drop_nulls()
weather_data.shape

(4612, 5)

Also, I will consider two strategies regarding temperature. First is to treat temperature as a linear variable. Second is to treat it as a categorical variable. The reasoning with the categorical variable is that the same 20 degree difference between 50 and 70 degrees is not the same as a 20 degree difference between 0 and 20, for example

In [None]:
weather_data = weather_data.with_columns(
    pl.when(pl.col("temp_final") < 0).then("below_0")
     .when(pl.col("temp_final") < 32).then("0_to_32")
     .when(pl.col("temp_final") < 50).then("32_to_50")
     .otherwise("above_50")
     .alias("temp_bin")
)

## QB GAME STATS
Read in all data relating to the quarterback stats. For each game played since 1999, get the following: 

1. **game_id**:
2. **qb_id**
3. **offense_team**:
4. **defense_team**:
5. **is_home**:
6. **pass_attempts**: 

... (more)


For the quarterback data, I 

In [58]:
# keep only passing plays (and sacks, as they are passing plays)
pass_plays = raw_pbp_data.filter(
    (pl.col("pass_attempt") == 1) | (pl.col("sack") == 1)
)

In [66]:
# group by game_id, and passer_player_id. Then calculate all stats for that passer in that game. This gives us all the per game passing stats we need. 
qb_game_stats = (
    pass_plays
    .group_by(["game_id", "passer_player_id"])
    .agg([
        pl.col("passer_player_name").drop_nulls().first().alias("qb_name"),
        pl.col("posteam").drop_nulls().first().alias("offense_team"),
        pl.col("defteam").drop_nulls().first().alias("defense_team"),

        pl.col("pass_attempt").sum().alias("pass_attempts"),
        pl.col("complete_pass").sum().alias("completions"),
        pl.col("passing_yards").sum().alias("pass_yards"),
        pl.col("interception").sum().alias("interceptions"),
        pl.col("sack").sum().alias("sacks"),
        pl.len().alias("dropbacks"),
        pl.col("epa").sum().alias("epa_sum"),
        pl.first("home_team").alias("home_team"),
    ])
    .with_columns([
        (pl.col("epa_sum") / pl.col("dropbacks")).alias("epa_per_dropback"),
        (pl.col("offense_team") == pl.col("home_team")).cast(pl.Boolean).alias("is_home"),
    ])
    .select([
        pl.col("game_id"),
        pl.col("passer_player_id").alias("qb_id"),
        "qb_name",
        "offense_team",
        "defense_team",
        "is_home",
        "pass_attempts",
        "completions",
        "pass_yards",
        "interceptions",
        "sacks",
        "dropbacks",
        "epa_sum",
        "epa_per_dropback",
    ])
)


In [68]:
# Ensures that for each qb in the game, there is only one offense team and one defense team. This makes sure that our group by in the previous code block is valid
integrity = (
    pass_plays
    .group_by(["game_id", "passer_player_id"])
    .agg([
        pl.col("posteam").n_unique().alias("n_posteams"),
        pl.col("defteam").n_unique().alias("n_defteams"),
    ])
    .filter((pl.col("n_posteams") > 1) | (pl.col("n_defteams") > 1))
)

integrity

game_id,passer_player_id,n_posteams,n_defteams
str,str,u32,u32


In [69]:
qb_game_stats.shape
    

(17235, 14)

In [70]:
qb_game_stats.drop_nulls().shape

(17233, 14)

In [71]:
qb_game_stats.filter(pl.any_horizontal(pl.all().is_null()))

game_id,qb_id,qb_name,offense_team,defense_team,is_home,pass_attempts,completions,pass_yards,interceptions,sacks,dropbacks,epa_sum,epa_per_dropback
str,str,str,str,str,bool,f64,f64,f64,f64,f64,u32,f64,f64
"""2001_11_MIA_BUF""",,,"""MIA""","""BUF""",False,1.0,1.0,42.0,0.0,0.0,1,-2.094188,-2.094188
"""2001_14_GB_TEN""",,,"""GB""","""TEN""",False,1.0,0.0,0.0,0.0,1.0,1,-2.308805,-2.308805


In [73]:
qb_game_stats.filter(pl.col('game_id') == "2001_14_GB_TEN")

game_id,qb_id,qb_name,offense_team,defense_team,is_home,pass_attempts,completions,pass_yards,interceptions,sacks,dropbacks,epa_sum,epa_per_dropback
str,str,str,str,str,bool,f64,f64,f64,f64,f64,u32,f64,f64
"""2001_14_GB_TEN""","""00-0007308""","""C.Hentrich""","""TEN""","""GB""",True,1.0,0.0,0.0,0.0,0.0,1,-0.913104,-0.913104
"""2001_14_GB_TEN""",,,"""GB""","""TEN""",False,1.0,0.0,0.0,0.0,1.0,1,-2.308805,-2.308805
"""2001_14_GB_TEN""","""00-0011024""","""S.McNair""","""TEN""","""GB""",True,41.0,25.0,283.0,0.0,4.0,41,10.569704,0.257798
"""2001_14_GB_TEN""","""00-0005106""","""B.Favre""","""GB""","""TEN""",False,39.0,20.0,199.0,1.0,1.0,39,0.526273,0.013494


From this, we see that there are two rows with null values. Looking up the 2001 week 11 and week 14 games shown above, the missing passer does not appear in the box scores. Dropping these rows. 

In [74]:
qb_game_stats.drop_nulls()

game_id,qb_id,qb_name,offense_team,defense_team,is_home,pass_attempts,completions,pass_yards,interceptions,sacks,dropbacks,epa_sum,epa_per_dropback
str,str,str,str,str,bool,f64,f64,f64,f64,f64,u32,f64,f64
"""2010_06_SEA_CHI""","""00-0024226""","""J.Cutler""","""CHI""","""SEA""",true,45.0,17.0,290.0,0.0,6.0,45,-9.322985,-0.207177
"""2000_10_MIN_GB""","""00-0003739""","""D.Culpepper""","""MIN""","""GB""",false,37.0,17.0,276.0,3.0,3.0,37,4.674014,0.126325
"""2009_13_SF_SEA""","""00-0022055""","""S.Wallace""","""SEA""","""SF""",true,1.0,1.0,7.0,0.0,0.0,1,0.179572,0.179572
"""2012_08_OAK_KC""","""00-0025409""","""B.Quinn""","""KC""","""LV""",true,5.0,2.0,1.0,1.0,1.0,5,-8.90583,-1.781166
"""2004_05_ARI_SF""","""00-0021206""","""J.McCown""","""ARI""","""SF""",false,36.0,19.0,231.0,1.0,2.0,36,5.480261,0.152229
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2023_17_TEN_HOU""","""00-0039152""","""W.Levis""","""TEN""","""HOU""",false,7.0,2.0,16.0,0.0,1.0,7,-9.646544,-1.378078
"""2000_04_STL_ATL""","""00-0002876""","""C.Chandler""","""ATL""","""LA""",true,42.0,21.0,255.0,2.0,7.0,42,-6.034358,-0.143675
"""1999_17_NYG_DAL""","""00-0003292""","""K.Collins""","""NYG""","""DAL""",false,48.0,30.0,316.0,1.0,1.0,48,10.144497,0.211344
"""2015_12_MIN_ATL""","""00-0031237""","""T.Bridgewater""","""MIN""","""ATL""",false,28.0,20.0,174.0,1.0,0.0,28,0.902853,0.032245


## DEFENSE STATS

In [75]:
# get only plays where epa is populated and there is a set team on defense
pbp_def = raw_pbp_data.filter(
    pl.col("epa").is_not_null() &
    pl.col("defteam").is_not_null()
)

In [76]:
# calculating defensive EPA per season 
defense_season_stats = (
    pbp_def
    .group_by(["season", "defteam"])
    .agg([
        pl.col("epa").sum().alias("epa_allowed_sum"),
        pl.len().alias("n_plays_faced"),
    ])
    .with_columns(
        (pl.col("epa_allowed_sum") / pl.col("n_plays_faced"))
        .alias("def_epa_allowed_season")
    )
).select([
    pl.col("season"),
    pl.col("defteam").alias("defense_team"),
    pl.col("def_epa_allowed_season"),
])


In [80]:
# Make sure we only have a single row per team per season
assert defense_season_stats.select(
    pl.struct(["season", "defense_team"]).n_unique()
).item() == defense_season_stats.height

# Sanitiy check on epa per play per season values (should be around -0.15 to 0.15 range). 
defense_season_stats.select("def_epa_allowed_season").describe()

statistic,def_epa_allowed_season
str,f64
"""count""",829.0
"""null_count""",0.0
"""mean""",-0.007968
"""std""",0.052773
"""min""",-0.181895
"""25%""",-0.043412
"""50%""",-0.006948
"""75%""",0.029598
"""max""",0.145986


## QB DATA