In [1]:
import nflreadpy as nfl
import polars as pl

In [2]:
raw_nfl_data = nfl.load_schedules(range(1999, 2025))

## GAME DATA

In [3]:
game_data = (
    raw_nfl_data
    .select([
        "game_id",
        "season",
        "week",
        "gameday",
        "home_team",
        "away_team",
        "stadium",
        "roof",
    ])
    .with_columns([
        pl.col("roof")
        .cast(pl.Utf8)
        .str.to_lowercase()
        .str.contains("dome|closed")
        .alias("is_dome"),

        pl.col("stadium")
        .cast(pl.Utf8)
        .str.to_lowercase()
        .str.contains("soldier")
        .alias("is_soldier_field"),
    ])
    .drop("roof")
    .sort(["season", "week", "game_id"])
)


In [4]:
# Checking for nulls
for c in game_data.columns:
    assert game_data.select(pl.col(c).null_count()).item() == 0

In [5]:
# Checking season data
assert (
    game_data.filter(~pl.col("season").is_between(1999, 2024)).height == 0
), "Found season values outside 1999–2024"

In [6]:
# Checking week data
assert (
    game_data.filter(~pl.col("week").is_between(1, 22)).height == 0
), "Found week values outside 1-22"

In [7]:
# Checking home_team data - should see 35 results because 3 teams have moved since 1999. 
assert (
    game_data.group_by('home_team').len().sort("len", descending=True).height == 35
), "Wrong number of home teams"

In [8]:
# Checking away_team data - should see 35 results because 3 teams have moved since 1999. 
assert (
    game_data.group_by('away_team').len().sort("len", descending=True).height == 35
), "Wrong number of away teams"

In [9]:
# Checking roof statuses. Want to make sure every row is "outdoors," "dome," "closed," or "open"
allowed_roof_values = {"outdoors", "dome", "closed", "open"}

invalid_roofs = (
    raw_nfl_data
    .group_by("roof")
    .len()
    .filter(~pl.col("roof").is_in(list(allowed_roof_values)))
)

assert invalid_roofs.height == 0, (
    f"Found unexpected roof values: "
    f"{invalid_roofs.select('roof').to_series().to_list()}"
)

In [10]:
# Checking to make sure number of games at soldier field is around 1/32. 
game_data.group_by('is_soldier_field').len()

is_soldier_field,len
bool,u32
False,6783
True,208


## WEATHER DATA

In [27]:
weather_data = (
    raw_nfl_data
    .select([
        "game_id",
        "temp",
        "wind"
    ])
)

In [28]:
weather_data.shape

(6991, 3)

In [29]:
raw_pbp_data = nfl.load_pbp(range(1999, 2025))

In [30]:
# check weather words to find all precipitation-related words
weather_words = (
    raw_pbp_data
    .select(pl.col("weather"))
    .drop_nulls()
    .with_columns(
        pl.col("weather")
        .cast(pl.Utf8)
        .str.to_lowercase()
        .str.replace_all(r"[^\w\s]", "")
        .str.split(" ")
        .alias("word")
    )
    .explode("word")
    .filter(pl.col("word") != "")
)

word_counts = (
    weather_words
    .group_by("word")
    .len()
    .sort("len", descending=True)
)

In [31]:
# View scrollable output
from IPython.display import display, HTML

df_pd = word_counts.to_pandas()

html = df_pd.to_html(index=False)
display(HTML(f"<div style='height:600px; overflow:auto; border:1px solid #ddd'>{html}</div>"))

word,len
mph,1161737
wind,1135986
temp,1131870
f,1127807
humidity,992795
cloudy,396586
sunny,288296
partly,138629
clear,105749
5,98076


In [32]:
# From the above output, have these precipication incidators
precip_pattern = (
    "rain|snow|sleet|drizz|shower|flur|flake|sprinkl|squall|storm"
)

precip_data = (
    raw_pbp_data
    .select(["game_id", "weather"])
    .unique(subset=["game_id"])
    .with_columns(
        pl.col("weather")
        .cast(pl.Utf8)
        .str.to_lowercase()
        .str.contains(precip_pattern)
        .alias("precip")
    )
)

In [33]:
weather_data.shape

(6991, 3)

In [34]:
# Three rows with missing weather data
precip_data.shape

(6988, 3)

In [35]:
# Join weather and precip data
weather_data = weather_data.join(precip_data, on="game_id", how="left")

In [36]:
# Quite a few games with no weather data. Dropping for now
print(weather_data.shape)
weather_data = weather_data.drop_nulls()
weather_data.shape

(6991, 5)


(4612, 5)

In [44]:
# Adding temperature bin columns
weather_data = weather_data.with_columns(
    pl.when(pl.col("temp") < 0).then(pl.lit("below_0"))
     .when(pl.col("temp") < 32).then(pl.lit("0_to_32"))
     .when(pl.col("temp") < 50).then(pl.lit("32_to_50"))
     .otherwise(pl.lit("above_50"))
     .alias("temp_bin")
)


In [45]:
weather_data.columns

['game_id', 'temp', 'wind', 'weather', 'precip', 'temp_bin']

In [54]:
# Checking temperature values
assert (
    weather_data
    .filter(~pl.col("temp").is_between(-10, 105))
    .height
    == 0
), "Found temperature values outside expected range -10 to 105 F"


In [62]:
# Checking wind values
assert (
    weather_data
    .filter(~pl.col("wind").is_between(0, 60))
    .height
    == 0
), "Found wind values outside of the range of 0 to 40 mph"

AssertionError: Found wind values outside of the range of 0 to 40 mph

In [63]:
# Investigating outlying high winds 
weather_data.filter(pl.col("wind").is_between(60, 80))

game_id,temp,wind,weather,precip,temp_bin
str,i32,i32,str,bool,str
"""2008_02_TEN_CIN""",88,70,"""Partly Cloudy and Windy Temp: …",False,"""above_50"""
"""2016_13_NYG_PIT""",43,71,"""Cloudy Temp: 43° F, Humidity: …",False,"""32_to_50"""


In [66]:
# Checking to see how many games had precipitation. This data matches very closely with global prepitation percentage (~11%)
weather_data.group_by('precip').len()

precip,len
bool,u32
True,518
False,4094


In [70]:
# Quick check to make sure temp bin sizes make sense
weather_data.group_by('temp_bin').len().sort("len", descending=True)

temp_bin,len
str,u32
"""above_50""",3176
"""32_to_50""",1134
"""0_to_32""",299
"""below_0""",3


In [71]:
# After checking both datasets, merge into one
games_full_data = (
    game_data
    .join(
        weather_data,
        on="game_id",
        how="inner"
    )
)
games_full_data.shape

(4612, 14)

## QB GAME STATS
Read in all data relating to the quarterback stats. For each game played since 1999, get the following: 

1. **game_id**:
2. **qb_id**
3. **offense_team**:
4. **defense_team**:
5. **is_home**:
6. **pass_attempts**: 

... (more)


For the quarterback data, I 

In [58]:
# keep only passing plays (and sacks, as they are passing plays)
pass_plays = raw_pbp_data.filter(
    (pl.col("pass_attempt") == 1) | (pl.col("sack") == 1)
)

In [66]:
# group by game_id, and passer_player_id. Then calculate all stats for that passer in that game. This gives us all the per game passing stats we need. 
qb_game_stats = (
    pass_plays
    .group_by(["game_id", "passer_player_id"])
    .agg([
        pl.col("passer_player_name").drop_nulls().first().alias("qb_name"),
        pl.col("posteam").drop_nulls().first().alias("offense_team"),
        pl.col("defteam").drop_nulls().first().alias("defense_team"),

        pl.col("pass_attempt").sum().alias("pass_attempts"),
        pl.col("complete_pass").sum().alias("completions"),
        pl.col("passing_yards").sum().alias("pass_yards"),
        pl.col("interception").sum().alias("interceptions"),
        pl.col("sack").sum().alias("sacks"),
        pl.len().alias("dropbacks"),
        pl.col("epa").sum().alias("epa_sum"),
        pl.first("home_team").alias("home_team"),
    ])
    .with_columns([
        (pl.col("epa_sum") / pl.col("dropbacks")).alias("epa_per_dropback"),
        (pl.col("offense_team") == pl.col("home_team")).cast(pl.Boolean).alias("is_home"),
    ])
    .select([
        pl.col("game_id"),
        pl.col("passer_player_id").alias("qb_id"),
        "qb_name",
        "offense_team",
        "defense_team",
        "is_home",
        "pass_attempts",
        "completions",
        "pass_yards",
        "interceptions",
        "sacks",
        "dropbacks",
        "epa_sum",
        "epa_per_dropback",
    ])
)


In [68]:
# Ensures that for each qb in the game, there is only one offense team and one defense team. This makes sure that our group by in the previous code block is valid
integrity = (
    pass_plays
    .group_by(["game_id", "passer_player_id"])
    .agg([
        pl.col("posteam").n_unique().alias("n_posteams"),
        pl.col("defteam").n_unique().alias("n_defteams"),
    ])
    .filter((pl.col("n_posteams") > 1) | (pl.col("n_defteams") > 1))
)

integrity

game_id,passer_player_id,n_posteams,n_defteams
str,str,u32,u32


In [69]:
qb_game_stats.shape
    

(17235, 14)

In [70]:
qb_game_stats.drop_nulls().shape

(17233, 14)

In [71]:
qb_game_stats.filter(pl.any_horizontal(pl.all().is_null()))

game_id,qb_id,qb_name,offense_team,defense_team,is_home,pass_attempts,completions,pass_yards,interceptions,sacks,dropbacks,epa_sum,epa_per_dropback
str,str,str,str,str,bool,f64,f64,f64,f64,f64,u32,f64,f64
"""2001_11_MIA_BUF""",,,"""MIA""","""BUF""",False,1.0,1.0,42.0,0.0,0.0,1,-2.094188,-2.094188
"""2001_14_GB_TEN""",,,"""GB""","""TEN""",False,1.0,0.0,0.0,0.0,1.0,1,-2.308805,-2.308805


In [73]:
qb_game_stats.filter(pl.col('game_id') == "2001_14_GB_TEN")

game_id,qb_id,qb_name,offense_team,defense_team,is_home,pass_attempts,completions,pass_yards,interceptions,sacks,dropbacks,epa_sum,epa_per_dropback
str,str,str,str,str,bool,f64,f64,f64,f64,f64,u32,f64,f64
"""2001_14_GB_TEN""","""00-0007308""","""C.Hentrich""","""TEN""","""GB""",True,1.0,0.0,0.0,0.0,0.0,1,-0.913104,-0.913104
"""2001_14_GB_TEN""",,,"""GB""","""TEN""",False,1.0,0.0,0.0,0.0,1.0,1,-2.308805,-2.308805
"""2001_14_GB_TEN""","""00-0011024""","""S.McNair""","""TEN""","""GB""",True,41.0,25.0,283.0,0.0,4.0,41,10.569704,0.257798
"""2001_14_GB_TEN""","""00-0005106""","""B.Favre""","""GB""","""TEN""",False,39.0,20.0,199.0,1.0,1.0,39,0.526273,0.013494


From this, we see that there are two rows with null values. Looking up the 2001 week 11 and week 14 games shown above, the missing passer does not appear in the box scores. Dropping these rows. 

In [74]:
qb_game_stats.drop_nulls()

game_id,qb_id,qb_name,offense_team,defense_team,is_home,pass_attempts,completions,pass_yards,interceptions,sacks,dropbacks,epa_sum,epa_per_dropback
str,str,str,str,str,bool,f64,f64,f64,f64,f64,u32,f64,f64
"""2010_06_SEA_CHI""","""00-0024226""","""J.Cutler""","""CHI""","""SEA""",true,45.0,17.0,290.0,0.0,6.0,45,-9.322985,-0.207177
"""2000_10_MIN_GB""","""00-0003739""","""D.Culpepper""","""MIN""","""GB""",false,37.0,17.0,276.0,3.0,3.0,37,4.674014,0.126325
"""2009_13_SF_SEA""","""00-0022055""","""S.Wallace""","""SEA""","""SF""",true,1.0,1.0,7.0,0.0,0.0,1,0.179572,0.179572
"""2012_08_OAK_KC""","""00-0025409""","""B.Quinn""","""KC""","""LV""",true,5.0,2.0,1.0,1.0,1.0,5,-8.90583,-1.781166
"""2004_05_ARI_SF""","""00-0021206""","""J.McCown""","""ARI""","""SF""",false,36.0,19.0,231.0,1.0,2.0,36,5.480261,0.152229
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2023_17_TEN_HOU""","""00-0039152""","""W.Levis""","""TEN""","""HOU""",false,7.0,2.0,16.0,0.0,1.0,7,-9.646544,-1.378078
"""2000_04_STL_ATL""","""00-0002876""","""C.Chandler""","""ATL""","""LA""",true,42.0,21.0,255.0,2.0,7.0,42,-6.034358,-0.143675
"""1999_17_NYG_DAL""","""00-0003292""","""K.Collins""","""NYG""","""DAL""",false,48.0,30.0,316.0,1.0,1.0,48,10.144497,0.211344
"""2015_12_MIN_ATL""","""00-0031237""","""T.Bridgewater""","""MIN""","""ATL""",false,28.0,20.0,174.0,1.0,0.0,28,0.902853,0.032245


## DEFENSE STATS

In [75]:
# get only plays where epa is populated and there is a set team on defense
pbp_def = raw_pbp_data.filter(
    pl.col("epa").is_not_null() &
    pl.col("defteam").is_not_null()
)

In [76]:
# calculating defensive EPA per season 
defense_season_stats = (
    pbp_def
    .group_by(["season", "defteam"])
    .agg([
        pl.col("epa").sum().alias("epa_allowed_sum"),
        pl.len().alias("n_plays_faced"),
    ])
    .with_columns(
        (pl.col("epa_allowed_sum") / pl.col("n_plays_faced"))
        .alias("def_epa_allowed_season")
    )
).select([
    pl.col("season"),
    pl.col("defteam").alias("defense_team"),
    pl.col("def_epa_allowed_season"),
])
