In [1]:
import nflreadpy as nfl
import polars as pl

In [2]:
# Seasons to request: 1999 through 2024 (the upper bound of `range` is exclusive).
season_span = range(1999, 2025)
raw_nfl_data = nfl.load_schedules(season_span)

Read in the schedule data for all NFL games. For each game played since the 1999 season, keep the following fields:

1. **game_id**: unique identifier for the specific game
2. **season**: year of the season the game belongs to. Note that, for example, all games of the 2024 season have a season value of 2024 even if they were played in 2025 (eg Eagles vs Chiefs in the Super Bowl belongs to the 2024 season, even though the game was in 2025). 
3. **week**: week number
4. **gameday**: date of game being played in YYYY-MM-DD format
5. **home_team**: abbreviation for home team
6. **away_team**: abbreviation for away team
7. **stadium**: name of stadium the game was played at
8. **roof**: status of the roof. Most stadiums are either "outdoors" or "dome," but for stadiums with retractable roofs it shows the status of the roof for the given game (e.g. "closed")
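9. **is_dome**: true/false for whether the game was played under a roof, i.e. the **roof** value contains "dome" or "closed" (a retractable roof that was shut counts as a dome)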
10. **is_soldier_field**: true/false for if game was played at Soldier Field

In [3]:
# Build the per-game table: keep the identifying/venue columns, derive two
# boolean venue flags from the text columns, then order the rows.
game_data = (
    raw_nfl_data
    .select(
        "game_id",
        "season",
        "week",
        "gameday",
        "home_team",
        "away_team",
        "stadium",
        "roof",
    )
    .with_columns(
        # True when the lower-cased roof text matches the regex "dome|closed",
        # so a closed retractable roof is flagged the same as a fixed dome.
        is_dome=(
            pl.col("roof")
            .cast(pl.Utf8)
            .str.to_lowercase()
            .str.contains("dome|closed")
        ),
        # True when the lower-cased stadium name contains "soldier"
        # (regex matching, not a literal substring search).
        is_soldier_field=(
            pl.col("stadium")
            .cast(pl.Utf8)
            .str.to_lowercase()
            .str.contains("soldier", literal=False)
        ),
    )
    # Order rows by season, then week, then game id.
    .sort("season", "week", "game_id")
)

In [10]:
# Sanity check: no column of game_data may contain nulls.
# Count nulls per column first so that a failure reports every offending
# column and its null count, instead of stopping silently at the first one.
null_counts = {
    column_name: game_data.get_column(column_name).null_count()
    for column_name in game_data.columns
}
columns_with_nulls = {
    column_name: null_total
    for column_name, null_total in null_counts.items()
    if null_total != 0
}
assert not columns_with_nulls, (
    f"game_data has unexpected nulls (column -> null count): {columns_with_nulls}"
)