In [None]:
import nflreadpy as nfl
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set display options
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_width_chars(200)

print("Imports successful")

Imports successful


In [4]:
print("Loading data...")
pbp = nfl.load_pbp([2024])
player_stats = nfl.load_player_stats([2024])
team_stats = nfl.load_team_stats([2024])

print(f"Loaded {len(pbp):,} plays")
print(f"Loaded {len(player_stats):,} player-game records")
print(f"Loaded {len(team_stats):,} team-week records")

Loading data...
Loaded 49,492 plays
Loaded 18,981 player-game records
Loaded 570 team-week records


In [15]:
print("PLAY-BY-PLAY DATA STRUCTURE")
print("=" * 80)
print(f"Shape: {pbp.shape}")
print(f"\nColumns ({len(pbp.columns)}):")
for i, col in enumerate(pbp.columns, 1):
    print(f"{i:3}. {col}")

PLAY-BY-PLAY DATA STRUCTURE
Shape: (49492, 372)

Columns (372):
  1. play_id
  2. game_id
  3. old_game_id
  4. home_team
  5. away_team
  6. season_type
  7. week
  8. posteam
  9. posteam_type
 10. defteam
 11. side_of_field
 12. yardline_100
 13. game_date
 14. quarter_seconds_remaining
 15. half_seconds_remaining
 16. game_seconds_remaining
 17. game_half
 18. quarter_end
 19. drive
 20. sp
 21. qtr
 22. down
 23. goal_to_go
 24. time
 25. yrdln
 26. ydstogo
 27. ydsnet
 28. desc
 29. play_type
 30. yards_gained
 31. shotgun
 32. no_huddle
 33. qb_dropback
 34. qb_kneel
 35. qb_spike
 36. qb_scramble
 37. pass_length
 38. pass_location
 39. air_yards
 40. yards_after_catch
 41. run_location
 42. run_gap
 43. field_goal_result
 44. kick_distance
 45. extra_point_result
 46. two_point_conv_result
 47. home_timeouts_remaining
 48. away_timeouts_remaining
 49. timeout
 50. timeout_team
 51. td_team
 52. td_player_name
 53. td_player_id
 54. posteam_timeouts_remaining
 55. defteam_timeo

In [16]:
# Look at the most important columns for analysis
key_cols = ['game_id', 'play_id', 'posteam', 'defteam', 'down', 'ydstogo', 
            'yardline_100', 'play_type', 'yards_gained', 'touchdown', 'epa', 'wpa']

# Filter to columns that actually exist
existing_cols = [col for col in key_cols if col in pbp.columns]
print(f"Key columns found: {existing_cols}")
print("\nSample data:")
pbp.select(existing_cols).head(10)

Key columns found: ['game_id', 'play_id', 'posteam', 'defteam', 'down', 'ydstogo', 'yardline_100', 'play_type', 'yards_gained', 'touchdown', 'epa', 'wpa']

Sample data:


game_id,play_id,posteam,defteam,down,ydstogo,yardline_100,play_type,yards_gained,touchdown,epa,wpa
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64
"""2024_01_ARI_BUF""",1.0,,,,0.0,,,,,-0.0,-0.0
"""2024_01_ARI_BUF""",40.0,"""ARI""","""BUF""",,0.0,35.0,"""kickoff""",0.0,0.0,0.257819,0.000338
"""2024_01_ARI_BUF""",61.0,"""ARI""","""BUF""",1.0,10.0,70.0,"""run""",3.0,0.0,-0.200602,-0.00727
"""2024_01_ARI_BUF""",83.0,"""ARI""","""BUF""",2.0,7.0,67.0,"""pass""",22.0,0.0,2.028874,0.053842
"""2024_01_ARI_BUF""",108.0,"""ARI""","""BUF""",1.0,10.0,45.0,"""pass""",9.0,0.0,0.754242,0.054495
"""2024_01_ARI_BUF""",133.0,"""ARI""","""BUF""",2.0,1.0,36.0,"""run""",2.0,0.0,-0.029602,-0.004201
"""2024_01_ARI_BUF""",155.0,"""ARI""","""BUF""",1.0,10.0,34.0,"""run""",2.0,0.0,-0.247749,-0.015525
"""2024_01_ARI_BUF""",177.0,"""ARI""","""BUF""",2.0,8.0,32.0,"""run""",2.0,0.0,-0.530139,-0.019851
"""2024_01_ARI_BUF""",199.0,"""ARI""","""BUF""",3.0,6.0,30.0,"""pass""",8.0,0.0,1.6808,0.065693
"""2024_01_ARI_BUF""",224.0,"""ARI""","""BUF""",1.0,10.0,22.0,"""pass""",0.0,0.0,-0.467625,-0.015382


In [17]:
# See what data types we have
pbp.schema

Schema([('play_id', Float64),
        ('game_id', String),
        ('old_game_id', String),
        ('home_team', String),
        ('away_team', String),
        ('season_type', String),
        ('week', Int32),
        ('posteam', String),
        ('posteam_type', String),
        ('defteam', String),
        ('side_of_field', String),
        ('yardline_100', Float64),
        ('game_date', String),
        ('quarter_seconds_remaining', Float64),
        ('half_seconds_remaining', Float64),
        ('game_seconds_remaining', Float64),
        ('game_half', String),
        ('quarter_end', Float64),
        ('drive', Float64),
        ('sp', Float64),
        ('qtr', Float64),
        ('down', Float64),
        ('goal_to_go', Int32),
        ('time', String),
        ('yrdln', String),
        ('ydstogo', Float64),
        ('ydsnet', Float64),
        ('desc', String),
        ('play_type', String),
        ('yards_gained', Float64),
        ('shotgun', Float64),
        ('no_huddle',

In [18]:
print("PLAYER STATS STRUCTURE")
print("=" * 80)
print(f"Shape: {player_stats.shape}")
print(f"\nAll columns ({len(player_stats.columns)}):")
for i, col in enumerate(player_stats.columns, 1):
    print(f"{i:3}. {col}")

PLAYER STATS STRUCTURE
Shape: (18981, 114)

All columns (114):
  1. player_id
  2. player_name
  3. player_display_name
  4. position
  5. position_group
  6. headshot_url
  7. season
  8. week
  9. season_type
 10. team
 11. opponent_team
 12. completions
 13. attempts
 14. passing_yards
 15. passing_tds
 16. passing_interceptions
 17. sacks_suffered
 18. sack_yards_lost
 19. sack_fumbles
 20. sack_fumbles_lost
 21. passing_air_yards
 22. passing_yards_after_catch
 23. passing_first_downs
 24. passing_epa
 25. passing_cpoe
 26. passing_2pt_conversions
 27. pacr
 28. carries
 29. rushing_yards
 30. rushing_tds
 31. rushing_fumbles
 32. rushing_fumbles_lost
 33. rushing_first_downs
 34. rushing_epa
 35. rushing_2pt_conversions
 36. receptions
 37. targets
 38. receiving_yards
 39. receiving_tds
 40. receiving_fumbles
 41. receiving_fumbles_lost
 42. receiving_air_yards
 43. receiving_yards_after_catch
 44. receiving_first_downs
 45. receiving_epa
 46. receiving_2pt_conversions
 47. racr

In [19]:
# Fantasy football key metrics
fantasy_cols = ['player_name', 'position', 'team', 'week', 
                'passing_yards', 'passing_tds', 
                'rushing_yards', 'rushing_tds',
                'receiving_yards', 'receiving_tds', 'receptions',
                'fantasy_points', 'fantasy_points_ppr']

existing_fantasy = [col for col in fantasy_cols if col in player_stats.columns]
print(f"Fantasy columns found: {existing_fantasy}")
print("\nTop 10 fantasy scorers (Week 1):")
(player_stats
 .filter(pl.col('week') == 1) if 'week' in player_stats.columns else player_stats
 .select(existing_fantasy)
 .sort('fantasy_points_ppr', descending=True)
 .head(10)
)

Fantasy columns found: ['player_name', 'position', 'team', 'week', 'passing_yards', 'passing_tds', 'rushing_yards', 'rushing_tds', 'receiving_yards', 'receiving_tds', 'receptions', 'fantasy_points', 'fantasy_points_ppr']

Top 10 fantasy scorers (Week 1):


player_id,player_name,player_display_name,position,position_group,headshot_url,season,…,gwfg_made,gwfg_att,gwfg_missed,gwfg_blocked,gwfg_distance,fantasy_points,fantasy_points_ppr
str,str,str,str,str,str,i32,…,i32,i32,i32,i32,i32,f64,f64
"""00-0023459""","""A.Rodgers""","""Aaron Rodgers""","""QB""","""QB""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,8.58,8.58
"""00-0023853""","""M.Prater""","""Matt Prater""","""K""","""SPEC""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0025565""","""N.Folk""","""Nick Folk""","""K""","""SPEC""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0026190""","""C.Campbell""","""Calais Campbell""","""DE""","""DL""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0026498""","""M.Stafford""","""Matthew Stafford""","""QB""","""QB""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,14.68,14.68
"""00-0026858""","""G.Gano""","""Graham Gano""","""K""","""SPEC""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0027114""","""T.Morstead""","""Thomas Morstead""","""P""","""SPEC""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0027865""","""B.Graham""","""Brandon Graham""","""DE""","""DL""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0027940""","""V.Miller""","""Von Miller""","""OLB""","""LB""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0
"""00-0027962""","""C.Jordan""","""Cameron Jordan""","""DE""","""DL""","""https://static.www.nfl.com/ima…",2024,…,0,0,0,0,0,0.0,0.0


In [9]:
null_summary = pbp.null_count().transpose(include_header=True, header_name='column', column_names=['null_count'])
null_summary.sort('null_count', descending=True).head(15)

column,null_count
str,u32
"""lateral_sack_player_id""",49492
"""lateral_sack_player_name""",49492
"""lateral_punt_returner_player_i…",49492
"""lateral_punt_returner_player_n…",49492
"""tackle_for_loss_2_player_id""",49492
"""tackle_for_loss_2_player_name""",49492
"""tackle_with_assist_2_player_id""",49492
"""tackle_with_assist_2_player_na…",49492
"""tackle_with_assist_2_team""",49492
"""st_play_type""",49492


In [13]:
# What kinds of plays do we have?
if 'play_type' in pbp.columns:
    print("PLAY TYPE DISTRIBUTION")
    print("=" * 80)
    play_dist = (pbp
                 .group_by('play_type')
                 .agg(pl.len().alias('count'))
                 .sort('count', descending=True))
    print(play_dist)

PLAY TYPE DISTRIBUTION
shape: (10, 2)
┌─────────────┬───────┐
│ play_type   ┆ count │
│ ---         ┆ ---   │
│ str         ┆ u32   │
╞═════════════╪═══════╡
│ pass        ┆ 20006 │
│ run         ┆ 15044 │
│ no_play     ┆ 4934  │
│ kickoff     ┆ 2949  │
│ punt        ┆ 2119  │
│ null        ┆ 1460  │
│ extra_point ┆ 1302  │
│ field_goal  ┆ 1166  │
│ qb_kneel    ┆ 437   │
│ qb_spike    ┆ 75    │
└─────────────┴───────┘


In [14]:
# Check if we have situational data for coaching analysis
situational_cols = ['down', 'ydstogo', 'yardline_100', 'qtr', 'score_differential', 
                    'shotgun', 'no_huddle', 'posteam_timeouts_remaining']

print("SITUATIONAL DATA AVAILABILITY")
print("=" * 80)
for col in situational_cols:
    if col in pbp.columns:
        null_pct = (pbp[col].null_count() / len(pbp)) * 100
        print(f"✓ {col:30} - {null_pct:.1f}% null")
    else:
        print(f"✗ {col:30} - NOT FOUND")

SITUATIONAL DATA AVAILABILITY
✓ down                           - 16.2% null
✓ ydstogo                        - 0.0% null
✓ yardline_100                   - 7.2% null
✓ qtr                            - 0.0% null
✓ score_differential             - 5.5% null
✓ shotgun                        - 0.0% null
✓ no_huddle                      - 0.0% null
✓ posteam_timeouts_remaining     - 5.5% null


In [21]:
participation_a = nfl.load_injuries(seasons=2025)

ConnectionError: Failed to download https://github.com/nflverse/nflverse-data/releases/download/injuries/injuries_2025.parquet: 404 Client Error: Not Found for url: https://github.com/nflverse/nflverse-data/releases/download/injuries/injuries_2025.parquet