In [30]:
import pandas as pd
# Print the installed pandas version so it is captured in this cell's output.
print(pd.__version__)

1.5.3


In [31]:
# https://chatgpt.com/share/68000cf1-1714-800f-a0b5-4afc3d924474
# ChatGPT was used to help write the code that goes through the play-by-play
# (pbp) data and extracts descriptive metrics about each game.

import pandas as pd
import nfl_data_py as nfl

# 1. Load the 2024 schedule, which carries the game IDs and the teams involved.
schedule = nfl.import_schedules([2024])
print(schedule.columns)

# Narrow the schedule to the identifying and score columns for display.
game_summary_cols = ['game_id', 'home_team', 'away_team', 'home_score', 'away_score', 'week']
games = schedule[game_summary_cols]
display(games)

# Write the full schedule (every column, plus the index) to CSV.
schedule.to_csv("../Team Data/team_schedule.csv")

Index(['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday',
       'gametime', 'away_team', 'away_score', 'home_team', 'home_score',
       'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis',
       'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest',
       'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds',
       'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game',
       'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id',
       'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee',
       'stadium_id', 'stadium'],
      dtype='object')


Unnamed: 0,game_id,home_team,away_team,home_score,away_score,week
6706,2024_01_BAL_KC,KC,BAL,27,20,1
6707,2024_01_GB_PHI,PHI,GB,34,29,1
6708,2024_01_PIT_ATL,ATL,PIT,10,18,1
6709,2024_01_ARI_BUF,BUF,ARI,34,28,1
6710,2024_01_TEN_CHI,CHI,TEN,24,17,1
...,...,...,...,...,...,...
6986,2024_20_LA_PHI,PHI,LA,28,22,20
6987,2024_20_BAL_BUF,BUF,BAL,27,25,20
6988,2024_21_WAS_PHI,PHI,WAS,55,23,21
6989,2024_21_BUF_KC,KC,BUF,32,29,21


In [32]:
# 1. List columns available in the play-by-play data (kept for reference only;
#    nothing in this cell reads `pbp_cols`).
pbp_cols = nfl.see_pbp_cols()

# 2. Columns to request from the play-by-play data (adjust based on what you need)
selected_cols = [
    'game_id', 'week','posteam', 'yards_gained', 'complete_pass', 'touchdown', 'play_type', 'epa',
    'rush_attempt', 'pass_attempt', 'passing_yards', 'rushing_yards', 'pass_touchdown', 'rush_touchdown',
    'receiving_yards', 'penalty_yards', 'fumble_forced', 'fumble_lost', 
    'total_home_score', 'total_away_score', 'score_differential', 'field_goal_result',
    'wp', 'sack', 'home_wp', 'away_wp', 'quarter_seconds_remaining', 'game_seconds_remaining',
    'incomplete_pass', 'interception', 'safety', 'punt_blocked', 'first_down_rush',
    'first_down_pass', 'third_down_converted', 'fourth_down_converted', 'field_goal_attempt'
]

# 3. Import play-by-play data for 2024 with the selected columns
pbp = nfl.import_pbp_data([2024], columns=selected_cols)

# 4. Keep only run and pass plays that have a possession team.
#    `.copy()` detaches the filtered rows from the imported frame so that adding
#    the `success` column below does not trigger pandas' SettingWithCopyWarning.
pbp = pbp[pbp['play_type'].isin(['run', 'pass']) & pbp['posteam'].notnull()].copy()

# A play is flagged as a success when its EPA is positive.
# NOTE(review): a missing EPA compares as False, so such plays count as
# unsuccessful in `success_rate` while `avg_epa` skips them — confirm intended.
pbp['success'] = pbp['epa'] > 0

# Helper columns for the run-only / pass-only aggregates. Masking up front lets
# the groupby use built-in reducers instead of per-group lambdas that index back
# into the global `pbp` frame (slow, and only correct with unique index labels).
# They live on a separate frame so `pbp` itself gains no extra columns.
pbp_for_agg = pbp.assign(
    _run_yards=lambda d: d['yards_gained'].where(d['play_type'] == 'run'),
    _pass_yards=lambda d: d['yards_gained'].where(d['play_type'] == 'pass'),
    _run_for_loss=lambda d: (d['yards_gained'] < 0) & (d['play_type'] == 'run'),
)

# 5. Aggregate stats per team per game: one output row per (game_id, posteam, week).
# NOTE(review): `touchdown` may also flag scores by the defending team on the
# play — verify against the field definitions before treating `total_tds` as
# offensive touchdowns only.
team_game_stats = (
    pbp_for_agg.groupby(['game_id', 'posteam', 'week'], as_index=False)
    .agg(
        total_yards=('yards_gained', 'sum'),
        total_tds=('touchdown', 'sum'),
        passing_tds=('pass_touchdown', 'sum'),
        rushing_tds=('rush_touchdown', 'sum'),
        num_plays=('play_type', 'count'),
        avg_epa=('epa', 'mean'),
        success_rate=('success', 'mean'),
        total_rush_yards=('rushing_yards', 'sum'),
        rush_attempts=('rush_attempt', 'sum'),
        # Max yards gained over run plays only (NaN when the group has no runs).
        longest_rush=('_run_yards', 'max'),
        # Number of run plays with negative yards gained.
        rushes_for_loss=('_run_for_loss', 'sum'),
        total_pass_yards=('passing_yards', 'sum'),
        pass_attempts=('pass_attempt', 'sum'),
        complete_passes=('complete_pass', 'sum'),
        incomplete_passes=('incomplete_pass', 'sum'),
        sacks=('sack', 'sum'),
        # Max yards gained over pass plays only (NaN when the group has no passes).
        longest_pass=('_pass_yards', 'max'),
        total_penalty_yards=('penalty_yards', 'sum'),
        fumbles_lost=('fumble_lost', 'sum'),
        interceptions=('interception', 'sum'),
        third_down_converted=('third_down_converted', 'sum'),
        fourth_down_converted=('fourth_down_converted', 'sum'),
        wp=('wp', 'mean'),
        home_wp=('home_wp', 'mean'),
        away_wp=('away_wp', 'mean')
    )
    .rename(columns={'posteam': 'team'})
)

# 6. Display the aggregated stats. A bare `.head()` that is not the last
#    expression of the cell renders nothing, so call display() explicitly.
display(team_game_stats.head())

# Write the per-team, per-game table (index included) to CSV.
team_game_stats.to_csv("../Team Data/team_game.csv")

2024 done.
Downcasting floats.


In [33]:
# Order the rows by team, then by week within each team, and show the result.
team_week_sort_keys = ['team', 'week']
team_game_stats = team_game_stats.sort_values(by=team_week_sort_keys)
display(team_game_stats)



Unnamed: 0,game_id,team,week,total_yards,total_tds,passing_tds,rushing_tds,num_plays,avg_epa,success_rate,...,sacks,longest_pass,total_penalty_yards,fumbles_lost,interceptions,third_down_converted,fourth_down_converted,wp,home_wp,away_wp
0,2024_01_ARI_BUF,ARI,1,273.0,2.0,1.0,1.0,60,0.063335,0.383333,...,4.0,24.0,10.0,1.0,0.0,7.0,0.0,0.496706,0.503294,0.496706
46,2024_02_LA_ARI,ARI,2,489.0,5.0,3.0,1.0,62,0.346830,0.548387,...,1.0,60.0,0.0,1.0,0.0,7.0,0.0,0.936734,0.936734,0.063266
72,2024_03_DET_ARI,ARI,3,277.0,1.0,1.0,0.0,53,-0.238405,0.415094,...,1.0,28.0,15.0,0.0,1.0,1.0,0.0,0.236789,0.236789,0.763211
126,2024_04_WAS_ARI,ARI,4,296.0,2.0,1.0,1.0,58,-0.086211,0.465517,...,4.0,22.0,10.0,1.0,0.0,4.0,2.0,0.248035,0.248035,0.751965
128,2024_05_ARI_SF,ARI,5,363.0,2.0,1.0,1.0,56,0.123573,0.446429,...,1.0,34.0,1.0,0.0,1.0,3.0,1.0,0.344559,0.655441,0.344559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,2024_17_ATL_WAS,WAS,17,413.0,4.0,3.0,1.0,76,0.221503,0.526316,...,5.0,37.0,5.0,0.0,1.0,6.0,3.0,0.576894,0.576894,0.423106
543,2024_18_WAS_DAL,WAS,18,270.0,3.0,2.0,1.0,54,0.087090,0.481481,...,6.0,29.0,0.0,0.0,0.0,6.0,1.0,0.352016,0.647984,0.352016
555,2024_19_WAS_TB,WAS,19,355.0,2.0,2.0,0.0,67,0.113291,0.432836,...,1.0,35.0,8.0,0.0,0.0,8.0,3.0,0.525461,0.474539,0.525461
563,2024_20_WAS_DET,WAS,20,483.0,5.0,2.0,3.0,71,0.287723,0.507042,...,0.0,58.0,15.0,0.0,0.0,4.0,3.0,0.651799,0.348201,0.651799
