In [2]:
import nfl_data_py as nfl

seasons = list(range(1999, 2023 + 1, 1))
schedule_df = nfl.import_schedules(seasons)
schedule_df.head()

Unnamed: 0,game_id,season,game_type,week,gameday,weekday,gametime,away_team,away_score,home_team,...,wind,away_qb_id,home_qb_id,away_qb_name,home_qb_name,away_coach,home_coach,referee,stadium_id,stadium
0,1999_01_MIN_ATL,1999,REG,1,1999-09-12,Sunday,,MIN,17.0,ATL,...,,00-0003761,00-0002876,Randall Cunningham,Chris Chandler,Dennis Green,Dan Reeves,Gerry Austin,ATL00,Georgia Dome
1,1999_01_KC_CHI,1999,REG,1,1999-09-12,Sunday,,KC,17.0,CHI,...,12.0,00-0006300,00-0010560,Elvis Grbac,Shane Matthews,Gunther Cunningham,Dick Jauron,Phil Luckett,CHI98,Soldier Field
2,1999_01_PIT_CLE,1999,REG,1,1999-09-12,Sunday,,PIT,43.0,CLE,...,12.0,00-0015700,00-0004230,Kordell Stewart,Ty Detmer,Bill Cowher,Chris Palmer,Bob McElwee,CLE00,Cleveland Browns Stadium
3,1999_01_OAK_GB,1999,REG,1,1999-09-12,Sunday,,OAK,24.0,GB,...,10.0,00-0005741,00-0005106,Rich Gannon,Brett Favre,Jon Gruden,Ray Rhodes,Tony Corrente,GNB00,Lambeau Field
4,1999_01_BUF_IND,1999,REG,1,1999-09-12,Sunday,,BUF,14.0,IND,...,,00-0005363,00-0010346,Doug Flutie,Peyton Manning,Wade Phillips,Jim Mora,Ron Blum,IND99,RCA Dome


In [8]:
import numpy as np

schedule_df['winner'] = np.where(schedule_df['result'] > 0, schedule_df['home_team'], schedule_df['away_team'])
schedule_df.loc[~schedule_df['game_type'].isin(['REG', 'WC', 'DIV', 'CON']), 'winner']

258     STL
517     BAL
776      NE
1043     TB
1310     NE
1577     NE
1844    PIT
2111    IND
2378    NYG
2645    PIT
2912     NO
3179     GB
3446    NYG
3713    BAL
3980    SEA
4247     NE
4514    DEN
4781     NE
5048    PHI
5315     NE
5582     KC
5851     TB
6136     LA
6420     KC
Name: winner, dtype: object

In [None]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment


class PFRStats:
    def __init__(self, base_url="https://www.pro-football-reference.com"):
        self.base_url = base_url
        self.off_columns = ['O_Rank', 'Team', 'O_Games_Played', 'O_Points_For', 'O_Total_Yards',
                            'O_Plays', 'O_Y/Play', 'O_TO', 'O_Fumbles_Lost',
                            'O_1st_D', 'O_P_Completions', 'O_P_Attempts', 'O_P_Yards', 'O_P_TD',
                            'O_P_Int', 'O_P_Y/PA', 'O_P_Passing_1st_D', 'O_R_Att', 'O_R_Yards',
                            'O_R_TD', 'O_R_Y/A', 'O_R_1st_D', 'O_Pe', 'O_Pe_Yards',
                            'O_Pe_1st_D', 'O_Scoring_Drives', 'O_TO_%', 'O_Expected_Points', 'Year']
        self.def_columns = ['D_Rank', 'Team', 'D_Games_Played', 'D_Points_Allowed', 'D_Total_Yards',
                            'D_Plays', 'D_Y/Play', 'D_TD', 'D_Fumbles_Lost', 'D_1st_D', 'D_P_Completions',
                            'D_P_Attempts', 'D_P_Yards', 'D_P_TD', 'D_P_Int', 'D_P_Y/PA', 'D_P_Passing_1st_D',
                            'D_R_Att', 'D_R_Yards', 'D_R_TD', 'D_R_Y/A', 'D_R_1st_D', 'D_Pe', 'D_Pe_Yards',
                            'D_Pe_1st_D', 'D_Scoring_Drives', 'D_TD_%', 'D_Expected_Points', 'Year']
        self.adjusted_columns = ['O_Points_For', 'O_Total_Yards',
                                 'O_Plays', 'O_TO', 'O_Fumbles_Lost', 'O_1st_D',
                                 'O_P_Completions', 'O_P_Attempts', 'O_P_Yards', 'O_P_TD', 'O_P_Int',
                                 'O_P_Passing_1st_D', 'O_R_Att', 'O_R_Yards', 'O_R_TD',
                                 'O_R_1st_D', 'O_Pe', 'O_Pe_Yards', 'O_Pe_1st_D',
                                 'O_Scoring_Drives', 'O_Expected_Points',
                                 'D_Games_Played', 'D_Points_Allowed', 'D_Total_Yards', 'D_Plays',
                                 'D_TD', 'D_Fumbles_Lost', 'D_1st_D', 'D_P_Completions',
                                 'D_P_Attempts', 'D_P_Yards', 'D_P_TD', 'D_P_Int',
                                 'D_P_Passing_1st_D', 'D_R_Att', 'D_R_Yards', 'D_R_TD',
                                 'D_R_1st_D', 'D_Pe', 'D_Pe_Yards', 'D_Pe_1st_D', 'D_Scoring_Drives',
                                 'D_Expected_Points']

    @staticmethod
    def parse_html_table(table, header=0):
        return pd.read_html(str(table), header=header)[0]

    def import_table(self, table_id, prefix, header=0):
        soup = BeautifulSoup(requests.get(self.base_url + prefix).content, 'html.parser')
        if table_id.startswith("#"):
            table = soup.select_one(table_id).find_next(text=lambda t: isinstance(t, Comment))
        else:
            table = soup.find('table', id=table_id)

        return self.parse_html_table(table, header=header)

    @staticmethod
    def set_column_type(df, col, dtype):
        return df[col].astype(dtype)

    def import_superbowl_winners(self, table_id="super_bowls", prefix="/super-bowl/"):
        df = self.import_table(table_id, prefix, header=0)
        df['Date'] = pd.to_datetime(df['Date'])
        df['Year'] = pd.DatetimeIndex(df['Date']).year
        return df

    @staticmethod
    def append_superbowl_winners(df, sb_df):
        df['Superbowl'] = 0
        sb_df = sb_df[['Year', 'Winner']]
        sb_df.Year = sb_df.Year.astype('int')
        sb_df['Year'] = sb_df['Year'] - 1
        sb_df['Superbowl'] = 1
        test_df = pd.merge(df, sb_df, left_on=['Year', 'Team'], right_on=['Year', 'Winner'], how='left')
        test_df.drop(['Superbowl_x', 'Winner'], axis=1, inplace=True)
        test_df.Superbowl_y = test_df.Superbowl_y.fillna(0)
        test_df.Superbowl_y = test_df.Superbowl_y.astype('int')
        test_df.rename(columns={'Superbowl_y': 'Superbowl'}, inplace=True)

        return test_df

    def import_all_team_stats_off_table(self, season, table_id="#all_team_stats", prefix=r"/years/{}"):
        return self.import_table(table_id, prefix.format(season), header=1)

    def import_all_team_stats_def_table(self, season, table_id="team_stats", prefix=r"/years/{}/opp.htm"):
        return self.import_table(table_id, prefix.format(season), header=1)

    def batch_import_all_team_stats_table(self, seasons, delay=5):
        off_dfs = []
        def_dfs = []
        for season in seasons:
            off_df = self.import_table(table_id="#all_team_stats", prefix=f"/years/{season}", header=1)[0:31]
            off_df['Year'] = season
            off_df.columns = self.off_columns
            off_dfs.append(off_df)
            def_df = self.import_table(table_id="team_stats", prefix=f"/years/{season}/opp.htm", header=1)[0:31]
            def_df['Year'] = season
            def_df.columns = self.def_columns
            def_dfs.append(def_df)

            time.sleep(delay)

        dfs = off_dfs + def_dfs
        data = pd.concat(dfs)
        data[self.adjusted_columns] = self.set_column_type(data, self.adjusted_columns, 'float')
        data[['O_Rank', 'D_Rank', 'O_Games_Played', 'Year']] = self.set_column_type(data, ['O_Rank', 'D_Rank',
                                                                                           'O_Games_Played', 'Year'],
                                                                                    'int')

        return data

In [18]:
pbp_data = nfl.import_pbp_data(seasons, thread_requests=True)
pbp_data.head()

Downcasting floats.


Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,35.0,1999_01_ARI_PHI,1999091200,PHI,ARI,REG,1,PHI,home,ARI,...,0.0,1.0,0.126818,,,,,,,
1,60.0,1999_01_ARI_PHI,1999091200,PHI,ARI,REG,1,PHI,home,ARI,...,0.0,1.0,-0.561568,,,,,,,
2,82.0,1999_01_ARI_PHI,1999091200,PHI,ARI,REG,1,PHI,home,ARI,...,0.0,1.0,-0.641717,,,,,,,
3,103.0,1999_01_ARI_PHI,1999091200,PHI,ARI,REG,1,PHI,home,ARI,...,0.0,1.0,-0.723302,,,,,,,
4,126.0,1999_01_ARI_PHI,1999091200,PHI,ARI,REG,1,PHI,home,ARI,...,0.0,1.0,0.212661,,,,,,,


In [20]:
teams = nfl.import_team_desc()
teams.head()

Unnamed: 0,team_abbr,team_name,team_id,team_nick,team_conf,team_division,team_color,team_color2,team_color3,team_color4,team_logo_wikipedia,team_logo_espn,team_wordmark,team_conference_logo,team_league_logo,team_logo_squared
0,ARI,Arizona Cardinals,3800,Cardinals,NFC,NFC West,#97233F,#000000,#ffb612,#a5acaf,https://upload.wikimedia.org/wikipedia/en/thum...,https://a.espncdn.com/i/teamlogos/nfl/500/ari.png,https://github.com/nflverse/nflverse-pbp/raw/m...,https://github.com/nflverse/nflverse-pbp/raw/m...,https://raw.githubusercontent.com/nflverse/nfl...,https://github.com/nflverse/nflverse-pbp/raw/m...
1,ATL,Atlanta Falcons,200,Falcons,NFC,NFC South,#A71930,#000000,#a5acaf,#a30d2d,https://upload.wikimedia.org/wikipedia/en/thum...,https://a.espncdn.com/i/teamlogos/nfl/500/atl.png,https://github.com/nflverse/nflverse-pbp/raw/m...,https://github.com/nflverse/nflverse-pbp/raw/m...,https://raw.githubusercontent.com/nflverse/nfl...,https://github.com/nflverse/nflverse-pbp/raw/m...
2,BAL,Baltimore Ravens,325,Ravens,AFC,AFC North,#241773,#9E7C0C,#9e7c0c,#c60c30,https://upload.wikimedia.org/wikipedia/en/thum...,https://a.espncdn.com/i/teamlogos/nfl/500/bal.png,https://github.com/nflverse/nflverse-pbp/raw/m...,https://github.com/nflverse/nflverse-pbp/raw/m...,https://raw.githubusercontent.com/nflverse/nfl...,https://github.com/nflverse/nflverse-pbp/raw/m...
3,BUF,Buffalo Bills,610,Bills,AFC,AFC East,#00338D,#C60C30,#0c2e82,#d50a0a,https://upload.wikimedia.org/wikipedia/en/thum...,https://a.espncdn.com/i/teamlogos/nfl/500/buf.png,https://github.com/nflverse/nflverse-pbp/raw/m...,https://github.com/nflverse/nflverse-pbp/raw/m...,https://raw.githubusercontent.com/nflverse/nfl...,https://github.com/nflverse/nflverse-pbp/raw/m...
4,CAR,Carolina Panthers,750,Panthers,NFC,NFC South,#0085CA,#000000,#bfc0bf,#0085ca,https://upload.wikimedia.org/wikipedia/en/thum...,https://a.espncdn.com/i/teamlogos/nfl/500-dark...,https://github.com/nflverse/nflverse-pbp/raw/m...,https://github.com/nflverse/nflverse-pbp/raw/m...,https://raw.githubusercontent.com/nflverse/nfl...,https://github.com/nflverse/nflverse-pbp/raw/m...


In [22]:
team_stats_2022 = teams
test_season = 2022
team_stats_2022['season'] = test_season
team_stats_2022['games_played'] = team_stats_2022.apply(lambda row: row['team_abbr'] if)
team_stats_2022['games_played']

ValueError: Can only compare identically-labeled Series objects

In [58]:
import pandas as pd

season_2022 = schedule_df.loc[(schedule_df['season'] == 2022) & (schedule_df['game_type'] == 'REG')]
home_games = season_2022.groupby('home_team')['game_id'].count()
home_games.index.name = 'team'
home_games.rename("home_games", inplace=True)

away_games = season_2022.groupby('away_team')['game_id'].count()
away_games.index.name = 'team'
away_games.rename("away_games", inplace=True)
games_played = pd.concat([home_games, away_games], axis=1)
games_played.reset_index(inplace=True)
games_played['games_played'] = games_played['home_games'] + games_played['away_games']
games_played

Unnamed: 0,team,home_games,away_games,games_played
0,ARI,9,8,17
1,ATL,9,8,17
2,BAL,8,9,17
3,BUF,8,8,16
4,CAR,9,8,17
5,CHI,9,8,17
6,CIN,7,9,16
7,CLE,8,9,17
8,DAL,9,8,17
9,DEN,8,9,17


In [59]:
home_scores = season_2022.groupby('home_team')['home_score'].sum()
away_scores = season_2022.groupby('away_team')['away_score'].sum()
points_for = pd.concat([home_scores, away_scores], axis=1)
points_for['points_for'] = points_for['home_score'] + points_for['away_score']
points_for

Unnamed: 0,home_score,away_score,points_for
ARI,176.0,164.0,340.0
ATL,224.0,141.0,365.0
BAL,153.0,197.0,350.0
BUF,254.0,201.0,455.0
CAR,199.0,148.0,347.0
CHI,176.0,150.0,326.0
CIN,201.0,217.0,418.0
CLE,180.0,181.0,361.0
DAL,270.0,197.0,467.0
DEN,144.0,143.0,287.0


In [77]:
pbp_2022 = pbp_data.loc[(pbp_data['season'] == test_season) & (pbp_data['season_type'] == 'REG')]
home_yards = pbp_2022.loc[(pbp_2022['home_team'] == pbp_2022['posteam'])].groupby('posteam')['yards_gained'].sum()
home_yards.index.name = 'team'
home_yards.rename("home_yards", inplace=True)

away_yards = pbp_2022.loc[(pbp_2022['away_team'] == pbp_2022['posteam'])].groupby('posteam')['yards_gained'].sum()
away_yards.index.name = 'team'
away_yards.rename("away_yards", inplace=True)

total_yards = pd.concat([home_yards, away_yards], axis=1)
total_yards['total_yards'] = total_yards['home_yards'] + total_yards['away_yards']
total_yards

Unnamed: 0_level_0,home_yards,away_yards,total_yards
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARI,2932.0,2577.0,5509.0
ATL,3025.0,2398.0,5423.0
BAL,2482.0,3282.0,5764.0
BUF,3185.0,3180.0,6365.0
CAR,2886.0,2322.0,5208.0
CHI,2861.0,2374.0,5235.0
CIN,2857.0,2917.0,5774.0
CLE,2893.0,3043.0,5936.0
DAL,3270.0,2764.0,6034.0
DEN,2745.0,2784.0,5529.0


In [88]:
home_pass_plays = pbp_2022.loc[(pbp_2022['home_team'] == pbp_2022['posteam']) & (pbp_2022['play_type'] == 'pass')].groupby('posteam')['play_id'].count()
home_pass_plays.index.name = 'team'
home_pass_plays.rename("home_pass_plays", inplace=True)

away_pass_plays = pbp_2022.loc[(pbp_2022['away_team'] == pbp_2022['posteam']) & (pbp_2022['play_type'] == 'pass')].groupby('posteam')['play_id'].count()
away_pass_plays.index.name = 'team'
away_pass_plays.rename("away_pass_plays", inplace=True)

total_pass_plays = pd.concat([home_pass_plays, away_pass_plays], axis=1)
total_pass_plays['total_pass_plays'] = total_pass_plays['home_pass_plays'] + total_pass_plays['away_pass_plays']
total_pass_plays

Unnamed: 0_level_0,home_pass_plays,away_pass_plays,total_pass_plays
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARI,386,327,713
ATL,225,225,450
BAL,226,296,522
BUF,283,324,607
CAR,253,238,491
CHI,238,201,439
CIN,283,375,658
CLE,274,306,580
DAL,303,280,583
DEN,305,329,634
