In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# create empty dataframe to store data.
nfl_data = pd.DataFrame(
    columns=['Week', 'Day', 'Date', 'Time', 'Home_team', 'Away_team', 'Home_team_pts', 'Away_team_pts'])

nfl_data

Unnamed: 0,Week,Day,Date,Time,Home_team,Away_team,Home_team_pts,Away_team_pts


In [3]:
df = pd.DataFrame()
seasons = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']

for season in seasons:

    url = 'https://www.pro-football-reference.com/years/{}/games.htm'.format(season)

    dfs = pd.read_html(url, flavor='lxml', index_col=False)

    # access the first (and only) dataframe
    df = dfs[0]
    df.columns = ['Week', 'Day', 'Date', 'Time', 'Winner/Tie', 'H/A', 'Loser/Tie', 'Boxscore', 'Pts_winner',
                  'Pts_loser', 'YdsW', 'TOW', 'YdsL', 'TOL']

    # drop rows where the 'Week' column contains 'Week'
    # convert the column to string first.
    df['Week'] = df['Week'].astype(str)
    df = df[~df['Week'].str.contains('Week')]

    # drop the nan row
    df = df.dropna(subset=['Week'])

    # set nulls to home
    df['H/A'] = df['H/A'].fillna('H')
    nfl_data = pd.concat([nfl_data, df], ignore_index=True)

In [4]:
nfl_data.head()

Unnamed: 0,Week,Day,Date,Time,Home_team,Away_team,Home_team_pts,Away_team_pts,Winner/Tie,H/A,Loser/Tie,Boxscore,Pts_winner,Pts_loser,YdsW,TOW,YdsL,TOL
0,1,Thu,2014-09-04,8:42PM,,,,,Seattle Seahawks,H,Green Bay Packers,boxscore,36,16,398,1,255,1
1,1,Sun,2014-09-07,1:00PM,,,,,Buffalo Bills,@,Chicago Bears,boxscore,23,20,360,1,427,3
2,1,Sun,2014-09-07,1:00PM,,,,,Minnesota Vikings,@,St. Louis Rams,boxscore,34,6,355,0,318,2
3,1,Sun,2014-09-07,1:02PM,,,,,Cincinnati Bengals,@,Baltimore Ravens,boxscore,23,16,380,0,423,2
4,1,Sun,2014-09-07,1:02PM,,,,,Pittsburgh Steelers,H,Cleveland Browns,boxscore,30,27,503,1,389,0


In [5]:
nfl_data.tail()

Unnamed: 0,Week,Day,Date,Time,Home_team,Away_team,Home_team_pts,Away_team_pts,Winner/Tie,H/A,Loser/Tie,Boxscore,Pts_winner,Pts_loser,YdsW,TOW,YdsL,TOL
2444,Division,Sun,2023-01-22,3:00PM,,,,,Cincinnati Bengals,@,Buffalo Bills,boxscore,27,10,412,0,325,1
2445,Division,Sun,2023-01-22,6:30PM,,,,,San Francisco 49ers,H,Dallas Cowboys,boxscore,19,12,312,1,282,2
2446,ConfChamp,Sun,2023-01-29,3:00PM,,,,,Philadelphia Eagles,H,San Francisco 49ers,boxscore,31,7,269,0,164,3
2447,ConfChamp,Sun,2023-01-29,6:30PM,,,,,Kansas City Chiefs,H,Cincinnati Bengals,boxscore,23,20,357,1,309,2
2448,SuperBowl,Sun,2023-02-12,6:30PM,,,,,Kansas City Chiefs,N,Philadelphia Eagles,boxscore,38,35,340,0,417,1


In [6]:
# define a function to determine the home team based on the values in the 'H/A' and 'Loser/Tie' columns
def get_home_team(row):
    if row['H/A'] == 'H':
        return row['Winner/Tie']
    elif row['H/A'] == '@':
        return row['Loser/Tie']
    elif row['H/A'] == 'N':
        return row['Winner/Tie']

# define a function to determine the away team based on the values in the 'H/A' and 'Loser/Tie' columns
def get_away_team(row):
    if row['H/A'] == '@':
        return row['Winner/Tie']
    elif row['H/A'] == 'H':
        return row['Loser/Tie']
    elif row['H/A'] == 'N':
        return row['Loser/Tie']

# define a function to determine the home team points based on the values in the 'H/A' and 'Loser/Tie' columns
def get_home_team_pts(row):
    if row['H/A'] == 'H':
        return row['Pts_winner']
    elif row['H/A'] == '@':
        return row['Pts_loser']
    elif row['H/A'] == 'N':
        return row['Pts_winner']

# define a function to determine the away team points based on the values in the 'H/A' and 'Loser/Tie' columns
def get_away_team_pts(row):
    if row['H/A'] == '@':
        return row['Pts_winner']
    elif row['H/A'] == 'H':
        return row['Pts_loser']
    elif row['H/A'] == 'N':
        return row['Pts_loser']

In [7]:
# apply the functions

nfl_data['Home_team'] = nfl_data.apply(get_home_team, axis=1)
nfl_data['Away_team'] = nfl_data.apply(get_away_team, axis=1)
nfl_data['Home_team_pts'] = nfl_data.apply(get_home_team_pts, axis=1)
nfl_data['Away_team_pts'] = nfl_data.apply(get_away_team_pts, axis=1)

nfl_data = nfl_data[['Week', 'Day', 'Date', 'Time', 'Home_team', 'Away_team', 'Home_team_pts', 'Away_team_pts']]

#nfl_data = pd.concat([nfl_data, df], ignore_index=True)

#continue


nfl_data = nfl_data.dropna()

In [8]:
nfl_data.head()

Unnamed: 0,Week,Day,Date,Time,Home_team,Away_team,Home_team_pts,Away_team_pts
0,1,Thu,2014-09-04,8:42PM,Seattle Seahawks,Green Bay Packers,36,16
1,1,Sun,2014-09-07,1:00PM,Chicago Bears,Buffalo Bills,20,23
2,1,Sun,2014-09-07,1:00PM,St. Louis Rams,Minnesota Vikings,6,34
3,1,Sun,2014-09-07,1:02PM,Baltimore Ravens,Cincinnati Bengals,16,23
4,1,Sun,2014-09-07,1:02PM,Pittsburgh Steelers,Cleveland Browns,30,27


In [9]:
nfl_data.tail()

Unnamed: 0,Week,Day,Date,Time,Home_team,Away_team,Home_team_pts,Away_team_pts
2444,Division,Sun,2023-01-22,3:00PM,Buffalo Bills,Cincinnati Bengals,10,27
2445,Division,Sun,2023-01-22,6:30PM,San Francisco 49ers,Dallas Cowboys,19,12
2446,ConfChamp,Sun,2023-01-29,3:00PM,Philadelphia Eagles,San Francisco 49ers,31,7
2447,ConfChamp,Sun,2023-01-29,6:30PM,Kansas City Chiefs,Cincinnati Bengals,23,20
2448,SuperBowl,Sun,2023-02-12,6:30PM,Kansas City Chiefs,Philadelphia Eagles,38,35
