In [1]:
# Import required modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Url of 2021 recieving stats page.
url = "https://www.pro-football-reference.com/years/2021/"

# Open URL and pass to beautiful soup
html = urlopen(url)
stats_page = BeautifulSoup(html)

In [3]:
# Collect table headers
column_headers = stats_page.findAll('tr')[0]
column_headers = [i.getText() for i in column_headers.findAll('th')]
print(column_headers)

['Tm', 'W', 'L', 'W-L%', 'PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS', 'OSRS', 'DSRS']


In [18]:
# Collect table rows
rows = stats_page.findAll('tr')

# Get stats from each row
team_stats = []
for i in range(len(rows)):
    team_stats.append([col.getText() for col in rows[i].findAll('th')])
    team_stats.append([col.getText() for col in rows[i].findAll('td')])

In [19]:
# Check first row of wr_stats
print(team_stats[0:5])

[['Tm', 'W', 'L', 'W-L%', 'PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS', 'OSRS', 'DSRS'], [], [], [' AFC East'], ['Buffalo Bills*']]


In [22]:
# Create dataframe from the scraped headers and player stats
data = pd.DataFrame(team_stats, columns = column_headers[:])

data.head(20)

Unnamed: 0,Tm,W,L,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS
0,Tm,W,L,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,AFC East,,,,,,,,,,,
4,Buffalo Bills*,,,,,,,,,,,
5,5,2,.714,229,109,120,17.1,-4.8,12.4,7.2,5.2,
6,New England Patriots,,,,,,,,,,,
7,4,4,.500,206,164,42,5.3,-2.4,2.8,-0.1,2.9,
8,New York Jets,,,,,,,,,,,
9,2,6,.250,144,251,-107,-13.4,0.7,-12.7,-4.4,-8.3,


In [7]:
# Adjust column names.
rec_data = data.rename(columns={"GS":"G's Started","Y/R":"Yds per Rec","1D":"1st Down Rec"})
rec_data.head()

Unnamed: 0,W,L,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS
0,,,,,,,,,,,
1,AFC East,,,,,,,,,,
2,5,2.0,0.714,229.0,109.0,120.0,17.1,-4.8,12.4,7.2,5.2
3,4,4.0,0.5,206.0,164.0,42.0,5.3,-2.4,2.8,-0.1,2.9
4,2,6.0,0.25,144.0,251.0,-107.0,-13.4,0.7,-12.7,-4.4,-8.3


In [8]:
# Create data subset to change data types.
numerical_categories = ["Age","G's Started","Tgt","Rec","Ctch%","Yds","Yds per Rec","TD","1st Down Rec","Lng","Y/Tgt","R/G","Y/G","Fmb"]

# Create new data subset.
rec_data_subset = rec_data[["Player","Tm","Pos"] + numerical_categories]
rec_data_subset.head()

KeyError: "None of [Index(['Player', 'Tm', 'Pos', 'Age', 'G's Started', 'Tgt', 'Rec', 'Ctch%',\n       'Yds', 'Yds per Rec', 'TD', '1st Down Rec', 'Lng', 'Y/Tgt', 'R/G',\n       'Y/G', 'Fmb'],\n      dtype='object')] are in the [columns]"

In [None]:
# Check data types of all columns
rec_data_subset.dtypes

In [None]:
# Remove % sign from catch%.
rec_data_subset["Ctch%"] = rec_data_subset["Ctch%"].str.replace("%","")
rec_data_subset.head()

In [None]:
# Convert data to numerical data where necessary
for i in numerical_categories:
    rec_data_subset[i] = pd.to_numeric(rec_data_subset[i])

In [None]:
rec_data_subset.head()
rec_data_subset.dtypes

In [None]:
# Import plotting libraries
import matplotlib
import hvplot.pandas

In [None]:
rec_data_subset = rec_data_subset.sort_values(by=['Yds'],ascending=False)
rec_data_subset.head()