In [1]:
# Import required modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Url of 2021 recieving stats page.
url = "https://www.pro-football-reference.com/years/2021/receiving.htm"

# Open URL and pass to beautiful soup
html = urlopen(url)
stats_page = BeautifulSoup(html)

In [3]:
# Collect table headers
column_headers = stats_page.findAll('tr')[0]
column_headers = [i.getText() for i in column_headers.findAll('th')]
print(column_headers)

['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'Tgt', 'Rec', 'Ctch%', 'Yds', 'Y/R', 'TD', '1D', 'Lng', 'Y/Tgt', 'R/G', 'Y/G', 'Fmb']


In [4]:
# Collect table rows
rows = stats_page.findAll('tr')[1:]

# Get stats from each row
wr_stats = []
for i in range(len(rows)):
    wr_stats.append([col.getText() for col in rows[i].findAll('td')])

In [5]:
# Check first row of wr_stats
print(wr_stats[0])

['Tyreek Hill', 'KAN', '27', 'wr', '8', '8', '90', '64', '71.1%', '735', '11.5', '6', '43', '75', '8.2', '8.0', '91.9', '1']


In [6]:
# Create dataframe from the scraped headers and player stats
data = pd.DataFrame(wr_stats, columns = column_headers[1:])

data.head(5)

Unnamed: 0,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,Fmb
0,Tyreek Hill,KAN,27,wr,8,8,90,64,71.1%,735,11.5,6,43,75,8.2,8.0,91.9,1
1,Cooper Kupp,LAR,28,wr,8,8,90,63,70.0%,924,14.7,10,41,59,10.3,7.9,115.5,0
2,Davante Adams,GNB,29,wr,7,7,73,52,71.2%,744,14.3,3,35,59,10.2,7.4,106.3,0
3,Brandin Cooks,HOU,28,wr,8,8,70,51,72.9%,585,11.5,2,25,52,8.4,6.4,73.1,0
4,Chris Godwin,TAM,25,wr,8,8,69,50,72.5%,660,13.2,4,35,44,9.6,6.3,82.5,1


In [7]:
# Adjust column names.
rec_data = data.rename(columns={"GS":"G's Started","Y/R":"Yds per Rec","1D":"1st Down Rec"})
rec_data.head()

Unnamed: 0,Player,Tm,Age,Pos,G,G's Started,Tgt,Rec,Ctch%,Yds,Yds per Rec,TD,1st Down Rec,Lng,Y/Tgt,R/G,Y/G,Fmb
0,Tyreek Hill,KAN,27,wr,8,8,90,64,71.1%,735,11.5,6,43,75,8.2,8.0,91.9,1
1,Cooper Kupp,LAR,28,wr,8,8,90,63,70.0%,924,14.7,10,41,59,10.3,7.9,115.5,0
2,Davante Adams,GNB,29,wr,7,7,73,52,71.2%,744,14.3,3,35,59,10.2,7.4,106.3,0
3,Brandin Cooks,HOU,28,wr,8,8,70,51,72.9%,585,11.5,2,25,52,8.4,6.4,73.1,0
4,Chris Godwin,TAM,25,wr,8,8,69,50,72.5%,660,13.2,4,35,44,9.6,6.3,82.5,1


In [9]:
# Create data subset to change data types.
numerical_categories = ["Age","G's Started","Tgt","Rec","Ctch%","Yds","Yds per Rec","TD","1st Down Rec","Lng","Y/Tgt","R/G","Y/G","Fmb"]

# Create new data subset.
rec_data_subset = rec_data[["Player","Tm","Pos"] + numerical_categories]
rec_data_subset.head()

Unnamed: 0,Player,Tm,Pos,Age,G's Started,Tgt,Rec,Ctch%,Yds,Yds per Rec,TD,1st Down Rec,Lng,Y/Tgt,R/G,Y/G,Fmb
0,Tyreek Hill,KAN,wr,27,8,90,64,71.1%,735,11.5,6,43,75,8.2,8.0,91.9,1
1,Cooper Kupp,LAR,wr,28,8,90,63,70.0%,924,14.7,10,41,59,10.3,7.9,115.5,0
2,Davante Adams,GNB,wr,29,7,73,52,71.2%,744,14.3,3,35,59,10.2,7.4,106.3,0
3,Brandin Cooks,HOU,wr,28,8,70,51,72.9%,585,11.5,2,25,52,8.4,6.4,73.1,0
4,Chris Godwin,TAM,wr,25,8,69,50,72.5%,660,13.2,4,35,44,9.6,6.3,82.5,1


In [10]:
# Check data types of all columns
rec_data_subset.dtypes

Player          object
Tm              object
Pos             object
Age             object
G's Started     object
Tgt             object
Rec             object
Ctch%           object
Yds             object
Yds per Rec     object
TD              object
1st Down Rec    object
Lng             object
Y/Tgt           object
R/G             object
Y/G             object
Fmb             object
dtype: object

In [11]:
# Remove % sign from catch%.
rec_data_subset["Ctch%"] = rec_data_subset["Ctch%"].str.replace("%","")
rec_data_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Player,Tm,Pos,Age,G's Started,Tgt,Rec,Ctch%,Yds,Yds per Rec,TD,1st Down Rec,Lng,Y/Tgt,R/G,Y/G,Fmb
0,Tyreek Hill,KAN,wr,27,8,90,64,71.1,735,11.5,6,43,75,8.2,8.0,91.9,1
1,Cooper Kupp,LAR,wr,28,8,90,63,70.0,924,14.7,10,41,59,10.3,7.9,115.5,0
2,Davante Adams,GNB,wr,29,7,73,52,71.2,744,14.3,3,35,59,10.2,7.4,106.3,0
3,Brandin Cooks,HOU,wr,28,8,70,51,72.9,585,11.5,2,25,52,8.4,6.4,73.1,0
4,Chris Godwin,TAM,wr,25,8,69,50,72.5,660,13.2,4,35,44,9.6,6.3,82.5,1


In [12]:
# Convert data to numerical data where necessary
for i in numerical_categories:
    rec_data_subset[i] = pd.to_numeric(rec_data_subset[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
rec_data_subset.head()
rec_data_subset.dtypes

Player           object
Tm               object
Pos              object
Age             float64
G's Started     float64
Tgt             float64
Rec             float64
Ctch%           float64
Yds             float64
Yds per Rec     float64
TD              float64
1st Down Rec    float64
Lng             float64
Y/Tgt           float64
R/G             float64
Y/G             float64
Fmb             float64
dtype: object

In [None]:
# Import plotting libraries
import matplotlib
import hvplot.pandas

In [19]:
rec_data_subset = rec_data_subset.sort_values(by=['Yds'],ascending=False)
rec_data_subset.head()

Unnamed: 0,Player,Tm,Pos,Age,G's Started,Tgt,Rec,Ctch%,Yds,Yds per Rec,TD,1st Down Rec,Lng,Y/Tgt,R/G,Y/G,Fmb
1,Cooper Kupp,LAR,wr,28.0,8.0,90.0,63.0,70.0,924.0,14.7,10.0,41.0,59.0,10.3,7.9,115.5,0.0
13,Deebo Samuel,SFO,wr,25.0,7.0,72.0,44.0,61.1,819.0,18.6,4.0,29.0,83.0,11.4,6.3,117.0,2.0
26,Ja'Marr Chase,CIN,wr,21.0,8.0,60.0,38.0,63.3,786.0,20.7,7.0,27.0,82.0,13.1,4.8,98.3,1.0
2,Davante Adams,GNB,wr,29.0,7.0,73.0,52.0,71.2,744.0,14.3,3.0,35.0,59.0,10.2,7.4,106.3,0.0
0,Tyreek Hill,KAN,wr,27.0,8.0,90.0,64.0,71.1,735.0,11.5,6.0,43.0,75.0,8.2,8.0,91.9,1.0
