In [1]:
# import scraping modules
from urllib.request import urlopen
from bs4 import BeautifulSoup

# import pandas, numpy, and matplotlib
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# page to scrape: the 2021 passing table on pro-football-reference.com
url = "https://www.pro-football-reference.com/years/2021/passing.htm"

# open url and pass to beautiful soup
# - the `with` block closes the HTTP response as soon as parsing is done
# - the parser is named explicitly so bs4 does not warn about guessing one
#   and the parse does not depend on which optional parsers are installed
with urlopen(url) as html:
    stats_page = BeautifulSoup(html, "html.parser")

For these tools to be effective, we have to determine patterns in the page source. In our case, the data is nicely formatted in a table, so we can find all the table rows (tr) and columns (td) and extract the text directly from the cells.
First, we need to collect the column headers so we can use them later in our DataFrame. To do this, we find the first tr element in the page and collect the text from all the table headers (th):

In [3]:
# collect table headers
# the first <tr> on the page is the header row; each of its <th> cells
# holds one column label. find_all()/get_text() are the documented bs4
# spellings of the legacy findAll()/getText() aliases.
header_row = stats_page.find_all('tr')[0]
column_headers = [th.get_text() for th in header_row.find_all('th')]

In [4]:
# show the scraped header labels so they can be checked against the table
print(column_headers)

['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC', 'GWD']


To collect the actual data, we will first collect all the table rows (tr) and store them in a list. Then, we iterate through each row and collect the text in each column (td) with getText():

In [5]:
# collect table rows (every <tr> after the header row)
rows = stats_page.find_all('tr')[1:]

# get stats from each row
# one inner list of cell strings per <tr>; a row that contains no <td>
# cells contributes an empty list. find_all()/get_text() are the
# documented bs4 spellings of the legacy findAll()/getText() aliases.
qb_stats = [
    [cell.get_text() for cell in row.find_all('td')]
    for row in rows
]

In [6]:
# look at first row of qb stats: a flat list with one string per td cell
print(qb_stats[0])

['Tom Brady*', 'TAM', '44', 'QB', '17', '17', '13-4-0', '485', '719', '67.5', '5316', '43', '6', '12', '1.7', '269', '62', '7.4', '7.8', '11.0', '312.7', '102.1', '68.4', '22', '144', '3', '6.98', '7.41', '3', '5']


In [7]:
# build the DataFrame from the scraped rows
# the first header is dropped because the rows carry no value for it
# (each scraped row starts at the player name)
data = pd.DataFrame(
    data=qb_stats,
    columns=column_headers[1:],
)
data.head(n=15)

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,...,Y/G,Rate,QBR,Sk,Yds,Sk%,NY/A,ANY/A,4QC,GWD
0,Tom Brady*,TAM,44,QB,17,17,13-4-0,485,719,67.5,...,312.7,102.1,68.4,22,144,3.0,6.98,7.41,3.0,5.0
1,Justin Herbert*,LAC,23,QB,17,17,9-8-0,443,672,65.9,...,294.9,97.7,66.0,31,214,4.4,6.83,6.95,5.0,5.0
2,Matthew Stafford,LAR,33,QB,17,17,12-5-0,404,601,67.2,...,287.4,102.9,63.5,30,243,4.8,7.36,7.45,3.0,4.0
3,Patrick Mahomes*,KAN,26,QB,17,17,12-5-0,436,658,66.3,...,284.6,98.5,62.7,28,146,4.1,6.84,7.07,3.0,3.0
4,Derek Carr,LVR,30,QB,17,17,10-7-0,428,626,68.4,...,282.6,94.0,52.8,40,241,6.0,6.85,6.6,3.0,6.0
5,Joe Burrow,CIN,25,QB,16,16,10-6-0,366,520,70.4,...,288.2,108.3,54.4,51,370,8.9,7.43,7.51,2.0,3.0
6,Dak Prescott,DAL,28,QB,16,16,11-5-0,410,596,68.8,...,278.1,104.2,54.7,30,144,4.8,6.88,7.34,1.0,2.0
7,Josh Allen,BUF,25,QB,17,17,11-6-0,409,646,63.3,...,259.2,92.2,60.7,26,164,3.9,6.31,6.38,,
8,Kirk Cousins,MIN,33,QB,16,16,8-8-0,372,561,66.3,...,263.8,103.1,52.0,28,197,4.8,6.83,7.42,3.0,4.0
9,Aaron Rodgers*+,GNB,38,QB,16,16,13-3-0,366,531,68.9,...,257.2,111.9,68.9,30,188,5.3,7.0,8.0,1.0,2.0


## Clean up DataFrame

In [8]:
# view columns (note that the label 'Yds' appears twice)
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

In [9]:
# rename sack yards column to Yds_Sack
# 'Yds' appears twice in the labels; the one sixth from the end (right
# after 'Sk') is the sack-yards column, so it is addressed by position.
# Work on a copy of the labels: data.columns.values hands back the Index's
# own array, and assigning into that would alter the Index in place.
new_columns = data.columns.to_numpy(copy=True)
new_columns[-6] = 'Yds_Sack'
data.columns = new_columns

In [10]:
# re-check the labels after the rename
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds_Sack', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

## Create a new DataFrame for QBs

In [11]:
# statistics that will appear on the chart
cats = [
    'QBrec', 'Cmp%', 'Yds', 'TD',
    'Int', 'Y/A', 'Rate', 'QBR',
]

# subset of the full table: player name and team, followed by the chosen stats
qb_chart = data[['Player', 'Tm', *cats]]
qb_chart.head(n=20)

Unnamed: 0,Player,Tm,QBrec,Cmp%,Yds,TD,Int,Y/A,Rate,QBR
0,Tom Brady*,TAM,13-4-0,67.5,5316,43,12,7.4,102.1,68.4
1,Justin Herbert*,LAC,9-8-0,65.9,5014,38,15,7.5,97.7,66.0
2,Matthew Stafford,LAR,12-5-0,67.2,4886,41,17,8.1,102.9,63.5
3,Patrick Mahomes*,KAN,12-5-0,66.3,4839,37,13,7.4,98.5,62.7
4,Derek Carr,LVR,10-7-0,68.4,4804,23,14,7.7,94.0,52.8
5,Joe Burrow,CIN,10-6-0,70.4,4611,34,14,8.9,108.3,54.4
6,Dak Prescott,DAL,11-5-0,68.8,4449,37,10,7.5,104.2,54.7
7,Josh Allen,BUF,11-6-0,63.3,4407,36,15,6.8,92.2,60.7
8,Kirk Cousins,MIN,8-8-0,66.3,4221,33,7,7.5,103.1,52.0
9,Aaron Rodgers*+,GNB,13-3-0,68.9,4115,37,4,7.7,111.9,68.9


In [None]:
# check data types