In [1]:
# import scraping modules
from urllib.request import urlopen
from bs4 import BeautifulSoup

# import pandas, numpy, and matplotlib
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
url = "https://www.pro-football-reference.com/years/2021/passing.htm"

# open url and pass to beautiful soup
html = urlopen(url)
stats_page = BeautifulSoup(html)

In [3]:
# collect table headers
column_headers = stats_page.findAll('tr')[0]
column_headers = [i.getText() for i in column_headers.findAll('th')]

In [4]:
print(column_headers)

['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC', 'GWD']


In [5]:
# collect table rows
rows = stats_page.findAll('tr')[1:]

# get stats from each row
qb_stats = []
for i in range(len(rows)):
    qb_stats.append([col.getText() for col in rows[i].findAll('td')])

In [6]:
# look at first row of qb stats
print(qb_stats[0])

['Tom Brady*', 'TAM', '44', 'QB', '17', '17', '13-4-0', '485', '719', '67.5', '5316', '43', '6', '12', '1.7', '269', '62', '7.4', '7.8', '11.0', '312.7', '102.1', '68.4', '22', '144', '3', '6.98', '7.41', '3', '5']


In [7]:
# create dataframe
data = pd.DataFrame(qb_stats, columns=column_headers[1:])
data.head(15)

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,...,Y/G,Rate,QBR,Sk,Yds,Sk%,NY/A,ANY/A,4QC,GWD
0,Tom Brady*,TAM,44,QB,17,17,13-4-0,485,719,67.5,...,312.7,102.1,68.4,22,144,3.0,6.98,7.41,3.0,5.0
1,Justin Herbert*,LAC,23,QB,17,17,9-8-0,443,672,65.9,...,294.9,97.7,66.0,31,214,4.4,6.83,6.95,5.0,5.0
2,Matthew Stafford,LAR,33,QB,17,17,12-5-0,404,601,67.2,...,287.4,102.9,63.5,30,243,4.8,7.36,7.45,3.0,4.0
3,Patrick Mahomes*,KAN,26,QB,17,17,12-5-0,436,658,66.3,...,284.6,98.5,62.7,28,146,4.1,6.84,7.07,3.0,3.0
4,Derek Carr,LVR,30,QB,17,17,10-7-0,428,626,68.4,...,282.6,94.0,52.8,40,241,6.0,6.85,6.6,3.0,6.0
5,Joe Burrow,CIN,25,QB,16,16,10-6-0,366,520,70.4,...,288.2,108.3,54.4,51,370,8.9,7.43,7.51,2.0,3.0
6,Dak Prescott,DAL,28,QB,16,16,11-5-0,410,596,68.8,...,278.1,104.2,54.7,30,144,4.8,6.88,7.34,1.0,2.0
7,Josh Allen,BUF,25,QB,17,17,11-6-0,409,646,63.3,...,259.2,92.2,60.7,26,164,3.9,6.31,6.38,,
8,Kirk Cousins,MIN,33,QB,16,16,8-8-0,372,561,66.3,...,263.8,103.1,52.0,28,197,4.8,6.83,7.42,3.0,4.0
9,Aaron Rodgers*+,GNB,38,QB,16,16,13-3-0,366,531,68.9,...,257.2,111.9,68.9,30,188,5.3,7.0,8.0,1.0,2.0


## Clean up Data Frame

In [8]:
# view columns
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

In [9]:
# rename sack yards column to Yds_Sack
new_columns = data.columns.values
new_columns[-6] = 'Yds_Sack'
data.columns = new_columns

In [10]:
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds_Sack', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

## Create a new Data Frame for QBs

In [14]:
# stat categories
cats = ['Yds', 'TD', 'Int', 'QBR', 'Rate','Cmp%']

# create data subset for chart and add player name and team
qb_chart = data[['Player', 'Tm'] + cats]
qb_chart.head(20)

Unnamed: 0,Player,Tm,Yds,TD,Int,QBR,Rate,Cmp%
0,Tom Brady*,TAM,5316,43,12,68.4,102.1,67.5
1,Justin Herbert*,LAC,5014,38,15,66.0,97.7,65.9
2,Matthew Stafford,LAR,4886,41,17,63.5,102.9,67.2
3,Patrick Mahomes*,KAN,4839,37,13,62.7,98.5,66.3
4,Derek Carr,LVR,4804,23,14,52.8,94.0,68.4
5,Joe Burrow,CIN,4611,34,14,54.4,108.3,70.4
6,Dak Prescott,DAL,4449,37,10,54.7,104.2,68.8
7,Josh Allen,BUF,4407,36,15,60.7,92.2,63.3
8,Kirk Cousins,MIN,4221,33,7,52.0,103.1,66.3
9,Aaron Rodgers*+,GNB,4115,37,4,68.9,111.9,68.9


In [15]:
# check data types
qb_chart.dtypes

Player    object
Tm        object
Yds       object
TD        object
Int       object
QBR       object
Rate      object
Cmp%      object
dtype: object

In [16]:
# convert data to numerical values
for i in cats:
    qb_chart[i] = pd.to_numeric(data[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qb_chart[i] = pd.to_numeric(data[i])


In [17]:
qb_chart.dtypes

Player     object
Tm         object
Yds       float64
TD        float64
Int       float64
QBR       float64
Rate      float64
Cmp%      float64
dtype: object

In [21]:
# remove * and + for probowl and allpro
qb_chart['Player'] = qb_chart['Player'].str.replace('*', '')
qb_chart['Player'] = qb_chart['Player'].str.replace('*', '')

# filter passing yards
qb_chart_filtered = qb_chart[qb_chart['Yds'] > 1500]

# create columns with percentile rank
for i in cats:
    qb_chart_filtered[i + '_Rank'] = qb_chart_filtered[i].rank(pct=True)
    
# flip the rank for interceptions so that the QB with the most doest have the highest percentile
qb_chart_filtered['Int_Rank'] = 1 - qb_chart_filtered['Int_Rank']

  qb_chart['Player'] = qb_chart['Player'].str.replace('*', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qb_chart['Player'] = qb_chart['Player'].str.replace('*', '')
  qb_chart['Player'] = qb_chart['Player'].str.replace('*', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qb_chart['Player'] = qb_chart['Player'].str.replace('*', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [22]:
# check dataframe
qb_chart_filtered.head(10)

Unnamed: 0,Player,Tm,Yds,TD,Int,QBR,Rate,Cmp%,Yds_Rank,TD_Rank,Int_Rank,QBR_Rank,Rate_Rank,Cmp%_Rank
0,Tom Brady,TAM,5316.0,43.0,12.0,68.4,102.1,67.5,1.0,1.0,0.4375,0.96875,0.8125,0.75
1,Justin Herbert,LAC,5014.0,38.0,15.0,66.0,97.7,65.9,0.96875,0.9375,0.09375,0.9375,0.6875,0.46875
2,Matthew Stafford,LAR,4886.0,41.0,17.0,63.5,102.9,67.2,0.9375,0.96875,0.015625,0.90625,0.84375,0.6875
3,Patrick Mahomes,KAN,4839.0,37.0,13.0,62.7,98.5,66.3,0.90625,0.875,0.3125,0.875,0.71875,0.515625
4,Derek Carr,LVR,4804.0,23.0,14.0,52.8,94.0,68.4,0.875,0.625,0.1875,0.625,0.59375,0.875
5,Joe Burrow,CIN,4611.0,34.0,14.0,54.4,108.3,70.4,0.84375,0.78125,0.1875,0.703125,0.96875,1.0
6,Dak Prescott,DAL,4449.0,37.0,10.0,54.7,104.2,68.8,0.8125,0.875,0.609375,0.75,0.9375,0.90625
7,Josh Allen,BUF,4407.0,36.0,15.0,60.7,92.2,63.3,0.78125,0.8125,0.09375,0.84375,0.53125,0.28125
8,Kirk Cousins,MIN,4221.0,33.0,7.0,52.0,103.1,66.3,0.75,0.75,0.859375,0.5625,0.890625,0.515625
9,Aaron Rodgers+,GNB,4115.0,37.0,4.0,68.9,111.9,68.9,0.71875,0.875,0.96875,1.0,1.0,0.9375
