In [2]:
# Import scraping modules
from urllib.request import urlopen
from bs4 import BeautifulSoup


# Import data manipulation modules
import pandas as pd
import numpy as np
# Import data visualization modules
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
# URL of the stats page to scrape
url = 'https://www.fantasypros.com/nfl/stats/qb.php?scoring=HALF&ownership=y'
# Fetch the page and parse it while the connection is open; the `with` block
# closes the HTTP response afterwards instead of leaving it open in the kernel.
# The parser is named explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed (and bs4 does not warn
# about having to guess one).
with urlopen(url) as html:
    stats_page = BeautifulSoup(html, 'html.parser')


In [4]:
# Collect table headers: the <tr> at index 1 (the second row on the page)
# is treated as the header row, and the text of each <th> becomes a label
header_row = stats_page.find_all('tr')[1]
column_headers = [th.get_text() for th in header_row.find_all('th')]

print(column_headers)

['Rank', 'Player', 'CMP', 'ATT', 'PCT', 'YDS', 'Y/A', 'TD', 'INT', 'SACKS', 'ATT', 'YDS', 'TD', 'FL', 'G', ' FPTS', 'FPTS/G', 'OWN']


In [5]:
# Collect table rows: every <tr> from index 2 onwards is taken as a data row
rows = stats_page.find_all('tr')[2:]
# Get stats from each row: one list of <td> cell texts per row, in page order
qb_stats = [[cell.get_text() for cell in row.find_all('td')] for row in rows]

In [6]:
# Assemble the scraped rows into a DataFrame labelled with the scraped headers.
# Every cell is still text at this point (the values come from getText()), and
# the header list repeats ATT, YDS and TD, so those labels are duplicated in df.
df = pd.DataFrame(qb_stats, columns=column_headers)
# Display the frame as the cell output
df


Unnamed: 0,Rank,Player,CMP,ATT,PCT,YDS,Y/A,TD,INT,SACKS,ATT.1,YDS.1,TD.1,FL,G,FPTS,FPTS/G,OWN
0,1,Josh Allen (BUF),396,572,69.2,4544,7.9,37,10,26,102,421,8,6,16,405.7,25.4,100.0%
1,2,Kyler Murray (ARI),375,558,67.2,3971,7.1,26,12,27,133,819,11,4,16,390.7,24.4,100.0%
2,3,Aaron Rodgers (GB),372,526,70.7,4299,8.2,48,5,20,38,149,3,2,16,387.6,24.2,98.0%
3,4,Patrick Mahomes II (KC),390,588,66.3,4740,8.1,38,6,22,62,308,2,2,15,380.3,25.4,100.0%
4,5,Deshaun Watson (HOU),382,544,70.2,4823,8.9,33,7,49,90,444,3,3,16,376.4,23.5,70.0%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,137,Trevor Siemian (NO),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0%
137,138,Kenji Bahar (BAL),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0%
138,139,Matt Schaub (FA),0,0,0,0,0,0,0,0,3,-4,0,0,1,-0.4,-0.4,0.0%
139,140,Nate Sudfeld (SF),5,12,41.7,32,2.7,0,1,2,2,12,0,1,1,-0.5,-0.5,0.0%


In [7]:
# Summarise the raw frame: row count, non-null count and dtype for each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 18 columns):
Rank      141 non-null object
Player    141 non-null object
CMP       141 non-null object
ATT       141 non-null object
PCT       141 non-null object
YDS       141 non-null object
Y/A       141 non-null object
TD        141 non-null object
INT       141 non-null object
SACKS     141 non-null object
ATT       141 non-null object
YDS       141 non-null object
TD        141 non-null object
FL        141 non-null object
G         141 non-null object
 FPTS     141 non-null object
FPTS/G    141 non-null object
OWN       141 non-null object
dtypes: object(18)
memory usage: 20.0+ KB


In [8]:
# Pull the trailing "(TEAM)" tag out of the Player text with a pattern rather
# than a fixed six-character slice: team codes differ in length (BUF vs GB),
# so no single width isolates the tag for every row. The parentheses are kept
# as part of the value; rows without a trailing "(...)" tag get NaN.
df['Team'] = df['Player'].str.extract(r'(\([^()]*\))\s*$', expand=False)
position = 'QB'

df['Pos'] = position

# Columns are renamed by position, not by label, because the scraped headers
# repeat (ATT, YDS and TD each appear twice). Negative positions count back
# from the end of the frame *after* Team and Pos have been appended above.
renames_by_position = {
    -3: 'Pct Owned',
    -8: 'Rush TD',
    -9: 'Rush YDS',
    -5: 'Fantasy Points',
    -4: 'Fantasy Points Per Game',
    4: 'Cmp%',
    5: 'Pass YDS',
    7: 'Pass TD',
    -7: 'Fumbles Lost',
    -6: 'Games',
    -10: 'Rushing ATT',
}
# Edit a list copy of the labels: an Index is meant to be immutable, so
# writing into df.columns.values in place is not a supported way to rename.
new_columns = list(df.columns)
for column_position, new_name in renames_by_position.items():
    new_columns[column_position] = new_name
df.columns = new_columns
df



Unnamed: 0,Rank,Player,CMP,ATT,Cmp%,Pass YDS,Y/A,Pass TD,INT,SACKS,Rushing ATT,Rush YDS,Rush TD,Fumbles Lost,Games,Fantasy Points,Fantasy Points Per Game,Pct Owned,Team,Pos
0,1,Josh Allen (BUF),396,572,69.2,4544,7.9,37,10,26,102,421,8,6,16,405.7,25.4,100.0%,(BUF),QB
1,2,Kyler Murray (ARI),375,558,67.2,3971,7.1,26,12,27,133,819,11,4,16,390.7,24.4,100.0%,(ARI),QB
2,3,Aaron Rodgers (GB),372,526,70.7,4299,8.2,48,5,20,38,149,3,2,16,387.6,24.2,98.0%,(GB),QB
3,4,Patrick Mahomes II (KC),390,588,66.3,4740,8.1,38,6,22,62,308,2,2,15,380.3,25.4,100.0%,(KC),QB
4,5,Deshaun Watson (HOU),382,544,70.2,4823,8.9,33,7,49,90,444,3,3,16,376.4,23.5,70.0%,(HOU),QB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,137,Trevor Siemian (NO),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0%,(NO),QB
137,138,Kenji Bahar (BAL),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0%,(BAL),QB
138,139,Matt Schaub (FA),0,0,0,0,0,0,0,0,3,-4,0,0,1,-0.4,-0.4,0.0%,(FA),QB
139,140,Nate Sudfeld (SF),5,12,41.7,32,2.7,0,1,2,2,12,0,1,1,-0.5,-0.5,0.0%,(SF),QB


In [9]:
# Create data subset for radar chart
categories = ['Games','CMP','ATT','Cmp%','Pass YDS','Y/A','Pass TD','INT','SACKS','Rushing ATT','Rush YDS','Rush TD','Fumbles Lost','Fantasy Points','Fantasy Points Per Game','Pct Owned']
# Identifying columns go first, followed by the stat categories
id_columns = ['Rank', 'Player', 'Team', 'Pos']
# .copy() makes df1 an independent frame. Later cells assign columns into df1,
# and doing that on a bare selection of df raises SettingWithCopyWarning and
# leaves it ambiguous whether df is affected.
df1 = df[id_columns + categories].copy()
df1.head()


Unnamed: 0,Rank,Player,Team,Pos,Games,CMP,ATT,Cmp%,Pass YDS,Y/A,Pass TD,INT,SACKS,Rushing ATT,Rush YDS,Rush TD,Fumbles Lost,Fantasy Points,Fantasy Points Per Game,Pct Owned
0,1,Josh Allen (BUF),(BUF),QB,16,396,572,69.2,4544,7.9,37,10,26,102,421,8,6,405.7,25.4,100.0%
1,2,Kyler Murray (ARI),(ARI),QB,16,375,558,67.2,3971,7.1,26,12,27,133,819,11,4,390.7,24.4,100.0%
2,3,Aaron Rodgers (GB),(GB),QB,16,372,526,70.7,4299,8.2,48,5,20,38,149,3,2,387.6,24.2,98.0%
3,4,Patrick Mahomes II (KC),(KC),QB,15,390,588,66.3,4740,8.1,38,6,22,62,308,2,2,380.3,25.4,100.0%
4,5,Deshaun Watson (HOU),(HOU),QB,16,382,544,70.2,4823,8.9,33,7,49,90,444,3,3,376.4,23.5,70.0%


In [10]:
# Remove the trailing "(TEAM)" tag, and any whitespace around it, from the
# player name. A pattern is used instead of dropping a fixed six characters
# because team codes differ in length (BUF vs GB): a fixed width either cuts
# into the name or leaves stray whitespace behind, depending on the row.
# Names without a trailing "(...)" tag are left unchanged.
# assign() returns a new frame, so no column is written into a possible
# slice of df.
df1 = df1.assign(
    Player=df1['Player'].str.replace(r'\s*\([^()]*\)\s*$', '', regex=True)
)
df1.head()


Unnamed: 0,Rank,Player,Team,Pos,Games,CMP,ATT,Cmp%,Pass YDS,Y/A,Pass TD,INT,SACKS,Rushing ATT,Rush YDS,Rush TD,Fumbles Lost,Fantasy Points,Fantasy Points Per Game,Pct Owned
0,1,Josh Allen,(BUF),QB,16,396,572,69.2,4544,7.9,37,10,26,102,421,8,6,405.7,25.4,100.0%
1,2,Kyler Murray,(ARI),QB,16,375,558,67.2,3971,7.1,26,12,27,133,819,11,4,390.7,24.4,100.0%
2,3,Aaron Rodgers,(GB),QB,16,372,526,70.7,4299,8.2,48,5,20,38,149,3,2,387.6,24.2,98.0%
3,4,Patrick Mahomes II,(KC),QB,15,390,588,66.3,4740,8.1,38,6,22,62,308,2,2,380.3,25.4,100.0%
4,5,Deshaun Watson,(HOU),QB,16,382,544,70.2,4823,8.9,33,7,49,90,444,3,3,376.4,23.5,70.0%


In [11]:
# Remove commas (presumably thousands separators) from both yardage columns
# so the text can be cast to integers afterwards
for yards_col in ('Pass YDS', 'Rush YDS'):
    df1[yards_col] = df1[yards_col].str.replace(',', '')

In [12]:
# Columns holding whole-number counts
ints = ['Games','CMP','ATT','Pass YDS','Pass TD','INT','SACKS','Rushing ATT','Rush YDS','Rush TD','Fumbles Lost']
# Columns holding decimal values
floats = ['Cmp%','Y/A','Fantasy Points','Fantasy Points Per Game']

# Cast everything in one astype() call driven by a column -> dtype mapping.
# This replaces the separate cast of 'Games' (already listed in `ints`) and the
# two per-column assignment loops. Rank and Pct Owned are not listed, so they
# stay as text.
dtype_by_column = {name: 'int64' for name in ints}
dtype_by_column.update({name: 'float64' for name in floats})
df1 = df1.astype(dtype_by_column)

In [13]:
# Check data types after the casts: the listed stat columns should now be
# int64/float64, while columns that were never cast remain object (text)
df1.dtypes


Rank                        object
Player                      object
Team                        object
Pos                         object
Games                        int64
CMP                          int64
ATT                          int64
Cmp%                       float64
Pass YDS                     int64
Y/A                        float64
Pass TD                      int64
INT                          int64
SACKS                        int64
Rushing ATT                  int64
Rush YDS                     int64
Rush TD                      int64
Fumbles Lost                 int64
Fantasy Points             float64
Fantasy Points Per Game    float64
Pct Owned                   object
dtype: object

In [14]:
# Display the cleaned frame (long frames are abbreviated in the rendered output)
df1

Unnamed: 0,Rank,Player,Team,Pos,Games,CMP,ATT,Cmp%,Pass YDS,Y/A,Pass TD,INT,SACKS,Rushing ATT,Rush YDS,Rush TD,Fumbles Lost,Fantasy Points,Fantasy Points Per Game,Pct Owned
0,1,Josh Allen,(BUF),QB,16,396,572,69.2,4544,7.9,37,10,26,102,421,8,6,405.7,25.4,100.0%
1,2,Kyler Murray,(ARI),QB,16,375,558,67.2,3971,7.1,26,12,27,133,819,11,4,390.7,24.4,100.0%
2,3,Aaron Rodgers,(GB),QB,16,372,526,70.7,4299,8.2,48,5,20,38,149,3,2,387.6,24.2,98.0%
3,4,Patrick Mahomes II,(KC),QB,15,390,588,66.3,4740,8.1,38,6,22,62,308,2,2,380.3,25.4,100.0%
4,5,Deshaun Watson,(HOU),QB,16,382,544,70.2,4823,8.9,33,7,49,90,444,3,3,376.4,23.5,70.0%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,137,Trevor Siemian,(NO),QB,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0%
137,138,Kenji Bahar,(BAL),QB,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0%
138,139,Matt Schaub,(FA),QB,1,0,0,0.0,0,0.0,0,0,0,3,-4,0,0,-0.4,-0.4,0.0%
139,140,Nate Sudfeld,(SF),QB,1,5,12,41.7,32,2.7,0,1,2,2,12,0,1,-0.5,-0.5,0.0%


In [15]:
# Write the cleaned stats to QBstats.csv (path is relative to the working
# directory); index=False leaves the row index out of the file
df1.to_csv('QBstats.csv',index=False)