The code in this [blog](https://www.fantasyfootballdatapros.com/blog/intermediate/4) was tremendously useful for writing the code below.

In [1]:
# !pip install requests beautifulsoup4 html5lib pandas
from requests import get
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
print("Import Complete")

Import Complete


The following target variables are the ones we need to pull. Most are accounted for with the scraping done below, but not all of them. The highlighted variables are not included in the scraping below.


1. PassingYds
2. PassingTDs
3. Int
4. **Fum**
5. RushingYds
6. RushingTDs
7. Receptions
8. ReceivingYds
9. ReceivingTDs
10. **2PtConversion**



In [11]:
def passing_url(year, week, offset):
  """
  Parameters are a year, week, and offset value
  The offset value is used to navigate the pages of game results, which are 
  limited to 100 records each.
  Returns a string that is a URL of pro-football-reference.com to scrape.
  """
  passingURL = ("https://www.pro-football-reference.com/play-index/pgl_finder"
                ".cgi?request=1&match=game&year_min={year}&year_max={year}"
                "&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A"
                "&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99"
                "&week_num_min={week}&week_num_max={week}&game_day_of_week="
                "&game_location=&game_result=&handedness=&is_active=&is_hof="
                "&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val="
                "&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val="
                "&order_by=pass_rating&from_link=1&offset={offset}").format(year=year, week=week, offset=offset)
  print(passingURL)
  return passingURL

def receiving_url(year, week, offset):
  """
  Parameters are a year, week, and offset value
  The offset value is used to navigate the pages of game results, which are 
  limited to 100 records each.
  Returns a string that is a URL of pro-football-reference.com to scrape.
  """
  receivingURL = ("https://www.pro-football-reference.com/play-index/pgl_finder"
                  ".cgi?request=1&match=game&year_min={year}&year_max={year}"
                  "&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A"
                  "&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99"
                  "&week_num_min={week}&week_num_max={week}&game_day_of_week="
                  "&game_location=&game_result=&handedness=&is_active=&is_hof="
                  "&c1stat=rec&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val="
                  "&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val="
                  "&order_by=rec_yds&from_link=1&offset={offset}").format(year=year, week=week, offset=offset)
  return receivingURL

def rushing_url(year, week, offset):
  """
  Parameters are a year, week, and offset value
  The offset value is used to navigate the pages of game results, which are 
  limited to 100 records each.
  Returns a string that is a URL of pro-football-reference.com to scrape.
  """
  rushingURL = ("https://www.pro-football-reference.com/play-index/pgl_finder"
                ".cgi?request=1&match=game&year_min={year}&year_max={year}"
                "&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A"
                "&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99"
                "&week_num_min={week}&week_num_max={week}&game_day_of_week="
                "&game_location=&game_result=&handedness=&is_active=&is_hof="
                "&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val="
                "&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val="
                "&order_by=rush_yds&from_link=1&offset={offset}").format(year=year, week=week, offset=offset)
  return rushingURL


print("Modules Loaded")

Modules Loaded


In [12]:
def parse_pfr_html(result_category, year, week):
  """
  This function uses beautifulsoup to parse html on webpages from 
  pro-football-reference.com. 
  It takes a result category: 'passing' , 'receiving', 'rushing'
  It takes a year of the result and a week of the result
  It returns a dataframe that is the union of each of the separate 100-record-
  limited pages of results tables in the given year, week, and result category
  """
  next_page = True
  offset = 0
  base = pd.DataFrame()
  while next_page == True:
    if result_category == 'Passing':
      response = get(passing_url(year=year, week=week, offset=offset))
    elif result_category == 'Receiving':
      response = get(receiving_url(year=year, week=week, offset=offset))
    elif result_category == 'Rushing':
      response = get(rushing_url(year=year, week=week, offset=offset))   
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'results'})
    df = pd.read_html(str(table))[0]
    df.columns = df.columns.droplevel(level = 0)
    df.drop(['Result', 'Week', 'G#', 'Unnamed: 7_level_1', 'Age', 'Rk', 'Lg', 'Date', 'Day'], **defColumnSettings)
    df = df[df['Pos'] != 'Pos']
    df.set_index(['Player', 'Pos', 'Tm', 'Opp'], inplace=True)
    if result_category == 'Passing':
      df = df[['Yds', 'TD', 'Int', 'Att', 'Cmp']]
      df.rename({'Yds': 'PassingYds', 'Att': 'PassingAtt', 'Y/A': 'Y/PassingAtt', 'TD': 'PassingTD'}, **defColumnSettings)
    elif result_category =='Receiving':
        df = df[['Rec', 'Tgt', 'Yds', 'TD']]
        df.rename({'Yds': 'ReceivingYds', 'TD': 'ReceivingTD'}, **defColumnSettings)
    elif result_category == 'Rushing':
        df.drop('Y/A', **defColumnSettings)
        df.rename({'Att': 'RushingAtt', 'Yds': 'RushingYds', 'TD': 'RushingTD'}, **defColumnSettings)
    df['Yr'] = year
    df['Wk'] = week
    df.set_index(['Yr', 'Wk'], append=True, inplace=True)
    base = pd.concat([base, df])

    # if the nextpage button doesn't exist, flip the next_page variable to false
    # if the nextpage button exists, increment offset and run through the loop again
    if len(soup.select('a[class^="button2 next"]')) == 0:
        next_page = False
    else:
      offset += 100
  #print(base.shape)
  #for col in base.columns:
    #print(col)
  print(" ")
  return base

print("Modules Loaded")

Modules Loaded


In [17]:
dfs = []
defColumnSettings = {
    'axis': 1,
    'inplace': True
}
print(" ")
print("2020 Start")
print(" ")

for year in range(2020, 2021):
  for week in range(1, 8):
    dfs.append(parse_pfr_html(result_category='Passing', year=year, week=week))
    dfs.append(parse_pfr_html(result_category='Receiving', year=year, week=week))
    dfs.append(parse_pfr_html(result_category='Rushing', year=year, week=week))

print(dfs)
print("Modules Loaded")

 
2020 Start
 
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2020&year_max=2020&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=1&week_num_max=1&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1&offset=0
 
 
 
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2020&year_max=2020&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=2&week_num_max=2&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1&of

In [14]:
adjoined_dfs = []
counter = 1
for df in dfs:
  if counter == 1:
    adjoined_df = df
    counter += 1
  elif counter == 2:
    #adjoined_df = adjoined_df.join(df, on=['Player', 'Pos', 'Tm', 'Opp', 'Yr', 'Wk'], how='outer')
    adjoined_df = adjoined_df.join(df, on=['Player', 'Pos','Tm', 'Opp', 'Yr', 'Wk'], how='outer')
    counter += 1
  elif counter == 3:
    adjoined_df = adjoined_df.join(df, on=['Player', 'Pos','Tm', 'Opp', 'Yr', 'Wk'], how='outer')
    adjoined_dfs.append(adjoined_df)
    counter = 1

union_df = pd.DataFrame()
for df in adjoined_dfs:
  union_df = pd.concat([union_df, df])

union_df.fillna(0, inplace=True)
union_df = union_df.astype('int64')



union_df['PY_ind'] = np.where(union_df['PassingYds']>300, 3, 0)
union_df['RY_ind'] = np.where(union_df['RushingYds']>100, 3, 0)
union_df['RecY_ind'] = np.where(union_df['ReceivingYds']>100, 3, 0)
union_df['FantasyPoints'] = union_df['PassingYds']/25 \
                            + union_df['PassingTD']*4 \
                            - union_df['Int']*2 \
                            + union_df['Rec'] \
                            + union_df['ReceivingYds']/10 \
                            + union_df['ReceivingTD']*6 \
                            + union_df['RushingYds']/10 \
                            + union_df['RushingTD']*6
union_df['FantasyPoints_DK'] = union_df['PassingYds']/25 \
                            + union_df['PassingTD']*4 \
                            - union_df['Int']*2 \
                            + union_df['Rec'] \
                            + union_df['ReceivingYds']/10 \
                            + union_df['ReceivingTD']*6 \
                            + union_df['RushingYds']/10 \
                            + union_df['RushingTD']*6 + union_df['PY_ind'] + union_df['RY_ind'] + union_df['RecY_ind']
print(union_df)
union_df.reset_index(inplace=True)

print("Modules Loaded")

                                       PassingYds  PassingTD  Int  PassingAtt  \
Player            Pos Tm  Opp Yr   Wk                                           
NaN               NaN NaN NaN 2020 1            0          0    0           0   
                                   1            0          0    0           0   
                                   1            0          0    0           0   
                                   1            0          0    0           0   
                                   1            0          0    0           0   
...                                           ...        ...  ...         ...   
Justin Herbert    QB  LAC JAX 2020 7            0          0    0           0   
Todd Gurley       RB  ATL DET 2020 7            0          0    0           0   
Darrell Henderson RB  LAR CHI 2020 7            0          0    0           0   
Josh Allen        QB  BUF NYJ 2020 7            0          0    0           0   
Frank Gore        RB  NYJ BU

In [15]:
allowed_pos = ['QB','RB','WR','TE']
union_df.loc[~union_df['Pos'].isin(allowed_pos), 'Pos'] = "RB"

union_df['Tm'] = union_df['Tm'].replace({'NYG':'15260879', 'PIT':'15260860', 'JAX':'15260884', 'DEN':'15260873', 
                                        'CHI':'15260864', 'TEN':'15260871', 'MIN':'15260877', 'ATL':'15260883', 
                                        'TAM':'15260886', 'DAL':'15260870', 'KAN':'15260868', 'LAR':'15260882',
                                        'LAC':'15260872', 'CIN':'15260869', 'PHI':'15260862', 'GNB':'15260875', 
                                        'WAS':'15260891', 'CAR':'15260878', 'DET':'15260874', 'NOR':'15260880', 
                                        'NWE':'15260865', 'BUF':'15260861', 'BAL':'15260866', 'SFO':'15260863', 
                                        'IND':'15260867', 'SEA':'15260881','ARI':'15260888', 'CLE':'15260885', 
                                         'HOU':'15260889', 'LVR':'15260876', 'MIA':'15260890', 'NYJ':'15260887'})

union_df['Opp'] = union_df['Opp'].replace({'NYG':'15260879', 'PIT':'15260860', 'JAX':'15260884', 'DEN':'15260873', 
                                        'CHI':'15260864', 'TEN':'15260871', 'MIN':'15260877', 'ATL':'15260883', 
                                        'TAM':'15260886', 'DAL':'15260870', 'KAN':'15260868', 'LAR':'15260882',
                                        'LAC':'15260872', 'CIN':'15260869', 'PHI':'15260862', 'GNB':'15260875', 
                                        'WAS':'15260891', 'CAR':'15260878', 'DET':'15260874', 'NOR':'15260880', 
                                        'NWE':'15260865', 'BUF':'15260861', 'BAL':'15260866', 'SFO':'15260863', 
                                        'IND':'15260867', 'SEA':'15260881','ARI':'15260888', 'CLE':'15260885', 
                                         'HOU':'15260889', 'LVR':'15260876', 'MIA':'15260890', 'NYJ':'15260887'})

union_df['PerfMatch'] = union_df['Player'] + union_df['Pos'] + union_df['Tm']
union_df = union_df.dropna(subset=['PerfMatch'])
print(union_df.shape)

print("Modules Loaded")

(342, 24)
Modules Loaded


In [16]:
# union_df.shape
YearWeekOffensivePlayerData = union_df.to_csv(r'.\data\2020\performance\Performance.csv', encoding='utf-8')

print("Modules Loaded")

Modules Loaded


In [21]:
url = "https://stathead.com/football/pgl_finder.cgi?request=1&match=game&year_min=2019&year_max=2019&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=1&week_num_max=1&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1&offset=0"
response = get(url)

url = "https://stathead.com/football/play_finder.cgi?request=1&match=summary_all&sb=0&order_by_asc=0&order_by=yards&year_min=2020&game_type=R&game_num_min=0&game_num_max=99&week_num_min=1&week_num_max=1&quarter%5B%5D=1&quarter%5B%5D=2&quarter%5B%5D=3&quarter%5B%5D=4&quarter%5B%5D=5&minutes_max=15&seconds_max=0&minutes_min=0&seconds_min=0&down%5B%5D=0&down%5B%5D=1&down%5B%5D=2&down%5B%5D=3&down%5B%5D=4&field_pos_min_field=team&field_pos_max_field=team&end_field_pos_min_field=team&end_field_pos_max_field=team&type%5B%5D=PASS&type%5B%5D=RUSH&type%5B%5D=PUNT&type%5B%5D=KOFF&type%5B%5D=ONSD&type%5B%5D=FG&type%5B%5D=XP&type%5B%5D=2PC&no_play=N&turnover_type%5B%5D=interception&turnover_type%5B%5D=fumble&score_type%5B%5D=touchdown&score_type%5B%5D=field_goal&score_type%5B%5D=safety&rush_direction%5B%5D=LE&rush_direction%5B%5D=LT&rush_direction%5B%5D=LG&rush_direction%5B%5D=M&rush_direction%5B%5D=RG&rush_direction%5B%5D=RT&rush_direction%5B%5D=RE&pass_location%5B%5D=SL&pass_location%5B%5D=SM&pass_location%5B%5D=SR&pass_location%5B%5D=DL&pass_location%5B%5D=DM&pass_location%5B%5D=DR#offense::none"
response = get(url)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
#table = soup.find('table', {'id': 'results'})
#df = pd.read_html(str(table))[0]
#df.columns = df.columns.droplevel(level = 0)


<!DOCTYPE html>

<html class="no-js" data-root="/home/pfr/deploy/www" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
<head id="suppress_all_ads">
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
<link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202010221" rel="dns-prefetch"/>
<title>Football | Game Play Finder | Stathead.com</title>
<meta content="Game Play Finder" name="Description"/>
<link href="https://www.stathead.com/football/play_finder.cgi" rel="canonical"/>
<!-- include:start ="/inc/stathead/klecko_header_stathead.html_f" -->
<!-- no:cookie fast load the css.           -->
<script>function gup(n) {n = n.replace(/[\[]/, '\\[').replace(/[\]]/, '\\]'); var r = new RegExp('[\\?&]'+n+'=([^&#]*)'); var re = r.exec(location.search);   return re === null?'':decodeURIComponent(re[1].replace(/\+/g,' '));}; document.srdev = gup(

In [30]:
https://stathead.com/football/play_finder.cgi?request=1&match=summary_all&sb=0&order_by_asc=0&order_by=yards&year_min=2020&game_type=R&game_num_min=0&game_num_max=99&week_num_min=4&week_num_max=4&quarter%5B%5D=1&quarter%5B%5D=2&quarter%5B%5D=3&quarter%5B%5D=4&quarter%5B%5D=5&minutes_max=15&seconds_max=0&minutes_min=0&seconds_min=0&down%5B%5D=0&down%5B%5D=1&down%5B%5D=2&down%5B%5D=3&down%5B%5D=4&field_pos_min_field=team&field_pos_max_field=team&end_field_pos_min_field=team&end_field_pos_max_field=team&type%5B%5D=PASS&type%5B%5D=RUSH&type%5B%5D=PUNT&type%5B%5D=KOFF&type%5B%5D=ONSD&type%5B%5D=FG&type%5B%5D=XP&type%5B%5D=2PC&no_play=N&turnover_type%5B%5D=interception&turnover_type%5B%5D=fumble&score_type%5B%5D=touchdown&score_type%5B%5D=field_goal&score_type%5B%5D=safety&rush_direction%5B%5D=LE&rush_direction%5B%5D=LT&rush_direction%5B%5D=LG&rush_direction%5B%5D=M&rush_direction%5B%5D=RG&rush_direction%5B%5D=RT&rush_direction%5B%5D=RE&pass_location%5B%5D=SL&pass_location%5B%5D=SM&pass_location%5B%5D=SR&pass_location%5B%5D=DL&pass_location%5B%5D=DM&pass_location%5B%5D=DR


response = get(url)
soup = BeautifulSoup(response.content, 'html.parser')
#print(soup.prettify())
print(soup.find_all("a"))

[<a href="https://www.sports-reference.com/"><svg height="15px" width="20px"><use xlink:href="#ic-sr-pennant"></use></svg> Sports Reference</a>, <a href="https://www.baseball-reference.com/">Baseball</a>, <a href="https://www.pro-football-reference.com/">Football</a>, <a href="https://www.sports-reference.com/cfb/">(college)</a>, <a href="https://www.basketball-reference.com/">Basketball</a>, <a href="https://www.sports-reference.com/cbb/">(college)</a>, <a href="https://www.hockey-reference.com/">Hockey</a>, <a href="https://fbref.com/it/">Calcio</a>, <a href="https://www.sports-reference.com/blog/">Blog</a>, <a href="https://stathead.com/?utm_source=web&amp;utm_medium=stathead&amp;utm_campaign=sr-nav-bar-top-link">Stathead</a>, <a href="https://widgets.sports-reference.com/">Widgets</a>, <a href="https://www.sports-reference.com/feedback/">Questions or Comments?</a>, <a href="https://stathead.com/profile/?utm_source=web&amp;utm_medium=stathead&amp;utm_campaign=sr-nav-bar-top-account"

In [31]:
url = "https://stathead.com/football/play_finder.cgi?request=1&match=summary_all&sb=0&order_by_asc=0&order_by=yards&year_min=2020&game_type=R&game_num_min=0&game_num_max=99&week_num_min=4&week_num_max=4&quarter%5B%5D=1&quarter%5B%5D=2&quarter%5B%5D=3&quarter%5B%5D=4&quarter%5B%5D=5&minutes_max=15&seconds_max=0&minutes_min=0&seconds_min=0&down%5B%5D=0&down%5B%5D=1&down%5B%5D=2&down%5B%5D=3&down%5B%5D=4&field_pos_min_field=team&field_pos_max_field=team&end_field_pos_min_field=team&end_field_pos_max_field=team&type%5B%5D=PASS&type%5B%5D=RUSH&type%5B%5D=PUNT&type%5B%5D=KOFF&type%5B%5D=ONSD&type%5B%5D=FG&type%5B%5D=XP&type%5B%5D=2PC&no_play=N&turnover_type%5B%5D=interception&turnover_type%5B%5D=fumble&score_type%5B%5D=touchdown&score_type%5B%5D=field_goal&score_type%5B%5D=safety&rush_direction%5B%5D=LE&rush_direction%5B%5D=LT&rush_direction%5B%5D=LG&rush_direction%5B%5D=M&rush_direction%5B%5D=RG&rush_direction%5B%5D=RT&rush_direction%5B%5D=RE&pass_location%5B%5D=SL&pass_location%5B%5D=SM&pass_location%5B%5D=SR&pass_location%5B%5D=DL&pass_location%5B%5D=DM&pass_location%5B%5D=DR#offense::none"

import urllib.request

print('Beginning file download with urllib2...')

urllib.request.urlretrieve(url, 'OffenseWeek4.xls')

Beginning file download with urllib2...


('OffenseWeek4.xls', <http.client.HTTPMessage at 0x1a495bda948>)

In [2]:
def GetDKPoints():
    DK = []
    for w in range(1,18):
        try:
            url = str('http://rotoguru1.com/cgi-bin/fyday.pl?week='+str(w)+'&game=dk&scsv=1')
            response = get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            data = str(soup.find("pre"))
            lines = data.split("\n")
            l = []
            for line in lines:
                l.append(line.split(";"))
            headers = l.pop(0)
            headers[0] = "Week"
            l.pop()
            df = pd.DataFrame(l, columns=headers)
            DK.append(df)
        except:
            pass
    DKPoints = pd.concat(DK)
    return DKPoints

DKPoints = GetDKPoints()
print(DKPoints.head())
print(DKPoints.shape)
DKPoints.to_csv('data/2020/performance/DKPoints.csv')

  Week  Year   GID             Name Pos Team h/a Oppt DK points DK salary
0    1  2020  1412  Wilson, Russell  QB  sea   a  atl     34.78      7000
1    1  2020  1252   Rodgers, Aaron  QB  gnb   a  min     33.76      6300
2    1  2020  1529      Allen, Josh  QB  buf   h  nyj     33.18      6500
3    1  2020  1301       Ryan, Matt  QB  atl   h  sea      27.9      6700
4    1  2020  1527   Jackson, Lamar  QB  bal   h  cle      27.5      8100
(5639, 10)


AttributeError: 'NoneType' object has no attribute 'find_all'

In [1]:
import csv 
import json 
  
  
# Function to convert a CSV to JSON 
# Takes the file paths as arguments 
def make_json(csvFilePath, jsonFilePath): 
      
    # create a dictionary 
    data = {} 
      
    # Open a csv reader called DictReader 
    with open(csvFilePath, encoding='utf-8') as csvf: 
        csvReader = csv.DictReader(csvf) 
          
        # Convert each row into a dictionary  
        # and add it to data 
        for rows in csvReader: 
              
              
            # Assuming a column named 'No' to 
            # be the primary key 
            key = rows['Name + ID'] 
            data[key] = rows 
  
    # Open a json writer, and use the json.dumps()  
    # function to dump data 
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf: 
        jsonf.write(json.dumps(data, indent=4)) 
          
# Driver Code 
  
# Decide the two file paths according to your  
# computer system 
csvFilePath = r'data\2020\draftkings\Week15.csv'
jsonFilePath = r'data\2020\draftkings\Week15.json'
  
# Call the make_json function 
make_json(csvFilePath, jsonFilePath)