Comparing team attributes, not teams

## Usable Code

In [1]:
from bs4 import BeautifulSoup
import requests
import re

import pandas as pd

import pickle

The following cell collects the box-score URL stubs for every game. The next cells turn those stubs into full URLs, and each game page is then scraped into one row of a pandas DataFrame.

In [136]:
# Seasons and weeks to request; every week page lists that week's games.
years = range(2011, 2020)
weeks = range(1,22)
# Uncomment for a quick two-page test run:
#years = [2019]
#weeks = [1,2]
game_link_ends = []
for year in years:
    for week in weeks:
        url0 = 'https://www.pro-football-reference.com/years/{}/week_{}.htm'.format(year, week)
        response0 = requests.get(url0)
        # Fail loudly on an HTTP error instead of silently collecting zero links for the week.
        response0.raise_for_status()
        page0 = response0.text
        # Name the parser explicitly (the same one game_stat_dict uses) so the result
        # does not depend on which optional parsers happen to be installed.
        soup0 = BeautifulSoup(page0, 'html.parser')
        for link in soup0.find_all(class_="right gamelink"):
            # The box-score link is the first anchor following the "gamelink" cell.
            anchor = link.find_next('a')
            href = anchor.get('href') if anchor is not None else None
            if href:  # skip cells without a usable link rather than crashing on None.strip()
                game_link_ends.append(href.strip())

Get to the actual game pages where we can extract data we want.

In [137]:
# Prefix each box-score stub with the site root to get a fetchable URL.
url2 = 'https://www.pro-football-reference.com'
url_list = []
for game_link in game_link_ends:
    url = url2 + game_link
    url_list += [url]

In [138]:
#Sanity check: one full URL per scraped link stub, so both counts must match
#(expected 267 games/year)
#print(*url_list, sep='\n')
print(len(url_list), len(game_link_ends))

2403 2403


Now that we are actually on the pages where we can grab info for a model, let's grab it!

In [139]:
def _table_cell_text(soup, url, **attrs):
    '''
    Return the text of every <td> in every <tbody> row of the first <table>
    matching attrs, as a list of rows (each row a list of strings).
    Raises ValueError naming the url when the table or its body is missing.
    '''
    table = soup.find('table', **attrs)
    body = table.find('tbody') if table is not None else None
    if body is None:
        raise ValueError('table {} not found on {}'.format(attrs, url))
    return [[cell.text for cell in row.find_all('td')] for row in body.find_all('tr')]


def _split_stat(cell_text):
    '''
    Turn a dash-joined stat cell such as "22-47-0" into a list of ints.
    A dash only separates when it directly follows a digit, so a negative
    component keeps its sign: "15--3-0" -> [15, -3, 0].
    '''
    return [int(part) for part in re.split(r'(?<=\d)-', cell_text.strip())]


def _sum_stat(cells):
    '''
    Add the first two cells of a stat row component by component,
    e.g. ['22-47-0', '15-46-0'] -> [37, 93, 0].
    '''
    first, second = _split_stat(cells[0]), _split_stat(cells[1])
    return [a + b for a, b in zip(first, second)]


def _percentage(made, attempts):
    '''Return 100*made/attempts rounded to 2 places, or None when attempts is 0.'''
    if attempts == 0:
        return None
    return round(100*made/attempts, 2)


def game_stat_dict(url):
    '''
    Request a Pro-Football-Reference box-score page, parse it with
    BeautifulSoup, and collect both teams' combined numbers:
        - game total (target): sum of the last linescore cell of each team
        - traditional stats (features) from the team stats table
        - efficiency stats (features) from the expected-points table

    url: full URL of the box-score page.

    Returns a dict with 31 keys (see the return statement). The two
    conversion percentages are None when there were no attempts.

    Raises whatever response.raise_for_status() raises on an HTTP error,
    ValueError when an expected table is missing or a cell is not numeric,
    and IndexError when a table has fewer rows/cells than expected.
    '''
    response = requests.get(url)
    response.raise_for_status()
    # Remove the HTML comment markers so tables wrapped in comments are parsed too.
    page = response.text.replace("<!--","").replace("-->","")
    soup = BeautifulSoup(page, 'html.parser')

#scrape totals from scorebox at top of page (last cell of each team's row)
    linescore = _table_cell_text(soup, url, class_="linescore nohover stats_table no_freeze")
    total = sum(int(row[-1]) for row in linescore)

#scrape team stats box for traditional metrics; rows are used by position
    team_stats = _table_cell_text(soup, url, class_="add_controls stats_table")
    total_first_downs = sum(int(item) for item in team_stats[0])
    rush = _sum_stat(team_stats[1])        # att, yds, tds
    passing = _sum_stat(team_stats[2])     # comp, att, yds, tds, int
    sacks = _sum_stat(team_stats[3])       # sacks, sack yds
    net_pass = _sum_stat(team_stats[4])    # single value per team
    tot_yds = _sum_stat(team_stats[5])     # single value per team
    fumbles = _sum_stat(team_stats[6])     # fumbles, fumbles lost
    turnovers = _sum_stat(team_stats[7])   # single value per team
    penalties = _sum_stat(team_stats[8])   # penalties, penalty yds
    third = _sum_stat(team_stats[9])       # conversions, attempts
    fourth = _sum_stat(team_stats[10])     # conversions, attempts

#scrape team stats box for efficiency metrics
    epa = _table_cell_text(soup, url, id='expected_points')
#grab relevant OFFENSIVE values (defensive numbers are the other team's offense with flipped sign)
    scoring_margin = int(abs(float(epa[0][0])))
    total_off_epa = round(float(epa[0][1]) + float(epa[1][1]),2)
    total_pass_epa = round(float(epa[0][2]) + float(epa[1][2]),2)
    total_rush_epa = round(float(epa[0][3]) + float(epa[1][3]),2)
    total_to_epa = round(float(epa[0][4]) + float(epa[1][4]),2)
    # Magnitude of the first row's column 9 only; not summed over teams and not rounded.
    total_spec_epa = abs(float(epa[0][9]))

#make a dictionary of all the stats pulled from the gamepage
    # Key names (including the mixed 'tot_'/'total_' prefixes) are kept exactly as
    # before because they become the DataFrame column names.
    return {
        'total': total,
        'tot_1st': total_first_downs,
        'tot_rush_att': rush[0],
        'tot_rush_yds': rush[1],
        'tot_rush_tds': rush[2],
        'tot_comp': passing[0],
        'tot_att': passing[1],
        'tot_pass_yds': passing[2],
        'total_pass_tds': passing[3],
        'total_int': passing[4],
        'tot_sacks': sacks[0],
        'tot_sack_yds': sacks[1],
        'tot_net_pass_yds': net_pass[0],
        'total_tot_yds': tot_yds[0],
        'tot_fum': fumbles[0],
        'tot_fum_l': fumbles[1],
        'total_to': turnovers[0],
        'tot_pen': penalties[0],
        'tot_pen_yds': penalties[1],
        'tot_third_conv': third[0],
        'tot_thirds': third[1],
        'tot_third_per': _percentage(third[0], third[1]),
        'tot_fourth_conv': fourth[0],
        'tot_fourths': fourth[1],
        'tot_fourth_per': _percentage(fourth[0], fourth[1]),
        'margin': scoring_margin,
        'tot_off_epa': total_off_epa,
        'tot_pass_epa': total_pass_epa,
        'tot_rush_epa': total_rush_epa,
        'tot_to_epa': total_to_epa,
        'tot_spec_epa': total_spec_epa,
    }

In [140]:
# Scrape every box score in url_list order; one stats dict per game.
# Each iteration calls game_stat_dict(url), which performs an HTTP request.
game_list = []

for url in url_list:
    game_list.append(game_stat_dict(url))

In [141]:
# One row per game, one column per key returned by game_stat_dict.
game_df = pd.DataFrame(game_list) #convert list of dicts to DF
game_df.head()

Unnamed: 0,total,tot_1st,tot_rush_att,tot_rush_yds,tot_rush_tds,tot_comp,tot_att,tot_pass_yds,total_pass_tds,total_int,...,tot_third_per,tot_fourth_conv,tot_fourths,tot_fourth_per,margin,tot_off_epa,tot_pass_epa,tot_rush_epa,tot_to_epa,tot_spec_epa
0,76,54,48,184,2,59,84,731,6,0,...,65.38,0,1,0.0,8,43.0,38.99,4.54,-3.32,1.53
1,44,48,58,390,2,32,67,396,2,0,...,40.0,2,4,50.0,18,8.39,3.99,7.93,-15.1,0.21
2,42,37,47,236,1,39,70,504,4,3,...,40.0,0,2,0.0,27,-12.06,-10.1,-0.69,-29.55,3.57
3,30,33,60,206,1,38,58,438,2,1,...,44.83,0,0,,2,3.3,8.45,-5.5,-5.12,2.37
4,41,41,57,231,2,33,55,417,2,2,...,30.0,0,1,0.0,27,-11.81,-1.75,-8.81,-24.37,8.14


In [143]:
# (rows, columns): rows should equal len(game_list), columns the number of dict keys.
game_df.shape

(2403, 31)

In [144]:
# Save the scraped frame to game_df.pickle in the working directory.
with open('game_df.pickle', 'wb') as to_write:
    pickle.dump(game_df, to_write)

In [None]:
#sanity check that all games went through the function:
#game_list gets one entry per URL, so the two counts must be equal
print(len(game_list), len(url_list))

In [129]:
#sanity check link
#url_list[31]

'https://www.pro-football-reference.com/boxscores/201909160nyj.htm'

In [130]:
#sanity check stats
#game_list[31]

{'total': 26,
 'tot_1st': 29,
 'tot_rush_att': 45,
 'tot_rush_yds': 163,
 'tot_rush_tds': 1,
 'tot_comp': 42,
 'tot_att': 66,
 'tot_pass_yds': 526,
 'total_pass_tds': 1,
 'total_int': 1,
 'tot_sacks': 7,
 'tot_sack_yds': 52,
 'tot_net_pass_yds': 474,
 'total_tot_yds': 637,
 'tot_fum': 1,
 'tot_fum_l': 1,
 'total_to': 2,
 'tot_pen': 21,
 'tot_pen_yds': 174,
 'tot_third_conv': 6,
 'tot_thirds': 27,
 'tot_third_per': 22.22,
 'tot_fourth_conv': 0,
 'tot_fourths': 2,
 'tot_fourth_per': 0.0,
 'margin': 20,
 'tot_off_epa': -9.47,
 'tot_pass_epa': -0.95,
 'tot_rush_epa': -6.29,
 'tot_to_epa': -7.22,
 'tot_spec_epa': 0.9}

In [132]:
#sanity check on dictionary key-value pairs: the key list and the value list must have the same length
#NOTE: the total_* / scoring_margin names used below are module-level variables that are only
#assigned by the scratch-work cells further down (inside game_stat_dict they are locals), so this
#cell cannot run on a fresh kernel executed top to bottom.
headers = ['total','tot_1st','tot_rush_att','tot_rush_yds','tot_rush_tds',
               'tot_comp','tot_att','tot_pass_yds','total_pass_tds','total_int',
               'tot_sacks','tot_sack_yds','tot_net_pass_yds','total_tot_yds',
               'tot_fum','tot_fum_l','total_to','tot_pen','tot_pen_yds',
               'tot_third_conv','tot_thirds','tot_third_per',
               'tot_fourth_conv','tot_fourths','tot_fourth_per',
               'margin','tot_off_epa','tot_pass_epa','tot_rush_epa','tot_to_epa','tot_spec_epa']
stats = [total,total_first_downs,total_rush_att,total_rush_yds,total_rush_tds,
         total_comp,total_att,total_pass_yds,total_pass_tds,total_int,
         total_sacks,total_sack_yds,total_net_pass_yds,total_tot_yds,
         total_fum,total_fum_l,total_to,total_pen,total_pen_yds,
         total_third_conv,total_thirds,total_third_per,
         total_fourth_conv,total_fourths,total_fourth_per,
         scoring_margin,total_off_epa,total_pass_epa,total_rush_epa,total_to_epa,total_spec_epa]

#difference in lengths; 0 means every key has a value
#len(headers) - len(stats)

0

## Scratch Work Area

### Workable Page

In [83]:
# First scraped box-score URL; the cell below fetches this page as the working example.
url_list[0]

'https://www.pro-football-reference.com/boxscores/201909050chi.htm'

In [84]:
#fetch the first game to get a sense of the page structure; the HTML comment
#markers are stripped so tables wrapped in comments get parsed as well
response = requests.get(url_list[0])
page = response.text.replace("<!--","").replace("-->","")
soup = BeautifulSoup(page, 'html.parser')

### Grab Target

In [99]:
#game total = sum of the last linescore cell of every team row
table = soup.find('table', class_="linescore nohover stats_table no_freeze")
table_body = table.find('tbody')
rows = table_body.find_all('tr')

data = []
for row in rows:
    cols = row.find_all('td')
    #keep the non-empty children of the final cell
    data.append(list(filter(None, cols[-1])))

total = sum(int(item) for sublist in data for item in sublist)
total

13

### Grab Traditional Stats Table

In [86]:
#team stats box (traditional metrics): one list of cell texts per table row
table1 = soup.find('table', class_="add_controls stats_table")
table_body1 = table1.find('tbody')
rows1 = table_body1.find_all('tr')

data1 = []
for row in rows1:
    cols = row.find_all('td')
    data1.append(list(map(lambda ele: ele.text, filter(None, cols))))

data1

[['13', '16'],
 ['22-47-0', '15-46-0'],
 ['18-30-203-1-0', '26-45-228-0-1'],
 ['5-37', '5-20'],
 ['166', '208'],
 ['213', '254'],
 ['1-0', '0-0'],
 ['0', '1'],
 ['10-71', '10-107'],
 ['2-12', '3-15'],
 ['0-0', '0-2'],
 ['31:03', '28:57']]

### Process Traditional Stats Table

In [87]:
#first downs: add up every cell of the first stats row
total_first_downs = sum(map(int, data1[0]))
print(total_first_downs)

29


In [88]:
#rushing: split each team's dash-joined line, then add the two teams position by position
rush_stats = list(map(lambda team_line: team_line.split('-'), data1[1]))
total_rush_att = sum(int(rush_stats[team][0]) for team in (0, 1))
print(total_rush_att)
total_rush_yds = sum(int(rush_stats[team][1]) for team in (0, 1))
print(total_rush_yds)
total_rush_tds = sum(int(rush_stats[team][2]) for team in (0, 1))
print(total_rush_tds)

37
93
0


In [101]:
#passing: split each team's dash-joined line, then add the two teams position by position
pass_stats = list(map(lambda team_line: team_line.split('-'), data1[2]))
#print(pass_stats)
total_comp = sum(int(pass_stats[team][0]) for team in (0, 1))
print(total_comp)
total_att = sum(int(pass_stats[team][1]) for team in (0, 1))
print(total_att)
total_pass_yds = sum(int(pass_stats[team][2]) for team in (0, 1))
print(total_pass_yds)
total_pass_tds = sum(int(pass_stats[team][3]) for team in (0, 1))
print(total_pass_tds)
total_int = sum(int(pass_stats[team][4]) for team in (0, 1))
print(total_int)

44
75
431
1
1


In [90]:
#sacks: split each team's dash-joined line, then add the two teams position by position
sack_stats = list(map(lambda team_line: team_line.split('-'), data1[3]))
#print(sack_stats)
total_sacks = sum(int(sack_stats[team][0]) for team in (0, 1))
print(total_sacks)
total_sack_yds = sum(int(sack_stats[team][1]) for team in (0, 1))
print(total_sack_yds)

10
57


In [91]:
#net pass yards & total yards: plain numbers, so just add the two teams' cells
total_net_pass_yds = sum(int(data1[4][team]) for team in (0, 1))
print(total_net_pass_yds)
total_tot_yds = sum(int(data1[5][team]) for team in (0, 1))
print(total_tot_yds)

374
467


In [92]:
#fumbles: split each team's dash-joined line, then add the two teams position by position
fum_stats = list(map(lambda team_line: team_line.split('-'), data1[6]))
print(fum_stats)
total_fum = sum(int(fum_stats[team][0]) for team in (0, 1))
print(total_fum)
total_fum_l = sum(int(fum_stats[team][1]) for team in (0, 1))
print(total_fum_l)

[['1', '0'], ['0', '0']]
1
0


In [93]:
#turnovers: plain numbers, so just add the two teams' cells
total_to = sum(int(data1[7][team]) for team in (0, 1))
print(total_to)

1


In [94]:
#penalties: split each team's dash-joined line, then add the two teams position by position
pen_stats = list(map(lambda team_line: team_line.split('-'), data1[8]))
print(pen_stats)
total_pen = sum(int(pen_stats[team][0]) for team in (0, 1))
print(total_pen)
total_pen_yds = sum(int(pen_stats[team][1]) for team in (0, 1))
print(total_pen_yds)

[['10', '71'], ['10', '107']]
20
178


In [95]:
#3rd downs: split each team's dash-joined line, add the teams, then derive the combined rate
third_dn_stats = list(map(lambda team_line: team_line.split('-'), data1[9]))
print(third_dn_stats)
total_third_conv = sum(int(third_dn_stats[team][0]) for team in (0, 1))
print(total_third_conv)
total_thirds = sum(int(third_dn_stats[team][1]) for team in (0, 1))
print(total_thirds)
total_third_per = round(total_third_conv*100/total_thirds, 2)
print(total_third_per)

[['2', '12'], ['3', '15']]
5
27
18.52


In [96]:
#separate 4th downs & make appropriate transformations
fourth_dn_stats = [item.split('-') for item in data1[10]]
print(fourth_dn_stats)
total_fourth_conv = int(fourth_dn_stats[0][0]) + int(fourth_dn_stats[1][0])
print(total_fourth_conv)
total_fourths = int(fourth_dn_stats[0][1]) + int(fourth_dn_stats[1][1])
print(total_fourths)
#a game can have zero 4th-down attempts; report None instead of dividing by zero
#(same rule game_stat_dict applies)
if total_fourths != 0:
    total_fourth_per = round(100*total_fourth_conv/total_fourths,2)
else:
    total_fourth_per = None
print(total_fourth_per)

[['0', '0'], ['0', '2']]
0
2
0.0


In [127]:
#demonstrate the zero-attempt case: with no 4th downs the percentage is None, not a division error
total_fourth_conv = 0
total_fourths = 0
total_fourth_per = round(100*total_fourth_conv/total_fourths,2) if total_fourths != 0 else None
print(total_fourth_per)

None


### Grab Efficiency Stats Table

In [97]:
#expected-points table (efficiency metrics): one list of cell texts per table row
table2 = soup.find('table', id='expected_points')
table_body2 = table2.find('tbody')
rows2 = table_body2.find_all('tr')

data2 = []
for row in rows2:
    cols = row.find_all('td')
    data2.append(list(map(lambda ele: ele.text, filter(None, cols))))

data2

[['-7.00',
  '-20.02',
  '-14.69',
  '-4.34',
  '-3.36',
  '9.57',
  '-2.26',
  '10.15',
  '0.00',
  '-2.01',
  '-2.21',
  '1.83',
  '4.70',
  '-6.31',
  '-0.02'],
 ['7.00',
  '-9.57',
  '2.26',
  '-10.15',
  '0.00',
  '20.02',
  '14.69',
  '4.34',
  '3.36',
  '2.01',
  '-1.83',
  '2.21',
  '6.31',
  '-4.70',
  '0.02']]

### Process Efficiency Stats Table

In [109]:
#grab relevant OFFENSIVE values & make appropriate transformations (defensive numbers are flipped sign)
#margin: magnitude of the first cell of the first row, truncated to an int
scoring_margin = int(abs(float(data2[0][0])))
print(scoring_margin)
#columns 1-4 are added across both rows and rounded to 2 places
total_off_epa = round(float(data2[0][1]) + float(data2[1][1]),2)
print(total_off_epa)
total_pass_epa = round(float(data2[0][2]) + float(data2[1][2]),2)
print(total_pass_epa)
total_rush_epa = round(float(data2[0][3]) + float(data2[1][3]),2)
print(total_rush_epa)
total_to_epa = round(float(data2[0][4]) + float(data2[1][4]),2)
print(total_to_epa)
#column 9 is taken from the first row only, as a magnitude, and is not rounded
total_spec_epa = abs(float(data2[0][9]))
print(total_spec_epa)

7
-29.59
-12.43
-14.49
-3.36
2.01


## Scratch Work Graveyard

Circle back for:
- Play-by-play table to grab EPA's
- Another source for other advanced stats (CPOE)
- QB specific stats

In [None]:
#Play-by-play table (scratch; will circle back if time permits).
#Uses its own pbp_* names so it no longer reads the linescore `table` or appends to `data1`.
#NOTE(review): `soup` is expected to come from the "Workable Page" cell, which already strips the
#HTML comment markers. The class string below may include classes that are not present in the raw
#HTML; if nothing matches, an id-based lookup is worth trying - TODO confirm against the page source.
pbp_data = []
pbp_table = soup.find('table', class_="sortable stats_table now_sortable sticky_table eq1 eq2 re2 le1")
if pbp_table is None:
    print('play-by-play table not found')
else:
    #find() takes the bare tag name; '<tbody>' never matches anything
    pbp_body = pbp_table.find('tbody')
    pbp_rows = pbp_body.find_all('tr') if pbp_body is not None else pbp_table.find_all('tr')
    for pbp_row in pbp_rows:
        pbp_cells = [ele.text.strip() for ele in pbp_row.find_all('td')]
        pbp_data.append([ele for ele in pbp_cells if ele]) # Get rid of empty values
pbp_data

In [None]:
##Trying to grab totals from last row of Scoring Plays Table
#start with first game to get a sense of basic structure
#response = requests.get(url_list[0])
#page = response.text
#soup = BeautifulSoup(page, 'html.parser')
#Basic form to scrape any table
#data = []
#table = soup.find('table', attrs={'id':'scoring'})
#table_body = table.find('tbody')

#rows = table_body.find_all('tr')
#for row in rows:
 #   cols = row.find_all('td')
 #   cols = [ele.text.strip() for ele in cols]
 #   data.append([ele for ele in cols if ele]) # Get rid of empty values
#rows

In [None]:
#Attempting to use the Comment class from the bs4 package (didn't work: Comment was reported as undefined because only BeautifulSoup is imported at the top; it needs `from bs4 import Comment`)
#to_remove = soup.find_all(text=Comment) 
#for element in to_remove: 
#    element.extract()

#for comments in soup.findAll(text=lambda text:isinstance(text, Comment)):
#    print(comments)
#    comments.extract()

#comments = soup.find_all(string=lambda text: isinstance(text, Comment))
#for c in comments:
#    print(c)
#    print("===========")
#    c.extract()
#soup