In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from collections import defaultdict
from functools import partial, reduce
import pickle
path = !pwd

In [2]:
def soup_from_url(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup (lxml).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        BeautifulSoup: document parsed from the response text.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or a 4xx/5xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an error status instead of parsing an error page and
    # crashing later on a missing <tbody>/<table>.
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")

In [3]:
# Build the list of (team name, relative link) pairs from the franchise index
soup = soup_from_url('https://www.pro-football-reference.com/teams')
name_cells = soup.find_all(attrs={'data-stat': 'team_name'})
# Skip the first matching cell, then keep only the cells that wrap an <a> tag
team_links = [anchor for anchor in (cell.a for cell in name_cells[1:]) if anchor]
teams = [(anchor.text, anchor['href']) for anchor in team_links]

In [46]:
# Collect per-season summary stats for every team: dd[team name][year] -> stats
url = 'https://www.pro-football-reference.com{}'
dd = defaultdict(dict)
for t in teams:
    team_name, team_href = t
    soup = soup_from_url(url.format(team_href))
    season_rows = soup.tbody.find_all('tr')[1:12]
    # The selected rows are labelled 2016, 2015, ... in the order they appear
    for i, row in enumerate(season_rows):
        year = 2016 - i
        stat_pairs = [(cell['data-stat'], cell.text) for cell in row.find_all('td')]
        # The first two cells of each row are left out of the stored stats
        dd[team_name][year] = dict(stat_pairs[2:])

In [58]:
# Persist the team summary mapping (dd) to team_info.pkl
with open('team_info.pkl', 'wb') as summary_file:
    pickle.dump(dd, summary_file)

In [62]:
# Scrape penalty tables for seasons 2006-2016: d[team][year] -> penalty stats
d = defaultdict(partial(defaultdict, dict))
penalty_keys = [
    'Number of Penalties',
    'Yards Penalized',
    'Delay of Game',
    'False Start',
    'Offensive Holding',
    'Offensive Pass Interference',
    'Offsides',
    'Defensive Holding',
    'Illegal Contact',
    'Defensive Pass Interference',
    'Roughing the Passer',
    'Personal Foul',
    'Unnecessary Roughness',
    'Unsportsmanlike Conduct',
    'Other',
]
for year in range(2006, 2017):
    page_url = 'http://www.footballdb.com/stats/penalties.html?yr={}'.format(year)
    soup = soup_from_url(page_url)
    for row in soup.tbody.find_all('tr'):
        # The first cell is skipped; the rest are paired with penalty_keys in order
        value_cells = row.find_all('td')[1:]
        penalties = [cell.text for cell in value_cells]
        # The text of the row's first link is used as the team key
        d[row.a.text][year] = dict(zip(penalty_keys, penalties))

In [64]:
# Persist the penalty mapping (d) to penalties.pkl
with open('penalties.pkl', 'wb') as penalties_file:
    pickle.dump(d, penalties_file)

In [6]:
#Helper methods for ratings parsing
def merge_2nd_and_3rd(l):
    """Return a new list where the items at index 2 and 3 are joined into one string.

    The input list is not modified. Items before index 2 and from index 4
    onward are carried over unchanged.
    """
    head, middle, tail = l[:2], l[2:4], l[4:]
    return head + [''.join(middle)] + tail


def is_week(s):
    """Return True when ``s`` starts with the text ``WEEK``, False otherwise."""
    matched = re.match('WEEK.*', s)
    return matched is not None

def is_date(s):
    """Return True when ``s`` starts like ``<word>, <word> <1-2 digits>``.

    Example of a matching string: ``"Sunday, September 7"``.
    """
    date_pattern = re.compile(r'\w*, \w* \d\d?')
    return date_pattern.match(s) is not None

In [208]:
# 2016 ratings: flatten each table row into its text fragments and tag it with its week
soup = soup_from_url('http://www.sportsmediawatch.com/nfl-tv-ratings-viewership-nbc-cbs-fox-espn-nfln-regular-season-playoffs/2')
l2 = [re.split('#', tr.get_text(separator='#')) for tr in soup.tbody.find_all('tr')]
l = []
week = None
for row in l2[2:]:
    n_cells = len(row)
    # Short rows whose first cell starts with "WEEK" are section headers
    if 1 <= n_cells <= 3 and is_week(row[0]):
        week = row[0]
    # Rows seen before the first week header are ignored (skips playoff games)
    if not week:
        continue
    if n_cells not in (9, 10):
        continue
    row.append(week)
    # 10-cell rows have cells 2 and 3 merged so every stored row has the same width
    l.append(row if n_cells == 9 else merge_2nd_and_3rd(row))
# The last column holds the week header text, stored under the label 'Date'
labels = ['Window', 'Game', 'Net', 'Rtg.', '+/-', 'Vwrs.', '+/-', 'Strm.', '18-49', 'Date']
df = pd.DataFrame(l, columns=labels)
df.to_csv('ratings_2016.csv')

In [209]:
# Preview up to the first 100 rows of the frame built in the previous cell
df.head(n=100)

Unnamed: 0,Window,Game,Net,Rtg.,+/-,Vwrs.,+/-.1,Strm.,18-49,Date
0,SNF,GB/DET,NBC,13.0,-6%,23.8M,-2%,—,—,WEEK SEVENTEEN
1,Late DH,NYG/WSH (76%),FOX,12.3,+3%,22.4M,+7%,—,7.4,WEEK SEVENTEEN
2,Early DH,DAL/PHI (79%),FOX,9.65,+7%,17.0M,+13%,—,—,WEEK SEVENTEEN
3,Early DH,MIA/NE (61%),CBS,9.2,-18%,16.6M,-11%,—,—,WEEK SEVENTEEN
4,Late DH,Regional action,CBS,6.8,-34%,12.1M,-31%,—,—,WEEK SEVENTEEN
5,SNF,KC/DEN,NBC,10.6,+2%,21.4M,+20%,81K,6.4,WEEK SIXTEEN
6,MNF,DET/DAL,ESPN,10.1,+7%,18.6M,+18%,260K,5.8,WEEK SIXTEEN
7,Early DH,GB/MIN (71%),FOX,10.0,+8%,18.4M,+13%,—,—,WEEK SIXTEEN
8,TNF**,NYG/PHI,"NBC, NFLN",10.6,+194%,17.9M,+154%,350K,5.1,WEEK SIXTEEN
9,Late DH,TB/NO (66%),FOX,8.1,-48%,15.4M,-44%,—,—,WEEK SIXTEEN


In [12]:
# 2015 Ratings
soup = soup_from_url('http://www.sportsmediawatch.com/nfl-tv-ratings-viewership-nbc-cbs-fox-espn-nfln-regular-season-playoffs/3')
l = []
l2 = [re.split('#', x.get_text(separator='#')) for x in soup.tbody.find_all('tr')]
# Start with no week so this cell never picks up a stale `week` left behind by
# another cell, and never hits a NameError on a fresh kernel.
week = None
for row in l2:
    # Single-cell rows are treated as week headers; store them title-cased
    if len(row) == 1:
        week = row[0].title()
    # Eight-cell rows are kept: drop the cell at index 1 and append the current week
    if len(row) == 8:
        row.pop(1)
        row.append(week)
        l.append(row)
# The last column holds the week header text, stored under the label 'Date'
labels = ['Game', 'Window', 'Net', 'Rtg.', '+/-', 'Vwrs.', '+/-', 'Date']
df = pd.DataFrame(l, columns=labels)
df.to_csv(path[0] + '/ratings_2015.csv')

In [13]:
# Preview up to the first 100 rows of the frame built in the previous cell
df.head(n=100)

Unnamed: 0,Game,Window,Net,Rtg.,+/-,Vwrs.,+/-.1,Date
0,Steelers/Patriots,NFL Kickoff,NBC,16.2,+3%,27.4M,+1%,Week One
1,Giants/Cowboys,Sun. Night Football,NBC,15.4,+10%,26.8M,+12%,Week One
2,Ravens/Broncos (80%),Doubleheader Gm. 2,CBS,13.5,-14%,23.3M,-17%,Week One
3,Packers/Bears (47%),Singleheader,FOX,11.6,+17%,20.1M,+22%,Week One
4,Regional action,Doubleheader Gm. 1,CBS,—,—,14.9M,+6%,Week One
5,Vikings/49ers,Mon. Night Football,ESPN,8.8,+24%,14.3M,+24%,Week One
6,Eagles/Falcons,Mon. Night Football,ESPN,8.5,+1%,13.6M,-1%,Week One
7,Cowboys/Eagles,Doubleheader Gm. 2,FOX,15.5,+8%,27.2M,+9%,Week Two
8,Seahawks/Packers,Sun. Night Football,NBC,15.1,+17%,26.4M,+19%,Week Two
9,Broncos/Chiefs,Thu. Night Football,"CBS, NFLN",12.9,+2%,21.1M,+2%,Week Two


In [15]:
#2014 Ratings
soup = soup_from_url('http://www.sportsmediawatch.com/nfl-tv-ratings-viewership-nbc-cbs-fox-espn-nfln-regular-season-playoffs/4/')
l2 = [re.split('#', x.get_text(separator='#')) for x in soup.tbody.find_all('tr')]
# Extend the first row so it has the same width as the dated rows below
l2[0].append('Date')
# Start with no date so rows seen before the first date header do not raise a
# NameError or silently reuse a `date` left over from an earlier run.
date = None
for row in l2[1:]:
    # Rows whose first cell looks like "Sunday, September 7" set the current date
    if is_date(row[0]):
        date = row[0]
        continue
    # Only wide rows get the current date appended
    if len(row) > 6:
        row.append(date)
# Drop every short row (date headers and other non-game rows)
l2 = [x for x in l2 if len(x) > 6]
df = pd.DataFrame(l2[1:], columns=['Window', 'Game', 'Net', 'Rtg.', '+/-', 'Vwrs.', '+/-', 'Date'])
# Keep the first 107 rows only - presumably the regular season; TODO confirm
df = df[:107]
df.to_csv(path[0] + '/ratings_2014.csv')
df.head(20)

Unnamed: 0,Window,Game,Net,Rtg.,+/-,Vwrs.,+/-.1,Date
0,Kickoff,Packers/Seahawks,NBC,15.5,+4%,26.91M,+7%,"Thursday, September 4"
1,National,49ers/Cowboys (89%),FOX,15.7,-5%,28.00M,-2%,"Sunday, September 7"
2,SNF,Colts/Broncos,NBC,13.9,-8%,23.69M,-7%,"Sunday, September 7"
3,Single,Regional action,CBS,9.9,+4%,16.50M,+6%,"Sunday, September 7"
4,Regional,Saints/Falcons (49%),FOX,8.2,-8%,14.10M,-4%,"Sunday, September 7"
5,MNF,Giants/Lions,ESPN,8.4,-17%,13.73M,-17%,"Monday, September 8"
6,MNF,Cardinals/Chargers,ESPN,7.1,even,11.52M,+5%,"Monday, September 8"
7,TNF,Steelers/Ravens,"CBS, NFLN",12.7,+84%,20.77M,+89%,"Thursday, September 11"
8,National,Chiefs/Broncos (56%),CBS,14.4,-9%,24.94M,-6%,"Sunday, September 14"
9,SNF,Bears/49ers,NBC,12.9,+6%,22.16M,+8%,"Sunday, September 14"


In [21]:
# Injuries Data
# Walk through every page of the search results and store the cell text of each row.

data = []
url = 'http://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2006-09-01&EndDate=2017-09-28&ILChkBx=yes&submit=Search'
page_indices = list(range(25, 10501, 25))
pre_url = "http://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=2006-09-01&EndDate=2017-09-28&ILChkBx=yes&submit=Search&start={}"
# The first results page (no start offset) used to be fetched and then thrown
# away, so its rows never reached `data`. It is now processed with the
# offset pages that follow it.
page_urls = [url] + [pre_url.format(i) for i in page_indices]
for url in page_urls:
    soup = soup_from_url(url)
    # Every <tr> of the page's first table is stored, one list of cell texts per row
    for row in soup.table.find_all('tr'):
        data.append([x.text for x in row.find_all('td')])
df = pd.DataFrame(data)
df.to_csv(path[0] + '/injuries.csv')

***Getting the data for scores by year***
        

In [261]:
# Scrape the 2016 weekly results pages (weeks 1-17) into a flat list of game tuples
base_url = 'https://www.pro-football-reference.com/years/2016/week_{}.htm'
l = []
for week in range(1, 18):
    url = base_url.format(week)
    soup = soup_from_url(url)
    # One entry per element with class "teams"; each entry is a list of per-cell text fragments
    l2 = []
    for game_table in soup.find_all(class_='teams'):
        cells = [re.split('#', td.get_text(separator='#')) for td in game_table.find_all('td')]
        l2.append(cells)
    # Take the first fragment of cells 0, 1, 2, 4 and 5; cell 3 is not used
    l.extend([(week, g[0][0], g[1][0], g[2][0], g[4][0], g[5][0]) for g in l2])

In [262]:
# Tabulate the scraped 2016 games, save them as CSV, then preview the result.
games_2016 = pd.DataFrame(l,columns=['week', 'date', 'team_1', 'team_1_score', 'team_2', 'team_2_score'])
games_2016.to_csv('games_2016.csv')
# The preview must be the cell's last expression to be rendered; placed before
# to_csv its result was silently discarded.
games_2016.head(20)

In [263]:
# Scrape the 2015 weekly results pages (weeks 1-17) into a flat list of game tuples
base_url = 'https://www.pro-football-reference.com/years/2015/week_{}.htm'
l = []
for week in range(1, 18):
    url = base_url.format(week)
    soup = soup_from_url(url)
    game_tables = soup.find_all(class_='teams')
    # Each game becomes a list of cells, each cell a list of its text fragments
    l2 = [
        [re.split('#', td.get_text(separator='#')) for td in game_table.find_all('td')]
        for game_table in game_tables
    ]
    # Take the first fragment of cells 0, 1, 2, 4 and 5; cell 3 is not used
    week_games = [(week, g[0][0], g[1][0], g[2][0], g[4][0], g[5][0]) for g in l2]
    l.extend(week_games)

In [264]:
# Tabulate the scraped 2015 games and save them as CSV
game_columns = ['week', 'date', 'team_1', 'team_1_score', 'team_2', 'team_2_score']
games_2015 = pd.DataFrame(l, columns=game_columns)
games_2015.to_csv('games_2015.csv')

In [265]:
# Scrape the 2014 weekly results pages (weeks 1-17) into a flat list of game tuples
base_url = 'https://www.pro-football-reference.com/years/2014/week_{}.htm'
l = []
for week in range(1, 18):
    url = base_url.format(week)
    soup = soup_from_url(url)
    # One entry per element with class "teams"; each entry is a list of per-cell text fragments
    l2 = []
    for game_table in soup.find_all(class_='teams'):
        cell_texts = [td.get_text(separator='#') for td in game_table.find_all('td')]
        l2.append([re.split('#', text) for text in cell_texts])
    # Take the first fragment of cells 0, 1, 2, 4 and 5; cell 3 is not used
    l.extend([(week, g[0][0], g[1][0], g[2][0], g[4][0], g[5][0]) for g in l2])


In [266]:
# Tabulate the scraped 2014 games and save them as CSV
game_columns = ['week', 'date', 'team_1', 'team_1_score', 'team_2', 'team_2_score']
games_2014 = pd.DataFrame(l, columns=game_columns)
games_2014.to_csv('games_2014.csv')

In [16]:
# Scrape the 2013 weekly results pages (weeks 1-17) into a flat list of game tuples
base_url = 'https://www.pro-football-reference.com/years/2013/week_{}.htm'
l = []
for week in range(1, 18):
    url = base_url.format(week)
    soup = soup_from_url(url)
    game_tables = soup.find_all(class_='teams')
    # Each game becomes a list of cells, each cell a list of its text fragments
    l2 = [
        [re.split('#', td.get_text(separator='#')) for td in game_table.find_all('td')]
        for game_table in game_tables
    ]
    # Take the first fragment of cells 0, 1, 2, 4 and 5; cell 3 is not used
    week_games = [(week, g[0][0], g[1][0], g[2][0], g[4][0], g[5][0]) for g in l2]
    l.extend(week_games)



In [18]:
# Tabulate the scraped 2013 games and save them as CSV
game_columns = ['week', 'date', 'team_1', 'team_1_score', 'team_2', 'team_2_score']
games_2013 = pd.DataFrame(l, columns=game_columns)
games_2013.to_csv('games_2013.csv')