### Sourcing/scraping data
* identify data sources
* scrape data
    * [x] Injuries: https://www.pro-football-reference.com/teams/nyg/2011_injuries.htm
    * [x] Weekly DVOA: https://www.footballoutsiders.com/dvoa-ratings/2011/week-1-dvoa-ratings
    * [x] Historical lines/scores: http://www.sportsbookreviewsonline.com/scoresoddsarchives/nfl/nfloddsarchives.htm
    * [x] Play-by-play?: used F-REF (needs to be parsed) but backup link: http://nflsavant.com/about.php
    * [x] Historical weather: used F-REF but backup link: http://www.nflweather.com/en/week/2014/Week-5/
    * [x] Snap counts (could be helpful with injuries): https://www.footballoutsiders.com/stats/snapcounts
    * [x] Head coach info (# wins, winning %, age)
    * [x] Weekly stats -- player/team: https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2017&year_max=2017&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=2&week_num_max=2&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1
    * [x] division/conference
    * [x] travel data between teams, or timezone of location (timezone change and direction of change)
    * [ ] timezones
    * NFL API: http://www.nfl.com/liveupdate/game-center/2016010310/2016010310_gtd.json



In [1]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

In [157]:
from __future__ import division
from bs4 import BeautifulSoup
from collections import OrderedDict
from datetime import date
from datetime import datetime
import json
import math
import os
from pandas import *
import requests
from selenium import webdriver
from time import sleep
import wikipedia

import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

options.display.max_rows = 500

In [158]:
class ScrapeTools:
    '''Open a web page with selenium and hand back its HTML as a
    BeautifulSoup object for data extraction.'''
    def __init__(self):
        pass

    def get_driver(self):
        '''Start an incognito Chrome session using the chromedriver
        binary at /tmp/chromedriver and return the driver.'''
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--incognito")

        return webdriver.Chrome('/tmp/chromedriver'
                                  , chrome_options=chrome_options)

    def get_soup(self, url, driver=None):
        '''Load ``url`` and return the page source parsed with lxml.

        url: address of the page to load.
        driver: an already running selenium driver to reuse. When it is
            omitted a temporary driver is started for this single call
            and shut down again before returning, so repeated calls do
            not pile up browser processes. A driver passed in by the
            caller is never closed here.
        '''
        owns_driver = not driver
        if owns_driver:
            driver = self.get_driver()
        try:
            driver.get(url)
            page = driver.page_source
        finally:
            # only tear down what this call created
            if owns_driver:
                driver.quit()
        return BeautifulSoup(page, 'lxml')

In [1]:
DATA_DIR = '../football_data/'

### Team ID Mapping for pro football-reference:
* https://www.pro-football-reference.com/teams/

In [194]:
# Build the mapping from PFR's short team id to the full team name
team_scraper = ScrapeTools()
driver = team_scraper.get_driver()
url = 'https://www.pro-football-reference.com/teams/'
soup = team_scraper.get_soup(url, driver)

pfr_teams = {}
for table in soup.findAll('table', {'id': 'teams_active'}):
    body = table.findAll('tbody')[0]
    for row in body.findAll('tr'):
        for header in row.findAll('th'):
            for link in header.findAll('a'):
                # the third '/'-separated piece of the href is the team id
                team_id = link.attrs['href'].split('/')[2]
                pfr_teams[team_id] = link.text

In [195]:
pfr_teams

{'atl': u'Atlanta Falcons',
 'buf': u'Buffalo Bills',
 'car': u'Carolina Panthers',
 'chi': u'Chicago Bears',
 'cin': u'Cincinnati Bengals',
 'cle': u'Cleveland Browns',
 'clt': u'Indianapolis Colts',
 'crd': u'Arizona Cardinals',
 'dal': u'Dallas Cowboys',
 'den': u'Denver Broncos',
 'det': u'Detroit Lions',
 'gnb': u'Green Bay Packers',
 'htx': u'Houston Texans',
 'jax': u'Jacksonville Jaguars',
 'kan': u'Kansas City Chiefs',
 'mia': u'Miami Dolphins',
 'min': u'Minnesota Vikings',
 'nor': u'New Orleans Saints',
 'nwe': u'New England Patriots',
 'nyg': u'New York Giants',
 'nyj': u'New York Jets',
 'oti': u'Tennessee Titans',
 'phi': u'Philadelphia Eagles',
 'pit': u'Pittsburgh Steelers',
 'rai': u'Oakland Raiders',
 'ram': u'Los Angeles Rams',
 'rav': u'Baltimore Ravens',
 'sdg': u'Los Angeles Chargers',
 'sea': u'Seattle Seahawks',
 'sfo': u'San Francisco 49ers',
 'tam': u'Tampa Bay Buccaneers',
 'was': u'Washington Redskins'}

### Injuries on PF-R

In [196]:
def get_injury_team_year(team, year, driver):
    '''Scrape the football-reference injury report of one team-season.

    team: PFR team id used in the url (e.g. 'nyg').
    year: season used in the url.
    driver: selenium driver passed on to ScrapeTools.get_soup.

    Returns a DataFrame with one row per table cell (player x week)
    holding pid, status, dnp, inj, name, team, week and year.
    Returns None as soon as a row header without a
    ``data-append-csv`` attribute is met.
    '''
    url = 'https://www.pro-football-reference.com/teams/{}/{}_injuries.htm'\
                .format(team, year)
    soup = ScrapeTools().get_soup(url, driver)

    container = soup.findAll('div', {'id':'div_team_injuries'})[0]
    body = container.findAll('tbody')[0]

    def get_status(cell):
        '''[status, dnp] derived from the css classes after the first two'''
        extra = cell.attrs['class'][2:]
        if not extra:
            return ['A', False]
        return [extra[0], len(extra) > 1]

    def get_injury(cell):
        '''text after the last ": " of the data-tip attribute, or ""'''
        if 'data-tip' not in cell.attrs:
            return ''
        return cell.attrs['data-tip'].split(': ')[-1]

    players = {}
    for row in body.findAll('tr'):
        for th in row.findAll('th'):
            if 'data-append-csv' not in th.attrs:
                return None
            pid = th.attrs['data-append-csv']
            name = th.text
        for cell in row.findAll('td'):
            status, dnp = get_status(cell)

            # the week number is the last '_'-separated piece of data-stat
            week = int(cell.attrs['data-stat'].split('_')[-1])
            inj = get_injury(cell)
            # records are keyed by a running counter 0, 1, 2, ...
            players[len(players)] = {
                'pid': pid,
                'status': status,
                'dnp': dnp,
                'inj': inj,
                'name': name,
                'team': team,
                'week': week,
                'year': year
            }

    return DataFrame.from_dict(players, orient='index')

In [201]:
# Scrape every team for the requested seasons. The per-team frames are
# collected in a list and stacked once at the end.
try:
    # keep the rows gathered by an earlier run of this notebook, if any
    frames = [injuries]
except NameError:
    # fresh session: nothing to extend yet
    frames = []

for team in pfr_teams.keys():
    for year in np.arange(2018,2019):
        df = get_injury_team_year(team, year, driver)
        # None means the page had no usable injury table
        if df is not None:
            frames.append(df)

if frames:
    injuries = concat(frames).reset_index(drop=True)

In [202]:
print injuries.shape
injuries.head()

(167208, 8)


Unnamed: 0,status,week,dnp,inj,team,year,pid,name
0,A,1,False,,Los Angeles Rams,2011,AhYoC.00,C.J. Ah You
1,out,2,True,wrist,Los Angeles Rams,2011,AhYoC.00,C.J. Ah You
2,questionable,3,True,wrist,Los Angeles Rams,2011,AhYoC.00,C.J. Ah You
3,A,4,False,,Los Angeles Rams,2011,AhYoC.00,C.J. Ah You
4,A,6,False,,Los Angeles Rams,2011,AhYoC.00,C.J. Ah You


In [203]:
# Translate PFR team ids into full team names. Values that are not a
# known id (e.g. rows that were already translated) are kept unchanged
# instead of being turned into NaN by a plain dict map.
injuries['team'] = injuries['team'].map(lambda code: pfr_teams.get(code, code))

In [204]:
# the original string had no '{}' placeholder, so DATA_DIR was ignored
injuries.to_csv('{}/all_injuries.csv'.format(DATA_DIR))

### Historical Lines:
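* SBR's data format is awkward: the spread and the over/under (O/U) are not stored in fixed columns
 * ML (money line) indicates who is the favorite (unclear whether it is the opening or the closing line)
 * for 2H, the lower of the two values is the spread; the higher is the O/U
 * open/close have 4 combos (either row can hold the spread or the O/U): take the max open and the max close as the O/U; the min is the spread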

In [5]:
# Load the 2017-18 odds sheet straight from SBR; reset_index gives the
# rows a plain 0..n-1 index
df = read_excel('http://www.sportsbookreviewsonline.com/scoresoddsarchives/nfl/nfl%20odds%202017-18.xlsx')\
        .reset_index(drop=True)
df.head()

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H
0,907,451,V,KansasCity,7,7,7,21,42,48,47.5,371,25.5
1,907,452,H,NewEngland,7,10,10,0,27,7,8.0,-480,4.5
2,910,453,V,NYJets,0,6,6,0,12,43,42.0,317,21.0
3,910,454,H,Buffalo,0,7,7,7,21,6,7.0,-400,5.0
4,910,455,V,Atlanta,3,7,3,10,23,7,6.5,-319,4.5


#### Check that it alternates Visitor-Home

In [99]:
# expose the row position as an 'index' column
df = df.reset_index(drop=False)
# expected pattern: even rows are the visitor (V), odd rows the home team (H)
df['exp'] = df['index'].apply(lambda x: 'V' if x % 2 == 0 else 'H')
# show the rows that break the expected pattern
df[df['exp'] != df['VH']] # neutral location

Unnamed: 0,index,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,exp
532,532,204,101,N,Philadelphia,9,13,7,12,41,47.5,49.5,170,27.5,V
533,533,204,102,N,NewEngland,3,9,14,7,33,6.0,4.0,-200,7.0,H


In [160]:
def process_lines(df, season):
    '''Reshape one season of SBR odds into one row per game.

    The raw sheet stores every game as two consecutive rows: the
    visitor first, the home team second. For Open, Close and 2H each
    row holds one number; within a game the smaller of the two numbers
    is treated as the spread (and its row as the favorite) and the
    larger one as the over/under.

    df: raw season sheet with the columns Date, VH, Team, 1st, 2nd,
        3rd, 4th, Final, Open, Close, ML and 2H. Rows are paired by
        position, whatever the index labels are. The frame itself is
        not modified.
    season: value written to the Season column of the result.

    Returns a DataFrame with one row per game, indexed by the game
    number (0.0, 1.0, ...).

    Raises AssertionError when a derived spread is not strictly smaller
    than the matching over/under.
    '''
    def clean_pk(x):
        '''takes in a value that is either a numeric or string.
        if it cannot be read as a number it is a pickem (value==pk):
        return 0. otherwise, return the value as float'''
        try:
            return float(x)
        except (TypeError, ValueError):
            return 0.

    def ml_fav(h_ml):
        '''if the home money line is negative, the home team is
        favored. if it is 100, it is a coinflip'''
        if h_ml == 100:
            return 'pickem'
        elif h_ml < 0:
            return 'H'
        else:
            return 'V'

    def spread_fav(home, vis):
        '''the smaller number between home/visitor belongs to the
        favorite and is the spread; the larger one is the O/U.
        returns [favorite, O/U, spread]'''
        if home < vis:
            return ['H', vis, home]
        return ['V', home, vis]

    # (output suffix, raw column) copied as-is for both teams
    plain_cols = [('Q1', '1st'), ('Q2', '2nd'), ('Q3', '3rd'),
                  ('Q4', '4th'), ('Final', 'Final'), ('ML', 'ML')]
    # (output suffix, raw column) that may contain a pickem marker
    bet_cols = [('open', 'Open'), ('close', 'Close'), ('2h', '2H')]

    # pair rows by position on a private copy of the input
    frame = df.reset_index(drop=True)

    games_dict = {}
    for pos in range(frame.shape[0]):
        game_id = np.floor(pos / 2.)
        row = frame.loc[pos]

        if pos % 2 == 0:
            ## visitor row opens a new game
            upper, lower = 'V', 'v'
            games_dict[game_id] = {
                'Date': row['Date'],
                'Neutral': row['VH'] == 'N'
            }
        else:
            ## home row completes it
            upper, lower = 'H', 'h'

        game = games_dict[game_id]
        game['{}_Team'.format(upper)] = row['Team']
        for suffix, raw_name in plain_cols:
            game['{}_{}'.format(upper, suffix)] = row[raw_name]
        for suffix, raw_name in bet_cols:
            # 2H is cleaned like Open/Close so a pickem never reaches
            # the numeric comparison below as a string
            game['{}_{}'.format(lower, suffix)] = clean_pk(row[raw_name])

    games = DataFrame.from_dict(games_dict, orient='index')
    # ML will indicate who is the favorite (open or close?)
    games['ML_Fav'] = games['H_ML'].map(ml_fav)
    # favorite is the one with the lower number (the spread)
    for bet_type, bet_name in bet_cols:
        info = [
            spread_fav(home, vis)
            for home, vis in zip(games['h_{}'.format(bet_type)],
                                 games['v_{}'.format(bet_type)])
        ]
        for i, field_name in enumerate(['Fav', 'OU', 'Spread']):
            games['{}_{}'.format(bet_name, field_name)] = \
                [item[i] for item in info]

    games['Season'] = season
    games = games[[
        'Season','Date','H_Team','V_Team','H_Final','V_Final','H_ML','V_ML','ML_Fav',
        'Open_Fav','Open_OU','Open_Spread','Close_Fav','Close_OU','Close_Spread',
        '2H_Fav','2H_OU','2H_Spread','H_Q1','H_Q2','H_Q3','H_Q4',
        'V_Q1','V_Q2','V_Q3','V_Q4'
    ]]

    # sanity check: a spread must always be below its over/under
    for bet_name in ['Open', 'Close', '2H']:
        bad = games[games[bet_name + '_Spread'] >= games[bet_name + '_OU']]
        assert bad.shape[0] == 0, \
            '{} spread is not below the O/U'.format(bet_name)

    return games

In [71]:
# Download and pre-process every season's odds sheet, newest first.
# A sheet named e.g. '2017-18' is stored under season 2017.
sbr_base = 'http://www.sportsbookreviewsonline.com/scoresoddsarchives/nfl/nfl%20odds%20'
dfs = []
for season in np.arange(2018, 2007, -1):
    print(season)
    sheet = '{}-{}.xlsx'.format(season-1, str(season)[-2:])
    df = read_excel(sbr_base + sheet).reset_index(drop=True)
    dfs.append(process_lines(df, season-1))

2018
2017
2016
2015
2014
2013
2012
2011
2010
2009
2008


In [103]:
# stack all seasons in a single pass instead of appending pair by pair
all_lines = concat(dfs).reset_index(drop=True)
all_lines.shape

(2937, 26)

#### Fix erroneous game dates for historical lines

In [100]:
## Some games carry the wrong date in the source sheets, e.g. the
## 2015 games listed on 10-17 were actually played on 10-18
## (caught this when exploring data).
## Each entry: (season, listed date, home team or None for any, real date)
date_fixes = [
    (2015, 1017, None, 1018),
    (2010, 1224, 'Arizona', 1225),
    (2014, 930, 'KansasCity', 929),
    (2015, 123, 'Denver', 124),
    (2015, 123, 'Carolina', 124),
]

for fix_season, listed, home_team, actual in date_fixes:
    wrong = (all_lines['Season'] == fix_season) & (all_lines['Date'] == listed)
    if home_team is not None:
        wrong = wrong & (all_lines['H_Team'] == home_team)

    idx = all_lines[wrong].index
    all_lines.loc[idx, 'Date'] = actual

In [101]:
from datetime import date
def map_dates(x):
    '''Turn a (date number, season) pair into a datetime.date.

    The date number is month followed by a two-digit day (907 is
    September 7th). Months before June fall in the calendar year after
    the season year.
    '''
    raw, season = x
    digits = str(raw)
    day = int(digits[-2:])
    month = int(digits[:-2])
    year = int(season)

    # January-May games belong to the following calendar year
    if month < 6:
        year += 1
    return date(year, month, day)
    
# replace the numeric 'Date' with a real date; 'Season' supplies the year
all_lines['Date'] = all_lines[['Date','Season']].apply(map_dates, axis=1)

In [102]:
all_lines.to_csv('{}/all_lines.csv'.format(DATA_DIR))

### Alternative For Lines: http://www.drwagpicks.com/p/nfl-statistics-downloads.html

In [24]:
# Pull the odds export in multi-year windows, newest window first.
# Each tuple is (end year, start year) of one request.
started = False
dfs = []
snoozle_query = ('http://sports.snoozle.net/api?'
                 'league=nfl&fileType=csvFile&'
                 'statType=latestodds&'
                 'startDate={}-02-06&'
                 'endDate={}-02-12&'
                 'source=drwagpics')
for years in [(2018, 2014), (2014, 2010), (2010, 2006), (2006, 2000)]:
    end_year, start_year = years
    curr = read_csv(snoozle_query.format(start_year, end_year))

    dfs.append(curr)

In [30]:
# Align every window on the columns of the first one and stack them in
# a single pass instead of appending pair by pair
lines = concat(
    [window[dfs[0].columns] for window in dfs]
).reset_index(drop=True)

In [31]:
lines.head()

(4158, 6)

In [36]:
# the original string had no '{}' placeholder, so DATA_DIR was ignored
lines.to_csv('{}/drwag_lines_lines.csv'.format(DATA_DIR))

### Historical DVOA

In [161]:
def scrape_dvoa_week(week, year):
    '''for a given week/year, scrapes the footballoutsiders
    DVOA by week.

    week, year: used to build the url of the weekly ratings page.

    Uses the module-level ``driver`` to load the page. Returns a
    DataFrame built from the first 'stats' table that mentions the
    total and offense (D)VOA headers and 'W-L' and that can be parsed;
    returns None when no table qualifies.

    NOTE(review): the first table row only supplies the column names,
    yet it still gets an (empty) entry, so the result starts with an
    all-empty row 0. Kept as-is because downstream cells may rely on
    the row counts.
    '''

    base_url = 'https://www.footballoutsiders.com/dvoa-ratings'
    url = base_url + '/{}/week-{}-dvoa-ratings'.format(year, week)
    soup = ScrapeTools().get_soup(url, driver)

    for data_table in soup.findAll('table', {'class': 'stats'}):
        # read the table text once; it is checked several times
        text = data_table.text
        is_ratings_table = (
            ('TOTALDVOA' in text or 'TOTALVOA' in text)
            and ('OFFENSEDVOA' in text or 'OFFENSEVOA' in text)
            and 'W-L' in text
        )
        if not is_ratings_table:
            continue

        cols = []
        team_dvoa = {}
        try:
            for i, tr in enumerate(data_table.findAll('tr')):
                team_dvoa[i] = {}
                for j, td in enumerate(tr.findAll('td')):
                    if i == 0:
                        cols.append(td.text)
                    else:
                        team_dvoa[i][cols[j]] = td.text
            return DataFrame.from_dict(team_dvoa, orient='index')
        except Exception:
            # best effort: a table that cannot be parsed is skipped and
            # the next candidate is tried. Unlike a bare except this
            # still lets Ctrl-C stop a long scrape.
            continue

    return None

In [162]:
def clean_cols(df):
    '''Return df with normalised column names: dots are removed and
    surrounding whitespace is stripped, so that the weekly dataframes
    can be merged on identical names.'''
    tidy_names = {
        name: name.replace('.','').strip()
        for name in df.columns
    }
    return df.rename(columns=tidy_names)

def merge_dfs(a, b):
    '''function passed into reduce call.
    merges the weekly dataframes: column names are standardized with
    clean_cols, a column missing on one side is added there filled
    with None, and the rows of b are stacked under those of a using
    the column order of a.

    Returns a new DataFrame with a fresh 0..n-1 index.
    '''
    a = clean_cols(a)
    b = clean_cols(b)

    # sorted() keeps the order of the padded columns reproducible
    for a_not_b in sorted(set(a.columns) - set(b.columns)):
        b[a_not_b] = None
    for b_not_a in sorted(set(b.columns) - set(a.columns)):
        a[b_not_a] = None
    return concat([a, b[a.columns]]).reset_index(drop=True)

#### TODO: 2018 needs a change to work

In [167]:
driver = ScrapeTools().get_driver()

In [168]:
!mkdir -p dvoa

dfs = []
started = True
for year in np.arange(2017, 2006, -1):
    for week in np.arange(1,17): # no week 17 ratings
        if started is True:
            week_dvoa = scrape_dvoa_week(week, year)
            if week_dvoa is None:
                print 'FAIL', week, year
            else:
                week_dvoa['Year'] = year
                week_dvoa['Week'] = week
                week_dvoa.to_csv('{}/dvoa/dvoa_{:04d}_{:02d}.csv'.format(DATA_DIR, year, week))

'dvoa_2017_01.csv'

'dvoa_2017_02.csv'

'dvoa_2017_03.csv'

'dvoa_2017_04.csv'

'dvoa_2017_05.csv'

'dvoa_2017_06.csv'

'dvoa_2017_07.csv'

'dvoa_2017_08.csv'

'dvoa_2017_09.csv'

'dvoa_2017_10.csv'

'dvoa_2017_11.csv'

'dvoa_2017_12.csv'

'dvoa_2017_13.csv'

'dvoa_2017_14.csv'

'dvoa_2017_15.csv'

'dvoa_2017_16.csv'

'dvoa_2016_01.csv'

'dvoa_2016_02.csv'

'dvoa_2016_03.csv'

'dvoa_2016_04.csv'

'dvoa_2016_05.csv'

'dvoa_2016_06.csv'

'dvoa_2016_07.csv'

'dvoa_2016_08.csv'

'dvoa_2016_09.csv'

'dvoa_2016_10.csv'

'dvoa_2016_11.csv'

'dvoa_2016_12.csv'

'dvoa_2016_13.csv'

'dvoa_2016_14.csv'

'dvoa_2016_15.csv'

'dvoa_2016_16.csv'

'dvoa_2015_01.csv'

'dvoa_2015_02.csv'

'dvoa_2015_03.csv'

'dvoa_2015_04.csv'

'dvoa_2015_05.csv'

'dvoa_2015_06.csv'

'dvoa_2015_07.csv'

'dvoa_2015_08.csv'

'dvoa_2015_09.csv'

'dvoa_2015_10.csv'

'dvoa_2015_11.csv'

'dvoa_2015_12.csv'

'dvoa_2015_13.csv'

'dvoa_2015_14.csv'

'dvoa_2015_15.csv'

'dvoa_2015_16.csv'

'dvoa_2014_01.csv'

'dvoa_2014_02.csv'

'dvoa_2014_03.csv'

'dvoa_2014_04.csv'

'dvoa_2014_05.csv'

'dvoa_2014_06.csv'

'dvoa_2014_07.csv'

'dvoa_2014_08.csv'

'dvoa_2014_09.csv'

'dvoa_2014_10.csv'

'dvoa_2014_11.csv'

'dvoa_2014_12.csv'

'dvoa_2014_13.csv'

'dvoa_2014_14.csv'

'dvoa_2014_15.csv'

'dvoa_2014_16.csv'

'dvoa_2013_01.csv'

'dvoa_2013_02.csv'

'dvoa_2013_03.csv'

'dvoa_2013_04.csv'

'dvoa_2013_05.csv'

'dvoa_2013_06.csv'

'dvoa_2013_07.csv'

'dvoa_2013_08.csv'

'dvoa_2013_09.csv'

'dvoa_2013_10.csv'

'dvoa_2013_11.csv'

'dvoa_2013_12.csv'

'dvoa_2013_13.csv'

'dvoa_2013_14.csv'

'dvoa_2013_15.csv'

'dvoa_2013_16.csv'

'dvoa_2012_01.csv'

'dvoa_2012_02.csv'

'dvoa_2012_03.csv'

'dvoa_2012_04.csv'

'dvoa_2012_05.csv'

'dvoa_2012_06.csv'

'dvoa_2012_07.csv'

'dvoa_2012_08.csv'

'dvoa_2012_09.csv'

'dvoa_2012_10.csv'

'dvoa_2012_11.csv'

'dvoa_2012_12.csv'

'dvoa_2012_13.csv'

'dvoa_2012_14.csv'

'dvoa_2012_15.csv'

'dvoa_2012_16.csv'

'dvoa_2011_01.csv'

'dvoa_2011_02.csv'

'dvoa_2011_03.csv'

'dvoa_2011_04.csv'

'dvoa_2011_05.csv'

'dvoa_2011_06.csv'

'dvoa_2011_07.csv'

'dvoa_2011_08.csv'

'dvoa_2011_09.csv'

'dvoa_2011_10.csv'

'dvoa_2011_11.csv'

'dvoa_2011_12.csv'

'dvoa_2011_13.csv'

'dvoa_2011_14.csv'

'dvoa_2011_15.csv'

'dvoa_2011_16.csv'

'dvoa_2010_01.csv'

'dvoa_2010_02.csv'

'dvoa_2010_03.csv'

'dvoa_2010_04.csv'

'dvoa_2010_05.csv'

'dvoa_2010_06.csv'

'dvoa_2010_07.csv'

'dvoa_2010_08.csv'

'dvoa_2010_09.csv'

'dvoa_2010_10.csv'

'dvoa_2010_11.csv'

'dvoa_2010_12.csv'

'dvoa_2010_13.csv'

'dvoa_2010_14.csv'

'dvoa_2010_15.csv'

'dvoa_2010_16.csv'

'dvoa_2009_01.csv'

'dvoa_2009_02.csv'

'dvoa_2009_03.csv'

'dvoa_2009_04.csv'

'dvoa_2009_05.csv'

'dvoa_2009_06.csv'

'dvoa_2009_07.csv'

'dvoa_2009_08.csv'

'dvoa_2009_09.csv'

'dvoa_2009_10.csv'

'dvoa_2009_11.csv'

'dvoa_2009_12.csv'

'dvoa_2009_13.csv'

'dvoa_2009_14.csv'

'dvoa_2009_15.csv'

'dvoa_2009_16.csv'

'dvoa_2008_01.csv'

'dvoa_2008_02.csv'

'dvoa_2008_03.csv'

'dvoa_2008_04.csv'

'dvoa_2008_05.csv'

'dvoa_2008_06.csv'

'dvoa_2008_07.csv'

'dvoa_2008_08.csv'

'dvoa_2008_09.csv'

'dvoa_2008_10.csv'

'dvoa_2008_11.csv'

'dvoa_2008_12.csv'

'dvoa_2008_13.csv'

'dvoa_2008_14.csv'

'dvoa_2008_15.csv'

'dvoa_2008_16.csv'

'dvoa_2007_01.csv'

'dvoa_2007_02.csv'

'dvoa_2007_03.csv'

'dvoa_2007_04.csv'

'dvoa_2007_05.csv'

'dvoa_2007_06.csv'

'dvoa_2007_07.csv'

'dvoa_2007_08.csv'

'dvoa_2007_09.csv'

'dvoa_2007_10.csv'

'dvoa_2007_11.csv'

'dvoa_2007_12.csv'

'dvoa_2007_13.csv'

'dvoa_2007_14.csv'

'dvoa_2007_15.csv'

'dvoa_2007_16.csv'

#### Clean DVOA
* In Week 8, they switch from TOTAL DAVE to WEIGHTED DVOA (except weeks 8-10 2007)
 * TOTAL DAVE: NULL weeks 8-16
 * WEIGHTED DVOA: NULL weeks 1-7 (for 2007, it's weeks 1-10)
* LASTWEEK: NULL week 1

In [178]:
import json as JSON
# load the team mapping used for the DVOA tables; the with-block closes
# the file handle again
with open('team_map_for_dvoa.json','r') as map_file:
    dvoa_team_map = JSON.load(map_file)

# preview the first six entries
for i, (k,v) in enumerate(dvoa_team_map.items()):
    print('{} --> {}'.format(k, v))
    if i >= 5:
        break
        


MIN --> Vikings
MIA --> Dolphins
CAR --> Panthers
ATL --> Falcons
DET --> Lions
CIN --> Bengals


In [183]:
# read every weekly csv (file names starting with 'dvoa_2') in name order
dvoa_files = sorted(
    [fname for fname in os.listdir('dvoa') if fname.startswith('dvoa_2')]
)
dvoa_dfs = [read_csv('dvoa/{}'.format(fname)) for fname in dvoa_files]

# spellings of the same column are unified under one name
col_namechanges = {
    'S.T.VOA': 'S.T.DVOA',
    'TOTALVOA': 'TOTALDVOA',
    'OFFENSEVOA': 'OFFENSEDVOA',
    'DEFENSEVOA': 'DEFENSEDVOA',
    'WEI.DVOA':'WEIGHTEDDVOA',
    'WEIGHTEDVOA':'WEIGHTEDDVOA',
    'TOTALDAVE':'TOTAL DAVE',
    'DAVE':'TOTAL DAVE',
}

# columns that are dropped whenever present
unwanted_cols = ['RANK','NON-ADJTOT VOA','','\xc2\xa0']

for i, d in enumerate(dvoa_dfs):
    # all frames are cleaned in place
    d.columns = [str(name).strip() for name in d.columns]
    drop_cols = [name for name in d.columns if 'Unnamed' in name]
    d.drop(drop_cols, axis=1, inplace=True)

    for c in unwanted_cols:
        if c in d.columns:
            d.drop(c, axis=1, inplace=True)

    d.rename(columns=col_namechanges, inplace=True)

    d['TEAM'] = d['TEAM'].map(dvoa_team_map)

In [191]:
# stack all weekly frames in a single pass instead of appending pair
# by pair, then store the combined table
dvoa_all = concat(dvoa_dfs).reset_index(drop=True)
dvoa_all.to_csv('{}/dvoa/dvoa_alltime.csv'.format(DATA_DIR))

In [189]:
dvoa_all['Year'].value_counts().sort_index()

2007    528
2008    528
2009    528
2010    528
2011    528
2012    528
2013    528
2014    528
2015    528
2016    528
2017    528
Name: Year, dtype: int64

In [190]:
dvoa_all['Week'].value_counts().sort_index()

1     363
2     363
3     363
4     363
5     363
6     363
7     363
8     363
9     363
10    363
11    363
12    363
13    363
14    363
15    363
16    363
Name: Week, dtype: int64

#### Below is the work to find the columns that need altering (they have been incorporated above)

In [172]:
from collections import Counter
# share of the weekly frames in which each column name appears
all_col_names = [
    name
    for frame in dvoa_dfs
    for name in frame.columns.tolist()
]
counts = DataFrame.from_dict(Counter(all_col_names), orient='index')
counts = counts.sort_values(by=0, ascending=False)
counts[0] = counts[0] / float(len(dvoa_dfs))
counts

Unnamed: 0,0
Week,1.0
OFF.RANK,1.0
TEAM,1.0
Year,1.0
TOTALDVOA,1.0
OFFENSEDVOA,1.0
S.T.RANK,1.0
DEF.RANK,1.0
DEFENSEDVOA,1.0
W-L,1.0


In [174]:
# Collect the column names seen per year, per week and per (week, year).
# Each weekly frame contributes its full column list once.
years = {}
weeks = {}
week_years = {}
for d in dvoa_dfs:
    # Year and Week are taken from the first row of the frame
    yr, wk = d[['Year','Week']].values[0]
    if yr not in years.keys():
        years[yr] = d.columns.tolist()
    else:
        years[yr] += d.columns.tolist()
    
    if wk not in weeks.keys():
        weeks[wk] = d.columns.tolist()
    else:
        weeks[wk] += d.columns.tolist()
        
    week_years[(wk,yr)] = d.columns.tolist()    
    
# combined_wk: one column per week, one row per column name. The value
# is (n - count) / n, i.e. the share of the n frames of that week in
# which the column name is absent.
# n = 11 presumably matches the 11 seasons scraped (2007-2017) -- confirm
# NOTE(review): `counters` is assigned but never read
counters = {}
n = 11.
for i, (k,v) in enumerate(weeks.iteritems()):
    curr = DataFrame.from_dict(Counter(v), orient='index').rename(columns={0:k})
    curr[k] = curr[k].apply(lambda x: (n-x)/n)
    if i == 0:
        combined_wk = curr
    else:
        # a column name never seen in a week counts as fully missing (1.0)
        combined_wk = combined_wk.merge(curr, left_index=True, right_index=True, how='outer').fillna(1.)
        
    counters = {}
# combined_yr: same computation per year
# n = 16 presumably matches the 16 weeks scraped per season -- confirm
n = 16.
for i, (k,v) in enumerate(years.iteritems()):
    curr = DataFrame.from_dict(Counter(v), orient='index').rename(columns={0:k})
    curr[k] = curr[k].apply(lambda x: (n-x)/n)
    if i == 0:
        combined_yr = curr
    else:
        combined_yr = combined_yr.merge(curr, left_index=True, right_index=True, how='outer').fillna(1.)
    
# combined_wkyr: raw occurrence count of every column name per
# (week, year); a name that does not occur is filled with 0
for i, (k,v) in enumerate(week_years.iteritems()):
    curr = DataFrame.from_dict(Counter(v), orient='index').rename(columns={0:k})
    if i == 0:
        combined_wkyr = curr
    else:
        combined_wkyr = combined_wkyr.merge(curr, left_index=True, right_index=True, how='outer').fillna(0)

# transpose: one row per (week, year), one column per column name
combined_wkyr = combined_wkyr.T

In [179]:
## NULL weeks 8-16
combined_wk.T[combined_wk.T['TOTAL DAVE'] > 0].sort_index()[['TOTAL DAVE']]
combined_yr.T[combined_yr.T['TOTAL DAVE'] > 0].sort_index()[['TOTAL DAVE']]

Unnamed: 0,TOTAL DAVE
8,1.0
9,1.0
10,1.0
11,1.0
12,1.0
13,1.0
14,1.0
15,1.0
16,1.0


Unnamed: 0,TOTAL DAVE
2007,0.5625
2008,0.5625
2009,0.5625
2010,0.5625
2011,0.5625
2012,0.5625
2013,0.5625
2014,0.5625
2015,0.5625
2016,0.5625


In [180]:
## null weeks 1-7 all seasons; additionally 8-10 in 2007 
## ** FO switches from TOTAL DAVE to WEIGHTEDDVOA in week 8
combined_wk.T[combined_wk.T['WEIGHTEDDVOA'] > 0].sort_index()[['WEIGHTEDDVOA']]
combined_yr.T[combined_yr.T['WEIGHTEDDVOA'] > 0].sort_index()[['WEIGHTEDDVOA']]

Unnamed: 0,WEIGHTEDDVOA
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,0.090909
9,0.090909
10,0.090909


Unnamed: 0,WEIGHTEDDVOA
2007,0.625
2008,0.4375
2009,0.4375
2010,0.4375
2011,0.4375
2012,0.4375
2013,0.4375
2014,0.4375
2015,0.4375
2016,0.4375


In [181]:
# only NULL week 1
combined_wk.T[combined_wk.T['LASTWEEK'] > 0].sort_index()[['LASTWEEK']]
combined_yr.T[combined_yr.T['LASTWEEK'] > 0].sort_index()[['LASTWEEK']]

Unnamed: 0,LASTWEEK
1,1.0


Unnamed: 0,LASTWEEK
2007,0.0625
2008,0.0625
2009,0.0625
2010,0.0625
2011,0.0625
2012,0.0625
2013,0.0625
2014,0.0625
2015,0.0625
2016,0.0625


### PFR Game Stats -- Primary Data Source
* Step 1: write all HTML to disk
* Step 2: extract individual tables and write them to separate directories

In [26]:
driver = ScrapeTools().get_driver()

In [27]:
# For every (week, year) store the links found in the 'gamelink' cells
# of the weekly schedule page.
all_games = {}
week_page = 'https://www.pro-football-reference.com/years/{}/week_{}.htm'

for year in range(2007,2019):
    for week in np.arange(1,22):
        url = week_page.format(year, week)

        soup = ScrapeTools().get_soup(url, driver)
        # first link of each game cell
        all_games[(week, year)] = [
            cell.findAll('a')[0].attrs['href']
            for cell in soup.findAll('td', {'class': 'gamelink'})
        ]

In [28]:
DataFrame.from_dict(all_games, orient='index').to_csv('{}/fref_all_games_urls.csv'.format(DATA_DIR))

In [29]:
# flatten the per-week link lists into one list (linear time, and an
# empty all_games simply yields an empty list)
all_urls = [link for week_links in all_games.values() for link in week_links]

In [31]:
import os 
files = os.listdir('games_stats')

In [32]:
len(files)
files[0]

2937

'201211180was.txt'

#### The games_stats directory holds the raw HTML of each game page (written as unicode text)

In [368]:
# 1102
# Download every game page that is not on disk yet.
# A set gives constant-time lookups and also remembers the pages saved
# during this run, so a link listed twice is fetched only once.
saved = set(files)
for i, url in enumerate(all_urls):
    if i % 100 == 0:
        print(i)
    # the game id is the file name of the link without its extension
    game_id = url.split('/')[-1].split('.')[0]
    fname = '{}.txt'.format(game_id)
    if fname in saved:
        continue

    url = 'https://www.pro-football-reference.com/{}'.format(url)
    soup = ScrapeTools().get_soup(url, driver)
    with open('games_stats/{}'.format(fname), 'w') as w:
        w.write(unicode(soup))
    saved.add(fname)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100


#### Extract Data From Stored HTML

In [5]:
def extract_scorebox_team_info(div, curr_game_metadata, i):
    '''Read one team div of the scorebox and store what it holds in
    curr_game_metadata under keys prefixed with team<i>_: the team
    link, full name, name (last word), city (everything before it),
    score, and the coach element, name and link.

    Returns the same curr_game_metadata object.
    '''
    team_anchor = div.findAll('a', {'itemprop':'name'})[0]
    full_name = team_anchor.text
    # split on the last blank: 'New York Giants' -> 'New York' / 'Giants'
    city, _, short_name = full_name.rpartition(' ')

    score = div.findAll('div', {'class': 'score'})[0].text
    coach_anchor = div.findAll('div', {'class': 'datapoint'})[0]\
                      .findAll('a')[0]

    fields = [
        ('link', team_anchor.attrs['href']),
        ('fullname', full_name),
        ('name', short_name),
        ('city', city),
        ('score', score),
        ('coach_raw', coach_anchor),
        ('coach_name', coach_anchor.text),
        ('coach_id', coach_anchor.attrs['href']),
    ]
    for suffix, value in fields:
        curr_game_metadata['team{}_{}'.format(i, suffix)] = value

    return curr_game_metadata

In [6]:
def clean_game_meta(div, field):
    '''Remove the known labels from the text of a game meta entry.

    div: the text of the entry.
    field: name of the entry; 'attendance' is returned as an integer
        (thousands separators removed), anything else as a stripped
        string.
    '''
    labels = ('Stadium: ', 'Start Time: ', 'Attendance: ', 'Time of Game: ')
    for label in labels:
        div = div.replace(label, '')

    if field != 'attendance':
        return div.strip()
    return int(div.replace(',',''))

In [7]:
def extract_game_metadata(scorebox):
    '''takes in the scorebox div. parses the 3
    divs: team0, team1, game data, and returns an OrderedDict
    entry for this game.

    Team divs are the child elements without a class; the game data
    lives in the child whose class is exactly 'scorebox_meta'.

    Raises IndexError when there is no 'scorebox_meta' child or when
    the stadium entry holds no link.
    '''
    # bs4 is already a dependency of this file (BeautifulSoup)
    from bs4.element import Tag

    # keep real elements only: text nodes, comments and the like have
    # no attributes and would break the class checks below
    div_children = [ch for ch in scorebox.children if isinstance(ch, Tag)]

    div_teams = [
        ch for ch in div_children
        if ch.attrs.get('class', []) == []
    ]
    div_game = [
        ch for ch in div_children
        if ch.attrs.get('class', []) == ['scorebox_meta']
    ][0]

    ## TEAM INFO
    curr_game_metadata = OrderedDict()
    for i, div in enumerate(div_teams):
        curr_game_metadata = extract_scorebox_team_info(div, curr_game_metadata, i)

    ## GAME INFO
    ## the entries are matched to field names purely by position;
    ## zip stops at the shorter of the two sequences
    meta_order = ['date','time','stadium','attendance','duration']

    for field, div in zip(meta_order, div_game.findAll('div')):
        curr_game_metadata[field] = clean_game_meta(div.text, field)
        if field == 'stadium':
            curr_game_metadata['stadium_link'] = div.findAll('a')[0].attrs['href']

    return curr_game_metadata

In [8]:
def text_or_link(td):
    '''if the element contains a usable hyperlink, return the
    element itself. otherwise, return only its text.

    A link is usable when its a tag has an href and that href does not
    contain '#'. Some a tags have no href at all.
    '''
    for anchor in td.findAll('a'):
        href = anchor.attrs.get('href', None)
        if href is not None and '#' not in str(href):
            return td
    return td.text

def get_game_tables(soup, game_id):
    '''Extract every data table of one game page.

    soup: BeautifulSoup object of the stored page.
    game_id: value written to the game_id column of every row.

    Returns a dict that maps the caption text of each
    'table_outer_container' div to a DataFrame of its rows.
    '''
    file_tables = {}
    for container in soup.findAll('div', {'class':'table_outer_container'}):
        # the last tbody of the container holds the data rows
        body = container.findAll('tbody')[-1]

        rows = {}
        for i, tr in enumerate(body.findAll('tr')):
            # rows with the 'thead' class are skipped
            if 'thead' in tr.attrs.get('class', []):
                continue

            record = OrderedDict()
            record['game_id'] = game_id
            rows[i] = record

            ## first column is a th, but can be treated same as td
            for cell in tr.findAll('th') + tr.findAll('td'):
                stat = cell.attrs['data-stat']
                if stat not in ['player','name']:
                    record[stat] = text_or_link(cell)
                    continue

                # name cells get the raw value, the plain text and,
                # when a link exists, the first link as id
                record[stat+'_raw'] = text_or_link(cell)
                record[stat+'_name'] = cell.text
                links = [a.attrs['href'] for a in cell.findAll('a')]
                if links:
                    record[stat+'_id'] = links[0]

        caption = container.findAll('caption')[0].text
        file_tables[caption] = DataFrame.from_dict(rows, orient='index')
    return file_tables

In [9]:
def merge_split_tables(game_metadata, game_tables):
    '''Merge per-team tables into one table per kind, tagged by team.

    certain data is broken out by team (drives, starters, snap counts).
    this is indicated by the team name's presence as the first word of
    the table key. each such table gets a Team column holding that first
    word, and is folded into the table keyed by the rest of the name.

    game_metadata: mapping providing ``team0_name`` and ``team1_name``.
    game_tables: dict of table name -> DataFrame; modified in place.

    returns: the same ``game_tables`` dict, with the per-team keys
        removed and the merged tables stored under the base names.

    raises: AssertionError if a per-team table already has a Team column.
    '''

    team_names = [game_metadata['team{}_name'.format(i)] for i in (0, 1)]

    ## snapshot as a real list: the dict is mutated below and the
    ## selection is walked twice
    tables_w_team_name = [
        name for name in list(game_tables.keys())
        if name.split(' ')[0] in team_names
    ]

    for tbl in tables_w_team_name:
        assert 'Team' not in game_tables[tbl].columns
        team, _, base_name = tbl.partition(' ')
        game_tables[tbl]['Team'] = team

        if base_name not in game_tables:
            game_tables[base_name] = game_tables[tbl]
        else:
            ## concat instead of DataFrame.append, which newer pandas
            ## releases no longer provide
            game_tables[base_name] = concat(
                [game_tables[base_name], game_tables[tbl]]
            ).reset_index(drop=True)

    for tbl in tables_w_team_name:
        del game_tables[tbl]

    return game_tables

In [20]:
## Parse every saved game page (reverse-sorted by file name) into one CSV
## per table, and append one row of game metadata per game to
## game_metadata.csv. A failure on one file is reported and skipped so
## the remaining files are still processed.
files = sorted(os.listdir('{}/games_stats'.format(DATA_DIR)))[::-1]
len(files)

## raise this to resume a partially completed run without reparsing
start_pt = 0

metadata_path = '{}/game_metadata.csv'.format(DATA_DIR)

for i, f in enumerate(files):
    ## progress marker
    if i % 100 == 0:
        print(i)

    if i >= start_pt:
        try:
            ## close each handle promptly rather than leaking one per file
            with open('{}/games_stats/{}'.format(DATA_DIR, f)) as fh:
                soup = BeautifulSoup(fh.read(), 'lxml')

            scorebox = soup.findAll('div', {'class': 'scorebox'})[0]
            game_metadata = extract_game_metadata(scorebox)

            game_tables = get_game_tables(soup, f.split('.')[0])
            game_tables = merge_split_tables(game_metadata, game_tables)

            for k, v in game_tables.items():
                ## table caption -> directory name
                directory = k.replace(' ', '_')\
                             .replace('-','_')\
                             .replace('/','_')\
                             .replace('&','and')\
                             .replace(',','_')\
                             .lower()\
                             .replace('_table','')
                filename = f.replace('txt','csv')

                ## NOTE(review): the target directory is assumed to exist
                ## already; to_csv does not create it
                filepath = '{}/{}/{}'.format(DATA_DIR, directory, filename)
                ## never overwrite a table written by an earlier run
                if not os.path.exists(filepath):
                    v.to_csv(filepath)

            game_metadata_df = DataFrame.from_dict(
                    game_metadata, orient='index'
                ).T
            game_metadata_df['game_id'] = game_tables['Game Info Table'].loc[0, 'game_id']
            ## write the header exactly once: only when the file is being
            ## created. tying it to i == 0 loses the header when the first
            ## file fails and repeats it when the loop is re-run
            game_metadata_df.to_csv(
                metadata_path,
                mode='a',
                header=not os.path.exists(metadata_path),
                index=False
                )
        except Exception as e:
            print('FAILED: {}'.format(f))
            print(e)

2937

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900


#### TODO: Combine the PFR Play-by-Play

In [4]:
import os

## Build the path of every entry in the play-by-play directory.
## os.listdir is not filtered, so every entry in that directory is included.
directory = '{}/full_play_by_play'.format(DATA_DIR)
files = map(lambda x: '{}/{}'.format(directory, x), os.listdir(directory))

In [19]:
## Start the combined file with the header row copied from one game's CSV
## (this overwrites any existing combined file).
!head -n 1 full_play_by_play/201012260kan.csv > full_play_by_play/ALL_full_play_by_play.csv

In [None]:
!for l in `ls full_play_by_play/*.*` ; do sed 1,1d $l >> full_play_by_play/ALL_full_play_by_play.csv  ; done

In [None]:
## Line count of the combined file(s) whose name starts with ALL.
!wc -l full_play_by_play/ALL*

In [None]:
## Name the combined file after the directory's last path component.
## Formatting the whole directory path into the file name would embed its
## '/' separators and point at a nested location instead.
full_set_name = '{}/ALL_{}.csv'.format(directory, os.path.basename(directory))
## NOTE(review): `combined` is not defined in the cells shown here --
## confirm it is built before this cell runs.
combined.to_csv(full_set_name, index=False)

In [1]:
#### sportradar API

In [None]:
import http.client

## Fetch the tracked play-by-play XML for one hard-coded game id from the
## sportradar API and print the response body.
conn = http.client.HTTPSConnection("api.sportradar.us")

## NOTE(review): the key is blank here -- presumably removed before
## sharing; supply a real API key before running.
key = ''

try:
    conn.request("GET", "/nfl-ngs-t1/games/25fde8b4-6e4e-468c-ba0a-a511530a303c/tracked_pbp.xml?api_key={}".format(key))

    res = conn.getresponse()
    data = res.read()
finally:
    ## always release the connection, even when the request fails
    conn.close()

print(data.decode("utf-8"))


### Use wikipedia python package, scrape Wikipedia pages and extract lat/long for each stadium -- goal is to eventually compute travel distances between games

In [76]:
def extract_lat_long(stadium_name):
    '''Look up a stadium's coordinates on its Wikipedia page.

    stadium_name: title of the Wikipedia page to fetch.

    returns: ``{'lat': float, 'long': float}`` parsed from the first
        ``span`` with class ``geo`` on the page, or None when the page
        has no such element.
    '''
    ## fetch the page named by the argument rather than depending on a
    ## module-level loop variable happening to hold the same value
    page_html = wikipedia.WikipediaPage(title=stadium_name).html()
    ## name the parser explicitly, as the game-page parsing does
    soup = BeautifulSoup(page_html, 'lxml')

    geo = soup.findAll('span', {'class':'geo'})
    if not geo:
        return None

    ## the geo text is expected to look like "44.974; -93.258"
    lat_long = [float(part) for part in geo[0].text.split('; ')]
    return {'lat': lat_long[0], 'long': lat_long[1]}

In [78]:
## Collect the distinct values of the 'stadium' column from the game
## metadata CSV and map each one to the result of extract_lat_long.
## NOTE(review): the CSV is read relative to the working directory, not
## DATA_DIR -- confirm they are the same place.
stadiums = read_csv('game_metadata.csv')['stadium'].unique()
stadium_geo = {}
for s in stadiums:
    stadium_geo[s] = extract_lat_long(s)

U.S. Bank Stadium {'lat': 44.974, 'long': -93.258}
Lincoln Financial Field {'lat': 39.90083, 'long': -75.1675}
Gillette Stadium {'lat': 42.0909444, 'long': -71.2643444}
Heinz Field {'lat': 40.44667, 'long': -80.01583}
Mercedes-Benz Superdome {'lat': 29.95083, 'long': -90.08111}
EverBank Field {'lat': 30.32389, 'long': -81.6375}
Los Angeles Memorial Coliseum {'lat': 34.01417, 'long': -118.28778}
Arrowhead Stadium {'lat': 39.04889, 'long': -94.48389}
Raymond James Stadium {'lat': 27.97583, 'long': -82.50333}
CenturyLink Field {'lat': 47.5952, 'long': -122.3316}
StubHub Center {'lat': 33.864, 'long': -118.261}
M&T Bank Stadium {'lat': 39.27806, 'long': -76.62278}
Nissan Stadium {'lat': 36.16639, 'long': -86.77139}
MetLife Stadium {'lat': 40.813528, 'long': -74.074361}
Hard Rock Stadium {'lat': 25.95806, 'long': -80.23889}
Ford Field {'lat': 42.34, 'long': -83.04556}
Sports Authority Field at Mile High {'lat': 39.74389, 'long': -105.02}
Lucas Oil Stadium {'lat': 39.760056, 'long': -86.1638

In [2]:
## One row per stadium (index) built from the stadium_geo mapping.
stadium_df = DataFrame.from_dict(stadium_geo, orient='index')
stadium_df.index.name = 'stadium'
stadium_df.head()
## write under DATA_DIR; the path needs a placeholder for format() to
## have any effect
stadium_df.to_csv('{}/stadium_lat_long.csv'.format(DATA_DIR))

NameError: name 'DataFrame' is not defined

### Get Yearly Divisions on PFR

In [144]:
def extract_season_divisions(soup, divisions, season):
    '''Record the division of every team listed on a PFR season
    standings page.

    soup: BeautifulSoup object for the standings page; it must contain
        one table with id AFC and one with id NFC.
    divisions: dict updated in place; each team adds an entry
        ``(team name, season) -> {'division': division name}``.
    season: value used as the second element of each key.

    returns: the same ``divisions`` dict.
    '''
    for conference in ['AFC', 'NFC']:
        conference_table = soup.findAll('table', {'id': conference})[0]
        body = conference_table.findAll('tbody')[0]

        for row in body.findAll('tr'):
            header_cells = row.findAll('th')

            ## a row without any th is a division heading: remember it
            ## for the team rows that follow
            if not header_cells:
                division = str(row.text.strip())
                continue

            ## team row: drop the * and + markers attached to the name
            raw_name = header_cells[0].text.strip()
            teamname = str(raw_name.replace('*', '').replace('+', ''))
            divisions[(teamname, season)] = {'division': division}

    return divisions

In [145]:
## Obtain one driver from ScrapeTools; it is passed to every get_soup call
## when the season pages are fetched.
driver = ScrapeTools().get_driver()

In [146]:
## Fetch the PFR standings page of each season from 2007 through 2018 and
## accumulate the (team, season) -> division mapping.
divisions = {}
season_url = 'https://www.pro-football-reference.com/years/{}/index.htm'
for season in range(2007, 2019):
    url = season_url.format(season)
    soup = ScrapeTools().get_soup(url, driver)
    divisions = extract_season_divisions(soup, divisions, season)

In [153]:
## Flatten the (team, season) -> division mapping into a table: the two
## parts of each key become ordinary columns next to 'division'.
divisions_df = DataFrame.from_dict(divisions, orient='index')
divisions_df = divisions_df.reset_index(drop=False)
divisions_df.columns = ['team', 'season', 'division']
divisions_df.index.name = 'idx'

team_divisions_path = '{}/team_divisions.csv'.format(DATA_DIR)
divisions_df.to_csv(team_divisions_path)

In [154]:
## Sanity check: number of rows per season, then a preview of the table.
divisions_df['season'].value_counts()
divisions_df.head()

2018    32
2017    32
2016    32
2015    32
2014    32
2013    32
2012    32
2011    32
2010    32
2009    32
2008    32
2007    32
Name: season, dtype: int64

Unnamed: 0_level_0,team,season,division
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Arizona Cardinals,2007,NFC West
1,Arizona Cardinals,2008,NFC West
2,Arizona Cardinals,2009,NFC West
3,Arizona Cardinals,2010,NFC West
4,Arizona Cardinals,2011,NFC West
