In [1]:
import requests
from selenium import webdriver
import pandas as pd
import time
from bs4 import BeautifulSoup
from pprint import pprint

In [2]:
cols = ['game_date', 'home_team', 'away_team', 'home_score', 'away_score', 'game_time', 'stadium', 'duration', 
        'home_first_downs', 'away_first_downs', 'home_num_rushes', 'home_rush_yards', 'home_rush_tds', 'away_num_rushes',
        'away_rush_yards', 'away_rush_tds', 'home_pass_comp', 'home_pass_att', 'home_pass_yards', 'home_pass_tds',
        'home_pass_ints', 'away_pass_comp', 'away_pass_att', 'away_pass_yards', 'away_pass_tds', 'away_pass_ints',
        'home_sacks_allowed', 'home_sacks_yards_allowed', 'away_sacks_allowed', 'away_sacks_yards_allowed', 
        'home_net_pass_yards', 'away_net_pass_yards', 'home_total_yards', 'away_total_yards', 'home_fumbles', 
        'home_fumbles_lost', 'away_fumbles', 'away_fumbles_lost', 'home_turnovers', 'away_turnovers', 'home_num_penallity',
        'home_penallity_yards', 'away_num_penallity', 'away_penallity_yards', 'home_third_downs', 'home_third_downs_converted',
        'away_third_downs', 'away_third_downs_converted', 'home_fourth_downs', 'home_fourth_downs_converted', 
        'away_fourth_downs', 'away_fourth_downs_converted', 'home_time_possession', 'away_time_possession', 'won_toss',
        'roof', 'surface', 'attendance', 'weather', 'fav_team', 'handicap', 'points_line', 
        'points_line_result']

In [40]:
BASE_URL = 'https://www.pro-football-reference.com/'
URL = 'https://www.pro-football-reference.com/years/' + str(YEAR) + '/week_'

def scrape_data(driver, year):
    
    YEAR = year

    for week in range(1, 22):

        week_data = []
        games_url = URL + str(week) + '.htm'
        print(games_url)
        driver.get(games_url)

        time.sleep(5)

        html = driver.execute_script('return document.documentElement.outerHTML')
        soup = BeautifulSoup(html, 'html.parser')

        games = soup.find_all('table', attrs = {'class': 'teams'})
        links = sum([game.find_all('td', attrs={'class': 'right gamelink'}) for game in games], [])
        links = sum([link.find_all('a', href=True) for link in links], [])
        links = list(set([a['href'][1:] for a in links]))
        links = [link for link in links if link[10:14] in [str(YEAR), str(YEAR + 1)]]

        for i in range(len(links)):

            print(BASE_URL + links[i])

            driver.get(BASE_URL + links[i])

            html = driver.execute_script('return document.documentElement.outerHTML')
            soup = BeautifulSoup(html, 'html.parser')

            time.sleep(5)

            scorebox = soup.find('div', attrs = {'class': 'scorebox'})

            teams = [team.get_text() for team in scorebox.find_all('a', attrs = {'itemprop': 'name'})]
            home_team = teams[0]
            away_team = teams[1]

            print(home_team)
            print(away_team)

            scores = [int(score.get_text()) for score in scorebox.find_all('div', attrs = {'class': 'score'})]
            home_score = int(scores[0])
            away_score = int(scores[1])

            print(home_score)
            print(away_score)

            scorebox_meta_dct = {}
            scorebox_meta = soup.find('div', attrs = {'class': 'scorebox_meta'})
            scorebox_meta = scorebox_meta.find_all('div')

            for i in range(1, len(scorebox_meta)-1):

                print(scorebox_meta[i].get_text())
                print(scorebox_meta[i].get_text().split(': ', 1))

                a, b = scorebox_meta[i].get_text().split(': ', 1)
                scorebox_meta_dct[a] = b

            print(scorebox_meta_dct)

            game_date = scorebox_meta[0].get_text()
            start_time = scorebox_meta[1].get_text().split(': ', 1)[1]
            stadium = scorebox_meta_dct['Stadium'] if 'Stadium' in scorebox_meta_dct else ''
            game_length = scorebox_meta_dct['Time of Game'] if 'Time of Game' in scorebox_meta_dct else ''

            print(game_date)
            print(start_time)
            print(stadium)
            print(game_length)

            time.sleep(5)

            # team stats
            team_stats = soup.find('table', attrs = {'class': 'add_controls stats_table'})
            team_stats = team_stats.find_all('td', attrs = {'class': 'center'})

            away_first_downs = int(team_stats[0].get_text())
            home_first_downs = int(team_stats[1].get_text())

            if '--' in team_stats[2].get_text():
                t_stat_2 = team_stats[2].get_text().replace('--', '-')
            else:
                t_stat_2 = team_stats[2].get_text()

            away_num_rushes, away_rush_yards, away_rush_tds = list(map(int, t_stat_2.split('-')))


            if '--' in team_stats[3].get_text():
                t_stat_3 = team_stats[3].get_text().replace('--', '-')
            else:
                t_stat_3 = team_stats[3].get_text()

            home_num_rushes, home_rush_yards, home_rush_tds = list(map(int, t_stat_3.split('-')))

            if '--' in team_stats[4].get_text():
                t_stat_4 = team_stats[4].get_text().replace('--', '-')
            else:
                t_stat_4 = team_stats[4].get_text()


            away_pass_comp, away_pass_att, away_pass_yards, away_pass_tds, away_pass_ints = list(map(int, t_stat_4.split('-')))
            home_pass_comp, home_pass_att, home_pass_yards, home_pass_tds, home_pass_ints = list(map(int, team_stats[5].get_text().split('-')))
            away_sacks_allowed, away_sacks_yards_allowed = list(map(int, team_stats[6].get_text().split('-')))
            home_sacks_allowed, home_sacks_yards_allowed = list(map(int, team_stats[7].get_text().split('-')))
            away_net_pass_yards, home_net_pass_yards = int(team_stats[8].get_text()), int(team_stats[9].get_text())
            away_total_yards, home_total_yards = int(team_stats[10].get_text()), int(team_stats[11].get_text())
            away_fumbles, away_fumbles_lost = list(map(int, team_stats[12].get_text().split('-')))
            home_fumbles, home_fumbles_lost = list(map(int, team_stats[13].get_text().split('-')))
            away_turnovers, home_turnovers = int(team_stats[14].get_text()), int(team_stats[15].get_text())
            away_num_penallity, away_penallity_yards = list(map(int, team_stats[16].get_text().split('-')))
            home_num_penallity, home_penallity_yards = list(map(int, team_stats[17].get_text().split('-')))
            away_third_downs, away_third_downs_converted = list(map(int, team_stats[18].get_text().split('-')))
            home_third_downs, home_third_downs_converted = list(map(int, team_stats[19].get_text().split('-')))
            away_fourth_downs, away_fourth_downs_converted = list(map(int, team_stats[20].get_text().split('-')))
            home_fourth_downs, home_fourth_downs_converted = list(map(int, team_stats[21].get_text().split('-')))
            away_time_possession, home_time_possession = team_stats[22].get_text(), team_stats[23].get_text()


            # game info
            game_info_dct = {}
            game_info = soup.find_all('table', attrs={'class': 'suppress_all sortable stats_table now_sortable'})[0]
            game_info_left = game_info.find_all('th')
            game_info_right = game_info.find_all('td')[1:]

            for l, r in zip(game_info_left, game_info_right):
                game_info_dct[l.get_text()] = r.get_text()

            print(game_info_dct)

            won_toss = game_info_dct['Won Toss'] if 'Won Toss' in game_info_dct else ''
            roof = game_info_dct['Roof'] if 'Roof' in game_info_dct else ''
            surface = game_info_dct['Surface'] if 'Surface' in game_info_dct else ''
            attendance = game_info_dct['Attendance'] if 'Attendance' in game_info_dct else 0
            weather = game_info_dct['Weather'] if 'Weather' in game_info_dct else ''
            vegas_line = game_info_dct['Vegas Line'] if 'Vegas Line' in game_info_dct else ''
            over_under = game_info_dct['Over/Under'] if 'Over/Under' in game_info_dct else ''

            # vegas line    
            neg = vegas_line.find('-')
            fav_team = vegas_line[:neg-1]
            handicap = vegas_line[neg:]

            # over/under
            inner_brac = over_under.find('(')
            outer_brac = over_under.find(')')
            points_line = float(over_under[:inner_brac-1])
            points_line_result = over_under[inner_brac+1:outer_brac]

            game_data = [
                game_date,
                home_team,
                away_team,
                home_score,
                away_score,
                start_time,
                stadium,
                game_length,
                home_first_downs,
                away_first_downs,
                home_num_rushes,
                home_rush_yards,
                home_rush_tds,
                away_num_rushes,
                away_rush_yards,
                away_rush_tds,
                home_pass_comp,
                home_pass_att,
                home_pass_yards,
                home_pass_tds,
                home_pass_ints,
                away_pass_comp,
                away_pass_att,
                away_pass_yards,
                away_pass_tds,
                away_pass_ints,
                home_sacks_allowed,
                home_sacks_yards_allowed,
                away_sacks_allowed,
                away_sacks_yards_allowed,
                home_net_pass_yards,
                away_net_pass_yards,
                home_total_yards,
                away_total_yards,
                home_fumbles,
                home_fumbles_lost,
                away_fumbles,
                away_fumbles_lost,
                home_turnovers,
                away_turnovers,
                home_num_penallity,
                home_penallity_yards,
                away_num_penallity,
                away_penallity_yards,
                home_third_downs,
                home_third_downs_converted,
                away_third_downs,
                away_third_downs_converted,
                home_fourth_downs,
                home_fourth_downs_converted,
                away_fourth_downs,
                away_fourth_downs_converted,
                home_time_possession,
                away_time_possession,
                won_toss,
                roof,
                surface,
                attendance,
                weather,
                fav_team,
                handicap,
                points_line,
                points_line_result
            ]
            print(game_data)
            week_data.append(game_data)


        df = pd.DataFrame(week_data, columns=cols)

        df['game_date'] = pd.to_datetime(df['game_date'])

        df['game_time'] = pd.to_datetime(df['game_time'])
        df['game_time'] = df['game_time'].dt.time

        df['duration'] = pd.to_datetime(df['duration'], format='%M:%S').dt.time
        df['home_time_possession'] = pd.to_datetime(df['home_time_possession'], format='%M:%S').dt.time
        df['away_time_possession'] = pd.to_datetime(df['away_time_possession'], format='%M:%S').dt.time

        df.sort_values(by=['game_date', 'game_time'], inplace=True)

        df.to_csv('./scraped_data/' + str(YEAR) + '/week_' + str(week) + '.csv', index=False)

    driver.close()

https://www.pro-football-reference.com/years/2002/week_1.htm
https://www.pro-football-reference.com/boxscores/200209080mia.htm
Miami Dolphins
Detroit Lions
49
21
Start Time: 1:04pm
['Start Time', '1:04pm']
Stadium: Pro Player Stadium 
['Stadium', 'Pro Player Stadium ']
Attendance: 72,216
['Attendance', '72,216']
Time of Game: 3:02
['Time of Game', '3:02']
{'Start Time': '1:04pm', 'Stadium': 'Pro Player Stadium ', 'Attendance': '72,216', 'Time of Game': '3:02'}
Sunday Sep 8, 2002
1:04pm
Pro Player Stadium 
3:02
{'Won Toss': 'Dolphins', 'Roof': 'outdoors', 'Surface': 'grass', 'Duration': '3:02', 'Attendance': '72,216', 'Weather': '87 degrees, relative humidity 69%, wind 12 mph, wind chill 0', 'Vegas Line': 'Miami Dolphins -9.5', 'Over/Under': '35.5 (over)'}
['Sunday Sep 8, 2002', 'Miami Dolphins', 'Detroit Lions', 49, 21, '1:04pm', 'Pro Player Stadium ', '3:02', 27, 15, 41, 182, 3, 19, 51, 2, 18, 27, 207, 3, 0, 17, 36, 220, 1, 1, 0, 0, 3, 14, 207, 206, 389, 257, 2, 0, 2, 1, 0, 2, 4, 54, 

Washington Redskins
Arizona Cardinals
31
23
Start Time: 1:05pm
['Start Time', '1:05pm']
Stadium: FedExField 
['Stadium', 'FedExField ']
Attendance: 85,140
['Attendance', '85,140']
Time of Game: 3:04
['Time of Game', '3:04']
{'Start Time': '1:05pm', 'Stadium': 'FedExField ', 'Attendance': '85,140', 'Time of Game': '3:04'}
Sunday Sep 8, 2002
1:05pm
FedExField 
3:04
{'Won Toss': 'Redskins', 'Roof': 'outdoors', 'Surface': 'grass', 'Duration': '3:04', 'Attendance': '85,140', 'Weather': '76 degrees, relative humidity 71%, wind 5 mph, wind chill 0', 'Vegas Line': 'Washington Redskins -6.0', 'Over/Under': '44.5 (over)'}
['Sunday Sep 8, 2002', 'Washington Redskins', 'Arizona Cardinals', 31, 23, '1:05pm', 'FedExField ', '3:04', 21, 14, 32, 122, 1, 20, 70, 1, 28, 40, 327, 3, 1, 14, 36, 187, 1, 1, 1, 7, 1, 0, 320, 187, 442, 257, 0, 0, 1, 0, 1, 1, 7, 56, 1, 5, 9, 16, 4, 13, 1, 2, 0, 0, '34:24', '25:36', 'Redskins', 'outdoors', 'grass', '85,140', '76 degrees, relative humidity 71%, wind 5 mph, wind 

{'Won Toss': 'Eagles', 'Roof': 'outdoors', 'Surface': 'grass', 'Duration': '3:20', 'Attendance': '68,804', 'Weather': '83 degrees, relative humidity 60%, no wind, wind chill 0', 'Vegas Line': 'Tennessee Titans -2.0', 'Over/Under': '37.0 (over)'}
['Sunday Sep 8, 2002', 'Tennessee Titans', 'Philadelphia Eagles', 27, 24, '1:06pm', 'The Coliseum ', '3:20', 22, 17, 25, 61, 1, 22, 80, 0, 24, 34, 269, 2, 1, 18, 36, 212, 3, 2, 1, 2, 6, 31, 267, 181, 328, 261, 4, 3, 1, 1, 4, 3, 6, 50, 10, 113, 6, 10, 7, 15, 0, 0, 1, 1, '30:48', '29:12', 'Eagles', 'outdoors', 'grass', '68,804', '83 degrees, relative humidity 60%, no wind, wind chill 0', 'Tennessee Titans', '-2.0', 37.0, 'over']
https://www.pro-football-reference.com/boxscores/200209080jax.htm
Jacksonville Jaguars
Indianapolis Colts
25
28
Start Time: 1:04pm
['Start Time', '1:04pm']
Stadium: Alltel Stadium 
['Stadium', 'Alltel Stadium ']
Attendance: 56,595
['Attendance', '56,595']
Time of Game: 3:01
['Time of Game', '3:01']
{'Start Time': '1:04pm'

Cleveland Browns
Cincinnati Bengals
20
7
Start Time: 1:03pm
['Start Time', '1:03pm']
Stadium: Cleveland Browns Stadium 
['Stadium', 'Cleveland Browns Stadium ']
Attendance: 73,358
['Attendance', '73,358']
Time of Game: 3:14
['Time of Game', '3:14']
{'Start Time': '1:03pm', 'Stadium': 'Cleveland Browns Stadium ', 'Attendance': '73,358', 'Time of Game': '3:14'}
Sunday Sep 15, 2002
1:03pm
Cleveland Browns Stadium 
3:14
{'Won Toss': 'Bengals', 'Roof': 'outdoors', 'Surface': 'grass', 'Duration': '3:14', 'Attendance': '73,358', 'Weather': '72 degrees, relative humidity 68%, wind 6 mph, wind chill 0', 'Vegas Line': 'Cleveland Browns -3.5', 'Over/Under': '39.0 (under)'}
['Sunday Sep 15, 2002', 'Cleveland Browns', 'Cincinnati Bengals', 20, 7, '1:03pm', 'Cleveland Browns Stadium ', '3:14', 15, 28, 27, 75, 0, 31, 156, 0, 17, 30, 198, 2, 0, 26, 47, 239, 1, 3, 1, 8, 5, 38, 190, 201, 265, 357, 0, 0, 2, 0, 0, 3, 12, 87, 3, 30, 4, 12, 9, 17, 0, 1, 0, 2, '28:42', '31:18', 'Bengals', 'outdoors', 'grass'

San Diego Chargers
Houston Texans
24
3
Start Time: 4:15pm
['Start Time', '4:15pm']
Stadium: Qualcomm Stadium 
['Stadium', 'Qualcomm Stadium ']
Attendance: 56,098
['Attendance', '56,098']
Time of Game: 3:27
['Time of Game', '3:27']
{'Start Time': '4:15pm', 'Stadium': 'Qualcomm Stadium ', 'Attendance': '56,098', 'Time of Game': '3:27'}
Sunday Sep 15, 2002
4:15pm
Qualcomm Stadium 
3:27
{'Won Toss': 'Chargers', 'Roof': 'outdoors', 'Surface': 'grass', 'Duration': '3:27', 'Attendance': '56,098', 'Weather': '74 degrees, relative humidity 78%, wind 8 mph, wind chill 0', 'Vegas Line': 'San Diego Chargers -13.5', 'Over/Under': '34.0 (under)'}
['Sunday Sep 15, 2002', 'San Diego Chargers', 'Houston Texans', 24, 3, '4:15pm', 'Qualcomm Stadium ', '3:27', 16, 7, 35, 124, 1, 25, 89, 0, 15, 28, 163, 1, 1, 6, 25, 87, 0, 2, 2, 20, 9, 58, 143, 29, 267, 118, 1, 0, 2, 1, 1, 3, 4, 35, 12, 73, 4, 14, 1, 14, 0, 0, 1, 2, '32:24', '27:36', 'Chargers', 'outdoors', 'grass', '56,098', '74 degrees, relative humidity

Denver Broncos
Buffalo Bills
28
23
Start Time: 4:05pm
['Start Time', '4:05pm']
Stadium: Invesco Field at Mile High 
['Stadium', 'Invesco Field at Mile High ']
Attendance: 75,359
['Attendance', '75,359']
Time of Game: 3:23
['Time of Game', '3:23']
{'Start Time': '4:05pm', 'Stadium': 'Invesco Field at Mile High ', 'Attendance': '75,359', 'Time of Game': '3:23'}
Sunday Sep 22, 2002
4:05pm
Invesco Field at Mile High 
3:23
{'Won Toss': 'Broncos', 'Roof': 'outdoors', 'Surface': 'grass', 'Duration': '3:23', 'Attendance': '75,359', 'Weather': '59 degrees, relative humidity 48%, wind 3 mph, wind chill 0', 'Vegas Line': 'Denver Broncos -7.5', 'Over/Under': '47.0 (over)'}
['Sunday Sep 22, 2002', 'Denver Broncos', 'Buffalo Bills', 28, 23, '4:05pm', 'Invesco Field at Mile High ', '3:23', 22, 17, 32, 163, 1, 14, 39, 1, 19, 31, 211, 2, 0, 27, 41, 283, 2, 0, 4, 32, 4, 31, 179, 252, 342, 291, 1, 0, 1, 1, 0, 1, 9, 64, 10, 82, 3, 13, 6, 14, 1, 2, 0, 0, '32:47', '27:13', 'Broncos', 'outdoors', 'grass', '7

{'Won Toss': 'Panthers', 'Roof': 'dome', 'Surface': 'astroturf', 'Duration': '2:58', 'Attendance': '63,945', 'Vegas Line': 'Minnesota Vikings -6.5', 'Over/Under': '44.5 (under)'}
['Sunday Sep 22, 2002', 'Minnesota Vikings', 'Carolina Panthers', 14, 21, '1:00pm', 'Hubert H. Humphrey Metrodome ', '2:58', 13, 20, 18, 73, 1, 33, 164, 2, 19, 30, 191, 1, 4, 20, 30, 221, 1, 2, 5, 14, 3, 15, 177, 206, 250, 370, 4, 1, 0, 0, 5, 2, 6, 55, 6, 41, 2, 10, 6, 15, 0, 1, 0, 0, '24:36', '35:24', 'Panthers', 'dome', 'astroturf', '63,945', '', 'Minnesota Vikings', '-6.5', 44.5, 'under']


ValueError: time data '' does not match format '%M:%S' (match)

In [None]:
print('Insert webdriver')

driver = webdriver.Chrome()
YEAR = 2019

scrape_data(driver, year)