In [1]:
import requests, os, datetime, json
from bs4 import BeautifulSoup
import pandas as pd
import time as tm

- Go to https://www.espn.com/soccer/schedule and pick a league from the drop-down menu to get the league code for the url below
- Dates are year-month-day, but without hyphens
- Set commentary flag

Other Notes
- Commentary section doesn't look the prettiest
- File hierarchy: data -> folder_name -> game_stats -> year -> .csvs

In [None]:
# As you track back, change first_year near the bottom

In [2]:
# To change -----------------------------
url_date =  "20200910" # start date (year-month-day, no hyphens)
stop_date = "20210524" # last date to look at (same format as url_date)
league = 'eng.1' # put league or competition here
folder_name = 'epl' # sub-folder of data/ that the csvs live in
commentary = True # whether or not to look for commentary
# ---------------------------------------

# The schedule page is requested one week at a time.
one_week = datetime.timedelta(7)

# Accumulators for the scraped rows (one row per match / one row per event).
match_df = pd.DataFrame()
events_df = pd.DataFrame()

# Label stored with every match and used for the output folder/file names;
# it is the year of the start date.
year = url_date[:4]

# Filled in with (date, league) for each weekly request.
url = "https://www.espn.com/soccer/fixtures/_/date/{}/league/{}"

# Ids of matches that are already in the combined matches.csv, so they can be
# skipped. Only the "no usable previous data" cases fall back to an empty
# list; any other error (e.g. a corrupt file) is allowed to surface instead
# of being silently swallowed.
try:
    ma_df = pd.read_csv(os.path.join('data', folder_name, 'game_stats', 'matches.csv'))
    list_of_matches = ma_df['id'].unique()
except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
    list_of_matches = []

# Seconds to wait for a response before treating the request as failed.
REQUEST_TIMEOUT = 30

# Inline style of the <span> holding the shirt number for players that are
# listed normally; lineup entries without this span are handled as substitutes.
NUM_STYLE = ' display:inline-block; width: 24px;'

# game-status values for matches that have no data worth saving,
# mapped to the message that is printed when the match is skipped.
SKIP_STATUSES = {"Postponed": 'postponed', "": 'future match', "Canceled": 'canceled'}


def _get_with_retry(page_url, tag, pause=2):
    """Fetch page_url, retrying until the request succeeds.

    On every failed attempt `tag` and the url are printed and the function
    sleeps `pause` seconds before trying again. Only request errors are
    retried, so a KeyboardInterrupt still stops the cell.
    """
    while True:
        try:
            resp = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            print(tag, page_url)
            tm.sleep(pause)


def _goal_items(team_blocks, side):
    """Return the <li> goal entries of team block `side`, or [] if there are none."""
    try:
        return team_blocks[side].find_all('ul', attrs={'data-event-type': 'goal'})[0].find_all('li')
    except IndexError:
        return []


def _parse_goals(goal_items):
    """Return parallel (minutes, scorers) lists, one entry per goal.

    An entry listing several comma-separated minutes for one scorer is
    expanded into one item per minute.
    """
    minutes, scorers = [], []
    for goal in goal_items:
        scorer = goal.contents[0].strip()
        stamp = goal.contents[1].text.strip().replace('(', "").replace(')', "")
        for minute in stamp.split(','):
            minutes.append(minute)
            scorers.append(scorer)
    return minutes, scorers


while True:
    # Get page for week of games
    tm.sleep(2)
    r = _get_with_retry(url.format(url_date, league), 'here1')

    print('Looking at...', url_date)
    soup = BeautifulSoup(r.text, "html.parser")

    current_date = datetime.datetime.strptime(url_date, "%Y%m%d")
    next_date = (current_date + one_week).strftime("%Y%m%d")

    # get all games
    stuff = soup.find_all('div', attrs={'id': 'sched-container'})[0]

    # if there are no games, keep going -- but still honour stop_date,
    # otherwise an empty stretch of the calendar would run past it
    if stuff.text == 'No games scheduled':
        url_date = next_date
        if url_date > stop_date:
            break
        continue

    # zip each chunk of games with its header (the date)
    for h2, table in zip(stuff.find_all('h2'), stuff.find_all('table')):
        matches = table.find_all('tr', class_=['even has-results', 'odd has-results'])

        # go through each match
        for match in matches:
            match_dict = {}
            parts = match.find_all('span')
            # parts[2].text # score (should be added under matchstats)

            match_link = parts[2].find('a').get('href')
            match_dict['id'] = match_link.split('=')[-1]

            if int(match_dict['id']) in list_of_matches: # game already saved
                continue

            # Template url: the page name is filled in later ('match', 'commentary')
            game_url = "https://www.espn.com" + match_link
            game_url = game_url.replace('report', '{}').replace('matchstats', "{}")

            tm.sleep(2)
            r = _get_with_retry(game_url.format('match'), 'here2')
            game_soup = BeautifulSoup(r.text, "html.parser")

            # if this fails, the game doesn't have data
            # likely postponed
            try:
                kickoff = game_soup.find('li', class_='subdued').find_all('div')[0].find('span').get('data-date')
            except (AttributeError, IndexError):
                print(match_dict['id'], 'skipped')
                continue

            # check to see if the game already happened
            # possibly find a better way
            game_status = game_soup.find('div', class_='game-status').text.strip()
            if game_status in SKIP_STATUSES:
                print(match_dict['id'], SKIP_STATUSES[game_status])
                continue

            # ---------------------------------------------------------
            # ------------------------ GENERAL ------------------------
            # ---------------------------------------------------------
            cells = match.find_all('td')
            match_dict['home'] = parts[0].text # home
            match_dict['away'] = parts[-1].text # away
            match_dict['date'] = h2.text.strip()
            match_dict['year'] = year
            match_dict['time (utc)'] = datetime.datetime.strptime(kickoff.replace('Z', 'UTC'), "%Y-%m-%dT%H:%M%Z").strftime("%H:%M%z")
            match_dict['attendance'] = cells[-1].text # attendance
            match_dict['venue'] = cells[-2].text

            # one game per year (or less)
            # not for points, just for fun (against a top Euro team)
            if match_dict['home'] == "MLS All-Stars":
                print('skipped All-Star Game ... id:', match_dict['id'])
                continue

            # ------------------------------------------------------------------
            # --------------------------- MATCHSTATS ---------------------------
            # ------------------------------------------------------------------
            # MLS has a regular season and a postseason ("league, part");
            # most leagues/competitions only give the league name
            header_text = game_soup.find('div', class_='game-details header').text.strip()
            header_parts = header_text.split(',')
            if len(header_parts) > 1:
                match_dict['league'] = header_parts[0]
                match_dict['part_of_competition'] = header_parts[1]
            else:
                match_dict['league'] = header_text
                match_dict['part_of_competition'] = "na"

            # FT/ FT-Pens/ maybe some others
            match_dict['game_status'] = game_status

            # important for knockout round games
            # same information could be gathered from 'game_status'
            # (not sure if this applies to all leagues and older games)
            # True only when the shootout module exists and has no style attribute
            shootout_box = game_soup.find('article', class_='sub-module penalty-shootout')
            match_dict['shootout'] = shootout_box is not None and shootout_box.get('style') is None

            # get all stats
            stats = game_soup.find_all(['span', 'td'], attrs={'data-home-away': ['home', 'away']})
            for item in stats:
                value = item.text.strip()
                if value != '':
                    match_dict[item.get('data-home-away') + "_" + item.get('data-stat')] = value

            # -----------------------------------------------------------------
            # ----------------------------- GOALS -----------------------------
            # -----------------------------------------------------------------
            goals = game_soup.find_all('div', class_='team-info players')
            home_goal_minutes, home_goal_scorers = _parse_goals(_goal_items(goals, 0))
            away_goal_minutes, away_goal_scorers = _parse_goals(_goal_items(goals, 1))

            match_dict['home_goal_minutes'] = ":".join(home_goal_minutes)
            match_dict['home_goal_scorers'] = ":".join(home_goal_scorers)
            match_dict['away_goal_minutes'] = ":".join(away_goal_minutes)
            match_dict['away_goal_scorers'] = ":".join(away_goal_scorers)

            # ---------------------------------------------------------
            # ------------------------ LINEUPS ------------------------
            # ---------------------------------------------------------
            # get the team formations (home first, then away); whatever is
            # missing on the page is simply left out of the row
            formations = game_soup.find_all('div', class_='formations__text')
            for side, formation in zip(('home', 'away'), formations):
                match_dict[side + '_formation'] = formation.text

            # Groups are handled in the order: home starting, home bench,
            # away starting, away bench.
            lineups = game_soup.find_all('tbody')
            lineups = [group.find_all('div', class_='accordion-header lineup-player') for group in lineups[1:5]]
            if len(lineups) == 3:
                # only three groups found: insert an empty home bench so the
                # remaining groups keep their positions
                lineups = [lineups[0], [], lineups[1], lineups[2]]

            for j, group in enumerate(lineups): # j = which block the loop is in
                team = 'home_' if j < 2 else 'away_'
                section = 'starting_' if j % 2 == 0 else 'bench_'
                i = 1 # running player number within this group

                for block in group:
                    num_span = block.find('span', attrs={'style': NUM_STYLE})
                    prefix = team + section + str(i)

                    # no number span: they were subbed in
                    if num_span is None:
                        if j == 0 or j == 2: # if sub is in starting lineup, they started on the bench
                            lineups[j + 1].append(block) # add them to their respective bench
                            continue

                        # now add them
                        match_dict[prefix + '_num'] = block.find('span', class_='name').contents[-1].strip()
                        match_dict[prefix] = block.find('a').text.strip()
                        detail = block.find('span', class_='detail')
                        match_dict[prefix + "_minute"] = detail.text if detail is not None else 'not given'
                        i += 1
                        continue

                    # starting players and bench players that weren't subbed on
                    match_dict[prefix + '_num'] = num_span.text
                    match_dict[prefix] = block.find('a').text.strip()
                    if j == 1 or j == 3:
                        match_dict[prefix + "_minute"] = "na"
                    i += 1

            # ------------------------------------------------------------------
            # --------------------------- COMMENTARY ---------------------------
            # ------------------------------------------------------------------
            # redo this section...doesn't look too good
            # Default: a single placeholder row. It is only replaced when the
            # whole commentary table parses, so a failure part-way through
            # never leaves a partial, unreversed list behind.
            events_list = [[match_dict['id'], '-', 'no commentary']]
            if commentary:
                r = _get_with_retry(game_url.format('commentary'), 'here3')
                commentary_soup = BeautifulSoup(r.text, "html.parser")
                try:
                    # get all events
                    events = commentary_soup.find_all('table')[2].find_all('tr') # switch to 3 for just key events
                    parsed_events = [
                        [match_dict['id'],
                         event.find('td', class_='time-stamp').text,
                         event.find('td', class_='game-details').text.strip()]
                        for event in events
                    ]
                    events_list = parsed_events[::-1] # reverse it so start of match is at the top
                except (IndexError, AttributeError):
                    print(match_dict['id'], 'no commentary')

            # Add data to dataframes - prob should build up lists instead
            # (DataFrame.append no longer exists in pandas 2.x, so use pd.concat)
            match_df = pd.concat([match_df, pd.DataFrame(match_dict, index=[0])], ignore_index=True)
            events_df = pd.concat([events_df, pd.DataFrame(events_list, columns=['id', 'Time', 'Event'])], ignore_index=True)

        # end - for each match

    # move url_date forward one week (or back? just switch + to - where next_date is computed)
    url_date = next_date

    # stopping condition: YYYYMMDD strings compare in date order
    if url_date > stop_date:
        break

Looking at... 20200910
Looking at... 20200917
Looking at... 20200924
Looking at... 20201001
Looking at... 20201008
Looking at... 20201015
Looking at... 20201022
Looking at... 20201029
Looking at... 20201105
Looking at... 20201112
Looking at... 20201119
Looking at... 20201126
Looking at... 20201203
Looking at... 20201210
Looking at... 20201217
Looking at... 20201224
Looking at... 20201231
Looking at... 20210107
Looking at... 20210114
Looking at... 20210121
Looking at... 20210128
Looking at... 20210204
Looking at... 20210211
Looking at... 20210218
Looking at... 20210225
Looking at... 20210304
Looking at... 20210311
Looking at... 20210318
Looking at... 20210325
Looking at... 20210401
Looking at... 20210408
Looking at... 20210415
Looking at... 20210422
Looking at... 20210429
Looking at... 20210506
Looking at... 20210513
Looking at... 20210520


In [3]:
# Show the last five rows of the match table
match_df.tail(n=5)

Unnamed: 0,id,home,away,date,year,time (utc),attendance,venue,league,part_of_competition,...,home_bench_8_minute,home_bench_9_num,home_bench_9,home_bench_9_minute,away_bench_8_num,away_bench_8,away_bench_8_minute,away_bench_9_num,away_bench_9,away_bench_9_minute
375,578284,Liverpool,Crystal Palace,"Sunday, May 23",2020,15:00,9901.0,"Anfield, Liverpool, England",2020-21 English Premier League,na,...,78',20.0,Diogo Jota,90'+1',,,,,,
376,578285,Manchester City,Everton,"Sunday, May 23",2020,15:00,10000.0,"Etihad Stadium, Manchester, England",2020-21 English Premier League,na,...,74',10.0,Sergio Agüero,65',17.0,Alex Iwobi,57',,,
377,578282,Sheffield United,Burnley,"Sunday, May 23",2020,15:00,,"Bramall Lane, Sheffield, England",2020-21 English Premier League,na,...,79',15.0,Phil Jagielka,83',27.0,Matej Vydra,45',,,
378,578288,West Ham United,Southampton,"Sunday, May 23",2020,15:00,,"London Stadium, London, England",2020-21 English Premier League,na,...,69',16.0,Mark Noble,84',23.0,Nathan Tella,62',14.0,Michael Obafemi,84'
379,578286,Wolverhampton Wanderers,Manchester United,"Sunday, May 23",2020,15:00,4500.0,"Molineux Stadium, Wolverhampton, England",2020-21 English Premier League,na,...,27',,,,74.0,Shola Shoretire,82',48.0,William Fish,90'+5'


In [4]:
# Show the last five rows of the events table
events_df.tail(n=5)

Unnamed: 0,id,Time,Event
36400,578286,90'+6',Brandon Williams (Manchester United) wins a fr...
36401,578286,90'+6',Foul by Morgan Gibbs-White (Wolverhampton Wand...
36402,578286,90'+6',Morgan Gibbs-White (Wolverhampton Wanderers) i...
36403,578286,90'+7',"Second Half ends, Wolverhampton Wanderers 1, M..."
36404,578286,-,"Match ends, Wolverhampton Wanderers 1, Manches..."


In [5]:
# Save the data
# Writes <year>_matches.csv and <year>_events.csv under
# data/<folder_name>/game_stats/<year>/, creating the folder if needed.
spath = os.path.join('data', folder_name, 'game_stats', year)
os.makedirs(spath, exist_ok=True)

matches_path = os.path.join(spath, year+'_matches.csv')
events_path = os.path.join(spath, year+'_events.csv')

# if a file is already there, add new data to end of old.
# Each file is checked on its own: the folder existing does not guarantee
# that both csvs do. (DataFrame.append no longer exists in pandas 2.x.)
if os.path.exists(matches_path):
    match_df = pd.concat([pd.read_csv(matches_path), match_df], ignore_index=True)
if os.path.exists(events_path):
    events_df = pd.concat([pd.read_csv(events_path), events_df], ignore_index=True)

# NOTE: match_df / events_df now hold old + new rows, so running this cell
# twice without re-scraping writes the old rows again.
match_df.to_csv(matches_path, encoding='utf-8-sig', index=False)
events_df.to_csv(events_path, encoding='utf-8-sig', index=False)

In [6]:
# Combines the seasons into matches.csv, events.csv
first_year = 2011
last_year = 2020 # last season folder to include (inclusive)
stats_dir = os.path.join('data', folder_name, 'game_stats')

# Read every season once and concatenate at the end instead of growing a
# frame inside the loop (DataFrame.append no longer exists in pandas 2.x).
# The loop variable is deliberately not called `year`, so the string `year`
# used for the per-season file names is left untouched.
season_matches = []
season_events = []
for season in range(first_year, last_year + 1):
    season_dir = os.path.join(stats_dir, str(season))
    season_matches.append(pd.read_csv(os.path.join(season_dir, str(season)+'_matches.csv')))
    season_events.append(pd.read_csv(os.path.join(season_dir, str(season)+'_events.csv')))

# combine it
# NOTE: this rebinds events_df to the all-seasons table.
matches_df = pd.concat(season_matches, ignore_index=True)
events_df = pd.concat(season_events, ignore_index=True)

matches_df.to_csv(os.path.join(stats_dir, 'matches.csv'), encoding='utf-8-sig', index=False)
events_df.to_csv(os.path.join(stats_dir, 'events.csv'), encoding='utf-8-sig', index=False)

In [7]:
# Have:
# Premier League
# 2011-2012
# 2012-2013
# 2013-2014
# 2014-2015
# 2015-2016
# 2016-2017
# 2017-2018
# 2018-2019
# 2019-2020
# 2020-2021
