In [26]:
import requests, os, datetime, json
from bs4 import BeautifulSoup
import pandas as pd
import time as tm

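- Go to https://www.espn.com/soccer/schedule and pick a league from the drop-down menu to get the URL to use for `url` below
- Dates are written year-month-day without hyphens (YYYYMMDD), e.g. 20200220
- Set the `commentary` flag to choose whether match commentary is scraped as well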

Other Notes
- Commentary section doesn't look the prettiest
- File hierarchy: data -> league -> game_stats -> year -> .csv files

In [70]:
# ---- Settings to edit before each run ----
# Fixtures page template; the {} placeholder is filled with a YYYYMMDD date.
url = "https://www.espn.com/soccer/fixtures/_/date/{}/league/usa.1"
league = 'mls'          # league or competition name, used in the output folder path
commentary = True       # whether or not to look for commentary
url_date = "20200220"   # first week to scrape (start date)
stop_date = "20200808"  # scraping stops once the week start passes this date
# -------------------------------------------

# Values derived from the settings, plus the empty result frames the
# scraping loop fills in.
year = url_date[0:4]
one_week = datetime.timedelta(days=7)
match_df, events_df = pd.DataFrame(), pd.DataFrame()

# Scrape the ESPN fixtures pages one week at a time, starting at `url_date`
# and stopping once the week start passes `stop_date`. Every played match
# contributes one row of stats to match_df and its commentary rows to events_df.
# Reads url, url_date, stop_date, one_week, year and commentary from the
# settings above; url_date is advanced as the loop runs.

REQUEST_PAUSE = 2      # seconds to wait before a page request and between retries
REQUEST_TIMEOUT = 30   # seconds before a single request is abandoned
MAX_ATTEMPTS = 10      # tries per page before that page is given up on
# inline style of the span that holds a listed player's shirt number
JERSEY_SPAN_STYLE = ' display:inline-block; width: 24px;'


def _fetch_soup(page_url, label):
    """Download page_url and return it parsed, or None if every attempt fails.

    Each failed attempt prints `label` and the URL, then waits REQUEST_PAUSE
    seconds. Only requests' own errors are retried, so Ctrl-C still interrupts
    the notebook (a bare `except:` would swallow KeyboardInterrupt).
    """
    for _ in range(MAX_ATTEMPTS):
        try:
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return BeautifulSoup(response.text, "html.parser")
        except requests.RequestException:
            print(label, page_url)
            tm.sleep(REQUEST_PAUSE)
    return None


def _goal_events(team_block):
    """Return parallel (minutes, scorers) lists for one team's goal list.

    A scorer with several goals has comma-separated minutes; each minute gets
    its own entry paired with the same scorer. Both lists are empty when the
    team block has no goal list.
    """
    minutes, scorers = [], []
    goal_lists = team_block.find_all('ul', attrs={'data-event-type': 'goal'})
    if not goal_lists:
        return minutes, scorers
    for goal in goal_lists[0].find_all('li'):
        scorer = goal.contents[0].strip()
        stamp = goal.contents[1].text.strip().replace('(', "").replace(')', "")
        for minute in stamp.split(','):
            minutes.append(minute)
            scorers.append(scorer)
    return minutes, scorers


def _add_lineups(match_dict, soup):
    """Add starting/bench player columns for both teams to match_dict.

    The 2nd to 5th <tbody> elements are read as home starting, home bench,
    away starting, away bench. When only three are found, the home bench is
    taken to be the missing one.
    """
    groups = [list(body.find_all('div', class_='accordion-header lineup-player'))
              for body in soup.find_all('tbody')[1:5]]
    if len(groups) == 3:
        groups = [groups[0], [], groups[1], groups[2]]
    # pad so a substitute found in a starting group always has a bench to move to
    groups += [[] for _ in range(4 - len(groups))]

    for j, group in enumerate(groups):
        team = 'home_' if j < 2 else 'away_'
        is_bench = j % 2 == 1
        section = 'bench_' if is_bench else 'starting_'
        slot = 1

        for block in group:
            prefix = team + section + str(slot)
            jersey = block.find('span', attrs={'style': JERSEY_SPAN_STYLE})

            if jersey is None:
                # no shirt-number span: this player was subbed in
                if not is_bench:
                    # listed among the starters but started on the bench,
                    # so hand them over to their team's bench group
                    groups[j + 1].append(block)
                    continue
                match_dict[prefix + '_num'] = block.find('span', class_='name').contents[-1].strip()
                match_dict[prefix] = block.find('a').text.strip()
                detail = block.find('span', class_='detail')
                match_dict[prefix + "_minute"] = detail.text if detail is not None else 'not given'
            else:
                # starting players and bench players that weren't subbed on
                match_dict[prefix + '_num'] = jersey.text
                match_dict[prefix] = block.find('a').text.strip()
                if is_bench:
                    match_dict[prefix + "_minute"] = "na"
            slot += 1


def _scrape_commentary(commentary_url, match_id):
    """Return the commentary rows [id, time, text] for a match, oldest first.

    If the page cannot be fetched or parsed, a single placeholder row is
    returned instead; partially parsed rows are never mixed with it.
    """
    page = _fetch_soup(commentary_url, 'here3')
    if page is not None:
        try:
            events = page.find_all('table')[2].find_all('tr')  # switch to 3 for just key events
            rows = [[match_id,
                     event.find('td', class_='time-stamp').text,
                     event.find('td', class_='game-details').text.strip()]
                    for event in events]
            return rows[::-1]  # reverse it so start of match is at the top
        except (IndexError, AttributeError):
            pass
    print(match_id, 'no commentary')
    return [[match_id, '-', 'no commentary']]


def _scrape_match(match, h2):
    """Build the stats row and commentary rows for one fixtures-table row.

    match is a result <tr> from a fixtures table and h2 is the date heading
    paired with that table. Returns (match_dict, events_list), or None when
    the match is skipped: page unavailable, no kickoff time, postponed,
    canceled, not played yet, or the MLS All-Star game.
    """
    match_dict = {}
    parts = match.find_all('span')
    cells = match.find_all('td')

    link = parts[2].find('a').get('href')
    match_dict['id'] = link.split('=')[-1]
    game_url = ("https://www.espn.com" + link).replace('report', '{}').replace('matchstats', "{}")

    tm.sleep(REQUEST_PAUSE)
    soup = _fetch_soup(game_url.format('match'), 'here2')
    if soup is None:
        print(match_dict['id'], 'skipped')
        return None

    # if this fails, the game doesn't have data (likely postponed)
    try:
        kickoff = soup.find('li', class_='subdued').find_all('div')[0].find('span').get('data-date')
    except (AttributeError, IndexError):
        kickoff = None
    if kickoff is None:
        print(match_dict['id'], 'skipped')
        return None

    # check to see if the game already happened
    game_status = soup.find('div', class_='game-status').text.strip()
    skip_reasons = {"Postponed": 'postponed', "": 'future match', "Canceled": 'canceled'}
    if game_status in skip_reasons:
        print(match_dict['id'], skip_reasons[game_status])
        return None

    # ------------------------ GENERAL ------------------------
    match_dict['home'] = parts[0].text
    match_dict['away'] = parts[-1].text
    match_dict['date'] = h2.text.strip()
    match_dict['year'] = year
    match_dict['time (utc)'] = datetime.datetime.strptime(kickoff.replace('Z', 'UTC'), "%Y-%m-%dT%H:%M%Z").strftime("%H:%M%z")
    match_dict['attendance'] = cells[-1].text
    match_dict['venue'] = cells[-2].text

    # one game per year (or less)
    # not for points, just for fun (against a top Euro team)
    if match_dict['home'] == "MLS All-Stars":
        print('skipped All-Star Game ... id:', match_dict['id'])
        return None

    # --------------------------- MATCHSTATS ---------------------------
    # "league, stage" when the competition has stages (MLS has a regular
    # season and a postseason); most competitions give the league alone
    header_parts = soup.find('div', class_='game-details header').text.strip().split(',')
    match_dict['league'] = header_parts[0]
    match_dict['part_of_competition'] = header_parts[1] if len(header_parts) > 1 else "na"

    # FT / FT-Pens / maybe some others
    match_dict['game_status'] = game_status

    # important for knockout round games: the shootout module counts only
    # when it is present and carries no inline style attribute
    shootout = soup.find('article', class_='sub-module penalty-shootout')
    match_dict['shootout'] = shootout is not None and shootout.get('style') is None

    # get all stats
    for item in soup.find_all(['span', 'td'], attrs={'data-home-away':['home', 'away']}):
        value = item.text.strip()
        if value != '':
            match_dict[item.get('data-home-away') + "_" + item.get('data-stat')] = value

    # ----------------------------- GOALS -----------------------------
    team_blocks = soup.find_all('div', class_='team-info players')
    home_goal_minutes, home_goal_scorers = _goal_events(team_blocks[0]) if len(team_blocks) > 0 else ([], [])
    away_goal_minutes, away_goal_scorers = _goal_events(team_blocks[1]) if len(team_blocks) > 1 else ([], [])

    match_dict['home_goal_minutes'] = ":".join(home_goal_minutes)
    match_dict['home_goal_scorers'] = ":".join(home_goal_scorers)
    match_dict['away_goal_minutes'] = ":".join(away_goal_minutes)
    match_dict['away_goal_scorers'] = ":".join(away_goal_scorers)

    # ------------------------ LINEUPS ------------------------
    # team formations: first entry is home, second is away; either may be absent
    for side, formation in zip(('home', 'away'), soup.find_all('div', class_='formations__text')):
        match_dict[side + '_formation'] = formation.text

    _add_lineups(match_dict, soup)

    # --------------------------- COMMENTARY ---------------------------
    if commentary:
        events_list = _scrape_commentary(game_url.format('commentary'), match_dict['id'])
    else:
        events_list = [[match_dict['id'], '-', 'no commentary']]

    return match_dict, events_list


# Rows are collected in plain lists and the frames are built once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame per match is
# quadratic. The finally clause keeps whatever was scraped if the run is
# interrupted or fails part-way.
match_records = []
event_records = []
try:
    # YYYYMMDD strings order the same way as the dates they encode. Checking
    # here (not only at the end of an iteration) also stops the loop after
    # weeks that have no games.
    while url_date <= stop_date:
        week_url = url.format(url_date)
        current_date = datetime.datetime.strptime(url_date, "%Y%m%d")
        # advance now so every early `continue` below still moves forward
        url_date = (current_date + one_week).strftime("%Y%m%d")

        # Get page for week of games
        tm.sleep(REQUEST_PAUSE)
        week_soup = _fetch_soup(week_url, 'here1')
        if week_soup is None:
            print('giving up on', week_url)
            continue

        # get all games
        containers = week_soup.find_all('div', attrs={'id':'sched-container'})
        if not containers:
            print('no schedule found', week_url)
            continue
        schedule = containers[0]

        # if there are no games, keep going
        if schedule.text == 'No games scheduled':
            continue

        # zip each chunk of games with its header (the date)
        for h2, table in zip(schedule.find_all('h2'), schedule.find_all('table')):
            for match in table.find_all('tr', class_=['even has-results', 'odd has-results']):
                scraped = _scrape_match(match, h2)
                if scraped is None:
                    continue
                match_records.append(scraped[0])
                event_records.extend(scraped[1])
finally:
    match_df = pd.DataFrame(match_records)
    events_df = pd.DataFrame(event_records, columns=['id', 'Time', 'Event'])

here2 https://www.espn.com/soccer/match?gameId=560546
here2 https://www.espn.com/soccer/match?gameId=560546
here2 https://www.espn.com/soccer/match?gameId=560546
here1 https://www.espn.com/soccer/fixtures/_/date/20200618/league/usa.1
here1 https://www.espn.com/soccer/fixtures/_/date/20200618/league/usa.1
here1 https://www.espn.com/soccer/fixtures/_/date/20200618/league/usa.1
here1 https://www.espn.com/soccer/fixtures/_/date/20200702/league/usa.1
here1 https://www.espn.com/soccer/fixtures/_/date/20200702/league/usa.1
here1 https://www.espn.com/soccer/fixtures/_/date/20200702/league/usa.1
571506 postponed
571509 postponed
571534 postponed
560856 canceled


In [71]:
# Preview the last five scraped matches as a sanity check
match_df.tail(n=5)

Unnamed: 0,id,home,away,date,year,time (utc),attendance,venue,league,part_of_competition,...,away_bench_9_minute,away_bench_10_num,away_bench_10,away_bench_10_minute,away_bench_11_num,away_bench_11,away_bench_11_minute,away_bench_12_num,away_bench_12,away_bench_12_minute
71,571924,Orlando City SC,LAFC,"Friday, July 31",2020,23:30,,,2020 Major League Soccer,MLS is Back - Quarterfinals,...,77',11.0,José Cifuentes,81',8.0,Francisco Ginella,65',16.0,Danny Musovski,77'
72,571922,San Jose Earthquakes,Minnesota United FC,"Saturday, August 1",2020,00:00,,,2020 Major League Soccer,MLS is Back - Quarterfinals,...,61',,,,,,,,,
73,571923,New York City FC,Portland Timbers,"Saturday, August 1",2020,02:30,,,2020 Major League Soccer,MLS is Back - Quarterfinals,...,75',9.0,Felipe Mora,74',8.0,Diego Valeri,45',11.0,Jaroslaw Niezgoda,52'
74,571921,Philadelphia Union,Portland Timbers,"Wednesday, August 5",2020,00:00,,,2020 Major League Soccer,MLS is Back - Semifinals,...,90'+1',11.0,Jaroslaw Niezgoda,56',9.0,Felipe Mora,85',7.0,Andy Polo,56'
75,571920,Orlando City SC,Minnesota United FC,"Thursday, August 6",2020,00:00,,,2020 Major League Soccer,MLS is Back - Semifinals,...,71',23.0,Mason Toye,58',,,,,,


In [73]:
# Save this season's scraped matches and events under
# data/<league>/game_stats/<year>/ as <year>_matches.csv and <year>_events.csv.
spath = os.path.join('data', league, 'game_stats', year)
# exist_ok=True creates the folder only when needed, without a separate
# exists() check that could race with another process creating it
os.makedirs(spath, exist_ok=True)

# utf-8-sig writes a byte-order mark at the start of each file
match_df.to_csv(os.path.join(spath, year+'_matches.csv'), encoding='utf-8-sig', index=False)
events_df.to_csv(os.path.join(spath, year+'_events.csv'), encoding='utf-8-sig', index=False)

In [74]:
# Combine the per-season CSVs, from the scraped `year` through `last_year`,
# into data/<league>/game_stats/matches.csv and events.csv.
# NOTE: this rebinds events_df to the combined table read back from disk.
first_year = int(year)
last_year = 2020  # last season to include

stats_dir = os.path.join('data', league, 'game_stats')

# The first season is always read; later seasons follow up to last_year.
# The loop variable is deliberately not called `year`, so the global `year`
# string used by the save step is left untouched.
seasons = [first_year] + list(range(first_year + 1, last_year + 1))

match_frames = []
event_frames = []
for season in seasons:
    season_name = str(season)
    match_frames.append(pd.read_csv(os.path.join(stats_dir, season_name, season_name + '_matches.csv')))
    event_frames.append(pd.read_csv(os.path.join(stats_dir, season_name, season_name + '_events.csv')))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
matches_df = pd.concat(match_frames, ignore_index=True)
events_df = pd.concat(event_frames, ignore_index=True)

matches_df.to_csv(os.path.join(stats_dir, 'matches.csv'), encoding='utf-8-sig', index=False)
events_df.to_csv(os.path.join(stats_dir, 'events.csv'), encoding='utf-8-sig', index=False)