In [1]:
import datetime
import os
import time as tm
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from league_data_dict import league_data_dict

In [2]:
def cleanDF(df):
    """Forward-fill the rank and team columns of a scraped leaderboard table.

    Missing values in 'RK' and 'Team' are replaced by the last value seen
    above them, and 'RK' is then cast to ``int``.
    (Presumably the blanks come from tied rows on the source page - verify
    against the scraped HTML.)

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns 'RK' and 'Team'.
        NOTE: the frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.

    Raises
    ------
    KeyError
        If 'RK' or 'Team' is not a column of ``df``.
    ValueError
        If 'RK' still holds a missing value after filling (e.g. the first
        row is blank), because it cannot be cast to ``int``.
    """
    # ``.ffill()`` replaces the deprecated ``fillna(method="ffill")`` spelling.
    df[['RK', 'Team']] = df[['RK', 'Team']].ffill()
    df['RK'] = df['RK'].astype(int)
    return df

def updateDF(new_df, file_name, current_year, folder=None):
    """Merge freshly scraped rows into a previously saved stats CSV.

    Reads ``data/<folder>/agg_stats/<file_name>``, discards every row whose
    'YEAR' equals ``current_year`` and appends ``new_df`` in its place.
    Nothing is written to disk; the caller saves the result.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Rows that replace the saved rows for ``current_year``.
    file_name : str
        Name of the CSV file inside the ``agg_stats`` folder.
    current_year : int
        Value of the 'YEAR' column whose saved rows are discarded.
    folder : str, optional
        League folder under ``data``. Defaults to the notebook-level
        ``folder_name`` variable, so existing three-argument calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        Saved rows of the other years followed by ``new_df``. Row labels are
        kept as they were in each input (no re-indexing).

    Raises
    ------
    FileNotFoundError
        If the saved CSV does not exist yet.
    KeyError
        If the saved CSV has no 'YEAR' column.
    """
    if folder is None:
        # Fall back to the notebook global; it must be defined before the call.
        folder = folder_name
    saved_df = pd.read_csv(os.path.join('data', folder, 'agg_stats', file_name))
    other_years_df = saved_df[saved_df['YEAR'] != current_year]  # drop current year
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # supported equivalent and works on older versions as well.
    return pd.concat([other_years_df, new_df])  # add the updated version

def getSoup(url, max_retries=None, delay=2, timeout=30):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    A failed request (connection error, timeout or non-success HTTP status)
    is retried after sleeping ``delay`` seconds.

    Parameters
    ----------
    url : str
        Address of the page to download.
    max_retries : int, optional
        Number of retries allowed after the first failed attempt. ``None``
        (the default) retries indefinitely, which is the historical
        behaviour of this helper.
    delay : float, optional
        Seconds to wait between attempts.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection counts as a failed attempt instead of blocking forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the "html.parser" backend.

    Raises
    ------
    requests.RequestException
        Only when ``max_retries`` is set and every attempt failed.
    """
    failures = 0
    while True:
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            break
        # Retry only on request/HTTP failures. A bare ``except`` would also
        # swallow KeyboardInterrupt, making the loop impossible to interrupt.
        except requests.RequestException:
            if max_retries is not None and failures >= max_retries:
                raise
            failures += 1
            tm.sleep(delay)
    return BeautifulSoup(r.text, "html.parser")

In [3]:
# --- Run configuration ---------------------------------------------------
chosen_league = 'English Premier League'
# When True, only the most recent season (end_year - 1) is re-scraped and
# merged into the CSV files saved by an earlier run.
update = True
# Exclusive upper bound of the scraped seasons: the loop below iterates
# range(start_year, end_year).
end_year = 2022

# League-specific settings looked up from the shared dictionary.
league = league_data_dict[chosen_league]['Code']
start_year = league_data_dict[chosen_league]['First Year - Agg Stats']
folder_name = league_data_dict[chosen_league]['Folder']

# Make sure the output folder exists. ``exist_ok=True`` makes the call a
# no-op when the folder is already there, without a separate existence check.
apath = os.path.join('data', folder_name, 'agg_stats')
os.makedirs(apath, exist_ok=True)

In [4]:
# Scrape the goal, assist and team-discipline leaderboards for every season
# in range(start_year, end_year) and save them as CSV files under
# data/<folder_name>/agg_stats.

# Only want to update current season
if update:
    start_year = end_year - 1

scoring_url = 'https://www.espn.com/soccer/stats/_/league/{}/season/{}'
discipline_url = 'https://www.espn.com/soccer/stats/_/league/{}/season/{}/view/discipline'

# Per-season frames are collected in lists and concatenated once after the
# loop; growing a DataFrame inside the loop copies it on every iteration and
# relied on ``DataFrame.append``, which pandas 2.0 removed.
goals_frames, assists_frames, disc_frames = [], [], []

# For each year
for year in range(start_year, end_year):
    try:
        # Scoring: the unpacking requires the page to yield exactly two
        # tables (goals, assists). ``read_html`` raises ValueError when it
        # finds none, and a wrong table count raises ValueError as well.
        # The HTML is wrapped in StringIO because passing literal markup to
        # ``read_html`` is deprecated.
        scoring_soup = getSoup(scoring_url.format(league, year))
        goals_df, assists_df = pd.read_html(StringIO(str(scoring_soup)))

        # Discipline: fetched through getSoup as well, so it gets the same
        # retry handling as the scoring page.
        disc_soup = getSoup(discipline_url.format(league, year))
        disc_df = pd.read_html(StringIO(str(disc_soup)))[0]
    except ValueError:
        # Only "no/unexpected tables" is treated as a missing season; other
        # errors (e.g. a missing HTML parser) are allowed to surface.
        print('found no tables in {}'.format(year))
        continue

    goals_df = cleanDF(goals_df)
    goals_df['YEAR'] = year
    assists_df = cleanDF(assists_df)
    assists_df['YEAR'] = year

    # Only the rank column is forward-filled for the discipline table.
    disc_df['RK'] = disc_df['RK'].ffill().astype(int)
    disc_df['YEAR'] = year

    # A season is recorded only once all three tables were obtained.
    goals_frames.append(goals_df)
    assists_frames.append(assists_df)
    disc_frames.append(disc_df)

# ``pd.concat`` rejects an empty list, so fall back to an empty frame.
all_goals_df = pd.concat(goals_frames) if goals_frames else pd.DataFrame()
all_assists_df = pd.concat(assists_frames) if assists_frames else pd.DataFrame()
all_disc_df = pd.concat(disc_frames) if disc_frames else pd.DataFrame()

if not goals_frames:
    # Saving now would drop the current season from the stored files (update
    # mode) or overwrite them with empty tables (full mode).
    print('no tables were scraped; saved files left untouched')
else:
    # if just an update is wanted
    # update the df with the new data
    if update:
        all_goals_df = updateDF(all_goals_df, 'goal_leaders.csv', start_year)
        all_assists_df = updateDF(all_assists_df, 'assist_leaders.csv', start_year)
        all_disc_df = updateDF(all_disc_df, 'team_discipline.csv', start_year)

    # Save them
    out_dir = os.path.join('data', folder_name, 'agg_stats')
    all_goals_df.to_csv(os.path.join(out_dir, 'goal_leaders.csv'), index=False)
    all_assists_df.to_csv(os.path.join(out_dir, 'assist_leaders.csv'), index=False)
    all_disc_df.to_csv(os.path.join(out_dir, 'team_discipline.csv'), index=False)