In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

In [2]:
from diskcache import Cache

# Disk-backed cache stored under "matches.shelve"; get_page below memoizes its downloads in it
cache = Cache("matches.shelve")

@cache.memoize()
def get_page(link):
    """Download ``link`` and return the raw response body as bytes.

    The result is memoized through ``cache``, so the request is only made
    when there is no stored entry for this URL.

    Args:
        link: URL of the page to download.

    Returns:
        The response body (``bytes``).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    print(link)  # runs only when the page is actually fetched (cache miss)
    response = requests.get(link, timeout=30)
    # Fail before returning so an error page is never written to the cache
    response.raise_for_status()
    return response.content

In [3]:
# Show which calls are currently stored in the cache (function name + URL per entry)
list(cache.iterkeys())

[('__main__.get_page', 'https://understat.com/league/EPL/2020')]

In [4]:
import sys
import collections

def flatten(d, parent_key='', sep='_'):
    """Collapse a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with ``sep``,
    e.g. ``{'h': {'id': 1}}`` becomes ``{'h_id': 1}``.

    Args:
        d: Mapping to flatten; values that are mutable mappings are
            flattened recursively, every other value is kept as-is.
        parent_key: Prefix for the keys of ``d`` (empty at the top level).
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat ``dict``; ``d`` itself is not modified.
    """
    # collections.MutableMapping was removed in Python 3.10; the ABC lives
    # in collections.abc. Imported here so the function is self-contained.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        # f-string join also copes with non-string nested keys (e.g. ints)
        new_key = f'{parent_key}{sep}{k}' if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat

In [5]:
# Leagues to scrape, using the identifiers that appear in the site URL
leagues = ['EPL']

# Season label -> year used in the URL (the year the season starts in),
# newest season first: '2020-2021' -> '2020' down to '2014-2015' -> '2014'
years = {
    f'{start}-{start + 1}': str(start)
    for start in range(2020, 2013, -1)
}

In [19]:

def _parse_embedded_json(script_text):
    """Decode the JSON payload embedded in an inline script body.

    The payload is the text between the first ``('`` and the next ``')``
    in ``script_text``. It is stored with backslash escapes, which are
    expanded before the text is handed to ``json.loads``.

    Raises:
        ValueError: if either marker is missing or the payload is not
            valid JSON.
    """
    start = script_text.index("('") + 2
    # Search for the closing marker only after the opening one
    end = script_text.index("')", start)
    escaped = script_text[start:end]
    # latin-1 + backslashreplace keeps literal non-ASCII characters intact;
    # a utf8 encode followed by unicode_escape would turn them into mojibake
    unescaped = escaped.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    return json.loads(unescaped)


# One frame per (league, season). Collected across ALL leagues so that adding
# a second league to `leagues` extends `df` instead of being dropped.
dfs = []
for league in leagues:
    for season, year in years.items():
        url = f'https://understat.com/league/{league}/{year}'
        soup = BeautifulSoup(get_page(url), 'lxml')

        # NOTE(review): assumes the second <script> tag on the page carries
        # the match list — confirm if the page layout changes
        matches = _parse_embedded_json(soup.find_all('script')[1].string)

        season_df = pd.DataFrame([flatten(match) for match in matches])
        season_df['season'] = season
        season_df['league'] = league
        season_df['datetime'] = pd.to_datetime(season_df['datetime'])
        # Newest fixtures first within each season
        dfs.append(season_df.sort_values(by='datetime', ascending=False))

df = pd.concat(dfs, ignore_index=True)

In [20]:
# Display the combined match table built by the scraping cell
df

Unnamed: 0,id,isResult,h_id,h_title,h_short_title,a_id,a_title,a_short_title,goals_h,goals_a,xG_h,xG_a,datetime,forecast_w,forecast_d,forecast_l,season,league
0,14814,False,229,Wolverhampton Wanderers,WOL,89,Manchester United,MUN,,,,,2021-05-22 17:00:00,,,,2020-2021,EPL
1,14813,False,81,West Ham,WHU,74,Southampton,SOU,,,,,2021-05-22 17:00:00,,,,2020-2021,EPL
2,14812,False,238,Sheffield United,SHE,92,Burnley,BUR,,,,,2021-05-22 17:00:00,,,,2020-2021,EPL
3,14811,False,88,Manchester City,MCI,72,Everton,EVE,,,,,2021-05-22 17:00:00,,,,2020-2021,EPL
4,14810,False,87,Liverpool,LIV,78,Crystal Palace,CRY,,,,,2021-05-22 17:00:00,,,,2020-2021,EPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,4751,True,202,Queens Park Rangers,QPR,91,Hull,HUL,0,1,1.90067,1.11757,2014-08-16 15:00:00,0.59,0.2449,0.1651,2014-2015,EPL
2656,4750,True,75,Leicester,LEI,72,Everton,EVE,2,2,1.2783,0.613273,2014-08-16 15:00:00,0.5513,0.2922,0.1565,2014-2015,EPL
2657,4753,True,76,West Bromwich Albion,WBA,77,Sunderland,SUN,2,2,1.68343,0.991901,2014-08-16 15:00:00,0.5894,0.2676,0.143,2014-2015,EPL
2658,4754,True,81,West Ham,WHU,82,Tottenham,TOT,0,1,1.8531,1.01706,2014-08-16 15:00:00,0.5856,0.2648,0.1496,2014-2015,EPL


In [21]:
# Inspect column types: apart from isResult and datetime, every column is
# still object dtype, including the goal, xG and forecast values
df.dtypes

id                       object
isResult                   bool
h_id                     object
h_title                  object
h_short_title            object
a_id                     object
a_title                  object
a_short_title            object
goals_h                  object
goals_a                  object
xG_h                     object
xG_a                     object
datetime         datetime64[ns]
forecast_w               object
forecast_d               object
forecast_l               object
season                   object
league                   object
dtype: object

In [23]:
# pandas does not create missing parent directories, so make sure the target
# folder exists before writing (otherwise this cell fails on a fresh checkout)
from pathlib import Path

output_path = Path('dfs/matches/matches.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)