In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
from pathlib import Path

In [2]:
from diskcache import Cache

cache = Cache("matches.shelve")

@cache.memoize()
def get_page(link):
    """Download ``link`` and return the raw response body as bytes.

    The function is wrapped in ``cache.memoize()``, so a URL that is already
    cached is returned without issuing a new request; the URL is printed
    only when the body actually runs (i.e. on a cache miss).

    Args:
        link: Absolute URL of the page to download.

    Returns:
        The response body (``bytes``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Raising here keeps an error page from being memoized and
            silently reused on later runs.
        requests.Timeout: If the server does not answer within the timeout.
    """
    print(link)
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    return response.content

In [3]:
# Show which (function name, URL) entries are already stored in the cache.
[*cache.iterkeys()]

[('__main__.get_page', 'https://understat.com/league/EPL/2014'),
 ('__main__.get_page', 'https://understat.com/league/EPL/2015'),
 ('__main__.get_page', 'https://understat.com/league/EPL/2016'),
 ('__main__.get_page', 'https://understat.com/league/EPL/2017'),
 ('__main__.get_page', 'https://understat.com/league/EPL/2018'),
 ('__main__.get_page', 'https://understat.com/league/EPL/2019'),
 ('__main__.get_page', 'https://understat.com/league/EPL/2020'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2014'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2015'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2016'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2017'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2018'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2019'),
 ('__main__.get_page', 'https://understat.com/league/La_liga/2020'),
 ('__main__.get_page', 'https://understat.com/league/Ligue_1/2

In [4]:
import sys
import collections

def flatten(d, parent_key='', sep='_'):
    """Collapse a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with ``sep``,
    e.g. ``{'h': {'id': '73'}}`` becomes ``{'h_id': '73'}``.

    Args:
        d: Mapping to flatten. Keys must be strings, because they are
            concatenated with ``parent_key`` and ``sep``.
        parent_key: Prefix prepended to every key taken from ``d``; an
            empty prefix leaves top-level keys unchanged.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new ``dict`` with one entry per non-mapping value. If two paths
        produce the same flattened key, the later one wins.
    """
    # ``collections.MutableMapping`` was removed in Python 3.10; the ABC
    # lives in ``collections.abc``. Imported locally so this function does
    # not depend on how ``collections`` was imported elsewhere.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse with the extended prefix and merge the nested result.
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat

In [5]:
# League identifiers as they are inserted into the understat URL.
leagues = ['EPL', 'La_liga', 'Bundesliga', 'Serie_A', 'Ligue_1']

# Season label -> year used in the URL, newest season first
# ('2020-2021' -> '2020' down to '2014-2015' -> '2014').
years = {
    f'{start}-{start + 1}': str(start)
    for start in range(2020, 2013, -1)
}

In [6]:
# Fetch every league/season page (through the memoized get_page) and combine
# the parsed matches into a single DataFrame `df`.
dfs = []
for league in leagues:
    for season, year in years.items():
        url = f'https://understat.com/league/{league}/{year}'

        res = get_page(url)
        soup = BeautifulSoup(res, 'lxml')
        scripts = soup.find_all('script')
        # NOTE(review): assumes the match data sits in the second <script>
        # tag of the page — confirm if the page layout ever changes.
        strings = scripts[1].string

        # The payload sits between "('" and "')". Search for the closing
        # delimiter only after the opening one, so an earlier "')" in the
        # script cannot yield an empty or truncated slice.
        ind_start = strings.index("('") + 2
        ind_end = strings.index("')", ind_start)
        json_data = strings[ind_start:ind_end]
        # Resolve the escape sequences in the literal before JSON parsing.
        json_data = json_data.encode('utf8').decode('unicode_escape')

        data = json.loads(json_data)

        # Nested dicts become prefixed top-level keys (e.g. h -> h_id, h_title).
        data = [flatten(d) for d in data]

        season_df = pd.DataFrame(data)
        season_df['season'] = season
        season_df['league'] = league
        season_df['datetime'] = pd.to_datetime(season_df['datetime'])
        # Newest matches first within each league/season.
        season_df = season_df.sort_values(by='datetime', ascending=False)
        dfs.append(season_df)

df = pd.concat(dfs, ignore_index=True)

# Output directory for the exported table; created if missing.
path = Path('dfs')
path.mkdir(parents=True, exist_ok=True)


  


In [7]:
# Keep only the calendar date of each match (drops the time of day).
df['datetime'] = pd.to_datetime(df['datetime']).dt.date
# The column now holds datetime.date objects, so compare against a date.
# pandas >= 2.0 treats a Timestamp and a datetime.date as never equal,
# which would make a Timestamp comparison here return no rows.
df[df['datetime'] == pd.Timestamp(2017, 9, 15).date()]

Unnamed: 0,id,isResult,h_id,h_title,h_short_title,a_id,a_title,a_short_title,goals_h,goals_a,xG_h,xG_a,datetime,forecast_w,forecast_d,forecast_l,season,league
1479,7159,True,73,Bournemouth,BOU,220,Brighton,BRI,2,1,1.39529,0.874276,2017-09-15,0.4979,0.3047,0.1974,2017-2018,EPL
6516,8286,True,128,Hannover 96,HAN,118,Hamburger SV,HAM,2,0,2.2982,0.453334,2017-09-15,0.8903,0.0894,0.0203,2017-2018,Bundesliga
11591,8615,True,174,Toulouse,TOU,176,Bordeaux,BOR,0,1,0.909141,0.733308,2017-09-15,0.373,0.3569,0.2701,2017-2018,Ligue_1


In [None]:
# Write the combined table to <path>/matches.csv without the index column.
df.to_csv(path / 'matches.csv', index=False)

In [7]:
# Sanity check: list the distinct league labels present in the table.
df.league.unique()

array(['EPL', 'La_liga', 'Bundesliga', 'Serie_A', 'Ligue_1'], dtype=object)

In [8]:
# Total number of rows in the combined table.
df.shape[0]

12782