In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

import os
from pathlib import Path

from tqdm.auto import tqdm

In [2]:
from diskcache import Cache

# On-disk cache (diskcache) used to memoize downloaded pages between runs.
cache = Cache("match_details.shelve")

@cache.memoize()
def get_page(link):
    """Download ``link`` and return the raw response body as bytes.

    Results are memoized through ``cache``, so the body below (including the
    ``print``) only runs when the URL is not already cached.

    Parameters
    ----------
    link : str
        Absolute URL to fetch.

    Returns
    -------
    bytes
        The response body (``Response.content``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    print(link)
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(link, timeout=30)
    # Raise before returning so an error page is never written to the cache:
    # memoize only stores values that the function actually returns.
    response.raise_for_status()
    return response.content

In [3]:
import sys
import collections

def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level ``dict``.

    Keys of nested mappings are joined to their parent's key with ``sep``.
    Values that are not mutable mappings are copied through unchanged.

    Parameters
    ----------
    d : mapping
        Mapping to flatten; only ``d.items()`` is used.
    parent_key : str, optional
        Prefix prepended (followed by ``sep``) to every key. An empty prefix
        leaves top-level keys untouched.
    sep : str, optional
        Separator placed between a parent key and a child key.

    Returns
    -------
    dict
        Flat dictionary. A nested mapping that is empty contributes no keys.
    """
    # collections.MutableMapping was removed in Python 3.10; the ABC lives in
    # collections.abc. Imported here so the function is self-contained.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse with the joined key as the new prefix.
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat

In [4]:
# Load the match list; the 'datetime' column is parsed into timestamps on read.
matches_df = pd.read_csv(
    'dfs/matches.csv',
    parse_dates=['datetime'],
)
# Distinct values of the 'league' column.
leagues = matches_df['league'].unique()

In [26]:
# Scrape per-match shot data for every played match and write dfs/shots.csv.
base_url = 'https://understat.com/match/'

dfs = []

# iterrows() is a generator with no len(), so tqdm is given the row count
# explicitly; without it the bar can only show a running "Nit" counter.
for _, row in tqdm(matches_df.iterrows(), total=len(matches_df)):
    # Hard-coded exclusion of the BAS - LYO fixture dated 2017-04-16.
    # NOTE(review): presumably this page has no usable shot data - confirm.
    if row['h_short_title'] == 'BAS' and row['a_short_title'] == 'LYO' and row['datetime'] == pd.Timestamp(2017,4,16):
        continue
    # Rows that are not flagged as having a result are not fetched.
    if row['isResult'] != True:
        continue

    match_id = row['id']
    url = base_url + str(match_id)

    # Fetch the page (cached by get_page) and parse it with BeautifulSoup
    res = get_page(url)
    soup = BeautifulSoup(res, 'lxml')
    scripts = soup.find_all('script')

    # The second <script> tag is taken as the one holding the shots data
    strings = scripts[1].string
    # The JSON text sits between the first "('" and the next "')";
    # the end marker is searched for only after the start marker.
    ind_start = strings.index("('") + 2
    ind_end = strings.index("')", ind_start)
    json_data = strings[ind_start:ind_end]
    # Turn the backslash escape sequences in the payload into real characters
    json_data = json_data.encode('utf8').decode('unicode_escape')

    # Parse the JSON text; 'h' and 'a' hold the home and away entries
    data = json.loads(json_data)
    data_home = data['h']
    data_away = data['a']

    # One frame per side, tagged with the match's season/league and the side
    df_h = pd.DataFrame(data_home)
    df_h['season'] = row['season']
    df_h['league'] = row['league']
    df_h['HomeAway'] = 'Home'

    df_a = pd.DataFrame(data_away)
    df_a['season'] = row['season']
    df_a['league'] = row['league']
    df_a['HomeAway'] = 'Away'

    df = pd.concat([df_h, df_a])

    dfs.append(df)

# Stack every match into one frame and persist it without the index column.
merged_df = pd.concat(dfs)
merged_df.to_csv('dfs/shots.csv', index=False)

0it [00:00, ?it/s]

https://understat.com/match/4241
https://understat.com/match/4240
https://understat.com/match/4235
https://understat.com/match/4244
https://understat.com/match/4243
https://understat.com/match/4236
https://understat.com/match/4239
https://understat.com/match/4229
https://understat.com/match/4226
https://understat.com/match/4232
https://understat.com/match/4230
https://understat.com/match/4225
https://understat.com/match/4227
https://understat.com/match/4231
https://understat.com/match/4234
https://understat.com/match/4233
https://understat.com/match/4228
https://understat.com/match/3231
https://understat.com/match/4217
https://understat.com/match/4222
https://understat.com/match/4216
https://understat.com/match/4215
https://understat.com/match/4220
https://understat.com/match/4219
https://understat.com/match/4218
https://understat.com/match/4224
https://understat.com/match/4213
https://understat.com/match/4212
https://understat.com/match/4206
https://understat.com/match/4211
https://un

https://understat.com/match/2325
https://understat.com/match/2324
https://understat.com/match/2323
https://understat.com/match/2322
https://understat.com/match/2320
https://understat.com/match/2319
https://understat.com/match/2318
https://understat.com/match/2317
https://understat.com/match/2316
https://understat.com/match/2315
https://understat.com/match/2314
https://understat.com/match/2313
https://understat.com/match/2312
https://understat.com/match/2311
https://understat.com/match/2310
https://understat.com/match/2309
https://understat.com/match/2308
https://understat.com/match/2307
https://understat.com/match/2306
https://understat.com/match/2305
https://understat.com/match/2304
https://understat.com/match/2303
https://understat.com/match/2302
https://understat.com/match/2301
https://understat.com/match/2300
https://understat.com/match/2299
https://understat.com/match/2298
https://understat.com/match/2297
https://understat.com/match/2296
https://understat.com/match/2295
https://un

https://understat.com/match/2075
https://understat.com/match/2074
https://understat.com/match/2073
https://understat.com/match/2072
https://understat.com/match/2071
https://understat.com/match/2070
https://understat.com/match/2069
https://understat.com/match/2068
https://understat.com/match/2067
https://understat.com/match/2066
https://understat.com/match/2063
https://understat.com/match/2062
https://understat.com/match/2061
https://understat.com/match/2064
https://understat.com/match/2065
https://understat.com/match/2060
https://understat.com/match/2059
https://understat.com/match/2058
https://understat.com/match/2057
https://understat.com/match/2056
https://understat.com/match/2055
https://understat.com/match/2054
https://understat.com/match/2053
https://understat.com/match/2052
https://understat.com/match/2051
https://understat.com/match/2050
https://understat.com/match/2049
https://understat.com/match/2048
https://understat.com/match/2047
https://understat.com/match/2046
https://un

https://understat.com/match/5923
https://understat.com/match/5922
https://understat.com/match/5917
https://understat.com/match/5919
https://understat.com/match/5918
https://understat.com/match/5920
https://understat.com/match/5921
https://understat.com/match/5916
https://understat.com/match/5915
https://understat.com/match/5914
https://understat.com/match/5913
https://understat.com/match/5912
https://understat.com/match/5908
https://understat.com/match/5909
https://understat.com/match/5907
https://understat.com/match/5910
https://understat.com/match/5911
https://understat.com/match/5906
https://understat.com/match/5905
https://understat.com/match/5904
https://understat.com/match/5903
https://understat.com/match/5900
https://understat.com/match/5899
https://understat.com/match/5898
https://understat.com/match/5901
https://understat.com/match/5902
https://understat.com/match/5897
https://understat.com/match/5896
https://understat.com/match/5895
https://understat.com/match/5894
https://un

https://understat.com/match/6122
https://understat.com/match/6121
https://understat.com/match/6120
https://understat.com/match/6117
https://understat.com/match/6116
https://understat.com/match/6118
https://understat.com/match/6119
https://understat.com/match/6115
https://understat.com/match/6114
https://understat.com/match/6184
https://understat.com/match/6183
https://understat.com/match/6182
https://understat.com/match/6181
https://understat.com/match/6179
https://understat.com/match/6178
https://understat.com/match/6177
https://understat.com/match/6180
https://understat.com/match/6176
https://understat.com/match/6175
https://understat.com/match/6174
https://understat.com/match/6173
https://understat.com/match/6170
https://understat.com/match/6168
https://understat.com/match/6169
https://understat.com/match/6167
https://understat.com/match/6171
https://understat.com/match/6172
https://understat.com/match/6166
https://understat.com/match/6165
https://understat.com/match/6164
https://un

In [21]:
# Debug inspection: show the last `row` bound by the scraping loop.
row

id                              4238
isResult                        True
h_id                             162
h_title                    SC Bastia
h_short_title                    BAS
a_id                             178
a_title                         Lyon
a_short_title                    LYO
goals_h                          0.0
goals_a                          3.0
xG_h                             0.0
xG_a                             0.0
datetime         2017-04-16 00:00:00
forecast_w                      0.33
forecast_d                      0.33
forecast_l                      0.33
season                     2016-2017
league                       Ligue_1
Name: 11696, dtype: object

In [25]:
# Debug check: does this row's 'datetime' equal midnight 2017-04-16
# (the same comparison the shots loop uses to skip the BAS - LYO fixture)?
row['datetime'] == pd.Timestamp(2017,4,16)

True

In [21]:
from tqdm.auto import tqdm

# Scrape per-match roster data for every played match and write dfs/rosters.csv.
base_url = 'https://understat.com/match/'

dfs = []

# iterrows() is a generator with no len(), so tqdm is given the row count
# explicitly; without it the bar can only show a running "Nit" counter.
for _, row in tqdm(matches_df.iterrows(), total=len(matches_df)):
    # Rows that are not flagged as having a result are not fetched.
    if row['isResult'] != True:
        continue

    match_id = row['id']
    url = base_url + str(match_id)

    # Fetch the page (cached by get_page) and parse it with BeautifulSoup
    res = get_page(url)
    soup = BeautifulSoup(res, 'lxml')
    scripts = soup.find_all('script')

    # The third <script> tag is used here (not the second, as for shots).
    # NOTE(review): presumably this is the roster payload - confirm.
    strings = scripts[2].string
    # The JSON text sits between the first "('" and the next "')";
    # the end marker is searched for only after the start marker.
    ind_start = strings.index("('") + 2
    ind_end = strings.index("')", ind_start)
    json_data = strings[ind_start:ind_end]
    # Turn the backslash escape sequences in the payload into real characters
    json_data = json_data.encode('utf8').decode('unicode_escape')

    # Parse the JSON text; 'h' and 'a' hold the home and away entries
    data = json.loads(json_data)
    data_home = data['h']
    data_away = data['a']

    # Each side is a mapping; keep only its values as a list of records
    data_home = list(data_home.values())
    data_away = list(data_away.values())

    # One frame per side, tagged with season/league/match id and the side
    df_h = pd.DataFrame(data_home)
    df_h['season'] = row['season']
    df_h['league'] = row['league']
    df_h['match_id'] = row['id']
    df_h['HomeAway'] = 'Home'

    df_a = pd.DataFrame(data_away)
    df_a['season'] = row['season']
    df_a['league'] = row['league']
    df_a['match_id'] = row['id']
    df_a['HomeAway'] = 'Away'

    df = pd.concat([df_h, df_a])

    dfs.append(df)

# Stack every match into one frame and persist it without the index column.
roster_df = pd.concat(dfs)
roster_df.to_csv('dfs/rosters.csv', index=False)

0it [00:00, ?it/s]