In [10]:
import requests
from bs4 import BeautifulSoup
import json
from typing import List

In [55]:
URL = 'https://understat.com/league/EPL/2021'

In [21]:
def get_understat_data(url: str) -> List:
    """
    Use bs4 and requests to get the json data that is on the webpage.
    We then need 
    """
    result = requests.get(url)
    src = result.content
    soup = BeautifulSoup(src, 'html.parser')
    json_data = soup.find_all("script")

    Game_Table_Player_data = []
    for json_string in json_data[1:4]:
        """
        Convert the entry into a string, then find the first and last
        quotemarks as this finds our json, then put that into a list.
        """
        
        json_string = str(json_string)
        s_1 = json_string.find('\'')
        s_2 = json_string.rfind('\'')
        json_string = json_string[s_1:s_2 + 1]
        Game_Table_Player_data.append(json_string)

    return Game_Table_Player_data

def load_json(json_list : List) -> dict:
    return {['Game data', 'Team data', 'Player data'][i] : 
           json.loads(json_list[i].encode('ascii').decode('unicode-escape')[1:-1]) for i in range(len(json_list))}

In [26]:
import pandas as pd

In [58]:
def game_dict_to_df(game_dict : dict) -> pd.DataFrame:
    """
    Encode/decode the game data to give us a list of dictionaries,
    We then clean up each dictionary and create a dataframe out of
    them all.
    """
    
    game_data = game_dict['Game data']

    df = pd.DataFrame()
    for game in game_data:

        #Rename titles found within our json.
        game['h'] = game['h']['title']
        game['a'] = game['a']['title']
        game['Home goals'] = game['goals']['h']
        game['Away goals'] = game['goals']['a']
        game['xG Home'] = game['xG']['h']
        game['xG Away'] = game['xG']['a']
        game['Probability Home Win'] = game['forecast']['w']
        game['Probability Draw'] = game['forecast']['d']
        game['Probability Away Win'] = game['forecast']['l']

        #delete entries we no longer need
        del game['goals']
        del game['xG']
        del game['forecast']

        #add to main df
        df = pd.concat([df, pd.DataFrame.from_dict([game])], axis = 0, ignore_index = True)

    return df

In [90]:
def team_dict_to_df(team_dict : dict) -> pd.DataFrame:
    """
    Encode/decode the team data to give us a list of dictionaries,
    We then clean up each dictionary and create a dataframe out of
    them all.
    """
    
    team_data = team_dict['Team data']
    for key in list(team_data.keys()):
        team_data[team_data[key]['title']] = team_data[key]
        del team_data[key]
    for team in list(team_data.keys()):
        team_data[team] = team_data[team]['history']

    
    team_data_dict = {}
    for team in list(team_data.keys()):

        #the dictionaries all represent games, hence we call these games again.
        team_df = pd.DataFrame()
        for game in team_data[team]:

            #Rename titles found within our json.
            game['ppda att'] = game['ppda']['att']
            game['ppda def'] = game['ppda']['def']
            game['ppda allowed att'] = game['ppda_allowed']['att']
            game['ppda allowed def'] = game['ppda_allowed']['def']

            #delete entries we no longer need
            del game['ppda']
            del game['ppda_allowed']

            #add to main df
            team_df = pd.concat([team_df, pd.DataFrame.from_dict([game])], axis = 0)
        team_data_dict[team] = team_df

    return team_data_dict

In [84]:
for team in list(team_data.keys()):
        team_data[team] = team_data[team]['history']

    df = pd.DataFrame()
    #the dictionaries all represent games, hence we call these games again.
    print(type(team_data))
    for game in team_data:
        print(game.keys())

        #Rename titles found within our json.
        game['ppda att'] = game['ppda']['att']
        game['ppda def'] = game['ppda']['def']
        game['ppda allowed att'] = game['ppda_allowed']['att']
        game['ppda allowed def'] = game['ppda_allowed']['def']

        #delete entries we no longer need
        del game['ppda']
        del game['ppda_allowed']

        #add to main df
        df = pd.concat([df, pd.DataFrame.from_dict([game])], axis = 0)

    return df

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 4)

In [91]:
EPL_data = get_understat_data(URL)
EPL_gtp_data = load_json(EPL_data)
#EPL_game_data = game_dict_to_df(EPL_gtp_data)
EPL_team_data = team_dict_to_df(EPL_gtp_data)

In [69]:
EPL_gtp_data['Team data']

{'71': {'id': '71',
  'title': 'Aston Villa',
  'history': [{'h_a': 'a',
    'xG': 1.13718,
    'xGA': 1.35036,
    'npxG': 0.376011,
    'npxGA': 1.35036,
    'ppda': {'att': 182, 'def': 28},
    'ppda_allowed': {'att': 303, 'def': 21},
    'deep': 6,
    'deep_allowed': 4,
    'scored': 2,
    'missed': 3,
    'xpts': 1.1959,
    'result': 'l',
    'date': '2021-08-14 14:00:00',
    'wins': 0,
    'draws': 0,
    'loses': 1,
    'pts': 0,
    'npxGD': -0.974349},
   {'h_a': 'h',
    'xG': 1.18426,
    'xGA': 0.63163,
    'npxG': 0.423091,
    'npxGA': 0.63163,
    'ppda': {'att': 202, 'def': 14},
    'ppda_allowed': {'att': 150, 'def': 17},
    'deep': 5,
    'deep_allowed': 4,
    'scored': 2,
    'missed': 0,
    'xpts': 1.9441,
    'result': 'w',
    'date': '2021-08-21 14:00:00',
    'wins': 1,
    'draws': 0,
    'loses': 0,
    'pts': 3,
    'npxGD': -0.20853900000000003},
   {'h_a': 'h',
    'xG': 0.431464,
    'xGA': 1.13312,
    'npxG': 0.431464,
    'npxGA': 1.13312,
    'p