In [4]:
import requests
from bs4 import BeautifulSoup
import json
from typing import List

In [5]:
# Understat EPL league page with no season suffix; the pipeline cell further
# down fetches the season-specific `URL` constant instead.
url = 'https://understat.com/league/EPL'

In [7]:
def _extract_quoted(text: str) -> str:
    """Return the span of *text* from its first to its last single quote, inclusive."""
    first = text.find('\'')
    last = text.rfind('\'')
    return text[first:last + 1]

def get_understat_data(url: str, timeout: float = 30.0) -> List[str]:
    """
    Download a page and pull the quoted payload out of its 2nd-4th <script> tags.

    The page is fetched with requests and parsed with bs4. For each of the
    script tags at positions 1, 2 and 3, the tag is converted to a string and
    everything between its first and last single quote (quotes included) is
    kept. The payloads are still escaped text at this point; load_json is the
    function that unescapes and parses them.

    Parameters
    ----------
    url : address of the page to download.
    timeout : seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    A list of up to three strings, in page order. load_json labels them
    'Game data', 'Team data' and 'Player data' respectively.

    Raises
    ------
    requests.HTTPError if the server answers with a 4xx/5xx status, rather
    than silently parsing an error page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    scripts = soup.find_all("script")

    # NOTE(review): positions 1-3 are assumed to hold the three data payloads;
    # confirm against the current page layout if the output looks wrong.
    return [_extract_quoted(str(script)) for script in scripts[1:4]]

def load_json(json_list : List) -> dict:
    """
    Parse the quoted, escaped payload strings into Python objects.

    Each entry is unescaped (ascii -> unicode-escape), has its first and last
    character stripped (the surrounding quotes), and is then parsed as JSON.
    Entries are labelled by position: 'Game data', 'Team data', 'Player data'.
    """
    labels = ['Game data', 'Team data', 'Player data']

    parsed = {}
    for position, raw in enumerate(json_list):
        # Look the label up first so a 4th entry fails on the label, not the payload.
        label = labels[position]
        unescaped = raw.encode('ascii').decode('unicode-escape')
        parsed[label] = json.loads(unescaped[1:-1])

    return parsed

In [13]:
import pandas as pd

def game_dict_to_df(game_dict : dict) -> pd.DataFrame:
    """
    Flatten the list of game dictionaries under 'Game data' into a DataFrame.

    For every game the nested 'h' and 'a' entries are replaced by their
    'title', and the nested 'goals', 'xG' and 'forecast' entries are expanded
    into flat columns ('Home goals', 'Away goals', 'xG Home', 'xG Away',
    'Probability Home Win', 'Probability Draw', 'Probability Away Win').
    All other keys are carried over unchanged, in their original order, with
    the new columns appended at the end.

    The input is not modified, so the function can safely be called again on
    the same dictionary (e.g. when re-running a notebook cell).

    Returns a DataFrame with one row per game and a 0..n-1 index.
    Raises KeyError if a game lacks one of the nested entries listed above.
    """
    nested_keys = ('goals', 'xG', 'forecast')

    rows = []
    for game in game_dict['Game data']:
        # Work on a fresh dict so the caller's data is left untouched.
        row = {key: value for key, value in game.items() if key not in nested_keys}

        # Assigning to existing keys keeps 'h' and 'a' in their original position.
        row['h'] = game['h']['title']
        row['a'] = game['a']['title']
        row['Home goals'] = game['goals']['h']
        row['Away goals'] = game['goals']['a']
        row['xG Home'] = game['xG']['h']
        row['xG Away'] = game['xG']['a']
        row['Probability Home Win'] = game['forecast']['w']
        row['Probability Draw'] = game['forecast']['d']
        row['Probability Away Win'] = game['forecast']['l']

        rows.append(row)

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows)

In [14]:
def team_dict_to_df(team_dict : dict) -> dict:
    """
    Build one DataFrame per team from the dictionaries under 'Team data'.

    Each team entry supplies a 'title' and a 'history' list of per-game
    dictionaries. For every game the nested 'ppda' and 'ppda_allowed' entries
    are expanded into the flat columns 'ppda att', 'ppda def',
    'ppda allowed att' and 'ppda allowed def'; all other keys are carried over
    unchanged, in their original order, with the new columns appended last.

    The input is not modified, so the function can safely be called again on
    the same dictionary (e.g. when re-running a notebook cell).

    Returns a dict mapping team title -> DataFrame with one row per game.
    Rows are indexed 0..n-1, matching the game and player frames, rather than
    every row carrying the index label 0.
    Raises KeyError if a team lacks 'title'/'history' or a game lacks
    'ppda'/'ppda_allowed'.
    """
    nested_keys = ('ppda', 'ppda_allowed')

    team_frames = {}
    for team in team_dict['Team data'].values():

        rows = []
        for game in team['history']:
            # Work on a fresh dict so the caller's data is left untouched.
            row = {key: value for key, value in game.items() if key not in nested_keys}

            row['ppda att'] = game['ppda']['att']
            row['ppda def'] = game['ppda']['def']
            row['ppda allowed att'] = game['ppda_allowed']['att']
            row['ppda allowed def'] = game['ppda_allowed']['def']

            rows.append(row)

        # Build each team's frame once instead of concatenating row by row.
        team_frames[team['title']] = pd.DataFrame(rows)

    return team_frames

In [23]:
def player_dict_to_df(player_dict : dict) -> pd.DataFrame:
    """
    Turn the list of player dictionaries under 'Player data' into a DataFrame.

    Each dictionary becomes one row; no fields are renamed or dropped.
    Returns a DataFrame with a 0..n-1 index (empty if there are no players).
    """
    player_data = player_dict['Player data']

    # Build the frame once: growing it with pd.concat inside a loop copies
    # every previous row on each iteration.
    return pd.DataFrame(list(player_data))

In [24]:
# Season-specific league page fetched by the pipeline cell.
URL = 'https://understat.com/league/EPL/2021'

# Human-readable column names, in a fixed order.
# NOTE(review): the first 21 entries presumably relabel the 21 raw per-team
# columns one-for-one; confirm against wherever OG_cols is applied.
OG_cols = [
    # per-match values
    'Home/Away', 'xG', 'xGA',
    'non-penalty xG', 'non-penalty xG Against',
    'Deep', 'Deep Allowed',
    'scored', 'conceded',
    'Expected Points', 'Result', 'date',
    'Wins', 'Draws', 'Loses', 'Points',
    'Non-penalty xG difference',
    'ppda attack', 'ppda defence',
    'ppda allowed att', 'ppda allowed def',
    # "cum" columns
    'Wins cum', 'Loses cum', 'Draws cum', 'Points cum',
    'npxG difference cum',
    'xG cum', 'xGA cum', 'npxG cum', 'npxGA cum',
    'scored cum', 'conceded cum',
    'Expected Points cum',
    # opponent and outcome probabilities
    'Opponent', 'Prob Win', 'Prob Draw', 'Prob Lose',
    # "last 5" columns
    'Wins in last 5', 'Loses in last 5', 'Draws in last 5', 'Points in last 5',
    'npxG difference last 5',
    'xG last 5', 'xGA last 5', 'npxG last 5', 'npxGA last 5',
    'scored last 5', 'conceded last 5',
    'Expected Points last 5',
]

# Raw column names, as they appear in the per-team frames.
# NOTE(review): presumably the columns to accumulate over a season - confirm at the use site.
CUMSUM_LIST = [
    'xG', 'xGA', 'npxG', 'npxGA',
    'scored', 'missed', 'xpts',
    'wins', 'draws', 'loses', 'pts',
    'npxGD',
]

In [28]:
# Download the league page and extract the raw payload strings, then parse
# them into a dict keyed by 'Game data', 'Team data' and 'Player data'.
league_jsonstr_data = get_understat_data(URL)
league_gtp_data = load_json(league_jsonstr_data)

# Build the game and player DataFrames, plus the per-team DataFrames
# (team_dict_to_df returns a dict keyed by team title, not a single frame).
league_game_data = game_dict_to_df(league_gtp_data)
league_team_data = team_dict_to_df(league_gtp_data)
league_player_data = player_dict_to_df(league_gtp_data)

In [30]:
# Inspect the column names of one team's per-match DataFrame.
league_team_data['Arsenal'].columns

Index(['h_a', 'xG', 'xGA', 'npxG', 'npxGA', 'deep', 'deep_allowed', 'scored',
       'missed', 'xpts', 'result', 'date', 'wins', 'draws', 'loses', 'pts',
       'npxGD', 'ppda att', 'ppda def', 'ppda allowed att',
       'ppda allowed def'],
      dtype='object')