### Scraping EPL data from understat

In [50]:
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import os.path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

In [61]:
## Setup chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")

# Set path to chromedriver as per your configuration
homedir = os.path.expanduser("~")
webdriver_service = Service(f"{homedir}/chromedriver/stable/chromedriver")

# Choose Chrome Browser
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# Get page
browser.get("https://understat.com/league/EPL/2022")

page_source = BeautifulSoup(browser.page_source, 'lxml')

# Get the data in the script
rawData = page_source.find_all('script')


<html class="wf-barlow-n5-inactive wf-anton-n4-inactive wf-inactive js flexbox flexboxlegacy canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths"><head>
<base href="https://understat.com/"/>
<title>EPL xG Table and Scorers for the 2022/2023 season | Understat.com</title>
<meta charset="utf-8"/>
<meta content="xG table of EPL standings and top scorers for the 2022/2023 season, also tables from past seasons and other European football leagues." name="description"/>
<meta content="EPL, xg table,scorers, expected goals table, season 2022/2023" name="Keywords"/>
<link href="apple-touch-icon.png" rel="apple-t

In [15]:
strings = rawData[5].string

ind_start = strings.index("('")+2
ind_end = strings.index("')")

json_data = strings[ind_start:ind_end]
json_data = json_data.encode('utf8').decode('unicode_escape')

epl_data_json = json.loads(json_data)

In [16]:
epl_data = pd.DataFrame.from_dict(epl_data_json, orient='index')
epl_data

Unnamed: 0,id,title,history
71,71,Aston Villa,"[{'h_a': 'a', 'xG': 0.488895, 'xGA': 0.588341,..."
72,72,Everton,"[{'h_a': 'h', 'xG': 0.541983, 'xGA': 1.92315, ..."
73,73,Bournemouth,"[{'h_a': 'h', 'xG': 0.588341, 'xGA': 0.488895,..."
74,74,Southampton,"[{'h_a': 'a', 'xG': 0.386546, 'xGA': 1.6172, '..."
75,75,Leicester,"[{'h_a': 'h', 'xG': 0.455695, 'xGA': 0.931067,..."
78,78,Crystal Palace,"[{'h_a': 'h', 'xG': 1.20637, 'xGA': 1.43601, '..."
80,80,Chelsea,"[{'h_a': 'a', 'xG': 1.92315, 'xGA': 0.541983, ..."
81,81,West Ham,"[{'h_a': 'h', 'xG': 0.246188, 'xGA': 2.58739, ..."
82,82,Tottenham,"[{'h_a': 'h', 'xG': 1.6172, 'xGA': 0.386546, '..."
83,83,Arsenal,"[{'h_a': 'a', 'xG': 1.43601, 'xGA': 1.20637, '..."


In [50]:
clubs = []

for (key, team) in epl_data_json.items():
    club = { "name": team['title'], "xG": 0, 'ppda': [] }
    total_ppda = 0
    
    for idx, match in enumerate(team['history']):
        club['xG'] += match["xG"]
        ppda = round(match['ppda']['att'], 2)/round(match['ppda']['def'], 2)
        temp_ppda = round(ppda, 2)
        print(match)
        if (idx > 0):
            ppda = round((ppda+total_ppda)/(idx+1), 2)
        total_ppda += temp_ppda

        club['ppda'].append(round(ppda, 2))
        # print(match)

    club['ppda-total'] = club['ppda'][-1]
    clubs.append(club)
    
print(clubs)


{'h_a': 'a', 'xG': 0.488895, 'xGA': 0.588341, 'npxG': 0.488895, 'npxGA': 0.588341, 'ppda': {'att': 152, 'def': 23}, 'ppda_allowed': {'att': 292, 'def': 25}, 'deep': 11, 'deep_allowed': 3, 'scored': 0, 'missed': 2, 'xpts': 1.1566999999999998, 'result': 'l', 'date': '2022-08-06 14:00:00', 'wins': 0, 'draws': 0, 'loses': 1, 'pts': 0, 'npxGD': -0.09944599999999998}
{'h_a': 'h', 'xG': 2.66696, 'xGA': 1.37507, 'npxG': 2.66696, 'npxGA': 1.37507, 'ppda': {'att': 214, 'def': 23}, 'ppda_allowed': {'att': 242, 'def': 13}, 'deep': 13, 'deep_allowed': 4, 'scored': 2, 'missed': 1, 'xpts': 2.3254, 'result': 'w', 'date': '2022-08-13 11:30:00', 'wins': 1, 'draws': 0, 'loses': 0, 'pts': 3, 'npxGD': 1.29189}
{'h_a': 'a', 'xG': 0.803828, 'xGA': 2.91137, 'npxG': 0.803828, 'npxGA': 2.00662, 'ppda': {'att': 252, 'def': 34}, 'ppda_allowed': {'att': 251, 'def': 18}, 'deep': 3, 'deep_allowed': 8, 'scored': 1, 'missed': 3, 'xpts': 0.2122, 'result': 'l', 'date': '2022-08-20 14:00:00', 'wins': 0, 'draws': 0, 'lose