In [1]:
!pip install -- update selenium



In [2]:
# Import the libraries we need
import pandas as pd
import re
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime

In [3]:
# set url for understat
xg_url = 'https://understat.com/league/EPL'

# download the webpage
xg_data = requests.get(xg_url)

# get the html code for the webpage
xg_html = xg_data.content

# parse the html using bs4
soup = BeautifulSoup(xg_html, 'lxml')

print(soup.title)

<title>EPL xG Table and Scorers for the 2021/2022 season | Understat.com</title>


In [4]:
#set up the selenium driver
options = webdriver.ChromeOptions()

# run without opening the browser
options.add_argument('headless')

driver = webdriver.Chrome(executable_path = '/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/chrome/chromedriver', options = options)

# navigate to the page url
driver.get(xg_url)

# get the html code from the webpage
soup = BeautifulSoup(driver.page_source, 'lxml')

print(soup.title)

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 85


In [None]:
# Get the table headers using 3 chained find operations
# 1. Find the div containing the table (div class = chemp jTable)
# 2. Find the table within that div
# 3. Find all 'th' elements where class = sort
headers = soup.find('div', attrs={'class':'chemp margin-top jTable'}).find('table').find_all('th',attrs={'class':'sort'})
 
headers

In [None]:
# iterate over headers, get the text from each item, and add the results to headers_list
headers_list=[]
for header in headers:
    headers_list.append(header.get_text(strip=True))
print(headers_list)

In [None]:
body = soup.find('div', attrs={'class':'chemp margin-top jTable'}).table.tbody

#create a master list for row data
all_rows_list = []

#for each row in the table body
for tr in body.find_all('tr'):
    #get data from each cell in the row
    row = tr.find_all('td')
    #create list to save current row data to
    current_row = []
    # for each item in the row variable
    for item in row:
    #add the text data to the current_row list
        current_row.append(item.get_text(strip=True))
    
    #add the current row data to the master list
    all_rows_list.append(current_row)
        
# create a dataframe where the rows = all_rows_list and columns = headers_list
team_xg_df = pd.DataFrame(all_rows_list, columns=headers_list)
team_xg_df

In [None]:
team_xg_df['xG'] = team_xg_df['xG'].str.split('[- +]').str.get(0)

team_xg_df['xGA'] = team_xg_df['xGA'].str.split('[- +]').str.get(0)

team_xg_df['xPTS'] = team_xg_df['xPTS'].str.split('[- +]').str.get(0)

team_xg_df.head()

In [None]:
# calculate goal difference (GD)
team_xg_df['GD'] = team_xg_df['G'].astype(int) - team_xg_df['GA'].astype(int)

#calculate expected goal difference (xGD)
team_xg_df['xGD'] = team_xg_df['xG'].astype(float) - team_xg_df['xGA'].astype(float)

team_xg_df.head()

In [None]:
# calculate GD vs. xGD
team_xg_df['GD_vs_xGD'] = team_xg_df['GD'].astype(float) - team_xg_df['xGD'].astype(float)

#sort values 
team_xg_df = team_xg_df.sort_values(by = ['GD_vs_xGD'], ascending = False)

# import matplotlib library for plotting
from matplotlib import pyplot as plt

# create horizontal bar chart to visualize teams that have been overperforming or underperforming based on GD vs xGD
plt.barh(team_xg_df['Team'], team_xg_df['GD_vs_xGD'])

plt.title('GD vs. xGD')

# show the plot
plt.show()

In [None]:
# list desired cols
cols = ['G', 'GA', 'PTS', 'xG', 'xGA', 'xPTS', 'GD', 'xGD']

# create a list of new columns for per game data
for col in cols:
    team_xg_df['%s_pg' % col] = team_xg_df[col].astype(float)/team_xg_df['M'].astype(float)
    
team_xg_df.head()

In [None]:
# sort teams by xG per game
team_xg_df = team_xg_df.sort_values(by = ['xG_pg'], ascending = False)

# import matplotlib library for plotting
from matplotlib import pyplot as plt

# create horizontal bar chart to visualize teams that have been overperforming or underperforming based on GD vs xGD
plt.barh(team_xg_df['Team'], team_xg_df['xG_pg'])

plt.title('xG_pg')

# show the plot
plt.show()

In [None]:
# calculate goals against vs. xGA
team_xg_df['GA_vs_xGA'] = team_xg_df['GA'].astype(float) - team_xg_df['xGA'].astype(float)

#sort values 
team_xg_df = team_xg_df.sort_values(by = ['GA_vs_xGA'], ascending = True)

# create horizontal bar chart to visualize teams that have been overperforming or underperforming based on GD vs xGD
plt.barh(team_xg_df['Team'], team_xg_df['GA_vs_xGA'])

plt.title('GA vs. xGA')

# show the plot
plt.show()

In [None]:
# sort the teams by goals against
GA_sort = team_xg_df.sort_values(by = ['GA'], ascending = False)

GA_sort

In [None]:
# sort the teams by xgoals against per game
xGApg_sort = team_xg_df.sort_values(by = ['xGA_pg'], ascending = False)

xGApg_sort

In [None]:
# sort the teams by xgoals against per game
xGpg_sort = team_xg_df.sort_values(by = ['xG_pg'], ascending = False)

xGpg_sort