## EPL VAR Analysis
### Part 1 - Web Scraper

In [6]:
# Import necessary dependencies
from bs4 import BeautifulSoup
import urllib
import re
import time
import pandas as pd
import json
from datetime import datetime, date, timedelta

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [20]:
# ESPN articles on how VAR decisions affected each Premier League club, one URL per season
VAR_PAGE_2020_2021 = 'https://www.espn.com.sg/football/english-premier-league/story/4182135/how-var-decisions-affected-every-premier-league-club-in-2020-21'
VAR_PAGE_2019_2020 = 'https://www.espn.com/soccer/english-premier-league/story/3929823/how-var-decisions-have-affected-every-premier-league-club'

In [7]:
# Set wait times
waittime = 30
sleeptime = 0.5

# Shut down any WebDriver left over from a previous run of this cell.
# quit() ends the whole session and its chromedriver process, whereas close()
# only closes the current window and can leave the driver process running.
try:
    driver.quit()
except Exception:
    # Deliberately best-effort: on a fresh kernel `driver` is not defined yet
    # (NameError), and a stale session may already be gone. Neither case
    # should stop the cell from starting a new browser.
    pass

# Define target pages
homepage = VAR_PAGE_2020_2021

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver
driver = webdriver.Chrome(options=options)

# Get driver to retrieve homepage
driver.get(homepage)

### Get team statistics (2020-2021 data)

In [10]:
# Let element lookups keep polling for up to `waittime` before giving up
driver.implicitly_wait(waittime)

# Collect the <h2> headings of the article body. Each heading is split on its
# last space into a team name and a net score.
# NOTE(review): assumes every <h2> in the article body is a team heading -- confirm.
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3.
team_list_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/h2")
team_list = []
net_score_list = []

for team in team_list_elems:
    # Read .text once per element: each access is a separate call to the browser
    heading_parts = team.text.rsplit(' ', 1)
    team_name = heading_parts[0]
    net_score = heading_parts[-1]
    team_list.append(team_name)
    net_score_list.append(net_score)

In [11]:
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3.
all_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/p")

# Get general statistics for each team: keep only the paragraphs that hold a
# stats block, identified by the 'Overturns: ' label. The inner generator reads
# .text once per element, since each access is a separate call to the browser.
team_stats_elems = [
    text
    for text in (elem.text for elem in all_elems)
    if 'Overturns: ' in text
]

# The texts are already filtered above, so no second filtering pass is needed;
# a separate list is kept so both names stay independently usable.
team_stats_list = list(team_stats_elems)

In [12]:
# Combine the three parallel lists into one (team, net score, stats text) row per team.
# zip() stops at the shortest list, so a row exists only where all three lists have an entry.
data_tuples = [
    (name, score, stats_text)
    for name, score, stats_text in zip(team_list, net_score_list, team_stats_list)
]
team_stats_df_2021 = pd.DataFrame(
    data=data_tuples,
    columns=['team_name', 'net_score', 'stats_combined'],
)
team_stats_df_2021

Unnamed: 0,team_name,net_score,stats_combined
0,Burnley,5,Overturns: 6\nLeading to goals for: 1\nDisallo...
1,Chelsea,3,Overturns: 8\nRejected overturns: 1\nLeading t...
2,Everton,3,Overturns: 6\nRejected overturns: 1\nLeading t...
3,Fulham,3,Overturns: 14\nRejected overturns: 1\nLeading ...
4,Manchester City,3,Overturns: 7\nLeading to goals for: 2\nDisallo...
5,Brighton & Hove Albion,2,Overturns: 15\nRejected overturns: 1\nLeading ...
6,Aston Villa,1,Overturns: 10\nRejected overturns: 1\nLeading ...
7,Crystal Palace,1,Overturns: 10\nRejected overturns: 1\nLeading ...
8,Leeds,1,Overturns: 9\nLeading to goals for: 1\nDisallo...
9,Leicester City,1,Overturns: 13\nLeading to goals for: 4\nDisall...


In [13]:
# Pairs of (DataFrame column name, label as it appears in the scraped stats text).
stats_col_mapping = [
    # Overturn counts
    ('overturns_total', 'Overturns'),
    ('overturns_rejected', 'Rejected overturns'),
    # Goal-related decisions
    ('leading_to_goals_for', 'Leading to goals for'),
    ('leading_to_goals_against', 'Leading to goals against'),
    ('disallowed_goals_for', 'Disallowed goals for'),
    ('disallowed_goals_against', 'Disallowed goals against'),
    ('net_goal_score', 'Net goal score'),
    # Subjective decisions
    ('subj_decisions_for', 'Subjective decisions for'),
    ('subj_decisions_against', 'Subjective decisions against'),
    ('net_subjective_score', 'Net subjective score'),
    # Both penalty columns are fed by the same combined 'for / against' label
    ('penalties_for', 'Penalties for / against'),
    ('penalties_against', 'Penalties for / against'),
]

In [None]:
# Create columns
stats_col_list = [mapping[0] for mapping in stats_col_mapping]

# Placeholder for a statistic that a team's stats text does not list.
# It is a string (not int 0) so every stats column holds a single type,
# matching the scraped values, which are all strings.
_MISSING_STAT = '0'


def _parse_stats_block(stats_text):
    """Parse a newline-separated block of 'Label: value' lines into a dict.

    Args:
        stats_text: Text of one team's stats paragraph.

    Returns:
        dict mapping each label to its value string. Lines without a ': '
        separator are skipped; if a label repeats, the last occurrence wins.
    """
    parsed = {}
    for line in stats_text.split('\n'):
        parts = line.split(': ')
        if len(parts) < 2:
            continue  # not a 'Label: value' line
        parsed[parts[0]] = parts[1]
    return parsed


def _penalty_part(combined, position):
    """Return one side of a 'for / against' value.

    Args:
        combined: Value string such as '1 / 0', or the missing-stat placeholder.
        position: 0 for the 'for' side, 1 for the 'against' side.

    Returns:
        The requested side, or the missing-stat placeholder when that side
        is absent (e.g. the team has no penalties line at all).
    """
    parts = combined.split(' / ')
    return parts[position] if position < len(parts) else _MISSING_STAT


# Parse each team's stats text once, then build every column in a single pass.
# Assigning whole columns (rather than writing strings cell-by-cell into
# columns pre-filled with int 0) avoids pandas' incompatible-dtype setitem
# deprecation and keeps reads and writes aligned by row position.
_parsed_rows = [_parse_stats_block(text) for text in team_stats_df_2021['stats_combined']]

for col, label in stats_col_mapping:
    values = [row.get(label, _MISSING_STAT) for row in _parsed_rows]
    # 'Penalties for / against' feeds two columns; keep only the relevant side
    if col == 'penalties_for':
        values = [_penalty_part(value, 0) for value in values]
    elif col == 'penalties_against':
        values = [_penalty_part(value, 1) for value in values]
    team_stats_df_2021[col] = values

In [19]:
# Display the frame to inspect the per-statistic columns alongside the raw stats text
team_stats_df_2021

Unnamed: 0,team_name,net_score,stats_combined,overturns_total,overturns_rejected,leading_to_goals_for,leading_to_goals_against,disallowed_goals_for,disallowed_goals_against,net_goal_score,subj_decisions_for,subj_decisions_against,net_subjective_score,penalties_for,penalties_against
0,Burnley,5,Overturns: 6\nLeading to goals for: 1\nDisallo...,6,0,1,0,0,1,0,3,0,3,1,0
1,Chelsea,3,Overturns: 8\nRejected overturns: 1\nLeading t...,8,1,2,0,1,0,1,2,1,1,1,0
2,Everton,3,Overturns: 6\nRejected overturns: 1\nLeading t...,6,1,2,0,1,1,2,3,0,3,1,0
3,Fulham,3,Overturns: 14\nRejected overturns: 1\nLeading ...,14,1,1,1,2,5,3,5,5,0,2,1
4,Manchester City,3,Overturns: 7\nLeading to goals for: 2\nDisallo...,7,0,2,0,2,0,0,2,1,1,1,0
5,Brighton & Hove Albion,2,Overturns: 15\nRejected overturns: 1\nLeading ...,15,1,2,3,2,3,0,4,4,0,2,3
6,Aston Villa,1,Overturns: 10\nRejected overturns: 1\nLeading ...,10,1,0,0,3,3,0,4,3,1,0,0
7,Crystal Palace,1,Overturns: 10\nRejected overturns: 1\nLeading ...,10,1,2,1,0,1,0,4,4,0,2,1
8,Leeds,1,Overturns: 9\nLeading to goals for: 1\nDisallo...,9,0,1,1,2,1,-1,3,2,1,0,1
9,Leicester City,1,Overturns: 13\nLeading to goals for: 4\nDisall...,13,0,4,2,2,1,1,3,3,0,3,2


### Get incident statistics