# English Premier League VAR Analysis
## Part 1 - Data Acquisition
___

#### Data Source
- https://www.espn.com.sg/football/english-premier-league/story/4182135/how-var-decisions-affected-every-premier-league-club-in-2020-21
- https://www.espn.com/soccer/english-premier-league/story/3929823/how-var-decisions-have-affected-every-premier-league-club

In [1]:
# Import necessary dependencies
from bs4 import BeautifulSoup
import urllib
import re
import time
import pandas as pd
import json
from datetime import datetime, date, timedelta
import numpy as np
import pandas as pd
from datetime import datetime as dt
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Show up to 5000 rows when a DataFrame is rendered in the notebook
pd.options.display.max_rows = 5000

# ESPN articles holding the per-club VAR summaries for each season
VAR_PAGE_2020_2021 = 'https://www.espn.com.sg/football/english-premier-league/story/4182135/how-var-decisions-affected-every-premier-league-club-in-2020-21'
VAR_PAGE_2019_2020 = 'https://www.espn.com/soccer/english-premier-league/story/3929823/how-var-decisions-have-affected-every-premier-league-club'

# Set wait times
waittime = 30    # handed to driver.implicitly_wait() after the first page load
sleeptime = 0.5  # not referenced by the cells visible in this notebook

# Shut down any WebDriver left over from a previous run of this cell.
# quit() ends the whole browser session; close() would only close the
# current window and leave the old driver session running.
try:
    driver.quit()
except Exception:
    # Best effort: `driver` does not exist on the first run, and a stale
    # session may already be gone.
    pass

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver
driver = webdriver.Chrome(options=options)

___
### 1. Get Team Statistics

#### (i) 2020/2021

In [2]:
# Load the 2020/2021 article
driver.get(VAR_PAGE_2020_2021)

# Let element lookups keep polling for up to `waittime` seconds
driver.implicitly_wait(waittime)

# Every <h2> directly under the article body reads "<team name> <net score>".
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3.
team_list_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/h2")
team_list = []
net_score_list = []

for team in team_list_elems:
    heading_parts = team.text.rsplit(' ', 1)  # read the heading text once
    team_name = heading_parts[0]
    net_score = heading_parts[-1]
    team_list.append(team_name)
    net_score_list.append(net_score)

all_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/p")

# Keep only the per-team summary paragraphs (those containing "Overturns: ").
# Each paragraph's text is fetched from the browser once.
team_stats_elems = [
    paragraph_text
    for paragraph_text in (elem.text for elem in all_elems)
    if 'Overturns: ' in paragraph_text
]
team_stats_list = list(team_stats_elems)

# NOTE: zip() stops at the shortest list, so headings and summaries are
# paired purely by position.
data_tuples = list(zip(team_list, net_score_list, team_stats_list))
team_stats_df_2021 = pd.DataFrame(data_tuples, columns=['team_name', 'net_score', 'stats_combined'])
team_stats_df_2021

Unnamed: 0,team_name,net_score,stats_combined
0,Burnley,4,Overturns: 7\nLeading to goals for: 1\nDisallo...
1,Everton,4,Overturns: 7\nRejected overturns: 1\nLeading t...
2,Chelsea,3,Overturns: 11\nRejected overturns: 1\nLeading ...
3,Fulham,3,Overturns: 14\nRejected overturns: 1\nLeading ...
4,Sheffield United,3,Overturns: 11\nLeading to goals for: 3\nDisall...
5,Aston Villa,2,Overturns: 11\nRejected overturns: 1\nLeading ...
6,Manchester City,2,Overturns: 8\nLeading to goals for: 2\nDisallo...
7,Brighton & Hove Albion,1,Overturns: 16\nRejected overturns: 1\nLeading ...
8,Crystal Palace,1,Overturns: 10\nRejected overturns: 1\nLeading ...
9,Leeds,1,Overturns: 9\nLeading to goals for: 1\nDisallo...


In [3]:
# (DataFrame column name, label used in the article's per-team summary) pairs.
# The combined 'Penalties for / against' label appears twice on purpose: both
# penalty columns first receive the raw "x / y" value, and the 2020/2021
# parsing code then splits it into its two halves.
stats_col_mapping = [
    ('overturns_total', 'Overturns'),
    ('overturns_rejected', 'Rejected overturns'),
    ('leading_to_goals_for', 'Leading to goals for'),
    ('leading_to_goals_against', 'Leading to goals against'),
    ('disallowed_goals_for', 'Disallowed goals for'),
    ('disallowed_goals_against', 'Disallowed goals against'),
    ('net_goal_score', 'Net goal score'),
    ('subj_decisions_for', 'Subjective decisions for'),
    ('subj_decisions_against', 'Subjective decisions against'),
    ('net_subjective_score', 'Net subjective score'),
    ('penalties_for', 'Penalties for / against'),
    ('penalties_against', 'Penalties for / against'),
]

In [4]:
# Create one column per statistic.
# The default is the string '0' (not the integer 0) because every scraped
# value is a string: this keeps each column single-typed and lets the
# string operations below work for teams whose summary omits a statistic.
stats_col_list = [mapping[0] for mapping in stats_col_mapping]

for col in stats_col_list:
    team_stats_df_2021[col] = '0'

# Fill the columns from the "<label>: <value>" lines of each team summary
for i in team_stats_df_2021.index:
    stats_info = team_stats_df_2021.loc[i, 'stats_combined']
    for line in stats_info.split('\n'):
        parts = line.split(': ')
        if len(parts) < 2:
            continue  # not a "<label>: <value>" line, nothing to record
        key, value = parts[0], parts[1]
        for mapping in stats_col_mapping:
            if mapping[1] == key:
                team_stats_df_2021.loc[i, mapping[0]] = value

# Both penalty columns now hold the raw "for / against" text; split it into
# its two halves. A missing half (statistic absent from the summary) falls
# back to '0' instead of raising.
penalty_parts = (team_stats_df_2021['penalties_for']
                 .str.split(' / ', expand=True)
                 .reindex(columns=[0, 1])
                 .fillna('0'))
team_stats_df_2021['penalties_for'] = penalty_parts[0]
team_stats_df_2021['penalties_against'] = penalty_parts[1]

# Add year column
team_stats_df_2021['year'] = '2020/2021'

# Drop stats_combined column
team_stats_df_2021.drop(columns=['stats_combined'], inplace=True)

team_stats_df_2021

Unnamed: 0,team_name,net_score,overturns_total,overturns_rejected,leading_to_goals_for,leading_to_goals_against,disallowed_goals_for,disallowed_goals_against,net_goal_score,subj_decisions_for,subj_decisions_against,net_subjective_score,penalties_for,penalties_against,year
0,Burnley,4,7,0,1,0,1,1,-1,3,0,3,1,0,2020/2021
1,Everton,4,7,1,2,0,1,1,2,3,0,3,1,0,2020/2021
2,Chelsea,3,11,1,2,0,2,0,0,3,1,2,1,0,2020/2021
3,Fulham,3,14,1,1,1,2,5,3,5,5,0,2,1,2020/2021
4,Sheffield United,3,11,0,3,1,0,3,5,2,3,-1,3,1,2020/2021
5,Aston Villa,2,11,1,0,0,3,3,0,5,3,2,0,0,2020/2021
6,Manchester City,2,8,0,2,0,2,0,0,2,2,0,1,0,2020/2021
7,Brighton & Hove Albion,1,16,1,2,3,3,3,-1,4,4,0,2,3,2020/2021
8,Crystal Palace,1,10,1,2,1,0,1,0,4,4,0,2,1,2020/2021
9,Leeds,1,9,0,1,1,2,1,-1,3,2,1,0,1,2020/2021


### 1. Get Team Statistics  

#### (ii) 2019/2020

In [5]:
# Direct driver to 2019-2020 page
driver.get(VAR_PAGE_2019_2020)

# On this page the team headings sit under <video1> and <video1>/<video2>
# inside the article body; collect them from both places.
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3.
team_list_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/video1/h2") + \
                    driver.find_elements(By.XPATH, "//div[@class='article-body']/video1/video2/h2")

team_list = []
net_score_list = []

# Each heading reads "<team name> <net score>"
for team in team_list_elems:
    heading_parts = team.text.rsplit(' ', 1)  # read the heading text once
    team_name = heading_parts[0]
    net_score = heading_parts[-1]
    team_list.append(team_name)
    net_score_list.append(net_score)

all_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/video1/p") + \
            driver.find_elements(By.XPATH, "//div[@class='article-body']/video1/video2/p")

# Keep only the per-team summary paragraphs (those containing "Overturns: ").
# Each paragraph's text is fetched from the browser once.
team_stats_elems = [
    paragraph_text
    for paragraph_text in (elem.text for elem in all_elems)
    if 'Overturns: ' in paragraph_text
]
team_stats_list = list(team_stats_elems)

# NOTE: zip() stops at the shortest list, so headings and summaries are
# paired purely by position.
data_tuples = list(zip(team_list, net_score_list, team_stats_list))
team_stats_df_1920 = pd.DataFrame(data_tuples, columns=['team_name', 'net_score', 'stats_combined'])

# Create one column per statistic. The default is the string '0' (not the
# integer 0) because every scraped value is a string; this keeps each column
# single-typed.
stats_col_list = [mapping[0] for mapping in stats_col_mapping]

for col in stats_col_list:
    team_stats_df_1920[col] = '0'

# Fill the columns from the "<label>: <value>" lines of each team summary
for i in team_stats_df_1920.index:
    stats_info = team_stats_df_1920.loc[i, 'stats_combined']
    for line in stats_info.split('\n'):
        parts = line.split(': ')
        if len(parts) < 2:
            continue  # not a "<label>: <value>" line, nothing to record
        key, value = parts[0], parts[1]
        for mapping in stats_col_mapping:
            if mapping[1] == key:
                team_stats_df_1920.loc[i, mapping[0]] = value

The 2019/2020 data does not include `Penalties for / against` information, so both penalty columns are left blank for that season.

In [6]:
# Blank out both penalty columns for this season
for penalty_col in ('penalties_for', 'penalties_against'):
    team_stats_df_1920[penalty_col] = ''

# Tag every row with its season
team_stats_df_1920['year'] = '2019/2020'

# The raw summary text is no longer needed once it has been parsed
del team_stats_df_1920['stats_combined']
team_stats_df_1920

Unnamed: 0,team_name,net_score,overturns_total,overturns_rejected,leading_to_goals_for,leading_to_goals_against,disallowed_goals_for,disallowed_goals_against,net_goal_score,subj_decisions_for,subj_decisions_against,net_subjective_score,penalties_for,penalties_against,year
0,Brighton & Hove Albion,8,12,0,2,0,2,7,7,2,0,2,,,2019/2020
1,Manchester United,7,13,0,1,2,0,7,6,6,2,4,,,2019/2020
2,Crystal Palace,4,12,0,3,0,4,1,2,6,2,4,,,2019/2020
3,Burnley,3,11,0,2,1,3,4,2,4,2,2,,,2019/2020
4,Newcastle,3,3,0,1,0,0,0,1,2,0,2,,,2019/2020
5,Southampton,3,13,0,0,1,0,7,6,1,4,-3,,,2019/2020
6,Liverpool,2,8,0,1,0,3,4,2,1,1,0,,,2019/2020
7,Leicester City,1,15,0,1,1,3,4,1,3,3,0,,,2019/2020
8,Tottenham Hotspur,1,15,0,1,1,4,6,2,3,3,0,,,2019/2020
9,Manchester City,0,16,0,3,2,4,2,-1,4,4,0,,,2019/2020


### 1. Get team statistics

#### (iii) Combined: 2019/2020 and 2020/2021

In [7]:
# Stack both seasons (2019/2020 first) under a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
team_stats_df = pd.concat([team_stats_df_1920, team_stats_df_2021], ignore_index=True)

# For net score related columns, remove + sign.
# regex=False treats '+' as a literal character; as a regular expression a
# lone '+' is invalid. astype(str) guards against non-string cells (such as
# an integer default), which .str.replace would otherwise turn into NaN.
for col in team_stats_df.filter(like='net_').columns:
    team_stats_df[col] = team_stats_df[col].astype(str).str.replace('+', '', regex=False)

team_stats_df

Unnamed: 0,team_name,net_score,overturns_total,overturns_rejected,leading_to_goals_for,leading_to_goals_against,disallowed_goals_for,disallowed_goals_against,net_goal_score,subj_decisions_for,subj_decisions_against,net_subjective_score,penalties_for,penalties_against,year
0,Brighton & Hove Albion,8,12,0,2,0,2,7,7,2,0,2,,,2019/2020
1,Manchester United,7,13,0,1,2,0,7,6,6,2,4,,,2019/2020
2,Crystal Palace,4,12,0,3,0,4,1,2,6,2,4,,,2019/2020
3,Burnley,3,11,0,2,1,3,4,2,4,2,2,,,2019/2020
4,Newcastle,3,3,0,1,0,0,0,1,2,0,2,,,2019/2020
5,Southampton,3,13,0,0,1,0,7,6,1,4,-3,,,2019/2020
6,Liverpool,2,8,0,1,0,3,4,2,1,1,0,,,2019/2020
7,Leicester City,1,15,0,1,1,3,4,1,3,3,0,,,2019/2020
8,Tottenham Hotspur,1,15,0,1,1,4,6,2,3,3,0,,,2019/2020
9,Manchester City,0,16,0,3,2,4,2,-1,4,4,0,,,2019/2020


In [8]:
# Write the combined team statistics to a CSV stamped with today's date (YYYYMMDD)
export_date = f'{dt.today():%Y%m%d}'
team_stats_csv_path = f'./data/EPL_VAR_Team_Stats_Raw_{export_date}.csv'
team_stats_df.to_csv(team_stats_csv_path, index=False)

___
### 2. Get incident data  

#### (i) 2020 - 2021 Season

##### Define custom functions

In [9]:
def get_opposition_team(x):
    '''
    Get name of opposition team

    x is the full match text, e.g. "Game: Man City (A; Nov. 28)\\nIncident: ...".
    Returns the text between "Game: " and the following " (", stripped of
    surrounding whitespace, or '' when x is not a string or holds no such
    header (the same fallback as get_home_or_away / get_match_date).
    '''
    if not isinstance(x, str):
        return ''
    # Raw string: "\(" is not a valid escape in a normal string literal
    match = re.search(r"Game: (.*?) \(", x)
    if match is None:
        return ''
    return match.group(1).strip()

def get_home_or_away(x):
    '''
    Get string on match played home or away

    x is the full match text, e.g. "Game: Man City (A; Nov. 28)...".
    Looks at the text between the first "(" and the next ";":
    returns 'Away' when it is "A" and 'Home' for anything else.
    Returns '' when x is not a string or holds no "(...;" section.
    '''
    if not isinstance(x, str):
        return ''
    # Raw string: "\(" and "\;" are not valid escapes in a normal string literal
    match = re.search(r"\((.*?)\;", x)
    if match is None:
        return ''
    return 'Away' if match.group(1).strip() == 'A' else 'Home'
    
def get_match_date(x):
    '''
    Get date of match played

    x is the full match text, e.g. "Game: Man City (A; Nov. 28)...".
    Returns the text between the first ";" and the next ")" on the same
    line, stripped (here "Nov. 28"), or '' when x is not a string or holds
    no such section.
    '''
    if not isinstance(x, str):
        return ''
    # Raw string: "\;" and "\)" are not valid escapes in a normal string literal
    match = re.search(r"\;(.*?)\)", x)
    if match is None:
        return ''
    return match.group(1).strip()

def get_all_incidents(x):
    '''
    Split a match's full text into its individual incident descriptions.

    Everything before the first "\\nIncident: " marker (the "Game: ..."
    header) is discarded; each remaining piece is one incident.
    '''
    _header, *incidents = x.split('\nIncident: ')
    return incidents

def get_decision(x):
    '''
    Return the verdict at the end of an incident description.

    The verdict is whatever follows the last "-" in the text (the whole
    text when there is no "-"), title-cased and stripped of whitespace.
    '''
    _before, _dash, verdict = x.rpartition('-')
    return verdict.title().strip()

# Adapted from: https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
def explode(df, lst_cols, fill_value='', preserve_index=False):
    '''
    Create new row for each individual incident of a match

    Every column named in `lst_cols` holds a list per row. Each row is
    repeated once per list element and the list columns are flattened, so
    each output row carries a single element.

    df             -- source DataFrame
    lst_cols       -- name, or list-like of names, of the list-valued columns;
                      row counts are taken from the first one
    fill_value     -- value written into the list columns for rows whose list
                      is empty (those rows are kept, not dropped)
    preserve_index -- keep the original (repeated) index labels when True,
                      otherwise renumber the result from 0

    Note: at least one row must hold a non-empty list, because
    np.concatenate raises ValueError when given nothing to join.
    '''
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF: repeat the scalar columns, flatten the list columns
    res = (pd.DataFrame({
                col: np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col: np.concatenate(df.loc[lens > 0, col].values)
                            for col in lst_cols}))

    if (lens == 0).any():
        # at least one list in cells is empty: add those rows back and fill
        # their list columns with `fill_value`.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))

    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

def get_incident_minute(x):
    '''
    Get match minute of incident

    Takes the text after the last "," and before the following "-"
    (e.g. " 58th minute ") and returns only its digits as a string
    ("58"). Returns '' when that fragment holds no digits. Every non-digit
    is removed, so a fragment such as "45+2nd minute" yields "452".
    '''
    minute = x.split(',')[-1].split('-')[0].strip()
    # Raw string: "\D" is not a valid escape in a normal string literal
    minute_num = re.sub(r'\D', '', minute)
    return minute_num

In [10]:
# Direct web driver to 2020-2021 page
driver.get(VAR_PAGE_2020_2021)

# Team headings: keep the full heading text ("<team name> <net score>") and
# the team name alone.
# find_element(s)(By.XPATH, ...) replaces the find_element(s)_by_xpath
# helpers, which were removed in Selenium 4.3.
team_list_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/h2")
team_title_list = []
team_list = []

for team in team_list_elems:
    heading = team.text  # read the heading text once
    team_title_list.append(heading)
    team_list.append(heading.rsplit(' ', 1)[0])

incident_list = []
inc_list = team_title_list + ['Game', 'Incident']  # keep elements mentioning any of these
exc_list = ['Overturns:']                          # ...unless they are a stats summary

parent = driver.find_element(By.XPATH, "//div[@class='article-body']")
for child in parent.find_elements(By.XPATH, './child::*'):
    # Fetch the text once per element instead of once per keyword test
    child_text = child.text
    if any(keyword in child_text for keyword in inc_list) and not any(keyword in child_text for keyword in exc_list):
        incident_list.append(child_text)

team_name = ''
incident_list_organized_2021 = []

# Walk the collected texts in page order: a text starting with a team name
# switches the current team, a text starting with "Game" is recorded against it.
# The first collected text is skipped (presumably a non-team element ahead of
# the first heading -- verify against the page).
for line in incident_list[1:]:
    for name in team_list:
        if line.startswith(name):
            team_name = name
    if line.startswith('Game'):
        incident_list_organized_2021.append([team_name, line])

incident_list_organized_2021

[['Burnley',
  'Game: Man City (A; Nov. 28)\nIncident: Bailey Peacock-Farrell own goal disallowed for offside in the build-up against Gabriel Jesus, 77th minute - FOR'],
 ['Burnley',
  'Game: Arsenal (A; Dec. 13)\nIncident: Granit Xhaka sent off for violent conduct on Ashley Westwood, 58th minute - FOR'],
 ['Burnley',
  'Game: Man United (H; Jan. 12)\nIncident: Man United free-kick and Robbie Brady red-card review cancelled for foul in the build-up by Luke Shaw, who was booked - NEUTRAL'],
 ['Burnley',
  'Game: West Brom (H; Feb. 20)\nIncident: Semi Ajayi sent off for handball and denying an obvious goal-scoring opportunity, 28th minute - FOR'],
 ['Burnley',
  'Game: Arsenal (H; March 6)\nIncident: Penalty and red card for handball against Erik Pieters cancelled, 85th minute - FOR'],
 ['Burnley',
  'Game: Southampton (A; April 4)\nIncident: Penalty awarded (scored by Chris Wood) for foul on Erik Pieters by Kyle Walker-Peters, 8th minute - FOR'],
 ['Burnley',
  'Game: Wolves (A; April 2

In [11]:
# Number of [team, match text] records collected for 2020/2021
len(incident_list_organized_2021)

199

In [12]:
# One row per [team, match text] record, tagged with its season
incident_df_2021 = (
    pd.DataFrame.from_records(incident_list_organized_2021,
                              columns=['team', 'match_info_full'])
      .assign(year='2020/2021')
)
incident_df_2021

Unnamed: 0,team,match_info_full,year
0,Burnley,Game: Man City (A; Nov. 28)\nIncident: Bailey ...,2020/2021
1,Burnley,Game: Arsenal (A; Dec. 13)\nIncident: Granit X...,2020/2021
2,Burnley,Game: Man United (H; Jan. 12)\nIncident: Man U...,2020/2021
3,Burnley,Game: West Brom (H; Feb. 20)\nIncident: Semi A...,2020/2021
4,Burnley,Game: Arsenal (H; March 6)\nIncident: Penalty ...,2020/2021
5,Burnley,Game: Southampton (A; April 4)\nIncident: Pena...,2020/2021
6,Burnley,Game: Wolves (A; April 25)\nIncident: Matej Vy...,2020/2021
7,Everton,Game: West Brom (H; Sept. 19)\nIncident: Domin...,2020/2021
8,Everton,Game: Crystal Palace (A; Sept. 26)\nIncident: ...,2020/2021
9,Everton,Game: Liverpool (H; Oct. 17)\nIncident: Jordan...,2020/2021


___
### 2. Get incident data  

#### (ii) 2019/2020 Season

In [15]:
# Direct driver to 2019/2020 page
driver.get(VAR_PAGE_2019_2020)

# On this page the team headings sit under <video1> and <video1>/<video2>
# inside the article body; collect them from both places.
# find_element(s)(By.XPATH, ...) replaces the find_element(s)_by_xpath
# helpers, which were removed in Selenium 4.3.
team_list_elems = driver.find_elements(By.XPATH, "//div[@class='article-body']/video1/h2") + \
                    driver.find_elements(By.XPATH, "//div[@class='article-body']/video1/video2/h2")
team_title_list = []
team_list = []

for team in team_list_elems:
    heading = team.text  # read the heading text once
    team_title_list.append(heading)
    team_list.append(heading.rsplit(' ', 1)[0])

incident_list = []
inc_list = team_title_list + ['Game', 'Incident']  # keep elements mentioning any of these
exc_list = ['Overturns:']                          # ...unless they are a stats summary

parent = driver.find_element(By.XPATH, "//div[@class='article-body']/video1")

all_children_elements = parent.find_elements(By.XPATH, './child::*') + \
                        parent.find_elements(By.XPATH, './video2/child::*')

for child in all_children_elements:
    # Fetch the text once per element instead of once per keyword test
    child_text = child.text
    if any(keyword in child_text for keyword in inc_list) and not any(keyword in child_text for keyword in exc_list):
        incident_list.append(child_text)
team_name = ''
incident_list_organized_1920 = []

# Walk the collected texts in order: a text starting with a team name
# switches the current team, a text starting with "Game" is recorded against it.
# The first collected text is skipped (presumably a non-team element ahead of
# the first heading -- verify against the page).
for line in incident_list[1:]:
    for name in team_list:
        if line.startswith(name):
            team_name = name
    if line.startswith('Game'):
        incident_list_organized_1920.append([team_name, line])

In [16]:
# One row per [team, match text] record, tagged with its season
incident_df_1920 = (
    pd.DataFrame.from_records(incident_list_organized_1920,
                              columns=['team', 'match_info_full'])
      .assign(year='2019/2020')
)
incident_df_1920

Unnamed: 0,team,match_info_full,year
0,Brighton & Hove Albion,Game: West Ham (H; Aug. 17)\nIncident: Brighto...,2019/2020
1,Brighton & Hove Albion,Game: Aston Villa (A; Oct. 19)\nIncident: Cono...,2019/2020
2,Brighton & Hove Albion,Game: Everton (H; Oct. 26)\nIncident: Penalty ...,2019/2020
3,Brighton & Hove Albion,Game: Leicester (H; Nov. 23)\nIncident: Jamie ...,2019/2020
4,Brighton & Hove Albion,Game: Arsenal (A; Dec. 5)\nIncident: David Lui...,2019/2020
5,Brighton & Hove Albion,Game: Sheffield United (H; Dec. 21)\nIncident:...,2019/2020
6,Brighton & Hove Albion,Game: Tottenham (A; Dec. 26)\nIncident: Harry ...,2019/2020
7,Brighton & Hove Albion,Game: Bournemouth (H; Dec. 28)\nIncident: Dan ...,2019/2020
8,Brighton & Hove Albion,Game: Everton (A; Jan. 11)\nIncident: Goal for...,2019/2020
9,Brighton & Hove Albion,Game: West Ham (A; Feb. 1)\nIncident: Goal for...,2019/2020


___
### 2. Get incident data

#### (iii) Combined (2019/2020 and 2020/2021 Seasons)

In [17]:
# Stack both seasons (2019/2020 first). pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0. ignore_index=True gives every match a
# unique label, so the sort_index() inside explode() keeps the seasons in
# sequence instead of interleaving rows that share a label.
incidents_df = pd.concat([incident_df_1920, incident_df_2021], ignore_index=True)

# Pull the match header fields out of the raw text
incidents_df['opposition'] = incidents_df['match_info_full'].apply(get_opposition_team)
incidents_df['home_or_away'] = incidents_df['match_info_full'].apply(get_home_or_away)
incidents_df['date'] = incidents_df['match_info_full'].apply(get_match_date)

# One list of incidents per match, then one row per incident
incidents_df['incident'] = incidents_df['match_info_full'].apply(get_all_incidents)
incidents_df = explode(incidents_df, ['incident'], fill_value='', preserve_index=True)
incidents_df.drop(columns=['match_info_full'], inplace=True)

# Verdict taken from the end of each incident description
incidents_df['team_decision'] = incidents_df['incident'].apply(get_decision)
incidents_df = incidents_df[['incident', 'team_decision', 'team', 'opposition', 'home_or_away', 'date', 'year']]
incidents_df = incidents_df.reset_index(drop=True)
len(incidents_df)

459

In [18]:
# Write the combined incidents to a CSV stamped with today's date (YYYYMMDD)
export_date = f'{dt.today():%Y%m%d}'
incidents_csv_path = f'./data/EPL_VAR_Incidents_Raw_{export_date}.csv'
incidents_df.to_csv(incidents_csv_path, index=False)

___
### 3. Get team VAR decisions

#### (i) 2020/2021 Season

In [19]:
# Direct web driver to 2020-2021 page
driver.get(VAR_PAGE_2020_2021)

# Side panels of the article; find_elements(By.XPATH, ...) replaces
# find_elements_by_xpath, which was removed in Selenium 4.3.
decision_elems = driver.find_elements(By.XPATH, "//aside[@class='inline editorial float-r']")

# Keep the text of the "decisions for" and "decisions against" panels.
# If a panel is not found its variable is left unassigned.
for elem in decision_elems:
    panel_text = elem.text  # read the panel text once
    panel_text_lower = panel_text.lower()
    if 'decisions for' in panel_text_lower:
        decisions_for_text_2021 = panel_text
    elif 'decisions against' in panel_text_lower:
        decisions_against_text_2021 = panel_text

In [20]:
# Break each panel into lines and drop the first two lines, leaving the
# "<team> <count>" entries
decisions_for_2021_list, decisions_against_2021_list = (
    panel_text.split('\n')[2:]
    for panel_text in (decisions_for_text_2021, decisions_against_text_2021)
)
decisions_for_2021_list

['Brighton & Hove Albion 9',
 'Fulham 9',
 'Aston Villa 7',
 'Chelsea 7',
 'Leicester City 7',
 'Liverpool 7',
 'Manchester United 7',
 'Sheffield United 7',
 'Southampton 7',
 'Tottenham Hotspur 7',
 'Crystal Palace 6',
 'Everton 6',
 'Burnley 5',
 'Leeds 5',
 'Manchester City 5',
 'West Ham 5',
 'Newcastle 4',
 'Wolves 4',
 'Arsenal 3',
 'West Brom 3']

In [21]:
year = '2020/2021'

# Each entry reads "<team> <count>". Collect all rows first and build the
# frame once: DataFrame.append was removed in pandas 2.0, and appending row
# by row copied the whole frame on every iteration.
decision_rows = []
for entry in decisions_for_2021_list:
    entry_parts = entry.rsplit(' ', 1)
    decision_rows.append({'team': entry_parts[0],
                          'year': year,
                          'decisions_for': entry_parts[-1]})

decisions_df_2021 = pd.DataFrame(decision_rows, columns=['team', 'year', 'decisions_for'])

# Look up each team's "against" count by name; teams missing from the
# "against" list get NaN, and a team listed twice keeps its last count.
against_by_team = {}
for entry in decisions_against_2021_list:
    entry_parts = entry.rsplit(' ', 1)
    against_by_team[entry_parts[0]] = entry_parts[-1]

decisions_df_2021['decisions_against'] = decisions_df_2021['team'].map(against_by_team)

decisions_df_2021

Unnamed: 0,team,year,decisions_for,decisions_against
0,Brighton & Hove Albion,2020/2021,9,8
1,Fulham,2020/2021,9,6
2,Aston Villa,2020/2021,7,5
3,Chelsea,2020/2021,7,4
4,Leicester City,2020/2021,7,6
5,Liverpool,2020/2021,7,12
6,Manchester United,2020/2021,7,9
7,Sheffield United,2020/2021,7,4
8,Southampton,2020/2021,7,7
9,Tottenham Hotspur,2020/2021,7,9


___
### 3. Get team VAR decisions

#### (ii) 2019/2020 Season

In [22]:
# Direct web driver to 2019-2020 page
driver.get(VAR_PAGE_2019_2020)

# Side panels of the article; find_elements(By.XPATH, ...) replaces
# find_elements_by_xpath, which was removed in Selenium 4.3.
decision_elems = driver.find_elements(By.XPATH, "//aside[@class='inline editorial float-r']")

# Keep the text of the "decisions for" and "decisions against" panels.
# If a panel is not found its variable is left unassigned.
for elem in decision_elems:
    panel_text = elem.text  # read the panel text once
    panel_text_lower = panel_text.lower()
    if 'decisions for' in panel_text_lower:
        decisions_for_text_1920 = panel_text
    elif 'decisions against' in panel_text_lower:
        decisions_against_text_1920 = panel_text

In [23]:
# Break each panel into lines and drop the first two lines, leaving the
# "<team> <count>" entries
decisions_for_1920_list, decisions_against_1920_list = (
    panel_text.split('\n')[2:]
    for panel_text in (decisions_for_text_1920, decisions_against_text_1920)
)

In [24]:
year = '2019/2020'

# Each entry reads "<team> <count>". Collect all rows first and build the
# frame once: DataFrame.append was removed in pandas 2.0, and appending row
# by row copied the whole frame on every iteration.
decision_rows = []
for entry in decisions_for_1920_list:
    entry_parts = entry.rsplit(' ', 1)
    decision_rows.append({'team': entry_parts[0],
                          'year': year,
                          'decisions_for': entry_parts[-1]})

decisions_df_1920 = pd.DataFrame(decision_rows, columns=['team', 'year', 'decisions_for'])

# Look up each team's "against" count by name; teams missing from the
# "against" list get NaN, and a team listed twice keeps its last count.
against_by_team = {}
for entry in decisions_against_1920_list:
    entry_parts = entry.rsplit(' ', 1)
    against_by_team[entry_parts[0]] = entry_parts[-1]

decisions_df_1920['decisions_against'] = decisions_df_1920['team'].map(against_by_team)

decisions_df_1920

Unnamed: 0,team,year,decisions_for,decisions_against
0,Brighton & Hove Albion,2019/2020,10,2
1,Manchester United,2019/2020,10,3
2,Crystal Palace,2019/2020,8,4
3,Leicester City,2019/2020,8,7
4,Manchester City,2019/2020,8,8
5,Southampton,2019/2020,8,5
6,Tottenham Hotspur,2019/2020,8,7
7,Burnley,2019/2020,7,4
8,West Ham,2019/2020,6,10
9,AFC Bournemouth,2019/2020,5,7


___
### 3. Get team VAR decisions

#### (iii) Combined (2019/2020 and 2020/2021 Seasons)

In [25]:
# Stack both seasons (2020/2021 first) and renumber the rows from 0
season_frames = [decisions_df_2021, decisions_df_1920]
decisions_df = pd.concat(season_frames, ignore_index=True)
decisions_df

Unnamed: 0,team,year,decisions_for,decisions_against
0,Brighton & Hove Albion,2020/2021,9,8
1,Fulham,2020/2021,9,6
2,Aston Villa,2020/2021,7,5
3,Chelsea,2020/2021,7,4
4,Leicester City,2020/2021,7,6
5,Liverpool,2020/2021,7,12
6,Manchester United,2020/2021,7,9
7,Sheffield United,2020/2021,7,4
8,Southampton,2020/2021,7,7
9,Tottenham Hotspur,2020/2021,7,9


In [26]:
# Write the combined decision counts to a CSV stamped with today's date (YYYYMMDD)
export_date = f'{dt.today():%Y%m%d}'
decisions_csv_path = f'./data/EPL_VAR_Decisions_{export_date}.csv'
decisions_df.to_csv(decisions_csv_path, index=False)