In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pprint
import json
import re
import selenium
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scraping

In [None]:
class webScraper:
    """
    Scraping toolkit for Understat fixtures

    Wraps a Selenium-driven Chrome session. Typical use: create the scraper
    with a league URL, call ``start()``, collect fixture IDs with
    ``get_all_match_numbers()``, download each fixture with
    ``get_all_fixture_data()`` and finally call ``end()``.
    """

    # Title text layout once markup is stripped:
    # "<home> <goals> - <goals> <away> (<date>) ...".
    _TITLE_PATTERN = re.compile(
        r"(?P<home>.+?)\s+\d+\s*-\s*\d+\s+(?P<away>.+?)\s*\((?P<date>[^)]*)\)"
    )

    def __init__(self, initial_url):
        """Remember the start URL and open a Chrome WebDriver session."""
        self.url = initial_url
        self.driver = webdriver.Chrome()

    def start(self):
        """Navigate the browser to the URL given at construction time."""
        self.driver.get(self.url)

    def end(self):
        """Shut the browser down.

        ``quit()`` ends the whole WebDriver session (all windows and the
        driver process); ``close()`` would only close the current window.
        """
        self.driver.quit()

    @staticmethod
    def _fixture_id_from_href(href):
        """Return the text following ``match/`` in ``href``, or None if absent."""
        marker = "match/"
        if not href or marker not in href:
            return None
        return href[href.find(marker) + len(marker):]

    @classmethod
    def _parse_title(cls, title):
        """Split a page title into date, home team and away team.

        ``title`` may still carry ``<title>`` markup and the list brackets
        produced by ``str(soup.find_all("title"))``. Values are None when the
        title does not follow the expected layout.
        """
        text = re.sub(r"<[^>]*>", "", title).strip("[] \t\r\n")
        found = cls._TITLE_PATTERN.match(text)
        if found is None:
            return {"date": None, "home_team": None, "away_team": None}
        return {
            "date": found.group("date"),
            "home_team": found.group("home"),
            "away_team": found.group("away"),
        }

    def get_match_numbers(self):
        """
        Looks for any fixture IDs on any pages on Understat

        Returns the IDs (as strings) taken from the ``href`` of every element
        with class ``match-info`` on the page currently loaded in the browser.
        Elements without a ``match/`` link are skipped.
        """
        links = self.driver.find_elements(By.CLASS_NAME, "match-info")
        fixture_nums = []
        for link in links:
            fixture_id = self._fixture_id_from_href(link.get_attribute("href"))
            if fixture_id is not None:
                fixture_nums.append(fixture_id)
        return fixture_nums

    def get_all_match_numbers(self, min_matches=400, max_pages=10000, delay=6,
                              max_stale_pages=5):
        """
        Scrape all fixtures from season in question

        Repeatedly reads the fixture IDs on the current page and clicks the
        ``calendar-prev`` button to move to the previous page.

        Parameters
        ----------
        min_matches : int
            Stop once more than this many IDs (duplicates included) are held.
        max_pages : int
            Hard upper bound on the number of pages visited.
        delay : float
            Seconds to wait before and after each click.
        max_stale_pages : int
            Stop after this many consecutive pages that add no unseen ID, so
            the loop ends when the calendar cannot go back any further.

        Returns
        -------
        list of str
            Fixture IDs in the order found; may contain duplicates.
        """
        all_match_ids = []
        seen = set()
        stale_pages = 0
        for _ in range(max_pages):
            try:
                page_ids = self.get_match_numbers()
            except Exception as exc:
                # Best effort: a page that fails to parse is skipped, but the
                # reason is reported instead of being silently discarded.
                print("Could not read fixtures on this page:", repr(exc))
                page_ids = []
            all_match_ids.extend(page_ids)

            fresh = set(page_ids) - seen
            seen.update(fresh)
            stale_pages = 0 if fresh else stale_pages + 1

            print(len(all_match_ids))
            if len(all_match_ids) > min_matches:
                break
            if stale_pages >= max_stale_pages:
                print("No new fixtures found on recent pages; stopping.")
                break

            time.sleep(delay)
            try:
                self.driver.find_element(By.CLASS_NAME, "calendar-prev").click()
            except Exception as exc:
                print("Could not move to the previous page:", repr(exc))
                break
            # Give the page time to refresh before it is read again.
            time.sleep(delay)
        return all_match_ids

    def get_fixture_data(self, fix_id, delay=5):
        """
        Get all data for a particular fixture ID

        Loads ``https://understat.com/match/<fix_id>``, captures every
        ``<table>`` on the page, clicks the second label of the team selector
        and captures the tables again.

        Returns a dict with keys ``title_details``, ``away_team_stats``,
        ``home_team_stats`` (all HTML strings) plus ``date``, ``home_team``
        and ``away_team`` parsed from the page title (None when the title
        cannot be parsed).
        """
        url = "https://understat.com/match/" + str(fix_id)
        self.driver.get(url)
        time.sleep(delay)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")

        match_info = dict()
        match_info["title_details"] = str(soup.find_all("title"))

        # Tables as rendered on first load are stored as the home stats.
        home_tables = soup.find_all("table")

        # Tables rendered after clicking the second label are stored as the
        # away stats.
        second_label = self.driver.find_element(
            By.XPATH, "/html/body/div[1]/div[3]/div[4]/div/div[1]/div/label[2]")
        second_label.click()
        time.sleep(delay)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        away_tables = soup.find_all("table")

        match_info["away_team_stats"] = str(away_tables)
        match_info["home_team_stats"] = str(home_tables)
        match_info.update(self._parse_title(match_info["title_details"]))

        return match_info

    def get_all_fixture_data(self, list_of_ids):
        """Return ``get_fixture_data`` results for each ID, in input order."""
        return [self.get_fixture_data(x) for x in list_of_ids]

In [None]:
# Scrape the fixtures reachable from the Understat EPL/2020 league page.
webs = webScraper("https://understat.com/league/EPL/2020")
try:
    webs.start()
    all_match_ids = webs.get_all_match_numbers()
    # De-dupe while keeping first-seen order, so the ID list is deterministic
    # and stays aligned position-by-position with fix_info below.
    all_match_ids = list(dict.fromkeys(all_match_ids))
    fix_info = webs.get_all_fixture_data(all_match_ids)
finally:
    # Always release the browser, even when scraping fails part-way through.
    webs.end()

In [None]:
# Key every scraped record by its fixture ID: fix_info is a list built in the
# same order as all_match_ids, and the parsing section looks fixtures up by ID.
fixture_info = dict(zip(all_match_ids, fix_info))

# NOTE: the payload is deliberately encoded twice (a JSON string holding JSON),
# because the loading cell calls json.load followed by json.loads.
data = json.dumps(fixture_info)
with open('EPL_20_21a.json', 'w') as outfile:
    json.dump(data, outfile)

# Parsing and EDA

In [None]:
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('EPL_20_21___old.json', 'r') as json_file:
    json_data = json.load(json_file)

# The scraping section writes a JSON string that itself contains JSON; decode
# the inner layer when present, and accept an already-decoded mapping as well.
df = json.loads(json_data) if isinstance(json_data, str) else json_data

# df maps fixture ID -> scraped record (a plain dict, despite the name).
fixture_ids = list(df.keys())

# Columns whose value in the last table row is copied onto every player row.
_TEAM_TOTAL_COLUMNS = ("Sh", "G", "KP", "A", "xG", "xA")


def _leading_float(value):
    """Convert ``value`` to float, keeping only the number it starts with.

    Plain numbers and clean numeric strings are converted directly. Strings
    carrying extra text after the number fall back to the leading numeric
    part; anything without one becomes NaN.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        found = re.match(r"\s*[-+]?(?:\d+\.?\d*|\.\d+)", str(value))
        return float(found.group()) if found else float("nan")


def _team_frame(stats_html, team, opposition, side):
    """Build the player rows for one side plus that side's totals row.

    Returns ``(players, totals)``: ``players`` is the parsed table without its
    last row and first column, extended with team/opposition/side labels and
    ``team_*`` columns; ``totals`` is the last row of the parsed table.
    """
    from io import StringIO  # local import: wraps the HTML for read_html

    # Recent pandas deprecates passing literal HTML text to read_html.
    table = pd.read_html(StringIO(stats_html))[0]

    for metric in ("xG", "xA"):
        table[metric] = table[metric].apply(_leading_float)

    table["team"] = team
    table["opposition"] = opposition
    table["home_or_away"] = side

    # The last row is used as the team totals.
    totals = table.iloc[-1]
    for col in _TEAM_TOTAL_COLUMNS:
        table["team_" + col] = totals[col]

    # Drop the totals row and the first column (presumably a row counter --
    # TODO confirm); copy so later column assignments act on a real frame.
    players = table.iloc[:-1, 1:].copy()
    return players, totals


def fixture_data_ETL(fix_id):
    """Turn one scraped fixture into a tidy per-player DataFrame.

    Reads the record ``df[fix_id]`` from the module-level ``df`` mapping and
    uses its ``date``, ``home_team``, ``away_team``, ``home_team_stats``,
    ``away_team_stats`` and ``title_details`` entries.

    Returns a DataFrame with the home player rows followed by the away player
    rows (fresh 0..n-1 index). Each row carries its own team's totals
    (``team_*``), the other side's totals (``opponent_*``), the match date and
    the raw title.
    """
    record = df[fix_id]
    date = record["date"]

    home, home_totals = _team_frame(
        record["home_team_stats"], record["home_team"], record["away_team"], "home")
    away, away_totals = _team_frame(
        record["away_team_stats"], record["away_team"], record["home_team"], "away")

    home["date"] = date
    away["date"] = date

    # Assign the opposing totals as constants. This avoids forward-filling the
    # whole frame, which could also overwrite genuine gaps in player columns.
    for col in _TEAM_TOTAL_COLUMNS:
        home["opponent_" + col] = away_totals[col]
        away["opponent_" + col] = home_totals[col]

    fixture_df = pd.concat([home, away], ignore_index=True)

    fixture_df["title_details"] = record["title_details"]

    return fixture_df


# Build every fixture frame first and concatenate once; growing a DataFrame
# with concat inside the loop re-copies all earlier rows on every iteration.
fixture_frames = []
for i, fix_id in enumerate(fixture_ids):
    if i % 25 == 0:
        print(i)  # coarse progress indicator
    fixture_frames.append(fixture_data_ETL(fix_id))

big_df = pd.concat(fixture_frames, ignore_index=True)

# Split "<Month> <day> <year>" by pattern rather than fixed character offsets,
# so single-digit days without zero padding are handled too.
date_parts = big_df["date"].str.extract(
    r"^\s*(?P<month>[A-Za-z]+)\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})")

month_mapper = {"January":1,"February":2,"March":3,"April":4,"May":5,"June":6,"July":7,"August":8,
               "September":9,"October":10,"November":11,"December":12}

big_df["month"] = date_parts["month"].map(month_mapper)
big_df["day"] = date_parts["day"]
big_df["year"] = date_parts["year"]

big_df["datetime"] = pd.to_datetime(big_df[["year", "month","day"]])


In [None]:
def team_name_generator(title_instance, homeaway_instance):
    """Extract (team, opponent) from a fixture page title.

    Parameters
    ----------
    title_instance : str
        Title text laid out as "<home> <goals> - <goals> <away> (<date>) ...".
        It may still be wrapped in ``<title>`` markup and list brackets.
    homeaway_instance : str
        "home" returns (home, away); any other value returns (away, home).

    Returns
    -------
    tuple of (str, str)
        The row's own team followed by its opponent.

    Raises
    ------
    ValueError
        If the title does not follow the expected layout.
    """
    # Strip markup/brackets instead of assuming a fixed 8-character prefix.
    text = re.sub(r"<[^>]*>", "", title_instance).strip("[] \t\r\n")

    # Anchor on the "<goals> - <goals>" score so that digits or hyphens inside
    # team names, and multi-digit scores, do not shift the name boundaries.
    parsed = re.match(r"(.+?)\s+\d+\s*-\s*\d+\s+(.+?)\s*\(", text)
    if parsed is None:
        raise ValueError("Unrecognised fixture title: {!r}".format(title_instance))
    home_name, away_name = parsed.groups()

    if homeaway_instance == "home":
        return (home_name, away_name)
    return (away_name, home_name)

In [None]:
# Iterate over rows, not over the DataFrame itself: iterating a DataFrame
# yields its column labels, which would give one entry per column.
teams = []
opps = []
for title, side in zip(big_df["title_details"], big_df["home_or_away"]):
    team, opponent = team_name_generator(title, side)
    teams.append(team)
    opps.append(opponent)

In [None]:
# Attach the re-derived names as two new columns (teams / opps hold one entry per row).
for column_name, column_values in (("team_fixed", teams), ("opposition_fixed", opps)):
    big_df[column_name] = column_values

In [None]:
# Write the assembled frame to disk with pandas' default CSV settings.
big_df.to_csv(path_or_buf="epl2021_new.csv")