In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from lxml import html
import time
import re
import requests
import os.path
import urllib.request
from urllib.error import HTTPError


# pandas options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
# from selenium.webdriver.common.keys import Keys

In [3]:
results_url = 'https://www.premierleague.com/results'
fixtures_url = 'https://www.premierleague.com/fixtures'
tables_url = 'https://www.premierleague.com/tables'
player_url = 'https://www.premierleague.com/players'

In [4]:
# define webdriver options
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_argument("--headless")
chrome_options.add_argument('window-size=1920x1080');
chrome_options.add_argument("--start-maximized");
chrome_options.add_experimental_option("prefs", prefs)

# Web Scraping for past 10 season + Current Season

If we want to repeat the scraping for the past 10 season, we can just delete the two files past_results.csv and curr_results.csv

In [5]:
# Global dataframe to store result information
df = pd.DataFrame()
currdf = pd.DataFrame()

In [6]:
# check if file exist (already scraped, open the file)
if os.path.isfile('past_results.csv'):
    df = pd.read_csv('past_results.csv')
else:
    print ("past_result file not exist")

In [7]:
def scrape_last_ten_years_results(seasonnum):    
    ####### Restart URL
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(results_url)
    # scroll down to get all matches
    SCROLL_PAUSE_TIME = 1

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    # click accept cookies button
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    nextx = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '/html/body/section/div/div')))
    ActionChains(driver).click(nextx).perform()

    # pause to get caught up
    time.sleep(3)
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    
    ######## click season button
    seasonbut = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div[2]/div[1]/section/div[3]/div[2]')))
    ActionChains(driver).click(seasonbut).perform()
    
    time.sleep(1)
    
    ######## click relevant season
    seasonclick = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div[2]/div[1]/section/div[3]/ul/li[' + str(seasonnum) + ']')))
    ActionChains(driver).click(seasonclick).perform()
    
    time.sleep(1)
    
    ####### Get season year
    curr_year = driver.find_element_by_xpath('//*[@id="mainContent"]/div[2]/div[1]/section/div[3]/div[2]').text
        
    ######## Scroll down infinetely section to get all matches
    SCROLL_PAUSE_TIME = 2

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    time.sleep(1)

    ####### Convert to beautifulsoup and get result information
    # initialize beautifulsoup and get all matches
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    curr_matches = bs.find_all("div", class_="fixture postMatch")
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    
    # close the unneeded driver
    driver.quit()
    
    # sleep to catch up with time
    time.sleep(1)

    # scrape the results page
    all_team1 = []
    all_team2 = []
    all_score = []
    all_links = []
    all_stadium = []

    for idx, elem in enumerate(curr_matches):
        teams = re.findall(r'<span class="shortname">(.+?)<',str(curr_matches[idx]))
        all_team1.append(teams[0]) # append team 1 (HOME)
        all_team2.append(teams[1]) # append team 2 (AWAY)

        scoretxt = curr_matches[idx].find_all("span", class_="score")
        scoretxt = re.findall(r'\d', str(scoretxt[0]))
        all_score.append(str(scoretxt[0]) + "-" + str(scoretxt[1])) # append score

        stadiumtxt = curr_matches[idx].find_all("span", class_="stadiumName")
        stadiumtxt = re.findall(r'<\/span>(.+?),', str(stadiumtxt[0]))[0]
        all_stadium.append(stadiumtxt) # append stadium name

        linktxt = re.search(r'data-href=\"(.+?)\"',str(curr_matches[idx])).group(1)
        linktxt = 'http:' + linktxt
        all_links.append(linktxt) # append match link

    df = pd.DataFrame()
    df['Home'] = all_team1
    df['Away'] = all_team2
    df['score'] = all_score
    df['stadium'] = all_stadium
    df['matchlink'] = all_links
    df['Season'] = curr_year
    df.drop_duplicates(inplace = True)
    df.reset_index(inplace = True, drop = True)
    
    if df.shape[0] != 380:
        print('rescrape')
        print(df.shape)
        df = scrape_last_ten_years_results(seasonnum)
        return df
    
    print(curr_year)
    print(df.shape)
    return df

In [8]:
# read in all fixtures for past 10 years
if df.shape[0] == 0:
    for i in range(2,11):
        rdf = scrape_last_ten_years_results(i)
        df = pd.concat([df,rdf],ignore_index=True)

In [9]:
df.head()

Unnamed: 0,Home,Away,score,stadium,matchlink,Season,Home_goal,Home_goal_times,Away_goal,Away_goal_times,Home_assist,Home_assist_times,Away_assist,Away_assist_times,Home_Clearances,Away_Clearances,Home_Corners,Away_Corners,Home_Fouls conceded,Away_Fouls conceded,Home_Offsides,Away_Offsides,Home_Passes,Away_Passes,Home_Possession %,Away_Possession %,Home_Shots,Away_Shots,Home_Shots on target,Away_Shots on target,Home_Tackles,Away_Tackles,Home_Touches,Away_Touches,Home_Yellow cards,Away_Yellow cards,Home_Red cards,Away_Red cards,Referee,Match_Date,Attendance,Home_formation,Away_formation,Home_captain,Away_captain,Home_starting_lineup,Home_subs_lineup,Home_subout,Home_subout_time,Home_subin,Home_subin_time,Home_yellow,Home_red,Away_starting_lineup,Away_subs_lineup,Away_subout,Away_subout_time,Away_subin,Away_subin_time,Away_yellow,Away_red
0,Brighton,Man City,1-4,Amex Stadium,http://www.premierleague.com/match/38678,2018/19,Glenn Murray,27.0,"Sergio Agüero,Aymeric Laporte,Riyad Mahrez,Ilk...",28386372.0,Pascal Groß,27.0,"David Silva,Riyad Mahrez,David Silva",283863.0,41,5,2,6,12,8,1,1,245,796,23.7,76.3,6,20,2,9,16,15,401,955,0,0,0,0,Michael Oliver,1557669600000,30662,4-5-1,4-4-1-1,Bruno,Vincent Kompany,"Mat Ryan,Lewis Dunk,Bruno,Bernardo,Shane Duffy...","David Button,Dan Burn,Gaëtan Bong,Martín Monto...","Alireza Jahanbakhsh,Glenn Murray,Bruno",676784,"Florin Andone,Jürgen Locadia,Martín Montoya",676784,,,"Ederson,Aymeric Laporte,Kyle Walker,Oleksandr ...","Arijanet Muric,John Stones,Nicolás Otamendi,Da...","David Silva,Vincent Kompany,Kyle Walker",788688,"Kevin De Bruyne,Nicolás Otamendi,Danilo",788688,,
1,Burnley,Arsenal,1-3,Turf Moor,http://www.premierleague.com/match/38679,2018/19,Ashley Barnes,65.0,"Pierre-Emerick Aubameyang,Pierre-Emerick Aubam...",526390.0,Johann Gudmundsson,65.0,"Alex Iwobi,Alex Iwobi",6390.0,21,27,4,5,11,3,4,0,341,533,39.7,60.3,14,17,5,6,15,11,527,714,5,1,0,0,Mike Dean,1557669600000,21461,4-4-2,4-2-3-1,Tom Heaton,Nacho Monreal,"Tom Heaton,Charlie Taylor,Ben Mee,Matthew Lowt...","Joe Hart,Stephen Ward,Kevin Long,Robbie Brady,...","Dwight McNeil,Chris Wood,Jeff Hendrick",647782,"Johann Gudmundsson,Peter Crouch,Robbie Brady",647782,"Tom Heaton,Matthew Lowton,James Tarkowski,Jack...",,"Bernd Leno,Konstantinos Mavropanos,Shkodran Mu...","Petr Cech,Laurent Koscielny,Sead Kolasinac,Gra...","Konstantinos Mavropanos,Joseph Willock",3462,"Laurent Koscielny,Eddie Nketiah",3462,Matteo Guendouzi,
2,Crystal Palace,Bournemouth,5-3,Selhurst Park,http://www.premierleague.com/match/38680,2018/19,"Michy Batshuayi,Michy Batshuayi,Jack Simpson (...",2432376580.0,"Jefferson Lerma,Jordon Ibe,Joshua King",455673.0,"Aaron Wan-Bissaka,Wilfried Zaha,Wilfried Zaha",326580.0,"Nathaniel Clyne,Adam Smith,Chris Mepham",455673.0,23,19,4,4,11,8,4,1,429,517,45.0,55.0,17,16,8,8,16,18,627,713,3,0,0,0,Roger East,1557669600000,25433,4-4-2,4-4-1-1,Luka Milivojevic,Steve Cook,"Vicente Guaita,Joel Ward,Martin Kelly,Aaron Wa...","Wayne Hennessey,Nikola Tavares,Luke Dreher,Bak...","Wilfried Zaha,Michy Batshuayi,Andros Townsend",878990,"Bakary Sako,Connor Wickham,Luke Dreher",878990,"Luka Milivojevic,James McArthur,Wilfried Zaha",,"Mark Travers,Nathaniel Clyne,Steve Cook,Jack S...","Artur Boruc,Chris Mepham,Matt Butcher,Emerson ...","Jack Simpson,Jordon Ibe,Nathaniel Clyne",707089,"Chris Mepham,Lys Mousset,Sam Surridge",707089,,
3,Fulham,Newcastle,0-4,Craven Cottage,http://www.premierleague.com/match/38681,2018/19,,,"Jonjo Shelvey,Ayoze Pérez,Fabian Schär,Salomón...",9116190.0,,,"Matt Ritchie,Matt Ritchie",961.0,7,38,5,5,6,8,1,1,765,358,68.1,31.9,16,13,2,6,12,15,940,532,1,0,0,0,Kevin Friend,1557669600000,24979,4-2-3-1,5-4-1,Tom Cairney,Jamaal Lascelles,"Sergio Rico,Alfie Mawson,Cyrus Christie,Maxime...","Fabri,Steven Sessegnon,Tim Ream,Ryan Babel,Nee...","Cyrus Christie,Floyd Ayité,Ryan Sessegnon",657481,"Ryan Babel,Harvey Elliott,Neeskens Kebano",657481,Joe Bryan,,"Martin Dubravka,Jamaal Lascelles,Matt Ritchie,...","Karl Darlow,Lewis Cass,Federico Fernández,Ciar...","Christian Atsu,Isaac Hayden,Paul Dummett",666981,"Yoshinori Muto,Federico Fernández,Kenedy",666981,,
4,Leicester,Chelsea,0-0,King Power Stadium,http://www.premierleague.com/match/38682,2018/19,,,,,,,,,17,16,4,5,9,8,1,2,470,533,46.9,53.1,9,14,3,4,16,12,644,711,0,1,0,0,Anthony Taylor,1557669600000,32140,4-1-4-1,4-3-3,Kasper Schmeichel,César Azpilicueta,"Kasper Schmeichel,Ben Chilwell,Ricardo Pereira...","Danny Ward,Wes Morgan,Danny Simpson,Harvey Bar...","Wilfred Ndidi,Marc Albrighton,James Maddison",667684,"Shinji Okazaki,Danny Simpson,Harvey Barnes",667684,,,"Willy Caballero,César Azpilicueta,Marcos Alons...","Kepa Arrizabalaga,Emerson,Marc Guehi,Andreas C...","Willian,Ross Barkley,Gonzalo Higuaín",697684,"Eden Hazard,Mateo Kovacic,Olivier Giroud",697684,Jorginho,


In [10]:
def scrape_current_season_results():    
    ####### Restart URL
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(results_url)
    # scroll down to get all matches
    SCROLL_PAUSE_TIME = 1

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    # click accept cookies button
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    nextx = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '/html/body/section/div/div')))
    ActionChains(driver).click(nextx).perform()

    # pause to get caught up
    time.sleep(3)
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    
    ####### Get season year
    curr_year = driver.find_element_by_xpath('//*[@id="mainContent"]/div[2]/div[1]/section/div[3]/div[2]').text
        
    ######## Scroll down infinetely section to get all matches
    SCROLL_PAUSE_TIME = 2

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    time.sleep(1)

    ####### Convert to beautifulsoup and get result information
    # initialize beautifulsoup and get all matches
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    curr_matches = bs.find_all("div", class_="fixture postMatch")
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    
    # close the unneeded driver
    driver.close()
    
    # sleep to catch up with time
    time.sleep(1)

    # scrape the results page
    all_team1 = []
    all_team2 = []
    all_score = []
    all_links = []
    all_stadium = []

    for idx, elem in enumerate(curr_matches):
        teams = re.findall(r'<span class="shortname">(.+?)<',str(curr_matches[idx]))
        all_team1.append(teams[0]) # append team 1 (HOME)
        all_team2.append(teams[1]) # append team 2 (AWAY)

        scoretxt = curr_matches[idx].find_all("span", class_="score")
        scoretxt = re.findall(r'\d', str(scoretxt[0]))
        all_score.append(str(scoretxt[0]) + "-" + str(scoretxt[1])) # append score

        stadiumtxt = curr_matches[idx].find_all("span", class_="stadiumName")
        stadiumtxt = re.findall(r'<\/span>(.+?),', str(stadiumtxt[0]))[0]
        all_stadium.append(stadiumtxt) # append stadium name

        linktxt = re.search(r'data-href=\"(.+?)\"',str(curr_matches[idx])).group(1)
        linktxt = 'http:' + linktxt
        all_links.append(linktxt) # append match link

    df = pd.DataFrame()
    df['Home'] = all_team1
    df['Away'] = all_team2
    df['score'] = all_score
    df['stadium'] = all_stadium
    df['matchlink'] = all_links
    df['Season'] = curr_year
    df.drop_duplicates(inplace = True)
    df.reset_index(inplace = True, drop = True)
    
    print(curr_year)
    print(df.shape)
    return df

In [11]:
rdf = scrape_current_season_results()
currdf = pd.concat([currdf,rdf],ignore_index=True)

2019/20
(288, 6)


In [12]:
currdf.head()

Unnamed: 0,Home,Away,score,stadium,matchlink,Season
0,Leicester,Aston Villa,4-0,King Power Stadium,http://www.premierleague.com/match/46889,2019/20
1,Chelsea,Everton,4-0,Stamford Bridge,http://www.premierleague.com/match/46887,2019/20
2,Man Utd,Man City,2-0,Old Trafford,http://www.premierleague.com/match/46891,2019/20
3,Liverpool,Bournemouth,2-1,Anfield,http://www.premierleague.com/match/46890,2019/20
4,Arsenal,West Ham,1-0,Emirates Stadium,http://www.premierleague.com/match/46885,2019/20


In [14]:
### TEMPORARY PLACEMENT -- DELETE LATER
# df.to_csv('df.csv', index=False)
# currdf.to_csv('currdf.csv', index=False)

In [15]:
# if only starting columns
if len(df.columns) == 6:
    for elem in ['Home_goal', 'Home_goal_times', 
                 'Away_goal', 'Away_goal_times', 'Home_assist', 'Home_assist_times', 'Away_assist', 'Away_assist_times', 
                 'Home_Clearances', 'Away_Clearances', 'Home_Corners', 'Away_Corners', 'Home_Fouls conceded', 
                 'Away_Fouls conceded', 'Home_Offsides', 'Away_Offsides', 'Home_Passes', 'Away_Passes', 
                 'Home_Possession %', 'Away_Possession %', 'Home_Shots', 'Away_Shots', 'Home_Shots on target', 
                 'Away_Shots on target', 'Home_Tackles', 'Away_Tackles', 'Home_Touches', 'Away_Touches', 'Home_Yellow cards', 
                 'Away_Yellow cards', 'Home_Red cards', 'Away_Red cards', 'Referee', 'Match_Date', 'Attendance', 'Home_formation', 
                 'Away_formation', 'Home_captain',
                 'Away_captain', 'Home_starting_lineup', 'Home_subs_lineup', 'Home_subout', 'Home_subout_time', 'Home_subin',
                 'Home_subin_time', 'Home_yellow', 'Home_red', 'Away_starting_lineup', 'Away_subs_lineup', 'Away_subout', 
                 'Away_subout_time', 'Away_subin', 'Away_subin_time', 'Away_yellow', 'Away_red']:
        df[elem] = 0

# if only starting columns
if len(currdf.columns) == 6:
     for elem in ['Home_goal', 'Home_goal_times', 
                 'Away_goal', 'Away_goal_times', 'Home_assist', 'Home_assist_times', 'Away_assist', 'Away_assist_times', 
                 'Home_Clearances', 'Away_Clearances', 'Home_Corners', 'Away_Corners', 'Home_Fouls conceded', 
                 'Away_Fouls conceded', 'Home_Offsides', 'Away_Offsides', 'Home_Passes', 'Away_Passes', 
                 'Home_Possession %', 'Away_Possession %', 'Home_Shots', 'Away_Shots', 'Home_Shots on target', 
                 'Away_Shots on target', 'Home_Tackles', 'Away_Tackles', 'Home_Touches', 'Away_Touches', 'Home_Yellow cards', 
                 'Away_Yellow cards', 'Home_Red cards', 'Away_Red cards', 'Referee', 'Match_Date', 'Attendance', 'Home_formation', 
                 'Away_formation', 'Home_captain',
                 'Away_captain', 'Home_starting_lineup', 'Home_subs_lineup', 'Home_subout', 'Home_subout_time', 'Home_subin',
                 'Home_subin_time', 'Home_yellow', 'Home_red', 'Away_starting_lineup', 'Away_subs_lineup', 'Away_subout', 
                 'Away_subout_time', 'Away_subin', 'Away_subin_time', 'Away_yellow', 'Away_red']:
        currdf[elem] = 0

In [16]:
print('DF Shape:', df.shape)
df.head()

DF Shape: (3420, 61)


Unnamed: 0,Home,Away,score,stadium,matchlink,Season,Home_goal,Home_goal_times,Away_goal,Away_goal_times,Home_assist,Home_assist_times,Away_assist,Away_assist_times,Home_Clearances,Away_Clearances,Home_Corners,Away_Corners,Home_Fouls conceded,Away_Fouls conceded,Home_Offsides,Away_Offsides,Home_Passes,Away_Passes,Home_Possession %,Away_Possession %,Home_Shots,Away_Shots,Home_Shots on target,Away_Shots on target,Home_Tackles,Away_Tackles,Home_Touches,Away_Touches,Home_Yellow cards,Away_Yellow cards,Home_Red cards,Away_Red cards,Referee,Match_Date,Attendance,Home_formation,Away_formation,Home_captain,Away_captain,Home_starting_lineup,Home_subs_lineup,Home_subout,Home_subout_time,Home_subin,Home_subin_time,Home_yellow,Home_red,Away_starting_lineup,Away_subs_lineup,Away_subout,Away_subout_time,Away_subin,Away_subin_time,Away_yellow,Away_red
0,Brighton,Man City,1-4,Amex Stadium,http://www.premierleague.com/match/38678,2018/19,Glenn Murray,27.0,"Sergio Agüero,Aymeric Laporte,Riyad Mahrez,Ilk...",28386372.0,Pascal Groß,27.0,"David Silva,Riyad Mahrez,David Silva",283863.0,41,5,2,6,12,8,1,1,245,796,23.7,76.3,6,20,2,9,16,15,401,955,0,0,0,0,Michael Oliver,1557669600000,30662,4-5-1,4-4-1-1,Bruno,Vincent Kompany,"Mat Ryan,Lewis Dunk,Bruno,Bernardo,Shane Duffy...","David Button,Dan Burn,Gaëtan Bong,Martín Monto...","Alireza Jahanbakhsh,Glenn Murray,Bruno",676784,"Florin Andone,Jürgen Locadia,Martín Montoya",676784,,,"Ederson,Aymeric Laporte,Kyle Walker,Oleksandr ...","Arijanet Muric,John Stones,Nicolás Otamendi,Da...","David Silva,Vincent Kompany,Kyle Walker",788688,"Kevin De Bruyne,Nicolás Otamendi,Danilo",788688,,
1,Burnley,Arsenal,1-3,Turf Moor,http://www.premierleague.com/match/38679,2018/19,Ashley Barnes,65.0,"Pierre-Emerick Aubameyang,Pierre-Emerick Aubam...",526390.0,Johann Gudmundsson,65.0,"Alex Iwobi,Alex Iwobi",6390.0,21,27,4,5,11,3,4,0,341,533,39.7,60.3,14,17,5,6,15,11,527,714,5,1,0,0,Mike Dean,1557669600000,21461,4-4-2,4-2-3-1,Tom Heaton,Nacho Monreal,"Tom Heaton,Charlie Taylor,Ben Mee,Matthew Lowt...","Joe Hart,Stephen Ward,Kevin Long,Robbie Brady,...","Dwight McNeil,Chris Wood,Jeff Hendrick",647782,"Johann Gudmundsson,Peter Crouch,Robbie Brady",647782,"Tom Heaton,Matthew Lowton,James Tarkowski,Jack...",,"Bernd Leno,Konstantinos Mavropanos,Shkodran Mu...","Petr Cech,Laurent Koscielny,Sead Kolasinac,Gra...","Konstantinos Mavropanos,Joseph Willock",3462,"Laurent Koscielny,Eddie Nketiah",3462,Matteo Guendouzi,
2,Crystal Palace,Bournemouth,5-3,Selhurst Park,http://www.premierleague.com/match/38680,2018/19,"Michy Batshuayi,Michy Batshuayi,Jack Simpson (...",2432376580.0,"Jefferson Lerma,Jordon Ibe,Joshua King",455673.0,"Aaron Wan-Bissaka,Wilfried Zaha,Wilfried Zaha",326580.0,"Nathaniel Clyne,Adam Smith,Chris Mepham",455673.0,23,19,4,4,11,8,4,1,429,517,45.0,55.0,17,16,8,8,16,18,627,713,3,0,0,0,Roger East,1557669600000,25433,4-4-2,4-4-1-1,Luka Milivojevic,Steve Cook,"Vicente Guaita,Joel Ward,Martin Kelly,Aaron Wa...","Wayne Hennessey,Nikola Tavares,Luke Dreher,Bak...","Wilfried Zaha,Michy Batshuayi,Andros Townsend",878990,"Bakary Sako,Connor Wickham,Luke Dreher",878990,"Luka Milivojevic,James McArthur,Wilfried Zaha",,"Mark Travers,Nathaniel Clyne,Steve Cook,Jack S...","Artur Boruc,Chris Mepham,Matt Butcher,Emerson ...","Jack Simpson,Jordon Ibe,Nathaniel Clyne",707089,"Chris Mepham,Lys Mousset,Sam Surridge",707089,,
3,Fulham,Newcastle,0-4,Craven Cottage,http://www.premierleague.com/match/38681,2018/19,,,"Jonjo Shelvey,Ayoze Pérez,Fabian Schär,Salomón...",9116190.0,,,"Matt Ritchie,Matt Ritchie",961.0,7,38,5,5,6,8,1,1,765,358,68.1,31.9,16,13,2,6,12,15,940,532,1,0,0,0,Kevin Friend,1557669600000,24979,4-2-3-1,5-4-1,Tom Cairney,Jamaal Lascelles,"Sergio Rico,Alfie Mawson,Cyrus Christie,Maxime...","Fabri,Steven Sessegnon,Tim Ream,Ryan Babel,Nee...","Cyrus Christie,Floyd Ayité,Ryan Sessegnon",657481,"Ryan Babel,Harvey Elliott,Neeskens Kebano",657481,Joe Bryan,,"Martin Dubravka,Jamaal Lascelles,Matt Ritchie,...","Karl Darlow,Lewis Cass,Federico Fernández,Ciar...","Christian Atsu,Isaac Hayden,Paul Dummett",666981,"Yoshinori Muto,Federico Fernández,Kenedy",666981,,
4,Leicester,Chelsea,0-0,King Power Stadium,http://www.premierleague.com/match/38682,2018/19,,,,,,,,,17,16,4,5,9,8,1,2,470,533,46.9,53.1,9,14,3,4,16,12,644,711,0,1,0,0,Anthony Taylor,1557669600000,32140,4-1-4-1,4-3-3,Kasper Schmeichel,César Azpilicueta,"Kasper Schmeichel,Ben Chilwell,Ricardo Pereira...","Danny Ward,Wes Morgan,Danny Simpson,Harvey Bar...","Wilfred Ndidi,Marc Albrighton,James Maddison",667684,"Shinji Okazaki,Danny Simpson,Harvey Barnes",667684,,,"Willy Caballero,César Azpilicueta,Marcos Alons...","Kepa Arrizabalaga,Emerson,Marc Guehi,Andreas C...","Willian,Ross Barkley,Gonzalo Higuaín",697684,"Eden Hazard,Mateo Kovacic,Olivier Giroud",697684,Jorginho,


In [17]:
if os.path.exists('curr_results.csv') == True:
    currstoreddf = pd.read_csv('curr_results.csv')
    # no need to rescrape old stuff
    currdf = currdf.append(currstoreddf,  ignore_index=True)
    currdf = currdf.drop_duplicates(subset=['Home', 'Away', 'score', 'stadium', 'matchlink'], keep="last")
    currdf = currdf.reset_index(drop = True)

In [18]:
print('currdf shape:', currdf.shape)
currdf.head()

currdf shape: (288, 61)


Unnamed: 0,Home,Away,score,stadium,matchlink,Season,Home_goal,Home_goal_times,Away_goal,Away_goal_times,Home_assist,Home_assist_times,Away_assist,Away_assist_times,Home_Clearances,Away_Clearances,Home_Corners,Away_Corners,Home_Fouls conceded,Away_Fouls conceded,Home_Offsides,Away_Offsides,Home_Passes,Away_Passes,Home_Possession %,Away_Possession %,Home_Shots,Away_Shots,Home_Shots on target,Away_Shots on target,Home_Tackles,Away_Tackles,Home_Touches,Away_Touches,Home_Yellow cards,Away_Yellow cards,Home_Red cards,Away_Red cards,Referee,Match_Date,Attendance,Home_formation,Away_formation,Home_captain,Away_captain,Home_starting_lineup,Home_subs_lineup,Home_subout,Home_subout_time,Home_subin,Home_subin_time,Home_yellow,Home_red,Away_starting_lineup,Away_subs_lineup,Away_subout,Away_subout_time,Away_subin,Away_subin_time,Away_yellow,Away_red
0,Leicester,Aston Villa,4-0,King Power Stadium,http://www.premierleague.com/match/46889,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Chelsea,Everton,4-0,Stamford Bridge,http://www.premierleague.com/match/46887,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Man Utd,Man City,2-0,Old Trafford,http://www.premierleague.com/match/46891,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Liverpool,Bournemouth,2-1,Anfield,http://www.premierleague.com/match/46890,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Arsenal,West Ham,1-0,Emirates Stadium,http://www.premierleague.com/match/46885,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
def split_home_away_stats(temp):
    orig_columns = temp.columns
    for elem in temp.columns:
        part1 = "Home_" + elem
        part2 = "Away_" + elem
        temp[part1] = temp[elem].apply(lambda x: 0 if x is np.nan else x[0])
        temp[part2] = temp[elem].apply(lambda x: 0 if x is np.nan else x[1])
    temp.drop(orig_columns, axis = 1, inplace = True)
    return temp

In [20]:
def conv_list_to_str(x):
    if len(x) == 0:
        return ''
    if len(x) == 1:
        return x[0]
    else:
        ans = ''
        for idx, elem in enumerate(x):
            if idx == len(x) - 1:
                ans += str(elem)
            else:
                ans += str(elem) + ','
        return ans

In [21]:
currdf[(currdf['Referee'] == 0) | (pd.isnull(currdf['Referee']))]

Unnamed: 0,Home,Away,score,stadium,matchlink,Season,Home_goal,Home_goal_times,Away_goal,Away_goal_times,Home_assist,Home_assist_times,Away_assist,Away_assist_times,Home_Clearances,Away_Clearances,Home_Corners,Away_Corners,Home_Fouls conceded,Away_Fouls conceded,Home_Offsides,Away_Offsides,Home_Passes,Away_Passes,Home_Possession %,Away_Possession %,Home_Shots,Away_Shots,Home_Shots on target,Away_Shots on target,Home_Tackles,Away_Tackles,Home_Touches,Away_Touches,Home_Yellow cards,Away_Yellow cards,Home_Red cards,Away_Red cards,Referee,Match_Date,Attendance,Home_formation,Away_formation,Home_captain,Away_captain,Home_starting_lineup,Home_subs_lineup,Home_subout,Home_subout_time,Home_subin,Home_subin_time,Home_yellow,Home_red,Away_starting_lineup,Away_subs_lineup,Away_subout,Away_subout_time,Away_subin,Away_subin_time,Away_yellow,Away_red
0,Leicester,Aston Villa,4-0,King Power Stadium,http://www.premierleague.com/match/46889,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Chelsea,Everton,4-0,Stamford Bridge,http://www.premierleague.com/match/46887,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Man Utd,Man City,2-0,Old Trafford,http://www.premierleague.com/match/46891,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Liverpool,Bournemouth,2-1,Anfield,http://www.premierleague.com/match/46890,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Arsenal,West Ham,1-0,Emirates Stadium,http://www.premierleague.com/match/46885,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Crystal Palace,Watford,1-0,Selhurst Park,http://www.premierleague.com/match/46888,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Sheffield Utd,Norwich,1-0,Bramall Lane,http://www.premierleague.com/match/46892,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,Southampton,Newcastle,0-1,St. Mary's Stadium,http://www.premierleague.com/match/46893,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Wolves,Brighton,0-0,Molineux Stadium,http://www.premierleague.com/match/46894,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Burnley,Spurs,1-1,Turf Moor,http://www.premierleague.com/match/46886,2019/20,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
def scrape_stats_new(df, row):
    # if already scrapped skip
    if df.loc[row, 'Referee'] == 0 or pd.isnull(df.loc[row, 'Referee']) == True:
        ###### get match link
        link = df.loc[row, 'matchlink']
        print(link)

        ################## Extract Match Stats
        # open new webdriver to load javascript
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(link)
        statsbut = WebDriverWait(driver, 3).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div/section/div[2]/div/div[1]/div/div/ul/li[3]')))
        ActionChains(driver).click(statsbut).perform()
        time.sleep(1.5)

        # load to beautifulsoup
        bs = BeautifulSoup(driver.page_source, 'html.parser')
        match_stats = bs.find_all("tbody", class_="matchCentreStatsContainer")
        match_stats = match_stats[0].find_all("tr")

        # try to load page again if error and not found
        if len(match_stats) == 0:
            print("error reload website")
            # open new webdriver to load javascript
            driver = webdriver.Chrome(options=chrome_options)
            driver.get(link)
            statsbut = WebDriverWait(driver, 3).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div/section/div[2]/div/div[1]/div/div/ul/li[3]')))
            ActionChains(driver).click(statsbut).perform()
            time.sleep(1.5)

            # load to beautifulsoup
            bs = BeautifulSoup(driver.page_source, 'html.parser')
            match_stats = bs.find_all("tbody", class_="matchCentreStatsContainer")
            match_stats = match_stats[0].find_all("tr")

        # store all stats in dictionary
        stats_master = []
        stats = {}

        # iterate through all the match statistics
        for idx, elem in enumerate(match_stats):
            temp = match_stats[idx].find_all("td")
            part1 = re.search(r'>(\w.+?)<',str(temp[0])).group(1)   #homestat
            part2 = re.search(r'>(\w.+?)<',str(temp[1])).group(1)  #statname
            part3 = re.search(r'>(\w.+?)<',str(temp[2])).group(1)  #awaystat  
            part1 = part1.replace("</p>", '')
            part3 = part3.replace("</p>", '')

            stats[part2] = [part1, part3]
            
        # append to masterlist
        stats_master.append(stats)

        ################## Goals, Time, and Assist Section
        home_list = []
        home_time_list = []
        away_list = []
        away_time_list = []
        assist_home_list = []
        assist_home_time_list = []
        assist_away_list = []
        assist_away_time_list = []

        # load with beautifulSoup
        bs = BeautifulSoup(driver.page_source, 'html.parser')

        # close driver and proceed to next link
        driver.quit()

        # get the goal matchEvents
        goals = bs.find_all("div", class_="matchEvents")
        assists = bs.find_all("div", class_="assists")

        # separate home and away goals
        goals_home = goals[0].find_all("div", class_="home")
        goals_away = goals[0].find_all("div", class_="away")
        goals_home = goals_home[0].findAll("div", class_="event")
        goals_away = goals_away[0].findAll("div", class_="event")

        # seperate home and away assists
        assists_home = assists[0].find_all("div", class_="home")
        assists_away = assists[0].find_all("div", class_="away")
        assists_home = assists_home[0].findAll("div", class_="event")
        assists_away = assists_away[0].findAll("div", class_="event")

        # iterate through all the home goals
        for idx, elem in enumerate(goals_home):
            temp = goals_home[idx].text.strip()

            # only take goals and not red cards
            if "Own Goal" in temp:
                temp = temp.split('\n')[0]
                # get owngoal name and time of goal
                owngoal = re.findall(r'^\D+', temp)[0].strip()
                times = re.findall(r'\+?\d{1,2}', temp)

                # append to array
                for idx, elem in enumerate(times):
                    # ignore the additional minutes sign
                    if '+' not in elem:
                        home_list.append(owngoal + ' (og)')
                        home_time_list.append(elem)

            elif "Goal" in temp:
                temp = temp.split('\n')[0]
                # get goalscorer name and time of goal
                goalscorer = re.findall(r'^\D+', temp)[0].strip()
                times = re.findall(r'\+?\d{1,2}', temp)

                # append to array
                for idx, elem in enumerate(times):
                    # ignore additional minutes sign
                    if '+' not in elem:
                        home_list.append(goalscorer)
                        home_time_list.append(elem)
                
            elif "pen" in temp:
                temp = temp.split('\n')[0]
                # get goalscorer name and time
                goalscorer = re.findall(r'^\D+', temp)[0].strip()
                times = re.findall(r'\+?\d{1,2}', temp)

                for idx, elem in enumerate(times):
                    # ignore additional minutes sign
                    if '+' not in elem:
                        home_list.append(goalscorer)
                        home_time_list.append(elem)
            else:
                continue

        # iterate through all the away goals
        for idx, elem in enumerate(goals_away):
            temp = goals_away[idx].text.strip()
            
            # only take goals and not red cards
            if "Own Goal" in temp:
                temp = temp.split('\n')[0]
                # get goalscorer name and time
                owngoal = re.findall(r'^\D+', temp)[0].strip()
                times = re.findall(r'\+?\d{1,2}', temp)

                for idx, elem in enumerate(times):
                    # ignore additional minutes sign
                    if '+' not in elem:
                        away_list.append(owngoal + ' (og)')
                        away_time_list.append(elem)

            elif "Goal" in temp:
                temp = temp.split('\n')[0]
                # get goalscorer name and time
                goalscorer = re.findall(r'^\D+', temp)[0].strip()
                times = re.findall(r'\+?\d{1,2}', temp)

                for idx, elem in enumerate(times):
                    # ignore additional minutes sign
                    if '+' not in elem:
                        away_list.append(goalscorer)
                        away_time_list.append(elem)
                        
            elif "pen" in temp:
                temp = temp.split('\n')[0]
                # get goalscorer name and time
                goalscorer = re.findall(r'^\D+', temp)[0].strip()
                times = re.findall(r'\+?\d{1,2}', temp)

                for idx, elem in enumerate(times):
                    # ignore additional minutes sign
                    if '+' not in elem:
                        away_list.append(goalscorer)
                        away_time_list.append(elem)
                        
            else:
                continue

        # iterate through all the home assists
        for idx, elem in enumerate(assists_home):
            temp = assists_home[idx].text.strip()
            temp = temp.split('\n')[0]

            # get goalscorer name and time of goal
            assister = re.findall(r'^\D+', temp)[0].strip()
            times = re.findall(r'\+?\d{1,2}', temp)

            # append to array
            for idx, elem in enumerate(times):
                if '+' not in elem:
                    assist_home_list.append(assister)
                    assist_home_time_list.append(elem)

        # iterate through all the away assists
        for idx, elem in enumerate(assists_away):
            temp = assists_away[idx].text.strip()
            temp = temp.split('\n')[0]

            # get goalscorer name and time
            assister = re.findall(r'^\D+', temp)[0].strip()
            times = re.findall(r'\+?\d{1,2}', temp)

            for idx, elem in enumerate(times):
                # ignore additional minutes sign
                if '+' not in elem:
                    assist_away_list.append(assister)
                    assist_away_time_list.append(elem)
                
        ############################# Extract Match Date, Referee, Attendance
        # Match Date
        date = bs.find_all("div", class_="matchDate renderMatchDateContainer")
        date = re.search(r'data-kickoff="(\d*)"', str(date[0])).group(1)
      

        # Referee
        ref = bs.find_all("div", class_="referee")
        referee = ref[0].text.strip()

        # Attendance
        att = bs.find_all("div", class_="attendance hide-m")
        # prevent attenande error
        try:
            att = att[0].text.strip()
            att = att.replace("Att: ", '')
            att = att.replace(",", '')
            attendance = att
        except:
            attendance = 0

        ################################ Extract team formation and lineups
        formation = bs.find_all("div", class_="position")
        home_formation = formation[0].text.split('\n')[2].strip()
        away_formation = formation[1].text.split('\n')[2].strip()

        ############################### Extract Club captains
        capt_h = bs.find_all("ul", class_="startingLineUpContainer squadList home")[0].find_all("li", class_="player")
        home_captain = ''
        for elem in capt_h:
            lencapt = elem.find_all("div", class_="cpt")
            if len(lencapt) >= 1:
                home_captain = re.search(r'\D*', elem.find_all("div", class_="name")[0].text.strip()).group(0).strip()

        capt_a = bs.find_all("ul", class_="startingLineUpContainer squadList")[0].find_all("li", class_="player")
        away_captain = ''
        for elem in capt_a:
            lencapt = elem.find_all("div", class_="cpt")
            if len(lencapt) >= 1:
                away_captain = re.search(r'\D*', elem.find_all("div", class_="name")[0].text.strip()).group(0).strip()

        ################################ Parse Starting Lineup, Subs, Subsin, Subsoff
        lineup = bs.find_all("div", class_="matchLineupTeamContainer")

        squad_home_starter = lineup[0].find_all("ul", class_="startingLineUpContainer squadList home")[0].find_all("div", class_="name")
        squad_home_subs = lineup[0].find_all("ul", class_="startingLineUpContainer squadList home")[1].find_all("div", class_="name")
        squad_away_starter = lineup[1].find_all("ul", class_="startingLineUpContainer squadList")[0].find_all("div", class_="name")
        squad_away_subs = lineup[1].find_all("ul", class_="startingLineUpContainer squadList")[1].find_all("div", class_="name")

        home_starting_lineup = []
        home_subs_lineup = []
        home_subout = []
        home_subout_time = []
        home_subin = []
        home_subin_time = []
        home_yellow = []
        home_red = []

        away_starting_lineup = []
        away_subs_lineup = []
        away_subout = []
        away_subout_time = []
        away_subin = []
        away_subin_time = []
        away_yellow = []
        away_red = []

        ###### to parse home squad starting lineup
        for idx, elem in enumerate(squad_home_starter):
            player = elem.text.strip()
            sub_out = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())

            # starting lineup player append to list
            if len(sub_out) > 0:
                # append starting lineup player
                home_starting_lineup.append(re.findall(r'\D*', player)[0].strip())
                # append subout time and player
                home_subout.append(re.findall(r'\D*', player)[0].strip())
                home_subout_time.append(sub_out[0][0:2])
            else:
                home_starting_lineup.append(player)

            # yellow and red carded players
            # yellow card
            if len(elem.find_all("span", class_="icn card-yellow")) > 0:
                home_yellow.append(re.findall(r'\D*', player)[0].strip())
            # red card
            if len(elem.find_all("span", class_="icn card-red")) > 0:
                home_red.append(re.findall(r'\D*', player)[0].strip())

        # to parse substitute squad
        for idx, elem in enumerate(squad_home_subs):
            player = elem.text.strip()
            sub_in = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())
            
            # substitute player append to list
            if len(sub_in) > 0:
                # append starting subs player
                home_subs_lineup.append(re.findall(r'\D*', player)[0].strip())
                # append subin time and player
                home_subin.append(re.findall(r'\D*', player)[0].strip())
                home_subin_time.append(sub_in[0][0:2])

                # players who are subbed in then are subbed out again
                if len(sub_in) > 1:
                    home_subout.append(re.findall(r'\D*', player)[0].strip())
                    home_subout_time.append(sub_in[1][0:2])
            else:
                home_subs_lineup.append(player)
                
            # yellow and red carded players
            # yellow card
            if len(elem.find_all("span", class_="icn card-yellow")) > 0:
                home_yellow.append(re.findall(r'\D*', player)[0].strip())
            # red card
            if len(elem.find_all("span", class_="icn card-red")) > 0:
                home_red.append(re.findall(r'\D*', player)[0].strip())

        ##### to parse away squad starting lineup
        for idx, elem in enumerate(squad_away_starter):
            player = elem.text.strip()
            sub_out = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())

            # starting lineup player append to list
            if len(sub_out) > 0:
                # append starting lineup player
                away_starting_lineup.append(re.findall(r'\D*', player)[0].strip())
                # append subout time and player
                away_subout.append(re.findall(r'\D*', player)[0].strip())
                away_subout_time.append(sub_out[0][0:2])
            else:
                away_starting_lineup.append(player)

            # yellow and red carded players
            # yellow card
            if len(elem.find_all("span", class_="icn card-yellow")) > 0:
                away_yellow.append(re.findall(r'\D*', player)[0].strip())
            # red card
            if len(elem.find_all("span", class_="icn card-red")) > 0:
                away_red.append(re.findall(r'\D*', player)[0].strip())

        # to parse substitute squad
        for idx, elem in enumerate(squad_away_subs):
            player = elem.text.strip()
            sub_in = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())
            
            # substitute player append to list
            if len(sub_in) > 0:
                # append starting subs player
                away_subs_lineup.append(re.findall(r'\D*', player)[0].strip())
                # append subin time and player
                away_subin.append(re.findall(r'\D*', player)[0].strip())
                away_subin_time.append(sub_in[0][0:2])

                # players who are subbed in then are subbed out again
                if len(sub_in) > 1:
                    away_subout.append(re.findall(r'\D*', player)[0].strip())
                    away_subout_time.append(sub_in[1][0:2])
            else:
                away_subs_lineup.append(player)
                
            # yellow and red carded players
            # yellow card
            if len(elem.find_all("span", class_="icn card-yellow")) > 0:
                away_yellow.append(re.findall(r'\D*', player)[0].strip())
            # red card
            if len(elem.find_all("span", class_="icn card-red")) > 0:
                away_red.append(re.findall(r'\D*', player)[0].strip())

        # append to dataframe  
        df.loc[row, 'Home_goal'] = conv_list_to_str(home_list)
        df.loc[row, 'Home_goal_times'] = conv_list_to_str(home_time_list)
        df.loc[row, 'Away_goal'] = conv_list_to_str(away_list)
        df.loc[row, 'Away_goal_times'] = conv_list_to_str(away_time_list)
        df.loc[row, 'Home_assist'] = conv_list_to_str(assist_home_list)
        df.loc[row, 'Home_assist_times'] = conv_list_to_str(assist_home_time_list)
        df.loc[row, 'Away_assist'] = conv_list_to_str(assist_away_list)
        df.loc[row, 'Away_assist_times'] = conv_list_to_str(assist_away_time_list)
        
        # statsdataframe
        statsdf = pd.DataFrame.from_dict(stats_master)
        statsdf = split_home_away_stats(statsdf)
                
        for elem in statsdf.columns:
            df.loc[row, elem] = statsdf.loc[0,elem]
        
        # referee, matchdate, attendance
        df.loc[row, 'Referee'] = referee
        df.loc[row, 'Match_Date'] = date
        df.loc[row, 'Attendance'] = attendance

        # Formation and Captain
        df.loc[row, 'Home_formation'] = home_formation
        df.loc[row, 'Away_formation'] = away_formation
        df.loc[row, 'Home_captain'] = home_captain
        df.loc[row, 'Away_captain'] = away_captain

        # stats
        df.loc[row, 'Home_starting_lineup'] = conv_list_to_str(home_starting_lineup)
        df.loc[row, 'Home_subs_lineup'] = conv_list_to_str(home_subs_lineup)
        df.loc[row, 'Home_subout'] = conv_list_to_str(home_subout)
        df.loc[row, 'Home_subout_time'] = conv_list_to_str(home_subout_time)
        df.loc[row, 'Home_subin'] = conv_list_to_str(home_subin)
        df.loc[row, 'Home_subin_time'] = conv_list_to_str(home_subin_time)
        df.loc[row, 'Home_yellow'] = conv_list_to_str(home_yellow)
        df.loc[row, 'Home_red'] = conv_list_to_str(home_red)

        df.loc[row, 'Away_starting_lineup'] = conv_list_to_str(away_starting_lineup)
        df.loc[row, 'Away_subs_lineup'] = conv_list_to_str(away_subs_lineup)
        df.loc[row, 'Away_subout'] = conv_list_to_str(away_subout)
        df.loc[row, 'Away_subout_time'] = conv_list_to_str(away_subout_time)
        df.loc[row, 'Away_subin'] = conv_list_to_str(away_subin)
        df.loc[row, 'Away_subin_time'] = conv_list_to_str(away_subin_time)
        df.loc[row, 'Away_yellow'] = conv_list_to_str(away_yellow)
        df.loc[row, 'Away_red'] = conv_list_to_str(away_red)

In [23]:
# scrape statistics for past 10 years
Parallel(n_jobs=-1, require='sharedmem')(delayed(scrape_stats_new)(df, i) for i in range(df.shape[0]));

In [24]:
# scrape statistics for current season
Parallel(n_jobs=-1, require='sharedmem')(delayed(scrape_stats_new)(currdf, i) for i in range(currdf.shape[0]));

http://www.premierleague.com/match/46889
http://www.premierleague.com/match/46887
http://www.premierleague.com/match/46891
http://www.premierleague.com/match/46890
error reload website
http://www.premierleague.com/match/46885
http://www.premierleague.com/match/46888
http://www.premierleague.com/match/46892
http://www.premierleague.com/match/46893
http://www.premierleague.com/match/46894
http://www.premierleague.com/match/46886
http://www.premierleague.com/match/46878
http://www.premierleague.com/match/46882
http://www.premierleague.com/match/46877
http://www.premierleague.com/match/46876
http://www.premierleague.com/match/46880
http://www.premierleague.com/match/46884
http://www.premierleague.com/match/46883
http://www.premierleague.com/match/46881
http://www.premierleague.com/match/46870
http://www.premierleague.com/match/46871
http://www.premierleague.com/match/46874
http://www.premierleague.com/match/46865
http://www.premierleague.com/match/46867
http://www.premierleague.com/match/4

In [25]:
# ### Fix the broken data (0% Possesion) 
# ## PLEASE OPEN THE TEMP FILE FIRST THEN manually put index in list to rescrape
# for idx in index_error_df:
#     df.loc[idx, 'Referee'] = 0

In [26]:
#### Fix the broken data (0% Possesion) 
### PLEASE OPEN THE TEMP FILE FIRST THEN manually put index in list to rescrape
# for idx in index_error_currdf:
#     currdf.loc[idx, 'Referee'] = 0

In [27]:
### TEMPORARY PLACEMENT to keep data if timeout or error
# df.to_csv('tempdf.csv', index=False)
# currdf.to_csv('tempcurrdf.csv', index=False)

In [28]:
# check if goal scored == goal scorer if not print index
def check_goal_scored(df):
    index_error = []

    for idx in range(df.shape[0]):
        goals_home = df.loc[idx, 'score'][0]
        goals_away = df.loc[idx, 'score'][2]
        
        if df.loc[idx, 'Home_goal'] == '' and int(goals_home) > 0:
            index_error.append(idx)
        elif pd.isnull(df.loc[idx, 'Home_goal']) == True and int(goals_home) != 0:
            index_error.append(idx)
        elif pd.isnull(df.loc[idx, 'Home_goal']) == True and int(goals_home) == 0:
            continue
        elif len(df.loc[idx, 'Home_goal'].split(',')) != int(goals_home) and int(goals_home) != 0:
            index_error.append(idx)
        
            
        if df.loc[idx, 'Away_goal'] == '' and int(goals_away) > 0:
            index_error.append(idx)
        elif pd.isnull(df.loc[idx, 'Away_goal']) == True and int(goals_away) != 0:
            index_error.append(idx)
        elif pd.isnull(df.loc[idx, 'Away_goal']) == True and int(goals_away) == 0:
            continue
        elif len(df.loc[idx, 'Away_goal'].split(',')) != int(goals_away) and int(goals_away) != 0:
            index_error.append(idx)
            
    return index_error

In [29]:
# check goal scored and goals scorers. Should be 0
index_error_df = check_goal_scored(df)
print('Errors:', len(index_error_df))

Errors: 0


In [30]:
# check goal scored and goals scorers. Should be 0
index_error_currdf = check_goal_scored(currdf)
print('Errors:', len(index_error_currdf))

Errors: 0


In [37]:
# ## fix error columns -- OPTIONAL if error shape not 61
# for idx in range(currdf.shape[0]):
#     if pd.isnull(currdf.loc[idx, 'Home_Shots on target']) == True:
#         currdf.loc[idx, 'Home_Shots on target'] = currdf.loc[idx, 'Home_label.stat.ontarget_scoring_att']
#     if pd.isnull(currdf.loc[idx, 'Away_Shots on target']) == True:
#         currdf.loc[idx, 'Away_Shots on target'] = currdf.loc[idx, 'Away_label.stat.ontarget_scoring_att']

# currdf.drop(['Home_label.stat.ontarget_scoring_att', 'Away_label.stat.ontarget_scoring_att'], axis = 1, inplace = True)

In [38]:
currdf.replace('', np.nan, regex = True, inplace = True)

print('df shape', df.shape)
print('currdf shape', currdf.shape)

df shape (3420, 61)
currdf shape (288, 61)


In [39]:
# check of columns need to be empty
set(currdf.columns) - (set(df.columns))

set()

In [40]:
def reorganize_timing_orders(df):
    if pd.isnull(df['Home_goal_times']) == False and len(df['Home_goal_times'].split(',')) >= 1:        
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Home_goal_times'].split(','))), df['Home_goal'].split(',')))))
        df['Home_goal'] = conv_list_to_str(b)
        df['Home_goal_times'] = conv_list_to_str(a)
    
    if pd.isnull(df['Home_assist_times']) == False and len(df['Home_assist_times'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Home_assist_times'].split(','))), df['Home_assist'].split(',')))))
        df['Home_assist'] = conv_list_to_str(b)
        df['Home_assist_times'] = conv_list_to_str(a)
    
    if pd.isnull(df['Away_goal_times']) == False and len(df['Away_goal_times'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Away_goal_times'].split(','))), df['Away_goal'].split(',')))))
        df['Away_goal'] = conv_list_to_str(b)
        df['Away_goal_times'] = conv_list_to_str(a)
    
    if pd.isnull(df['Away_assist_times']) == False and len(df['Away_assist_times'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Away_assist_times'].split(','))), df['Away_assist'].split(',')))))
        df['Away_assist'] = conv_list_to_str(b)
        df['Away_assist_times'] = conv_list_to_str(a)
    
    if pd.isnull(df['Home_subout_time']) == False and len(df['Home_subout_time'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Home_subout_time'].split(','))), df['Home_subout'].split(',')))))
        df['Home_subout'] = conv_list_to_str(b)
        df['Home_subout_time'] = conv_list_to_str(a)
    
    if pd.isnull(df['Home_subin_time']) == False and len(df['Home_subin_time'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Home_subin_time'].split(','))), df['Home_subin'].split(',')))))
        df['Home_subin'] = conv_list_to_str(b)
        df['Home_subin_time'] = conv_list_to_str(a)
    
    if pd.isnull(df['Away_subout_time']) == False and len(df['Away_subout_time'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Away_subout_time'].split(','))), df['Away_subout'].split(',')))))
        df['Away_subout'] = conv_list_to_str(b)
        df['Away_subout_time'] = conv_list_to_str(a)
    
    if pd.isnull(df['Away_subin_time']) == False and len(df['Away_subin_time'].split(',')) >= 1:
        a, b = (list(t) for t in zip(*sorted(zip(list(map(int, df['Away_subin_time'].split(','))), df['Away_subin'].split(',')))))
        df['Away_subin'] = conv_list_to_str(b)
        df['Away_subin_time'] = conv_list_to_str(a)
    
    return df

In [41]:
df = df.apply(lambda x: reorganize_timing_orders(x), axis = 1)
currdf = currdf.apply(lambda x: reorganize_timing_orders(x), axis = 1)

In [42]:
df.head()

Unnamed: 0,Home,Away,score,stadium,matchlink,Season,Home_goal,Home_goal_times,Away_goal,Away_goal_times,Home_assist,Home_assist_times,Away_assist,Away_assist_times,Home_Clearances,Away_Clearances,Home_Corners,Away_Corners,Home_Fouls conceded,Away_Fouls conceded,Home_Offsides,Away_Offsides,Home_Passes,Away_Passes,Home_Possession %,Away_Possession %,Home_Shots,Away_Shots,Home_Shots on target,Away_Shots on target,Home_Tackles,Away_Tackles,Home_Touches,Away_Touches,Home_Yellow cards,Away_Yellow cards,Home_Red cards,Away_Red cards,Referee,Match_Date,Attendance,Home_formation,Away_formation,Home_captain,Away_captain,Home_starting_lineup,Home_subs_lineup,Home_subout,Home_subout_time,Home_subin,Home_subin_time,Home_yellow,Home_red,Away_starting_lineup,Away_subs_lineup,Away_subout,Away_subout_time,Away_subin,Away_subin_time,Away_yellow,Away_red
0,Brighton,Man City,1-4,Amex Stadium,http://www.premierleague.com/match/38678,2018/19,Glenn Murray,27.0,"Sergio Agüero,Aymeric Laporte,Riyad Mahrez,Ilk...",28386372.0,Pascal Groß,27.0,"David Silva,Riyad Mahrez,David Silva",283863.0,41,5,2,6,12,8,1,1,245,796,23.7,76.3,6,20,2,9,16,15,401,955,0,0,0,0,Michael Oliver,1557669600000,30662,4-5-1,4-4-1-1,Bruno,Vincent Kompany,"Mat Ryan,Lewis Dunk,Bruno,Bernardo,Shane Duffy...","David Button,Dan Burn,Gaëtan Bong,Martín Monto...","Alireza Jahanbakhsh,Glenn Murray,Bruno",676784,"Florin Andone,Jürgen Locadia,Martín Montoya",676784,,,"Ederson,Aymeric Laporte,Kyle Walker,Oleksandr ...","Arijanet Muric,John Stones,Nicolás Otamendi,Da...","David Silva,Vincent Kompany,Kyle Walker",788688,"Kevin De Bruyne,Nicolás Otamendi,Danilo",788688,,
1,Burnley,Arsenal,1-3,Turf Moor,http://www.premierleague.com/match/38679,2018/19,Ashley Barnes,65.0,"Pierre-Emerick Aubameyang,Pierre-Emerick Aubam...",526390.0,Johann Gudmundsson,65.0,"Alex Iwobi,Alex Iwobi",6390.0,21,27,4,5,11,3,4,0,341,533,39.7,60.3,14,17,5,6,15,11,527,714,5,1,0,0,Mike Dean,1557669600000,21461,4-4-2,4-2-3-1,Tom Heaton,Nacho Monreal,"Tom Heaton,Charlie Taylor,Ben Mee,Matthew Lowt...","Joe Hart,Stephen Ward,Kevin Long,Robbie Brady,...","Dwight McNeil,Chris Wood,Jeff Hendrick",647782,"Johann Gudmundsson,Peter Crouch,Robbie Brady",647782,"Tom Heaton,Matthew Lowton,James Tarkowski,Jack...",,"Bernd Leno,Konstantinos Mavropanos,Shkodran Mu...","Petr Cech,Laurent Koscielny,Sead Kolasinac,Gra...","Konstantinos Mavropanos,Joseph Willock",3462,"Laurent Koscielny,Eddie Nketiah",3462,Matteo Guendouzi,
2,Crystal Palace,Bournemouth,5-3,Selhurst Park,http://www.premierleague.com/match/38680,2018/19,"Michy Batshuayi,Michy Batshuayi,Jack Simpson (...",2432376580.0,"Jefferson Lerma,Jordon Ibe,Joshua King",455673.0,"Aaron Wan-Bissaka,Wilfried Zaha,Wilfried Zaha",326580.0,"Nathaniel Clyne,Adam Smith,Chris Mepham",455673.0,23,19,4,4,11,8,4,1,429,517,45.0,55.0,17,16,8,8,16,18,627,713,3,0,0,0,Roger East,1557669600000,25433,4-4-2,4-4-1-1,Luka Milivojevic,Steve Cook,"Vicente Guaita,Joel Ward,Martin Kelly,Aaron Wa...","Wayne Hennessey,Nikola Tavares,Luke Dreher,Bak...","Wilfried Zaha,Michy Batshuayi,Andros Townsend",878990,"Bakary Sako,Connor Wickham,Luke Dreher",878990,"Luka Milivojevic,James McArthur,Wilfried Zaha",,"Mark Travers,Nathaniel Clyne,Steve Cook,Jack S...","Artur Boruc,Chris Mepham,Matt Butcher,Emerson ...","Jack Simpson,Jordon Ibe,Nathaniel Clyne",707089,"Chris Mepham,Lys Mousset,Sam Surridge",707089,,
3,Fulham,Newcastle,0-4,Craven Cottage,http://www.premierleague.com/match/38681,2018/19,,,"Jonjo Shelvey,Ayoze Pérez,Fabian Schär,Salomón...",9116190.0,,,"Matt Ritchie,Matt Ritchie",961.0,7,38,5,5,6,8,1,1,765,358,68.1,31.9,16,13,2,6,12,15,940,532,1,0,0,0,Kevin Friend,1557669600000,24979,4-2-3-1,5-4-1,Tom Cairney,Jamaal Lascelles,"Sergio Rico,Alfie Mawson,Cyrus Christie,Maxime...","Fabri,Steven Sessegnon,Tim Ream,Ryan Babel,Nee...","Cyrus Christie,Floyd Ayité,Ryan Sessegnon",657481,"Ryan Babel,Harvey Elliott,Neeskens Kebano",657481,Joe Bryan,,"Martin Dubravka,Jamaal Lascelles,Matt Ritchie,...","Karl Darlow,Lewis Cass,Federico Fernández,Ciar...","Christian Atsu,Isaac Hayden,Paul Dummett",666981,"Yoshinori Muto,Federico Fernández,Kenedy",666981,,
4,Leicester,Chelsea,0-0,King Power Stadium,http://www.premierleague.com/match/38682,2018/19,,,,,,,,,17,16,4,5,9,8,1,2,470,533,46.9,53.1,9,14,3,4,16,12,644,711,0,1,0,0,Anthony Taylor,1557669600000,32140,4-1-4-1,4-3-3,Kasper Schmeichel,César Azpilicueta,"Kasper Schmeichel,Ben Chilwell,Ricardo Pereira...","Danny Ward,Wes Morgan,Danny Simpson,Harvey Bar...","Wilfred Ndidi,Marc Albrighton,James Maddison",667684,"Shinji Okazaki,Danny Simpson,Harvey Barnes",667684,,,"Willy Caballero,César Azpilicueta,Marcos Alons...","Kepa Arrizabalaga,Emerson,Marc Guehi,Andreas C...","Willian,Ross Barkley,Gonzalo Higuaín",697684,"Eden Hazard,Mateo Kovacic,Olivier Giroud",697684,Jorginho,


In [43]:
currdf.head()

Unnamed: 0,Home,Away,score,stadium,matchlink,Season,Home_goal,Home_goal_times,Away_goal,Away_goal_times,Home_assist,Home_assist_times,Away_assist,Away_assist_times,Home_Clearances,Away_Clearances,Home_Corners,Away_Corners,Home_Fouls conceded,Away_Fouls conceded,Home_Offsides,Away_Offsides,Home_Passes,Away_Passes,Home_Possession %,Away_Possession %,Home_Shots,Away_Shots,Home_Shots on target,Away_Shots on target,Home_Tackles,Away_Tackles,Home_Touches,Away_Touches,Home_Yellow cards,Away_Yellow cards,Home_Red cards,Away_Red cards,Referee,Match_Date,Attendance,Home_formation,Away_formation,Home_captain,Away_captain,Home_starting_lineup,Home_subs_lineup,Home_subout,Home_subout_time,Home_subin,Home_subin_time,Home_yellow,Home_red,Away_starting_lineup,Away_subs_lineup,Away_subout,Away_subout_time,Away_subin,Away_subin_time,Away_yellow,Away_red
0,Leicester,Aston Villa,4-0,King Power Stadium,http://www.premierleague.com/match/46889,2019/20,"Harvey Barnes,Jamie Vardy,Jamie Vardy,Harvey B...",40637985,,,"Marc Albrighton,Marc Albrighton",4085,,,12,22,9,0,15,12,3,2,543,301,64.6,35.4,15,4,0.0,0.0,24,23,757,510,2,1,0,0,Michael Oliver,1583784000000,32125,4-1-4-1,4-1-4-1,Kasper Schmeichel,Jack Grealish,"Kasper Schmeichel,James Justin,Çaglar Söyüncü,...","Danny Ward,Christian Fuchs,Wes Morgan,Youri Ti...","Kelechi Iheanacho,Dennis Praet,Wilfred Ndidi",597683,"Jamie Vardy,Youri Tielemans,Nampalys Mendy",597683,"Jonny Evans,Dennis Praet",,"Pepe Reina,Björn Engels,Matt Targett,Tyrone Mi...","Ørjan Nyland,Ezri Konsa Ngoyo,Neil Taylor,Dani...","Ahmed El Mohamady,Conor Hourihane",6367,"Anwar El Ghazi,Keinan Davis",6367,Conor Hourihane,
1,Chelsea,Everton,4-0,Stamford Bridge,http://www.premierleague.com/match/46887,2019/20,"Mason Mount,Pedro,Willian,Olivier Giroud",14215154,,,"Pedro,Ross Barkley,Ross Barkley,Willian",14215154,,,12,11,6,1,8,10,0,1,646,415,60.2,39.8,17,3,0.0,0.0,13,15,823,571,1,2,0,0,Kevin Friend,1583676000000,40694,4-3-3,4-4-2,César Azpilicueta,Gylfi Sigurdsson,"Kepa Arrizabalaga,César Azpilicueta,Marcos Alo...","Willy Caballero,Reece James,Fikayo Tomori,Andr...","Mason Mount,Willian,Olivier Giroud",607186,"Reece James,Faustino Anjorin,Armando Broja",607186,Kurt Zouma,,"Jordan Pickford,Djibril Sidibé,Mason Holgate,M...","Maarten Stekelenburg,Leighton Baines,Yerry Min...","Bernard,Tom Davies,Dominic Calvert-Lewin",455876,"Theo Walcott,Moise Kean,Anthony Gordon",455876,"Mason Holgate,André Gomes",
2,Man Utd,Man City,2-0,Old Trafford,http://www.premierleague.com/match/46891,2019/20,"Anthony Martial,Scott McTominay",3090,,,Bruno Fernandes,30,,,33,8,2,11,11,9,1,2,294,761,27.7,72.3,12,7,0.0,0.0,19,15,488,948,2,4,0,0,Mike Dean,1583685000000,73288,3-4-1-2,4-3-3,Harry Maguire,Fernandinho,"David de Gea,Harry Maguire,Luke Shaw,Victor Li...","Sergio Romero,Eric Bailly,Axel Tuanzebe,Scott ...","Anthony Martial,Brandon Williams,Bruno Fernandes",787888,"Eric Bailly,Scott McTominay,Odion Ighalo",787888,"Harry Maguire,Fred",,"Ederson,Fernandinho,Oleksandr Zinchenko,João C...","Claudio Bravo,Benjamin Mendy,Eric García,Kyle ...","Bernardo Silva,Sergio Agüero,Oleksandr Zinchenko",595977,"Gabriel Jesus,Riyad Mahrez,Benjamin Mendy",595977,"Fernandinho,João Cancelo,Rodrigo,Gabriel Jesus",
3,Liverpool,Bournemouth,2-1,Anfield,http://www.premierleague.com/match/46890,2019/20,"Mohamed Salah,Sadio Mané",2433,Callum Wilson,9.0,"Sadio Mané,Virgil van Dijk",2433,Jefferson Lerma,9.0,20,43,9,4,10,12,1,5,729,246,74.5,25.5,14,6,0.0,0.0,7,9,917,419,0,1,0,0,Paul Tierney,1583584200000,53323,4-3-3,4-1-4-1,James Milner,Steve Cook,"Adrián,Joseph Gomez,Virgil van Dijk,James Miln...","Andy Lonergan,Joel Matip,Neco Williams,Naby Ke...","Alex Oxlade-Chamberlain,Roberto Firmino",8490,"Adam Lallana,Divock Origi",8490,,,"Aaron Ramsdale,Adam Smith,Steve Cook,Jack Stac...","Artur Boruc,Diego Rico,Jack Simpson,Dan Goslin...","Steve Cook,Junior Stanislas,Jefferson Lerma",196880,"Jack Simpson,Dominic Solanke,Dan Gosling",196880,Callum Wilson,
4,Arsenal,West Ham,1-0,Emirates Stadium,http://www.premierleague.com/match/46885,2019/20,Alexandre Lacazette,78,,,Mesut Özil,78,,,28,18,6,7,11,9,2,2,621,264,68.9,31.1,9,14,0.0,0.0,13,11,799,448,1,2,0,0,Martin Atkinson,1583593200000,60335,4-2-3-1,4-4-2,Pierre-Emerick Aubameyang,Mark Noble,"Bernd Leno,Bukayo Saka,Sokratis,Pablo Marí,Dav...","Emiliano Martínez,Héctor Bellerín,Gabriel Mart...","Eddie Nketiah,Nicolas Pépé,Mesut Özil",596989,"Alexandre Lacazette,Reiss Nelson,Héctor Bellerín",596989,Sokratis,,"Lukasz Fabianski,Issa Diop,Jeremy Ngakia,Angel...","Darren Randolph,Fabián Balbuena,Pablo Zabaleta...","Mark Noble,Pablo Fornals,Jarrod Bowen",818790,"Tomas Soucek,Felipe Anderson,Robert Snodgrass",818790,"Pablo Fornals,Michail Antonio",


In [44]:
# publish to local file
df.to_csv('past_results.csv', index = False)
currdf.to_csv('curr_results.csv', index = False)

# Scraping Fixtures

In [45]:
# Global dataframe to store result information
fixturesdf = pd.DataFrame()

In [46]:
def scrape_current_fixtures(curr_season):    
    ####### Restart URL
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(fixtures_url)
    # scroll down to get all matches
    SCROLL_PAUSE_TIME = 1

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    # click accept cookies button
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    nextx = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '/html/body/section/div/div')))
    ActionChains(driver).click(nextx).perform()

    # pause to get caught up
    time.sleep(3)
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
            
    ######## Scroll down infinetely section to get all matches
    SCROLL_PAUSE_TIME = 2

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    time.sleep(1)

    ####### Convert to beautifulsoup and get result information
    # initialize beautifulsoup and get all matches
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    fixtures = bs.find_all("div", class_="fixtures__matches-list")
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    
    # close the unneeded driver
    driver.quit()
    
    # sleep to catch up with time
    time.sleep(1)

    # scrape the results page
    all_team1 = []
    all_team2 = []
    all_score = []
    all_links = []
    all_stadium = []
    
    for idx, elem in enumerate(fixtures):
        for idx2, elem2 in enumerate(fixtures[idx].find_all("span", class_="overview")):
            teams = re.findall(r'<span class="shortname">(.+?)<',str(elem2))
            
            all_team1.append(teams[0]) # append team 1 (HOME)
            all_team2.append(teams[1]) # append team 2 (AWAY)
            
            stadiumtxt = elem2.find_all("span", class_="stadiumName")
            stadiumtxt = re.findall(r'<\/span>(.+?),', str(stadiumtxt[0]))[0]            
            all_stadium.append(stadiumtxt) # append stadium name

    df = pd.DataFrame()
    df['Home'] = all_team1
    df['Away'] = all_team2
    df['stadium'] = all_stadium
    df['Season'] = curr_season
    df.drop_duplicates(inplace = True)
    df.reset_index(inplace = True, drop = True)

    return df

In [47]:
fixturesdf = scrape_current_fixtures('2019/2020')

In [48]:
fixturesdf.head()

Unnamed: 0,Home,Away,stadium,Season
0,Bournemouth,Crystal Palace,Vitality Stadium,2019/2020
1,Aston Villa,Sheffield Utd,Villa Park,2019/2020
2,Aston Villa,Chelsea,Villa Park,2019/2020
3,Brighton,Arsenal,Amex Stadium,2019/2020
4,Burnley,Watford,Turf Moor,2019/2020


In [49]:
# publish to local file
fixturesdf.to_csv('fixturesdf.csv', index = True)

# Club Badges

In [None]:
def scrape_badges():
    ####### Get URL
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(tables_url)
    # scroll down to get all matches
    SCROLL_PAUSE_TIME = 1

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    # click accept cookies button
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    nextx = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '/html/body/section/div/div')))
    ActionChains(driver).click(nextx).perform()

    # pause to get caught up
    time.sleep(3)
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    
    ######## click season button
    seasonbut = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div[2]/div[1]/div[1]/section/div[2]/div[2]')))
    ActionChains(driver).click(seasonbut).perform()
    
    time.sleep(2)
    
    ######## click all season button
    seasonclick = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div[2]/div[1]/div[1]/section/div[2]/ul/li[1]')))
    ActionChains(driver).click(seasonclick).perform()
        
    time.sleep(2)
    
    ####### Convert to beautifulsoup and get result information
    # initialize beautifulsoup and get all matches
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    clubbadges = bs.find_all("td", class_="team")
    
    # close the unneeded driver
    driver.quit()
    
    ######### Download badge images
    club_mapping = {}
    
    # extract badge number
    for elem in clubbadges:
        team_name = elem.find_all("span", class_="long")[0]
        team_name = re.search(r'>(\w.+?)<', str(team_name)).group(1)
        badge_num = elem.find_all("span", class_="badge-25")[0]
        badge_num = re.search(r't\d*', str(badge_num)).group(0)
        
        if team_name not in club_mapping.keys():
            club_mapping[team_name] = badge_num
    
    # base url for images
    base_url = 'http://premierleague-static-files.s3.amazonaws.com/premierleague/badges/'
    
    # store badges in local file
    for keys in club_mapping:
        val = club_mapping[keys]
        url = base_url + val + '.png'
        filename = keys
        urllib.request.urlretrieve(url, os.path.join('Club Badges', filename + ".png"))

In [None]:
scrape_badges()

# Scraping Players

In [5]:
playerdf = pd.DataFrame()

In [6]:
def player_details_scrape():
    ####### Get URL
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(player_url)
    
    # scroll down to get all players
    SCROLL_PAUSE_TIME = 3
    
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page   
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # click accept cookies button
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    nextx = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '/html/body/section/div/div')))
    ActionChains(driver).click(nextx).perform()

    # pause to get caught up
    time.sleep(2)

    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")

    # time to catch up
    time.sleep(2)
    
    print('finish scrolling')
    
    # declare variables
    player_name_list = []
    player_position_list = []
    player_country_list = []
    player_link_list = []
    
    # initialize beautifulsoup and get all player details
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    players_raw = bs.find_all("tbody", class_="dataContainer indexSection")
    players = bs.find_all("td", players_raw[0])
    
    time.sleep(1)
    
    # iterate through list of players
    for elem in players:
        player_name_list.append(elem.text)
        #print('Player: ', elem.text)  #last one here should be Kurt Zouma
        
        link = re.search(r'href="\/\/(www.premierleague.com\/players\/\w*\/(?=\S*[-])([A-Za-zÀ-ÖØ-öø-ÿ-]+)\/overview)', str(elem))
        if link is None:    
            link = re.search(r'href="\/\/(www.premierleague.com\/players\/\w*\/[A-Za-zÀ-ÖØ-öø-ÿ]*)',str(elem))
        
        link = link.group(1)
        link = 'http://' + link

        player_link_list.append(link)

    pos_and_country = players_raw[0].find_all("td", class_="hide-s")
    for idx, elem in enumerate(pos_and_country):
        if idx % 2 == 0:
            player_position_list.append(elem.text)
        else:
            player_country_list.append(elem.text)
    
    
    # scroll to top of page
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(2)
    
    # iterate through all premier league clubs
    for idx in range(2, 22):
        # scroll to top of page
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(2)
        
        # click clubs
        nextx = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div[2]/div[1]/div/section/div[2]/div[2]')))
        ActionChains(driver).click(nextx).perform()
        time.sleep(1)
        
        clubs = WebDriverWait(driver, 1).until(ec.element_to_be_clickable((By.XPATH, '//*[@id="mainContent"]/div[2]/div[1]/div/section/div[2]/ul/li[' + str(idx) + ']')))
        ActionChains(driver).click(clubs).perform()
        
        ### Scroll to bottom of page
        SCROLL_PAUSE_TIME = 2

        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # initialize beautifulsoup and get all player details
        bs = BeautifulSoup(driver.page_source, 'html.parser')
        players_raw = bs.find_all("tbody", class_="dataContainer indexSection")
        players = bs.find_all("td", players_raw[0])

        # iterate through list of players
        for elem in players:
            player_name_list.append(elem.text)
            
            link = re.search(r'href="\/\/(www.premierleague.com\/players\/\w*\/(?=\S*[-])([A-Za-zÀ-ÖØ-öø-ÿ-]+)\/overview)', str(elem))
            if link is None:    
                link = re.search(r'href="\/\/(www.premierleague.com\/players\/\w*\/[A-Za-zÀ-ÖØ-öø-ÿ]*)',str(elem))

            link = link.group(1)
            link = 'http://' + link

            player_link_list.append(link)

        pos_and_country = players_raw[0].find_all("td", class_="hide-s")
        for idx, elem in enumerate(pos_and_country):
            if idx % 2 == 0:
                player_position_list.append(elem.text)
            else:
                player_country_list.append(elem.text)
    
    # quit driver
    driver.quit()
    
    playerdf['player_name'] = player_name_list
    playerdf['position'] = player_position_list
    playerdf['country'] = player_country_list
    playerdf['link'] = player_link_list
    
    playerdf.drop_duplicates(inplace = True)
    playerdf.reset_index(drop = True, inplace = True)

In [7]:
# scrape all player list
player_details_scrape()

finish scrolling
Player:  Max Aarons
Player:  Abdul Rahman Baba
Player:  Tammy Abraham
Player:  Adam Smith
Player:  Che Adams
Player:  Dennis Adeniran
Player:  Albert Adomah
Player:  Adrián
Player:  Adrien Silva
Player:  Benik Afobe
Player:  Sergio Agüero
Player:  Daniel Agyei
Player:  Soufyan Ahannach
Player:  Ahmed El Mohamady
Player:  Albian Ajeti
Player:  Nathan Aké
Player:  Alberto Moreno
Player:  Marc Albrighton
Player:  Toby Alderweireld
Player:  Aleix García
Player:  Trent Alexander-Arnold
Player:  Ali Koiki
Player:  Alisson
Player:  Allan
Player:  Miguel Almirón
Player:  Marcos Alonso
Player:  Steven Alzate
Player:  Ibrahim Amadou
Player:  Daniel Amartey
Player:  Luke Amos
Player:  Fabián Balbuena
Player:  George Baldock
Player:  Folarin Balogun
Player:  Tudor Baluta
Player:  Beni Baningime
Player:  Scott Banks
Player:  Ben Barclay
Player:  Phil Bardsley
Player:  Ross Barkley
Player:  Ashley Barnes
Player:  Antonio Barreca
Player:  Mason Barrett
Player:  Chris Basham
Player:  

Player:  Mohamed Elneny
Player:  Mohamed Elyounoussi
Player:  Mohamed Salah
Player:  Jayson Molumby
Player:  Martín Montoya
Player:  Elliott Moore
Player:  Aaron Mooy
Player:  Álvaro Morata
Player:  Marlos Moreno
Player:  Wes Morgan
Player:  Carlton Morris
Player:  Ravel Morrison
Player:  Victor Moses
Player:  Mason Mount
Player:  Lys Mousset
Player:  Arijanet Muric
Player:  Jacob Murphy
Player:  Glenn Murray
Player:  Admiral Muskwe
Player:  Shkodran Mustafi
Player:  Jordon Mutch
Player:  Yoshinori Muto
Player:  Daniel N'Lundulu
Player:  Nacho Monreal
Player:  Marvelous Nakamba
Player:  Samir Nasri
Player:  Marc Navarro
Player:  Tanguy Ndombele
Player:  Layton Ndukwu
Player:  Reiss Nelson
Player:  Nélson Oliveira
Player:  Jeremy Ngakia
Player:  Oumar Niasse
Player:  Eddie Nketiah
Player:  Georges-Kévin Nkoudou
Player:  Felix Nmecha
Player:  Mark Noble
Player:  Rhys Norrington-Davies
Player:  James Norris
Player:  Oliver Norwood
Player:  Ørjan Nyland
Player:  Jack O'Connell
Player:  Tho

In [8]:
print('Playersdf shape:', playerdf.shape)
playerdf.head()

Playersdf shape: (1007, 4)


Unnamed: 0,player_name,position,country,link
0,Max Aarons,Defender,England,http://www.premierleague.com/players/19970/Max...
1,Abdul Rahman Baba,Defender,Ghana,http://www.premierleague.com/players/13279/Abd...
2,Tammy Abraham,Forward,England,http://www.premierleague.com/players/13286/Tam...
3,Adam Smith,Defender,England,http://www.premierleague.com/players/3512/Adam...
4,Che Adams,Midfielder,England,http://www.premierleague.com/players/10905/Che...


In [9]:
playerdf.tail()

Unnamed: 0,player_name,position,country,link
1002,Ben Johnson,Midfielder,England,http://www.premierleague.com/players/15212/Ben...
1003,Conor Coady,Defender,England,http://www.premierleague.com/players/4136/Cono...
1004,Raúl Jiménez,Forward,Mexico,http://www.premierleague.com/players/11071/Raú...
1005,Lewis Richards,Defender,Ireland,http://www.premierleague.com/players/25466/Lew...
1006,Terry Taylor,Midfielder,Wales,http://www.premierleague.com/players/24667/Ter...


In [25]:
# Check
playerdf[playerdf['player_name'] == 'Jordan Henderson']

Unnamed: 0,player_name,position,country,link,Team,dob,Number,Height,pid
959,Jordan Henderson,Midfielder,England,http://www.premierleague.com/players/3712/Jord...,Liverpool,17/06/1990,14,182cm,56979


In [26]:
playerdf[playerdf['player_name'] == 'Dennis Srbeny'] # Dennis Srbeny is not at Norwich City (LOOK AT TEAM!!!)

Unnamed: 0,player_name,position,country,link,Team,dob,Number,Height,pid
823,Dennis Srbeny,Forward,Germany,http://www.premierleague.com/players/33067/Den...,Forward,05/05/1994,,189cm,179587


In [27]:
playerdf[playerdf['player_name'] == 'Angeliño'] # Angelino is not at Man City(LOOK AT TEAM!!!)

Unnamed: 0,player_name,position,country,link,Team,dob,Number,Height,pid
65,Angeliño,Defender,Spain,http://www.premierleague.com/players/10467/Ang...,Defender,04/01/1997,,175cm,145235


In [28]:
playerdf[playerdf['player_name'] == 'Jóhann Gudmundsson']

Unnamed: 0,player_name,position,country,link,Team,dob,Number,Height,pid
339,Jóhann Gudmundsson,Midfielder,Iceland,http://www.premierleague.com/players/6210/Jóha...,Burnley,27/10/1990,7,186cm,60586


In [14]:
# initialize all columns to 0
playerdf['Team'] = 0
playerdf['dob'] = 0
playerdf['Number'] = 0
playerdf['Height'] = 0

In [15]:
def scrape_players_etc(playerdf, row):
    ###### get match link
    link = playerdf.loc[row, 'link']
    print(link)
    
    ##### extract html
    bs = BeautifulSoup(requests.get(link).text, 'html.parser')
    
    team = bs.find_all("div", class_="info")
    team = team[0].text.strip()
    
    pid = bs.find_all("div", class_="wrapper playerContainer")
    pid = re.search(r'data-player="p(\d*)"', str(pid)).group(1)
    
    try:
        number = bs.find_all("div", class_="number")
        number = number[0].text.strip()
    except:
        number = np.nan
    
    try:
        dob = bs.find_all("ul", class_="pdcol2")[0].find_all("div", class_="info")
        dob = dob[0].text.strip().split(' ')[0]
    except:
        dob = np.nan

    try:
        height = bs.find_all("ul", class_="pdcol3")[0].find_all("div", class_="info")
        height = height[0].text.strip()
    except:
        height = np.nan
        
    # append to dataframe
    playerdf.loc[row, 'Team'] = team
    playerdf.loc[row, 'dob'] = dob
    playerdf.loc[row, 'Number'] = number
    playerdf.loc[row, 'Height'] = height
    playerdf.loc[row, 'pid'] = pid

In [17]:
# scrape statistics for past 10 years
# if getting assertion error (blk_ref gaps) re run this code block
Parallel(n_jobs=-1, require='sharedmem')(delayed(scrape_players_etc)(playerdf, i) for i in range(playerdf.shape[0]));

http://www.premierleague.com/players/19970/Max-Aarons/overviewhttp://www.premierleague.com/players/13279/Abdul-Rahman-Baba/overview

http://www.premierleague.com/players/13286/Tammy-Abraham/overview
http://www.premierleague.com/players/3512/Adam-Smith/overview
http://www.premierleague.com/players/10905/Che-Adams/overview
http://www.premierleague.com/players/14674/Dennis-Adeniran/overview
http://www.premierleague.com/players/9131/Albert-Adomah/overviewhttp://www.premierleague.com/players/4852/Adrián

http://www.premierleague.com/players/11357/Adrien-Silva/overview
http://www.premierleague.com/players/4040/Benik-Afobe/overview
http://www.premierleague.com/players/4328/Sergio-Agüero/overview
http://www.premierleague.com/players/10559/Daniel-Agyei/overview
http://www.premierleague.com/players/24695/Soufyan-Ahannach/overview
http://www.premierleague.com/players/4183/Ahmed-El-Mohamady/overview
http://www.premierleague.com/players/5248/Albian-Ajeti/overview
http://www.premierleague.com/player

http://www.premierleague.com/players/8707/Dan-Burn/overview
http://www.premierleague.com/players/14444/Robbie-Burton/overview
http://www.premierleague.com/players/66307/Rocky-Bushiri/overview
http://www.premierleague.com/players/11965/Matt-Butcher/overview
http://www.premierleague.com/players/3505/David-Button/overview
http://www.premierleague.com/players/25435/Oskar-Buur/overview
http://www.premierleague.com/players/23869/Jack-Bycroft/overview
http://www.premierleague.com/players/8945/Sam-Byram/overview
http://www.premierleague.com/players/10466/Willy-Caballero/overview
http://www.premierleague.com/players/2620/Gary-Cahill/overviewhttp://www.premierleague.com/players/8454/Callum-Wilson/overview

http://www.premierleague.com/players/9576/Dominic-Calvert-Lewin/overview
http://www.premierleague.com/players/22952/Víctor-Camarasa/overviewhttp://www.premierleague.com/players/50467/Brennan-Camp/overview

http://www.premierleague.com/players/74562/Leonardo-Campana/overview
http://www.premierl

http://www.premierleague.com/players/6820/Ondrej-Duda/overview
http://www.premierleague.com/players/3601/Shane-Duffy/overview
http://www.premierleague.com/players/4340/Paul-Dummett/overview
http://www.premierleague.com/players/23826/Bobby-Duncan/overviewhttp://www.premierleague.com/players/8163/Lewis-Dunk/overview

http://www.premierleague.com/players/14255/Jake-Eastwood/overview
http://www.premierleague.com/players/12707/Ederson
http://www.premierleague.com/players/4394/John-Egan/overview
http://www.premierleague.com/players/11033/Anwar-El-Ghazi/overview
http://www.premierleague.com/players/8962/Callum-Elder/overview
http://www.premierleague.com/players/2214/Robert-Elliot/overview
http://www.premierleague.com/players/33185/Harvey-Elliott/overview
http://www.premierleague.com/players/8046/Tommy-Elphick/overviewhttp://www.premierleague.com/players/16803/Emerson

http://www.premierleague.com/players/12199/Björn-Engels/overview
http://www.premierleague.com/players/15258/Niall-Ennis/overvi

http://www.premierleague.com/players/52951/Ki-Jana-Hoever/overview
http://www.premierleague.com/players/9496/Scott-Hogan/overview
http://www.premierleague.com/players/11575/Rob-Holding/overview
http://www.premierleague.com/players/5713/José-Holebas/overview
http://www.premierleague.com/players/10564/Mason-Holgate/overview
http://www.premierleague.com/players/14637/Nathan-Holland/overview
http://www.premierleague.com/players/9431/Ricky-Holmes/overview
http://www.premierleague.com/players/66108/Nathaniel-Shio-Hong-Wan/overview
http://www.premierleague.com/players/9377/Conor-Hourihane/overview
http://www.premierleague.com/players/23702/Callum-Hudson-Odoi/overview
http://www.premierleague.com/players/8589/Will-Hughes/overview
http://www.premierleague.com/players/9417/Jordan-Hugill/overview
http://www.premierleague.com/players/13550/Cameron-Humphreys-Grant/overview
http://www.premierleague.com/players/24340/Joseph-Hungbo/overview
http://www.premierleague.com/players/8598/James-Husband/overv

http://www.premierleague.com/players/2270/Aaron-Lennon/overview
http://www.premierleague.com/players/4985/Bernd-Leno/overviewhttp://www.premierleague.com/players/12299/Léo-Bonatini/overview

http://www.premierleague.com/players/37776/Jefferson-Lerma/overview
http://www.premierleague.com/players/23784/Thakgalo-Leshabela/overview
http://www.premierleague.com/players/23830/Dylan-Levitt/overview
http://www.premierleague.com/players/21821/Adam-Lewis/overview
http://www.premierleague.com/players/10766/Lewis-Cook/overview
http://www.premierleague.com/players/5520/Stephan-Lichtsteiner/overview
http://www.premierleague.com/players/4154/Anders-Lindegaard/overview
http://www.premierleague.com/players/5066/Victor-Lindelöf/overview
http://www.premierleague.com/players/4337/Jesse-Lingard/overview
http://www.premierleague.com/players/5551/Fernando-Llorente/overview
http://www.premierleague.com/players/4664/Hugo-Lloris/overview
http://www.premierleague.com/players/19851/Giovani-Lo-Celso/overview
http:

http://www.premierleague.com/players/2896/John-Ruddy/overview
http://www.premierleague.com/players/16801/Antonio-Rüdiger/overview
http://www.premierleague.com/players/11341/Rui-Patrício/overview
http://www.premierleague.com/players/25203/Lukas-Rupp/overview
http://www.premierleague.com/players/53176/Will-Russ/overview
http://www.premierleague.com/players/12192/Mat-Ryan/overview
http://www.premierleague.com/players/14656/Ryan-Sessegnon/overview
http://www.premierleague.com/players/12817/Allan-Saint-Maximin/overviewhttp://www.premierleague.com/players/20745/Romain-Saïss/overview

http://www.premierleague.com/players/6251/Henri-Saivet/overview
http://www.premierleague.com/players/49481/Bukayo-Saka/overview
http://www.premierleague.com/players/4796/Mamadou-Sakho/overview
http://www.premierleague.com/players/10004/Bakary-Sako/overview
http://www.premierleague.com/players/66204/William-Saliba/overview
http://www.premierleague.com/players/20490/Mbwana-Samatta/overview
http://www.premierleague

http://www.premierleague.com/players/4747/Marco-van-Ginkel/overview
http://www.premierleague.com/players/8979/Jamie-Vardy/overviewhttp://www.premierleague.com/players/31670/Indiana-Vassilev/overview

http://www.premierleague.com/players/32602/Michael-Verrips/overview
http://www.premierleague.com/players/4666/Jan-Vertonghen/overview
http://www.premierleague.com/players/20109/Jannik-Vestergaard/overview
http://www.premierleague.com/players/25312/Nikola-Vlasic/overview
http://www.premierleague.com/players/4007/Sam-Vokes/overview
http://www.premierleague.com/players/15013/Jake-Vokins/overview
http://www.premierleague.com/players/4398/Michel-Vorm/overview
http://www.premierleague.com/players/22040/Mario-Vrancic/overview
http://www.premierleague.com/players/4851/Matej-Vydra/overview
http://www.premierleague.com/players/2839/Theo-Walcott/overview
http://www.premierleague.com/players/3955/Kyle-Walker/overviewhttp://www.premierleague.com/players/13815/Kyle-Walker-Peters/overview

http://www.pre

In [18]:
playerdf.head()

Unnamed: 0,player_name,position,country,link,Team,dob,Number,Height,pid
0,Max Aarons,Defender,England,http://www.premierleague.com/players/19970/Max...,Norwich City,04/01/2000,2.0,178cm,232980
1,Abdul Rahman Baba,Defender,Ghana,http://www.premierleague.com/players/13279/Abd...,Defender,02/07/1994,,180cm,118335
2,Tammy Abraham,Forward,England,http://www.premierleague.com/players/13286/Tam...,Chelsea,02/10/1997,9.0,190cm,173879
3,Adam Smith,Defender,England,http://www.premierleague.com/players/3512/Adam...,AFC Bournemouth,29/04/1991,15.0,180cm,54469
4,Che Adams,Midfielder,England,http://www.premierleague.com/players/10905/Che...,Southampton,13/07/1996,,175cm,200439


In [19]:
playerdf.tail()

Unnamed: 0,player_name,position,country,link,Team,dob,Number,Height,pid
1002,Ben Johnson,Midfielder,England,http://www.premierleague.com/players/15212/Ben...,West Ham United,24/01/2000,53.0,175cm,222018
1003,Conor Coady,Defender,England,http://www.premierleague.com/players/4136/Cono...,Wolverhampton Wanderers,25/02/1993,,185cm,94147
1004,Raúl Jiménez,Forward,Mexico,http://www.premierleague.com/players/11071/Raú...,Wolverhampton Wanderers,05/05/1991,9.0,190cm,102057
1005,Lewis Richards,Defender,Ireland,http://www.premierleague.com/players/25466/Lew...,Wolverhampton Wanderers,15/10/2001,76.0,,437688
1006,Terry Taylor,Midfielder,Wales,http://www.premierleague.com/players/24667/Ter...,Wolverhampton Wanderers,29/06/2001,39.0,,432850


In [20]:
# fix broken records (double positions)
playerdf.loc[playerdf['player_name'] == 'Che Adams', 'position'] = 'Midfielder'
playerdf.loc[playerdf['player_name'] == 'Conor Coady', 'position'] = 'Defender'
playerdf.loc[playerdf['player_name'] == 'Declan Rice', 'position'] = 'Defender'
playerdf.loc[playerdf['player_name'] == 'Gabriel Martinelli', 'position'] = 'Forward'
playerdf.loc[playerdf['player_name'] == 'Kortney Hause', 'position'] = 'Defender'
playerdf.loc[playerdf['player_name'] == 'Matt Butcher', 'position'] = 'Defender'
playerdf.loc[playerdf['player_name'] == 'Oleksandr Zinchenko', 'position'] = 'Defender'
playerdf.loc[playerdf['player_name'] == 'Phil Foden', 'position'] = 'Forward'
playerdf.loc[playerdf['player_name'] == 'Sam McQueen', 'position'] = 'Defender'
playerdf.loc[playerdf['player_name'] == 'Tahith Chong', 'position'] = 'Forward'
playerdf.loc[playerdf['player_name'] == 'William Smallbone', 'position'] = 'Midfielder'

playerdf.drop_duplicates(inplace = True)
playerdf.reset_index(drop = True, inplace = True)

In [21]:
# duplicate positions
temp = playerdf.groupby(['player_name'])['position'].agg('count').reset_index()
temp[temp['position'] > 1]

Unnamed: 0,player_name,position


In [32]:
# publish to local file
playerdf.to_csv('playerdf.csv', index = False)

In [23]:
# save image to local file
def save_player_images(playerdf, row):
    playername = playerdf.loc[row, 'player_name']
    pid = playerdf.loc[row, 'pid']
    print(playername)
    
    try:
        base_url = "https://premierleague-static-files.s3.amazonaws.com/premierleague/photos/players/250x250/p"
        url = base_url + pid + '.png'
        urllib.request.urlretrieve(url, os.path.join('Players', playername + ".png"))
    except:
        url = "http://platform-static-files.s3.amazonaws.com/premierleague/photos/players/250x250/Photo-Missing.png"
        urllib.request.urlretrieve(url, os.path.join('Players', playername + ".png"))   

In [24]:
# scrape player images for current season
Parallel(n_jobs=-1, require='sharedmem')(delayed(save_player_images)(playerdf, i) for i in range(playerdf.shape[0]));

Abdul Rahman BabaTammy Abraham
Adam Smith
Max Aarons

Che Adams
Dennis Adeniran
Albert Adomah
Adrián
Adrien Silva
Benik Afobe
Sergio Agüero
Daniel Agyei
Soufyan Ahannach
Ahmed El Mohamady
Albian Ajeti
Nathan Aké
Alberto Moreno
Marc Albrighton
Toby Alderweireld
Aleix García
Trent Alexander-Arnold
Ali Koiki
Alisson
Allan
Miguel Almirón
Marcos Alonso
Steven Alzate
Ibrahim Amadou
Daniel Amartey
Luke Amos
Fabián Balbuena
George Baldock
Folarin Balogun
Tudor Baluta
Beni Baningime
Scott Banks
Ben Barclay
Phil Bardsley
Ross Barkley
Ashley Barnes
Antonio Barreca
Mason Barrett
Chris Basham
Michy Batshuayi
Danny Batth
Gavin Bazunu
Jack Bearne
Jan Bednarek
Asmir Begovic
Héctor Bellerín
Yohan Benalouane
Filip Benkovic
Ryan Bennett
Jayden Bennetts
Josh Benson
Nabil Bentaleb
Christian Benteke
Sander Berge
Steven Bergwijn
Adrián Bernabé
Ethan Ampadu
Joseph Anang
Florin Andone
André Gomes
Andreas Pereira
Angeliño
Faustino Anjorin
Michail AntonioBilly Arce

Cameron Archer
Archie Davies
Stuart Armstrong


David McGoldrick
Michael McGovern
Giovanni McGregor
Harry McKirdy
Kenny McLean
Dwight McNeil
Sam McQueen
Scott McTominay
Zech Medley
Ben Mee
D'Mani Mellor
Nampalys Mendy
Teden Mengi
Chris Mepham
Max Meyer
Matt Miazga
Simon Mignolet
Luka Milivojevic
Liam Millar
James Milner
Yerry Mina
Takumi Minamino
Tyrone Mings
Kevin Mirallas
Tyrick Mitchell
Henrikh Mkhitaryan
Jan Mlakar
Mohamed Elneny
Mohamed Elyounoussi
Mohamed Salah
Jayson Molumby
Martín Montoya
Elliott Moore
Aaron Mooy
Álvaro Morata
Marlos Moreno
Wes Morgan
Carlton Morris
Ravel Morrison
Victor Moses
Mason Mount
Lys Mousset
Arijanet Muric
Jacob Murphy
Glenn Murray
Admiral Muskwe
Shkodran Mustafi
Jordon Mutch
Yoshinori Muto
Daniel N'Lundulu
Nacho Monreal
Marvelous Nakamba
Samir Nasri
Marc Navarro
Tanguy Ndombele
Layton Ndukwu
Reiss Nelson
Nélson Oliveira
Jeremy Ngakia
Oumar Niasse
Eddie Nketiah
Georges-Kévin Nkoudou
Felix Nmecha
Mark Noble
Rhys Norrington-Davies
James Norris
Oliver Norwood
Ørjan Nyland
Jack O'Connell
Thomas O'Connor

# Scratch Work

In [None]:
###### Part 1

In [None]:
import requests

match_url = 'https://www.premierleague.com/players/19970/Max-Aarons/overview'
bs = BeautifulSoup(requests.get(match_url).text, 'html.parser')

In [None]:
a = bs.find_all("div", class_="info")
a[0].text.strip()

In [None]:
a = bs.find_all("div", class_="number")
a[0].text.strip()

In [None]:
a = bs.find_all("ul", class_="pdcol2")[0].find_all("div", class_="info")
a[0].text.strip().split(' ')[0]

In [None]:
a = bs.find_all("ul", class_="pdcol3")[0].find_all("div", class_="info")
a[0].text.strip()

In [None]:
a = bs.find_all("div", class_="wrapper playerContainer")

In [None]:
re.search(r'data-player="p(\d*)"', str(a)).group(1)

In [None]:
############ Part 2

In [None]:
import requests

match_url = 'https://www.premierleague.com/match/46809'
#requests.get("http://" +url)
bs = BeautifulSoup(requests.get(match_url).text, 'html.parser')

In [None]:
a = bs.find_all("div", class_="position")
a[0].text.split('\n')[2].strip()

In [None]:
a[1].text.split('\n')[2].strip()

In [None]:
capt_home = bs.find_all("ul", class_="startingLineUpContainer squadList home")[0].find_all("li", class_="player")
for elem in capt_home:
    lencapt = elem.find_all("div", class_="cpt")
    if len(lencapt) >= 1:
        print(re.search(r'\D*', elem.find_all("div", class_="name")[0].text.strip()).group(0).strip())
    

capt_away = bs.find_all("ul", class_="startingLineUpContainer squadList")[0].find_all("li", class_="player")
for elem in capt_away:
    lencapt = elem.find_all("div", class_="cpt")
    if len(lencapt) >= 1:
        #print(elem.find_all("div", class_="name")[0].text.strip())
        print(re.search(r'\D*', elem.find_all("div", class_="name")[0].text.strip()).group(0).strip())

In [None]:
lineup = bs.find_all("div", class_="matchLineupTeamContainer")

In [None]:
squad_home_starter = lineup[0].find_all("ul", class_="startingLineUpContainer squadList home")[0].find_all("div", class_="name")
squad_home_subs = lineup[0].find_all("ul", class_="startingLineUpContainer squadList home")[1].find_all("div", class_="name")
squad_away_starter = lineup[1].find_all("ul", class_="startingLineUpContainer squadList")[0].find_all("div", class_="name")
squad_away_subs = lineup[1].find_all("ul", class_="startingLineUpContainer squadList")[1].find_all("div", class_="name")


home_starting_lineup = []
home_subs_lineup = []
home_subout = []
home_subout_time = []
home_subin = []
home_subin_time = []
home_yellow = []
home_red = []

away_starting_lineup = []
away_subs_lineup = []
away_subout = []
away_subout_time = []
away_subin = []
away_subin_time = []
away_yellow = []
away_red = []

###### to parse home squad starting lineup
for idx, elem in enumerate(squad_home_starter):
    player = elem.text.strip()
    sub_out = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())
        
    # starting lineup player append to list
    if len(sub_out) > 0:
        # append starting lineup player
        home_starting_lineup.append(re.findall(r'\D*', player)[0].strip())
        # append subout time and player
        home_subout.append(re.findall(r'\D*', player)[0].strip())
        home_subout_time.append(sub_out[0][0:2])
    else:
        home_starting_lineup.append(player)
    
    # yellow and red carded players
    # yellow card
    if len(elem.find_all("span", class_="icn card-yellow")) > 0:
        home_yellow.append(re.findall(r'\D*', player)[0].strip())
    # red card
    if len(elem.find_all("span", class_="icn card-red")) > 0:
        home_red.append(re.findall(r'\D*', player)[0].strip())
        
# to parse substitute squad
for idx, elem in enumerate(squad_home_subs):
    player = elem.text.strip()
    sub_in = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())
    
    # substitute player append to list
    if len(sub_in) > 0:
        # append starting subs player
        home_subs_lineup.append(re.findall(r'\D*', player)[0].strip())
        # append subin time and player
        home_subin.append(re.findall(r'\D*', player)[0].strip())
        home_subin_time.append(sub_in[0][0:2])
        
        # players who are subbed in then are subbed out again
        if len(sub_in) > 1:
            home_subout.append(re.findall(r'\D*', player)[0].strip())
            home_subout_time.append(sub_in[1][0:2])
    else:
        home_subs_lineup.append(player)

##### to parse away squad starting lineup
for idx, elem in enumerate(squad_away_starter):
    player = elem.text.strip()
    sub_out = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())
    
    # starting lineup player append to list
    if len(sub_out) > 0:
        # append starting lineup player
        away_starting_lineup.append(re.findall(r'\D*', player)[0].strip())
        # append subout time and player
        away_subout.append(re.findall(r'\D*', player)[0].strip())
        away_subout_time.append(sub_out[0][0:2])
    else:
        away_starting_lineup.append(player)
    
    # yellow and red carded players
    # yellow card
    if len(elem.find_all("span", class_="icn card-yellow")) > 0:
        away_yellow.append(re.findall(r'\D*', player)[0].strip())
    # red card
    if len(elem.find_all("span", class_="icn card-red")) > 0:
        away_red.append(re.findall(r'\D*', player)[0].strip())

# to parse substitute squad
for idx, elem in enumerate(squad_away_subs):
    player = elem.text.strip()
    sub_in = re.findall(r'\d{1,2}[\s+\d{1,2}]*', elem.text.strip())
    
    # substitute player append to list
    if len(sub_in) > 0:
        # append starting subs player
        away_subs_lineup.append(re.findall(r'\D*', player)[0].strip())
        # append subin time and player
        away_subin.append(re.findall(r'\D*', player)[0].strip())
        away_subin_time.append(sub_in[0][0:2])
        
        # players who are subbed in then are subbed out again
        if len(sub_in) > 1:
            away_subout.append(re.findall(r'\D*', player)[0].strip())
            away_subout_time.append(sub_in[1][0:2])
    else:
        away_subs_lineup.append(player)

In [None]:
# away_starting_lineup = []
# away_subs_lineup = []
# away_subout = []
# away_subout_time = []
# away_subin = []
# away_subin_time = []
# away_yellow = []
# away_red = []
# away_captain = []

print(home_captain)

In [None]:
date = bs.find_all("div", class_="referee")
print(date[0].text.strip())

In [None]:
date = bs.find_all("div", class_='attendance hide-m')
date = date[0].text.strip()
print(date)

In [None]:
date = date.replace("Att: ", '')
date = date.replace(",", '')
print(date)

In [None]:
date = bs.find_all("div", class_="matchDate renderMatchDateContainer")
re.search(r'data-kickoff="(\d*)"', str(date[0])).group(1)

In [None]:
goals = bs.find_all("div", class_="matchEvents matchEventsContainer")
goals_home = goals[0].find_all("div", class_="home")
goals_away = goals[0].find_all("div", class_="away")
print(goals_home)

In [None]:
a = goals_home[0].findAll("div", class_="event")[0].text.strip()   #1 harrykane, 0 tanguy ndombele
a

In [None]:
if "Goal" in a:
    print("goal")
else:
    print("others")

In [None]:
a = a.split('\n')[0]
print(a)

goalscorer = re.findall(r'^\D+', a)[0].strip()
print(goalscorer + ' (og)')

times = re.findall(r'\d{1,2}', a)
print(times)

In [None]:
goalscorer = re.findall(r'^\D+', a)[0].strip()
print(goalscorer)

In [None]:
times = re.findall(r'\d{1,2}', a)
print(times)

In [None]:
re.findall(r'>(\w.+?)<', str(goals_home[0].findAll("a")[0])) # last index iterate through all goals

In [None]:
# all fixture blocks
fixture_blocks = driver.find_elements_by_xpath('//*[@id="mainContent"]/div[2]/div[1]/div[3]/section/div')
print(len(fixture_blocks))

all_team1 = []
all_team2 = []
all_score = []

parser = html.fromstring(driver.page_source)
counter = 0

# iterate through all the blocks
for i in range(1, len(fixture_blocks) + 1):
    match_per_block = driver.find_elements_by_xpath('//*[@id="mainContent"]/div[2]/div[1]/div[3]/section/div['+ str(i) + ']/ul/li')
    
    for j in range(1, len(match_per_block) + 1):
        team1 = parser.xpath('//*[@id="mainContent"]/div[2]/div[1]/div[3]/section/div[' + str(i) + ']/ul/li[' + str(j) + ']/div/span/span[1]/span[1]/span[1]/span[1]')[0].text
        team2 = parser.xpath('//*[@id="mainContent"]/div[2]/div[1]/div[3]/section/div[' + str(i) + ']/ul/li['+ str(j) + ']/div/span/span[1]/span[3]/span[2]/span[1]')[0].text
        temp = parser.xpath('//*[@id="mainContent"]/div[2]/div[1]/div[3]/section/div[' + str(i) + ']/ul/li[' + str(j) + ']/div/span/span[1]/span[2]')        
        score = ''.join(temp[0].itertext())
        
        
        all_team1.append(team1)
        all_team2.append(team2)
        all_score.append(score)        
        
        counter = counter + 1

print(counter)

In [None]:
a = parser.xpath('//*[@id="mainContent"]/div[2]/div[1]/div[3]/section/div[1]/ul/li[1]/div')

In [None]:
a[0]

In [None]:
''.join(a[0].itertext())

In [None]:
for i in (a[0].iterlinks()):
    print(i)

In [None]:
temp = ''.join(a[0].itertext())