In [112]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ChromeOptions

In [113]:
# Make the working directory discoverable on PATH for this Python process.
# (An `export` run through os.system only changes a short-lived child shell,
# so it never reaches the process that launches chromedriver.)
project_dir = os.getcwd()
if project_dir not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + project_dir

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override; by default the binary is expected in the directory
# the notebook is run from.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH", os.path.join(project_dir, "chromedriver"))

# Headless Chrome session shared by every scraping function below
chrome_options = ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')

# NOTE(review): `executable_path` is the older Selenium keyword for the driver
# location; confirm it is still accepted by the installed Selenium version.
driver = webdriver.Chrome(
    options=chrome_options,
    executable_path=CHROMEDRIVER_PATH
)

In [114]:
def get_team_urls(season):
    ''' Get premier league team urls for a given season

    Loads Manchester City's squad page for `season` with the shared `driver`,
    walks its schedule table and builds a squad-page url for every opponent
    faced in a "Premier League" row.

    Args:
        season: season label as used in fbref urls, e.g. "2016-2017".

    Returns:
        List of unique squad-page urls, Manchester City first, the rest in
        the order the opponents first appear in the schedule table.
    '''

    # List to record all the team page urls for this season (starts with Man City)
    seed_url = "https://fbref.com/en/squads/b8fd03ef/{}/Manchester-City".format(season)
    team_urls = [seed_url]

    # Loads page and gets matchup records for Man City in the current season
    driver.get(seed_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The class filter must be a dict; the former set literal
    # {"class", "table_outer_container"} only worked by accident.
    matchup_records = soup.body.find_all(
        "div", {"id": "all_kitchen_sink_sched"})[0].find("div").find("div").find_all(
            "div", {"class": "table_outer_container"})[0].find("table").find_all("tr")

    # Loops through records and extracts links to opponent pages
    for record in matchup_records:

        # Rows without a linked competition in their header cell are skipped
        try:
            league = record.th.a.text
        except AttributeError:
            continue

        # Makes sure matchup is premier league
        if league != "Premier League":
            continue

        # Gets opponent page url; the team code and team name are read from
        # path segments 3 and 5 of the opponent link
        opponent_link = record.find("td", {"data-stat": "opponent"}).a["href"]
        link_parts = opponent_link.split("/")
        team_urls.append("https://fbref.com/en/squads/{}/{}/{}".format(
            link_parts[3], season, link_parts[5]))

    # Dedups team urls while keeping first-seen order, so repeated runs visit
    # the teams in the same sequence
    return list(dict.fromkeys(team_urls))

In [115]:
def get_matchup_urls(team_url):
    ''' Get all the matchup urls corresponding to a team url

    Loads the squad page at `team_url` with the shared `driver`, walks its
    schedule table and builds a match-page url for every "Premier League"
    row that carries a match-report link.

    Args:
        team_url: fbref squad-page url for one team and season.

    Returns:
        List of match-page urls in schedule-table order.
    '''

    # Loads page and gets matchup records
    driver.get(team_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The class filter must be a dict; the former set literal
    # {"class", "table_outer_container"} only worked by accident.
    matchup_records = soup.body.find_all(
        "div", {"id": "all_kitchen_sink_sched"})[0].find("div").find("div").find_all(
            "div", {"class": "table_outer_container"})[0].find("table").find_all("tr")

    # Loops through records and extracts match report links
    matchup_urls = []
    for record in matchup_records:

        # Rows without a linked competition in their header cell are skipped
        try:
            league = record.th.a.text
        except AttributeError:
            continue

        # Makes sure matchup is premier league
        if league != "Premier League":
            continue

        # A row whose match-report cell has no link cannot yield a url
        report_cell = record.find("td", {"data-stat": "match_report"})
        report_anchor = report_cell.a if report_cell is not None else None
        if report_anchor is None:
            continue

        # The match code and match title are read from path segments 3 and 4
        link_parts = report_anchor["href"].split("/")
        matchup_urls.append("https://fbref.com/en/matches/{}/{}".format(
            link_parts[3], link_parts[4]))

    return matchup_urls

In [116]:
# Collects the urls of every matchup played in the listed seasons

all_matchups = []

# One pass per season
for season in ["2016-2017"]:

    # Squad pages of the teams found for this season
    team_urls = get_team_urls(season)

    # Every squad page contributes the matches listed on it
    for team_url in team_urls:
        print("Getting matches for url {}".format(team_url))

        all_matchups.extend(get_matchup_urls(team_url))

    # The same match url can be collected from more than one squad page,
    # so collapse the duplicates
    all_matchups = list(set(all_matchups))

Getting matches for url https://fbref.com/en/squads/b8fd03ef/2016-2017/Manchester-City
Getting matches for url https://fbref.com/en/squads/19538871/2016-2017/Manchester-United
Getting matches for url https://fbref.com/en/squads/943e8050/2016-2017/Burnley
Getting matches for url https://fbref.com/en/squads/18bb7c10/2016-2017/Arsenal
Getting matches for url https://fbref.com/en/squads/7c21e445/2016-2017/West-Ham-United
Getting matches for url https://fbref.com/en/squads/a2d435b3/2016-2017/Leicester-City
Getting matches for url https://fbref.com/en/squads/fb10988f/2016-2017/Swansea-City
Getting matches for url https://fbref.com/en/squads/bd8769d1/2016-2017/Hull-City
Getting matches for url https://fbref.com/en/squads/8ef52968/2016-2017/Sunderland
Getting matches for url https://fbref.com/en/squads/361ca564/2016-2017/Tottenham-Hotspur
Getting matches for url https://fbref.com/en/squads/d3fd31cc/2016-2017/Everton
Getting matches for url https://fbref.com/en/squads/47c64c55/2016-2017/Crystal

In [117]:
# Reports how many unique matchup urls were collected
matchup_count = len(all_matchups)
print("Extracted {} matchup urls".format(matchup_count))

Extracted 380 matchup urls


In [118]:
# Stats read from the <strong> tags of the #team_stats block. They come in
# home/away pairs in this order: the home value of the k-th stat is <strong>
# number 2k and the away value is number 2k + 1.
_TEAM_STAT_NAMES = ("possession", "pass_acc", "sot", "saves")

# Stats read from the <div> tags of the #team_stats_extra block, given as
# (name, index of the home value). The away value is always two <div>s after
# the home value.
# NOTE(review): the <div> between each pair is presumably the stat label and
# the skipped index ranges presumably header cells - confirm against the page.
_EXTRA_STAT_HOME_INDEX = (
    ("fouls", 4),
    ("corners", 7),
    ("crosses", 10),
    ("touches", 13),
    ("tackles", 20),
    ("ints", 23),
    ("aerials", 26),
    ("clearances", 29),
    ("offsides", 36),
    ("goal_kicks", 39),
    ("throwins", 42),
    ("longballs", 45),
)


def _strong_value_or_zero(strong_tags, index):
    ''' Integer value of strong_tags[index] with its last character dropped, or 0 if it is missing or not numeric '''
    try:
        return int(strong_tags[index].text[:-1])
    except (IndexError, ValueError):
        return 0


def _parse_matchweek(header_text):
    ''' Extracts the matchweek number from the competition header text '''
    found = re.search(r"Matchweek\s+(\d+)", header_text)
    if found:
        return int(found.group(1))
    # Falls back to the fixed-offset slice used before, so any header the
    # pattern does not recognise is handled exactly as it used to be
    return int(header_text[26:-1])


def get_match_data(match_url):
    ''' gets all associated match data from a given match page

    Loads `match_url` with the shared `driver` and reads the scorebox, the
    #team_stats block and the #team_stats_extra block of the page.

    Args:
        match_url: fbref match-page url.

    Returns:
        Dict with, in this order: "url", "date", "week", "home_team",
        "away_team", "home_record", "away_record", "home_score",
        "away_score", then a home_/away_ pair for each stat in
        _TEAM_STAT_NAMES and _EXTRA_STAT_HOME_INDEX. The key order is stable
        because the collecting loop derives its columns from it.

    Raises:
        IndexError, AttributeError, ValueError: if the scorebox, header or
            #team_stats_extra block is missing or not shaped as expected.
            Missing or non-numeric #team_stats values are reported as 0
            instead of raising.
    '''

    # Loads page
    driver.get(match_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    body = soup.body

    # Dict to store all match associated data
    match_dict = {"url": match_url}

    match_dict["date"] = body.find_all(
        "div", {"class": "scorebox_meta"})[0].find_all("strong")[0].a.text

    match_dict["week"] = _parse_matchweek(
        body.find_all("div", {"id": "content"})[0].div.text)

    # Everything below up to the scores comes from the scorebox, so it is
    # looked up once and reused
    scorebox = body.find_all("div", {"class": "scorebox"})[0]
    scorebox_strongs = scorebox.find_all("strong")
    scorebox_divs = scorebox.find_all("div")
    score_divs = scorebox.find_all("div", {"class": "score"})

    # Team names are path segment 5 of each team's squad link
    match_dict["home_team"] = scorebox_strongs[0].a["href"].split("/")[5]
    match_dict["away_team"] = scorebox_strongs[3].a["href"].split("/")[5]

    match_dict["home_record"] = scorebox_divs[5].text
    match_dict["away_record"] = scorebox_divs[14].text

    match_dict["home_score"] = int(score_divs[0].text)
    match_dict["away_score"] = int(score_divs[1].text)

    # Expected-goals values (div class "score_xg") are intentionally not
    # collected here.

    # Best-effort stats: a missing #team_stats block leaves an empty list, so
    # every value below falls back to 0
    team_stats_blocks = body.find_all("div", {"id": "team_stats"})
    stat_strongs = team_stats_blocks[0].find_all("strong") if team_stats_blocks else []
    for pair_number, stat_name in enumerate(_TEAM_STAT_NAMES):
        match_dict["home_" + stat_name] = _strong_value_or_zero(stat_strongs, 2 * pair_number)
        match_dict["away_" + stat_name] = _strong_value_or_zero(stat_strongs, 2 * pair_number + 1)

    # Required stats: these raise if the block or a value is missing
    extra_divs = body.find_all("div", {"id": "team_stats_extra"})[0].find_all("div")
    for stat_name, home_index in _EXTRA_STAT_HOME_INDEX:
        match_dict["home_" + stat_name] = int(extra_divs[home_index].text)
        match_dict["away_" + stat_name] = int(extra_divs[home_index + 2].text)

    return match_dict

In [119]:
# Scrapes every matchup page into a column-oriented dict: one list per field
df = {}
i = 0
for m_url in all_matchups:
    print(m_url)

    # Flat dict of fields for this match
    match_record = get_match_data(m_url)

    # The first match decides which columns exist
    if i == 0:
        df.update((key, []) for key in match_record.keys())

    # Adds this match's value to the end of every column
    for key, value in match_record.items():
        df[key].append(value)

    i += 1

    # Progress marker after every tenth match
    if not i % 10:
        print(i)

https://fbref.com/en/matches/894a485d/Liverpool-Manchester-City-December-31-2016-Premier-League
https://fbref.com/en/matches/f85bb62d/Manchester-United-Southampton-August-19-2016-Premier-League
https://fbref.com/en/matches/d968d3e0/Hull-City-Middlesbrough-April-5-2017-Premier-League
https://fbref.com/en/matches/482a100b/Tottenham-Hotspur-West-Bromwich-Albion-January-14-2017-Premier-League
https://fbref.com/en/matches/d4d1348d/Liverpool-Swansea-City-January-21-2017-Premier-League
https://fbref.com/en/matches/37825164/Arsenal-Hull-City-February-11-2017-Premier-League
https://fbref.com/en/matches/d4011f7a/Hull-City-Liverpool-February-4-2017-Premier-League
https://fbref.com/en/matches/07b59c24/Swansea-City-West-Bromwich-Albion-May-21-2017-Premier-League
https://fbref.com/en/matches/feeb3b85/Crystal-Palace-Hull-City-May-14-2017-Premier-League
https://fbref.com/en/matches/81fd1f87/Liverpool-West-Bromwich-Albion-October-22-2016-Premier-League
10
https://fbref.com/en/matches/05c6c17b/Hull-City

https://fbref.com/en/matches/97ec0ea0/Bournemouth-Liverpool-December-4-2016-Premier-League
https://fbref.com/en/matches/2d418d07/Swansea-City-Hull-City-August-20-2016-Premier-League
90
https://fbref.com/en/matches/c11cdaf7/Liverpool-Burnley-March-12-2017-Premier-League
https://fbref.com/en/matches/69572f77/Manchester-City-Sunderland-August-13-2016-Premier-League
https://fbref.com/en/matches/faf2b6c2/West-Ham-United-Crystal-Palace-January-14-2017-Premier-League
https://fbref.com/en/matches/4f2d1718/Middlesbrough-Tottenham-Hotspur-September-24-2016-Premier-League
https://fbref.com/en/matches/568c83cf/Everton-Swansea-City-November-19-2016-Premier-League
https://fbref.com/en/matches/5f295262/Liverpool-Bournemouth-April-5-2017-Premier-League
https://fbref.com/en/matches/09a5904c/Swansea-City-Everton-May-6-2017-Premier-League
https://fbref.com/en/matches/766a29a9/Southampton-Tottenham-Hotspur-December-28-2016-Premier-League
https://fbref.com/en/matches/205a52ed/Watford-West-Bromwich-Albion-A

https://fbref.com/en/matches/3e5420a6/Everton-Burnley-April-15-2017-Premier-League
https://fbref.com/en/matches/c2d0e082/Burnley-Middlesbrough-December-26-2016-Premier-League
https://fbref.com/en/matches/1560b386/Chelsea-Swansea-City-February-25-2017-Premier-League
https://fbref.com/en/matches/b123da8c/Watford-Arsenal-August-27-2016-Premier-League
180
https://fbref.com/en/matches/92c4eebc/Swansea-City-Burnley-March-4-2017-Premier-League
https://fbref.com/en/matches/f3357cd6/Tottenham-Hotspur-Everton-March-5-2017-Premier-League
https://fbref.com/en/matches/80c3d431/West-Bromwich-Albion-Crystal-Palace-March-4-2017-Premier-League
https://fbref.com/en/matches/d8bbb1d5/Swansea-City-Watford-October-22-2016-Premier-League
https://fbref.com/en/matches/c0ed0645/Everton-Chelsea-April-30-2017-Premier-League
https://fbref.com/en/matches/8d876e96/West-Bromwich-Albion-Southampton-April-8-2017-Premier-League
https://fbref.com/en/matches/65448ede/Hull-City-Manchester-City-December-26-2016-Premier-Leag

https://fbref.com/en/matches/5a03c66a/Everton-Middlesbrough-September-17-2016-Premier-League
https://fbref.com/en/matches/32a93ec5/Arsenal-West-Bromwich-Albion-December-26-2016-Premier-League
https://fbref.com/en/matches/442f489d/West-Bromwich-Albion-Swansea-City-December-14-2016-Premier-League
https://fbref.com/en/matches/6dc6ac2e/Bournemouth-Burnley-May-13-2017-Premier-League
https://fbref.com/en/matches/06033702/Chelsea-Bournemouth-December-26-2016-Premier-League
https://fbref.com/en/matches/77d729ce/Swansea-City-West-Ham-United-December-26-2016-Premier-League
https://fbref.com/en/matches/7e89be76/Hull-City-West-Bromwich-Albion-November-26-2016-Premier-League
270
https://fbref.com/en/matches/20d83945/Everton-Stoke-City-August-27-2016-Premier-League
https://fbref.com/en/matches/6c231d40/Sunderland-Burnley-March-18-2017-Premier-League
https://fbref.com/en/matches/aaad3544/Stoke-City-West-Bromwich-Albion-September-24-2016-Premier-League
https://fbref.com/en/matches/f82b8b32/Watford-Che

https://fbref.com/en/matches/f6b78ab8/Leicester-City-Swansea-City-August-27-2016-Premier-League
https://fbref.com/en/matches/a09c07c5/Everton-Watford-May-12-2017-Premier-League
https://fbref.com/en/matches/8db69c8c/Manchester-United-Chelsea-April-16-2017-Premier-League
https://fbref.com/en/matches/613431fe/Stoke-City-Hull-City-April-15-2017-Premier-League
https://fbref.com/en/matches/fa337038/Everton-Tottenham-Hotspur-August-13-2016-Premier-League
https://fbref.com/en/matches/8cecbb4b/Sunderland-Middlesbrough-August-21-2016-Premier-League
https://fbref.com/en/matches/eae4b047/Southampton-Arsenal-May-10-2017-Premier-League
https://fbref.com/en/matches/a41c0118/Everton-West-Bromwich-Albion-March-11-2017-Premier-League
https://fbref.com/en/matches/14625650/Middlesbrough-West-Bromwich-Albion-January-31-2017-Premier-League
360
https://fbref.com/en/matches/46ad94b6/Crystal-Palace-Manchester-United-December-14-2016-Premier-League
https://fbref.com/en/matches/d971823a/Arsenal-Sunderland-May-16

In [122]:
# Turns the dict of column lists into a DataFrame (one row per match)
df_prime = pd.DataFrame(data=df)

In [125]:
# Saves this season's sheet; the row index is written too, which is what
# produces the "Unnamed: 0" column when the file is read back
df_prime.to_csv(path_or_buf="matchup_data_2016.csv", index=True)

In [124]:
# Displays the assembled match DataFrame as this cell's output
df_prime

Unnamed: 0,url,date,week,home_team,away_team,home_record,away_record,home_score,away_score,home_possession,...,home_clearances,away_clearances,home_offsides,away_offsides,home_goal_kicks,away_goal_kicks,home_throwins,away_throwins,home_longballs,away_longballs
0,https://fbref.com/en/matches/894a485d/Liverpoo...,"Saturday December 31, 2016",19,Liverpool,Manchester-City,13-4-2,12-3-4,1,0,43,...,28,33,2,2,9,8,33,25,80,58
1,https://fbref.com/en/matches/f85bb62d/Manchest...,"Friday August 19, 2016",2,Manchester-United,Southampton,2-0-0,0-1-1,2,0,43,...,20,22,2,2,10,8,29,21,48,71
2,https://fbref.com/en/matches/d968d3e0/Hull-Cit...,"Wednesday April 5, 2017",31,Hull-City,Middlesbrough,8-6-17,4-11-15,4,2,48,...,40,12,1,1,7,9,23,24,68,89
3,https://fbref.com/en/matches/482a100b/Tottenha...,"Saturday January 14, 2017",21,Tottenham-Hotspur,West-Bromwich-Albion,13-6-2,8-5-8,4,0,73,...,20,24,1,1,5,10,26,24,70,66
4,https://fbref.com/en/matches/d4d1348d/Liverpoo...,"Saturday January 21, 2017",22,Liverpool,Swansea-City,13-6-3,5-3-14,2,3,74,...,11,44,2,1,4,13,22,25,62,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,https://fbref.com/en/matches/3d216fb0/Swansea-...,"Saturday October 1, 2016",7,Swansea-City,Liverpool,1-1-5,5-1-1,1,2,41,...,35,17,1,2,10,7,24,17,68,61
376,https://fbref.com/en/matches/25e9da07/Hull-Cit...,"Saturday March 11, 2017",28,Hull-City,Swansea-City,6-6-16,8-3-17,2,1,50,...,21,28,7,3,10,8,16,20,71,86
377,https://fbref.com/en/matches/250a2859/Chelsea-...,"Saturday November 5, 2016",11,Chelsea,Everton,8-1-2,5-3-3,5,0,60,...,14,25,2,1,4,4,19,15,77,60
378,https://fbref.com/en/matches/897b5086/Bournemo...,"Tuesday December 13, 2016",16,Bournemouth,Leicester-City,6-3-7,4-4-8,1,0,63,...,44,27,4,2,7,9,26,28,63,71


In [126]:
# Loads the three saved sheets, removing the listed columns from each so
# they can be combined into one
df1 = pd.read_csv("matchup_data.csv").drop(
    columns=["date_timestamp", "home_score_xg", "away_score_xg"])
df2 = pd.read_csv("matchup_data_2017.csv").drop(
    columns=["Unnamed: 0", "home_score_xg", "away_score_xg"])
df3 = pd.read_csv("matchup_data_2016.csv").drop(
    columns=["Unnamed: 0"])

In [127]:
# Stacks the three sheets row-wise into one frame. DataFrame.append was
# removed in pandas 2.0; pd.concat is its replacement and, like append,
# keeps each sheet's original row labels and column order.
df = pd.concat([df1, df2, df3])

In [128]:
# Adds a numerical date value for each game

# English month name -> month number (1-12)
month_map = dict(zip(
    (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ),
    range(1, 13),
))


def date_transform(date_string):
    ''' Turn a date such as "Saturday December 31, 2016" into the integer 20161231

    The string is split on whitespace: token 1 is the month name, token 2 the
    day followed by one trailing character (the comma), token 3 the year.
    The result is year * 10000 + month * 100 + day, so sorting by it sorts
    the dates chronologically.
    '''
    tokens = date_string.split()

    year_number = int(tokens[3])
    day_number = int(tokens[2][:-1])
    month_number = month_map[tokens[1]]

    return (year_number * 100 + month_number) * 100 + day_number

# Sortable integer date for every row; assigned as a plain list, so values
# are matched to rows by position
df["date_value"] = list(map(date_transform, df["date"]))

In [129]:
# Orders the combined sheet by its integer date and writes it out without
# the row index
df = df.sort_values("date_value", ascending=True)
df.to_csv(path_or_buf="./matchup_data_all.csv", index=False)