In [None]:
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver import ChromeOptions

In [None]:
# Make the working directory searchable for executables.  A shell `export`
# launched through os.system() only affects that short-lived subshell, so the
# PATH of this kernel process has to be edited through os.environ instead.
# The membership check keeps the cell idempotent when it is re-run.
working_dir = os.getcwd()
search_path = os.environ.get("PATH", "")
if working_dir not in search_path.split(os.pathsep):
    os.environ["PATH"] = (
        os.pathsep.join([search_path, working_dir]) if search_path else working_dir
    )

# Headless Chrome: no window is opened while pages are scraped.
chrome_options = ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')

# Location of the chromedriver binary.  Defaults to a `chromedriver` file in
# the working directory (the project directory when the notebook is started
# from there); set the CHROMEDRIVER_PATH environment variable to point
# somewhere else instead of editing the notebook.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", os.path.join(working_dir, "chromedriver"))

# NOTE(review): `executable_path` is the Selenium 3 style keyword; newer
# Selenium releases expect a Service object instead -- confirm against the
# installed version.
driver = webdriver.Chrome(
    options=chrome_options,
    executable_path=chromedriver_path,
)

In [None]:
def get_team_urls(season):
    ''' Get premier league team urls for a given season

    Loads Manchester City's squad page for ``season`` with the shared
    ``driver`` and walks its fixture table.  Every opponent met in a fixture
    labelled "Premier League" contributes one squad-page url.

    Args:
        season: season label substituted into the fbref urls, e.g. "2018-2019".

    Returns:
        List of unique squad-page urls: Manchester City first, then each
        opponent in the order it first appears in the fixture table.

    Raises:
        IndexError, AttributeError: if the page does not contain the schedule
            markup this scraper expects.
    '''

    # Manchester City's page is the entry point and is part of the result
    url = "https://fbref.com/en/squads/b8fd03ef/{}/Manchester-City".format(season)
    team_urls = [url]
    seen_urls = {url}

    # Loads page and gets matchup records for Man City in the current season
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    matchup_records = soup.body.find_all(
        "div", {"id": "all_kitchen_sink_sched"})[0].find("div").find("div").find_all(
            "div", {"class": "table_outer_container mobile_table"})[0].find_all(
                "div", {"class": "overthrow table_container"})[0].find_all(
                    "tbody")[0].find_all("tr")

    # Loops through records and extracts links to opponent pages
    for record in matchup_records:

        # Rows without a linked competition in their header cell (no <th> or
        # no <a> inside it) are not fixtures we can classify; skip them.
        try:
            league = record.th.a.text
        except AttributeError:
            continue

        # Makes sure matchup is premier league
        if league != "Premier League":
            continue

        # Gets opponent page url; pieces 3 and 5 of the href are reused as the
        # team code and team name of the season-specific squad url.
        opponent_link = record.find("td", {"data-stat": "opponent"}).a["href"]
        link_parts = opponent_link.split("/")
        team_url = "https://fbref.com/en/squads/{}/{}/{}".format(
            link_parts[3], season, link_parts[5])

        # Dedups team urls while keeping first-seen order, so the result is
        # the same on every run (a plain set() would shuffle it).
        if team_url not in seen_urls:
            seen_urls.add(team_url)
            team_urls.append(team_url)

    return team_urls

In [None]:
def get_matchup_urls(team_url):
    ''' Get all the matchup urls corresponding to a team url

    Loads the squad page at ``team_url`` with the shared ``driver`` and reads
    its fixture table.  For every fixture labelled "Premier League" the
    match-report link is turned into an absolute fbref match url.

    Args:
        team_url: absolute url of an fbref squad page.

    Returns:
        List of match urls in fixture-table order.

    Raises:
        IndexError, AttributeError: if the page does not contain the schedule
            markup this scraper expects.
    '''

    # Loads page and gets matchup records
    driver.get(team_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    matchup_records = soup.body.find_all(
        "div", {"id": "all_kitchen_sink_sched"})[0].find("div").find("div").find_all(
            "div", {"class": "table_outer_container mobile_table"})[0].find_all(
                "div", {"class": "overthrow table_container"})[0].find_all(
                    "tbody")[0].find_all("tr")

    # Loops through records and extracts match report links
    matchup_urls = []
    for record in matchup_records:

        # Rows without a linked competition in their header cell (no <th> or
        # no <a> inside it) are not fixtures we can classify; skip them.
        try:
            league = record.th.a.text
        except AttributeError:
            continue

        # Makes sure matchup is premier league
        if league != "Premier League":
            continue

        # A fixture row may lack a match-report link (presumably when the
        # match has no report yet -- TODO confirm); there is nothing to
        # scrape for it, so skip rather than fail on the missing tag.
        report_cell = record.find("td", {"data-stat": "match_report"})
        if report_cell is None or report_cell.a is None:
            continue

        # Pieces 3 and 4 of the href are reused as match code and match title
        link_parts = report_cell.a["href"].split("/")
        matchup_urls.append("https://fbref.com/en/matches/{}/{}".format(
            link_parts[3], link_parts[4]))

    return matchup_urls

In [None]:
# Gets urls to all matchups in given seasons

all_matchups = []

# Loops through season
for season in ["2018-2019"]:

    # Gets the team urls for the season
    team_urls = get_team_urls(season)

    # For each team, get all their matchup urls
    for team_url in team_urls:
        print("Getting matches for url {}".format(team_url))

        # extend() grows the list in place instead of copying it every time
        all_matchups.extend(get_matchup_urls(team_url))

# Dedup matchup urls (every match shows up on both teams' pages).
# dict.fromkeys keeps first-seen order, so the scrape order -- and therefore
# the row order of the exported table -- is the same on every run.
all_matchups = list(dict.fromkeys(all_matchups))

In [None]:
# Sanity check: number of unique match urls collected by the previous cell
print("Extracted {} matchup urls".format(len(all_matchups)))

In [None]:
# Stats shown as "<number><one trailing character>" inside the <strong> tags
# of the #team_stats box.  Tags come in home/away pairs, in this order.
_TEAM_STATS_FIELDS = ("possession", "pass_acc", "sot", "saves")

# (field, home position, away position) of the value <div>s inside the
# #team_stats_extra box, listed in output order.  The positions mirror the
# page layout this scraper was written against.
_EXTRA_STATS_SLOTS = (
    ("fouls", 4, 6),
    ("corners", 7, 9),
    ("crosses", 10, 12),
    ("touches", 13, 15),
    ("tackles", 20, 22),
    ("ints", 23, 25),
    ("aerials", 26, 28),
    ("clearances", 29, 31),
    ("offsides", 36, 38),
    ("goal_kicks", 39, 41),
    ("throwins", 42, 44),
    ("longballs", 45, 47),
)


def _int_without_last_char(tags, position):
    ''' int() of tags[position].text minus its final character, else 0

    Returns 0 when the tag is missing or its text is not an integer, which
    is the placeholder the scraper uses for an unavailable stat.
    '''
    try:
        return int(tags[position].text[:-1])
    except (IndexError, ValueError):
        return 0


def get_match_data(match_url):
    ''' gets all associated match data from a given match page

    Loads ``match_url`` with the shared ``driver`` and reads the score box,
    the team-stats box and the extra-stats box of the page.

    Args:
        match_url: absolute url of an fbref match report page.

    Returns:
        Dict with, in this order: "url", "date", "week", "home_team",
        "away_team", "home_record", "away_record", "home_score",
        "away_score", "home_score_xg", "away_score_xg", then home_/away_
        pairs for possession, pass_acc, sot, saves (0 when unavailable) and
        for fouls, corners, crosses, touches, tackles, ints, aerials,
        clearances, offsides, goal_kicks, throwins and longballs.

    Raises:
        IndexError, AttributeError, ValueError: if the page lacks one of the
            mandatory elements or a mandatory value is not numeric.
    '''

    # Loads page
    driver.get(match_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    body = soup.body

    # Dict to store all match associated data
    match_dict = {"url": match_url}

    match_dict["date"] = body.find_all(
        "div", {"class": "scorebox_meta"})[0].find_all("strong")[0].a.text

    # The week number is what remains of the first <div> in #content after
    # dropping its first 26 characters and its last one (presumably a
    # "Premier League (Matchweek " prefix and a closing ")" -- TODO confirm).
    match_dict["week"] = int(body.find_all(
        "div", {"id": "content"})[0].div.text[26:-1])

    # Everything below comes out of the score box; look it up once
    scorebox = body.find_all("div", {"class": "scorebox"})[0]

    scorebox_strongs = scorebox.find_all("strong")
    match_dict["home_team"] = scorebox_strongs[0].a["href"].split("/")[5]
    match_dict["away_team"] = scorebox_strongs[3].a["href"].split("/")[5]

    scorebox_divs = scorebox.find_all("div")
    match_dict["home_record"] = scorebox_divs[6].text
    match_dict["away_record"] = scorebox_divs[16].text

    score_divs = scorebox.find_all("div", {"class": "score"})
    match_dict["home_score"] = int(score_divs[0].text)
    match_dict["away_score"] = int(score_divs[1].text)

    xg_divs = scorebox.find_all("div", {"class": "score_xg"})
    match_dict["home_score_xg"] = float(xg_divs[0].text)
    match_dict["away_score_xg"] = float(xg_divs[1].text)

    # Team stats are optional: a missing box or tag yields 0 for that field
    # instead of aborting the whole match.
    team_stats_boxes = body.find_all("div", {"id": "team_stats"})
    team_stats_strongs = (
        team_stats_boxes[0].find_all("strong") if team_stats_boxes else [])
    for pair, field in enumerate(_TEAM_STATS_FIELDS):
        match_dict["home_" + field] = _int_without_last_char(
            team_stats_strongs, 2 * pair)
        match_dict["away_" + field] = _int_without_last_char(
            team_stats_strongs, 2 * pair + 1)

    # Extra stats are mandatory: a missing value raises
    extra_divs = body.find_all(
        "div", {"id": "team_stats_extra"})[0].find_all("div")
    for field, home_pos, away_pos in _EXTRA_STATS_SLOTS:
        match_dict["home_" + field] = int(extra_divs[home_pos].text)
        match_dict["away_" + field] = int(extra_divs[away_pos].text)

    return match_dict

In [None]:
# Scrape every match page and accumulate the results column-wise:
# `df` maps each field name to the list of its values, one entry per match.
i = 0  # number of matches processed; stays 0 when there is nothing to scrape
df = {}
for i, m_url in enumerate(all_matchups, start=1):
    print(m_url)

    # Get the match data
    d = get_match_data(m_url)

    # Append values to df dict; setdefault creates each column on first use,
    # so the first match needs no special-casing.
    for key, value in d.items():
        df.setdefault(key, []).append(value)

    # Progress marker every 10 matches
    if i % 10 == 0:
        print(i)

In [None]:
# Turn the column-wise dict of lists into a table: one column per field,
# one row per scraped match.
df_prime = pd.DataFrame.from_dict(df, orient="columns")

In [None]:
# Persist the scraped table next to the notebook (relative path).  With the
# default to_csv settings the row index is written as the first column.
df_prime.to_csv("matchup_data.csv")

In [None]:
# NOTE(review): loads a single match page into the shared driver; nothing in
# the visible notebook reads the page afterwards, so this looks like a
# leftover from manual inspection -- confirm before keeping it.  The driver
# is also never shut down in the visible cells.
driver.get("https://fbref.com/en/matches/806cb036/Manchester-United-Brighton--Hove-Albion-January-19-2019-Premier-League")