In [2]:
# Returns a DataFrame containing the per-player stats scraped from fbref.com for the 2017-2018, 2018-2019 and 2019-2020 seasons
def scrape_europe(path_to_chromedriver="", league_names=['epl', 'laliga', 'bundesliga', 'seriea', 'ligue1', 'championsleague','europaleague']):
    """Scrape per-player statistics from fbref.com into a single DataFrame.

    For every requested league and each of the seasons 2017-2018, 2018-2019
    and the "current" season page (labelled 2019-2020), the stats tables for
    the categories passing, shooting, misc, possession, defense, gca and
    passing_types are loaded in a headless Chrome session and joined
    column-wise on the player name.

    Args:
        path_to_chromedriver: Path to a chromedriver executable. When empty
            (the default), webdriver_manager locates/downloads one.
        league_names: A league code or an iterable (list, tuple, ndarray) of
            league codes. Available codes: 'epl', 'laliga', 'bundesliga',
            'seriea', 'ligue1', 'championsleague', 'europaleague'. The
            default list is never mutated.

    Returns:
        A DataFrame indexed by player name, with a 'League' column, the
        flattened stat columns of every category, and a 'year' column.

    Raises:
        ValueError: If ``league_names`` matches none of the available codes,
            or contains entries that are not available codes.
    """
    # Season URL fragments, oldest first; '' is the current-season page,
    # which has no year in its URL.
    years = ("/2017-2018-", "/2018-2019-", "")
    # league code -> (fbref competition id, URL slug, season ids aligned with
    # `years`). An empty season id means "no id segment in the URL".
    leagues = {
        'epl': ('9', 'Premier-League', ("1631/", "1889/", "")),
        'laliga': ('12', 'La-Liga', ("1652/", "1886/", "")),
        'bundesliga': ('20', 'Bundesliga', ("1634/", "2109/", "")),
        'seriea': ('11', 'Serie-A', ("1632/", "2104/", "")),
        'ligue1': ('13', 'Ligue-1', ("1640/", "1896/", "")),
        'championsleague': ('8', 'Champions-League', ("", "2102/", "")),
        'europaleague': ('19', 'Europa-League', ("", "2103/", "")),
    }

    # Argument validation runs first so bad input fails before a browser is
    # started (and before the heavy third-party imports below).
    if isinstance(league_names, str):
        requested = [league_names]
    else:
        try:
            requested = list(league_names)
        except TypeError:
            raise ValueError(
                "league_names should be a list of characters. Available options are 'epl', 'laliga', 'bundesliga',  "
                "'seriea', 'ligue1', 'championsleague','europaleague'") from None

    # Keep the canonical league order regardless of the order requested.
    selected = [code for code in leagues if code in requested]
    if not selected:
        raise ValueError(
            "league_names should be a list of characters. Available options are 'epl', 'laliga', 'bundesliga',  "
            "'seriea', 'ligue1', 'championsleague','europaleague'")
    elif len(selected) != len(requested):
        raise ValueError(
            "league_names contains strings that weren't matched. Available options are 'epl', 'laliga', 'bundesliga', "
            " 'seriea', 'ligue1', 'championsleague','europaleague'")

    # imports necessary libraries
    ## some must be downloaded
    import numpy as np
    import pandas as pd
    from balaban.utils import get_col_dtype
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager

    # Creates the Chromedriver connection
    chrome_options = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    # Uses the caller's driver if given, otherwise finds / downloads ChromeDriver
    driver_path = path_to_chromedriver or ChromeDriverManager().install()
    # The options must be handed to the driver, otherwise the flags above
    # (including --headless) have no effect.
    browser = webdriver.Chrome(driver_path, options=chrome_options)

    def read_stats_table(category):
        """Return (raw table DataFrame, flattened column names) for the current page."""
        container = browser.find_element_by_id('div_stats_' + category)
        table = container.find_element_by_xpath("table")
        raw = pd.read_html(table.get_attribute('outerHTML'))[0]
        # Creates singular column names from hierarchical column names to
        # prevent column name redundancy ("Group: Stat", or just "Stat" when
        # the top level is an 'Unnamed:' placeholder).
        names = np.array(
            [('Unnamed:' not in val[0]) * (val[0] + ': ') + val[1] for val in raw.columns.values])
        return raw, names

    # The pages we will iterate through; 'passing' must stay first because it
    # provides the base frame (player identity columns) the others join onto.
    categories = ['passing', 'shooting', 'misc', 'possession', 'defense', 'gca', 'passing_types']

    season_frames = []
    try:
        for season_idx, year in enumerate(years):
            league_frames = []
            for code in selected:
                lnum, lnam, season_ids = leagues[code]
                ynum = season_ids[season_idx]
                # There is no data for this year and these competitions, so we do not attempt to access it
                if year == "/2017-2018-" and lnam in ("Champions-League", "Europa-League"):
                    continue

                league_df = None
                for category in categories:
                    # the current year has a different url structure: no
                    # year fragment, just a '/' before the league slug
                    season_part = year if year != "" else '/'
                    url = ('https://fbref.com/en/comps/' + lnum + '/' + ynum + category
                           + season_part + lnam + '-Stats')
                    # Uses the webdriver to retrieve the URL
                    browser.get(url)
                    raw, col_names = read_stats_table(category)

                    # We operate differently upon our first page for each league/year
                    # because we only want certain fields once
                    if category == 'passing':
                        # Columns we want / 0 is Rk, 6 is Birth Year, the last one is dropped, we keep the rest
                        keep = np.r_[np.arange(1, 5), np.arange(7, raw.shape[1] - 1)]
                        values = np.array(raw)[:, keep]
                        # Creates a league column with lnam as the league name
                        values = np.c_[np.tile(lnam, values.shape[0]), values]
                        league_df = pd.DataFrame(values)
                        league_df.columns = np.r_[np.array('League'), col_names[keep]]
                        # Drops the rows where Player is the Player name (repeated header rows)
                        league_df = league_df[league_df['Player'] != 'Player']
                        # Casts all the columns to uniform datatypes
                        league_df = league_df.astype(league_df.apply(get_col_dtype).to_dict())
                        # Sets player as the index
                        league_df.index = league_df['Player']
                        # Cleans nation name (keeps the second space-separated token)
                        league_df["Nation"] = league_df['Nation'].str.split(" ").str[1]
                    else:
                        # Only the player name (join key) plus the stat columns from 8 on
                        keep = np.r_[1, np.arange(8, raw.shape[1] - 1)]
                        stats_df = pd.DataFrame(np.array(raw)[:, keep])
                        stats_df.columns = col_names[keep]
                        stats_df = stats_df[stats_df['Player'] != 'Player']
                        stats_df = stats_df.astype(stats_df.apply(get_col_dtype).to_dict())
                        stats_df = stats_df.set_index(list(stats_df)[0])
                        league_df = pd.concat([league_df, stats_df], axis=1, sort=False)
                # Adds the data from the given league to the data for the given year
                league_frames.append(league_df)

            if not league_frames:
                # Every requested league was skipped for this season.
                continue
            year_df = pd.concat(league_frames)
            # Creates a year column in the df
            # NOTE(review): the current-season page is hard-labelled 2019-2020.
            year_df["year"] = year[1:-1] if year != "" else "2019-2020"
            season_frames.append(year_df)
    finally:
        # Always shut the browser down so no Chrome process is leaked.
        browser.quit()

    return pd.concat(season_frames) if season_frames else pd.DataFrame()

# Run the scrape with no explicit chromedriver path (empty string).
df = scrape_europe(path_to_chromedriver="")

[WDM] - Looking for [chromedriver 83.0.4103.39 mac64] driver in cache 
[WDM] - File found in cache by path [/Users/keesvanhemmen/.wdm/drivers/chromedriver/83.0.4103.39/mac64/chromedriver]


In [3]:
# Write the scraped stats to a CSV file in the working directory.
df.to_csv(path_or_buf="fbrefEurope.csv")