In [1]:
import pandas as pd
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome is started headless (no visible window) with GPU acceleration disabled.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# `options=` is the supported keyword; `chrome_options=` is deprecated in
# Selenium 3.8+ and no longer accepted by later releases.
driver = webdriver.Chrome(options=options)

# League landing pages whose standings tables are scraped for team links.
league_urls = ["https://www.whoscored.com/Regions/252/Tournaments/2/England-Premier-League",
               "https://www.whoscored.com/Regions/108/Tournaments/5/Italy-Serie-A",
               "https://www.whoscored.com/Regions/206/Tournaments/4/Spain-La-Liga",
               "https://www.whoscored.com/Regions/81/Tournaments/3/Germany-Bundesliga",
               "https://www.whoscored.com/Regions/74/Tournaments/22/France-Ligue-1",
               "https://www.whoscored.com/Regions/177/Tournaments/21/Portugal-Liga-NOS",
               "https://www.whoscored.com/Regions/155/Tournaments/13/Netherlands-Eredivisie",
               "https://www.whoscored.com/Regions/225/Tournaments/17/Turkey-Super-Lig"]

In [2]:
def get_team_urls(driver, league_url, sleep_time):
    """Collect the teams listed in a league page's standings table.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; it is navigated to ``league_url``.
    league_url : str
        Address of the league page to load.
    sleep_time : int or float
        Seconds to pause after navigation before the page is read.

    Returns
    -------
    pandas.DataFrame
        One row per element matching ``.standings > tr > .team`` with columns
        ``team_id`` (the fifth "/"-separated piece of the link's href),
        ``team_name`` (the link text) and ``team_url`` (the link's href).
        The frame is empty, with the same columns, when nothing matches.
    """
    driver.get(league_url)

    # Fixed pause so the page has time to render before it is queried.
    time.sleep(sleep_time)

    records = []
    for team in driver.find_elements_by_css_selector(".standings > tr > .team"):
        # Every element lookup is a WebDriver round trip, so resolve the link
        # once and reuse it for all three fields.
        link = team.find_element_by_css_selector(".team-link")
        href = link.get_attribute("href")
        records.append({
            "team_id": href.split("/")[4],
            "team_name": link.text,
            "team_url": href,
        })

    # Build the frame in one go instead of enlarging it row by row.
    return pd.DataFrame(records, columns=["team_id", "team_name", "team_url"])

In [3]:
def replace_dash(df):
    """Swap every cell that is exactly '-' for 0, column by column.

    A nested ``{column: {'-': 0}}`` mapping covering all of ``df``'s columns
    is handed to ``DataFrame.replace``; the result of that call is returned
    and ``df`` itself is left as it was.
    """
    dash_to_zero = {'-': 0}
    per_column_rules = dict.fromkeys(df.columns, dash_to_zero)
    return df.replace(per_column_rules)

In [4]:
def get_player_summary_stats(driver, team_url, sleep_time):
    """Scrape the default player statistics table of a team page.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; it is navigated to ``team_url``.
    team_url : str
        Address of the team page to load.
    sleep_time : int or float
        Seconds to pause after navigation before the page is read.

    Returns
    -------
    pandas.DataFrame
        One row per ``#player-table-statistics-body > tr`` element, passed
        through ``replace_dash`` so that '-' cells become 0. All scraped
        values are stripped strings, except ``substituted_on`` which is the
        integer 0 when the appearances cell has no parenthesised part.
    """
    columns = ["player_id", "player_name", "team", "country", "age", "position",
               "height_cm", "weight_kg", "first_eleven", "substituted_on",
               "minutes_played", "yellow_cards", "red_cards", "man_of_the_match",
               "avg_rating", "aerials_won"]

    def cell_text(row, selector):
        # Stripped text of the first element under `row` matching `selector`.
        return row.find_element_by_css_selector(selector).text.strip()

    driver.get(team_url)

    # Fixed pause so the page has time to render before it is queried.
    time.sleep(sleep_time)

    records = []
    for player in driver.find_elements_by_css_selector("#player-table-statistics-body > tr"):
        # Each lookup is a WebDriver round trip; fetch the shared pieces once.
        cells = player.find_elements_by_css_selector("td")
        link = player.find_element_by_css_selector(".player-link")
        info_spans = player.find_elements_by_css_selector(".pn > span")
        flag_classes = player.find_element_by_css_selector(".ui-icon").get_attribute("class")

        # The appearances cell reads "<starts>(<substitute appearances>)";
        # the parenthesised part is absent for players never brought on.
        apps = cells[5].text.strip().split("(")
        first_eleven = apps[0]
        substituted_on = apps[1].replace(")", "") if len(apps) > 1 else 0

        records.append({
            "player_id": link.get_attribute("href").split("/")[4],
            "player_name": link.text.strip(),
            "team": cell_text(player, ".team-header-name"),
            # Third class token, second "-"-separated piece of it.
            "country": flag_classes.split(" ")[2].split("-")[1].strip(),
            "age": info_spans[0].text.strip(),
            # Drop the first comma from the position text.
            "position": info_spans[1].text.replace(",", "", 1).strip(),
            "height_cm": cells[3].text.strip(),
            "weight_kg": cells[4].text.strip(),
            "first_eleven": first_eleven,
            "substituted_on": substituted_on,
            "minutes_played": cell_text(player, ".minsPlayed"),
            "yellow_cards": cell_text(player, ".yellowCard"),
            "red_cards": cell_text(player, ".redCard"),
            "man_of_the_match": cell_text(player, ".manOfTheMatch"),
            "avg_rating": cell_text(player, ".rating"),
            "aerials_won": cell_text(player, ".aerialWonPerGame"),
        })

    # Build the frame in one go instead of enlarging it row by row.
    return replace_dash(pd.DataFrame(records, columns=columns))

In [5]:
def get_player_defensive_stats(driver, team_url, sleep_time):
    """Scrape the per-player table under ``#team-squad-stats-defensive``.

    The team page is loaded, scrolled to the bottom, and the link inside the
    second ``#team-squad-stats-options > li`` item is clicked (presumably the
    "Defensive" tab -- confirm against the live page) before the table rows
    are read.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; it is navigated to ``team_url``.
    team_url : str
        Address of the team page to load.
    sleep_time : int or float
        Seconds to pause after navigation and again after the tab click.

    Returns
    -------
    pandas.DataFrame
        One row per table row with ``player_id`` plus the defensive columns,
        passed through ``replace_dash`` so that '-' cells become 0.

    Raises
    ------
    IndexError
        If fewer than two option items are found on the page.
    """
    # (output column, CSS class of the cell holding the value), in output order.
    stat_selectors = (
        ("tackles", ".tacklePerGame"),
        ("interceptions", ".interceptionPerGame"),
        ("fouls", ".foulsPerGame"),
        ("offsides_won", ".offsideWonPerGame"),
        ("clearances", ".clearancePerGame"),
        ("dribbled_past", ".wasDribbledPerGame"),
        ("blocks", ".outfielderBlockPerGame"),
        ("own_goals", ".goalOwn"),
    )
    columns = ["player_id"] + [column for column, _ in stat_selectors]

    driver.get(team_url)
    time.sleep(sleep_time)

    # Scroll to the bottom of the page before clicking the tab link.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    tabs = driver.find_elements_by_css_selector("#team-squad-stats-options > li")
    tabs[1].find_element_by_css_selector("a").click()

    # Second pause: give the newly selected table time to populate.
    time.sleep(sleep_time)

    records = []
    for player in driver.find_elements_by_css_selector(
            "#team-squad-stats-defensive #player-table-statistics-body > tr"):
        record = {
            "player_id": player.find_element_by_css_selector(".player-link").get_attribute("href").split("/")[4],
        }
        for column, selector in stat_selectors:
            record[column] = player.find_element_by_css_selector(selector).text.strip()
        records.append(record)

    # Build the frame in one go instead of enlarging it row by row.
    return replace_dash(pd.DataFrame(records, columns=columns))

In [6]:
def get_player_offensive_stats(driver, team_url, sleep_time):
    """Scrape the per-player table under ``#team-squad-stats-offensive``.

    The team page is loaded, scrolled to the bottom, and the link inside the
    third ``#team-squad-stats-options > li`` item is clicked (presumably the
    "Offensive" tab -- confirm against the live page) before the table rows
    are read.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; it is navigated to ``team_url``.
    team_url : str
        Address of the team page to load.
    sleep_time : int or float
        Seconds to pause after navigation and again after the tab click.

    Returns
    -------
    pandas.DataFrame
        One row per table row with ``player_id`` plus the offensive columns,
        passed through ``replace_dash`` so that '-' cells become 0.

    Raises
    ------
    IndexError
        If fewer than three option items are found on the page.
    """
    # (output column, CSS class of the cell holding the value), in output order.
    stat_selectors = (
        ("goals", ".goal"),
        ("assists", ".assistTotal"),
        ("shots", ".shotsPerGame"),
        ("key_passes", ".keyPassPerGame"),
        ("dribbles", ".dribbleWonPerGame"),
        ("fouled", ".foulGivenPerGame"),
        ("offsides", ".offsideGivenPerGame"),
        ("dispossessed", ".dispossessedPerGame"),
        ("bad_controls", ".turnoverPerGame"),
    )
    columns = ["player_id"] + [column for column, _ in stat_selectors]

    driver.get(team_url)
    time.sleep(sleep_time)

    # Scroll to the bottom of the page before clicking the tab link.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    tabs = driver.find_elements_by_css_selector("#team-squad-stats-options > li")
    tabs[2].find_element_by_css_selector("a").click()

    # Second pause: give the newly selected table time to populate.
    time.sleep(sleep_time)

    records = []
    for player in driver.find_elements_by_css_selector(
            "#team-squad-stats-offensive #player-table-statistics-body > tr"):
        record = {
            "player_id": player.find_element_by_css_selector(".player-link").get_attribute("href").split("/")[4],
        }
        for column, selector in stat_selectors:
            record[column] = player.find_element_by_css_selector(selector).text.strip()
        records.append(record)

    # Build the frame in one go instead of enlarging it row by row.
    return replace_dash(pd.DataFrame(records, columns=columns))

In [7]:
def get_player_passing_stats(driver, team_url, sleep_time):
    """Scrape the per-player table under ``#team-squad-stats-passing``.

    The team page is loaded, scrolled to the bottom, and the link inside the
    fourth ``#team-squad-stats-options > li`` item is clicked (presumably the
    "Passing" tab -- confirm against the live page) before the table rows
    are read.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; it is navigated to ``team_url``.
    team_url : str
        Address of the team page to load.
    sleep_time : int or float
        Seconds to pause after navigation and again after the tab click.

    Returns
    -------
    pandas.DataFrame
        One row per table row with ``player_id`` plus the passing columns,
        passed through ``replace_dash`` so that '-' cells become 0.

    Raises
    ------
    IndexError
        If fewer than four option items are found on the page.
    """
    # (output column, CSS class of the cell holding the value), in output order.
    stat_selectors = (
        ("passes", ".totalPassesPerGame"),
        ("pass_accuracy", ".passSuccess"),
        ("crosses", ".accurateCrossesPerGame"),
        ("long_balls", ".accurateLongPassPerGame"),
        ("through_balls", ".accurateThroughBallPerGame"),
    )
    columns = ["player_id"] + [column for column, _ in stat_selectors]

    driver.get(team_url)
    time.sleep(sleep_time)

    # Scroll to the bottom of the page before clicking the tab link.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    tabs = driver.find_elements_by_css_selector("#team-squad-stats-options > li")
    tabs[3].find_element_by_css_selector("a").click()

    # Second pause: give the newly selected table time to populate.
    time.sleep(sleep_time)

    records = []
    for player in driver.find_elements_by_css_selector(
            "#team-squad-stats-passing #player-table-statistics-body > tr"):
        record = {
            "player_id": player.find_element_by_css_selector(".player-link").get_attribute("href").split("/")[4],
        }
        for column, selector in stat_selectors:
            record[column] = player.find_element_by_css_selector(selector).text.strip()
        records.append(record)

    # Build the frame in one go instead of enlarging it row by row.
    return replace_dash(pd.DataFrame(records, columns=columns))

In [8]:
# Seconds to pause after every page load / tab click in the scraping helpers.
SLEEP_SECONDS = 2

# Column order of the final table: summary, defensive, offensive, passing.
player_stats_columns = ["player_id", "player_name", "team", "country", "age", "position", "height_cm", "weight_kg",
                        "first_eleven", "substituted_on", "minutes_played", "yellow_cards", "red_cards",
                        "man_of_the_match", "avg_rating", "aerials_won", "tackles", "interceptions", "fouls",
                        "offsides_won", "clearances", "dribbled_past", "blocks", "own_goals", "goals",
                        "assists", "shots", "key_passes", "dribbles", "fouled", "offsides", "dispossessed",
                        "bad_controls", "passes", "pass_accuracy", "crosses", "long_balls", "through_balls"]

# One merged frame per team; concatenated once after the loop because
# DataFrame.append was removed in pandas 2.0 and re-copies the data each call.
team_frames = []

try:
    for league in league_urls:

        team_urls = get_team_urls(driver, league, SLEEP_SECONDS)

        for team in team_urls["team_url"]:
            summary_df = get_player_summary_stats(driver, team, SLEEP_SECONDS)
            defensive_df = get_player_defensive_stats(driver, team, SLEEP_SECONDS)
            offensive_df = get_player_offensive_stats(driver, team, SLEEP_SECONDS)
            passing_df = get_player_passing_stats(driver, team, SLEEP_SECONDS)

            # Inner joins: a player missing from any of the four tables is dropped.
            team_frames.append(
                summary_df
                .merge(defensive_df, on="player_id")
                .merge(offensive_df, on="player_id")
                .merge(passing_df, on="player_id")
            )
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and runs even when a scrape raises, so no browser is left behind.
    driver.quit()

if team_frames:
    player_stats_df = (
        pd.concat(team_frames, ignore_index=True)
        .reindex(columns=player_stats_columns)
    )
else:
    # Nothing scraped: keep an empty table with the expected header.
    player_stats_df = pd.DataFrame(columns=player_stats_columns)

player_stats_df.to_csv("player_stats.csv", encoding="utf-8")