# Crawling the WhoScored Website
1. https://www.whoscored.com/
2. Crawling
    - League Team
    - Player Summary
    - Player Defensive
    - Player Offensive
    - Player Passing
3. Save the Crawling Functions as Pickle Files

### Import Package

In [11]:
import pandas as pd
import time
from selenium import webdriver

### 0. Common Variables & Helper Functions

In [53]:
# seconds passed to time.sleep() after a page is opened or a stats tab is
# clicked, before the crawling functions read the page elements
api_delay_term = 5

def connect_webdriver(url):
    """
    start a PhantomJS webdriver, load url and return the driver

    parameter ---------------------------------------------------------------
    url : address of the page to open

    return ------------------------------------------------------------------
    webdriver instance with url loaded
    the caller is responsible for shutting the driver down

    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
    except Exception:
        # the caller never receives the driver when loading fails, so shut
        # the browser down here instead of leaving it running
        driver.quit()
        raise
    return driver

def replace_pd(df):
    """
    return a new dataframe in which every cell equal to "-" is replaced by 0

    parameter ---------------------------------------------------------------
    df : pandas dataframe to clean (it is not modified)

    return ------------------------------------------------------------------
    pandas dataframe with the same columns, "-" cells replaced by 0

    """

    # single replacement rule, applied to each column by name
    dash_to_zero = {'-': 0}
    per_column_rules = dict.fromkeys(df.columns, dash_to_zero)

    return df.replace(per_column_rules)

### 1. Crawling League Team Data Function

In [57]:
def crawling_league_teams(team_id):
    """
    crawl the team ids and team names listed on a team page

    parameter ---------------------------------------------------------------
    team_id : id of one team of the league you want & data type int or str

    return ------------------------------------------------------------------
    pandas dataframe columns=team_id, team_name
    one row per option found in the "#teams" element of the page

    """

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = connect_webdriver(url)

    try:
        # wait get league team datas
        time.sleep(api_delay_term)

        # get team datas (the id is the third "/"-separated part of the option value)
        team_rows = [
            {
                "team_id": option.get_attribute("value").split("/")[2],
                "team_name": option.text,
            }
            for option in driver.find_elements_by_css_selector("#teams option")
        ]
    finally:
        # quit (not close) so the browser is shut down even when crawling fails
        driver.quit()

    # make pandas dataframe in one step instead of appending row by row
    return pd.DataFrame(team_rows, columns=["team_id", "team_name"], dtype=object)

### 2. Crawling Players Summary Data Function

In [58]:
def crawling_player_summary(team_id):
    """
    crawl player summary data of a team

    parameter -------------------------------------------------------------------
    team_id : one of you want team_id of players & parameter data type int or str

    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, cells equal to "-" replaced by 0
    player_number, flag, name, age, position, tall, weight, full_time, half_time
    , mins, goals, asists, yel, red, spg, ps, motm, aw, rating

    """

    columns = [
        "player_number", "flag", "name", "age", "position",
        "tall", "weight", "full_time", "half_time", "mins",
        "goals", "asists", "yel", "red", "spg", "ps", "motm",
        "aw", "rating",
    ]

    # table cell index of every column that is read as plain cell text
    text_cells = {
        "tall": 3, "weight": 4, "mins": 6, "goals": 7, "asists": 8,
        "yel": 9, "red": 10, "spg": 11, "ps": 12, "aw": 13,
        "motm": 14, "rating": 15,
    }

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = connect_webdriver(url)

    try:
        # wait for getting data
        time.sleep(api_delay_term)

        player_rows = []

        # get player summary datas
        for element in driver.find_elements_by_css_selector("#player-table-statistics-body tr"):

            # look the cells up once per row; every lookup is a webdriver call
            cells = element.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            player_spans = cells[2].find_elements_by_css_selector("span")

            # split full time games and half time games, e.g. "a(b)" -> a, b
            games = cells[5].text.split("(")
            full_time = games[0]
            half_time = games[1].replace(")", "") if len(games) > 1 else 0

            # player dictionary data
            player_dict = {
                "player_number": player_link.get_attribute("href").split("/")[4],
                "flag": cells[1].find_elements_by_css_selector("span")[0].get_attribute("class").split("-")[2],
                "name": player_link.text,
                "age": player_spans[0].text,
                # drop the first character of the position text
                "position": player_spans[1].text[1:],
                "full_time": full_time,
                "half_time": half_time,
            }
            for column, cell_index in text_cells.items():
                player_dict[column] = cells[cell_index].text

            player_rows.append(player_dict)
    finally:
        # quit (not close) so the browser is shut down even when crawling fails
        driver.quit()

    # make pandas dataframe in one step instead of appending row by row
    player_summary_df = pd.DataFrame(player_rows, columns=columns, dtype=object)

    return replace_pd(player_summary_df)

### 3. Crawling Players Defensive Data Function

In [59]:
def crawling_player_defensive(team_id):
    """
    crawl player defensive data of a team

    parameter -------------------------------------------------------------------
    team_id : one of you want team_id of players & parameter data type int or str

    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, cells equal to "-" replaced by 0
    player_number, tackles, inter, fouls, offsides, clear, drb, blocks, owng

    """

    columns = [
        "player_number", "tackles", "inter", "fouls", "offsides",
        "clear", "drb", "blocks", "owng",
    ]

    # table cell index of every column that is read as plain cell text
    text_cells = {
        "tackles": 7, "inter": 8, "fouls": 9, "offsides": 10,
        "clear": 11, "drb": 12, "blocks": 13, "owng": 14,
    }

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = connect_webdriver(url)

    try:
        # wait for getting data
        time.sleep(api_delay_term)

        # click event for getting defensive data (first detailed-view option)
        driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")[0].find_element_by_css_selector("a").click()

        # wait for getting data
        time.sleep(api_delay_term)

        player_rows = []

        # get player defensive datas
        for element in driver.find_elements_by_css_selector("#team-squad-stats-defensive #player-table-statistics-body tr"):

            # look the cells up once per row; every lookup is a webdriver call
            cells = element.find_elements_by_css_selector("td")

            player_dict = {
                "player_number": cells[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4],
            }
            for column, cell_index in text_cells.items():
                player_dict[column] = cells[cell_index].text

            player_rows.append(player_dict)
    finally:
        # quit (not close) so the browser is shut down even when crawling fails
        driver.quit()

    # make pandas dataframe in one step instead of appending row by row
    player_defensive_df = pd.DataFrame(player_rows, columns=columns, dtype=object)

    return replace_pd(player_defensive_df)

### 4. Crawling Players Offensive Data Function

In [61]:
def crawling_player_offensive(team_id):

    """
    crawl player offensive data of a team

    parameter -------------------------------------------------------------------
    team_id : one of you want team_id of players & parameter data type int or str

    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, cells equal to "-" replaced by 0
    player_number, keyp, fouled, off, disp, unstch

    """

    columns = ["player_number", "keyp", "fouled", "off", "disp", "unstch"]

    # table cell index of every column that is read as plain cell text
    text_cells = {"keyp": 10, "fouled": 12, "off": 13, "disp": 14, "unstch": 15}

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = connect_webdriver(url)

    try:
        # wait for getting data
        time.sleep(api_delay_term)

        # click event for getting offensive data (second detailed-view option)
        driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")[1].find_element_by_css_selector("a").click()

        # wait for getting data
        time.sleep(api_delay_term)

        player_rows = []

        # get player offensive datas
        for element in driver.find_elements_by_css_selector("#statistics-table-offensive #player-table-statistics-body tr"):

            # look the cells up once per row; every lookup is a webdriver call
            cells = element.find_elements_by_css_selector("td")

            player_dict = {
                "player_number": cells[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4],
            }
            for column, cell_index in text_cells.items():
                player_dict[column] = cells[cell_index].text

            player_rows.append(player_dict)
    finally:
        # quit (not close) so the browser is shut down even when crawling fails
        driver.quit()

    # make pandas dataframe in one step instead of appending row by row
    player_offensive_df = pd.DataFrame(player_rows, columns=columns, dtype=object)

    return replace_pd(player_offensive_df)

### 5. Crawling Players Passing Data Function

In [71]:
def crawling_player_passing(team_id):

    """
    crawl player passing data of a team

    parameter -------------------------------------------------------------------
    team_id : one of you want team_id of players & parameter data type int or str

    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, cells equal to "-" replaced by 0
    player_number, avgp, ps, crosses, longb, thrb

    """

    columns = ["player_number", "avgp", "ps", "crosses", "longb", "thrb"]

    # table cell index of every column that is read as plain cell text
    text_cells = {"avgp": 8, "ps": 9, "crosses": 10, "longb": 11, "thrb": 12}

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = connect_webdriver(url)

    try:
        # wait for getting data
        time.sleep(api_delay_term)

        # click event for getting passing data (third detailed-view option)
        driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")[2].find_element_by_css_selector("a").click()

        # wait for getting data
        time.sleep(api_delay_term)

        player_rows = []

        # get player passing datas
        for element in driver.find_elements_by_css_selector("#statistics-table-passing #player-table-statistics-body tr"):

            # look the cells up once per row; every lookup is a webdriver call
            cells = element.find_elements_by_css_selector("td")

            player_dict = {
                "player_number": cells[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4],
            }
            for column, cell_index in text_cells.items():
                player_dict[column] = cells[cell_index].text

            player_rows.append(player_dict)
    finally:
        # quit (not close) so the browser is shut down even when crawling fails
        driver.quit()

    # make pandas dataframe in one step instead of appending row by row
    player_passing_df = pd.DataFrame(player_rows, columns=columns, dtype=object)

    return replace_pd(player_passing_df)

### 6. Crawling Function to Pickle

In [73]:
import pickle

def save_pkl(pkl_file, path):
    """
    pickle an object into a file and print the saved path

    parameter ---------------------------------------------------------------
    pkl_file : object to serialize with pickle
    path : destination file path, opened in binary write mode
           (open() does not create missing directories)

    """
    # the with-block closes the file even if pickling raises
    with open(path, "wb") as target_file:
        pickle.dump(pkl_file, target_file)
    print("Save pkl_file (path:{0})".format(path))

# Each function below is serialized twice: pickle.dumps() turns it into a bytes
# object, and save_pkl() then pickles that bytes object into the file. Reading a
# file back therefore needs pickle.load() followed by pickle.loads().
# NOTE(review): pickle stores plain functions by qualified name, not by value, so
# the loading process presumably needs these functions defined - confirm.
# NOTE(review): crawling_player_summary is not saved here - confirm this is intended.

# make crawling_league_teams function pkl file & save
crawling_team = pickle.dumps(crawling_league_teams)
save_pkl(crawling_team, "./pkl/crawling_team.pkl")

# make crawling_player_defensive function pkl file & save
crawling_defensive = pickle.dumps(crawling_player_defensive)
save_pkl(crawling_defensive, "./pkl/crawling_defensive.pkl")
    
# make crawling_player_offensive function pkl file & save
crawling_offensive = pickle.dumps(crawling_player_offensive)
save_pkl(crawling_offensive, "./pkl/crawling_offensive.pkl")
    
# make crawling_player_passing function pkl file & save
crawling_passing = pickle.dumps(crawling_player_passing)
save_pkl(crawling_passing, "./pkl/crawling_passing.pkl")

Save pkl_file (path:./pkl/crawling_team.pkl)
Save pkl_file (path:./pkl/crawling_defensive.pkl)
Save pkl_file (path:./pkl/crawling_offensive.pkl)
Save pkl_file (path:./pkl/crawling_passing.pkl)


In [74]:
# load pickle
# the file holds a pickled bytes object which is itself a pickled function,
# so it is unpickled twice (pickle.load, then pickle.loads)
# NOTE: unpickling can run arbitrary code - only load files from a trusted source
with open("./pkl/crawling_team.pkl", "rb") as pkl_source:
    # the with-block closes the file handle once the function is loaded
    crawling_team_pkl_function = pickle.loads(pickle.load(pkl_source))

In [76]:
# using loaded pickle function
# call it with team id 13 and show the first rows of the returned dataframe
crawling_team_pkl_function(13).head()

Unnamed: 0,team_id,team_name
0,13,Arsenal
1,24,Aston Villa
2,183,Bournemouth
3,15,Chelsea
4,162,Crystal Palace
