In [5]:
import time
import pandas as pd
import time
import pickle
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [6]:
# Smoke test: start a ChromeDriver session, open the site's landing page, shut down.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    driver.implicitly_wait(3)
    # Well-formed URL: the scheme separator is "://".
    driver.get('https://1xbet.whoscored.com/')
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()

In [7]:
def crawling_league_teams(region, tournaments, api_delay_term=5, standings_id=None):
    """
    Crawl the team ids and team names listed in a league's standings table.

    Args:
        region : region id (from whoscored) for a certain league
            ex) england --- 252, spain --- 206
        tournaments : tournament (or league) id ex) premier league --- 2
        api_delay_term : seconds to sleep after opening the page, so the
            standings table has time to load before it is read
        standings_id : optional number embedded in the standings container's
            element id ("standings-<standings_id>-content"). When omitted, the
            first element whose id starts with "standings-" and ends with
            "-content" is used, so the function is not tied to one page.

    Returns:
        pandas DataFrame with columns team_id, team_name (string values),
        one row per "a.team-link" element found inside the standings container.
    """
    url = "https://1xbet.whoscored.com/Regions/" + str(region) + "/Tournaments/" + str(tournaments)
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        driver.get(url)

        # wait for the league team data to be rendered
        time.sleep(api_delay_term)

        if standings_id is None:
            # NOTE(review): assumes the first id matching this pattern is the
            # standings body — confirm against the live page.
            standings = driver.find_element(
                By.CSS_SELECTOR, "[id^='standings-'][id$='-content']")
        else:
            standings = driver.find_element(
                By.ID, "standings-" + str(standings_id) + "-content")

        records = []
        for link in standings.find_elements(By.CSS_SELECTOR, "a.team-link"):
            records.append({
                # href is split on "/" and the fifth piece is used as the team id
                "team_id": link.get_attribute("href").split("/")[4],
                # textContent rather than innerHTML: innerHTML is serialized
                # markup, so e.g. "&" would come back as "&amp;"
                "team_name": link.get_attribute("textContent").strip(),
            })
    finally:
        # always end the WebDriver session, even when scraping fails
        driver.quit()

    # build the frame once instead of appending row by row
    return pd.DataFrame(records, columns=["team_id", "team_name"])

In [8]:
# Scrape the team list for region 252, tournament 2 (the Premier League ids
# given in the crawling_league_teams docstring).
PL = crawling_league_teams(region=252, tournaments=2)

In [9]:
# Display the scraped team_id / team_name table.
PL

Unnamed: 0,team_id,team_name
0,26,Liverpool
1,13,Arsenal
2,174,Nottingham Forest
3,15,Chelsea
4,167,Manchester City
5,23,Newcastle
6,211,Brighton
7,170,Fulham
8,24,Aston Villa
9,183,Bournemouth


## League Table 

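Premier League table (labelled 2022-2023 when this notebook was written; the URL contains no season identifier, so the saved outputs presumably reflect whichever season the site showed at run time)
- URL pattern: 'https://1xbet.whoscored.com/Regions/[region id]/Tournaments/[tournament id]'
- For the Premier League: region id = 252 (England), tournament (league) id = 2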

In [10]:
# League page used by the table scrapers below: region id 252, tournament id 2.
URL = "https://1xbet.whoscored.com/Regions/{}/Tournaments/{}".format(252, 2)

In [17]:
def league_table(URL, api_delay_term=5):
    """
    Crawl the league table (seasonal statistics) shown on a league page.

    Args:
        URL : league table URL
        api_delay_term : seconds to sleep after opening the page, so the
            table has time to load before it is read

    Returns:
        pandas DataFrame with one row per team and the columns
        team_number, team_name, P, W, D, L, GF, GA, GD, Pts (string values).
        Rows of the table that do not carry a team link followed by the eight
        statistic cells (e.g. header rows) are skipped.
    """
    stat_columns = ["P", "W", "D", "L", "GF", "GA", "GD", "Pts"]

    def leading_text(element):
        # text of the element up to (not including) the first tab character
        return element.get_attribute("textContent").split("\t")[0]

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        driver.get(str(URL))
        time.sleep(api_delay_term)

        standings = driver.find_element(By.CLASS_NAME, "standings")
        records = []
        for row in standings.find_elements(By.CSS_SELECTOR, "tr"):
            # fetch the row's cells once instead of once per column
            cells = row.find_elements(By.CSS_SELECTOR, "td")
            if len(cells) <= len(stat_columns):
                continue  # not a data row
            links = cells[0].find_elements(By.CSS_SELECTOR, "a")
            if not links:
                continue  # first cell has no team link
            record = {
                # href is split on "/" and the fifth piece is used as the team id
                "team_number": links[0].get_attribute("href").split("/")[4],
                "team_name": leading_text(links[0]),
            }
            # cells 1..8 hold the statistics, in the same order as stat_columns
            for column, cell in zip(stat_columns, cells[1:]):
                record[column] = leading_text(cell)
            records.append(record)
    finally:
        # always end the WebDriver session, even when scraping fails
        driver.quit()

    # build the frame once instead of appending row by row
    return pd.DataFrame(records, columns=["team_number", "team_name"] + stat_columns)

In [18]:
# Scrape the standings table from the league page at URL.
PL2223 = league_table(URL=URL)

In [19]:
# Display the scraped standings table.
PL2223

Unnamed: 0,team_number,team_name,P,W,D,L,GF,GA,GD,Pts
0,26,Liverpool,29,21,7,1,69,27,42,70
1,13,Arsenal,29,16,10,3,53,24,29,58
2,174,Nottingham Forest,29,16,6,7,49,35,14,54
3,15,Chelsea,29,14,7,8,53,37,16,49
4,167,Manchester City,29,14,6,9,55,40,15,48
5,23,Newcastle,28,14,5,9,47,38,9,47
6,211,Brighton,29,12,11,6,48,42,6,47
7,170,Fulham,29,12,9,8,43,38,5,45
8,24,Aston Villa,29,12,9,8,41,45,-4,45
9,183,Bournemouth,29,12,8,9,48,36,12,44


In [20]:
def _leading_text(element):
    """Return the element's textContent up to (not including) the first tab."""
    return element.get_attribute("textContent").split("\t")[0]


def _data_cells(table, min_cells):
    """Return the <td> list of every <tr> in table that has at least min_cells cells."""
    cell_rows = []
    for row in table.find_elements(By.CSS_SELECTOR, "tr"):
        cells = row.find_elements(By.CSS_SELECTOR, "td")
        if len(cells) >= min_cells:
            cell_rows.append(cells)
    return cell_rows


def _team_name(cell, ranked=False):
    """Return the text of the first link in cell (None if there is no link).

    With ranked=True a leading "<number>. " prefix is removed. Only the first
    ". " is treated as the separator, so "1. St. Pauli" yields "St. Pauli".
    """
    links = cell.find_elements(By.CSS_SELECTOR, "a")
    if not links:
        return None
    label = _leading_text(links[0])
    if ranked:
        rank, separator, name = label.partition(". ")
        if separator and rank.strip().isdigit():
            return name
    return label


def _stat_records(table, stat_columns, first_stat_cell=1, ranked=False):
    """Return one dict per data row: team_name plus consecutive cells from first_stat_cell."""
    records = []
    for cells in _data_cells(table, first_stat_cell + len(stat_columns)):
        name = _team_name(cells[0], ranked)
        if name is None:
            continue
        record = {"team_name": name}
        for column, cell in zip(stat_columns, cells[first_stat_cell:]):
            record[column] = _leading_text(cell)
        records.append(record)
    return records


def league_table_added(URL, api_delay_term=3):
    """
    Crawl the league table together with additional per-team features
    ex) shots per game, tackles per game ... etc.

    The standings table is read first; then the third entry of the page's
    sub-navigation is clicked and three statistics tables (summary, defensive,
    offensive) are read. All four are left-merged on team_name.

    Args:
        URL : league table URL
        api_delay_term : seconds to sleep after the page load and after each
            click, so the newly shown table has time to load

    Returns:
        pandas DataFrame (string values) with the columns
        team_name, P, W, D, L, GF, GA, GD, Pts,
        Goals, Shots pg, Yellow, Red, Poss%, Pass%, A_Won, Rating,
        Shoted pg, Tackles pg, Intercept pg, Fouls pg, Offsides pg,
        Shots OT pg, Dribbles pg, Fouled pg.
        Teams missing from a statistics table get NaN in its columns.
    """
    standings_columns = ["P", "W", "D", "L", "GF", "GA", "GD", "Pts"]
    summary_columns = ["Goals", "Shots pg", "Yellow", "Red", "Poss%", "Pass%", "A_Won", "Rating"]
    defensive_columns = ["Shoted pg", "Tackles pg", "Intercept pg", "Fouls pg", "Offsides pg"]
    offensive_columns = ["Shots OT pg", "Dribbles pg", "Fouled pg"]

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        driver.get(str(URL))
        time.sleep(api_delay_term)

        # --- standings table (names here carry no rank prefix) ---
        standings = driver.find_elements(By.CLASS_NAME, "standings")[0]
        standings_records = _stat_records(standings, standings_columns)

        # --- open the statistics view ---
        # NOTE(review): the third <li> of #sub-navigation is presumably the
        # team-statistics tab — confirm against the live page.
        navigation = driver.find_elements(By.ID, "sub-navigation")[0]
        navigation.find_elements(By.CSS_SELECTOR, "li")[2].click()
        # wait AFTER the click so the summary table is loaded before reading it
        time.sleep(api_delay_term)

        # --- summary table: the fourth cell holds two <span>s (yellow, red) ---
        summary = driver.find_elements(By.ID, "top-team-stats-summary-content")[0]
        summary_records = []
        for cells in _data_cells(summary, 8):
            name = _team_name(cells[0], ranked=True)
            cards = cells[3].find_elements(By.CSS_SELECTOR, "span")
            if name is None or len(cards) < 2:
                continue
            summary_records.append({
                "team_name": name,
                "Goals": _leading_text(cells[1]),
                "Shots pg": _leading_text(cells[2]),
                "Yellow": _leading_text(cards[0]),
                "Red": _leading_text(cards[1]),
                "Poss%": _leading_text(cells[4]),
                "Pass%": _leading_text(cells[5]),
                "A_Won": _leading_text(cells[6]),
                "Rating": _leading_text(cells[7]),
            })

        # --- defensive table: statistics are in cells 1..5 ---
        driver.find_element(By.CSS_SELECTOR, "a[href='#stage-team-stats-defensive']").click()
        time.sleep(api_delay_term)
        defensive = driver.find_elements(By.ID, "statistics-team-table-defensive")[0]
        defensive = defensive.find_elements(By.ID, "top-team-stats-summary-content")[0]
        defensive_records = _stat_records(defensive, defensive_columns, ranked=True)

        # --- offensive table: cell 1 is skipped, statistics are in cells 2..4 ---
        driver.find_element(By.CSS_SELECTOR, "a[href='#stage-team-stats-offensive']").click()
        time.sleep(api_delay_term)
        offensive = driver.find_elements(By.ID, "statistics-team-table-offensive")[0]
        offensive = offensive.find_elements(By.ID, "top-team-stats-summary-content")[0]
        offensive_records = _stat_records(
            offensive, offensive_columns, first_stat_cell=2, ranked=True)
    finally:
        # always end the WebDriver session, even when scraping fails
        driver.quit()

    # build each frame once, then left-merge everything onto the standings
    team_stat_df = pd.DataFrame(standings_records, columns=["team_name"] + standings_columns)
    for records, columns in (
        (summary_records, summary_columns),
        (defensive_records, defensive_columns),
        (offensive_records, offensive_columns),
    ):
        stat_df = pd.DataFrame(records, columns=["team_name"] + columns)
        team_stat_df = pd.merge(team_stat_df, stat_df, how='left', on='team_name')

    return team_stat_df

In [21]:
# Scrape the standings plus the summary / defensive / offensive team statistics.
df = league_table_added(URL=URL)

In [42]:
# Preview the first rows of the merged table.
# NOTE(review): this cell's execution count (42) is higher than the next cell's (22),
# so its saved output may come from a different run than the `df` assigned above —
# re-run the notebook top to bottom before relying on it.
df.head()

Unnamed: 0,team_name,P,W,D,L,GF,GA,GD,Pts,Goals,...,A_Won,Rating,Shoted pg,Tackles pg,Intercept pg,Fouls pg,Offsides pg,Shots OT pg,Dribbles pg,Fouled pg
0,Arsenal,17,14,2,1,40,14,26,44,40,...,11.2,6.86,8.2,14.9,7.3,10.2,1.4,5.7,8.2,11.4
1,Manchester City,17,12,3,2,45,16,29,39,45,...,12.7,6.93,7.1,12.9,6.4,8.5,1.5,6.1,7.6,8.8
2,Newcastle,18,9,8,1,32,11,21,35,32,...,14.3,6.82,11.1,16.7,9.1,11.1,2.3,5.0,5.9,10.3
3,Manchester United,17,11,2,4,27,20,7,35,27,...,12.5,6.74,11.9,15.6,9.9,11.6,2.1,5.5,5.2,7.2
4,Tottenham,18,10,3,5,37,25,12,33,37,...,13.4,6.75,14.3,15.9,9.3,10.1,1.8,5.8,6.6,8.2


In [22]:
# Display the full merged league/statistics table.
df

Unnamed: 0,team_name,P,W,D,L,GF,GA,GD,Pts,Goals,...,A_Won,Rating,Shoted pg,Tackles pg,Intercept pg,Fouls pg,Offsides pg,Shots OT pg,Dribbles pg,Fouled pg
0,Liverpool,29,21,7,1,69,27,42,70,69,...,10.8,6.87,9.7,17.4,8.1,11.6,1.4,6.3,8.3,9.3
1,Arsenal,29,16,10,3,53,24,29,58,53,...,13.4,6.79,9.7,16.1,6.2,11.5,2.2,4.8,8.5,10.6
2,Nottingham Forest,29,16,6,7,49,35,14,54,49,...,15.0,6.72,13.6,18.3,8.7,10.6,2.3,4.6,7.4,11.0
3,Chelsea,29,14,7,8,53,37,16,49,53,...,10.8,6.69,10.9,15.5,7.3,11.8,1.6,5.8,8.8,11.9
4,Manchester City,29,14,6,9,55,40,15,48,55,...,8.0,6.73,9.9,13.3,6.0,7.4,1.2,5.8,10.1,9.9
5,Newcastle,28,14,5,9,47,38,9,47,47,...,12.3,6.69,13.1,16.3,7.7,10.6,2.0,4.4,7.7,12.3
6,Brighton,29,12,11,6,48,42,6,47,48,...,14.0,6.62,11.1,18.1,8.5,11.7,1.1,4.8,9.3,11.3
7,Fulham,29,12,9,8,43,38,5,45,43,...,13.9,6.64,11.0,17.9,8.6,11.2,1.1,4.6,7.9,9.9
8,Aston Villa,29,12,9,8,41,45,-4,45,41,...,10.4,6.56,12.1,16.8,6.6,11.4,1.7,4.2,9.0,13.1
9,Bournemouth,29,12,8,9,48,36,12,44,48,...,15.0,6.75,13.5,19.4,10.0,13.8,2.0,5.8,8.1,10.2


In [43]:
# Optional export: uncomment to write the merged table to a CSV file in the
# current working directory.
#df.to_csv('PL2223_league_table.csv')