## FBref Data Scraping

In [31]:
# Package for scraping
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
from io import StringIO

# Lookup tables used to build FBref URLs.
#
# stat_type_dict: human-readable statistic category -> URL path segment
# (also reused to derive the HTML table id for most categories).
stat_type_dict = dict(
    (
        # Outfield-player tables
        ("Standard", "standard"),
        ("Shooting", "shooting"),
        ("Passing", "passing"),
        ("Pass Types", "passing_types"),
        ("Goal and Shot Creation", "gca"),
        ("Defensive Action", "defense"),
        ("Possession", "possession"),
        ("Playing Time", "playingtime"),
        # Goalkeeper tables
        ("Goalkeeping", "keepers"),
        ("Goalkeeping Advanced", "keepersadv"),
    )
)

# league_id_dict: league display name -> numeric competition id placed in
# the `/comps/<id>/` part of the URL.
league_id_dict = dict(
    (
        ("Premier League", 9),
        ("La Liga", 12),
        ("Bundesliga", 20),
        ("Serie A", 11),
        ("Ligue 1", 13),
        ("Eredivisie", 23),
        ("Primeira Liga", 32),
        ("Belgian Pro League", 37),
    )
)

# --- Private helpers for get_fbref_stats ---------------------------------

def _fbref_find_table(soup, table_id):
    """Return the <table> with the given id, or None if the page has none.

    The visible document is searched first; if the table is not there, each
    HTML comment that mentions the id is parsed and searched as well.
    """
    table = soup.find("table", {"id": table_id})
    if table is not None:
        return table

    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        # Cheap substring test so unrelated comments are not re-parsed.
        if table_id not in comment:
            continue
        table = BeautifulSoup(comment, "html.parser").find("table", {"id": table_id})
        if table is not None:
            return table
    return None


def _fbref_table_to_frame(table, non_numeric_cols):
    """Convert a parsed <table> into a cleaned DataFrame.

    Steps: flatten two-level headers, drop header rows repeated inside the
    table body, and coerce every column not listed in *non_numeric_cols* to
    numbers (unparseable cells become 0).
    """
    frame = pd.read_html(StringIO(str(table)), flavor='lxml')[0]

    # Flatten first: while the columns are a MultiIndex the first label is a
    # tuple, and comparing cell values against a tuple never matches, so the
    # repeated header rows would not be removed.
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = [
            col[1] if str(col[0]).startswith('Unnamed') or col[0] == col[1]
            else f"{col[0]}_{col[1]}"
            for col in frame.columns
        ]

    # A repeated header row carries the column name in its first cell.
    first_label = str(frame.columns[0])
    keep = frame.iloc[:, 0].astype(str) != first_label
    # reset_index returns a fresh frame, so the assignments below do not
    # write into a slice of the original.
    frame = frame[keep].reset_index(drop=True)

    for col in frame.columns:
        if col not in non_numeric_cols:
            frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0)
    return frame


def _fbref_playing_time_mask(frame, min_matches, min_minutes):
    """Boolean mask of rows meeting the matches-played and minutes thresholds."""
    return (
        (frame["Playing Time_MP"] >= min_matches)
        & (frame["Playing Time_Min"] >= min_minutes)
    )


def get_fbref_stats(stat_type, season_str, league_name, min_matches=5, min_minutes=150):
    """Scrape a player-level statistics table from FBref.

    Parameters
    ----------
    stat_type : str
        A key of ``stat_type_dict`` (e.g. "Standard", "Passing", "Goalkeeping").
    season_str : str
        Season as it appears in the URL, e.g. "2023-2024".
    league_name : str
        A key of ``league_id_dict`` (e.g. "Premier League").
    min_matches : int, optional
        Minimum "Playing Time_MP" a player needs to be kept (default 5).
    min_minutes : int, optional
        Minimum "Playing Time_Min" a player needs to be kept (default 150).

    Returns
    -------
    tuple
        ``(DataFrame, None)`` on success or ``(None, error_message)`` when a
        page cannot be loaded, a table cannot be found, or parsing fails.

    Raises
    ------
    KeyError
        If *stat_type* or *league_name* is not a known dictionary key, or if
        the table used for filtering lacks the playing-time columns.

    Notes
    -----
    "Standard" and "Goalkeeping" tables are filtered on their own
    playing-time columns. Every other stat type triggers a second request
    for a reference table (the keepers table for "Goalkeeping Advanced",
    the standard table otherwise) and keeps the rows whose "Player" value
    appears among the eligible players of that reference table.
    """
    league_id = league_id_dict[league_name]
    league_name_url = league_name.replace(" ", "-")

    # The standard page uses the "stats" path segment rather than "standard".
    url_stat_type = "stats" if stat_type == "Standard" else stat_type_dict[stat_type]
    url = f"https://fbref.com/en/comps/{league_id}/{season_str}/{url_stat_type}/{season_str}-{league_name_url}-Stats"

    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # The context manager closes the HTTP response once it is parsed.
        with urlopen(Request(url, headers=headers)) as response:
            print(f"Requesting URL: {url}")
            soup = BeautifulSoup(response, 'html.parser')
    except Exception as e:
        return None, f"Error loading page: {e}"

    # A few table ids do not follow the "stats_<url segment>" pattern.
    table_id = {
        "Playing Time": "stats_playing_time",
        "Goalkeeping": "stats_keeper",
        "Goalkeeping Advanced": "stats_keeper_adv"
    }.get(stat_type, "stats_standard" if stat_type == "Standard" else f"stats_{stat_type_dict[stat_type]}")

    table_html = _fbref_find_table(soup, table_id)
    if table_html is None:
        return None, f"Stats table '{table_id}' not found at {url}"

    non_numeric_cols = {"Player", "Nation", "Pos", "Squad", "Age", "Born"}

    try:
        df = _fbref_table_to_frame(table_html, non_numeric_cols)
    except Exception as e:
        return None, f"Error parsing table HTML: {e}"

    # Drop the rank and match-log link columns if they exist.
    df = df.drop(columns=[c for c in df.columns if c.lower() in ['rk', 'matches']], errors='ignore')

    # Remaining gaps (in the non-numeric columns) are filled with 0 as well.
    df = df.fillna(0)

    # These tables carry their own playing-time columns: filter in place.
    if stat_type in ("Standard", "Goalkeeping"):
        mask = _fbref_playing_time_mask(df, min_matches, min_minutes)
        return df[mask].reset_index(drop=True), None

    # Otherwise eligibility comes from a reference table on another page.
    if stat_type == "Goalkeeping Advanced":
        std_segment, std_table_id = "keepers", "stats_keeper"
    else:
        std_segment, std_table_id = "stats", "stats_standard"
    std_url = f"https://fbref.com/en/comps/{league_id}/{season_str}/{std_segment}/{season_str}-{league_name_url}-Stats"

    try:
        with urlopen(Request(std_url, headers=headers)) as response_std:
            soup_std = BeautifulSoup(response_std, 'html.parser')
    except Exception as e:
        return None, f"Error loading standard stats page: {e}"

    std_table_html = _fbref_find_table(soup_std, std_table_id)
    if std_table_html is None:
        return None, f"Standard stats table not found at {std_url}"

    try:
        std_df = _fbref_table_to_frame(std_table_html, non_numeric_cols)
    except Exception as e:
        return None, f"Error parsing standard stats table: {e}"

    eligible = _fbref_playing_time_mask(std_df, min_matches, min_minutes)
    valid_players = std_df.loc[eligible, "Player"]

    # NOTE: matching is by player name only, so every row of a name that is
    # eligible in the reference table is kept.
    df = df[df["Player"].isin(valid_players)].reset_index(drop=True)

    return df, None

In [32]:
# Example call: player playing-time table. The trailing bare `df` makes the
# notebook render the returned frame as the cell output.
df, error = get_fbref_stats(
    stat_type="Playing Time",
    season_str="2023-2024",
    league_name="Premier League",
)
df

Requesting URL: https://fbref.com/en/comps/9/2023-2024/playingtime/2023-2024-Premier-League-Stats


Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,Playing Time_MP,Playing Time_Min,Playing Time_Mn/MP,Playing Time_Min%,...,Team Success_onG,Team Success_onGA,Team Success_+/-,Team Success_+/-90,Team Success_On-Off,Team Success (xG)_onxG,Team Success (xG)_onxGA,Team Success (xG)_xG+/-,Team Success (xG)_xG+/-90,Team Success (xG)_On-Off
0,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20.0,1237.0,62.0,36.2,...,12.0,34.0,-22.0,-1.60,-1.97,14.2,27.1,-12.8,-0.94,-1.37
1,Tosin Adarabioyo,eng ENG,DF,Fulham,25,1997,20.0,1617.0,81.0,47.3,...,28.0,25.0,3.0,0.17,0.62,27.6,28.0,-0.5,-0.03,0.56
2,Elijah Adebayo,eng ENG,FW,Luton Town,25,1998,27.0,1419.0,53.0,41.5,...,30.0,36.0,-6.0,-0.38,0.83,22.2,28.4,-6.3,-0.40,0.92
3,Simon Adingra,ci CIV,FW,Brighton,21,2002,31.0,2222.0,72.0,65.0,...,34.0,38.0,-4.0,-0.16,0.06,33.9,35.9,-2.0,-0.08,-0.34
4,Nayef Aguerd,ma MAR,DF,West Ham,27,1996,21.0,1857.0,88.0,54.3,...,26.0,44.0,-18.0,-0.87,-1.10,25.0,38.2,-13.2,-0.64,-0.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473,Nicolò Zaniolo,it ITA,"FW,MF",Aston Villa,24,1999,25.0,839.0,34.0,24.5,...,17.0,13.0,4.0,0.43,0.05,13.1,14.7,-1.6,-0.18,-0.35
474,Anass Zaroury,ma MAR,"FW,MF",Burnley,22,2000,6.0,152.0,25.0,4.4,...,2.0,4.0,-2.0,-1.18,-0.22,1.8,2.8,-1.1,-0.64,0.15
475,Oleksandr Zinchenko,ua UKR,DF,Arsenal,26,1996,27.0,1722.0,64.0,50.4,...,38.0,15.0,23.0,1.20,-0.87,38.5,12.1,26.4,1.38,0.22
476,Kurt Zouma,fr FRA,DF,West Ham,28,1994,33.0,2838.0,86.0,83.0,...,53.0,63.0,-10.0,-0.32,0.30,43.4,59.2,-15.7,-0.50,-0.03


## FBref Scraping - Team Version

In [7]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
from io import StringIO

def get_fbref_team_stats(stat_type, season_str, league_name):
    """Scrape a squad-level statistics table from FBref.

    Parameters
    ----------
    stat_type : str
        A key of ``stat_type_dict``; its value becomes the URL path segment.
    season_str : str
        Season as it appears in the URL, e.g. "2023-2024".
    league_name : str
        A key of ``league_id_dict``.

    Returns
    -------
    tuple
        ``(DataFrame, None)`` on success or ``(None, error_message)`` when
        the page cannot be loaded, no matching table exists, or parsing fails.

    Raises
    ------
    KeyError
        If *stat_type* or *league_name* is not a known dictionary key.

    Notes
    -----
    The table is picked by caption, not by id: the first table whose caption
    starts with "Squad" is used, looking at the visible document first and
    then inside HTML comments.
    """
    # URL construction
    league_id = league_id_dict[league_name]
    league_name_url = league_name.replace(" ", "-")
    stat_suffix = stat_type_dict[stat_type]
    url = f"https://fbref.com/en/comps/{league_id}/{season_str}/{stat_suffix}/{season_str}-{league_name_url}-Stats"

    headers = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=headers)

    # Try to load page; the context manager closes the HTTP response.
    try:
        with urlopen(req) as html:
            print(f"Requesting team URL: {url}")
            soup = BeautifulSoup(html, 'html.parser')
    except Exception as e:
        return None, f"Error loading team stats page: {e}"

    # --- Helper: find table by caption ---
    def find_table_by_caption(soup, caption_startswith="Squad"):
        def first_match(tables):
            # First table whose caption text starts with the wanted prefix.
            for table in tables:
                caption = table.find("caption")
                if caption and caption.text.strip().startswith(caption_startswith):
                    return table
            return None

        # First check visible tables
        found = first_match(soup.find_all("table"))
        if found is not None:
            return found

        # Then check within HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            # Skip comments that cannot contain a table instead of parsing them.
            if "<table" not in comment:
                continue
            found = first_match(BeautifulSoup(comment, "html.parser").find_all("table"))
            if found is not None:
                return found
        return None

    table_html = find_table_by_caption(soup, caption_startswith="Squad")

    if table_html is None:
        return None, f"No team-level table found for stat type '{stat_type}'"

    # Parse table into DataFrame
    try:
        df = pd.read_html(StringIO(str(table_html)), flavor='lxml')[0]
    except Exception as e:
        return None, f"Error parsing table HTML: {e}"

    # Flatten column names first: while the columns are a MultiIndex the
    # first label is a tuple, and comparing cells against a tuple never
    # matches, so the header-row cleanup below would be a no-op.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            col[1] if str(col[0]).startswith('Unnamed') or col[0] == col[1]
            else f"{col[0]}_{col[1]}"
            for col in df.columns
        ]

    # Clean table: drop any header row repeated inside the body. Such a row
    # carries the column name in its first cell.
    first_label = str(df.columns[0])
    df = df[df.iloc[:, 0].astype(str) != first_label].reset_index(drop=True)

    # Convert numeric columns; unparseable cells become 0.
    non_numeric_cols = {"Squad", "Country"}
    for col in df.columns:
        if col not in non_numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    df = df.fillna(0)

    return df, None

In [9]:
# Example call: squad-level standard table. The trailing bare `df` makes the
# notebook render the returned frame as the cell output.
df, error = get_fbref_team_stats("Standard", "2023-2024", "Belgian Pro League")
df

Requesting team URL: https://fbref.com/en/comps/37/2023-2024/standard/2023-2024-Belgian-Pro-League-Stats


Unnamed: 0,Squad,# Pl,Age,Poss,Playing Time_MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Performance_Gls,Performance_Ast,...,Per 90 Minutes_Gls,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG
0,Anderlecht,32,26.4,53.0,30,330,2700,30.0,56,41,...,1.87,1.37,3.23,1.63,3.0,1.59,1.15,2.74,1.42,2.57
1,Antwerp,27,24.7,59.3,30,330,2700,30.0,52,36,...,1.73,1.2,2.93,1.57,2.77,1.72,1.21,2.93,1.48,2.69
2,Cercle Brugge,27,23.3,45.1,30,330,2700,30.0,43,34,...,1.43,1.13,2.57,1.37,2.5,1.7,1.23,2.93,1.62,2.85
3,Charleroi,30,25.5,47.8,30,330,2700,30.0,26,20,...,0.87,0.67,1.53,0.77,1.43,1.1,0.83,1.93,0.99,1.82
4,Club Brugge,29,25.7,56.8,30,330,2700,30.0,60,39,...,2.0,1.3,3.3,1.73,3.03,2.0,1.4,3.4,1.74,3.14
5,Eupen,26,25.4,44.7,30,330,2700,30.0,24,18,...,0.8,0.6,1.4,0.77,1.37,0.96,0.71,1.67,0.93,1.65
6,Genk,26,24.0,53.9,30,330,2700,30.0,49,30,...,1.63,1.0,2.63,1.57,2.57,1.98,1.43,3.41,1.84,3.27
7,Gent,32,26.4,54.3,30,330,2700,30.0,50,30,...,1.67,1.0,2.67,1.53,2.53,1.66,1.13,2.79,1.5,2.63
8,Kortrijk,36,24.6,43.0,30,330,2700,30.0,22,13,...,0.73,0.43,1.17,0.67,1.1,0.85,0.57,1.42,0.82,1.39
9,Mechelen,30,26.5,50.9,30,330,2700,30.0,37,28,...,1.23,0.93,2.17,1.13,2.07,1.41,1.04,2.45,1.31,2.35
