In [70]:
import os
import time
import random
import pandas as pd
import cloudscraper
from bs4 import BeautifulSoup, Comment

# Initialize a Cloudflare-bypassing scraper
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True}
)
BASE_URL = "https://fbref.com"

# League IDs and names
LEAGUES = {
    'Premier League': 9,
    'La Liga':       12,
    'Bundesliga':    20,
    'Serie A':       11,
    'Ligue 1':       13,
}
# Categories to scrape ('' stands for overall)
CATEGORIES = {
    'overall':      '',
    'shooting':     'shooting',
    'passing':      'passing',
    'gca':          'gca',
    'defense':      'defense',
    'possession':   'possession',
    'misc':         'misc',
}
# polite delay range
DELAY_MIN, DELAY_MAX = 1.5, 3.0


def scrape_league_category(league_name: str, comp_id: int, category_key: str) -> pd.DataFrame:
    """
    Scrape one stats category for a given league and return a DataFrame.
    Adds columns: League, Category.
    """
    cat = CATEGORIES[category_key]
    # build URL
    if cat:
        url = f"{BASE_URL}/en/comps/{comp_id}/{cat}/{league_name.replace(' ', '-')}-Stats"
    else:
        url = f"{BASE_URL}/en/comps/{comp_id}/{league_name.replace(' ', '-')}-Stats"

    print(f"Fetching {league_name} - {category_key}...")
    time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))
    resp = scraper.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    # find commented table block
    comment = soup.find(
        string=lambda text: isinstance(text, Comment) and f"div_stats_{cat or 'standard'}" in text
    )
    if comment:
        tbl_soup = BeautifulSoup(comment, 'lxml')
        table = tbl_soup.find('table')
    else:
        # fallback to first stats_table
        table = soup.find('table', class_='stats_table')

    if table is None:
        print(f"Warning: no table for {league_name} {category_key}")
        return pd.DataFrame()

    df = pd.read_html(str(table))[0]
    df['League'] = league_name
    df['Category'] = category_key
    return df


def main():
    # Collect all league-category DataFrames
    dfs = []
    for league, comp_id in LEAGUES.items():
        for cat_key in CATEGORIES:
            df = scrape_league_category(league, comp_id, cat_key)
            if not df.empty:
                dfs.append(df)

    # concatenate all
    all_stats = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
    print(f"Combined DataFrame shape: {all_stats.shape}")

    # save to CSV
    os.makedirs('data', exist_ok=True)
    all_stats.to_csv('data/top5_all_categories.csv', index=False)
    return all_stats


if __name__ == '__main__':
    master_df = main()
    print(master_df.head())


Fetching Premier League - overall...


HTTPError: 429 Client Error: Too Many Requests for url: https://fbref.com/en/comps/9/Premier-League-Stats

In [69]:
combined_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Playing Time,Playing Time,Playing Time,...,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 36_level_0,League
Unnamed: 0_level_1,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches,Unnamed: 21_level_1
0,1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Matches,Premier League
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,...,0.00,0.00,0.00,0.12,0.00,0.12,0.12,0.12,Matches,Premier League
2,3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,...,0.14,0.00,0.14,0.07,0.05,0.12,0.07,0.12,Matches,Premier League
3,4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,...,0.13,0.06,0.13,0.06,0.01,0.07,0.06,0.07,Matches,Premier League
4,5,Simon Adingra,ci CIV,"FW,MF",Brighton,22,2002,29,12,1097,...,0.33,0.16,0.33,0.20,0.20,0.40,0.20,0.40,Matches,Premier League
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2961,550,Luck Zogbé,ci CIV,DF,Brest,19,2005,13,7,638,...,0.00,0.00,0.00,0.04,0.02,0.06,0.04,0.06,Matches,Ligue 1
2962,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches,Ligue 1
2963,551,Aristide Zossou,ci CIV,MF,Auxerre,19,2005,3,0,16,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Matches,Ligue 1
2964,552,Yanis Zouaoui,dz ALG,"DF,FW",Le Havre,30,1994,17,9,976,...,0.18,0.00,0.18,0.01,0.27,0.28,0.01,0.28,Matches,Ligue 1
