In [1]:
import os
import time
import random
import pandas as pd
import cloudscraper
from bs4 import BeautifulSoup, Comment
import requests

# Initialize cloudscraper session
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True}
)
BASE_URL = "https://fbref.com"

# Top-5 leagues: name -> comp_id
LEAGUES = {
    'Premier League': 9,
    'La Liga':       12,
    'Bundesliga':    20,
    'Serie A':       11,
    'Ligue 1':       13,
}
# Stats categories ('' means overall)
CATEGORIES = {
    'overall':    '',
    'shooting':   'shooting',
    'passing':    'passing',
    'gca':        'gca',
    'defense':    'defense',
    'possession': 'possession',
    'misc':       'misc',
}

# Request throttling and retry settings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/16.5 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0',
]
PROXIES = []  # e.g. ['http://proxy1:port', ...]
DELAY_MIN = 3
DELAY_MAX = 6
MAX_RETRIES = 5
BACKOFF_FACTOR = 2


def get_with_retries(url, referer=None):
    """
    Fetch URL with retries on 429/network errors. Uses exponential backoff and jitter.
    """
    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        if referer:
            headers['Referer'] = referer
        proxies = None
        if PROXIES:
            proxy = random.choice(PROXIES)
            proxies = {'http': proxy, 'https': proxy}
        try:
            resp = scraper.get(url, headers=headers, proxies=proxies)
            if resp.status_code == 429:
                raise requests.exceptions.HTTPError('429 Too Many Requests')
            resp.raise_for_status()
            return resp
        except Exception as e:
            last_err = e
            if attempt == MAX_RETRIES:
                print(f"[Error] {url} failed after {MAX_RETRIES} attempts: {e}")
                raise
            # backoff with jitter
            wait = (BACKOFF_FACTOR ** (attempt - 1)) + random.uniform(0, 1)
            print(f"[Retry {attempt}] {url} error: {e}. Sleeping {wait:.1f}s...")
            time.sleep(wait)
    raise last_err


def scrape_league_category(league_name: str, comp_id: int, category: str) -> pd.DataFrame:
    """
    Scrape stats table for one league and category.
    Returns DataFrame with extra columns: League, Category.
    """
    path = category + '/' if category else ''
    # Build URL
    url = f"{BASE_URL}/en/comps/{comp_id}/{path}{league_name.replace(' ', '-')}-Stats"
    print(f"Fetching {league_name} [{category or 'overall'}]...")
    time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))
    resp = get_with_retries(url)
    soup = BeautifulSoup(resp.text, 'lxml')

    # Look for commented table block first
    block_id = f"div_stats_{category or 'standard'}"
    comment = soup.find(string=lambda t: isinstance(t, Comment) and block_id in t)
    if comment:
        tbl_soup = BeautifulSoup(comment, 'lxml')
        table = tbl_soup.find('table')
    else:
        # fallback to first stats_table
        table = soup.find('table', class_='stats_table')

    if table is None:
        print(f"[Warning] No table for {league_name} {category}")
        return pd.DataFrame()

    df = pd.read_html(str(table))[0]
    df['League'] = league_name
    df['Category'] = category or 'overall'
    return df


def main():
    # Scrape all leagues × categories
    frames = []
    for league, comp_id in LEAGUES.items():
        for cat in CATEGORIES.values():
            try:
                df = scrape_league_category(league, comp_id, cat)
                if not df.empty:
                    frames.append(df)
            except Exception as e:
                print(f"Error scraping {league} {cat}: {e}")

    master_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(f"Master DataFrame shape: {master_df.shape}")

    os.makedirs('data', exist_ok=True)
    master_df.to_csv('data/top5_all_categories.csv', index=False)
    return master_df


if __name__ == '__main__':
    df = main()
    print(df.head())


Fetching Premier League [overall]...


  df = pd.read_html(str(table))[0]


Fetching Premier League [shooting]...


  df = pd.read_html(str(table))[0]


Fetching Premier League [passing]...


  df = pd.read_html(str(table))[0]


Fetching Premier League [gca]...


  df = pd.read_html(str(table))[0]


Fetching Premier League [defense]...


  df = pd.read_html(str(table))[0]


Fetching Premier League [possession]...


  df = pd.read_html(str(table))[0]


Fetching Premier League [misc]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [overall]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [shooting]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [passing]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [gca]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [defense]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [possession]...


  df = pd.read_html(str(table))[0]


Fetching La Liga [misc]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [overall]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [shooting]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [passing]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [gca]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [defense]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [possession]...


  df = pd.read_html(str(table))[0]


Fetching Bundesliga [misc]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [overall]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [shooting]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [passing]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [gca]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [defense]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [possession]...


  df = pd.read_html(str(table))[0]


Fetching Serie A [misc]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [overall]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [shooting]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [passing]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [gca]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [defense]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [possession]...


  df = pd.read_html(str(table))[0]


Fetching Ligue 1 [misc]...


  df = pd.read_html(str(table))[0]


Master DataFrame shape: (17892, 145)
    Rk            Squad    MP     W     D     L    GF    GA    GD   Pts  ...  \
0  1.0        Liverpool  38.0  25.0   9.0   4.0  86.0  41.0  45.0  84.0  ...   
1  2.0          Arsenal  38.0  20.0  14.0   4.0  69.0  34.0  35.0  74.0  ...   
2  3.0  Manchester City  38.0  21.0   8.0   9.0  72.0  44.0  28.0  71.0  ...   
3  4.0          Chelsea  38.0  20.0   9.0   9.0  64.0  43.0  21.0  69.0  ...   
4  5.0    Newcastle Utd  38.0  20.0   6.0  12.0  68.0  47.0  21.0  66.0  ...   

   (Performance, Crs)  (Performance, Int)  (Performance, TklW)  \
0                 NaN                 NaN                  NaN   
1                 NaN                 NaN                  NaN   
2                 NaN                 NaN                  NaN   
3                 NaN                 NaN                  NaN   
4                 NaN                 NaN                  NaN   

   (Performance, PKwon)  (Performance, PKcon)  (Performance, OG)  \
0                

In [3]:
df.head(30)

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,"(Performance, Crs)","(Performance, Int)","(Performance, TklW)","(Performance, PKwon)","(Performance, PKcon)","(Performance, OG)","(Performance, Recov)","(Aerial Duels, Won)","(Aerial Duels, Lost)","(Aerial Duels, Won%)"
0,1.0,Liverpool,38.0,25.0,9.0,4.0,86.0,41.0,45.0,84.0,...,,,,,,,,,,
1,2.0,Arsenal,38.0,20.0,14.0,4.0,69.0,34.0,35.0,74.0,...,,,,,,,,,,
2,3.0,Manchester City,38.0,21.0,8.0,9.0,72.0,44.0,28.0,71.0,...,,,,,,,,,,
3,4.0,Chelsea,38.0,20.0,9.0,9.0,64.0,43.0,21.0,69.0,...,,,,,,,,,,
4,5.0,Newcastle Utd,38.0,20.0,6.0,12.0,68.0,47.0,21.0,66.0,...,,,,,,,,,,
5,6.0,Aston Villa,38.0,19.0,9.0,10.0,58.0,51.0,7.0,66.0,...,,,,,,,,,,
6,7.0,Nott'ham Forest,38.0,19.0,8.0,11.0,58.0,46.0,12.0,65.0,...,,,,,,,,,,
7,8.0,Brighton,38.0,16.0,13.0,9.0,66.0,59.0,7.0,61.0,...,,,,,,,,,,
8,9.0,Bournemouth,38.0,15.0,11.0,12.0,58.0,46.0,12.0,56.0,...,,,,,,,,,,
9,10.0,Brentford,38.0,16.0,8.0,14.0,66.0,57.0,9.0,56.0,...,,,,,,,,,,
