In [43]:
import os
import time
import pandas as pd
import cloudscraper
from bs4 import BeautifulSoup, Comment

# Initialize a Cloudflare‑bypassing scraper
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "windows", "desktop": True}
)

BASE_URL = "https://fbref.com"

# Top‑5 leagues (2024–25 season)
LEAGUE_URLS = {
    'Premier League': 'https://fbref.com/en/comps/9/stats/Premier-League-Stats',
    'La Liga':       'https://fbref.com/en/comps/12/stats/La-Liga-Stats',
    'Bundesliga':    'https://fbref.com/en/comps/20/stats/Bundesliga-Stats',
    'Serie A':       'https://fbref.com/en/comps/11/stats/Serie-A-Stats',
    'Ligue 1':       'https://fbref.com/en/comps/13/stats/Ligue-1-Stats',
}

# Delay between requests (seconds)
DELAY = 1.5
# Data output folder
SAVE_DIR = "data/players"
os.makedirs(SAVE_DIR, exist_ok=True)


def get_player_links(league_name, league_url):
    """Scrape league page, parse commented standard stats table, return player→URL map."""
    print(f"Fetching league data for {league_name}...")
    resp = scraper.get(league_url)
    resp.raise_for_status()

    # Parse HTML; the main stats table is inside a commented div
    soup = BeautifulSoup(resp.text, 'lxml')
    # Find commented block containing 'div_stats_standard'
    comment_nodes = soup.find_all(
        string=lambda text: isinstance(text, Comment) and 'div_stats_standard' in text
    )
    if comment_nodes:
        comment_html = comment_nodes[0]
        comment_soup = BeautifulSoup(comment_html, 'lxml')
        table = comment_soup.find('table')
    else:
        # Fallback: direct table lookup by ID
        table = soup.find('table', id=lambda x: x and x.endswith('_overall'))

    players = {}
    if not table or not table.tbody:
        print(f"Could not find stats table for {league_name}.")
        return players

    for row in table.tbody.find_all('tr'):
        # Skip header separators
        if 'thead' in row.get('class', []):
            continue
        cell = row.find('td', {'data-stat': 'player'})
        if not cell:
            continue
        link = cell.find('a', href=True)
        if not link:
            continue
        name = link.text.strip()
        players[name] = BASE_URL + link['href']

    print(f"Found {len(players)} players in {league_name}.")
    return players


def scrape_player_page(name, url):
    """Visit a player page, dump each table to CSV in a player‑named folder."""
    time.sleep(DELAY)
    resp = scraper.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    safe = name.replace('/', '_').replace(' ', '_')
    outdir = os.path.join(SAVE_DIR, safe)
    os.makedirs(outdir, exist_ok=True)

    for table in soup.find_all('table'):
        tid = table.get('id')
        if not tid:
            continue
        df = pd.read_html(str(table))[0]
        df.to_csv(os.path.join(outdir, f"{tid}.csv"), index=False)
    print(f"Saved data for {name}.")


def main():
    all_players = {}
    for league, url in LEAGUE_URLS.items():
        try:
            links = get_player_links(league, url)
            all_players.update(links)
        except Exception as e:
            print(f"Error fetching players for {league}: {e}")

    print(f"Total players found: {len(all_players)}")
    for i, (player, link) in enumerate(all_players.items(), 1):
        print(f"[{i}/{len(all_players)}] Scraping {player}")
        try:
            scrape_player_page(player, link)
        except Exception as e:
            print(f"Error scraping {player}: {e}")

    print("All done!")

if __name__ == '__main__':
    main()


Fetching league data for Premier League...
Found 562 players in Premier League.
Fetching league data for La Liga...
Found 588 players in La Liga.
Fetching league data for Bundesliga...
Found 481 players in Bundesliga.
Fetching league data for Serie A...
Found 599 players in Serie A.
Fetching league data for Ligue 1...
Found 542 players in Ligue 1.
Total players found: 2702
[1/2702] Scraping Max Aarons


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Saved data for Max Aarons.
[2/2702] Scraping Joshua Acheampong


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Saved data for Joshua Acheampong.
[3/2702] Scraping Tyler Adams


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Saved data for Tyler Adams.
[4/2702] Scraping Tosin Adarabioyo


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Saved data for Tosin Adarabioyo.
[5/2702] Scraping Simon Adingra


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Saved data for Simon Adingra.
[6/2702] Scraping Emmanuel Agbadou


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Saved data for Emmanuel Agbadou.
[7/2702] Scraping Asher Agbinone


KeyboardInterrupt: 