In [47]:
from io import StringIO

import cloudscraper
import pandas as pd
from bs4 import BeautifulSoup, Comment

# Session used for all page fetches below; cloudscraper is used here to get
# past Cloudflare, presenting a desktop Chrome-on-Windows browser profile.
scraper = cloudscraper.create_scraper(
    browser=dict(browser="chrome", platform="windows", desktop=True),
)

BASE_URL = "https://fbref.com"

# League name -> stats page URL. Each URL is assembled from BASE_URL, the
# competition id, and the league name with spaces replaced by hyphens.
LEAGUE_URLS = {
    league: f"{BASE_URL}/en/comps/{comp_id}/stats/{league.replace(' ', '-')}-Stats"
    for league, comp_id in (
        ('Premier League', 9),
        ('La Liga', 12),
        ('Bundesliga', 20),
        ('Serie A', 11),
        ('Ligue 1', 13),
    )
}

# Notebook-level placeholder; starts out as an empty DataFrame.
all_players_df = pd.DataFrame()

def scrape_league_overall(league_name, url, timeout=30):
    """
    Scrape the overall stats table for one league and return it as a DataFrame.

    Parameters
    ----------
    league_name : str
        Label used in the progress/error messages and written to the
        ``League`` column of the result.
    url : str
        Page to fetch with the module-level ``scraper`` session.
    timeout : float, optional
        Timeout in seconds passed to ``scraper.get`` so a stalled connection
        cannot hang the cell indefinitely. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the commented-out markup, with an extra
        ``League`` column set to ``league_name``.

    Raises
    ------
    ValueError
        If no HTML comment containing ``div_stats_standard`` is found, or if
        that comment contains no ``<table>`` element.
    Exception
        Whatever ``scraper.get`` / ``resp.raise_for_status()`` raise on a
        failed, timed-out, or non-success HTTP request.
    """
    print(f"Scraping {league_name} overall stats...")
    resp = scraper.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    # The overall stats table is inside a commented div with id 'div_stats_standard'
    comment = soup.find(string=lambda text: isinstance(text, Comment) and 'div_stats_standard' in text)
    if not comment:
        raise ValueError(f"Could not find commented stats for {league_name}")

    # Re-parse the comment body, which holds the real markup as plain text.
    comment_soup = BeautifulSoup(comment, 'lxml')
    table = comment_soup.find('table')
    if table is None:
        # Without this guard str(None) would be handed to read_html and fail
        # with an error that does not mention the league.
        raise ValueError(f"Could not find a stats table for {league_name}")

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated and emits a FutureWarning on every call.
    # NOTE(review): the source table may repeat its header row inside the
    # body; verify and filter such rows downstream if they appear.
    df = pd.read_html(StringIO(str(table)))[0]

    # Annotate with league name
    df['League'] = league_name
    return df


def main():
    """
    Scrape every league in LEAGUE_URLS, stack the tables, and save them to CSV.

    The combined frame is written to 'top5_leagues_player_stats.csv' (without
    the index) and returned. It is only bound to a local name here; no
    module-level variable is modified, so callers must keep the return value.

    Returns
    -------
    pandas.DataFrame
        All per-league tables concatenated with a fresh integer index.
    """
    # One DataFrame per league, fetched in LEAGUE_URLS order
    per_league = [
        scrape_league_overall(name, link)
        for name, link in LEAGUE_URLS.items()
    ]

    # Stack them into a single table with a fresh 0..n-1 index
    combined = pd.concat(per_league, ignore_index=True)
    print(f"Combined DataFrame shape: {combined.shape}")

    # Persist a copy to disk
    combined.to_csv('top5_leagues_player_stats.csv', index=False)
    return combined


if __name__ == '__main__':
    combined_df = main()
    # main() keeps its result in a local variable, so the notebook-level
    # `all_players_df` is only populated if the return value is bound here.
    # Without this, later cells that read `all_players_df` see an empty frame.
    all_players_df = combined_df
    print(combined_df.head())


Scraping Premier League overall stats...


  df = pd.read_html(str(table))[0]


Scraping La Liga overall stats...


  df = pd.read_html(str(table))[0]


Scraping Bundesliga overall stats...


  df = pd.read_html(str(table))[0]


Scraping Serie A overall stats...


  df = pd.read_html(str(table))[0]


Scraping Ligue 1 overall stats...
Combined DataFrame shape: (2966, 38)
  Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0 Unnamed: 3_level_0  \
                  Rk             Player             Nation                Pos   
0                  1         Max Aarons            eng ENG                 DF   
1                  2  Joshua Acheampong            eng ENG                 DF   
2                  3        Tyler Adams             us USA                 MF   
3                  4   Tosin Adarabioyo            eng ENG                 DF   
4                  5      Simon Adingra             ci CIV              FW,MF   

  Unnamed: 4_level_0 Unnamed: 5_level_0 Unnamed: 6_level_0 Playing Time  \
               Squad                Age               Born           MP   
0        Bournemouth                 24               2000            3   
1            Chelsea                 18               2006            4   
2        Bournemouth                 25               1999   

  df = pd.read_html(str(table))[0]


In [49]:
# Preview the first rows of the notebook-level `all_players_df`
# (shown via rich display because it is the cell's last expression).
all_players_df.head()