In [14]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import time

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

BIG5_URL = (
    "https://fbref.com/en/comps/Big5/stats/players/"
    "Big-5-European-Leagues-Stats"
)

def get_big5_player_list(url: str) -> pd.DataFrame:
    resp = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")

    # grab *all* comment strings and combine
    comments = [c for c in soup.find_all(string=lambda t: isinstance(t, Comment))]
    comment_html = "\n".join(comments)
    comment_soup = BeautifulSoup(comment_html, "html.parser")

    # locate the main table
    table = comment_soup.find("table", id="stats_standard")
    if table is None:
        raise RuntimeError("Could not find #stats_standard in comments")

    players = []
    for tr in table.tbody.find_all("tr"):
        td = tr.find("td", {"data-stat": "player"})
        if not td or not td.a:
            continue
        name = td.a.text.strip()
        href = td.a["href"]
        players.append({
            "name": name,
            "url": "https://fbref.com" + href
        })

    return pd.DataFrame(players)

In [15]:
def extract_player_per90(url: str) -> dict | None:
    resp = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")

    # combine rendered + commented HTML
    comments = [c for c in soup.find_all(string=lambda t: isinstance(t, Comment))]
    all_html = resp.text + "\n" + "\n".join(comments)
    full_soup = BeautifulSoup(all_html, "html.parser")

    # find any table with “Scouting Report” in its caption
    target = None
    for tbl in full_soup.find_all("table"):
        cap = tbl.find("caption")
        if cap and "Scouting Report" in cap.text:
            target = tbl
            break
    if target is None:
        return None

    stats = {}
    for tr in target.tbody.find_all("tr"):
        th = tr.find("th", {"data-stat": "statistic"})
        td = tr.find("td", {"data-stat": "per90"})
        if not th or not td:
            continue

        key = th.text.strip()
        val = td.text.strip().replace("%", "")
        try:
            stats[key] = float(val)
        except ValueError:
            stats[key] = None

    return stats

In [16]:
def scrape_all_per90(big5_url: str, pause: float = 2.0) -> pd.DataFrame:
    df_players = get_big5_player_list(big5_url)
    all_data = []
    for row in df_players.itertuples():
        print(f"→ Scraping {row.name} …", end="")
        stats = extract_player_per90(row.url)
        if stats:
            stats["Player"] = row.name
            all_data.append(stats)
            print(" OK")
        else:
            print(" no scouting table")
        time.sleep(pause)

    return pd.DataFrame(all_data)

In [17]:
df_all = scrape_all_per90(BIG5_URL)

RuntimeError: Could not find #stats_standard in comments