In [12]:
import os, re, time, random, requests, pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

BASE = "https://www.transfermarkt.us"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://www.google.com/",
    "Cache-Control": "no-cache",
}

S = requests.Session()
S.headers.update(HEADERS)

def retry_get(url, max_retries=5, base_sleep=3.0):
    """GET with exponential backoff + jitter; returns text or raises HTTPError."""
    attempt = 0
    while True:
        r = S.get(url, timeout=30)
        # Some 403s are transient; back off and retry a few times
        if r.status_code in (403, 429) and attempt < max_retries:
            attempt += 1
            sleep_s = base_sleep * (2 ** (attempt - 1)) + random.uniform(0.5, 2.0)
            print(f"… {r.status_code} on {url} — retry {attempt}/{max_retries} after {sleep_s:.1f}s")
            time.sleep(sleep_s)
            continue
        r.raise_for_status()
        return r.text

def parse_compact_squad(url: str, team_name: str = None, league: str = "Premier League"):
    html = retry_get(url)
    soup = BeautifulSoup(html, "lxml")

    out = []
    for tr in soup.select("table.items tbody tr.odd, table.items tbody tr.even"):
        tds = tr.find_all("td", recursive=False)
        if not tds:
            continue

        # player anchor
        player_idx, player_name = None, None
        for i, td in enumerate(tds):
            a = td.select_one('a[href*="/profil/spieler/"]')
            if a:
                player_idx = i
                player_name = a.get_text(strip=True)
                break
        if player_idx is None:
            continue

        # kit number
        kit = None
        if player_idx >= 1:
            m = re.search(r"\d+", tds[player_idx - 1].get_text(" ", strip=True))
            kit = int(m.group()) if m else None

        # position
        pos_td = tds[player_idx].select_one("table.inline-table tr:nth-of-type(2) td")
        position = pos_td.get_text(strip=True) if pos_td else None

        def safe_text(idx):
            return tds[idx].get_text(" ", strip=True) if idx < len(tds) else None

        # +1 Age
        age_txt = safe_text(player_idx + 1)
        age = int(re.search(r"\d+", age_txt).group()) if age_txt and re.search(r"\d+", age_txt) else None

        # +2 Nationality flags
        nat_td = tds[player_idx + 2] if (player_idx + 2) < len(tds) else None
        nationality = None
        if nat_td:
            flags = [img.get("title") for img in nat_td.select("img[title]")]
            nationality = ", ".join([f for f in flags if f]) or None

        # +3 Contract, +4 Market value
        contract_until = safe_text(player_idx + 3) or None
        market_value   = safe_text(player_idx + 4) or None

        parts = player_name.split()
        last_name  = parts[-1] if len(parts) > 1 else player_name
        first_name = " ".join(parts[:-1]) if len(parts) > 1 else None

        out.append({
            "league": league,
            "team": team_name,
            "kit_number": kit,
            "player_name": player_name,
            "first_name": first_name,
            "last_name": last_name,
            "position": position,
            "age": age,
            "nationality": nationality,
            "contract_until": contract_until,
            "market_value": market_value,
        })

    df = pd.DataFrame(out)
    if not df.empty:
        df["kit_number"] = df["kit_number"].astype("Int64")
    return df

def get_premier_league_team_urls(season=2025):
    # Use league page without season in path (works on .us)
    league_url = f"{BASE}/premier-league/startseite/wettbewerb/GB1"
    html = retry_get(league_url)
    soup = BeautifulSoup(html, "lxml")

    found = {}
    # CSS path
    for a in soup.select('a.vereinprofil_tooltip[href*="/startseite/verein/"]'):
        href = a.get("href") or ""
        m = re.search(r"/([^/]+)/startseite/verein/(\d+)", href)
        if not m:
            continue
        slug, verein_id = m.group(1), m.group(2)
        team_name = a.get_text(strip=True) or slug.replace("-", " ").title()
        compact = f"{BASE}/{slug}/kader/verein/{verein_id}/saison_id/{season}"
        found[verein_id] = {"team_name": team_name, "url": compact}

    # Regex fallback (in case some entries aren’t captured by selector)
    for m in re.finditer(r"/([a-z0-9\-\.]+)/startseite/verein/(\d+)", html, flags=re.I):
        slug, verein_id = m.group(1), m.group(2)
        if verein_id in found:
            continue
        a = soup.find("a", href=re.compile(fr"/{re.escape(slug)}/startseite/verein/{verein_id}"))
        team_name = (a.get_text(strip=True) if a else slug.replace("-", " ").title())
        compact = f"{BASE}/{slug}/kader/verein/{verein_id}/saison_id/{season}"
        found[verein_id] = {"team_name": team_name, "url": compact}

    return list(found.values())

# ---------------- Run ----------------
season = 2025
teams = get_premier_league_team_urls(season)
print(f"Found {len(teams)} clubs for Premier League {season}")
for t in teams[:8]:
    print("•", t["team_name"], "->", t["url"])

# TIP: scrape a small subset first to verify, then switch to teams_all
teams_all = teams
# teams_all = teams[:5]  # uncomment to test on 5 clubs first

frames = []
for t in teams_all:
    try:
        # polite pacing + jitter to avoid 403s
        time.sleep(2.0 + random.uniform(0.25, 0.75))
        df_team = parse_compact_squad(t["url"], team_name=t["team_name"], league="Premier League")
        if df_team is None or df_team.empty:
            print(f"△ {t['team_name']}: 0 rows (skipped)")
            continue
        print(f"✓ {t['team_name']}: {len(df_team)} players")
        frames.append(df_team)
    except requests.HTTPError as e:
        print(f"△ Error : {e} for url: {t['url']}")

if frames:
    players_dim = pd.concat(frames, ignore_index=True)
    players_dim["player_name"] = players_dim["player_name"].str.replace(r"\s+", " ", regex=True)
    players_dim = players_dim.drop_duplicates(subset=["team", "kit_number", "player_name"])

    print(f"\nTotal rows: {len(players_dim)} across {players_dim['team'].nunique()} teams")

    out_path = "data/processed/players_dim.csv"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)  # FIX: create folder if missing
    players_dim.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")
    print(players_dim.head(20))
else:
    print("No player data scraped.")

Found 20 clubs for Premier League 2025
• Liverpool FC -> https://www.transfermarkt.us/liverpool-fc/kader/verein/31/saison_id/2025
•  -> https://www.transfermarkt.us/fc-arsenal/kader/verein/11/saison_id/2025
•  -> https://www.transfermarkt.us/manchester-city/kader/verein/281/saison_id/2025
•  -> https://www.transfermarkt.us/fc-chelsea/kader/verein/631/saison_id/2025
•  -> https://www.transfermarkt.us/tottenham-hotspur/kader/verein/148/saison_id/2025
•  -> https://www.transfermarkt.us/manchester-united/kader/verein/985/saison_id/2025
•  -> https://www.transfermarkt.us/newcastle-united/kader/verein/762/saison_id/2025
•  -> https://www.transfermarkt.us/nottingham-forest/kader/verein/703/saison_id/2025
✓ Liverpool FC: 27 players
✓ : 25 players
✓ : 26 players
✓ : 32 players
✓ : 29 players
✓ : 27 players
✓ : 28 players
✓ : 29 players
✓ : 23 players
… 403 on https://www.transfermarkt.us/brighton-amp-hove-albion/kader/verein/1237/saison_id/2025 — retry 1/5 after 4.6s
✓ : 26 players
✓ : 29 playe