In [None]:
league_mapping = {
    (81, 3, 2015): "germany-bundesliga",
    (252, 2, 2015): "england-premier-league",
    (247, 36, 2014): "international-fifa-world-cup"
}

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_squad_data(league_name, league_abbr, season_year):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    season_url = f"https://www.transfermarkt.com/{league_name}/startseite/wettbewerb/{league_abbr}/plus/?saison_id={season_year}"

    response = requests.get(season_url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # The main player table has the class 'items'. This is our target.
    team_table = soup.find('table', class_='items')
    
    teams_data = []

     # Find all player rows (they are in <tr> tags with class 'odd' or 'even')
    for row in team_table.tbody.find_all('tr', class_=['odd', 'even']):
        # Extract columns (<td> tags) from the row
        cols = row.find_all('td')
        
        # it looks like this
        # href="/bayer-04-leverkusen/startseite/verein/15/saison_id/2025"
        href = cols[1].find('a').get("href")

        team_slug = href.strip("/").split("/")[0]
        team_id = href.strip("/").split("/")[3]

        teams_data.append({
            "team_id": team_id,
            "team_slug": team_slug,
        })
    
    return teams_data

In [None]:
data = scrape_squad_data("bundesliga", "L1", 2025)

for team in data:
    for x in team.values():
        print(x, end=" ")
        print()

27 
fc-bayern-munchen 
23826 
rasenballsport-leipzig 
15 
bayer-04-leverkusen 
16 
borussia-dortmund 
24 
eintracht-frankfurt 
79 
vfb-stuttgart 
82 
vfl-wolfsburg 
60 
sc-freiburg 
533 
tsg-1899-hoffenheim 
18 
borussia-monchengladbach 
86 
sv-werder-bremen 
39 
1-fsv-mainz-05 
167 
fc-augsburg 
89 
1-fc-union-berlin 
3 
1-fc-koln 
41 
hamburger-sv 
2036 
1-fc-heidenheim-1846 
35 
fc-st-pauli 


In [23]:
from curl_cffi import requests as rq
import time
import random

def scrape_squad_data(team_id, team_slug, season_year, retries=3, base_delay=10):
    """
    Scrapes player squad data with delays and retries.
    """
    squad_url = f"https://www.transfermarkt.com/{team_slug}/kader/verein/{team_id}/plus/1/galerie/0?saison_id={season_year}"
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # --- NEW: Retry Loop with Exponential Backoff ---
    for i in range(retries):
        try:
            response = rq.get(squad_url, headers=headers, timeout=15) # Add a timeout
            
            # Check for specific error codes that indicate blocking
            if response.status_code == 503 or response.status_code == 429:
                # This is a rate-limiting error, raise an exception to trigger the retry
                response.raise_for_status()

            # If the request was successful (e.g., status 200), process the data and exit the loop
            soup = BeautifulSoup(response.content, 'html.parser')
            player_table = soup.find('table', class_='items')
            
            # Handle cases where the page loads but has no player data (e.g., future season)
            if not player_table or not player_table.tbody:
                print(f"    - No player data found on page for team {team_slug}, season {season_year}. Skipping.")
                return [] # Return an empty list for this team/season

            players_data = []
            for row in player_table.tbody.find_all('tr', class_=['odd', 'even']):
                cols = row.find_all('td')
                player_name = cols[1].find('a').get_text(strip=True)
                position = cols[4].get_text(strip=True)
                date_of_birth = cols[5].get_text(strip=True).split('(')[0].strip()
                nationality = cols[6].find('img')['title']
                height = cols[7].get_text(strip=True)
                preferred_foot = cols[8].get_text(strip=True)
                
                players_data.append({
                    "player_name": player_name,
                    "position": position,
                    "date_of_birth": date_of_birth,
                    "nationality": nationality,
                    "height": height,
                    "preferred_foot": preferred_foot
                })
            
            return players_data # Return the successful data

        except rq.exceptions.RequestException as e:
            print(f"    - WARNING: Attempt {i+1}/{retries} failed for {team_slug}. Error: {e}")
            if i < retries - 1:
                # Exponential backoff: 10s, 20s, 40s...
                delay = base_delay * (2 ** i) + random.uniform(2, 5)
                print(f"        -> Being rate-limited. Waiting for {delay:.2f} seconds before retrying...")
                time.sleep(delay)
            else:
                print(f"    - ERROR: All {retries} attempts failed for {team_slug}. Giving up.")

                return None # Signal that this team failed completely

In [24]:
failing_teams = []
squads_list = []

for team in data:
    team_id = team["team_id"]
    team_slug = team["team_slug"]
    
    squad_list = scrape_squad_data(team_id, team_slug, 2025)
    if squad_list is None:
        failing_teams.append([team_id, team_slug])
        continue
    
    squads_list += squad_list

for team_id, team_slug in failing_teams:
    squads_list += scrape_squad_data(team_id, team_slug, 2025)

# Convert to a Pandas DataFrame for easy viewing/saving
df = pd.DataFrame(squads_list)
print(df)

        -> Being rate-limited. Waiting for 14.24 seconds before retrying...
        -> Being rate-limited. Waiting for 23.41 seconds before retrying...
    - ERROR: All 3 attempts failed for fc-augsburg. Giving up.
        -> Being rate-limited. Waiting for 14.63 seconds before retrying...
        -> Being rate-limited. Waiting for 23.44 seconds before retrying...
    - ERROR: All 3 attempts failed for 1-fc-union-berlin. Giving up.
        -> Being rate-limited. Waiting for 13.14 seconds before retrying...
        -> Being rate-limited. Waiting for 24.28 seconds before retrying...
        -> Being rate-limited. Waiting for 14.16 seconds before retrying...
        -> Being rate-limited. Waiting for 23.82 seconds before retrying...
        -> Being rate-limited. Waiting for 14.03 seconds before retrying...
        -> Being rate-limited. Waiting for 12.88 seconds before retrying...
        -> Being rate-limited. Waiting for 23.13 seconds before retrying...
        -> Being rate-limited. W

TypeError: 'NoneType' object is not iterable

In [22]:
print(df.head(30))

               player_name            position date_of_birth   nationality  \
0              Jonas Urbig          Goalkeeper    08.08.2003       Germany   
1             Manuel Neuer          Goalkeeper    27.03.1986       Germany   
2             Sven Ulreich          Goalkeeper    03.08.1988       Germany   
3              Leon Klanac          Goalkeeper    01.03.2007       Germany   
4          Dayot Upamecano         Centre-Back    27.10.1998        France   
5              Min-jae Kim         Centre-Back    15.11.1996  Korea, South   
6                                  Centre-Back    11.02.1996       Germany   
7               Hiroki Ito         Centre-Back    12.05.1999         Japan   
8          Alphonso Davies           Left-Back    02.11.2000        Canada   
9        Raphaël Guerreiro           Left-Back    22.12.1993      Portugal   
10          Josip Stanisic          Right-Back    02.04.2000       Croatia   
11           Konrad Laimer          Right-Back    27.05.1997    

In [3]:
import requests
from bs4 import BeautifulSoup
import json
import re

def scrape_whoscored_match_data(match_url):
    """
    Scrapes the embedded JSON data blob from a WhoScored match URL.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    response = requests.get(match_url, headers=headers)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all script tags
    scripts = soup.find_all('script')
    
    # The data is stored in a script tag that contains the text 'matchCentreData'.
    # We need to find that specific script tag.
    data_script = None
    for script in scripts:
        if script.string and 'matchCentreData' in script.string:
            data_script = script.string
            break
    
    if not data_script:
        print("Could not find the data script. The website structure may have changed.")
        return None
        
    # The script content is JavaScript code, like "var matchCentreData = {...};".
    # We need to use a regular expression to extract just the JSON part (the curly braces).
    match = re.search(r'matchCentreData = (\{.*\});', data_script)
    
    if not match:
        print("Could not extract JSON data from the script.")
        return None
        
    # The regex group(1) captures the content inside the parentheses.
    json_string = match.group(1)
    
    # Now, parse this JSON string into a Python dictionary
    match_data = json.loads(json_string)
    
    return match_data

# --- Main Execution ---
url = 'https://www.whoscored.com/matches/1903134/live/england-premier-league-2025-2026-manchester-city-tottenham' # This is a placeholder future match
# Let's use a real, past match to get actual data
real_match_url = 'https://www.whoscored.com/Matches/1729482/Live/England-Premier-League-2023-2024-Manchester-City-Tottenham'

data = scrape_whoscored_match_data(real_match_url)

if data:
    # Now you can explore this rich dictionary
    home_team = data['home']['name']
    away_team = data['away']['name']
    
    print(f"Match: {home_team} vs {away_team}")
    
    print("\nHome Team Players:")
    for player in data['home']['players']:
        if player.get('isFirstEleven'): # Check if the player was in the starting lineup
            print(f"- {player['name']} ({player['position']})")
            
    # The entire dataset, including every event (shots, passes, tackles) is in here.
    # For example, let's find the first goal event.
    first_goal = next((event for event in data['events'] if event.get('isGoal')), None)
    if first_goal:
        minute = first_goal['minute']
        player_name = first_goal['player']['name']
        print(f"\nFirst Goal: Scored by {player_name} in the {minute}' minute.")

HTTPError: 403 Client Error: Forbidden for url: https://www.whoscored.com/Matches/1729482/Live/England-Premier-League-2023-2024-Manchester-City-Tottenham