In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup
import re
import json
from datetime import date, timedelta
import datetime
import os

Set up selenium driver

In [2]:
# Chrome options shared by every browser session created in this notebook
options = Options()
# Uncomment the next line to run without a visible browser window
# options.add_argument("--headless=new")

options.add_argument("--disable-blink-features=AutomationControlled") # bypass bot detection
options.add_argument("--no-sandbox") # for wsl
options.add_argument("--disable-dev-shm-usage") # or container
options.add_argument("--window-size=1920,1080")
options.add_experimental_option("excludeSwitches", ["enable-automation"]) # anti-detection settings
options.add_experimental_option("useAutomationExtension", False)
options.binary_location = "/usr/bin/google-chrome"

# Content-setting value 2 blocks the content type. Both keys must live under
# "managed_default_content_settings"; the images key previously lacked the
# "content_" part and therefore did not match the javascript key's namespace.
prefs = {
    "profile.managed_default_content_settings.javascript": 2,
    "profile.managed_default_content_settings.images": 2
}
options.add_experimental_option("prefs", prefs)

# Ubuntu example: install Google Chrome so that the binary exists at /usr/bin/google-chrome
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt install ./google-chrome-stable_current_amd64.deb

In [60]:
# Start the browser session used by the interactive cells; the chromedriver
# path handed to Service comes from ChromeDriverManager().install().
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [107]:
# Shut down the interactive browser session
driver.quit()

In [3]:
import os, json
from hdfs import InsecureClient

# Base HDFS directory that relative bronze paths are joined onto
HDFS_PATH = "/user/dottier/"
# HDFS client used by the write/exists helpers.
# NOTE(review): endpoint and user are hardcoded — consider reading them from configuration.
client = InsecureClient("http://localhost:9870", user="dottier")

def write_json_to_bronze(file_path: str, is_hdfs: bool, content: dict, overwrite: bool = False) -> bool:
    """Serialize ``content`` as JSON to the local filesystem or to HDFS.

    Args:
        file_path: Destination path. For HDFS it is joined onto ``HDFS_PATH``;
            for local writes it is used as given.
        is_hdfs: Write through the module-level HDFS ``client`` when true,
            otherwise write to the local filesystem.
        content: JSON-serializable object to store.
        overwrite: When false, an already existing destination is left untouched.

    Returns:
        True if the file was written. False if it was skipped because it
        already exists, or if the write failed (the error is printed, not raised).
    """
    try:
        if is_hdfs:
            hdfs_path = os.path.join(HDFS_PATH, file_path)
            parent = os.path.dirname(hdfs_path)
            client.makedirs(parent)

            if not overwrite and client.status(hdfs_path, strict=False):
                print(f"SKIP: {hdfs_path} already exists (HDFS)")
                return False

            # The existence check above already honoured `overwrite`, so the
            # HDFS write itself may always replace the target.
            with client.write(hdfs_path, encoding="utf-8", overwrite=True) as f:
                json.dump(content, f, ensure_ascii=False)

        else:
            # A bare filename has an empty dirname, and os.makedirs("") raises.
            parent = os.path.dirname(file_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

            if not overwrite and os.path.exists(file_path):
                print(f"SKIP: {file_path} already exists (local)")
                return False

            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(content, f, ensure_ascii=False)

        return True

    except Exception as e:
        # Best-effort by design: a failed write is reported and signalled
        # through the return value instead of aborting the crawl.
        print(f"FATAL ERROR: Could not write {file_path}. Details: {e}")
        return False

Crawling: Getting match info

- Get the season ids available for a selected (region_id, tournament_id) pair

In [4]:
import time, random

def reset_driver(driver: webdriver.Chrome):
    """Replace a browser session with a fresh one after a cool-down pause.

    Args:
        driver: Session to discard. A falsy value (e.g. None) means there is
            nothing to close.

    Returns:
        A newly started ``webdriver.Chrome`` built from the notebook-level ``options``.
    """
    if driver:
        try:
            driver.quit()
        except Exception as e:
            # This function is called when a session misbehaves; a failure to
            # quit it must not prevent the replacement from being started.
            print(f"WARNING: Could not cleanly quit the old browser session. Details: {e}")

    # Random 10-20 second pause before opening the new session
    time.sleep(random.uniform(10, 20))
    print("INFO: Starting a new browser session...")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    return driver

bronze/match_data/league=international-fifa-world-cup/season=2018/stage_id=12757/match_id=1249924: match_id=1249924, startTimeUtc=2018-06-28T18:00:00Z


In [5]:
# Get the season ids of the selected league, starting from a given year
def get_season_ids(region_id, tournament_id, start_year_after, driver: webdriver.Chrome):
    """Collect the season ids of a tournament from its season dropdown.

    The tournament page is loaded and the ``select#seasons`` options are read.
    If the dropdown is missing, the browser session is reset once and the page
    is loaded again.

    Args:
        region_id: Region id used in the tournament URL.
        tournament_id: Tournament id used in the tournament URL.
        start_year_after: Seasons whose start year is lower than this value
            are skipped.
        driver: Browser session used to load the page.

    Returns:
        ``(season_ids, driver)``. ``season_ids`` is a list of id strings
        (empty when the dropdown could not be found). ``driver`` is the
        session to keep using; it differs from the argument after a reset.
    """
    league_url = f"https://www.whoscored.com/regions/{region_id}/tournaments/{tournament_id}"

    def fetch_season_options(active_driver):
        # Load the tournament page and return the <option> tags of the season dropdown
        active_driver.get(league_url)
        soup = BeautifulSoup(active_driver.page_source, "html.parser")
        return soup.select('select#seasons option')

    season_options = fetch_season_options(driver)

    if not season_options:
        print(f"WARNING: Could not find season dropdown for tournament {tournament_id}. Retrying")
        driver = reset_driver(driver)
        season_options = fetch_season_options(driver)

    if not season_options:
        print(f"WARNING: Could not find season dropdown for tournament {tournament_id}. Skipping")
        return [], driver

    season_ids = []

    for season_option in season_options:
        value_attr = season_option.get("value")
        if not value_attr:
            continue

        # The label is either "YYYY" or "YYYY/YYYY"; `.string` is None when the
        # option has no single text child, which must not abort the whole scan.
        display_name = season_option.string
        try:
            start_year = int(display_name.split('/')[0])
        except (AttributeError, ValueError):
            print(f"WARNING: Could not parse start year from season label: {display_name!r}")
            continue

        if start_year_after > start_year:
            continue

        # Assumes the option value looks like
        # /regions/<r>/tournaments/<t>/seasons/<season_id>/..., which puts the
        # id at index 6 of the '/'-split — TODO confirm against the live page.
        try:
            season_ids.append(value_attr.split('/')[6])
        except IndexError:
            print(f"WARNING: Could not parse season_id from value: {value_attr}")

    return season_ids, driver

In [10]:
# Season ids (a list) for region 81 / tournament 3 from 2015 onwards.
# The returned driver is kept because get_season_ids may replace the session
# on retry; discarding it would leave `driver` pointing at a closed browser.
season_id, driver = get_season_ids(81, 3, 2015, driver)

In [6]:
# League mapping: (region_id, tournament_id, start_year_after) -> short league name.
# The third key element is the earliest season start year to crawl; the crawl
# loop passes it to get_season_ids as `start_year_after`.
league_mapping = {
    (81, 3, 2015): "germany-bundesliga",
    (252, 2, 2015): "england-premier-league",
    (247, 36, 2014): "international-fifa-world-cup",
    (250, 12, 2015): "europe-champions-league",
}

In [None]:
# def get_stage_ids(region_id, tournament_id, season_id, driver):
#     season_url = f"https://www.whoscored.com/regions/{region_id}/tournaments/{tournament_id}/seasons/{season_id}"
#     driver.get(season_url)
#     soup = BeautifulSoup(driver.page_source, "html.parser")

#     # Season has multiple stages
#     stage_options = soup.select('select#stages option')

#     if stage_options:
#         stage_ids = []

#         for stage_option in stage_options:
#             value_attr = stage_option.get("value")
#             if value_attr:
#                 try:
#                     stage_id = value_attr.split('/')[8]
#                     stage_ids.append(stage_id)
#                 except IndexError:
#                     print(f"WARNING: Could not parse stage_id from value: {value_attr}")

#         if stage_ids:
#             return stage_ids, driver


#     # season has 1 stage only
#     canonical_link_tag = soup.find('link', {'rel': 'canonical'})
    
#     if canonical_link_tag and canonical_link_tag.get('href'):
#         href = canonical_link_tag['href']
#         try:
#             stage_id = href.split('/stages/')[1].split('/')[0]
#             return [stage_id], driver
#         except IndexError:
#             print(f"ERROR: Could not parse stage_id from canonical URL: {href}")
#             return [], driver

In [6]:
def get_stage_ids(region_id, tournament_id, season_id, driver, retries=1):
    """Find the stage ids of a season, resetting the browser between failed attempts.

    Each attempt loads the season page and first reads the ``select#stages``
    dropdown; when that yields nothing it falls back to the stage id embedded
    in the page's canonical link.

    Args:
        region_id, tournament_id, season_id: Ids used to build the season URL.
        driver: Browser session used to load the page.
        retries: Number of extra attempts after the first one.

    Returns:
        ``(stage_ids, driver)`` where ``stage_ids`` is a list of id strings
        (empty if every attempt failed) and ``driver`` is the session to keep
        using, which is a new one if a reset happened.
    """
    season_url = f"https://www.whoscored.com/regions/{region_id}/tournaments/{tournament_id}/seasons/{season_id}"

    for attempt in range(retries + 1):
        driver.get(season_url)
        page = BeautifulSoup(driver.page_source, "html.parser")

        # 1) Multi-stage seasons list their stages in a dropdown
        found_ids = []
        for option_tag in page.select('select#stages option'):
            raw_value = option_tag.get("value")
            if not raw_value:
                continue
            pieces = raw_value.split('/')
            if len(pieces) > 8:
                found_ids.append(pieces[8])
            else:
                print(f"WARNING: Could not parse stage_id from value: {raw_value}")
        if found_ids:
            return found_ids, driver

        # 2) Otherwise take the single stage id from the canonical link
        canonical = page.find('link', {'rel': 'canonical'})
        href = canonical.get('href') if canonical else None
        if href:
            _, marker, tail = href.partition('/stages/')
            if marker:
                return [tail.split('/')[0]], driver
            print(f"ERROR: Could not parse stage_id from canonical URL: {href}")

        # 3) Nothing usable on this attempt: start over with a fresh session if allowed
        if attempt < retries:
            print(f"WARNING: No stages found, retrying... (attempt {attempt + 1})")
            driver = reset_driver(driver)

    # Every attempt came back empty
    return [], driver


In [55]:
# Spot checks. get_stage_ids returns a (stage_ids, driver) tuple, so the
# driver repr is printed alongside the ids.
print(f"Stage ids of Bundesliga 2022/2023: {get_stage_ids(81, 3, 9120, driver)}")
print(f"Stage ids of FIFA World Cup 2022: {get_stage_ids(247, 36, 8213, driver)}")

Stage ids of Bundesliga 2022/2023: (['21026'], <selenium.webdriver.chrome.webdriver.WebDriver (session="f76e0335ec76652ba434da79b0e9b585")>)
Stage ids of FIFA World Cup 2022: (['18657', '18649', '18650', '18651', '18652', '18653', '18656', '18655', '18654'], <selenium.webdriver.chrome.webdriver.WebDriver (session="f76e0335ec76652ba434da79b0e9b585")>)


In [7]:
# Write the stage payload to bronze and return derived metadata about the stage
def process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver: webdriver.Chrome, is_hdfs):
    """Store one stage's fixture payload in bronze and describe the stage.

    The stage page is loaded with ``driver`` and the JSON embedded in the
    ``tournamentfixtures`` script tag is parsed. Its first ``tournaments``
    entry is written to
    ``bronze/stage_data/league=<league_name>/season=<season>/stage_id=<stage_id>/stage_info.json``.

    Args:
        region_id, tournament_id, season_id, stage_id: Ids used to build the stage URL.
        league_name: Short league name used for the ``league=`` partition.
        driver: Browser session used to load the page.
        is_hdfs: Passed through to ``write_json_to_bronze`` to select HDFS or local storage.

    Returns:
        A dict with ``season_id``, ``stage_id``, ``stage_name``, ``start_year``,
        ``end_year`` and ``stage_path``, or None when the page could not be
        parsed. ``start_year``/``end_year`` are None when the season name is
        neither "YYYY" nor "YYYY/YYYY".
    """
    stage_url = f"https://www.whoscored.com/regions/{region_id}/tournaments/{tournament_id}/seasons/{season_id}/stages/{stage_id}"
    driver.get(stage_url)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    script_tag = soup.select_one('script[data-hypernova-key="tournamentfixtures"]')
    # `.string` is None for an empty tag; treat that like a missing tag
    if not script_tag or not script_tag.string:
        print(f"ERROR: Could not find the data script tag on page: {stage_url}")
        return None

    # The payload is wrapped in an HTML comment inside the script tag
    raw_json = script_tag.string.strip().removeprefix("<!--").removesuffix("-->")

    print(f"Start writing stage data for stage {stage_id}")

    try:
        data = json.loads(raw_json)
        stage_data = data["tournaments"][0]
        if not stage_data or not isinstance(stage_data, dict):
            print(f"ERROR: Could not find stage data block in JSON for stage {stage_id}")
            return None
    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        print(f"ERROR: Failed to parse or access key data for stage {stage_id}. Details: {e}")
        return None

    # Season partition, e.g. "2023/2024" -> "2023-2024"
    season_name = stage_data.get("seasonName")
    if not season_name:
        print(f"ERROR: Stage {stage_id} has no seasonName; cannot derive the season partition")
        return None
    season_partition = season_name.replace('/', '-')
    parts = season_partition.split('-')

    # Start and end year drive the fixture fetching window
    start_year, end_year = None, None
    if len(parts) == 1 and parts[0].isdigit():
        start_year = end_year = int(parts[0])
    elif len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
        start_year, end_year = int(parts[0]), int(parts[1])

    stage_name = stage_data.get("stageName")

    stage_path = os.path.join(f"league={league_name}", f"season={season_partition}", f"stage_id={stage_id}")
    output_dir = os.path.join("bronze", "stage_data", stage_path)
    output_file = os.path.join(output_dir, "stage_info.json")

    output = {
        "metadata": {
            "stage_id": stage_id,
            "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
            "source_url": stage_url,
        },
        "data": stage_data
    }

    # Special case: EURO 2020 was played in 2021, so its fixtures are looked
    # up in 2021 while the season partition keeps the official name.
    if league_name == "international-european-championship" and start_year == 2020:
        start_year = 2021
        end_year = 2021

    try:
        if not is_hdfs:
            os.makedirs(output_dir, exist_ok=True)

        # write_json_to_bronze prints and swallows its own errors, so the
        # message below does not by itself prove that the file exists.
        write_json_to_bronze(output_file, is_hdfs, output, overwrite=True)

        print(f"SUCCESS: Wrote data for stage {stage_id} to "
              f"{'HDFS' if is_hdfs else 'local'} {output_file}")

        return {
            "season_id": season_id,
            "stage_id": stage_id,
            "stage_name": stage_name,
            "start_year": start_year,
            "end_year": end_year,
            "stage_path": stage_path
        }

    except (IOError, OSError) as e:
        print(f"FATAL ERROR: Could not write file for stage {stage_id}. Details: {e}")
        return None

In [73]:
# Spot check on a single stage of the European Championship, written to the
# local filesystem (is_hdfs=False)
print(process_season_stages(247, 124, 7329, 16297, "international-european-championship", driver, False))

Start writing stage data for stage 16297
Wrote to local FS: bronze/stage_data/league=international-european-championship/season=2020/stage_id=16297/stage_info.json
SUCCESS: Wrote data for stage 16297 to local bronze/stage_data/league=international-european-championship/season=2020/stage_id=16297/stage_info.json
{'season_id': 7329, 'stage_id': 16297, 'stage_name': 'EURO Grp. A', 'start_year': 2021, 'end_year': 2021, 'stage_path': 'league=international-european-championship/season=2020/stage_id=16297'}


In [10]:
# Create week keys in the format of {YYYY}W{WW}
def generate_week_keys(start_year, end_year):
    """Return the sorted ISO week keys covering every day from start_year to end_year.

    Keys use the ISO calendar year and week, so the first key can belong to
    the previous ISO year (1 January may fall in week 52/53) and the last key
    can belong to the next one (31 December may fall in week 1).

    Args:
        start_year: First calendar year to cover (inclusive).
        end_year: Last calendar year to cover (inclusive).

    Returns:
        A sorted list of unique "{YYYY}W{WW}" strings; empty when
        ``start_year`` is greater than ``end_year``.
    """
    if start_year > end_year:
        return []

    def week_key(day):
        iso = day.isocalendar()  # (year, week, weekday)
        return f"{iso[0]}W{iso[1]:02d}"

    weeks = set()
    last_day = date(end_year, 12, 31)
    d = date(start_year, 1, 1)
    while d <= last_day:
        weeks.add(week_key(d))
        d += timedelta(days=7)

    # Stepping by 7 days from 1 January can stop before the ISO week that
    # holds 31 December (a leap year starting on a Sunday, e.g. 2012), so
    # that week is added explicitly.
    weeks.add(week_key(last_day))
    return sorted(weeks)

In [8]:
from datetime import date

def generate_month_keys(start_year, end_year):
    """Return "YYYYMM" keys for every month from January of start_year to December of end_year.

    Keys are in chronological order; the list is empty when ``start_year``
    is greater than ``end_year``.
    """
    return [
        f"{yr}{mo:02d}"
        for yr in range(start_year, end_year + 1)
        for mo in range(1, 13)
    ]

# Example usage: a single-year range yields the twelve keys 202601 .. 202612
print(generate_month_keys(2026, 2026))

['202601', '202602', '202603', '202604', '202605', '202606', '202607', '202608', '202609', '202610', '202611', '202612']


In [None]:
# # Get all match_ids and start time (UTC) of selected season
# def get_match_configs(stage_config, driver: webdriver.Chrome):
#     stage_id = stage_config["stage_id"]
#     start_year = stage_config["start_year"]
#     end_year = stage_config["end_year"]
#     stage_path = stage_config["stage_path"]

#     if not all([stage_id, start_year, end_year, stage_path]):
#         print(f"ERROR: get_match_configs received invalid stage_metadata: {stage_config}")
#         return []

#     print(f"Start getting match_configs for stage {stage_id}")
    
#     match_configs_to_return = []
#     processed_match_ids = set()

#     week_keys = generate_week_keys(start_year, end_year)
#     for week_key in week_keys:
#         weekly_data_url = f"https://www.whoscored.com/tournaments/{stage_id}/data/?d={week_key}&isAggregate=false"

#         try:
#             driver.get(weekly_data_url)
#             # Using execute_script is clever, but let's make it robust
#             json_text = driver.execute_script("return document.body.innerText || null;")
            
#             if not json_text:
#                 print(f"WARNING: No JSON text found for URL: {weekly_data_url}")
#                 continue

#             data = json.loads(json_text)
#             if not isinstance(data, dict) or not data.get("version"):
#                 continue

#             for stage in data.get("tournaments", []):
#                 for match in stage.get("matches", []):
#                     match_id = match.get("id")
#                     start_time_utc = match.get("startTimeUtc")

#                     if not match_id or not start_time_utc:
#                         continue
                    
#                     if match_id in processed_match_ids:
#                         continue

#                     match_output_dir = os.path.join("bronze", "match_data", stage_path, f"match_id={match_id}")
#                     os.makedirs(match_output_dir, exist_ok=True)
#                     preview_file_path = os.path.join(match_output_dir, "match_preview.json")

#                     with open(preview_file_path, "w", encoding="utf-8") as f:
#                         json.dump(match, f, indent=2, ensure_ascii=False)
                    
#                     match_configs_to_return.append([match_id, start_time_utc])
#                     processed_match_ids.add(match_id)

#             print(f"Found {len(match_configs_to_return)} matches for week {week_key}")

#         except json.JSONDecodeError as e:
#             print(f"Failed to parse JSON for URL: {weekly_data_url}. Details: {e}")
#         except Exception as e:
#             print(f"ERROR: An unexpected error occurred for URL {weekly_data_url}. Details: {e}")

#     print(f"Found {len(match_configs_to_return)} matches for stage {stage_id}")
#     return match_configs_to_return

In [None]:
# Ad-hoc check of get_match_configs for one hardcoded stage.
# NOTE(review): this call passes two arguments, but the live get_match_configs
# definitions in this notebook also require `is_hdfs`; update the call before re-running.
# NOTE(review): stage_path uses backslash separators, whereas
# process_season_stages builds the same path with os.path.join.
stage_config = {
    "season_id": 9120,
    "stage_id":  21026,
    "start_year": 2022,
    "end_year": 2023,
    "stage_path": "league=germany-bundesliga\\season=2022-2023\\stage_id=21026"
}

print(get_match_configs(stage_config, driver))

Start getting match_configs for stage 21026
Found 306 matches for stage 21026
[[1643039, '2022-08-05T18:30:00Z'], [1643041, '2022-08-06T13:30:00Z'], [1643043, '2022-08-06T13:30:00Z'], [1643044, '2022-08-06T13:30:00Z'], [1643045, '2022-08-06T13:30:00Z'], [1643046, '2022-08-06T13:30:00Z'], [1643040, '2022-08-06T16:30:00Z'], [1643047, '2022-08-07T13:30:00Z'], [1643042, '2022-08-07T15:30:00Z'], [1643051, '2022-08-12T18:30:00Z'], [1643049, '2022-08-13T13:30:00Z'], [1643050, '2022-08-13T13:30:00Z'], [1643053, '2022-08-13T13:30:00Z'], [1643054, '2022-08-13T13:30:00Z'], [1643056, '2022-08-13T13:30:00Z'], [1643055, '2022-08-13T16:30:00Z'], [1643052, '2022-08-14T13:30:00Z'], [1643048, '2022-08-14T15:30:00Z'], [1643060, '2022-08-19T18:30:00Z'], [1643057, '2022-08-20T13:30:00Z'], [1643058, '2022-08-20T13:30:00Z'], [1643062, '2022-08-20T13:30:00Z'], [1643064, '2022-08-20T13:30:00Z'], [1643065, '2022-08-20T13:30:00Z'], [1643059, '2022-08-20T16:30:00Z'], [1643061, '2022-08-21T13:30:00Z'], [1643063, '

In [9]:
from datetime import datetime, timezone, timedelta
import os

# Decide whether a match file should be (re)written
def should_write_match(file_path: str, start_time_utc: str, is_hdfs: bool, days_back: int = 3) -> bool:
    """
    Tell the caller whether the file at ``file_path`` should be written.

    Rules:
      - A file that does not exist yet is always written.
      - An existing file is rewritten only when ``start_time_utc`` lies
        within ``days_back`` days of the current UTC time, in either direction.
      - Any error while checking (storage or timestamp parsing) is reported
        and answered with True.
    """
    try:
        if is_hdfs:
            already_there = bool(client.status(os.path.join(HDFS_PATH, file_path), strict=False))
        else:
            already_there = os.path.exists(file_path)

        if not already_there:
            return True

        # Existing file: refresh only while the kick-off is close to "now"
        kickoff = datetime.strptime(start_time_utc, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
        seconds_away = abs((kickoff - datetime.now(timezone.utc)).total_seconds())
        window_seconds = days_back * 86400
        return seconds_away <= window_seconds

    except Exception as err:
        print(f"WARNING: should_write_match fallback for {file_path}. Details: {err}")
        return True  # when in doubt, write



In [10]:
# Get all match_ids and start time (UTC) of selected season
# NOTE: this definition is shadowed by the later redefinition of
# get_match_configs in this notebook, which adds a retry with shifted years.
def get_match_configs(stage_config, driver: webdriver.Chrome, is_hdfs):
    """Collect (match_id, startTimeUtc) pairs for a stage and store a preview file per match.

    For every month key between ``start_year`` and ``end_year`` the stage's
    monthly data URL is loaded, its body text is parsed as JSON, and every
    match not seen before is recorded. A ``match_preview.json`` is written
    per match when ``should_write_match`` allows it.

    Args:
        stage_config: Dict providing ``stage_id``, ``start_year``,
            ``end_year`` and ``stage_path``.
        driver: Browser session used to load the data URLs.
        is_hdfs: Passed through to ``should_write_match`` and ``write_json_to_bronze``.

    Returns:
        ``(match_configs, driver)`` where ``match_configs`` is a list of
        ``[match_id, start_time_utc]`` pairs and ``driver`` is the session to
        keep using (replaced if a reset happened).
    """
    stage_id = stage_config["stage_id"]
    start_year = stage_config["start_year"]
    end_year = stage_config["end_year"]
    stage_path = stage_config["stage_path"]

    if not all([stage_id, start_year, end_year, stage_path]):
        print(f"ERROR: get_match_configs received invalid stage_metadata: {stage_config}")
        return [], driver

    print(f"Start getting match_configs for stage {stage_id}")
    
    match_configs_to_return = []
    # Guards against the same match being reported by more than one monthly response
    processed_match_ids = set()

    month_keys = generate_month_keys(start_year, end_year)
    for month_key in month_keys:
        monthly_data_url = f"https://www.whoscored.com/tournaments/{stage_id}/data/?d={month_key}&isAggregate=false"

        try:
            driver.get(monthly_data_url)
            # The response body is read as plain text through the page's innerText
            json_text = driver.execute_script("return document.body.innerText || null;")
            
            # retry once if failed
            if not json_text:
                print(f"WARNING: No JSON text found for URL: {monthly_data_url}, retrying once...")
                driver = reset_driver(driver)
                
                driver.get(monthly_data_url)
                json_text = driver.execute_script("return document.body.innerText || null;")

            if not json_text:
                print(f"ERROR: Still no JSON after retry for {monthly_data_url}")
                continue

            data = json.loads(json_text)
            # Responses that are not a dict with a truthy "version" are ignored
            if not isinstance(data, dict) or not data.get("version"):
                continue

            for stage in data.get("tournaments", []):
                for match in stage.get("matches", []):
                    match_id = match.get("id")
                    start_time_utc = match.get("startTimeUtc")

                    if not match_id or not start_time_utc:
                        continue
                    
                    if match_id in processed_match_ids:
                        continue

                    # match_output_dir = os.path.join("bronze", "match_data", stage_path, f"match_id={match_id}")
                    # os.makedirs(match_output_dir, exist_ok=True)
                    # preview_file_path = os.path.join(match_output_dir, "match_preview.json")

                    output = {
                        "metadata": {
                            "match_id": match_id,
                            "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),  # UTC time
                            "source_url": monthly_data_url,
                        },
                        "data": match
                    }

                    preview_file_path = os.path.join(
                        "bronze", "match_data", stage_path, f"match_id={match_id}", "match_preview.json"
                    )

                    # NOTE(review): write_json_to_bronze is called with its default
                    # overwrite=False, so an existing file is skipped even when
                    # should_write_match asked for a refresh.
                    if should_write_match(preview_file_path, start_time_utc, is_hdfs, days_back=3):
                        write_json_to_bronze(preview_file_path, is_hdfs, output)

                    # write_json_to_bronze(preview_file_path, is_hdfs, output)
                    # with open(preview_file_path, "w", encoding="utf-8") as f:
                    #     json.dump(output, f, ensure_ascii=False)
                    
                    match_configs_to_return.append([match_id, start_time_utc])
                    processed_match_ids.add(match_id)

            # print(f"Found {len(match_configs_to_return)} matches for month {month_key}")

        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON for URL: {monthly_data_url}. Details: {e}")
        except Exception as e:
            # A failing month is reported and skipped; the remaining months are still tried
            print(f"ERROR: An unexpected error occurred for URL {monthly_data_url}. Details: {e}")

    print(f"Found {len(match_configs_to_return)} matches for stage {stage_id}")
    return match_configs_to_return, driver

In [14]:
# Get all match_ids and start time (UTC) of selected season
def get_match_configs(stage_config, driver: webdriver.Chrome, is_hdfs):
    """Collect (match_id, startTimeUtc) pairs for a stage and store a preview file per match.

    For every month between ``start_year`` and ``end_year`` the stage's
    monthly data URL is loaded and parsed as JSON. Each match not seen before
    is recorded, and a ``match_preview.json`` is written (or refreshed) when
    ``should_write_match`` allows it. If the whole range yields no match, the
    crawl is repeated once with both years shifted forward by one.

    Args:
        stage_config: Dict providing ``stage_id``, ``start_year``,
            ``end_year`` and ``stage_path``.
        driver: Browser session used to load the data URLs.
        is_hdfs: Passed through to ``should_write_match`` and ``write_json_to_bronze``.

    Returns:
        ``(match_configs, driver)`` where ``match_configs`` is a list of
        ``[match_id, start_time_utc]`` pairs and ``driver`` is the session to
        keep using (replaced if a reset happened).
    """
    # .get() lets the validation below report a missing key instead of raising KeyError
    stage_id = stage_config.get("stage_id")
    start_year = stage_config.get("start_year")
    end_year = stage_config.get("end_year")
    stage_path = stage_config.get("stage_path")

    if not all([stage_id, start_year, end_year, stage_path]):
        print(f"ERROR: get_match_configs received invalid stage_metadata: {stage_config}")
        return [], driver

    def fetch_matches(start_year, end_year, driver_obj):
        # Crawl every month of the given year range; returns (match list, driver to keep using)
        match_configs_to_return = []
        # Guards against the same match being reported by more than one monthly response
        processed_match_ids = set()

        month_keys = generate_month_keys(start_year, end_year)
        for month_key in month_keys:
            monthly_data_url = f"https://www.whoscored.com/tournaments/{stage_id}/data/?d={month_key}&isAggregate=false"

            try:
                driver_obj.get(monthly_data_url)
                json_text = driver_obj.execute_script("return document.body.innerText || null;")

                # retry once with a fresh session if the body came back empty
                if not json_text:
                    print(f"WARNING: No JSON text found for URL: {monthly_data_url}, retrying once...")
                    driver_obj = reset_driver(driver_obj)
                    driver_obj.get(monthly_data_url)
                    json_text = driver_obj.execute_script("return document.body.innerText || null;")

                if not json_text:
                    print(f"ERROR: Still no JSON after retry for {monthly_data_url}")
                    continue

                data = json.loads(json_text)
                # Responses that are not a dict with a truthy "version" are ignored
                if not isinstance(data, dict) or not data.get("version"):
                    continue

                for stage in data.get("tournaments", []):
                    for match in stage.get("matches", []):
                        match_id = match.get("id")
                        start_time_utc = match.get("startTimeUtc")

                        if not match_id or not start_time_utc:
                            continue

                        if match_id in processed_match_ids:
                            continue

                        output = {
                            "metadata": {
                                "match_id": match_id,
                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),  # UTC time
                                "source_url": monthly_data_url,
                            },
                            "data": match
                        }

                        preview_file_path = os.path.join(
                            "bronze", "match_data", stage_path, f"match_id={match_id}", "match_preview.json"
                        )

                        # should_write_match has already decided that a new or
                        # refreshed file is wanted, so overwrite must be enabled;
                        # with the default (False) a refresh of an existing
                        # file would be skipped.
                        if should_write_match(preview_file_path, start_time_utc, is_hdfs, days_back=3):
                            write_json_to_bronze(preview_file_path, is_hdfs, output, overwrite=True)

                        match_configs_to_return.append([match_id, start_time_utc])
                        processed_match_ids.add(match_id)

            except json.JSONDecodeError as e:
                print(f"Failed to parse JSON for URL: {monthly_data_url}. Details: {e}")
            except Exception as e:
                # A failing month is reported and skipped; the remaining months are still tried
                print(f"ERROR: An unexpected error occurred for URL {monthly_data_url}. Details: {e}")

        return match_configs_to_return, driver_obj

    print(f"Start getting match_configs for stage {stage_id}")

    # First attempt with given years
    match_configs_to_return, driver = fetch_matches(start_year, end_year, driver)

    # If no matches found, retry with both years shifted forward by one
    if len(match_configs_to_return) == 0:
        print(f"WARNING: No matches found for {start_year}-{end_year}, retrying with {start_year + 1}-{end_year + 1}")
        match_configs_to_return, driver = fetch_matches(start_year + 1, end_year + 1, driver)

    print(f"Found {len(match_configs_to_return)} matches for stage {stage_id}")
    return match_configs_to_return, driver


In [70]:
# Ad-hoc check of get_match_configs for one hardcoded stage, written to the
# local filesystem (is_hdfs=False)
stage_config = {
    "season_id": 6275,
    "stage_id":  14188,
    "start_year": 2016,
    "end_year": 2017,
    # os.path.join gives the same layout as process_season_stages; literal
    # backslashes end up inside a single directory name on POSIX systems.
    "stage_path": os.path.join("league=europe-europa-league", "season=2016-2017", "stage_id=14188")
}

print(get_match_configs(stage_config, driver, False))

Start getting match_configs for stage 14188
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139064/match_preview.json
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139065/match_preview.json
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139066/match_preview.json
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139067/match_preview.json
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139068/match_preview.json
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139069/match_preview.json
Wrote to local FS: bronze/match_data/league=europe-europa-league\season=2016-2017\stage_id=14188/match_id=1139070/match_preview.json
Wrote to local FS: bronze

In [None]:
# Ad-hoc check of get_match_configs for one hardcoded stage, written to the
# local filesystem
stage_config = {
    "season_id": 9120,
    "stage_id":  21026,
    "start_year": 2022,
    "end_year": 2023,
    # os.path.join gives the same layout as process_season_stages; literal
    # backslashes end up inside a single directory name on POSIX systems.
    "stage_path": os.path.join("league=germany-bundesliga", "season=2022-2023", "stage_id=21026")
}

# get_match_configs requires the storage flag as its third argument
print(get_match_configs(stage_config, driver, False))

In [12]:
def extract_json_object(text, start_key):
    """Return the first brace-balanced ``{...}`` substring that follows ``start_key``.

    Braces inside double-quoted string literals (including strings with
    backslash-escaped quotes) are ignored while balancing, so values such as
    ``"a}b"`` do not end the object early.

    Args:
        text: Text to search, e.g. the body of a script tag.
        start_key: Marker after which the object is expected.

    Returns:
        The substring from the first ``{`` after ``start_key`` up to and
        including its matching ``}``.

    Raises:
        ValueError: If ``start_key`` is absent, no ``{`` follows it, or the
            opening brace is never closed.
    """
    start_idx = text.find(start_key)
    if start_idx == -1:
        raise ValueError(f"Key '{start_key}' not found in text.")

    brace_start = text.find('{', start_idx)
    if brace_start == -1:
        raise ValueError(f"Opening '{{' not found after key '{start_key}'.")

    brace_count = 0
    in_string = False   # currently inside a double-quoted string literal
    escaped = False     # previous character was a backslash inside a string
    for i in range(brace_start, len(text)):
        ch = text[i]

        if in_string:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == '{':
            brace_count += 1
        elif ch == '}':
            brace_count -= 1
            if brace_count == 0:
                return text[brace_start:i+1]

    raise ValueError(f"Could not find matching closing '}}' for key '{start_key}'.")

In [16]:
from enum import Enum

class MatchStatus(Enum):
    """Outcome reported by get_match_data for a single match."""
    # Match page was parsed and the data file was handed to the writer
    FETCHED = 1
    # should_write_match decided the stored file does not need rewriting
    ALREADY_EXISTS = 2
    # Page could not be loaded/parsed, or no script held the required data
    FAILED = 3
    # Start time lies in the future, so nothing was fetched
    FUTURE_MATCH = 4

In [13]:
def get_match_data(match_id, start_time_string, stage_path, driver: webdriver.Chrome, is_hdfs):
    """Fetch the match-centre data of one match and store it in bronze.

    The live page of the match is loaded, the script tag that carries
    ``matchCentreData``, ``matchCentreEventTypeJson`` and
    ``formationIdNameMappings`` is located, and the three objects are written
    together to ``bronze/match_data/<stage_path>/match_id=<id>/match_data.json``.

    Args:
        match_id: Id of the match, used in the URL and the output path.
        start_time_string: Kick-off time in UTC, formatted ``%Y-%m-%dT%H:%M:%SZ``.
        stage_path: Partition path of the stage the match belongs to.
        driver: Browser session used to load the page.
        is_hdfs: Passed through to ``should_write_match`` and ``write_json_to_bronze``.

    Returns:
        A ``MatchStatus``: ``ALREADY_EXISTS`` when the stored file does not
        need rewriting, ``FUTURE_MATCH`` when kick-off is still ahead,
        ``FETCHED`` after the data was handed to the writer, ``FAILED`` otherwise.
    """
    output_dir = os.path.join("bronze", "match_data", stage_path, f"match_id={match_id}")
    output_file = os.path.join(output_dir, "match_data.json")

    if not should_write_match(output_file, start_time_string, is_hdfs, days_back=3):
        return MatchStatus.ALREADY_EXISTS

    # The timestamp is UTC, so it has to be compared with an aware UTC "now";
    # a naive local datetime.now() is off by the machine's UTC offset.
    try:
        start_time_UTC = datetime.strptime(start_time_string, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
    except (TypeError, ValueError) as e:
        print(f"ERROR: Invalid start time for match {match_id}: {start_time_string!r}. Details: {e}")
        return MatchStatus.FAILED

    if start_time_UTC > datetime.now(timezone.utc):
        return MatchStatus.FUTURE_MATCH

    url = f"https://www.whoscored.com/matches/{match_id}/live"
    required_names = ["matchCentreData", "matchCentreEventTypeJson", "formationIdNameMappings"]

    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        script_tags = soup.find_all("script")

        for script in script_tags:
            script_text = script.string
            if not script_text or not all(k in script_text for k in required_names):
                continue

            try:
                mcd_str = extract_json_object(script_text, "matchCentreData:")
                event_map_str = extract_json_object(script_text, "matchCentreEventTypeJson:")
                formation_map_str = extract_json_object(script_text, "formationIdNameMappings:")
            except ValueError:
                # The script mentions the names but not as "key: {...}"
                # objects; keep looking at the remaining scripts.
                continue

            if not (mcd_str and event_map_str and formation_map_str):
                continue

            match_centre_data = json.loads(mcd_str)
            event_type_mapping = json.loads(event_map_str)
            formation_mapping = json.loads(formation_map_str)

            final_data_object = {
                "metadata": {
                    "match_id": match_id,
                    "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),  # UTC time
                    "source_url": url,
                },
                "matchId": match_id,
                "matchCentreData": match_centre_data,
                "matchCentreEventTypeJson": event_type_mapping,
                "formationIdNameMappings": formation_mapping
            }

            # should_write_match already approved this write, including the
            # refresh of an existing file, so overwrite must be enabled; with
            # the default (False) the refresh would be skipped while SUCCESS
            # is still reported.
            write_json_to_bronze(output_file, is_hdfs, final_data_object, overwrite=True)
            print(f"SUCCESS: Wrote data for match {match_id}")

            return MatchStatus.FETCHED

    except json.JSONDecodeError as e:
        print(f"ERROR: Failed to parse JSON for match {match_id}. URL: {url}. Details: {e}")
        return MatchStatus.FAILED
    except (IOError, OSError) as e:
        print(f"FATAL ERROR: Could not write file for match {match_id}. Path: {output_file}. Details: {e}")
        return MatchStatus.FAILED
    except Exception as e:
        print(f"ERROR: An unexpected error occurred while processing match {match_id}. URL: {url}. Details: {e}")
        return MatchStatus.FAILED

    print(f"WARNING: Could not find a script containing all required data for match {match_id} on page {url}")
    return MatchStatus.FAILED

In [40]:
# Spot check against HDFS (is_hdfs=True); the return value is the MatchStatus for this match
get_match_data(1643039, "2015-05-15T18:45:00Z", "league=germany-bundesliga/season=2022-2023/stage_id=21026", driver, True)

<MatchStatus.ALREADY_EXISTS: 2>

In [30]:
# (Re)create the interactive browser session with the shared Chrome options
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [45]:
import time
import random

# Full crawl: league -> season -> stage -> match.
# NOTE(review): `league_mapping` is not defined in this cell, so it must already
# exist in the kernel when the cell runs.
driver = None

try: 
    # Each key unpacks to (region_id, tournament_id, start_year_after); the value is the league name.
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        # Close the session left over from the previous league before opening a new one.
        if driver:
            driver.quit()
        
        print("INFO: Starting a new browser session...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # get_season_ids hands back a driver together with the ids; rebind it.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            # Every season gets its own browser: quit the open session and start another.
            if driver:
                driver.quit()

            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            # NOTE(review): later cells in this notebook unpack
            # `stage_ids, driver = get_stage_ids(...)`; confirm which return shape
            # the current helper has before re-running this cell.
            stage_ids = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                # This `continue` also skips the pause at the bottom of the season loop.
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]

                # get_match_configs also returns a driver, which replaces the current one.
                match_configs, driver = get_match_configs(stage_config, driver)
                for match_id, start_time_string in match_configs:
                    # The returned status is discarded here, so nothing is collected for a retry.
                    get_match_data(match_id, start_time_string, stage_path, driver)

            # Random 10-20 second pause after each season that had stages.
            time.sleep(random.uniform(10, 20))

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: germany-bundesliga
INFO: Starting a new browser session...
Season_ids of germany-bundesliga: ['10720', '10365', '9649', '9120', '8667', '8279', '7872', '7405', '6902', '6392', '5870']

--------------------
INFO: Starting new session for Season ID: 10720
Start writing stage data for stage 24478
SUCCESS: Wrote data for stage 24478 to bronze\stage_data\league=germany-bundesliga\season=2025-2026\stage_id=24478\stage_info.json
Start getting match_configs for stage 24478
Found 18 matches for month 202508
Found 45 matches for month 202509
Found 72 matches for month 202510
Found 108 matches for month 202511
Found 135 matches for month 202512
Found 180 matches for month 202601
Found 216 matches for month 202602
Found 243 matches for month 202603
Found 279 matches for month 202604
Found 306 matches for month 202605
Found 306 matches for stage 24478
SUCCESS: Wrote data for match 1908319
SUCCESS: Wrote data for match 1910600
SUCCESS: Wrote data for match 1910601
SUCCESS: Wrote dat

In [46]:
# Leagues for this run: (region_id, tournament_id, start_year_after) -> league name.
# Rebinding `league_mapping` here replaces whatever an earlier cell left in the kernel.
league_mapping = {
    (206, 4, 2015): "spain-laliga",
    (247, 67, 2025): "international-fifa-club-world-cup"
}

import time
import random

# No browser yet; the loops below create and replace sessions as they go.
driver = None

try: 
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        # Close the session left over from the previous league before opening a new one.
        if driver:
            driver.quit()
        
        print("INFO: Starting a new browser session...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # get_season_ids hands back a driver together with the ids; rebind it.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            # Every season gets its own browser: quit the open session and start another.
            if driver:
                driver.quit()

            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            # NOTE(review): later cells in this notebook unpack
            # `stage_ids, driver = get_stage_ids(...)`; confirm which return shape
            # the current helper has before re-running this cell.
            stage_ids = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                # This `continue` also skips the pause at the bottom of the season loop.
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]

                # get_match_configs also returns a driver, which replaces the current one.
                match_configs, driver = get_match_configs(stage_config, driver)
                for match_id, start_time_string in match_configs:
                    # The returned status is discarded here, so nothing is collected for a retry.
                    get_match_data(match_id, start_time_string, stage_path, driver)

            # Random 10-20 second pause after each season that had stages.
            time.sleep(random.uniform(10, 20))

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: spain-laliga
INFO: Starting a new browser session...
Season_ids of spain-laliga: ['10803', '10317', '9682', '9149', '8681', '8321', '7889', '7466', '6960', '6436', '5933']

--------------------
INFO: Starting new session for Season ID: 10803
Start writing stage data for stage 24622
SUCCESS: Wrote data for stage 24622 to bronze\stage_data\league=spain-laliga\season=2025-2026\stage_id=24622\stage_info.json
Start getting match_configs for stage 24622
Found 31 matches for month 202508
Found 70 matches for month 202509
Found 100 matches for month 202510
Found 140 matches for month 202511
Found 170 matches for month 202512
Found 210 matches for month 202601
Found 250 matches for month 202602
Found 290 matches for month 202603
Found 330 matches for month 202604
Found 380 matches for month 202605
Found 380 matches for stage 24622
SUCCESS: Wrote data for match 1913916
SUCCESS: Wrote data for match 1913892
SUCCESS: Wrote data for match 1913918
SUCCESS: Wrote data for match 19138

In [12]:
# Leagues for this run: (region_id, tournament_id, start_year_after) -> league name.
# Rebinding `league_mapping` here replaces whatever an earlier cell left in the kernel.
league_mapping = {
    (108, 5, 2015): "italy-serie-a",
    (74, 22, 2015): "france-ligue-1"
}

import time
import random

# No browser yet; the loops below create and replace sessions as they go.
driver = None

try: 
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        # Close the session left over from the previous league before opening a new one.
        if driver:
            driver.quit()
        
        print("INFO: Starting a new browser session...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # get_season_ids hands back a driver together with the ids; rebind it.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            # Every season gets its own browser: quit the open session and start another.
            if driver:
                driver.quit()

            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            # NOTE(review): later cells in this notebook unpack
            # `stage_ids, driver = get_stage_ids(...)`; confirm which return shape
            # the current helper has before re-running this cell.
            stage_ids = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                # This `continue` also skips the pause at the bottom of the season loop.
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]

                # get_match_configs also returns a driver, which replaces the current one.
                match_configs, driver = get_match_configs(stage_config, driver)
                for match_id, start_time_string in match_configs:
                    # The returned status is discarded here, so nothing is collected for a retry.
                    get_match_data(match_id, start_time_string, stage_path, driver)

            # Random 10-20 second pause after each season that had stages.
            time.sleep(random.uniform(10, 20))

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: italy-serie-a
INFO: Starting a new browser session...
Season_ids of italy-serie-a: ['10732', '10375', '9659', '9159', '8735', '8330', '7928', '7468', '6974', '6461', '5970']

--------------------
INFO: Starting new session for Season ID: 10732
Start writing stage data for stage 24500
SUCCESS: Wrote data for stage 24500 to bronze\stage_data\league=italy-serie-a\season=2025-2026\stage_id=24500\stage_info.json
Start getting match_configs for stage 24500
Found 20 matches for month 202508
Found 50 matches for month 202509
Found 90 matches for month 202510
Found 130 matches for month 202511
Found 170 matches for month 202512
Found 220 matches for month 202601
Found 260 matches for month 202602
Found 300 matches for month 202603
Found 340 matches for month 202604
Found 380 matches for month 202605
Found 380 matches for stage 24500
SUCCESS: Wrote data for match 1901064
SUCCESS: Wrote data for match 1901069
SUCCESS: Wrote data for match 1901067
SUCCESS: Wrote data for match 190

KeyboardInterrupt: 

In [13]:
# League for this run: (region_id, tournament_id, start_year_after) -> league name.
# Rebinding `league_mapping` here replaces whatever an earlier cell left in the kernel.
league_mapping = {
    (74, 22, 2015): "france-ligue-1"
}

import time
import random

# No browser yet; the loops below create and replace sessions as they go.
driver = None

try: 
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        # Close the session left over from the previous league before opening a new one.
        if driver:
            driver.quit()
        
        print("INFO: Starting a new browser session...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # get_season_ids hands back a driver together with the ids; rebind it.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            # Every season gets its own browser: quit the open session and start another.
            if driver:
                driver.quit()

            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            # NOTE(review): later cells in this notebook unpack
            # `stage_ids, driver = get_stage_ids(...)`; confirm which return shape
            # the current helper has before re-running this cell.
            stage_ids = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                # This `continue` also skips the pause at the bottom of the season loop.
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]

                # get_match_configs also returns a driver, which replaces the current one.
                match_configs, driver = get_match_configs(stage_config, driver)
                for match_id, start_time_string in match_configs:
                    # The returned status is discarded here, so nothing is collected for a retry.
                    get_match_data(match_id, start_time_string, stage_path, driver)

            # Random 10-20 second pause after each season that had stages.
            time.sleep(random.uniform(10, 20))

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: france-ligue-1
INFO: Starting a new browser session...
Season_ids of france-ligue-1: ['10792', '10329', '9635', '9129', '8671', '8185', '7814', '7344', '6833', '6318', '5830']

--------------------
INFO: Starting new session for Season ID: 10792
Start writing stage data for stage 24609
SUCCESS: Wrote data for stage 24609 to bronze\stage_data\league=france-ligue-1\season=2025-2026\stage_id=24609\stage_info.json
Start getting match_configs for stage 24609
Found 27 matches for month 202508
Found 54 matches for month 202509
Found 90 matches for month 202510
Found 126 matches for month 202511
Found 144 matches for month 202512
Found 171 matches for month 202601
Found 207 matches for month 202602
Found 243 matches for month 202603
Found 279 matches for month 202604
Found 306 matches for month 202605
Found 306 matches for stage 24609
INFO: Data for match 1911273 already exists. Skipping.
INFO: Data for match 1911284 already exists. Skipping.
INFO: Data for match 1911290 alrea

In [14]:
# Leagues for this run: (region_id, tournament_id, start_year_after) -> league name.
league_mapping = {
    (81, 3, 2015): "germany-bundesliga",
    (252, 2, 2015): "england-premier-league",
    (247, 36, 2014): "international-fifa-world-cup",
    (250, 12, 2015): "europe-champions-league",
    (108, 5, 2015): "italy-serie-a",
    (74, 22, 2015): "france-ligue-1"
}

import time
import random

# No browser yet; the loops below create and replace sessions as they go.
driver = None

# Matches whose first attempt returned MatchStatus.FAILED; each is retried once
# after the main pass.
initial_failures = []

try:
    # --- PASS 1: crawl every league / season / stage / match ---
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        # Close the session left over from the previous league before opening a new one.
        if driver:
            driver.quit()

        print("INFO: Starting a new browser session...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # get_season_ids hands back a driver together with the ids; rebind it.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            # Every season gets its own browser: quit the open session and start another.
            if driver:
                driver.quit()

            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            # NOTE(review): later cells in this notebook unpack
            # `stage_ids, driver = get_stage_ids(...)`; confirm which return shape
            # the current helper has before re-running this cell.
            stage_ids = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]

                match_configs, driver = get_match_configs(stage_config, driver)

                for match_id, start_time_string in match_configs:
                    result = get_match_data(match_id, start_time_string, stage_path, driver)
                    # get_match_data returns a MatchStatus member. Enum members are
                    # truthy unless the enum mixes in another type, so `not result`
                    # cannot be relied on to detect a failure; compare explicitly.
                    if result == MatchStatus.FAILED:
                        initial_failures.append([match_id, start_time_string, stage_path])

            # Random 10-20 second pause after each season that had stages.
            time.sleep(random.uniform(10, 20))

    # --- PASS 2: Retry Loop for Failures ---
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        # The retry reuses whichever browser session the main pass left open.
        for match_id, start_time_string, stage_path in initial_failures:
            print(f"- Retrying match {match_id}...")

            result = get_match_data(match_id, start_time_string, stage_path, driver)

            if result == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # --- Final Report ---
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)


finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: germany-bundesliga
INFO: Starting a new browser session...
Season_ids of germany-bundesliga: ['10720', '10365', '9649', '9120', '8667', '8279', '7872', '7405', '6902', '6392', '5870']

--------------------
INFO: Starting new session for Season ID: 10720
Start writing stage data for stage 24478
SUCCESS: Wrote data for stage 24478 to bronze\stage_data\league=germany-bundesliga\season=2025-2026\stage_id=24478\stage_info.json
Start getting match_configs for stage 24478
Found 18 matches for month 202508
Found 45 matches for month 202509
Found 72 matches for month 202510
Found 108 matches for month 202511
Found 135 matches for month 202512
Found 180 matches for month 202601
Found 216 matches for month 202602
Found 243 matches for month 202603
Found 279 matches for month 202604
Found 306 matches for month 202605
Found 306 matches for stage 24478
INFO: Data for match 1908319 already exists. Skipping.
INFO: Data for match 1910600 already exists. Skipping.
INFO: Data for match 1

In [17]:
# League for this run: (region_id, tournament_id, start_year_after) -> league name.
league_mapping = {
    (250, 30, 2015): "europe-europa-league",
}

import time
import random

# No browser yet; the loops below create and replace sessions as they go.
driver = None

# Start every run with an empty failure list. Without this the cell either reuses
# failures recorded by a previously executed cell or raises NameError on a fresh
# kernel when the retry pass reads the name.
initial_failures = []

try:
    # --- PASS 1: crawl every league / season / stage / match ---
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        # Close the session left over from the previous league before opening a new one.
        if driver:
            driver.quit()

        print("INFO: Starting a new browser session...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # get_season_ids hands back a driver together with the ids; rebind it.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            # Every season gets its own browser: quit the open session and start another.
            if driver:
                driver.quit()

            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

            # NOTE(review): later cells in this notebook unpack
            # `stage_ids, driver = get_stage_ids(...)`; confirm which return shape
            # the current helper has before re-running this cell.
            stage_ids = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]

                match_configs, driver = get_match_configs(stage_config, driver)

                for match_id, start_time_string in match_configs:
                    result = get_match_data(match_id, start_time_string, stage_path, driver)
                    # get_match_data returns a MatchStatus member. Enum members are
                    # truthy unless the enum mixes in another type, so `not result`
                    # cannot be relied on to detect a failure; compare explicitly.
                    if result == MatchStatus.FAILED:
                        initial_failures.append([match_id, start_time_string, stage_path])

            # Random 10-20 second pause after each season that had stages.
            time.sleep(random.uniform(10, 20))

    # --- PASS 2: Retry Loop for Failures ---
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        # The retry reuses whichever browser session the main pass left open.
        for match_id, start_time_string, stage_path in initial_failures:
            print(f"- Retrying match {match_id}...")

            result = get_match_data(match_id, start_time_string, stage_path, driver)

            if result == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # --- Final Report ---
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)


finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: europe-europa-league
INFO: Starting a new browser session...
Season_ids of europe-europa-league: ['10904', '10458', '9778', '9087', '8741', '8178', '7805', '7353', '6843', '6275', '5849']

--------------------
INFO: Starting new session for Season ID: 10904
Start writing stage data for stage 24798
SUCCESS: Wrote data for stage 24798 to bronze\stage_data\league=europe-europa-league\season=2025-2026\stage_id=24798\stage_info.json
Start getting match_configs for stage 24798
Found 144 matches for month 202509
Found 144 matches for stage 24798
Start writing stage data for stage 24799
ERROR: Failed to parse or access key data for stage 24799. Details: list index out of range

--------------------
INFO: Starting new session for Season ID: 10458
Start writing stage data for stage 24084
SUCCESS: Wrote data for stage 24084 to bronze\stage_data\league=europe-europa-league\season=2024-2025\stage_id=24084\stage_info.json
Start getting match_configs for stage 24084
Found 16 matches 

In [None]:
# player_ids = set()

# for season_id, match in matches.items():
#     for match_id, start_time_string in match:
#         path = f"/data/season/{season_id}/match/{match_id}/match_data.json"
#         match_data_exists = os.path.exists(path)

#         if not match_data_exists:
#             continue

#         with open("path", "r") as f:
#             match_data = json.load(f)
        
#         print(match_data["matchId"])
        

In [None]:
# Probably should be in the next layer

# def get_player_data_from_match(season_id, match_id):
#     player_team_dictionary = {}
#     path = f"data/season/{season_id}/matches/{match_id}/match_data.json"
#     print(path)
#     match_data_exists = os.path.exists(path)

#     if not match_data_exists:
#         return False

#     with open(path, "r", encoding="utf-8") as f:
#         match_data = json.load(f)
    
#     match_centre_data = match_data["matchCentreData"]
#     home = match_centre_data["home"]
#     home_team_id = home["teamId"]
#     home_players = home["players"]

#     for home_player in home_players:
#         home_player_id = home_player["playerId"]
        
#         if (home_player_id, home_team_id) in player_team_dictionary.items():
#             continue

#         home_player_name = home_player["name"]
    

In [None]:
# get_player_data_from_match(10365, 1834247)

data/season/10365/matches/1834247/match_data.json
1834247


In [99]:
# Open the Chrome session that the repair cell below passes to get_match_data.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [100]:
import os
import json

# Local bronze directory that is scanned for matches missing match_data.json.
root_dir = "bronze/match_data"

failed_count = 0
failed_matches = []
fixed_count = 0
fixed_matches = []


def stage_path_from_match_dir(match_dir, base_dir=root_dir):
    """Return the first three path components of `match_dir` below `base_dir`.

    For a layout such as
    `<base_dir>/league=.../season=.../stage_id=.../match_id=...` this yields
    `league=.../season=.../stage_id=...`, always joined with "/".

    The path is taken relative to `base_dir`, so the result does not depend on
    how many components `base_dir` has or on which separator os.walk produced
    (slicing a raw `split(os.sep)` by fixed indices breaks in both cases).
    """
    relative_parts = os.path.relpath(match_dir, base_dir).split(os.sep)
    return "/".join(relative_parts[:3])


for dirpath, dirnames, filenames in os.walk(root_dir):
    # Only directories that hold a match_preview.json are candidates for repair.
    if "match_preview.json" in filenames:
        preview_file = os.path.join(dirpath, "match_preview.json")
        try:
            with open(preview_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            start_time_utc = data.get("data", {}).get("startTimeUtc")
            match_id = data.get("metadata", {}).get("match_id")
            if not start_time_utc:
                # NOTE(review): this only warns; the fetch below still runs with
                # start_time_string=None. Confirm get_match_data tolerates that.
                print(f"{dirpath}: match_preview.json exists but startTimeUtc not found")

            if "match_data.json" not in filenames:
                # e.g. bronze/match_data/league=international-fifa-world-cup/season=2018/stage_id=12757/match_id=1249924
                #      -> league=international-fifa-world-cup/season=2018/stage_id=12757
                stage_path = stage_path_from_match_dir(dirpath, root_dir)

                status = get_match_data(
                    match_id=match_id,
                    start_time_string=start_time_utc,
                    stage_path=stage_path,
                    driver=driver,
                    is_hdfs=False
                )

                # Other statuses are counted in neither bucket.
                if status == MatchStatus.FAILED:
                    failed_count += 1
                    failed_matches.append(f"{match_id} ({stage_path})")
                elif status == MatchStatus.FETCHED:
                    fixed_count += 1
                    fixed_matches.append(f"{match_id} ({stage_path})")

        except Exception as e:
            # Covers unreadable/invalid preview files as well as anything raised
            # while fetching; the directory is recorded as failed and the walk continues.
            failed_count += 1
            failed_matches.append(f"{dirpath} (Exception)")
            print(f"[ERROR] Failed to read {preview_file}: {e}")

print("\n=== Summary ===")
print(f"Successfully fixed: {fixed_count}")
print("Fixed matches:", fixed_matches)
print(f"Failed: {failed_count}")
print("Failed matches:", failed_matches)


=== Summary ===
Successfully fixed: 2
Fixed matches: ['1874049 (league=international-world-cup-qualification-uefa/season=2025-2026/stage_id=23972)', '1873998 (league=international-world-cup-qualification-uefa/season=2025-2026/stage_id=23974)']
Failed: 1
Failed matches: ['1076372 (league=france-ligue-1/season=2016-2017/stage_id=13768)']


In [None]:
# Open a fresh Chrome session with the shared `options`; rebinds the kernel-wide `driver`.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [20]:
# Scratch check of the `segments[2:5]` slice on a four-component path.
path = "this/is/a/path"
segments = path.split(os.sep)
result = "/".join(segments[2:5])  # with "/" as os.sep: ["a", "path"] (the slice stops at the end of the list)
print(result)  # Output: "a/path"

a/path


In [98]:
# Competitions to crawl: (region_id, tournament_id, start_year_after) -> league name.
league_mapping = {
    (81, 3, 2015): "germany-bundesliga",
    (252, 2, 2015): "england-premier-league",
    (108, 5, 2015): "italy-serie-a",
    (74, 22, 2015): "france-ligue-1",
    (206, 4, 2015): "spain-laliga",

    (250, 12, 2015): "europe-champions-league",
    (250, 30, 2015): "europe-europa-league",

    (247, 36, 2014): "international-fifa-world-cup",
    (247, 67, 2025): "international-fifa-club-world-cup",
    (247, 124, 2012): "international-european-championship"
}

IS_HDFS = False

# Close any session an earlier cell left open so the crawl starts without one.
if driver:
    driver.quit()
    driver = None

try:
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("=" * 50)
        print(f"STARTING LEAGUE: {league_name}")

        print("INFO: Starting a new browser session...")
        # A browser is only launched when none is currently held.
        if not driver:
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=options,
            )

        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            print("\n" + "-" * 20)
            print(f"INFO: Starting new session for Season ID: {season_id}")

            stage_ids, driver = get_stage_ids(region_id, tournament_id, season_id, driver)
            if stage_ids:
                for stage_id in stage_ids:
                    stage_config = process_season_stages(
                        region_id,
                        tournament_id,
                        season_id,
                        stage_id,
                        league_name,
                        driver,
                        is_hdfs=IS_HDFS,
                    )
                    if stage_config:
                        stage_path = stage_config["stage_path"]
                        stage_name = stage_config["stage_name"]

                        match_configs, driver = get_match_configs(stage_config, driver, is_hdfs=IS_HDFS)

                        # Tally per stage: future matches are removed from the
                        # denominator, fetched/already-present ones count as done.
                        matches_count = len(match_configs)
                        fetched_count = 0

                        for match_id, start_time_string in match_configs:
                            status = get_match_data(
                                match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS
                            )
                            if status == MatchStatus.FUTURE_MATCH:
                                matches_count -= 1
                            elif status in (MatchStatus.FETCHED, MatchStatus.ALREADY_EXISTS):
                                fetched_count += 1

                        print(f"Successfully fetched {fetched_count} out of {matches_count} matches for stage {stage_name} with id {stage_id}")
                    else:
                        print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")

                # The driver is reset after every season that had stages.
                driver = reset_driver(driver)
            else:
                # A season without stages moves on without resetting the driver.
                print(f"WARNING: No stages found for season {season_id}. Skipping.")

finally:
    # Whatever happens above, do not leave a browser running.
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()

STARTING LEAGUE: germany-bundesliga
INFO: Starting a new browser session...
Season_ids of germany-bundesliga: ['10720', '10365', '9649', '9120', '8667', '8279', '7872', '7405', '6902', '6392', '5870']

--------------------
INFO: Starting new session for Season ID: 10720
Start writing stage data for stage 24478
SUCCESS: Wrote data for stage 24478 to local bronze/stage_data/league=germany-bundesliga/season=2025-2026/stage_id=24478/stage_info.json
Start getting match_configs for stage 24478
INFO: Closing final browser session.


KeyboardInterrupt: 

In [86]:
from datetime import datetime, timezone

def write_crawl_time(is_hdfs: bool):
    """Write the crawl timestamp marker to ``bronze/last_crawl_timestamp.txt``.

    The stored value is the current UTC time minus 20 days, formatted as
    ``YYYY-MM-DDTHH:MM:SSZ``.
    NOTE(review): the 20-day back-off looks like a deliberate overlap window
    for the next incremental crawl -- confirm with whoever reads this file.

    Args:
        is_hdfs: When true, write through the notebook-level HDFS ``client``
            under ``HDFS_PATH``; otherwise write to the local filesystem,
            relative to the current working directory.

    Returns:
        None. Errors from the filesystem or the HDFS client propagate.
    """
    crawl_time = datetime.now(timezone.utc) - timedelta(days=20)
    crawl_time_str = crawl_time.strftime("%Y-%m-%dT%H:%M:%SZ")

    file_path = "bronze/last_crawl_timestamp.txt"
    if is_hdfs:
        hdfs_path = os.path.join(HDFS_PATH, file_path)
        # The marker is rewritten after every crawl, so it must replace the
        # previous one; without overwrite=True the HDFS client refuses to
        # write to a path that already exists and the second run fails.
        with client.write(hdfs_path, encoding="utf-8", overwrite=True) as f:
            f.write(crawl_time_str)
        print(f"Wrote to HDFS: {hdfs_path}")

    else:
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        # Mode "w" truncates, so the local marker is always overwritten too.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(crawl_time_str)
        print(f"Wrote to local FS: {file_path}")

In [87]:
# Record the crawl timestamp on the storage backend selected by the
# notebook-level IS_HDFS flag (set in an earlier cell).
write_crawl_time(is_hdfs=IS_HDFS)

Wrote to local FS: bronze/last_crawl_timestamp.txt


In [95]:
# Leagues to crawl in this run. Each key is unpacked in the loop below as
# (region_id, tournament_id, start_year_after); the value is the league slug
# handed to process_season_stages.
league_mapping = {
    (206, 4, 2015): "spain-laliga",
    (250, 30, 2015): "europe-europa-league",

    (250, 751, 2022): "europe-european-championship-qualification",
    (247, 721, 2021): "international-world-cup-qualification-uefa",
    (247, 683, 2018): "international-uefa-nations-league-a",
    (247, 684, 2018): "international-uefa-nations-league-b",
    (247, 685, 2018): "international-uefa-nations-league-c",
    (247, 686, 2018): "international-uefa-nations-league-d"
}

# Storage backend flag, forwarded as is_hdfs= to every helper in this cell.
IS_HDFS = False

# Close any browser left over from an earlier cell. Looking the name up via
# globals() keeps the cell runnable on a fresh kernel where `driver` was
# never assigned (a bare `if driver:` would raise NameError there).
driver = globals().get("driver")
if driver:
    driver.quit()
    driver = None

# Matches whose first attempt returned MatchStatus.FAILED; each is retried
# exactly once after all leagues have been processed.
initial_failures = []

try:
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        if not driver:
            # Announce a new session only when one is actually created.
            print("INFO: Starting a new browser session...")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # The helpers hand a driver back alongside their result, so always
        # rebind `driver` to whatever they return.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")

            stage_ids, driver = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver, is_hdfs=IS_HDFS)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]
                stage_name = stage_config["stage_name"]

                match_configs, driver = get_match_configs(stage_config, driver, is_hdfs=IS_HDFS)

                matches_count = len(match_configs)
                fetched_count = 0 # Successfully fetched matches

                for match_id, start_time_string in match_configs:
                    status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)
                    if status in (MatchStatus.FETCHED, MatchStatus.ALREADY_EXISTS):
                        fetched_count += 1
                    elif status == MatchStatus.FUTURE_MATCH:
                        # Future matches are removed from the denominator of
                        # the per-stage summary.
                        matches_count -= 1
                    elif status == MatchStatus.FAILED:
                        initial_failures.append((match_id, start_time_string, stage_path))

                print(f"Successfully fetched {fetched_count} out of {matches_count} matches for stage {stage_name} with id {stage_id}")

            # Reset driver after every season
            driver = reset_driver(driver)

    # Retry loop for failed matches
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        for match_id, start_time_string, stage_path in initial_failures:
            # Pass is_hdfs explicitly so the retry targets the same storage
            # backend as the first attempt instead of get_match_data's default.
            status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)

            if status == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # Report
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()
        # Drop the dead handle so later cells do not mistake it for a live session.
        driver = None

STARTING LEAGUE: spain-laliga
INFO: Starting a new browser session...
Season_ids of spain-laliga: ['10803', '10317', '9682', '9149', '8681', '8321', '7889', '7466', '6960', '6436', '5933']

--------------------
INFO: Starting new session for Season ID: 10803
Start writing stage data for stage 24622
SUCCESS: Wrote data for stage 24622 to local bronze/stage_data/league=spain-laliga/season=2025-2026/stage_id=24622/stage_info.json
Start getting match_configs for stage 24622
Found 380 matches for stage 24622
Successfully fetched 31 out of 31 matches for stage LaLiga with id 24622
INFO: Starting a new browser session...

--------------------
INFO: Starting new session for Season ID: 10317
INFO: Starting a new browser session...
Start writing stage data for stage 23401
SUCCESS: Wrote data for stage 23401 to local bronze/stage_data/league=spain-laliga/season=2024-2025/stage_id=23401/stage_info.json
Start getting match_configs for stage 23401
Found 380 matches for stage 23401
Successfully fetch

In [None]:
# Leagues to crawl in this run. Each key is unpacked in the loop below as
# (region_id, tournament_id, start_year_after); the value is the league slug
# handed to process_season_stages.
league_mapping = {
    (247, 124, 2012): "international-european-championship"
}

# Storage backend flag, forwarded as is_hdfs= to every helper in this cell.
IS_HDFS = False

# Close any browser left over from an earlier cell. Looking the name up via
# globals() keeps the cell runnable on a fresh kernel where `driver` was
# never assigned (a bare `if driver:` would raise NameError there).
driver = globals().get("driver")
if driver:
    driver.quit()
    driver = None

# Matches whose first attempt returned MatchStatus.FAILED; each is retried
# exactly once after all leagues have been processed.
initial_failures = []

try:
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        if not driver:
            # Announce a new session only when one is actually created.
            print("INFO: Starting a new browser session...")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # The helpers hand a driver back alongside their result, so always
        # rebind `driver` to whatever they return.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")

            stage_ids, driver = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver, is_hdfs=IS_HDFS)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]
                stage_name = stage_config["stage_name"]

                match_configs, driver = get_match_configs(stage_config, driver, is_hdfs=IS_HDFS)

                matches_count = len(match_configs)
                fetched_count = 0 # Successfully fetched matches

                for match_id, start_time_string in match_configs:
                    status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)
                    if status in (MatchStatus.FETCHED, MatchStatus.ALREADY_EXISTS):
                        fetched_count += 1
                    elif status == MatchStatus.FUTURE_MATCH:
                        # Future matches are removed from the denominator of
                        # the per-stage summary.
                        matches_count -= 1
                    elif status == MatchStatus.FAILED:
                        initial_failures.append((match_id, start_time_string, stage_path))

                print(f"Successfully fetched {fetched_count} out of {matches_count} matches for stage {stage_name} with id {stage_id}")

            # Reset driver after every season
            driver = reset_driver(driver)

    # Retry loop for failed matches
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        for match_id, start_time_string, stage_path in initial_failures:
            # Pass is_hdfs explicitly so the retry targets the same storage
            # backend as the first attempt instead of get_match_data's default.
            status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)

            if status == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # Report
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()
        # Drop the dead handle so later cells do not mistake it for a live session.
        driver = None

STARTING LEAGUE: international-european-championship
INFO: Starting a new browser session...
Season_ids of international-european-championship: ['9299', '7329', '4246', '3164']

--------------------
INFO: Starting new session for Season ID: 9299
Start writing stage data for stage 21415
SUCCESS: Wrote data for stage 21415 to local bronze/stage_data/league=international-european-championship/season=2024/stage_id=21415/stage_info.json
Start getting match_configs for stage 21415
Found 15 matches for stage 21415
Successfully fetched 15 out of 15 matches for stage EURO Final Stage with id 21415
Start writing stage data for stage 23157
SUCCESS: Wrote data for stage 23157 to local bronze/stage_data/league=international-european-championship/season=2024/stage_id=23157/stage_info.json
Start getting match_configs for stage 23157
Found 0 matches for stage 23157
Successfully fetched 0 out of 0 matches for stage EURO Grp. D with id 23157
Start writing stage data for stage 21399
SUCCESS: Wrote data f

In [23]:
from datetime import datetime, timezone

def write_crawl_time(is_hdfs: bool):
    """Store the crawl timestamp marker, replacing any previous one.

    The marker holds the current UTC time minus 20 days, rendered as
    ``YYYY-MM-DDTHH:MM:SSZ``, and lives at ``bronze/last_crawl_timestamp.txt``
    (under ``HDFS_PATH`` when writing to HDFS, relative to the working
    directory otherwise).

    Args:
        is_hdfs: Truthy to write through the notebook-level HDFS ``client``,
            falsy to write to the local filesystem.

    Returns:
        None. Any exception raised while writing is caught and reported on
        stdout rather than propagated.
    """
    file_path = "bronze/last_crawl_timestamp.txt"
    stamp = (datetime.now(timezone.utc) - timedelta(days=20)).strftime("%Y-%m-%dT%H:%M:%SZ")

    try:
        if not is_hdfs:
            # Local filesystem: create bronze/ on demand, then truncate-and-write.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "w", encoding="utf-8") as out:
                out.write(stamp)
            print(f"SUCCESS: Wrote crawl timestamp to local FS: {file_path}")
            return

        # HDFS: ensure the parent directory exists, then overwrite the marker.
        hdfs_path = os.path.join(HDFS_PATH, file_path)
        client.makedirs(os.path.dirname(hdfs_path))
        with client.write(hdfs_path, encoding="utf-8", overwrite=True) as out:
            out.write(stamp)
        print(f"SUCCESS: Wrote crawl timestamp to HDFS: {hdfs_path}")

    except Exception as e:
        print(f"FATAL ERROR: Could not write crawl timestamp. Details: {e}")

In [26]:
# Switch the notebook-level backend flag to HDFS, then record the crawl
# timestamp there.
IS_HDFS = True
write_crawl_time(is_hdfs=IS_HDFS)

SUCCESS: Wrote crawl timestamp to HDFS: /user/dottier/bronze/last_crawl_timestamp.txt


In [None]:
# Leagues to crawl in this run. Each key is unpacked in the loop below as
# (region_id, tournament_id, start_year_after); the value is the league slug
# handed to process_season_stages.
league_mapping = {
    (81, 3, 2015): "germany-bundesliga",
    (252, 2, 2015): "england-premier-league",
    (108, 5, 2015): "italy-serie-a",
    (74, 22, 2015): "france-ligue-1",
    (206, 4, 2015): "spain-laliga",

    (250, 12, 2015): "europe-champions-league",
    (250, 30, 2015): "europe-europa-league",

    (247, 36, 2014): "international-fifa-world-cup",
    (247, 67, 2025): "international-fifa-club-world-cup",
    (247, 124, 2012): "international-european-championship",

    (250, 751, 2022): "europe-european-championship-qualification",
    (247, 721, 2021): "international-world-cup-qualification-uefa",
    (247, 683, 2018): "international-uefa-nations-league-a",
    (247, 684, 2018): "international-uefa-nations-league-b",
    (247, 685, 2018): "international-uefa-nations-league-c",
    (247, 686, 2018): "international-uefa-nations-league-d"
}

# NOTE: IS_HDFS is not assigned in this cell; it is read from whatever an
# earlier cell set and forwarded as is_hdfs= to every helper below.

# Close any browser left over from an earlier cell. Looking the name up via
# globals() keeps the cell runnable on a fresh kernel where `driver` was
# never assigned (a bare `if driver:` would raise NameError there).
driver = globals().get("driver")
if driver:
    driver.quit()
    driver = None

# Matches whose first attempt returned MatchStatus.FAILED; each is retried
# exactly once after all leagues have been processed.
initial_failures = []

try:
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        if not driver:
            # Announce a new session only when one is actually created.
            print("INFO: Starting a new browser session...")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # The helpers hand a driver back alongside their result, so always
        # rebind `driver` to whatever they return.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")

            stage_ids, driver = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver, is_hdfs=IS_HDFS)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]
                stage_name = stage_config["stage_name"]

                match_configs, driver = get_match_configs(stage_config, driver, is_hdfs=IS_HDFS)

                # Per-stage tallies: one counter per MatchStatus outcome.
                matches_count = len(match_configs)
                fetched_count = 0 # Successfully fetched matches
                already_exists_count = 0
                future_count = 0
                failed_count = 0

                for match_id, start_time_string in match_configs:
                    status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)
                    if status == MatchStatus.FETCHED:
                        fetched_count += 1
                    elif status == MatchStatus.ALREADY_EXISTS:
                        already_exists_count += 1
                    elif status == MatchStatus.FUTURE_MATCH:
                        future_count += 1
                    elif status == MatchStatus.FAILED:
                        failed_count += 1
                        initial_failures.append((match_id, start_time_string, stage_path))

                print(f"""
                Stage '{stage_name}' (ID: {stage_id}) summary:
                Total matches considered : {matches_count}
                Successfully fetched      : {fetched_count}
                Already existed           : {already_exists_count}
                Scheduled for future      : {future_count}
                Failed to fetch           : {failed_count}
                """)

            # Reset driver after every season
            driver = reset_driver(driver)

    # Retry loop for failed matches
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        for match_id, start_time_string, stage_path in initial_failures:
            # Pass is_hdfs explicitly so the retry targets the same storage
            # backend as the first attempt instead of get_match_data's default.
            status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)

            if status == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # Report
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()
        # Drop the dead handle so later cells do not mistake it for a live session.
        driver = None

STARTING LEAGUE: germany-bundesliga
INFO: Starting a new browser session...
INFO: Starting a new browser session...
Season_ids of germany-bundesliga: ['10720', '10365', '9649', '9120', '8667', '8279', '7872', '7405', '6902', '6392', '5870']

--------------------
INFO: Starting new session for Season ID: 10720
Start writing stage data for stage 24478
SUCCESS: Wrote data for stage 24478 to HDFS bronze/stage_data/league=germany-bundesliga/season=2025-2026/stage_id=24478/stage_info.json
Start getting match_configs for stage 24478
Found 306 matches for stage 24478

                Stage 'Bundesliga' (ID: 24478) summary:
                Total matches considered : 306
                Successfully fetched      : 0
                Already existed           : 18
                Scheduled for future      : 288
                Failed to fetch           : 0
                
INFO: Starting a new browser session...

--------------------
INFO: Starting new session for Season ID: 10365
Start writing st

In [17]:
# Leagues to crawl in this run. Each key is unpacked in the loop below as
# (region_id, tournament_id, start_year_after); the value is the league slug
# handed to process_season_stages.
league_mapping = {
    (247, 686, 2018): "international-uefa-nations-league-d"
}

# Storage backend flag, forwarded as is_hdfs= to every helper in this cell.
IS_HDFS = True

# Quit any browser left over from an earlier cell before dropping the
# reference; overwriting `driver` with None alone would orphan a live
# session. globals().get also keeps this safe on a fresh kernel.
driver = globals().get("driver")
if driver:
    driver.quit()
driver = None

# Matches whose first attempt returned MatchStatus.FAILED; each is retried
# exactly once after all leagues have been processed.
initial_failures = []

try:
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        if not driver:
            # Announce a new session only when one is actually created.
            print("INFO: Starting a new browser session...")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # The helpers hand a driver back alongside their result, so always
        # rebind `driver` to whatever they return.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")

            stage_ids, driver = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver, is_hdfs=IS_HDFS)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]
                stage_name = stage_config["stage_name"]

                match_configs, driver = get_match_configs(stage_config, driver, is_hdfs=IS_HDFS)

                # Per-stage tallies: one counter per MatchStatus outcome.
                matches_count = len(match_configs)
                fetched_count = 0 # Successfully fetched matches
                already_exists_count = 0
                future_count = 0
                failed_count = 0

                for match_id, start_time_string in match_configs:
                    status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)
                    if status == MatchStatus.FETCHED:
                        fetched_count += 1
                    elif status == MatchStatus.ALREADY_EXISTS:
                        already_exists_count += 1
                    elif status == MatchStatus.FUTURE_MATCH:
                        future_count += 1
                    elif status == MatchStatus.FAILED:
                        failed_count += 1
                        initial_failures.append((match_id, start_time_string, stage_path))

                print(f"""
                Stage '{stage_name}' (ID: {stage_id}) summary:
                Total matches considered : {matches_count}
                Successfully fetched      : {fetched_count}
                Already existed           : {already_exists_count}
                Scheduled for future      : {future_count}
                Failed to fetch           : {failed_count}
                """)

            # Reset driver after every season
            driver = reset_driver(driver)

    # Retry loop for failed matches
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        for match_id, start_time_string, stage_path in initial_failures:
            # Pass is_hdfs explicitly so the retry targets the same storage
            # backend as the first attempt instead of get_match_data's default.
            status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)

            if status == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # Report
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()
        # Drop the dead handle so later cells do not mistake it for a live session.
        driver = None

STARTING LEAGUE: international-uefa-nations-league-d
INFO: Starting a new browser session...
Season_ids of international-uefa-nations-league-d: ['9983', '8871', '8157', '7216']

--------------------
INFO: Starting new session for Season ID: 9983
Start writing stage data for stage 22917
SUCCESS: Wrote data for stage 22917 to HDFS bronze/stage_data/league=international-uefa-nations-league-d/season=2024-2025/stage_id=22917/stage_info.json
Start getting match_configs for stage 22917
Found 6 matches for stage 22917

                Stage 'UEFA Nations League D Grp. 1' (ID: 22917) summary:
                Total matches considered : 6
                Successfully fetched      : 0
                Already existed           : 6
                Scheduled for future      : 0
                Failed to fetch           : 0
                
Start writing stage data for stage 22918
SUCCESS: Wrote data for stage 22918 to HDFS bronze/stage_data/league=international-uefa-nations-league-d/season=2024-2025/

In [22]:
# Select the HDFS backend for the cells that follow; they read this flag and
# pass it on as is_hdfs=IS_HDFS.
IS_HDFS = True

In [37]:
import os
import json
import posixpath

# Root of the match-data tree to scan: relative to the working directory for
# local runs, anchored under HDFS_PATH for HDFS runs.
root_dir = "bronze/match_data"

if IS_HDFS:
    # HDFS paths are always '/'-separated, independent of the host OS.
    root_dir = posixpath.join(HDFS_PATH, root_dir)

# Running tallies for the summary printed at the end of this cell.
failed_count = 0
failed_matches = []
fixed_count = 0
fixed_matches = []

# Quit any browser left over from an earlier cell before starting a fresh
# one; simply rebinding `driver` would orphan a live session. globals().get
# keeps this safe on a fresh kernel where `driver` was never assigned.
driver = globals().get("driver")
if driver:
    driver.quit()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def hdfs_walk(client, root):
    """
    Recursively traverse an HDFS directory, mirroring os.walk's output shape.

    Yields (root, dirs, files) for `root` first and then for each
    subdirectory in turn (parent before children). `dirs` and `files` hold
    bare entry names as returned by `client.list(root, status=True)`; an
    entry counts as a directory when its status dict has
    type == 'DIRECTORY'. The yielded `dirs` list is the same object that is
    iterated for the descent, so a caller may prune it in place.
    """
    subdirs = []
    leaf_names = []
    for entry_name, entry_status in client.list(root, status=True):
        bucket = subdirs if entry_status['type'] == 'DIRECTORY' else leaf_names
        bucket.append(entry_name)

    yield root, subdirs, leaf_names

    for subdir in subdirs:
        yield from hdfs_walk(client, posixpath.join(root, subdir))


# Decide source: both walkers yield (dirpath, dirnames, filenames) triples.
if IS_HDFS:
    client = InsecureClient("http://localhost:9870", user="dottier")
    walker = hdfs_walk(client, root_dir)
else:
    walker = os.walk(root_dir)

# Re-fetch match_data.json for every match directory that has a
# match_preview.json but no match_data.json. The try/finally guarantees the
# browser is closed even when the scan is interrupted.
try:
    for dirpath, dirnames, filenames in walker:
        if "match_preview.json" not in filenames:
            continue

        # Read the preview from whichever backend is being scanned; HDFS paths
        # are joined with '/', local ones with the OS separator.
        preview_file = (posixpath if IS_HDFS else os.path).join(dirpath, "match_preview.json")
        try:
            if IS_HDFS:
                with client.read(preview_file, encoding="utf-8") as f:
                    data = json.load(f)
            else:
                with open(preview_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
        except Exception as e:
            # An unreadable preview counts as a failure; move on to the next directory.
            failed_count += 1
            failed_matches.append(f"{dirpath} (Exception)")
            print(f"[ERROR] Failed to read {preview_file}: {e}")
            continue

        start_time_utc = data.get("data", {}).get("startTimeUtc")
        match_id = data.get("metadata", {}).get("match_id")
        if not start_time_utc:
            # NOTE(review): this only warns; get_match_data is still called
            # below with start_time_string=None. Confirm it copes with that.
            print(f"{dirpath}: match_preview.json exists but startTimeUtc not found")

        if "match_data.json" in filenames:
            continue

        # Rebuild "league=.../season=.../stage_id=..." from the directory path.
        if IS_HDFS:
            # user/{username}/bronze/match_data/league=.../season=.../stage_id=.../match_id=...
            # The [4:7] slice assumes HDFS_PATH has exactly two components.
            segments = dirpath.strip("/").split("/")
            stage_path = "/".join(segments[4:7])
        else:
            # bronze/match_data/league=.../season=.../stage_id=.../match_id=...
            segments = dirpath.split(os.sep)
            stage_path = "/".join(segments[2:5])

        status = get_match_data(
            match_id=match_id,
            start_time_string=start_time_utc,
            stage_path=stage_path,
            driver=driver,
            is_hdfs=IS_HDFS
        )

        # Only FAILED and FETCHED are tallied; other statuses are ignored.
        if status == MatchStatus.FAILED:
            failed_count += 1
            failed_matches.append(f"{match_id} ({stage_path})")
        elif status == MatchStatus.FETCHED:
            fixed_count += 1
            fixed_matches.append(f"{match_id} ({stage_path})")

finally:
    # Make sure the browser is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()
        driver = None


print("\n=== Summary ===")
print(f"Successfully fixed: {fixed_count}")
print("Fixed matches:", fixed_matches)
print(f"Failed: {failed_count}")
print("Failed matches:", failed_matches)



SUCCESS: Wrote data for match 1492135
SUCCESS: Wrote data for match 1492137
SUCCESS: Wrote data for match 1492139
SUCCESS: Wrote data for match 1492142
SUCCESS: Wrote data for match 1492145
SUCCESS: Wrote data for match 1492148
SUCCESS: Wrote data for match 1492151
SUCCESS: Wrote data for match 1492155
SUCCESS: Wrote data for match 1492159
SUCCESS: Wrote data for match 1492163
SUCCESS: Wrote data for match 1492167
SUCCESS: Wrote data for match 1492171
SUCCESS: Wrote data for match 1492175
SUCCESS: Wrote data for match 1492179
SUCCESS: Wrote data for match 1492183
SUCCESS: Wrote data for match 1492197
SUCCESS: Wrote data for match 1492200
SUCCESS: Wrote data for match 1492203
SUCCESS: Wrote data for match 1492206
SUCCESS: Wrote data for match 1492209
SUCCESS: Wrote data for match 1492212
SUCCESS: Wrote data for match 1492215
SUCCESS: Wrote data for match 1492221
SUCCESS: Wrote data for match 1492247
SUCCESS: Wrote data for match 1492248
SUCCESS: Wrote data for match 1492249
SUCCESS: Wro

In [35]:
# Leagues to crawl in this run. Each key is unpacked in the loop below as
# (region_id, tournament_id, start_year_after); the value is the league slug
# handed to process_season_stages.
league_mapping = {
    (247, 124, 2012): "international-european-championship",
}

# NOTE: IS_HDFS is not assigned in this cell; it is read from whatever an
# earlier cell set and forwarded as is_hdfs= to every helper below.

# Close any browser left over from an earlier cell. Looking the name up via
# globals() keeps the cell runnable on a fresh kernel where `driver` was
# never assigned (a bare `if driver:` would raise NameError there).
driver = globals().get("driver")
if driver:
    driver.quit()
    driver = None

# Matches whose first attempt returned MatchStatus.FAILED; each is retried
# exactly once after all leagues have been processed.
initial_failures = []

try:
    for (region_id, tournament_id, start_year_after), league_name in league_mapping.items():
        print("="*50)
        print(f"STARTING LEAGUE: {league_name}")

        if not driver:
            # Announce a new session only when one is actually created.
            print("INFO: Starting a new browser session...")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # The helpers hand a driver back alongside their result, so always
        # rebind `driver` to whatever they return.
        season_ids, driver = get_season_ids(region_id, tournament_id, start_year_after, driver)
        print(f"Season_ids of {league_name}:", season_ids)

        for season_id in season_ids:
            print("\n" + "-"*20)
            print(f"INFO: Starting new session for Season ID: {season_id}")

            stage_ids, driver = get_stage_ids(region_id, tournament_id, season_id, driver)
            if not stage_ids:
                print(f"WARNING: No stages found for season {season_id}. Skipping.")
                continue

            for stage_id in stage_ids:
                stage_config = process_season_stages(region_id, tournament_id, season_id, stage_id, league_name, driver, is_hdfs=IS_HDFS)
                if not stage_config:
                    print(f"WARNING: Failed to process metadata for stage {stage_id}. Skipping to next stage.")
                    continue

                stage_path = stage_config["stage_path"]
                stage_name = stage_config["stage_name"]

                match_configs, driver = get_match_configs(stage_config, driver, is_hdfs=IS_HDFS)

                # Per-stage tallies: one counter per MatchStatus outcome.
                matches_count = len(match_configs)
                fetched_count = 0 # Successfully fetched matches
                already_exists_count = 0
                future_count = 0
                failed_count = 0

                for match_id, start_time_string in match_configs:
                    status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)
                    if status == MatchStatus.FETCHED:
                        fetched_count += 1
                    elif status == MatchStatus.ALREADY_EXISTS:
                        already_exists_count += 1
                    elif status == MatchStatus.FUTURE_MATCH:
                        future_count += 1
                    elif status == MatchStatus.FAILED:
                        failed_count += 1
                        initial_failures.append((match_id, start_time_string, stage_path))

                print(f"""
                Stage '{stage_name}' (ID: {stage_id}) summary:
                Total matches considered : {matches_count}
                Successfully fetched      : {fetched_count}
                Already existed           : {already_exists_count}
                Scheduled for future      : {future_count}
                Failed to fetch           : {failed_count}
                """)

            # Reset driver after every season
            driver = reset_driver(driver)

    # Retry loop for failed matches
    final_failures = []
    if initial_failures:
        print("\n" + "="*50)
        print(f"RETRYING {len(initial_failures)} FAILED MATCHES...")
        print("="*50)

        for match_id, start_time_string, stage_path in initial_failures:
            # Pass is_hdfs explicitly so the retry targets the same storage
            # backend as the first attempt instead of get_match_data's default.
            status = get_match_data(match_id, start_time_string, stage_path, driver, is_hdfs=IS_HDFS)

            if status == MatchStatus.FAILED:
                print(f"  -> RETRY FAILED: Match {match_id} still failing.")
                final_failures.append((match_id, start_time_string, stage_path))
            else:
                print(f"  -> RETRY SUCCESS: Match {match_id} scraped successfully.")

    # Report
    print("\n" + "="*50)
    print("CRAWL COMPLETE.")
    if final_failures:
        print(f"The following {len(final_failures)} matches could not be scraped after all retries:")
        for match_info in final_failures:
            print(f"  - Match ID: {match_info[0]}")
    else:
        print("All matches were scraped successfully (or were already present).")
    print("="*50)

finally:
    # Make sure the final driver is always closed, even if there's an error
    if driver:
        print("INFO: Closing final browser session.")
        driver.quit()
        # Drop the dead handle so later cells do not mistake it for a live session.
        driver = None

STARTING LEAGUE: international-european-championship
INFO: Starting a new browser session...
Season_ids of international-european-championship: ['9299', '7329', '4246', '3164']

--------------------
INFO: Starting new session for Season ID: 9299
Start writing stage data for stage 21415
SUCCESS: Wrote data for stage 21415 to HDFS bronze/stage_data/league=international-european-championship/season=2024/stage_id=21415/stage_info.json
Start getting match_configs for stage 21415
Found 15 matches for stage 21415

                Stage 'EURO Final Stage' (ID: 21415) summary:
                Total matches considered : 15
                Successfully fetched      : 0
                Already existed           : 15
                Scheduled for future      : 0
                Failed to fetch           : 0
                
Start writing stage data for stage 23157
SUCCESS: Wrote data for stage 23157 to HDFS bronze/stage_data/league=international-european-championship/season=2024/stage_id=23157/stag