In [1]:
import json
import re
from datetime import datetime
from pathlib import Path

from scraper import MTBEventsPage, MTBRacesPage, MTBResultsPage

In [2]:
def custom_serializer(obj):
    """Fallback encoder for ``json.dump`` that handles ``datetime`` values.

    A ``datetime`` is reduced to its calendar date and returned as a
    ``YYYY-MM-DD`` string (the time-of-day part is dropped). Any other
    type is rejected.

    Raises:
        TypeError: if ``obj`` is not a ``datetime`` instance.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Type {type(obj)} not serializable")
    # Only the date portion is serialized; hours/minutes/seconds are discarded.
    return obj.date().isoformat()

# Events by Year

In [3]:
year = 2023

# Everything written for this season goes under ./data/<year>; create the
# directory (and any missing parents) up front. Re-running the cell is
# harmless because an existing directory is accepted.
year_folder = Path(f"./data/{year}")
year_folder.mkdir(exist_ok=True, parents=True)

Extract all event information for the given year. We use Selenium here because, without the interactive element, the result URLs don't actually show up in the HTML.

In [None]:
# Build the events page for the configured year with Selenium enabled,
# then pull the list of events from it.
event_page = MTBEventsPage(
    year,
    use_selenium=True,
)
events = event_page.fetch_events()

# Enrich Events

In [5]:
# For every event, build a new dict holding the event's own fields plus a
# 'races' entry fetched from the event's results page. The dicts in `events`
# themselves are not modified (each one is shallow-copied).
enriched_events = [
    dict(event, races=MTBRacesPage(event['results_url']).fetch_races())
    for event in events
]

# Get Race Results

In [None]:
for event in enriched_events:
    for race in event['races']:
        # Identifier of the form <discipline>_<gender>_<category>_<race_type>,
        # with spaces turned into underscores.
        name = "_".join(
            f"{race[key]}"
            for key in ('discipline', 'gender', 'category', 'race_type')
        ).replace(" ", "_")
        print(f"Extracting {event['location']} {name}")

        page = MTBResultsPage(race['url'])

        # The race dict is enriched in place. The keys are added in this
        # order (event, name, date, results), which is also the order they
        # appear in when the dict is later dumped to JSON.
        race['event'] = event['location']
        # The stored name additionally has hyphens flattened to underscores.
        race['name'] = name.replace("-", "_")
        race['date'] = page.fetch_results_date()
        race['results'] = page.fetch_results()

# Save Outputs

In [7]:
for num, event in enumerate(enriched_events):
    # Create a folder for each event for the year, named
    # "<two-digit index>_<location slug>". The slug turns hyphens (together
    # with any surrounding whitespace) and runs of whitespace into a single
    # underscore, drops commas and lower-cases the result.
    # NOTE: the whitespace alternative must be `\s+`, not `\s*`. A pattern
    # that can match the empty string makes re.sub insert "_" between every
    # character (e.g. "Nove Mesto" -> "_n_o_v_e__m_e_s_t_o_").
    location = re.sub(r'\s*-\s*|\s+', '_', event["location"]) \
                 .replace(",", "") \
                 .lower()
    event_folder = year_folder / f"{num:02d}_{location}"
    event_folder.mkdir(parents=True, exist_ok=True)

    # Save the event details to a JSON file. The file is opened as UTF-8
    # explicitly: ensure_ascii=False writes non-ASCII characters verbatim,
    # and the platform's default encoding may be unable to encode them.
    event_file = event_folder / "event.json"
    with open(event_file, "w", encoding="utf-8") as f:
        json.dump(event, f, default=custom_serializer,
                  ensure_ascii=False, indent=2)

    # Write one JSON file per race, grouped into a folder per discipline.
    for race in event['races']:
        # Create a folder
        result_folder = event_folder / "results" / race['discipline']
        result_folder.mkdir(parents=True, exist_ok=True)

        # Save. The race name only serves as the file name, so it is left
        # out of the written payload. A separate variable is used so the
        # loop variable (and the dict stored in event['races']) is untouched.
        race_file = result_folder / f"{race['name']}.json"
        race_payload = {k: v for k, v in race.items() if k != "name"}
        with open(race_file, "w", encoding="utf-8") as f:
            json.dump(race_payload, f, default=custom_serializer,
                      ensure_ascii=False, indent=2)