In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

# Browser configuration: a specific Chrome binary driven by a local chromedriver.
# NOTE(review): both paths below are absolute and machine-specific; adjust them
# before running this notebook on another machine.
chrome_options = Options()
chrome_options.binary_location = "/Users/lilykoffman/Documents/athletics/chrome-mac-x64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing"

service = Service("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(service=service, options=chrome_options)

# Bound before the try block so later cells always find the name (and never a
# stale value from a previous run), even when the page load below raises.
games_urls = []

try:
    driver.get("https://worldathletics.org/results/olympic-games")
    time.sleep(3)  # wait for page to load

    # Find all <a> tags under the table with meeting names
    links = driver.find_elements(By.CSS_SELECTOR, "td[data-th='MEETING NAME'] a")

    for link in links:
        url = link.get_attribute("href")
        name = link.text
        print(f"{name}: {url}")
        # An anchor without an href yields None; it is still reported above but
        # not stored, because it could not be loaded by a later cell.
        if url:
            games_urls.append(url)

finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()


The XXXIII Olympic Games: https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087
The XXXII Olympic Games (Athletics): https://worldathletics.org/results/olympic-games/2021/the-xxxii-olympic-games-athletics-6568
The XXXI Olympic Games: https://worldathletics.org/results/olympic-games/2016/the-xxxi-olympic-games-5771
The XXX Olympic Games: https://worldathletics.org/results/olympic-games/2012/the-xxx-olympic-games-4871
The XXIX Olympic Games: https://worldathletics.org/results/olympic-games/2008/the-xxix-olympic-games-3659
28th Olympic Games: https://worldathletics.org/results/olympic-games/2004/28th-olympic-games-3201
27th Olympic Games: https://worldathletics.org/results/olympic-games/2000/27th-olympic-games-2363


In [2]:


def get_event_links(driver, games_url):
    """Collect the per-event page links listed on one Games overview page.

    Loads ``games_url`` in ``driver``, pauses three seconds, then parses the
    page source and walks the body rows of ``table.records-table``.

    Args:
        driver: Selenium WebDriver used to load the page.
        games_url: URL of the overview page to load.

    Returns:
        A list of dicts with the keys ``sex``, ``event``, ``round`` (text of
        the second, third and fourth cell of the row), ``label`` (link text)
        and ``url`` (absolute link target). One dict is produced for the first
        link found in each of the fifth and later cells of a row.
    """
    # Function-scope import keeps the function usable regardless of which
    # notebook cells have already been executed.
    from urllib.parse import urljoin

    base_url = "https://worldathletics.org"

    print(f"Loading: {games_url}")
    driver.get(games_url)
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    all_event_links = []
    rows = soup.select("table.records-table tbody tr")

    for row in rows:
        cells = row.find_all("td")
        # Cells 1-3 are read below. A row with fewer than four cells would
        # raise IndexError and make the caller discard the whole page, so such
        # rows are skipped instead.
        if len(cells) < 4:
            continue

        sex = cells[1].text.strip()
        event_name = cells[2].text.strip()
        round_name = cells[3].text.strip()

        # Extract links (e.g., startlist, result, summary, etc.)
        for link_cell in cells[4:]:
            link_tag = link_cell.find("a")
            if link_tag and link_tag.get("href"):
                href = link_tag["href"]
                text = link_tag.text.strip()

                all_event_links.append({
                    "sex": sex,
                    "event": event_name,
                    "round": round_name,
                    "label": text,
                    # urljoin resolves site-relative paths against the base and
                    # leaves already-absolute hrefs untouched.
                    "url": urljoin(base_url, href)
                })

    print(f"✓ Found {len(all_event_links)} event links")
    return all_event_links



In [3]:
driver = webdriver.Chrome(service=service, options=chrome_options)

# One dict per event link across every Games page, each tagged with the page it
# was found on.
all_events = []

try:
    for games_url in games_urls:
        try:
            events = get_event_links(driver, games_url)
            for event in events:
                event["games_url"] = games_url
                all_events.append(event)
            print(f"✓ Fetched {len(events)} events from: {games_url}")
        except Exception as e:
            # Best effort: a failure on one Games page is reported and the
            # loop moves on to the next page.
            print(f"✗ Error processing {games_url}: {e}")
        time.sleep(2)  # Politeness delay

    print(f"\n✅ Total events collected: {len(all_events)}")
finally:
    # Close the browser even if the loop is interrupted (e.g. KeyboardInterrupt),
    # which the inner `except Exception` does not catch.
    driver.quit()

Loading: https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087
✓ Found 391 event links
✓ Fetched 391 events from: https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087
Loading: https://worldathletics.org/results/olympic-games/2021/the-xxxii-olympic-games-athletics-6568
✓ Found 357 event links
✓ Fetched 357 events from: https://worldathletics.org/results/olympic-games/2021/the-xxxii-olympic-games-athletics-6568
Loading: https://worldathletics.org/results/olympic-games/2016/the-xxxi-olympic-games-5771
✓ Found 347 event links
✓ Fetched 347 events from: https://worldathletics.org/results/olympic-games/2016/the-xxxi-olympic-games-5771
Loading: https://worldathletics.org/results/olympic-games/2012/the-xxx-olympic-games-4871
✓ Found 325 event links
✓ Fetched 325 events from: https://worldathletics.org/results/olympic-games/2012/the-xxx-olympic-games-4871
Loading: https://worldathletics.org/results/olympic-games/2008/the-xxix-olympic-

In [4]:
# Keep only the event links whose round label contains the text "Final".
# NOTE(review): this is a substring test, so a label such as "Semi-Final" would
# also be kept -- confirm that is intended.
final_events = list(
    filter(lambda entry: "Final" in entry.get("round", ""), all_events)
)



In [5]:
def scrape_results_table(driver, url):
    """Load ``url`` and return the rows of its ``table.records-table``.

    The page is loaded in ``driver``, followed by a fixed two-second pause
    before the page source is parsed.

    Note: a later definition with the same name in this notebook (taking an
    additional ``timeout`` argument) rebinds the name once it has run.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to scrape.

    Returns:
        A list of dicts mapping each header text to the text of the matching
        cell, one dict per body row. Rows whose cell count differs from the
        header count are left out. An empty list is returned when the page
        has no such table.
    """
    print(f"Loading results page: {url}")
    driver.get(url)
    time.sleep(2)

    page = BeautifulSoup(driver.page_source, "html.parser")

    results_table = page.select_one("table.records-table")
    if not results_table:
        print("No table found on page.")
        return []

    column_names = [th.get_text(strip=True) for th in results_table.select("thead th")]
    expected_width = len(column_names)

    records = []
    for tr in results_table.select("tbody tr"):
        tds = tr.find_all("td")
        # Only rows that line up one-to-one with the header row are kept.
        if tds and len(tds) == expected_width:
            texts = (td.get_text(strip=True) for td in tds)
            records.append(dict(zip(column_names, texts)))

    return records

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def scrape_results_table(driver, url, timeout=60):
    """Load ``url``, wait for its results table, and return the table rows.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to scrape.
        timeout: Maximum number of seconds to wait for an element matching
            ``table.records-table`` to be present.

    Returns:
        A list of dicts mapping each header text to the text of the matching
        cell, one dict per body row. Rows whose cell count differs from the
        header count are left out. An empty list is returned when no table is
        found or when any exception is raised while loading or parsing; the
        exception is printed rather than propagated.
    """
    print(f"Loading results page: {url}")
    try:
        driver.get(url)

        # Block until the results table exists in the DOM (or the wait times out).
        table_locator = (By.CSS_SELECTOR, "table.records-table")
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(table_locator)
        )

        page = BeautifulSoup(driver.page_source, "html.parser")
        results_table = page.select_one("table.records-table")
        if not results_table:
            print("No table found on page.")
            return []

        column_names = [th.get_text(strip=True) for th in results_table.select("thead th")]
        expected_width = len(column_names)

        records = []
        for tr in results_table.select("tbody tr"):
            tds = tr.find_all("td")
            # Only rows that line up one-to-one with the header row are kept.
            if tds and len(tds) == expected_width:
                texts = (td.get_text(strip=True) for td in tds)
                records.append(dict(zip(column_names, texts)))

        return records

    except Exception as exc:
        print(f"❌ Error loading results table from {url}: {exc}")
        return []


In [15]:
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    for event in final_events:
        result_url = event["url"]
        results = scrape_results_table(driver, result_url)
        print(f"{len(results)} results scraped from {result_url}")
        # The scraped rows are stored on the event dict itself so the metadata
        # and the table rows travel together into the export step.
        event["results"] = results
finally:
    # Close the browser even if this long-running loop is interrupted part-way.
    driver.quit()


Loading results page: https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087/men/20-kilometres-race-walk/final/startlist#resultheader
49 results scraped from https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087/men/20-kilometres-race-walk/final/startlist#resultheader
Loading results page: https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087/men/20-kilometres-race-walk/final/result#resultheader
49 results scraped from https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087/men/20-kilometres-race-walk/final/result#resultheader
Loading results page: https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087/women/20-kilometres-race-walk/final/startlist#resultheader
45 results scraped from https://worldathletics.org/results/olympic-games/2024/the-xxxiii-olympic-games-7087/women/20-kilometres-race-walk/final/startlist#resultheader
Loading results pa

In [16]:
import pandas as pd

# Split the final-round entries into two groups by the text of their link label.
startlist_entries = list(
    filter(lambda entry: entry["label"] == "Startlist", final_events)
)
result_entries = list(
    filter(lambda entry: entry["label"] == "Result", final_events)
)

In [17]:
def expand_event_entries(entries):
    """Flatten scraped events into one DataFrame row per result-table row.

    Args:
        entries: Iterable of event dicts. Each must carry the keys ``sex``,
            ``event``, ``round``, ``label``, ``url`` and ``games_url``, and
            may carry ``results``, a list of row dicts.

    Returns:
        A pandas DataFrame in which every row combines the event metadata
        with one result row; on a key clash the result row's value wins.
        An event whose ``results`` is missing, ``None`` or empty contributes
        no rows.

    Raises:
        KeyError: If an event lacks one of the metadata keys.
    """
    meta_keys = ("sex", "event", "round", "label", "url", "games_url")

    rows = []
    for event in entries:
        meta = {key: event[key] for key in meta_keys}
        # An event that was never scraped (e.g. the scraping loop was
        # interrupted) has no "results" key; treat it as having zero rows
        # instead of aborting the whole export.
        for result_row in event.get("results") or []:
            rows.append({**meta, **result_row})
    return pd.DataFrame(rows)

# Build one flat DataFrame per label group (start lists first, then results).
startlist_df, result_df = (
    expand_event_entries(group) for group in (startlist_entries, result_entries)
)

# Persist both tables as CSV files, without the DataFrame index column.
for frame, csv_path in (
    (startlist_df, "olympic_startlists.csv"),
    (result_df, "olympic_results.csv"),
):
    frame.to_csv(csv_path, index=False)