In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# Scrape configuration: every (gender, age category, event) combination
# below is requested by the main loop.
genders = ["men", "women"]
age_categories = ["senior", "u20", "u18"]

# URL slug of each sprint event -> the id sent as the `eventId` query parameter.
events = dict(
    [
        ("100-metres", "10229509"),
        ("200-metres", "10229511"),
        ("400-metres", "10229512"),
    ]
)

# Toplist URL template; the three placeholders are filled in per request.
base_url = (
    "https://worldathletics.org/records/all-time-toplists/sprints/"
    "{event}/all/{gender}/{age_category}"
)

# Shared accumulator: the scraper appends one dict per result row.
all_data = []


In [None]:
def scrape_paginated_event(event_name, event_id, gender, age_category,
                           last_day="2025-05-12", timeout=30):
    """Scrape every page of one toplist and append its rows to ``all_data``.

    Pages are requested starting at 1. Pagination stops when a request
    returns a non-success HTTP status, when the page has no
    ``records-table`` table, when the table has no rows, or when a page
    yields exactly the same records as the page before it.

    Args:
        event_name: Event slug substituted into ``base_url``; also stored
            in each record's "Event" field.
        event_id: Value sent as the ``eventId`` query parameter.
        gender: Gender slug substituted into ``base_url``; also stored in
            each record's "Gender" field.
        age_category: Age-category slug substituted into ``base_url`` and
            sent as the ``ageCategory`` query parameter.
        last_day: Value sent as the ``lastDay`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        None. Records are appended to the module-level ``all_data`` list.

    Raises:
        requests.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # Neither the URL nor the headers depend on the page number.
    url = base_url.format(event=event_name, gender=gender, age_category=age_category)
    headers = {"User-Agent": "Mozilla/5.0"}

    previous_records = None
    page = 1
    while True:
        params = {
            "regionType": "world",
            "timing": "electronic",
            "windReading": "regular",
            "page": page,
            "bestResultsOnly": "false",
            "firstDay": "1900-01-01",
            "lastDay": last_day,
            "maxResultsByCountry": "all",
            "eventId": event_id,
            "ageCategory": age_category
        }

        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if not response.ok:
            # Report the failure instead of silently treating an error page
            # as "no more data".
            print(f"[!] HTTP {response.status_code} on page {page} for "
                  f"{event_name} - {gender} - {age_category}; stopping")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        table = soup.find('table', class_='records-table')
        if table is None:
            break

        # Fall back to the table itself when the markup has no <tbody>.
        body = table.find('tbody')
        rows = (body if body is not None else table).find_all('tr')
        if not rows:
            break  # No more data

        page_records = []
        for row in rows:
            cols = row.find_all('td')
            # Header/notice rows have no (or too few) <td> cells; the highest
            # index read below is 10.
            if len(cols) < 11:
                continue
            anchor = cols[3].find('a')
            href = anchor.get('href') if anchor is not None else None
            page_records.append({
                "Event": event_name,
                "Gender": gender,
                "Age Category": age_category,
                "Rank": cols[0].text.strip(),
                "Mark": cols[1].text.strip(),
                "Wind": cols[2].text.strip(),
                "Competitor": cols[3].text.strip(),
                "Competitor Link": "https://worldathletics.org" + href if href else None,
                "DOB": cols[4].text.strip(),
                "Nationality": cols[5].text.strip(),  # country name/code
                "Position": cols[6].text.strip(),
                # cols[7] is intentionally not read.
                "Venue": cols[8].text.strip(),
                "Date": cols[9].text.strip(),
                "ResultScore": cols[10].text.strip()
            })

        # Guard against an endless loop: if the server keeps answering
        # out-of-range page numbers with the same content, stop here rather
        # than appending duplicates forever.
        if page_records == previous_records:
            break

        all_data.extend(page_records)
        previous_records = page_records

        print(f"[✓] Page {page} done for {event_name} - {gender} - {age_category}")
        page += 1
        time.sleep(1)  # Respectful scraping delay


In [None]:
# Main loop: scrape every gender x age category x event combination.
# The loop always restarts from the first combination, so empty the shared
# result list first; otherwise re-running this cell would append a second
# copy of every row to `all_data`.
all_data.clear()
for gender in genders:
    for age_category in age_categories:
        for event_name, event_id in events.items():
            scrape_paginated_event(event_name, event_id, gender, age_category)

In [None]:
# Build one table from all scraped rows and write it to CSV,
# leaving the DataFrame index out of the file.
df = pd.DataFrame(data=all_data)
df.to_csv(path_or_buf='world_athletics_toplist_full.csv', index=False)
print("✅ Scraping complete. Data saved to world_athletics_toplist_full.csv")