# **ADA Race Results - Gather the Data**

Run on Python 3.12 | No errors | No warnings

I created this program to automate the process of connecting to the race results, finding the headers, and extracting the data row by row across multiple pages. Manually selecting, copying, and pasting records is not only difficult and slow but also prone to errors. Initially, the dataset is small when results submission first opens, but it quickly grows to several thousand records once the event officially starts. Automating this process saves a great deal of time and reduces stress.

The 5K@ADA posts only general participant data; it includes no sensitive information such as exact age, email, or precise location. The site has no restrictions on automated data collection methods.

In [1]:
# Step 0: Load the required libraries

# To automate web browser interaction
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# To parse the HTML content of a webpage
from bs4 import BeautifulSoup

# For data manipulation
import pandas as pd

# To handle regular expressions for pattern matching and string manipulation
import re

# For adding delays between page loads
import time

In [2]:
# Step 1: Set up the WebDriver

# Initialize the WebDriver (Chrome in this case) and navigate to the target URL.
# `driver` is a module-level browser session: later cells read the page source
# from it, use it to click "Next", and finally close it with driver.quit().
driver = webdriver.Chrome()  # or `webdriver.Firefox()`, etc.
url = "https://results.raceroster.com/v2/en-US/results/mme3nnx552p7v7ks/results"
driver.get(url)

In [3]:
# Step 2: Define a function to clean headers

# This function removes unwanted text from headers and trims whitespace
def clean_header(header):
    """Strip the sorting-hint text and surrounding whitespace from a header.

    Args:
        header: Raw text of a table header cell.

    Returns:
        The header text with every occurrence of the sorting hint removed
        and leading/trailing whitespace trimmed.
    """
    sorting_hint = "Click on any of the columns headers to apply sorting"
    without_hint = header.replace(sorting_hint, "")
    return without_hint.strip()

In [4]:
# Step 3: Define a function to extract data from the current page
def extract_data_from_page():
    """Parse the page currently loaded in the browser and return its table.

    Reads the HTML from the module-level ``driver`` and inspects the first
    ``<table>`` element found in the document.

    Returns:
        A ``(headers, results)`` tuple. ``headers`` is the list of cleaned
        ``<th>`` texts inside ``<thead>`` (empty when there is no
        ``<thead>``). ``results`` holds one entry per ``<tr>`` inside
        ``<tbody>``, each entry being the list of stripped ``<td>`` texts of
        that row (empty when there is no ``<tbody>``).

    Raises:
        ValueError: If the page contains no ``<table>`` element.
    """
    parsed_page = BeautifulSoup(driver.page_source, 'html.parser')

    table = parsed_page.find('table')
    if table is None:
        raise ValueError("No table found on the page.")

    header_section = table.find('thead')
    body_section = table.find('tbody')

    # Column names: one cleaned string per <th> in the header section.
    header_cells = header_section.find_all('th') if header_section else []
    headers = [clean_header(th.get_text(strip=True)) for th in header_cells]

    # Data rows: one list of cell texts per <tr> in the body section.
    body_rows = body_section.find_all('tr') if body_section else []
    results = [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in body_rows
    ]

    return headers, results

In [5]:
# Step 4: Initialize the accumulators shared by the extraction cells below
all_results = []  # one list of cell texts per table row, across all pages
headers = []  # column names; filled from the first page that provides them

In [6]:
# Step 5: Extract data from the first page (the one loaded by driver.get)
page_headers, page_results = extract_data_from_page()
# Keep headers that were already captured; otherwise adopt this page's headers
headers = headers or page_headers
all_results += page_results

In [7]:
# Step 6: Loop through the pagination to extract data from subsequent pages
# TimeoutException is what WebDriverWait.until raises when its condition is
# not met in time; catching it by name lets the loop report the normal end of
# pagination clearly instead of printing an empty "Message: ".
from selenium.common.exceptions import TimeoutException

while True:
    try:
        # Find the "Next" button using its aria-label attribute
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Next']"))
        )
    except TimeoutException:
        # No clickable "Next" button appeared within 10 seconds. This is the
        # expected way for the loop to finish.
        print("Stopping pagination: no clickable 'Next' button found; assuming the last page was reached.")
        break
    except Exception as e:
        # Any other failure while looking for the button: stop, but keep the
        # rows collected so far.
        print(f"Stopping pagination: {type(e).__name__}: {e}")
        break

    try:
        # Click the "Next" button using JavaScript to ensure it works
        driver.execute_script("arguments[0].click();", next_button)

        # Give the next page time to render, then confirm a table is present.
        # NOTE(review): a table is already present before the click, so the
        # fixed sleep is what actually covers the page transition.
        time.sleep(3)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table"))
        )

        # Extract data from the new page and add to all_results
        page_headers, page_results = extract_data_from_page()
        if not headers:
            headers = page_headers
        all_results.extend(page_results)

    except Exception as e:
        # Best effort: report what went wrong (the exception type is included
        # because some Selenium errors carry an empty message) and stop.
        print(f"Stopping pagination: {type(e).__name__}: {e}")
        break

Stopping pagination: Message: 



In [8]:
# Step 7: Convert the extracted data to a DataFrame
# Each entry of all_results becomes one row; headers supplies the column names.
df0 = pd.DataFrame(all_results, columns=headers)

In [9]:
# Verify the columns and data by previewing the first rows
df0.head()

Unnamed: 0,Place,Bib,Name,Country,Time,Age Group,Age Group Place,Gender Place,Age Group Gender Place,Enrollment
0,1,4029.0,Youssouf Mahamat Allamine Tahir,TD,14:45,30 - 34,1 / 697,1 / 3087,1 / 367,Run / Walk
1,2,9484.0,Jhan Carlos,CO,14:45,20 - 24,1 / 288,2 / 3087,1 / 161,Run / Walk
2,3,,Nyasha Dzivai,ZA,14:45,35 - 39,1 / 900,3 / 3087,1 / 469,Run / Walk
3,4,16843.0,Chadwick Pridgen,US,14:45,40 - 44,1 / 926,4 / 3087,1 / 498,Run / Walk
4,5,17013.0,Neftali Pérez,VE,14:45,50 - 54,1 / 608,5 / 3087,1 / 319,Run / Walk


In [10]:
# Check the data type and non-null count of every column
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5743 entries, 0 to 5742
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Place                   5743 non-null   object
 1   Bib                     5743 non-null   object
 2   Name                    5743 non-null   object
 3   Country                 5743 non-null   object
 4   Time                    5743 non-null   object
 5   Age Group               5743 non-null   object
 6   Age Group Place         5743 non-null   object
 7   Gender Place            5743 non-null   object
 8   Age Group Gender Place  5743 non-null   object
 9   Enrollment              5743 non-null   object
dtypes: object(10)
memory usage: 448.8+ KB


In [11]:
# Ensure the "place" columns are treated as text by prepending an apostrophe
# to every value (e.g. "1 / 697" becomes "'1 / 697").
for place_column in ('Gender Place', 'Age Group Place', 'Age Group Gender Place'):
    df0[place_column] = df0[place_column].apply(lambda x: f"'{x}")

In [12]:
# Normalize the Time column to ensure a consistent format
def normalize_time(time_str):
    """Return a race time in zero-padded ``HH:MM:SS`` form.

    Args:
        time_str: Time text such as ``"14:45"`` (mm:ss) or ``"1:02:03"``
            (h:mm:ss).

    Returns:
        ``mm:ss`` input gains a leading ``"00"`` hours field, and every purely
        numeric field is left-padded to two digits, so both input shapes come
        out as ``HH:MM:SS``. Non-string values and text that does not have
        two or three colon-separated fields are returned unchanged.
    """
    if not isinstance(time_str, str):
        # Missing values (None/NaN) have no .split(); pass them through.
        return time_str

    parts = time_str.split(':')
    if len(parts) == 2:  # mm:ss -> add the missing hours field
        parts = ['00', *parts]
    elif len(parts) != 3:
        return time_str  # Not a recognised time shape; leave it alone

    # Pad only purely numeric fields so values such as "45.3" are not altered.
    return ':'.join(p.zfill(2) if p.isdigit() else p for p in parts)

# Run normalize_time on every value of the Time column and store the result
df0['Time'] = df0['Time'].apply(normalize_time)

In [13]:
# Verify the columns and data again after normalization by previewing the first rows
df0.head()

Unnamed: 0,Place,Bib,Name,Country,Time,Age Group,Age Group Place,Gender Place,Age Group Gender Place,Enrollment
0,1,4029.0,Youssouf Mahamat Allamine Tahir,TD,00:14:45,30 - 34,'1 / 697,'1 / 3087,'1 / 367,Run / Walk
1,2,9484.0,Jhan Carlos,CO,00:14:45,20 - 24,'1 / 288,'2 / 3087,'1 / 161,Run / Walk
2,3,,Nyasha Dzivai,ZA,00:14:45,35 - 39,'1 / 900,'3 / 3087,'1 / 469,Run / Walk
3,4,16843.0,Chadwick Pridgen,US,00:14:45,40 - 44,'1 / 926,'4 / 3087,'1 / 498,Run / Walk
4,5,17013.0,Neftali Pérez,VE,00:14:45,50 - 54,'1 / 608,'5 / 3087,'1 / 319,Run / Walk


In [14]:
# Check the data type and non-null count of every column again
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5743 entries, 0 to 5742
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Place                   5743 non-null   object
 1   Bib                     5743 non-null   object
 2   Name                    5743 non-null   object
 3   Country                 5743 non-null   object
 4   Time                    5743 non-null   object
 5   Age Group               5743 non-null   object
 6   Age Group Place         5743 non-null   object
 7   Gender Place            5743 non-null   object
 8   Age Group Gender Place  5743 non-null   object
 9   Enrollment              5743 non-null   object
dtypes: object(10)
memory usage: 448.8+ KB


In [15]:
# Step 8: Save the DataFrame to a CSV file without the index
# The relative path means the file is written to the current working directory.
df0.to_csv('ADA_race_results.csv', index=False)

In [16]:
# Step 9: Close the WebDriver
# quit() ends the browser session that was started in Step 1.
driver.quit()