# **EASD Race Results - Gather the Data**

Run on Python 3.12 | No errors | No warnings

In [1]:
# Step 0: Load the required libraries

# To automate web browser interaction
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# To parse the HTML content of a webpage
from bs4 import BeautifulSoup

# For data manipulation
import pandas as pd

# To handle regular expressions for pattern matching and string manipulation
import re

# For adding delays (e.g. pausing while a page loads)
import time

In [2]:
# Step 1: Set up the WebDriver

# Target: page 1 of the race results listing
url = "https://results.raceroster.com/v2/en-US/results/ggzzvyenhfrtr64g/results?page=1"

# Start a Chrome session (or `webdriver.Firefox()`, etc.) and open the target URL in it
driver = webdriver.Chrome()
driver.get(url)

In [3]:
# Step 2: Define a function to clean headers

# This function removes unwanted text from headers and trims whitespace
def clean_header(header):
    """Return ``header`` without the table's sorting hint and without outer whitespace.

    Every occurrence of the phrase "Click on any of the columns headers to
    apply sorting" is deleted first; leading and trailing whitespace is then
    stripped from what remains.
    """
    sorting_hint = r"Click on any of the columns headers to apply sorting"
    without_hint = re.sub(sorting_hint, "", header)
    return without_hint.strip()

In [4]:
# Step 3: Define a function to extract data from the current page
def extract_data_from_page():
    """Parse the page currently loaded in the global ``driver`` and read its first table.

    Returns:
        tuple: ``(headers, results)`` where ``headers`` is the list of cleaned
        ``<th>`` texts found in the table's ``<thead>`` and ``results`` is a
        list with one entry per ``<tr>`` in the ``<tbody>``, each entry being
        the list of that row's stripped ``<td>`` texts. A missing ``<thead>``
        yields an empty ``headers`` list; a missing ``<tbody>`` yields an
        empty ``results`` list.

    Raises:
        ValueError: if the page source contains no ``<table>`` element.
    """
    # Snapshot the rendered HTML and hand it to BeautifulSoup
    parsed_page = BeautifulSoup(driver.page_source, 'html.parser')

    # Only the first <table> on the page is considered
    results_table = parsed_page.find('table')
    if results_table is None:
        raise ValueError("No table found on the page.")

    # Column names: one cleaned string per <th> in the header section
    header_section = results_table.find('thead')
    if header_section:
        headers = [
            clean_header(th.get_text(strip=True))
            for th in header_section.find_all('th')
        ]
    else:
        headers = []

    # Data rows: one list of cell texts per <tr> in the body section
    body_section = results_table.find('tbody')
    if body_section:
        results = [
            [td.get_text(strip=True) for td in tr.find_all('td')]
            for tr in body_section.find_all('tr')
        ]
    else:
        results = []

    return headers, results

In [5]:
# Step 4: Create the empty accumulators that the scraping cells fill in
# (all_results: scraped rows from every page; headers: column names)
all_results, headers = [], []

In [6]:
# Step 5: Extract data from the first page

# Block (for up to 10 s) until at least one result row is in the DOM before parsing.
# Nothing else separates driver.get() from this cell when the notebook is run
# top-to-bottom, and if the table is not there yet extract_data_from_page()
# raises ValueError (no <table>) or returns no rows (no <tbody>).
# NOTE(review): assumes the results table is filled in after the initial page
# load — confirm; the wait is harmless if the rows are already present.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody tr"))
)

page_headers, page_results = extract_data_from_page()
if not headers:
    # Only the first non-empty header list is kept
    headers = page_headers
all_results.extend(page_results)

In [7]:
# Step 6: Loop through the pagination to extract data from subsequent pages

# JavaScript that returns the text of the first result row ('' when there is none).
# It is evaluated in a single browser-side call, so no WebElement handle is held
# on the Python side while the table is being replaced.
FIRST_ROW_TEXT_JS = (
    "var row = document.querySelector('table tbody tr');"
    "return row ? row.innerText : '';"
)

while True:
    try:
        # Find the "Next" button using aria-label attribute; a timeout here
        # means no clickable "Next" button is available any more.
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Next']"))
        )

        # Remember the current page's first row so a page change can be detected
        first_row_before_click = driver.execute_script(FIRST_ROW_TEXT_JS)

        # Click the "Next" button using JavaScript to ensure it works
        driver.execute_script("arguments[0].click();", next_button)

        # Wait until the first row differs from the one seen before the click.
        # Waiting for a <table> to be present is not enough: the previous page's
        # table is already in the DOM, so that condition passes immediately and
        # a slow load would re-scrape the old page. If the content never changes
        # (e.g. "Next" is still clickable on the last page) this times out and
        # the loop stops instead of collecting duplicates forever.
        # NOTE(review): assumes all rows of a page are swapped in together —
        # confirm; otherwise also wait for the row count to settle.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script(FIRST_ROW_TEXT_JS) not in ("", first_row_before_click)
        )

        # Extract data from the new page and add to all_results
        page_headers, page_results = extract_data_from_page()
        if not headers:
            headers = page_headers
        all_results.extend(page_results)

    except (NoSuchElementException, TimeoutException):
        # If there is no "Next" button or the page never changed, break the loop
        print("Stopping pagination: No more pages or timeout.")
        break

    except Exception as e:
        # Handle other exceptions
        print(f"An error occurred: {e}")
        break

Stopping pagination: No more pages or timeout.


In [8]:
# Step 7: Build a DataFrame from the scraped rows, using the scraped headers as column labels
df0 = pd.DataFrame(data=all_results, columns=headers)

In [9]:
# Verify the columns and data: preview the first rows to confirm that the
# scraped headers line up with the scraped cell values
df0.head()

Unnamed: 0,Place,Gender Place,Age Group Place,Age Group Gender Place,Bib,Name,Gender,Country,Time,Verified
0,1,1 / 2324,1 / 431,1 / 216,14956,Muntasir Ahmad Abdul Kareem Alzghoul,Male,JO,14:45,
1,2,2 / 2324,1 / 716,1 / 375,18494,Darelle Andrade,Male,PH,14:45,
2,3,3 / 2324,2 / 431,2 / 216,16354,Mohamed Awny,Male,EG,14:45,
3,4,1 / 2112,1 / 650,1 / 354,8902,Bojana Bogojevic,Female,RS,14:45,
4,5,4 / 2324,1 / 196,1 / 117,12430,Ephraim Joshua,Male,TZ,14:45,


In [10]:
# Check the data types and non-null values of every scraped column
# (info() prints its summary; the cell has no return value to display)
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4461 entries, 0 to 4460
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Place                   4461 non-null   object
 1   Gender Place            4461 non-null   object
 2   Age Group Place         4461 non-null   object
 3   Age Group Gender Place  4461 non-null   object
 4   Bib                     4461 non-null   object
 5   Name                    4461 non-null   object
 6   Gender                  4461 non-null   object
 7   Country                 4461 non-null   object
 8   Time                    4461 non-null   object
 9   Verified                4461 non-null   object
dtypes: object(10)
memory usage: 348.6+ KB


In [11]:
# Ensure Place columns are treated as text by prepending an apostrophe
place_columns = ['Gender Place', 'Age Group Place', 'Age Group Gender Place']

for place_column in place_columns:
    as_text = df0[place_column].astype(str)
    # Leave values that already start with the apostrophe untouched, so that
    # re-running this cell does not stack a second apostrophe on the first.
    already_marked = as_text.str.startswith("'")
    df0[place_column] = as_text.where(already_marked, "'" + as_text)

In [12]:
# Normalize the Time column
def normalize_time(time_str):
    """Add a ``00:`` hours prefix to a time given as ``mm:ss``.

    A value that splits into exactly two parts on ``:`` is returned with
    ``00:`` prepended. Every other value, including ``h:mm:ss`` and anything
    not in a recognised format, is returned unchanged.
    """
    colon_separated = time_str.split(':')
    has_only_minutes_and_seconds = len(colon_separated) == 2
    return "00:" + time_str if has_only_minutes_and_seconds else time_str

# Run normalize_time over every value of the Time column and store the result back
df0['Time'] = df0['Time'].map(normalize_time)

In [13]:
# Verify the columns and data again: preview the first rows after the
# Place and Time columns have been rewritten
df0.head()

Unnamed: 0,Place,Gender Place,Age Group Place,Age Group Gender Place,Bib,Name,Gender,Country,Time,Verified
0,1,'1 / 2324,'1 / 431,'1 / 216,14956,Muntasir Ahmad Abdul Kareem Alzghoul,Male,JO,00:14:45,
1,2,'2 / 2324,'1 / 716,'1 / 375,18494,Darelle Andrade,Male,PH,00:14:45,
2,3,'3 / 2324,'2 / 431,'2 / 216,16354,Mohamed Awny,Male,EG,00:14:45,
3,4,'1 / 2112,'1 / 650,'1 / 354,8902,Bojana Bogojevic,Female,RS,00:14:45,
4,5,'4 / 2324,'1 / 196,'1 / 117,12430,Ephraim Joshua,Male,TZ,00:14:45,


In [14]:
# Check the data types and non-null values again after the Place and Time
# columns have been rewritten
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4461 entries, 0 to 4460
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Place                   4461 non-null   object
 1   Gender Place            4461 non-null   object
 2   Age Group Place         4461 non-null   object
 3   Age Group Gender Place  4461 non-null   object
 4   Bib                     4461 non-null   object
 5   Name                    4461 non-null   object
 6   Gender                  4461 non-null   object
 7   Country                 4461 non-null   object
 8   Time                    4461 non-null   object
 9   Verified                4461 non-null   object
dtypes: object(10)
memory usage: 348.6+ KB


In [15]:
# Step 8: Write the DataFrame to a CSV file in the working directory, leaving out the row index
df0.to_csv(path_or_buf='EASD_race_results.csv', index=False)

In [16]:
# Step 9: Close the WebDriver
# quit() ends the browser session that was started in Step 1
driver.quit()