# Redline Results Scraper

This notebook scrapes Redline Fitness Games results data from multiple events hosted on the runnersunite.racetecresults.com website.


### Setup: Define URLs and Helper Functions

In [None]:
import pandas as pd
import time
import lxml
from io import StringIO

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup
import random
import pprint
import os

print("Libraries imported successfully.")

# Dictionary of events with their names and result URLs.
# Each key is a short label (the scraping loop iterates it as `year_day` and
# prefixes it to every event name found on that page); each value is the
# results page URL that lists the events for that label.
day_urls = {
    # future proofing for 2025 (placeholders, URLs not filled in yet)
    #"2025KL.1": "",
    #"2025KL.2": "",
    
    "2024.1":   "https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251",
    "2024.2":   "https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1252",    
    "2023.1":   "https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1216",
    "2023.2":   "https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1217",
}


# File rename table used by the final cell of the notebook.
# Each entry is [new_file_stem, scraped_event_name]:
#   - entry[1] is the name the scraper saves the CSV under
#     ("<day label> <event link text>", e.g. "2024.1 Mixed Doubles")
#   - entry[0] is the file stem the CSV is renamed to afterwards
# i.e. "rl_data/<entry[1]>.csv" is renamed to "rl_data/<entry[0]>.csv".
EVENT_RENAME_LIST = [
    # 2025 (placeholders, kept commented out until the 2025 URLs are added to day_urls)
    #["WomensSinglesAdvKL2025",   "2025KL.1 Women's Singles Level 3"],
    #["MensSinglesAdvKL2025",     "2025KL.1 Men's Singles Level 3"],
    #["WomensSinglesInterKL2025", "2025KL.1 Women's Singles Level 2"],
    #["MensSinglesInterKL2025",   "2025KL.2 Men's Singles Level 2"],
    #["WomensDoublesKL2025",      "2025KL.2 Women's Doubles"],
    #["MensDoublesKL2025",        "2025KL.1 Men's Doubles"],
    #["MixedDoublesKL2025",       "2025KL.1 Mixed Doubles"],
    #["TeamRelayWomenKL2025",     "2025KL.2 Team Relay Women"],
    #["TeamRelayMenKL2025",       "2025KL.2 Team Relay Men"],
    #["TeamRelayMixedKL2025",     "2025KL.2 Team Relay Mixed"],
    #["BeginnersKL2025",          "2025KL.1 Singles Level 1"],
    
    # 2024
    ["WomensSinglesCompetitive2024", "2024.1 Women's Singles Competitive"],
    ["MensSinglesCompetitive2024"  , "2024.1 Men's Singles Competitive"],
    ["WomensSinglesOpen2024",        "2024.1 Women's Singles Open"],
    ["MensSinglesOpen2024",          "2024.2 Men's Singles Open"],
    ["WomensDoubles2024",            "2024.2 Women's Doubles"],
    ["MensDoubles2024",              "2024.1 Men's Doubles"],
    ["MixedDoubles2024",             "2024.1 Mixed Doubles"],
    ["TeamRelayWomen2024",           "2024.2 Team Relay Women"],
    ["TeamRelayMen2024",             "2024.2 Team Relay Men"],
    ["TeamRelayMixed2024",           "2024.2 Team Relay Mixed"],
    ["Beginners2024",                "2024.1 Beginners"],
    
    # 2023
    ["WomensSinglesCompetitive2023", "2023.1 Women's Singles Competitive"],
    ["MensSinglesCompetitive2023",   "2023.1 Men's Singles Competitive"],
    ["WomensSinglesOpen2023",        "2023.1 Women's Singles Open"],
    ["MensSinglesOpen2023",          "2023.1 Men's Singles Open"],
    ["WomensDoubles2023",            "2023.2 Women's Doubles"],
    ["MensDoubles2023",              "2023.2 Men's Doubles"],
    ["MixedDoubles2023",             "2023.2 Mixed Doubles"],
    ["TeamRelayWomen2023",           "2023.2 Team Relay Women"],
    ["TeamRelayMen2023",             "2023.2 Team Relay Men"],
    ["TeamRelayMixed2023",           "2023.2 Team Relay Mixed"],
]

# Helper function to get the total number of result pages for a given event
def get_total_pages(html_content: str) -> int:
    """
    Work out how many result pages an event has from the pager links.

    The pager is the table with id ``ctl00_Content_Main_grdTopPager``; the
    text of each link inside it is read as a page number after stripping
    whitespace and square brackets.

    Args:
        html_content: The HTML source of the results page.

    Returns:
        The largest page number found among the pager links, or 1 when the
        page has no pager table or none of its links is numeric.
    """
    pager = BeautifulSoup(html_content, 'lxml').find(
        'table', id='ctl00_Content_Main_grdTopPager'
    )

    # No pager table at all: the results fit on a single page.
    if not pager:
        return 1

    # Link labels look like "[1]" or "2": drop the decorative brackets first.
    labels = (
        anchor.get_text(strip=True).replace('[', '').replace(']', '')
        for anchor in pager.find_all('a')
    )

    # Keep only purely numeric labels; fall back to 1 if there are none.
    return max((int(label) for label in labels if label.isdigit()), default=1)

# Helper function to remove the second occurrence of a duplicated column name
def remove_second_duplicate_column(dataframe, col_name_to_check):
    """
    Drop the second column carrying a given name, if the name is repeated.

    Only the second occurrence is removed; the first one and any third or
    later occurrences are kept, and the input DataFrame is not modified.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.
        col_name_to_check (str): The column name to look for.

    Returns:
        pd.DataFrame: A DataFrame without the second occurrence of the column,
                      or a copy of the input (after printing a notice) when
                      the name appears at most once.
    """
    # Positions (not labels) of every column with the requested name
    matches = []
    for position, label in enumerate(dataframe.columns):
        if label == col_name_to_check:
            matches.append(position)

    # Nothing to remove: hand back a copy so callers never share the input
    if len(matches) <= 1:
        print(f"No duplicates found for column '{col_name_to_check}'.")
        return dataframe.copy()

    # Select by position, because the duplicated label cannot address one column
    surviving = list(range(len(dataframe.columns)))
    del surviving[matches[1]]
    return dataframe.iloc[:, surviving]

def polite_delay(min_seconds=10, max_seconds=45):
    """
    Pause for a random length of time and announce how long the pause is.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    # Draw the pause uniformly between the two bounds
    pause = random.uniform(min_seconds, max_seconds)
    print(f"Waiting for {pause:.2f} seconds...")
    time.sleep(pause)

# Report how many result-page URLs are configured in day_urls
print(f"Ready to scrape {len(day_urls)} Days of Events.")

Libraries imported successfully.
Ready to scrape 4 Days of Events.


### The First Scraping Loop (to get the event links)
Selenium is needed here to avoid the website blocking access.

In [2]:

# Extracted data across all events: one {'text': ..., 'href': ...} dict per event link.
extracted_data = []

# Root of the results site; the event links on each page are relative to it.
base_url = "https://runnersunite.racetecresults.com"

# Loop through the dictionary of URLs
for year_day, url in day_urls.items():
    print(f"Scraping data for: '{year_day}'...")

    try:
        # Setup and run a real Chrome browser
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        try:
            print("Navigating to URL...")
            driver.get(url)

            # Wait for the page (and any javascript) to load
            time.sleep(3)

            # Get the HTML content after the browser has rendered it
            html_content = driver.page_source
        finally:
            # Always close the browser, even if navigation failed, so a bad
            # request cannot leave an orphaned Chrome process behind.
            driver.quit()

        # 1. Parse the rendered HTML with the 'lxml' parser
        soup = BeautifulSoup(html_content, 'lxml')

        # 2. Find the <ul> element that holds the event links by its ID
        event_list = soup.find('ul', id='ctl00_Content_Main_divEvents')
        if event_list is None:
            # Fail with a readable message instead of an AttributeError on None
            raise ValueError("Could not find the event list (ul#ctl00_Content_Main_divEvents).")

        # 3. Walk every <a> tag in that container and record its text and URL
        for link in event_list.find_all('a'):
            # Note: BeautifulSoup automatically converts '&amp;' to '&' for you.
            relative_href = link.get('href')
            if not relative_href:
                # An anchor without an href cannot be followed; skip it rather
                # than storing a bogus ".../None" URL.
                continue

            # Visible text of the link, with surrounding whitespace removed
            link_text = link.get_text(strip=True)

            extracted_data.append({
                'text': f"{year_day} {link_text}",
                'href': f"{base_url}/{relative_href}",
            })

    except Exception as e:
        print(f"--> An error occurred while scraping {url}: {e}")

    # Be a good web citizen and add a small delay between requests
    time.sleep(2)

# Print the results in a clean format
pprint.pprint(extracted_data)

print("\nScraping process complete.")

Scraping data for: '2024.1'...
Navigating to URL...
Scraping data for: '2024.2'...
Navigating to URL...
Scraping data for: '2023.1'...
Navigating to URL...
Scraping data for: '2023.2'...
Navigating to URL...
[{'href': 'https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=1',
  'text': "2024.1 Women's Singles Open"},
 {'href': 'https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=2',
  'text': "2024.1 Women's Singles Competitive"},
 {'href': 'https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=4',
  'text': "2024.1 Men's Singles Competitive"},
 {'href': 'https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=6',
  'text': '2024.1 Mixed Doubles'},
 {'href': 'https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=5',
  'text': "2024.1 Men's Doubles"},
 {'href': 'https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=7',
  'text': '2024.1 Beginners'},
 {'href

### Step 4: The 2nd Scraping Loop (to get all the data and save files)
A random delay is added after every event so that the website is not hit with rapid-fire requests.

In [3]:

def _fetch_rendered_html(target_url, wait_seconds=3):
    """
    Load a URL in a fresh Chrome session and return the rendered HTML.

    Args:
        target_url: The page to open.
        wait_seconds: How long to wait after navigation for the page
            (and any javascript) to load.

    Returns:
        The page source as reported by the browser after the wait.
    """
    # Setup and run a real Chrome browser
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(target_url)
        time.sleep(wait_seconds)
        return driver.page_source
    finally:
        # Always close the browser, even if navigation failed, so a bad
        # request cannot leave an orphaned Chrome process behind.
        driver.quit()


# Create the output folder up front: without it every to_csv call below would
# fail only after the (slow) scraping of that event had already been done.
os.makedirs("rl_data", exist_ok=True)

# For each of the extracted event links
for data in extracted_data:
    event_name = data['text']
    url = data['href']
    print(f"Scraping data for: '{event_name}'...")

    try:
        print("Navigating to URL...")
        html_content = _fetch_rendered_html(url)

        total_pages = get_total_pages(html_content)
        print(f"Found {total_pages} page(s) for this event.")

        # One DataFrame per successfully scraped page, in page order
        final_event_dfs = []
        # Page numbers that could not be scraped, reported before writing
        failed_pages = []

        # --- Loop through all pages ---
        for page_num in range(1, total_pages + 1):
            page_url = f"{url}&dt=0&PageNo={page_num}"
            print(f"  > Scraping page {page_num}: {page_url}")

            try:
                print("Navigating to page URL...")
                page_html = _fetch_rendered_html(page_url)

                # 1. Isolate the results table using BeautifulSoup
                soup = BeautifulSoup(page_html, 'lxml')
                results_div = soup.find('div', id='ctl00_Content_Main_divGrid')
                if not results_div:
                    raise ValueError("Could not find the main results container div.")

                results_table = results_div.find('table')
                if results_table is None:
                    # Otherwise str(None) would be handed to pandas as "HTML"
                    raise ValueError("Could not find a results table inside the results container div.")

                # 2. Use pandas to parse the isolated HTML table
                # read_html returns a list of DataFrames. We want the first one.
                try:
                    df = pd.read_html(StringIO(str(results_table)))[0]
                except Exception as e:
                    raise RuntimeError(f"pandas.read_html failed to parse the table. Error: {e}") from e

                # Every page repeats the header row as its first row; keep it
                # only on the first page that was collected.
                if final_event_dfs:
                    df = df.iloc[1:]

                final_event_dfs.append(df)

            except Exception as e:
                failed_pages.append(page_num)
                print(f"    - Could not scrape table from page {page_num}. Error: {e}")

        if not final_event_dfs:
            # pd.concat([]) would raise a much less helpful error
            raise ValueError(f"No result tables could be scraped for '{event_name}'.")

        if failed_pages:
            print(f"  ! Warning: page(s) {failed_pages} could not be scraped; the file for '{event_name}' will be incomplete.")

        # --- Clean Up the DataFrame ---
        final_df = pd.concat(final_event_dfs, ignore_index=True)

        # Step 1: Promote the first row of data to be the DataFrame's column headers
        final_df.columns = final_df.iloc[0]

        # Step 2: Remove the now-redundant first row (a copy of the headers)
        # and reset the index to be clean (0, 1, 2...).
        final_df = final_df.iloc[1:].reset_index(drop=True)

        # Step 3: Clean the newly assigned column headers.
        # str(col).strip() is safe even if a column name was NaN or another non-string.
        final_df.columns = [str(col).strip() for col in final_df.columns]
        final_df.columns.name = None  # Clean up the index name left over from the promotion

        # Remove multiple instances of the same column (mobile / desktop)
        final_df = remove_second_duplicate_column(final_df, "Name")
        final_df = remove_second_duplicate_column(final_df, "Net Time")

        # Write the event's combined results to CSV
        filename = f"rl_data/{event_name}.csv"
        print(f"Writing file {filename}:")
        final_df.to_csv(filename, index=False)

    except Exception as e:
        print(f"--> An error occurred while scraping {url}: {e}")

    # Delay between requests so as not to be too annoying
    polite_delay()
   


Scraping data for: '2024.1 Women's Singles Open'...
Navigating to URL...
Found 5 page(s) for this event.
  > Scraping page 1: https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=1&dt=0&PageNo=1
Navigating to page URL...
  > Scraping page 2: https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=1&dt=0&PageNo=2
Navigating to page URL...
  > Scraping page 3: https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=1&dt=0&PageNo=3
Navigating to page URL...
  > Scraping page 4: https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=1&dt=0&PageNo=4
Navigating to page URL...
  > Scraping page 5: https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1251&EId=1&dt=0&PageNo=5
Navigating to page URL...
Writing file rl_data/2024.1 Women's Singles Open.csv:
Waiting for 21.68 seconds...
Scraping data for: '2024.1 Women's Singles Competitive'...
Navigating to URL...
Found 2 page(s) for this event.
  > S

###  Finalising the files.
Just do some simple renaming of the files so they match the names my Redline Fitness Games Explorer expects.

In [None]:
# Rename the scraped files to match the format the Redline Explorer expects.
# Each EVENT_RENAME_LIST entry is [new file stem, scraped event name].
for new_stem, scraped_name in EVENT_RENAME_LIST:
    original_file_name = f"rl_data/{scraped_name}.csv"
    new_file_name = f"rl_data/{new_stem}.csv"

    # A missing source file (event not scraped, or this cell already run) must
    # not abort the remaining renames, so report it and move on.
    if not os.path.exists(original_file_name):
        print(f"Skipping '{original_file_name}': file not found.")
        continue

    print(f"Renaming '{original_file_name}' to '{new_file_name}'")
    # os.replace overwrites an existing destination on every platform, so a
    # fresh scrape can replace a previously renamed file.
    os.replace(original_file_name, new_file_name)




Renaming rl_data/2024.1 Women's Singles Competitive.csv to rl_data/WomensSinglesCompetitive2024.csv
Renaming rl_data/2024.1 Men's Singles Competitive.csv to rl_data/MensSinglesCompetitive2024.csv
Renaming rl_data/2024.1 Women's Singles Open.csv to rl_data/WomensSinglesOpen2024.csv
Renaming rl_data/2024.2 Men's Singles Open.csv to rl_data/MensSinglesOpen2024.csv
Renaming rl_data/2024.2 Women's Doubles.csv to rl_data/WomensDoubles2024.csv
Renaming rl_data/2024.1 Men's Doubles.csv to rl_data/MensDoubles2024.csv
Renaming rl_data/2024.1 Mixed Doubles.csv to rl_data/MixedDoubles2024.csv
Renaming rl_data/2024.2 Team Relay Women.csv to rl_data/TeamRelayWomen2024.csv
Renaming rl_data/2024.2 Team Relay Men.csv to rl_data/TeamRelayMen2024.csv
Renaming rl_data/2024.2 Team Relay Mixed.csv to rl_data/TeamRelayMixed2024.csv
Renaming rl_data/2023.1 Women's Singles Competitive.csv to rl_data/WomensSinglesCompetitive2023.csv
Renaming rl_data/2023.1 Men's Singles Competitive.csv to rl_data/MensSinglesCom