In [1]:
# Required libraries: Pandas for storing and exporting data, Selenium for webscraping
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

In [2]:
def load_marathon():
    '''
    Switches race results from half marathon (default) to marathon.

    Clicks the race dropdown, clicks the marathon option, then polls the
    dropdown label until it reads 'MARATHON'.
    Input: none
    Output: none
    Raises: SystemExit if the label has not updated within 5 seconds
    '''
    # Click on the race dropdown and select full marathon results
    race_options = driver.find_element(
        By.XPATH,
        '//*[@id="select_value_label_6"]'
    )
    race_options.click()

    driver.find_element(
        By.XPATH,
        '//*[@id="select_option_22"]'
    ).click()

    # Wait up to 5 seconds for results to update from half marathon.
    # perf_counter() must be *called* on every pass to measure elapsed time.
    start = time.perf_counter()
    while race_options.text != 'MARATHON':
        elapsed = time.perf_counter() - start
        if elapsed > 5:
            print('Waited 5+ seconds to load marathon results. Exiting.')
            raise SystemExit
        time.sleep(0.05)

In [3]:
def get_name():
    '''
    Fetch the name for the current runner.
    Handles errors including (1) detail pane not yet loaded, and
    (2) previous runner's detail pane still loaded.

    Makes up to 10 attempts, pausing 0.05 seconds after each one. If no
    attempt succeeds, prints a message and returns whatever text was read
    last ('' if the element could never be read).
    Input: none
    Output: current runner's name
    '''
    # Start from '' so the failure message and the return value are always
    # defined, even when every lookup below raises.
    name = ''

    for _ in range(10):
        try:
            name = driver.find_element(
                By.XPATH,
                '/html/body/div[5]/md-dialog/md-dialog-content/div[2]/span/div[2]/span[1]'
            ).text

            # Success when a name is loaded AND it is either the very first
            # runner (df still empty) or differs from the last runner stored.
            # An unchanged name usually means the previous runner's details
            # are still displayed, so keep retrying.
            if name and (df.empty or name != df['Name'].iloc[-1]):
                break
        except WebDriverException:
            # Detail pane not available yet; retry after the pause below
            pass
        # Allow extra time for the details pane to correctly load
        # (runs on success too, before the loop is left)
        finally:
            time.sleep(0.05)
    else:
        # Error message if, after 10 tries, the name wasn't identified
        print('Could not find name. '
              f'After 10 failed tries returning "{name}"')

    return name

In [4]:
def scrape_page():
    '''
    Scrapes all results from the current results page.

    For every row: opens the details pane to read name and gender, closes
    it, reads place and finish time from the row itself, and stores the
    runner in df under index = place.
    Input: none
    Output: last index label (place) currently stored in df
    '''
    # Find the results list and collect one element per runner
    results = driver.find_element(
        By.TAG_NAME,
        'md-list'
    ).find_elements(
        By.TAG_NAME,
        'md-list-item'
    )

    # Parse information for each result
    for result in results:
        # Open details pane
        result.click()

        name = get_name()

        # Find and separate out gender from string like 'MARATHON AGE - M/35'
        division = driver.find_element(
            By.XPATH,
            '/html/body/div[5]/md-dialog/md-dialog-content/div[2]/span/div[2]/span[2]'
        ).text
        gender = division.replace('MARATHON AGE - ', '')[0]

        # Close out of details pane
        driver.find_element(
            By.XPATH,
            '/html/body/div[5]/md-dialog/md-dialog-actions/button[2]'
        ).click()

        # A single lookup serves both fields: the first h4 of the row holds
        # the place, the last h4 holds the finish time.
        headings = result.find_elements(
            By.TAG_NAME,
            'h4'
        )

        # Drop the last two characters and any thousands separators before
        # converting (presumably an ordinal suffix such as 'st'/'th' - confirm)
        place = int(headings[0].text[:-2].replace(',', ''))

        finish_time = headings[-1].text.replace('Time:', '').replace(' ', '')

        # Add current runner's data to df
        df.loc[place] = [name, gender, finish_time]

    return list(df.index)[-1]

In [5]:
def advance_page():
    '''
    Moves to the next page of results.

    Clicks the button for page number (page + 1), falling back to the last
    ellipsis button, then waits until the first runner shown on the page
    has changed.
    Inputs: none (reads the module-level page counter)
    Outputs: none
    Raises: SystemExit if the page content has not changed after 20 checks
    '''
    first_row_xpath = (
        '/html/body/div[1]/div/md-content/div/div/md-content/md-list/md-list-item[1]/div/div[1]/div[2]/h3'
    )

    # Remember the first name shown BEFORE navigating. Comparing against the
    # last runner scraped (the bottom row of the page) can never detect a
    # stale page, because the top row already differs from it.
    previous_first_name = driver.find_element(By.XPATH, first_row_xpath).text

    # Find the navigation bar for advancing the page
    navigation = driver.find_element(
        By.XPATH,
        '/html/body/div[1]/div/md-content/div/div/md-content/md-toolbar/div[2]'
    )

    # NOTE(review): an XPath beginning with '//' is evaluated from the
    # document root even when called on an element; './/button' would
    # restrict the search to the navigation bar - confirm before changing.
    label = ' ' + str(page + 1)

    # Usually, select the next page number to advance
    try:
        navigation.find_element(
            By.XPATH,
            f'//button[text()="{label}"]'
        ).click()

    # Sometimes, the next page number isn't visible, so click the ellipsis button
    except WebDriverException:
        navigation.find_elements(
            By.XPATH,
            '//button[text()="…"]'
        )[-1].click()

    # Wait until the first name on the page has updated
    for _ in range(20):
        first_name = driver.find_element(By.XPATH, first_row_xpath).text
        if first_name not in {'', previous_first_name}:
            break
        time.sleep(0.05)
    else:
        print('Failed to identify new page after 20 tries. Exiting.')
        raise SystemExit

In [6]:
# Create the dataframe that the scraping functions fill in (indexed by place)
df = pd.DataFrame(columns=['Name', 'Gender', 'Time'])
df.index.name = 'Place'

EXPECTED_PAGES = 44  # expected number of pages

# Establish web driver and open webpage
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)

# Always shut the browser down, even when scraping fails or one of the
# helper functions raises SystemExit
try:
    driver.implicitly_wait(10)
    driver.get('http://results2.xacte.com/#/e/2436/placings')

    load_marathon()

    # Scrape results from the page and advance and repeat until done.
    # `page` is the number of the page currently loaded; advance_page()
    # reads it to find the button for the next page.
    for page in range(1, EXPECTED_PAGES + 1):
        scrape_page()
        if page < EXPECTED_PAGES:
            advance_page()
finally:
    driver.quit()

# Export results to csv
df.to_csv('2022colfax.csv')