In [1]:
# Required libraries: Pandas for storing and exporting data, Selenium for webscraping
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.select import Select

In [2]:
def set_filter(start):
    '''
    Apply the site's 'Advanced Filter' so the results list begins
    at the requested overall place.
    As noted when this was first written, choosing 'Greater than 10'
    on this page still shows the 10th place finisher, so the bound is
    effectively inclusive.

    Input: starting place to display results (can be 0 -> 24950)
    '''

    # The filter is opened through a <div> when start is 0 and through
    # an <a> for any other start value
    if start == 0:
        opener_xpath = '//div[@ng-click="showAdvancedFilter()"]'
    else:
        opener_xpath = '//a[@ng-click="showAdvancedFilter()"]'
    driver.find_element('xpath', opener_xpath).click()

    # Choose the 'Greater than' comparison for overall place
    compare_dropdown = driver.find_element(
        'xpath', '//select[@ng-model="overallPlaceCompareOption"]'
    )
    Select(compare_dropdown).select_by_visible_text('Greater than')

    # Overwrite whatever is in the place box with the new starting place
    place_box = driver.find_element('xpath', '//input[@ng-model="overallPlace"]')
    place_box.clear()
    place_box.send_keys(str(start))

    # Submit the filter
    apply_link = driver.find_element('xpath', '//a[@ng-click="apply()"]')
    apply_link.click()

In [3]:
def get_results(start, pages):
    '''
    Collects and stores results

    Inputs: 'start' is the place to begin recording,
    'pages' is how many pages of results to record

    Relies on the notebook-level globals 'driver' (the open browser) and
    'df' (the results dataframe, indexed by place), and sets the global
    'current_place' to the place of the last finisher scraped. If no
    finisher rows are found, 'df' and 'current_place' are left untouched.
    '''
    global current_place

    set_filter(start)

    # Load more results (pages - 1) times
    for _ in range(pages - 1):
        try:
            driver.find_element(
                'xpath',
                '//a[@class="button-load-more"]'
            ).click()
        except Exception:
            # Best effort: a missing or unclickable button only means fewer
            # rows are scraped on this call. Catching Exception (rather than
            # using a bare except) lets KeyboardInterrupt and SystemExit
            # through, so the scrape can still be interrupted.
            continue

    # Scrape data from webpage into dataframe and update current place
    finisher_rows = driver.find_elements('xpath', '//div[@class="cmd-finisher ng-scope"]')
    for row in finisher_rows:
        parts = row.text.split('\n')
        # The last text line is the overall place; strip thousands separators
        current_place = int(parts[-1].replace(',', ''))
        # First line, first character of the second line, and fifth line
        # fill the dataframe's three columns, keyed by place
        df.loc[current_place] = [parts[0], parts[1][0], parts[4]]

In [4]:
# Scrape settings
RESULTS_URL = 'https://results.nyrr.org/event/M2021/finishers'
TOTAL_FINISHERS = 24950     # stop once this place has been recorded
PAGES_PER_BATCH = 10        # pages of results loaded per get_results call
MAX_STALLED_BATCHES = 3     # give up after this many batches with no progress

# Establish web driver, open webpage, and create dataframe
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)

df = pd.DataFrame(columns=['Name', 'Gender', 'Time'])
df.index.name = 'Place'

# Run get_results until all are loaded.
# get_results advances the global current_place; if it stops advancing the
# loop would otherwise spin forever, so repeated stalls end the scrape.
# The browser is closed in 'finally' so it is not leaked if scraping fails.
current_place = 0
stalled_batches = 0
try:
    driver.implicitly_wait(10)
    driver.get(RESULTS_URL)
    while current_place < TOTAL_FINISHERS:
        place_before = current_place
        get_results(current_place, PAGES_PER_BATCH)
        if current_place != place_before:
            stalled_batches = 0
            continue
        stalled_batches += 1
        if stalled_batches >= MAX_STALLED_BATCHES:
            print('No new results after place {}; stopping early.'.format(current_place))
            break
finally:
    driver.quit()

# Based on user input, decide whether or not to export results to csv
# (mostly used for debugging during development)
temp = input(
    '''{} results have been loaded.
    Export dataframe to 2021nycm.csv?
    'y' for yes, anything else for no.
    '''.format(df.shape[0])
    )
if temp=='y':
    df.to_csv('2021nycm.csv')
    print('Dataframe exported!')
else:
    print('Dataframe not exported.')

24950 results have been loaded.
    Export dataframe to 2021nycm.csv?
    'y' for yes, anything else for no.
     y


Dataframe exported!
