In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

def _rider_label(name_scope, link):
    """Format one rider as ``"<.uppercase text> <last word of link text>"``.

    Args:
        name_scope: Element searched for the ``.uppercase`` node (the last
            name, judging by the class name and the original comments).
        link: Anchor element whose visible text is split on whitespace; its
            last word is used as the second part of the label.

    Returns:
        The formatted label, or ``None`` when there is no ``.uppercase`` node
        or the link has no visible text (i.e. it does not look like a rider).
    """
    lastname_nodes = name_scope.find_elements(By.CSS_SELECTOR, ".uppercase")
    link_words = link.text.split()
    if not lastname_nodes or not link_words:
        return None
    return f"{lastname_nodes[0].text} {link_words[-1]}"


def _extract_riders(row):
    """Collect rider labels from one timeline row.

    Riders inside a ``.txt li`` list are preferred; only when that yields
    nothing are the ``.txt a`` links scanned. Elements that do not look like a
    rider are skipped individually instead of aborting the whole row.
    """
    riders = []

    # Riders presented as a list inside the description.
    for item in row.find_elements(By.CSS_SELECTOR, ".txt li"):
        links = item.find_elements(By.TAG_NAME, "a")
        label = _rider_label(item, links[0]) if links else None
        if label is not None:
            riders.append(label)

    # Riders mentioned inline (not in a list).
    if not riders:
        for link in row.find_elements(By.CSS_SELECTOR, ".txt a"):
            label = _rider_label(link, link)
            if label is not None:
                riders.append(label)

    return riders


def scrape_race_crashes(url):
    """Scrape the crash events from a live-timeline page.

    Opens ``url`` in headless Chrome, clicks ``.ViewFullTimeline`` until it can
    no longer be found or clicked, then walks every ``li`` under ``.timeline2``
    and keeps the rows whose ``.txt`` HTML contains the word "crash"
    (case-insensitive).

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        pandas.DataFrame with one row per crash event and the columns
        ``stage_url``, ``km_to_go``, ``riders`` (list of str) and ``event``
        (inner HTML of the description). The frame has no columns when no
        crash event is found.

    Raises:
        selenium.common.exceptions.WebDriverException: if the page cannot be
            loaded or has no ``.timeline2`` element. The browser is closed
            before the exception propagates.
    """
    page_settle_seconds = 1.5  # fixed pause for the page / timeline to render

    # Initialize the browser
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    # try/finally guarantees the Chrome process is shut down even when the
    # page is missing the timeline or the driver fails midway.
    try:
        driver.get(url)
        time.sleep(page_settle_seconds)

        # Click on "view more events" until all events are visible
        while True:
            try:
                driver.find_element(By.CSS_SELECTOR, ".ViewFullTimeline").click()
            except Exception:
                break  # Button gone or no longer clickable: nothing more to load

        time.sleep(page_settle_seconds)

        # Find the timeline element; each li is treated as a race event
        timeline = driver.find_element(By.CSS_SELECTOR, ".timeline2")
        rows = timeline.find_elements(By.CSS_SELECTOR, "li")

        crash_data = []
        for row in rows:
            try:
                # KM to go comes from the "bol" column
                km_to_go = row.find_element(By.CSS_SELECTOR, ".bol").text

                # Event description (HTML) comes from the "txt" column
                event_description_html = row.find_element(
                    By.CSS_SELECTOR, ".txt"
                ).get_attribute('innerHTML')

                if 'crash' not in event_description_html.lower():
                    continue

                riders = _extract_riders(row)
                crash_data.append({
                    "stage_url": url,
                    "km_to_go": km_to_go,
                    "riders": riders,
                    "event": event_description_html,
                })
            except Exception:
                # Deliberate best effort: skip rows that don't fit the
                # expected structure (e.g. an li without .bol / .txt).
                continue

        # Convert to a DataFrame
        return pd.DataFrame(crash_data)
    finally:
        # Close the browser
        driver.quit()

def pull_crashes_from_results_df(df):
    """Scrape crash events for every distinct stage URL in a results frame.

    Args:
        df: DataFrame with a ``stage_url`` column holding paths relative to
            ``https://www.procyclingstats.com/``.

    Returns:
        pandas.DataFrame concatenating the frames returned by
        ``scrape_race_crashes`` for each URL that scraped successfully, in
        first-seen URL order. An empty DataFrame is returned when nothing
        was scraped.

    A URL that fails to scrape is reported on stdout (with the exception type)
    and skipped; it does not abort the remaining URLs.
    """
    base_url = "https://www.procyclingstats.com/"

    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs visit the stages in the same order (a set's order is not stable).
    urls = list(dict.fromkeys(df["stage_url"].tolist()))

    # Collect the per-stage frames and concatenate once at the end instead of
    # re-copying a growing frame on every iteration.
    crash_frames = []
    for url in urls:
        full_url = f"{base_url}{url}"
        try:
            crashes = scrape_race_crashes(full_url)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C still stops a long run.
            print(f"Error scraping {full_url} ({type(exc).__name__})")
            continue
        crash_frames.append(crashes)
        print(f"Scraped {len(crashes)} crashes from {full_url}")

    if not crash_frames:
        return pd.DataFrame()
    return pd.concat(crash_frames)

In [3]:
# Run for 2021-2023 data
results_df_2021_2023 = pd.read_csv("results_df_2021_2023.csv")
# Keep only WorldTour / ProSeries rows. .copy() makes the filtered frame
# independent of the CSV frame, so the column assignments below do not write to
# a slice (which would raise SettingWithCopyWarning).
results_df_2021_2023 = results_df_2021_2023[results_df_2021_2023['uci_tour'].isin(["UCI Worldtour","UCI ProSeries"])].copy()
# Only the last 9 characters of each URL are inspected for the markers.
results_df_2021_2023['is_stage_race'] = results_df_2021_2023['stage_url'].str[-9:].str.contains('stage')
results_df_2021_2023['has_result_already'] = results_df_2021_2023['stage_url'].str[-9:].str.contains('result')
# URLs ending in a stage or result segment only need '/live'; the others get
# '/result/live' appended.
results_df_2021_2023['stage_url'] = results_df_2021_2023.apply(lambda row: row['stage_url'] + '/live' if row['is_stage_race'] | row['has_result_already']
                           else row['stage_url'] + '/result/live', axis=1)
# Exclude every tour-of-qinghai-lake URL from the scrape.
results_df_2021_2023 = results_df_2021_2023[~results_df_2021_2023['stage_url'].str.contains('tour-of-qinghai-lake')]

crash_df_2021_2023 = pull_crashes_from_results_df(results_df_2021_2023)

crash_df_2021_2023.to_csv("crash_df_2021_2023.csv", index=False)

Scraped 0 crashes from https://www.procyclingstats.com/race/tour-of-the-alps/2023/stage-3/live
Scraped 1 crashes from https://www.procyclingstats.com/race/giro-d-italia/2023/stage-12/live
Scraped 2 crashes from https://www.procyclingstats.com/race/tour-of-belgium/2023/stage-5/live
Error scraping https://www.procyclingstats.com/race/tour-of-britain/2021/stage-3/live
Scraped 4 crashes from https://www.procyclingstats.com/race/giro-d-italia/2022/stage-4/live
Scraped 0 crashes from https://www.procyclingstats.com/race/volta-ao-algarve/2021/stage-5/live
Scraped 0 crashes from https://www.procyclingstats.com/race/tour-de-langkawi/2023/stage-2/live
Scraped 0 crashes from https://www.procyclingstats.com/race/circuit-franco-belge/2023/result/live
Scraped 0 crashes from https://www.procyclingstats.com/race/scheldeprijs/2021/result/live
Scraped 1 crashes from https://www.procyclingstats.com/race/tour-de-wallonie/2023/stage-3/live
Scraped 4 crashes from https://www.procyclingstats.com/race/tour-de

In [4]:
# Get url lookup for 2021-2023 data
results_df_2021_2023 = pd.read_csv("results_df_2021_2023.csv")
# Keep only WorldTour / ProSeries rows. .copy() makes the filtered frame
# independent of the CSV frame, so the column assignments below do not write to
# a slice (which would raise SettingWithCopyWarning).
results_df_2021_2023 = results_df_2021_2023[results_df_2021_2023['uci_tour'].isin(["UCI Worldtour","UCI ProSeries"])].copy()
# Only the last 9 characters of each URL are inspected for the markers.
results_df_2021_2023['is_stage_race'] = results_df_2021_2023['stage_url'].str[-9:].str.contains('stage')
results_df_2021_2023['has_result_already'] = results_df_2021_2023['stage_url'].str[-9:].str.contains('result')
# stage_url_2 is the live-page path: '/live' is appended when the URL already
# ends in a stage or result segment, '/result/live' otherwise.
results_df_2021_2023['stage_url_2'] = results_df_2021_2023.apply(lambda row: row['stage_url'] + '/live' if row['is_stage_race'] | row['has_result_already']
                           else row['stage_url'] + '/result/live', axis=1)
# Prefix the site root so stage_url_2 is an absolute URL.
results_df_2021_2023['stage_url_2'] = 'https://www.procyclingstats.com/' + results_df_2021_2023['stage_url_2']
# One row per distinct (original path, absolute live URL) pair.
results_df_2021_2023[['stage_url','stage_url_2']].drop_duplicates().to_csv("url_lookup_2021_2023.csv", index=False)

In [None]:
# Example usage: scrape the crash events of a single live-timeline page.
url = 'https://www.procyclingstats.com/race/milano-sanremo/2024/result/live'
crashes_df = scrape_race_crashes(url)
# Leave the frame as the cell's last expression so the notebook renders it with
# its rich table display instead of print()'s truncated plain text.
crashes_df