In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import re
import math
import os

In [2]:
# Scrape configuration. The driver cell at the bottom of the notebook calls
# get_all_pages once for every (year, event, sex) combination of these lists.

# Result years; each one selects https://results.ridelondon.co.uk/<year>/.
years = [
    2024,
    2023,
]

# Values sent as the "search[sex]" query parameter.
registered_sexes = [
    'M',
    'W',
]

# Event codes sent as the "event" query parameter; they also pick the
# timing-point labels used when flattening a rider's splits table.
events = [
    "I",
    'I60',
    'I30',
]

In [3]:
def get_rider_urls(soup):
    """Collect the detail-page links from a parsed results listing.

    For every element carrying the ``type-fullname`` class, the first ``<a>``
    descendant is inspected; its ``href`` is kept when the attribute exists.

    Args:
        soup: Parsed document exposing ``find_all(class_=...)`` (a
            BeautifulSoup object in this notebook).

    Returns:
        list: ``href`` values in document order. Elements without an anchor,
        or whose anchor has no ``href``, contribute nothing.
    """
    # Lazily pair each name cell with its first anchor (None when absent).
    first_links = (cell.find('a') for cell in soup.find_all(class_='type-fullname'))
    return [link['href'] for link in first_links if link and link.has_attr('href')]

In [4]:
# Timing-point labels per event code; label i is filled from row i of the
# rider's splits table.
_SPLIT_LABELS = {
    "I": ['arr25', 'dep26', 'arr53', 'dep54', 'arr73', 'dep74', 'finish'],
    "I60": ['arr25', 'dep26', 'arr32', 'dep33', 'finish'],
    "I30": ['finish'],
}

# (suffix used in the output column name, column header looked up in the splits table)
_SPLIT_FIELDS = (
    ('tod', 'Time Of Day'),
    ('time', 'Time'),
    ('diff', 'diff.'),
    ('mph', 'mph'),
)


def _flatten_splits(table, event):
    """Flatten a splits table into one dict keyed ``'<label>_<suffix>'``."""
    flattened = {}
    for row, label in enumerate(_SPLIT_LABELS.get(event, [])):
        for suffix, column in _SPLIT_FIELDS:
            # A missing column OR a table with fewer rows than labels yields
            # 'N/A' instead of raising and discarding the whole rider.
            if column in table.columns and row < len(table):
                flattened[f'{label}_{suffix}'] = table[column].iloc[row]
            else:
                flattened[f'{label}_{suffix}'] = 'N/A'
    return flattened


def get_ride_info(base_url, rider_url, event):
    """Fetch one rider's detail page and flatten it into a DataFrame.

    The key/value tables inside the ``box-general``, ``box-totals`` and
    ``box-state`` elements are transposed so each key becomes a column, and
    the first table inside ``box-splits`` is flattened into
    ``'<label>_tod' / '_time' / '_diff' / '_mph'`` columns for the timing
    points configured for ``event``. All pieces are joined side by side.

    Args:
        base_url (str): Results site root for the year.
        rider_url (str): Link taken from the listing page; appended to
            ``base_url`` verbatim.
        event (str): Event code ("I", "I60" or "I30"); an unknown code
            produces no split columns.

    Returns:
        pandas.DataFrame: The rider's data with the scraped labels as column
        names, or an empty DataFrame when the request fails, the page has no
        recognised tables, or parsing raises.
    """
    full_url = base_url + rider_url

    retry = Retry(connect=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)

    try:
        # The context manager releases the pooled connections; this function
        # is called once per rider, so unclosed sessions would pile up.
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(full_url, timeout=10)

        response.raise_for_status()
        # raise_for_status only covers 4xx/5xx; anything else that is not a
        # plain 200 is still treated as "no data".
        if response.status_code != 200:
            print(f"Failed to fetch data for {rider_url}, Status code: {response.status_code}")
            return pd.DataFrame()

        get_soup = BeautifulSoup(response.text, 'html.parser')
        pivoted_tables = []

        # Key/value boxes: first column holds the labels, so transpose them
        # into column headers.
        for target_class in ('box-general', 'box-totals', 'box-state'):
            box = get_soup.find(class_=target_class)
            if not box:
                continue
            for table in pd.read_html(StringIO(str(box))):
                if table.empty:
                    continue
                pivoted_df = table.set_index(0).T.reset_index(drop=True)
                pivoted_df.columns.name = None  # drop the leftover axis name
                pivoted_tables.append(pivoted_df)

        # Splits box: only its first table is used.
        split_box = get_soup.find(class_='box-splits')
        if split_box:
            split_tables = pd.read_html(StringIO(str(split_box)))
            if split_tables:
                flattened = _flatten_splits(split_tables[0], event)
                pivoted_tables.append(pd.DataFrame([flattened]))

        if not pivoted_tables:
            print(f"No data found for {rider_url}")
            return pd.DataFrame()

        # No ignore_index here: with axis=1 it would replace every column
        # label with 0..n-1, so name-based renames downstream never match.
        return pd.concat(pivoted_tables, axis=1)

    except requests.exceptions.RequestException as e:
        # The mounted adapter has already done its retries by this point, so
        # the rider is skipped rather than retried again.
        print(f"Request failed for {full_url}: {e}")
        return pd.DataFrame()

    except KeyError as e:
        print(f"KeyError for {rider_url}: {e}")
        return pd.DataFrame()  # skip this rider

    except Exception as e:
        # Deliberate best-effort: one malformed page must not stop the scrape.
        print(f"Unexpected error for {rider_url}: {e}")
        return pd.DataFrame()

In [5]:
def get_all_pages(year, event, sex):
    """Scrape every listing page for one year/event/sex and save one CSV per page.

    The first request reads the total from the "<N> Times" text on the
    listing page to work out how many pages of 100 results exist. Each page
    is then fetched, every rider link on it is resolved with
    ``get_ride_info``, and the combined rows are written to
    ``data/<year>_event_<event>_<sex>_page_<page>.csv`` (relative to the
    current working directory).

    Args:
        year (int): Results year; selects the site root.
        event (str): Event code sent as the ``event`` query parameter.
        sex (str): Value sent as the ``search[sex]`` query parameter.

    Returns:
        None. Failures are reported with ``print``; a page that cannot be
        fetched or has no rider links is skipped.
    """
    base_url = f'https://results.ridelondon.co.uk/{year}/'
    output_dir = "data"
    results_per_page = 100

    query_params = {
        "page": "1",
        "event": event,
        "num_results": str(results_per_page),
        "pid": "list",
        "pidp": "start",
        "search[sex]": sex,
    }

    retry = Retry(connect=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session's connections when we are done.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            response = session.get(base_url, params=query_params, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Could not load the first results page for {year}/{event}/{sex}: {e}")
            return

        # Without the counter there is no way to know how many pages exist.
        match = re.search(r'(\d+) Times', response.text)
        if match is None:
            print(f"No result count found for {year}/{event}/{sex}; nothing to scrape")
            return
        number_of_pages = math.ceil(int(match.group(1)) / results_per_page)

        # to_csv does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)

        for page in range(1, number_of_pages + 1):
            query_params["page"] = page
            try:
                response = session.get(base_url, params=query_params, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                # The adapter already retried; reassigning a for-loop
                # variable cannot repeat an iteration, so the page is skipped.
                print(f"Request failed on page {page}: {e}. Skipping this page.")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            rider_frames = [
                get_ride_info(base_url, href, event)
                for href in get_rider_urls(soup)
            ]

            # pd.concat raises ValueError on an empty list.
            if not rider_frames:
                print(f"No riders found on page {page}")
                continue

            final_frame = pd.concat(rider_frames)
            final_frame['sex'] = sex
            final_frame['year'] = year

            final_frame = final_frame.rename(columns={
                'Name': 'name',
                'Rider number': 'rider_number',
                'Charity': 'charity',
                'Event': 'event',
                'Finish': 'finish_time',
                'Status': 'status',
                'Last Timing Point': 'last_timing_point'
            })

            # Built from plain variables: reusing the enclosing quote inside
            # an f-string is a syntax error before Python 3.12.
            csv_name = f"{year}_event_{event}_{sex}_page_{page}.csv"
            final_frame.to_csv(os.path.join(output_dir, csv_name))

In [None]:
# Scrape every configured year / event / sex combination; each call writes
# its per-page CSV files into the relative "data" directory.
for year in years:
    for event in events:
        for sex in registered_sexes:
            get_all_pages(year, event, sex)

# Read the per-page files back from the same relative directory they were
# written to (not the filesystem root "/data").
directory = "data"

dfs = []
if os.path.isdir(directory):
    # Sorted so the row order of the combined file is reproducible.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            dfs.append(pd.read_csv(file_path))

if dfs:
    # Combine all per-page frames into one and save it next to the notebook.
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv("final_ride_data.csv")
else:
    # pd.concat raises on an empty list; keep the name defined for later cells.
    combined_df = pd.DataFrame()
    print(f"No CSV files found in '{directory}'; final_ride_data.csv was not written")