In [13]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import re
import math
import os

In [2]:
def get_rider_urls(soup):
    """Collect the rider detail links from a parsed results-list page.

    For every element carrying the CSS class 'type-fullname', the first
    <a> tag inside it is inspected; its 'href' value is kept when the
    anchor exists and has that attribute.

    Args:
        soup: Parsed page exposing ``find_all(class_=...)`` (a
            BeautifulSoup object in this notebook).

    Returns:
        list: The collected href values, in document order.
    """
    # Only the first anchor of each matching element is considered.
    first_anchors = (
        name_cell.find('a')
        for name_cell in soup.find_all(class_='type-fullname')
    )
    return [
        anchor['href']
        for anchor in first_anchors
        if anchor and anchor.has_attr('href')
    ]

In [3]:
def get_ride_info(base_url, rider_url, event, timeout=30):
    """Fetch one rider's detail page and flatten it into a single-row DataFrame.

    The key/value tables found inside the 'box-general', 'box-totals' and
    'box-state' elements are transposed so that the values of their first
    column become column headers. The first table inside 'box-splits' is
    flattened into ``<label>_tod``, ``<label>_time``, ``<label>_diff`` and
    ``<label>_mph`` columns, one group per timing-point label of ``event``.

    Args:
        base_url: Site prefix that ``rider_url`` is appended to.
        rider_url: Relative link to the rider page (as returned by
            ``get_rider_urls``).
        event: Event code selecting the timing-point labels; one of
            'I', 'I60' or 'I30'.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        pandas.DataFrame with all extracted tables concatenated side by side,
        or ``None`` when the page does not answer with HTTP 200.

    Raises:
        KeyError: If ``event`` is not a known event code (raised before any
            network request is made).
    """
    # Custom labels for the rows of the splits table, per event.
    labels = {
        "I": ['arr25', 'dep26', 'arr53', 'dep54', 'arr73', 'dep74', 'finish'],
        "I60": ['arr25', 'dep26', 'arr32', 'dep33', 'finish'],
        "I30": ['finish']
    }
    # Fail fast on an unknown event instead of after the network round-trip.
    split_labels = labels[event]

    full_url = base_url + rider_url
    # Without a timeout a single stalled connection blocks the whole scrape.
    get_response = requests.get(full_url, timeout=timeout)

    if get_response.status_code != 200:
        # No usable page: callers receive None, as they did implicitly before.
        return None

    # Parse the HTML content from the GET response
    get_soup = BeautifulSoup(get_response.text, 'html.parser')

    pivoted_tables = []

    # Find the specific classes and extract the tables within those classes
    for target_class in ('box-general', 'box-totals', 'box-state'):
        box = get_soup.find(class_=target_class)
        if not box:
            continue

        for table in pd.read_html(StringIO(str(box))):
            # Transpose so the first column's values become the headers
            # and the table collapses to a single data row.
            pivoted_df = table.set_index(0).T.reset_index(drop=True)
            # Clear the leftover axis name (0) on the columns index.
            pivoted_df.columns.name = None
            pivoted_tables.append(pivoted_df)

    # The splits box may be absent; in that case every split column is
    # filled with NA so the row keeps the same columns as other riders.
    split_html = get_soup.find(class_='box-splits')
    split_table = None
    if split_html is not None:
        split_table = pd.read_html(StringIO(str(split_html)))[0]

    split_columns = (
        ('tod', 'Time Of Day'),
        ('time', 'Time'),
        ('diff', 'diff.'),
        ('mph', 'mph'),
    )

    # Create a flattened dictionary for the new row format
    flattened_data = {}
    for i, label in enumerate(split_labels):
        # A table with fewer rows than labels yields NA rather than a KeyError.
        has_row = split_table is not None and i in split_table.index
        for suffix, column in split_columns:
            flattened_data[f'{label}_{suffix}'] = (
                split_table.loc[i, column] if has_row else pd.NA
            )

    # Convert the flattened data dictionary back into a DataFrame with one row
    pivoted_tables.append(pd.DataFrame([flattened_data]))

    return pd.concat(pivoted_tables, axis=1)

In [10]:
# Base URL for the GET requests
base_url = 'https://results.ridelondon.co.uk/2024/'

# Page size requested from the site; also used to derive the page count.
results_per_page = 100

# Search filters shared by every results-list request; only "page" varies.
list_params = {
    "event": "I30",
    "num_results": str(results_per_page),
    "pid": "list",
    "pidp": "start",
    "search[sex]": "M",
}

query_params = {"page": "1", **list_params}

# Fetch the first page once to learn how many results exist in total.
response = requests.get(base_url, params=query_params, timeout=30)
response.raise_for_status()

match = re.search(r'(\d+) Times', response.text)
if match is None:
    raise ValueError("Could not find the '<N> Times' result count on the first results page.")
number_of_times = int(match.group(1))
number_of_pages = math.ceil(number_of_times / results_per_page)

# One CSV per results page is written into this directory.
output_dir = f"./2024_{list_params['search[sex]']}_{list_params['event']}"
os.makedirs(output_dir, exist_ok=True)

for i in range(1, number_of_pages + 1):
    print(i)
    rider_frames = []

    query_params = {"page": i, **list_params}

    response = requests.get(base_url, params=query_params, timeout=30)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"GET request failed with status code: {response.status_code}")
        # Nothing to concatenate for this page; move on to the next one.
        continue

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    hrefs = get_rider_urls(soup)

    # Fetch every rider's detail page linked from this results page.
    for href in hrefs:
        rider_frame = get_ride_info(base_url, href, query_params["event"])
        # get_ride_info returns None when the rider page could not be fetched.
        if rider_frame is not None:
            rider_frames.append(rider_frame)

    if not rider_frames:
        # pd.concat raises on an empty list, so skip pages with no rider data.
        print(f"No rider data collected for page {i}; skipping CSV.")
        continue

    final_frame = pd.concat(rider_frames)

    final_frame = final_frame.rename(columns={
        'Name': 'name',
        'Rider number': 'rider_number',
        'Charity': 'charity',
        'Event': 'event',
        'Finish': 'finish_time',
        'Status': 'status',
        'Last Timing Point': 'last_timing_point'
    })

    page_file = (
        f"2024_event_{query_params['event']}_{query_params['search[sex]']}"
        f"_page_{query_params['page']}.csv"
    )
    final_frame.to_csv(f"{output_dir}/{page_file}")


1
2
3
4
5


In [14]:
def _page_sort_key(filename):
    """Order per-page CSV names by numeric page suffix (page_2 before page_10)."""
    page_match = re.search(r'_page_(\d+)\.csv$', filename)
    if page_match:
        return (0, int(page_match.group(1)), filename)
    # Names without a page suffix sort after the paged ones, alphabetically.
    return (1, 0, filename)


def combine_page_csvs(directory, output_path):
    """Concatenate every CSV file in ``directory`` and write the result.

    Files are read in ascending page order (falling back to name order), so
    the combined row order is the same on every run regardless of how the
    operating system lists the directory.

    Args:
        directory: Folder containing the per-page CSV files.
        output_path: Destination path of the combined CSV.

    Returns:
        The combined pandas.DataFrame, or ``None`` when ``directory`` does
        not exist or contains no CSV files (nothing is written then).
    """
    if not os.path.isdir(directory):
        print(f"Skipping {directory}: directory not found.")
        return None

    csv_names = sorted(
        (name for name in os.listdir(directory) if name.endswith('.csv')),
        key=_page_sort_key,
    )
    if not csv_names:
        # pd.concat raises on an empty list; report and skip instead.
        print(f"Skipping {directory}: no CSV files found.")
        return None

    # NOTE: files written with pandas' default index come back with an
    # extra 'Unnamed: 0' column; it is carried through unchanged here.
    dfs = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]

    # Combine all dataframes into one
    combined_df = pd.concat(dfs, ignore_index=True)

    # Write the combined DataFrame to disk
    combined_df.to_csv(output_path)
    return combined_df


directories = ["2024_M_I", "2024_W_I", "2024_M_I60", "2024_W_I60", "2024_M_I30", "2024_W_I30"]

for directory in directories:
    combined_df = combine_page_csvs(directory, f"{directory}.csv")