Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 151 additions & 24 deletions data_scrape/ride-london-data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 38,
"id": "cfdcf185-3984-447c-836e-676257675ad2",
"metadata": {},
"outputs": [],
Expand All @@ -15,7 +15,8 @@
"from io import StringIO\n",
"import re\n",
"import math\n",
"import os"
"import os\n",
"from datetime import datetime"
]
},
{
Expand Down Expand Up @@ -89,9 +90,9 @@
" # Ensure the table has valid rows\n",
" if not table.empty:\n",
" # Pivoting the DataFrame\n",
" pivoted_df = table.set_index(0).T.reset_index(drop=True)\n",
" pivoted_df.columns.name = None # Remove column names\n",
" pivoted_tables.append(pivoted_df)\n",
" pivoted_final_frame_renamed = table.set_index(0).T.reset_index(drop=True)\n",
" pivoted_final_frame_renamed.columns.name = None # Remove column names\n",
" pivoted_tables.append(pivoted_final_frame_renamed)\n",
" \n",
" # Handling the \"splits\" table\n",
" split_html = get_soup.find(class_='box-splits')\n",
Expand Down Expand Up @@ -119,8 +120,8 @@
" flattened_data[f'{label}_mph'] = table['mph'][i] if 'mph' in table.columns else 'N/A'\n",
"\n",
" # Convert the flattened data dictionary back into a DataFrame with one row\n",
" split_df = pd.DataFrame([flattened_data])\n",
" pivoted_tables.append(split_df)\n",
" split_final_frame_renamed = pd.DataFrame([flattened_data])\n",
" pivoted_tables.append(split_final_frame_renamed)\n",
"\n",
" # Concatenate all the tables if there are any\n",
" if pivoted_tables:\n",
Expand Down Expand Up @@ -222,31 +223,157 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 59,
"id": "b3838a00-46fa-485b-b15e-20b046f9b86f",
"metadata": {},
"outputs": [],
"source": [
"for year in years:\n",
" for event in events:\n",
" for sex in registered_sexes:\n",
" get_all_pages(year, event, sex)\n",
"# for year in years:\n",
"# for event in events:\n",
"# for sex in registered_sexes:\n",
"# get_all_pages(year, event, sex)\n",
"\n",
"# Build one combined CSV per event from the per-page scrape files in ./data\n",
"for event in events:\n",
"    # Skip any 'final_*' file: the combined output written below also ends in '.csv'\n",
"    # and contains '<event>_', so without this guard a re-run of the cell would read\n",
"    # the previous combined file back in and duplicate its rows.\n",
"    # sorted() makes the row order independent of the directory listing order.\n",
"    event_files = sorted(\n",
"        name for name in os.listdir(\"./data\")\n",
"        if name.endswith('.csv') and (event + '_') in name and not name.startswith('final_')\n",
"    )\n",
"\n",
"    # No scrape files for this event: leave any existing combined file untouched\n",
"    if not event_files:\n",
"        continue\n",
"\n",
"    # Read every matching CSV, then concatenate once\n",
"    event_frames = [pd.read_csv(os.path.join(\"./data\", name)) for name in event_files]\n",
"    combined_final_frame_renamed = pd.concat(event_frames)\n",
"    # Save the combined DataFrame to a CSV\n",
"    combined_final_frame_renamed.to_csv(f\"./data/final_{event}_data.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "5f8d7eee-7ee3-438c-a2a2-eed1f11c7065",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:53: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" final_frame_renamed['tod_25'] = pd.to_datetime(final_frame_renamed['tod_25'], format='%H:%M:%S', errors='coerce')\n",
"/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:56: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" final_frame_renamed['ride_time_25_delta'] = pd.to_timedelta(final_frame_renamed['ride_time_25'], errors='coerce')\n",
"/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:59: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" final_frame_renamed['start_tod'] = final_frame_renamed['tod_25'] - final_frame_renamed['ride_time_25_delta']\n"
]
}
],
"source": [
"# Load the combined CSV for event 'I' and give its columns fixed, positional names\n",
"race_100_frame = pd.read_csv('./data/final_I_data.csv', low_memory=False)\n",
"column_names = [\n",
"    \"index\",\n",
"    \"rider_name\",\n",
"    \"rider_no\",\n",
"    \"charity_name\",\n",
"    \"event\",\n",
"    \"final_time\",\n",
"    \"final_status\",\n",
"    \"final_checkout\",\n",
"    \"tod_25\",\n",
"    \"ride_time_25\",\n",
"    \"diff_25\",\n",
"    \"mph_25\",\n",
"    \"tod_26\",\n",
"    \"ride_time_26\",\n",
"    # NOTE(review): \"diff_256\" looks like a typo for \"diff_26\"; left unchanged here because\n",
"    # consumers of the exported CSV may already read this column name - confirm and rename.\n",
"    \"diff_256\",\n",
"    \"mph_26\",\n",
"    \"tod_53\",\n",
"    \"ride_time_53\",\n",
"    \"diff_53\",\n",
"    \"mph_53\",\n",
"    \"tod_54\",\n",
"    \"ride_time_54\",\n",
"    \"diff_54\",\n",
"    \"mph_54\",\n",
"    \"tod_73\",\n",
"    \"ride_time_73\",\n",
"    \"diff_73\",\n",
"    \"mph_73\",\n",
"    \"tod_74\",\n",
"    \"ride_time_74\",\n",
"    \"diff_74\",\n",
"    \"mph_74\",\n",
"    \"tod_finish\",\n",
"    \"ride_time_finish\",\n",
"    \"diff_finish\",\n",
"    \"mph_finish\",\n",
"    \"sex\",\n",
"    \"year\"\n",
"]\n",
"# This is an alias, not a copy: the rename, sort and rank below also apply to race_100_frame\n",
"final_frame_renamed = race_100_frame\n",
"\n",
"# Assignment is positional, so the CSV must have exactly len(column_names) columns\n",
"final_frame_renamed.columns = column_names\n",
"final_frame_renamed.sort_values(by=['year', 'final_time'], inplace=True)\n",
"# Position within each year by final_time; tied riders all receive the highest rank of the tie\n",
"final_frame_renamed[\"rider_pos\"] = final_frame_renamed.groupby('year')['final_time'].rank(method='max')\n",
"# Take an explicit copy of the filtered rows so the column assignments below write to an\n",
"# independent frame rather than a slice of race_100_frame; assigning to the slice is what\n",
"# raised SettingWithCopyWarning for 'tod_25', 'ride_time_25_delta' and 'start_tod'.\n",
"# NOTE(review): the reason rider 126413 is excluded is not recorded in this notebook.\n",
"final_frame_renamed = final_frame_renamed[final_frame_renamed['rider_no'] != 126413].copy()\n",
"\n",
"# Define the target date\n",
"# NOTE(review): this single date is applied to every row regardless of its 'year' value -\n",
"# confirm that is intended.\n",
"race_date = datetime(2024, 5, 26)\n",
"\n",
"# Convert 'tod_25' to datetime format, invalid entries will become NaT\n",
"final_frame_renamed['tod_25'] = pd.to_datetime(final_frame_renamed['tod_25'], format='%H:%M:%S', errors='coerce')\n",
"\n",
"# Convert 'ride_time_25' to timedelta format\n",
"final_frame_renamed['ride_time_25_delta'] = pd.to_timedelta(final_frame_renamed['ride_time_25'], errors='coerce')\n",
"\n",
"dfs = []\n",
"# Subtract ride_time_25 from tod_25, NaT entries will remain NaT\n",
"final_frame_renamed['start_tod'] = final_frame_renamed['tod_25'] - final_frame_renamed['ride_time_25_delta']\n",
"\n",
"# Loop through all columns in final_frame_renamed that contain '_tod'\n",
"# (of the names defined in this cell only 'start_tod' matches; 'tod_25' etc. begin with 'tod_')\n",
"for col in final_frame_renamed.columns:\n",
"    if '_tod' in col:\n",
"        # Convert to datetime, invalid entries will become NaT\n",
"        final_frame_renamed.loc[:, col] = pd.to_datetime(final_frame_renamed[col], format='%H:%M:%S', errors='coerce').dt.time\n",
"        \n",
"        # Set the date to 26 May 2024 for valid entries\n",
"        final_frame_renamed.loc[:, col] = final_frame_renamed[col].apply(lambda t: datetime.combine(race_date, t) if pd.notnull(t) else pd.NaT)\n",
"\n",
"# Function to convert time in HH:MM:SS to decimal hours, with error handling\n",
"def time_to_decimal_hours(time_str):\n",
"    \"\"\"Convert an elapsed time to decimal hours.\n",
"\n",
"    Accepts an 'HH:MM:SS' string or a pandas Timedelta. Missing values, blank\n",
"    strings, the '–' placeholder and anything that cannot be parsed return None.\n",
"    \"\"\"\n",
"    try:\n",
"        # Missing values (NaN / None / NaT) carry no time\n",
"        if pd.isnull(time_str):\n",
"            return None\n",
"        # Timedelta values (e.g. the 'ride_time_25_delta' column) have no .strip();\n",
"        # convert them directly instead of silently turning the whole column into None\n",
"        if isinstance(time_str, pd.Timedelta):\n",
"            return time_str.total_seconds() / 3600\n",
"        cleaned = time_str.strip()\n",
"        if cleaned == '' or cleaned == \"–\":\n",
"            return None  # Blank or placeholder entry\n",
"        # Split the time string and convert to hours, minutes, and seconds\n",
"        h, m, s = map(int, cleaned.split(':'))\n",
"        return (h * 3600 + m * 60 + s) / 3600  # Convert to hours\n",
"    except (AttributeError, TypeError, ValueError):\n",
"        # Non-string input, wrong number of fields, or non-numeric fields\n",
"        return None\n",
"\n",
"# Loop over all files in the directory\n",
"for filename in os.listdir(\"/data\"):\n",
" if filename.endswith('.csv'): # Check if the file is a CSV\n",
" file_path = os.path.join(directory, filename)\n",
" # Read the CSV file and append the DataFrame to the list\n",
" df = pd.read_csv(file_path)\n",
" dfs.append(df)\n",
"# Ensure that final_frame_renamed is a copy, not a view\n",
"final_frame_renamed = final_frame_renamed.copy()\n",
"\n",
"# Combine all dataframes into one\n",
"combined_df = pd.concat(dfs, ignore_index=True)\n",
"# Add a '<name>_decimal' companion (decimal hours) for every column whose name contains 'time'\n",
"time_columns = [name for name in final_frame_renamed.columns if 'time' in name]\n",
"for col in time_columns:\n",
"    decimal_hours = final_frame_renamed[col].apply(time_to_decimal_hours)\n",
"    final_frame_renamed.loc[:, col + '_decimal'] = decimal_hours\n",
"\n",
"# Display the combined DataFrame\n",
"combined_df.to_csv(\"final_ride_data.csv\")"
"final_frame_renamed.to_csv('./data/final_I_data.csv')\n"
]
},
{
Expand Down
16 changes: 0 additions & 16 deletions src/components/timeline.js

This file was deleted.

8 changes: 0 additions & 8 deletions src/data/events.json

This file was deleted.

Loading