
Commit 2857cd7

Merge pull request #4 from jsdataviz/create-ride-london-scrape
Add initial race time overview
2 parents 9457cc8 + 237fbff commit 2857cd7


11 files changed: +44276 −320 lines changed

data_scrape/ride-london-data.ipynb

Lines changed: 151 additions & 24 deletions
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 38,
 "id": "cfdcf185-3984-447c-836e-676257675ad2",
 "metadata": {},
 "outputs": [],
@@ -15,7 +15,8 @@
 "from io import StringIO\n",
 "import re\n",
 "import math\n",
-"import os"
+"import os\n",
+"from datetime import datetime"
 ]
 },
 {
@@ -89,9 +90,9 @@
 " # Ensure the table has valid rows\n",
 " if not table.empty:\n",
 " # Pivoting the DataFrame\n",
-" pivoted_df = table.set_index(0).T.reset_index(drop=True)\n",
-" pivoted_df.columns.name = None # Remove column names\n",
-" pivoted_tables.append(pivoted_df)\n",
+" pivoted_final_frame_renamed = table.set_index(0).T.reset_index(drop=True)\n",
+" pivoted_final_frame_renamed.columns.name = None # Remove column names\n",
+" pivoted_tables.append(pivoted_final_frame_renamed)\n",
 " \n",
 " # Handling the \"splits\" table\n",
 " split_html = get_soup.find(class_='box-splits')\n",
@@ -119,8 +120,8 @@
 " flattened_data[f'{label}_mph'] = table['mph'][i] if 'mph' in table.columns else 'N/A'\n",
 "\n",
 " # Convert the flattened data dictionary back into a DataFrame with one row\n",
-" split_df = pd.DataFrame([flattened_data])\n",
-" pivoted_tables.append(split_df)\n",
+" split_final_frame_renamed = pd.DataFrame([flattened_data])\n",
+" pivoted_tables.append(split_final_frame_renamed)\n",
 "\n",
 " # Concatenate all the tables if there are any\n",
 " if pivoted_tables:\n",
@@ -222,31 +223,157 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 59,
 "id": "b3838a00-46fa-485b-b15e-20b046f9b86f",
 "metadata": {},
 "outputs": [],
 "source": [
-"for year in years:\n",
-" for event in events:\n",
-" for sex in registered_sexes:\n",
-" get_all_pages(year, event, sex)\n",
+"# for year in years:\n",
+"# for event in events:\n",
+"# for sex in registered_sexes:\n",
+"# get_all_pages(year, event, sex)\n",
+"\n",
+"for event in events:\n",
+" final_frame_renameds = []\n",
+" \n",
+" # Loop over all files in the directory\n",
+" for filename in os.listdir(\"./data\"):\n",
+" if filename.endswith('.csv') and (event + '_') in filename:\n",
+" # Read the CSV file and append the DataFrame to the list\n",
+" final_frame_renamed = pd.read_csv(\"./data/\" + filename)\n",
+" final_frame_renameds.append(final_frame_renamed)\n",
+"\n",
+" # If there are any files for this race type, concatenate them\n",
+" if final_frame_renameds:\n",
+" combined_final_frame_renamed = pd.concat(final_frame_renameds)\n",
+" # Save the combined DataFrame to a CSV\n",
+" combined_final_frame_renamed.to_csv(f\"./data/final_{event}_data.csv\", index=False)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 61,
+"id": "5f8d7eee-7ee3-438c-a2a2-eed1f11c7065",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:53: SettingWithCopyWarning: \n",
+"A value is trying to be set on a copy of a slice from a DataFrame.\n",
+"Try using .loc[row_indexer,col_indexer] = value instead\n",
+"\n",
+"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+" final_frame_renamed['tod_25'] = pd.to_datetime(final_frame_renamed['tod_25'], format='%H:%M:%S', errors='coerce')\n",
+"/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:56: SettingWithCopyWarning: \n",
+"A value is trying to be set on a copy of a slice from a DataFrame.\n",
+"Try using .loc[row_indexer,col_indexer] = value instead\n",
+"\n",
+"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+" final_frame_renamed['ride_time_25_delta'] = pd.to_timedelta(final_frame_renamed['ride_time_25'], errors='coerce')\n",
+"/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:59: SettingWithCopyWarning: \n",
+"A value is trying to be set on a copy of a slice from a DataFrame.\n",
+"Try using .loc[row_indexer,col_indexer] = value instead\n",
+"\n",
+"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+" final_frame_renamed['start_tod'] = final_frame_renamed['tod_25'] - final_frame_renamed['ride_time_25_delta']\n"
+]
+}
+],
+"source": [
+"race_100_frame = pd.read_csv('./data/final_I_data.csv', low_memory=False)\n",
+"column_names = [\n",
+" \"index\",\n",
+" \"rider_name\",\n",
+" \"rider_no\",\n",
+" \"charity_name\",\n",
+" \"event\",\n",
+" \"final_time\",\n",
+" \"final_status\",\n",
+" \"final_checkout\",\n",
+" \"tod_25\",\n",
+" \"ride_time_25\",\n",
+" \"diff_25\",\n",
+" \"mph_25\",\n",
+" \"tod_26\",\n",
+" \"ride_time_26\",\n",
+" \"diff_256\",\n",
+" \"mph_26\",\n",
+" \"tod_53\",\n",
+" \"ride_time_53\",\n",
+" \"diff_53\",\n",
+" \"mph_53\",\n",
+" \"tod_54\",\n",
+" \"ride_time_54\",\n",
+" \"diff_54\",\n",
+" \"mph_54\",\n",
+" \"tod_73\",\n",
+" \"ride_time_73\",\n",
+" \"diff_73\",\n",
+" \"mph_73\",\n",
+" \"tod_74\",\n",
+" \"ride_time_74\",\n",
+" \"diff_74\",\n",
+" \"mph_74\",\n",
+" \"tod_finish\",\n",
+" \"ride_time_finish\",\n",
+" \"diff_finish\",\n",
+" \"mph_finish\",\n",
+" \"sex\",\n",
+" \"year\"\n",
+"]\n",
+"final_frame_renamed = race_100_frame\n",
+"\n",
+"final_frame_renamed.columns = column_names\n",
+"final_frame_renamed.sort_values(by=['year', 'final_time'], inplace=True)\n",
+"final_frame_renamed[\"rider_pos\"] = final_frame_renamed.groupby('year')['final_time'].rank(method='max')\n",
+"final_frame_renamed = final_frame_renamed[final_frame_renamed['rider_no'] != 126413]\n",
+"\n",
+"# Define the target date\n",
+"race_date = datetime(2024, 5, 26)\n",
+"\n",
+"# Convert 'tod_25' to datetime format, invalid entries will become NaT\n",
+"final_frame_renamed['tod_25'] = pd.to_datetime(final_frame_renamed['tod_25'], format='%H:%M:%S', errors='coerce')\n",
+"\n",
+"# Convert 'ride_time_25' to timedelta format\n",
+"final_frame_renamed['ride_time_25_delta'] = pd.to_timedelta(final_frame_renamed['ride_time_25'], errors='coerce')\n",
 "\n",
-"dfs = []\n",
+"# Subtract ride_time_25 from tod_25, NaT entries will remain NaT\n",
+"final_frame_renamed['start_tod'] = final_frame_renamed['tod_25'] - final_frame_renamed['ride_time_25_delta']\n",
+"\n",
+"# Loop through all columns in final_frame_renamed that contain '_tod'\n",
+"for col in final_frame_renamed.columns:\n",
+" if '_tod' in col:\n",
+" # Convert to datetime, invalid entries will become NaT\n",
+" final_frame_renamed.loc[:, col] = pd.to_datetime(final_frame_renamed[col], format='%H:%M:%S', errors='coerce').dt.time\n",
+" \n",
+" # Set the date to 26 May 2024 for valid entries\n",
+" final_frame_renamed.loc[:, col] = final_frame_renamed[col].apply(lambda t: datetime.combine(race_date, t) if pd.notnull(t) else pd.NaT)\n",
+"\n",
+"# Function to convert time in HH:MM:SS to decimal hours, with error handling\n",
+"def time_to_decimal_hours(time_str):\n",
+" try:\n",
+" # Ensure the time string is valid and not empty\n",
+" if pd.isnull(time_str) or time_str.strip() == '' or time_str == \"\":\n",
+" return None # Return None for invalid entries\n",
+" # Split the time string and convert to hours, minutes, and seconds\n",
+" h, m, s = map(int, time_str.split(':'))\n",
+" total_seconds = h * 3600 + m * 60 + s\n",
+" return total_seconds / 3600 # Convert to hours\n",
+" except Exception:\n",
+" return None # Return None if there's any issue during conversion\n",
 "\n",
-"# Loop over all files in the directory\n",
-"for filename in os.listdir(\"/data\"):\n",
-" if filename.endswith('.csv'): # Check if the file is a CSV\n",
-" file_path = os.path.join(directory, filename)\n",
-" # Read the CSV file and append the DataFrame to the list\n",
-" df = pd.read_csv(file_path)\n",
-" dfs.append(df)\n",
+"# Ensure that final_frame_renamed is a copy, not a view\n",
+"final_frame_renamed = final_frame_renamed.copy()\n",
 "\n",
-"# Combine all dataframes into one\n",
-"combined_df = pd.concat(dfs, ignore_index=True)\n",
+"# Loop through all columns in final_frame_renamed that contain 'time'\n",
+"for col in final_frame_renamed.columns:\n",
+" if 'time' in col:\n",
+" # Apply the conversion function using .loc to avoid the SettingWithCopyWarning\n",
+" final_frame_renamed.loc[:, col + '_decimal'] = final_frame_renamed[col].apply(time_to_decimal_hours)\n",
 "\n",
-"# Display the combined DataFrame\n",
-"combined_df.to_csv(\"final_ride_data.csv\")"
+"final_frame_renamed.to_csv('./data/final_I_data.csv')\n"
 ]
 },
 {
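Note on the new cell above: the added time_to_decimal_hours helper turns "HH:MM:SS" ride-time strings into decimal hours so split and finish times can be ranked and plotted on a numeric axis. A minimal standalone sketch of the same conversion, for illustration only, assuming pandas alone; the sample values are made up, and pd.to_timedelta stands in for the notebook's manual split, so invalid entries come back as NaN rather than None:

import pandas as pd

# Hypothetical ride-time strings; the empty and None entries mimic riders with missing splits.
times = pd.Series(["04:32:10", "05:01:45", "", None])

# errors="coerce" maps unparseable entries to NaT, mirroring the helper's None fallback.
decimal_hours = pd.to_timedelta(times, errors="coerce").dt.total_seconds() / 3600

print(decimal_hours.round(3))  # 4.536, 5.029, NaN, NaN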

src/components/timeline.js

Lines changed: 0 additions & 16 deletions
This file was deleted.

src/data/events.json

Lines changed: 0 additions & 8 deletions
This file was deleted.
