jsdataviz
diff --git a/‎data_scrape/ride-london-data.ipynb‎
Lines changed: 151 additions & 24 deletions b/‎data_scrape/ride-london-data.ipynb‎
Lines changed: 151 additions & 24 deletions
diff --git a/‎src/components/timeline.js‎
Lines changed: 0 additions & 16 deletions b/‎src/components/timeline.js‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎src/data/events.json‎
Lines changed: 0 additions & 8 deletions b/‎src/data/events.json‎
Lines changed: 0 additions & 8 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 38,
    "id": "cfdcf185-3984-447c-836e-676257675ad2",
    "metadata": {},
    "outputs": [],
@@ -15,7 +15,8 @@
     "from io import StringIO\n",
     "import re\n",
     "import math\n",
-    "import os"
+    "import os\n",
+    "from datetime import datetime"
    ]
   },
   {
@@ -89,9 +90,9 @@
     "                    # Ensure the table has valid rows\n",
     "                    if not table.empty:\n",
     "                        # Pivoting the DataFrame\n",
-    "                        pivoted_df = table.set_index(0).T.reset_index(drop=True)\n",
-    "                        pivoted_df.columns.name = None  # Remove column names\n",
-    "                        pivoted_tables.append(pivoted_df)\n",
+    "                        pivoted_final_frame_renamed = table.set_index(0).T.reset_index(drop=True)\n",
+    "                        pivoted_final_frame_renamed.columns.name = None  # Remove column names\n",
+    "                        pivoted_tables.append(pivoted_final_frame_renamed)\n",
     "        \n",
     "        # Handling the \"splits\" table\n",
     "        split_html = get_soup.find(class_='box-splits')\n",
@@ -119,8 +120,8 @@
     "                    flattened_data[f'{label}_mph'] = table['mph'][i] if 'mph' in table.columns else 'N/A'\n",
     "\n",
     "                # Convert the flattened data dictionary back into a DataFrame with one row\n",
-    "                split_df = pd.DataFrame([flattened_data])\n",
-    "                pivoted_tables.append(split_df)\n",
+    "                split_final_frame_renamed = pd.DataFrame([flattened_data])\n",
+    "                pivoted_tables.append(split_final_frame_renamed)\n",
     "\n",
     "        # Concatenate all the tables if there are any\n",
     "        if pivoted_tables:\n",
@@ -222,31 +223,157 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 59,
    "id": "b3838a00-46fa-485b-b15e-20b046f9b86f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "for year in years:\n",
-    "    for event in events:\n",
-    "        for sex in registered_sexes:\n",
-    "            get_all_pages(year, event, sex)\n",
+    "# for year in years:\n",
+    "#     for event in events:\n",
+    "#         for sex in registered_sexes:\n",
+    "#             get_all_pages(year, event, sex)\n",
+    "\n",
+    "# Build one combined CSV per event from the per-page scrape files in ./data\n",
+    "for event in events:\n",
+    "    final_frame_renameds = []\n",
+    "\n",
+    "    # Sorted so the row order of the combined file is reproducible (os.listdir order is arbitrary)\n",
+    "    for filename in sorted(os.listdir(\"./data\")):\n",
+    "        # Skip earlier 'final_*' outputs: they live in the same folder and their names also\n",
+    "        # contain '<event>_', so a re-run would otherwise fold the previous result back in\n",
+    "        if filename.startswith('final_') or not filename.endswith('.csv'):\n",
+    "            continue\n",
+    "        if (event + '_') in filename:\n",
+    "            # Read the CSV file and append the DataFrame to the list\n",
+    "            final_frame_renamed = pd.read_csv(\"./data/\" + filename)\n",
+    "            final_frame_renameds.append(final_frame_renamed)\n",
+    "\n",
+    "    # Only write a combined file when at least one scraped CSV matched this event\n",
+    "    if final_frame_renameds:\n",
+    "        combined_final_frame_renamed = pd.concat(final_frame_renameds)\n",
+    "        # index=False keeps the concatenated, non-unique row index out of the file\n",
+    "        combined_final_frame_renamed.to_csv(f\"./data/final_{event}_data.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "5f8d7eee-7ee3-438c-a2a2-eed1f11c7065",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:53: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  final_frame_renamed['tod_25'] = pd.to_datetime(final_frame_renamed['tod_25'], format='%H:%M:%S', errors='coerce')\n",
+      "/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:56: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  final_frame_renamed['ride_time_25_delta'] = pd.to_timedelta(final_frame_renamed['ride_time_25'], errors='coerce')\n",
+      "/var/folders/h8/dz6krpcx2yb_86vz5sx_qczw0000gp/T/ipykernel_21815/3973061266.py:59: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  final_frame_renamed['start_tod'] = final_frame_renamed['tod_25'] - final_frame_renamed['ride_time_25_delta']\n"
+     ]
+    }
+   ],
+   "source": [
+    "race_100_frame = pd.read_csv('./data/final_I_data.csv', low_memory=False)\n",
+    "# Names applied positionally to the frame's columns further down; the list has 38 entries:\n",
+    "# 8 rider/result fields, 7 checkpoints x (tod, ride_time, diff, mph), then sex and year.\n",
+    "# Assumes the CSV has exactly these columns in this order - TODO confirm against the scrape output\n",
+    "column_names = [\n",
+    "    \"index\",\n",
+    "    \"rider_name\",\n",
+    "    \"rider_no\",\n",
+    "    \"charity_name\",\n",
+    "    \"event\",\n",
+    "    \"final_time\",\n",
+    "    \"final_status\",\n",
+    "    \"final_checkout\",\n",
+    "    \"tod_25\",\n",
+    "    \"ride_time_25\",\n",
+    "    \"diff_25\",\n",
+    "    \"mph_25\",\n",
+    "    \"tod_26\",\n",
+    "    \"ride_time_26\",\n",
+    "    \"diff_26\",\n",
+    "    \"mph_26\",\n",
+    "    \"tod_53\",\n",
+    "    \"ride_time_53\",\n",
+    "    \"diff_53\",\n",
+    "    \"mph_53\",\n",
+    "    \"tod_54\",\n",
+    "    \"ride_time_54\",\n",
+    "    \"diff_54\",\n",
+    "    \"mph_54\",\n",
+    "    \"tod_73\",\n",
+    "    \"ride_time_73\",\n",
+    "    \"diff_73\",\n",
+    "    \"mph_73\",\n",
+    "    \"tod_74\",\n",
+    "    \"ride_time_74\",\n",
+    "    \"diff_74\",\n",
+    "    \"mph_74\",\n",
+    "    \"tod_finish\",\n",
+    "    \"ride_time_finish\",\n",
+    "    \"diff_finish\",\n",
+    "    \"mph_finish\",\n",
+    "    \"sex\",\n",
+    "    \"year\"\n",
+    "]\n",
+    "# NOTE: this is an alias, not a copy - the rename, sort and rank below also modify race_100_frame\n",
+    "final_frame_renamed = race_100_frame\n",
+    "\n",
+    "final_frame_renamed.columns = column_names\n",
+    "final_frame_renamed.sort_values(by=['year', 'final_time'], inplace=True)\n",
+    "# Rank within each year by final_time; tied rows all receive the highest rank of their group\n",
+    "final_frame_renamed[\"rider_pos\"] = final_frame_renamed.groupby('year')['final_time'].rank(method='max')\n",
+    "# Drop rider 126413 (reason not recorded here - TODO document).\n",
+    "# NOTE(review): the rank above is computed before this row is removed, so positions still count it.\n",
+    "# The explicit .copy() makes the filtered frame independent of its parent, so the column\n",
+    "# assignments that follow no longer trigger SettingWithCopyWarning.\n",
+    "final_frame_renamed = final_frame_renamed[final_frame_renamed['rider_no'] != 126413].copy()\n",
+    "\n",
+    "# Calendar date that is stamped onto the '_tod' columns further down\n",
+    "# NOTE(review): one fixed 2024 date is used for every row although the frame has a 'year' column - confirm\n",
+    "race_date = datetime(2024, 5, 26)\n",
+    "\n",
+    "# Parse 'tod_25' as HH:MM:SS; errors='coerce' turns unparseable entries into NaT\n",
+    "# NOTE(review): a time-only format presumably leaves pandas' default date on these values - verify\n",
+    "final_frame_renamed['tod_25'] = pd.to_datetime(final_frame_renamed['tod_25'], format='%H:%M:%S', errors='coerce')\n",
+    "\n",
+    "# Parse 'ride_time_25' into a timedelta (NaT when unparseable), kept in a separate '_delta' column\n",
+    "final_frame_renamed['ride_time_25_delta'] = pd.to_timedelta(final_frame_renamed['ride_time_25'], errors='coerce')\n",
     "\n",
-    "dfs = []\n",
+    "# start_tod = tod_25 - ride_time_25_delta; rows where either operand is NaT stay NaT\n",
+    "final_frame_renamed['start_tod'] = final_frame_renamed['tod_25'] - final_frame_renamed['ride_time_25_delta']\n",
+    "\n",
+    "# Re-date every column whose name contains '_tod'\n",
+    "# NOTE(review): the checkpoint columns are named 'tod_<n>' (prefix), so of the columns named in this\n",
+    "# cell only 'start_tod' passes this test - confirm whether 'tod_' was intended\n",
+    "for col in final_frame_renamed.columns:\n",
+    "    if '_tod' in col:\n",
+    "        # Reduce each value to its time of day; unparseable entries are coerced to NaT first\n",
+    "        final_frame_renamed.loc[:, col] = pd.to_datetime(final_frame_renamed[col], format='%H:%M:%S', errors='coerce').dt.time\n",
+    "        \n",
+    "        # Attach race_date to each valid time of day; missing values stay NaT\n",
+    "        final_frame_renamed.loc[:, col] = final_frame_renamed[col].apply(lambda t: datetime.combine(race_date, t) if pd.notnull(t) else pd.NaT)\n",
+    "\n",
+    "def time_to_decimal_hours(time_str):\n",
+    "    \"\"\"Convert an 'HH:MM:SS' string to hours as a float.\n",
+    "\n",
+    "    Returns None for anything that is not a well-formed time string: missing values,\n",
+    "    non-strings, blank strings, the \"–\" placeholder, or text that does not split on ':'\n",
+    "    into exactly three integer fields.\n",
+    "    \"\"\"\n",
+    "    # Non-strings (NaN, None, Timedelta, ...) cannot be parsed; reject them explicitly\n",
+    "    # instead of relying on a blanket except to hide the resulting AttributeError\n",
+    "    if not isinstance(time_str, str):\n",
+    "        return None\n",
+    "    if time_str.strip() == '' or time_str == \"–\":\n",
+    "        return None\n",
+    "    try:\n",
+    "        h, m, s = map(int, time_str.split(':'))\n",
+    "    except ValueError:\n",
+    "        # Wrong number of fields, or a field that is not an integer\n",
+    "        return None\n",
+    "    return (h * 3600 + m * 60 + s) / 3600\n",
     "\n",
-    "# Loop over all files in the directory\n",
-    "for filename in os.listdir(\"/data\"):\n",
-    "    if filename.endswith('.csv'):  # Check if the file is a CSV\n",
-    "        file_path = os.path.join(directory, filename)\n",
-    "        # Read the CSV file and append the DataFrame to the list\n",
-    "        df = pd.read_csv(file_path)\n",
-    "        dfs.append(df)\n",
+    "# Ensure that final_frame_renamed is a copy, not a view\n",
+    "final_frame_renamed = final_frame_renamed.copy()\n",
     "\n",
-    "# Combine all dataframes into one\n",
-    "combined_df = pd.concat(dfs, ignore_index=True)\n",
+    "# Add a '<col>_decimal' column (hours as a float) for every column whose name contains 'time'.\n",
+    "# 'ride_time_25_delta' holds timedeltas rather than HH:MM:SS strings, so the string parser would\n",
+    "# return None for every row; '*_delta' columns are skipped instead of emitting an empty column.\n",
+    "# list(...) snapshots the column names because the loop adds columns as it goes.\n",
+    "for col in list(final_frame_renamed.columns):\n",
+    "    if 'time' in col and not col.endswith('_delta'):\n",
+    "        final_frame_renamed[col + '_decimal'] = final_frame_renamed[col].apply(time_to_decimal_hours)\n",
     "\n",
-    "# Display the combined DataFrame\n",
-    "combined_df.to_csv(\"final_ride_data.csv\")"
+    "# NOTE(review): this overwrites the file read at the top of the cell (with extra columns and the\n",
+    "# row index added), so the positional column rename fails if the cell is run a second time without\n",
+    "# regenerating ./data/final_I_data.csv first - consider writing to a separate output path.\n",
+    "final_frame_renamed.to_csv('./data/final_I_data.csv')\n"
    ]
   },
   {