In [1]:
import numpy as np
import pandas as pd
from io import StringIO
import re
import math
import os
from datetime import datetime, time, timedelta

In [2]:
def time_to_decimal_hours(time_val):
    try:
        if pd.isnull(time_val) or str(time_val).strip() == '' or str(time_val) == "–":
            return None

        # If it's a string, convert to datetime
        if isinstance(time_val, str):
            time_val = datetime.strptime(time_val.strip(), '%Y-%m-%d %H:%M:%S')

        # Now it's a datetime object
        total_seconds = time_val.hour * 3600 + time_val.minute * 60 + time_val.second
        return total_seconds / 3600
    except Exception as e:
        print(f"Error: {e} with value {time_val}")  # Helpful for debugging
        return None

In [9]:
import pandas as pd
import numpy as np

# Load the data
parsed_100_frame = pd.read_csv('./web_data/parsed_I_data.csv', low_memory=False)

# Filter for year 2024
race_frame_2024 = parsed_100_frame[parsed_100_frame['year'] == 2024].copy()

# Convert start time to datetime and decimal hours
race_frame_2024['start_tod'] = pd.to_datetime(
    race_frame_2024['start_tod'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)

def time_to_decimal_hours(t):
    return t.hour + t.minute / 60 + t.second / 3600 if pd.notnull(t) else np.nan

race_frame_2024['start_tod_decimal'] = round(
    race_frame_2024['start_tod'].apply(time_to_decimal_hours), 2
)

# Select necessary columns, including rest stop times
slimmed_frame = race_frame_2024[[
    'rider_no', 
    'is_early_starter',
    'is_late_starter',
    'mph_25', 'mph_53', 'mph_73', 'mph_finish',
    'start_tod', 'start_tod_decimal',
    'ride_time_25_decimal', 'ride_time_53_decimal',
    'ride_time_73_decimal', 'ride_time_finish_decimal',
    'tod_25', 'tod_26', 'tod_53', 'tod_54', 'tod_73', 'tod_74'
]]

# Create the hour range series
hour_range = np.linspace(6, 18, 49)  # Every 0.25 hour from 6 to 18
hour_df = pd.DataFrame({'hour': hour_range})

# Cross join
slimmed_frame['key'] = 1
hour_df['key'] = 1
cross_joined = pd.merge(slimmed_frame, hour_df, on='key').drop('key', axis=1)

# Ensure all relevant columns are numeric
cols_to_float = [
    'mph_25', 'mph_53', 'mph_73', 'mph_finish',
    'ride_time_25_decimal', 'ride_time_53_decimal',
    'ride_time_73_decimal', 'ride_time_finish_decimal',
    'start_tod_decimal', 'hour'
]

cross_joined[cols_to_float] = cross_joined[cols_to_float].apply(pd.to_numeric, errors='coerce')

# Convert tod_* fields to decimal hours (needed for rest stop comparison)
tod_columns = ['tod_25', 'tod_26', 'tod_53', 'tod_54', 'tod_73', 'tod_74']
for col in tod_columns:
    if not np.issubdtype(cross_joined[col].dtype, np.number):
        cross_joined[col] = pd.to_datetime(cross_joined[col], errors='coerce')
        cross_joined[col] = cross_joined[col].apply(time_to_decimal_hours)

# Define race segments
segments = [
    {'mph_col': 'mph_25', 'end_time_col': 'ride_time_25_decimal'},
    {'mph_col': 'mph_53', 'end_time_col': 'ride_time_53_decimal'},
    {'mph_col': 'mph_73', 'end_time_col': 'ride_time_73_decimal'},
    {'mph_col': 'mph_finish', 'end_time_col': 'ride_time_finish_decimal'},
]

# Define rest stop windows
rest_stops = [
    {'start': 'tod_25', 'end': 'tod_26', 'label': 'Stop 25'},
    {'start': 'tod_53', 'end': 'tod_54', 'label': 'Stop 53'},
    {'start': 'tod_73', 'end': 'tod_74', 'label': 'Stop 73'},
]

# Distance + status calculation
def estimate_distance_and_status(row, segments, rest_stops):
    elapsed_time = row['hour'] - row['start_tod_decimal']
    distance = 0
    prev_time = 0

    for stop in rest_stops:
        if pd.notnull(row[stop['start']]) and pd.notnull(row[stop['end']]):
            if row[stop['start']] <= row['hour'] < row[stop['end']]:
                return 0, stop['label']

    for segment in segments:
        seg_end_time = row[segment['end_time_col']]
        if elapsed_time <= prev_time:
            break
        segment_time = min(elapsed_time, seg_end_time) - prev_time
        distance += segment_time * row[segment['mph_col']]
        prev_time = seg_end_time

    return distance, "Riding"

# Apply to each row
cross_joined[['estimated_distance', 'status']] = cross_joined.apply(
    lambda row: pd.Series(estimate_distance_and_status(row, segments, rest_stops)),
    axis=1
)

def custom_distance_bucket(row):
    if pd.isna(row['start_tod_decimal']) or pd.isna(row['hour']):
        return np.nan

    if row['hour'] < row['start_tod_decimal']:
        return "Not Started"

    elapsed_time = row['hour'] - row['start_tod_decimal']
    
    if pd.notnull(row['ride_time_finish_decimal']) and elapsed_time > row['ride_time_finish_decimal']:
        return "Finished"

    if isinstance(row['status'], str) and row['status'].startswith("Stop"):
        return row['status']

    if pd.notnull(row['estimated_distance']):
        return int(np.floor(row['estimated_distance'] / 5) * 5)

    return np.nan


cross_joined['estimated_distance_bucket'] = cross_joined.apply(custom_distance_bucket, axis=1)

# Drop rows with missing buckets
grouped = cross_joined[cross_joined['estimated_distance_bucket'].notna()]

# Group by hour and bucket
rider_distribution = (
    grouped.groupby(['hour', 'estimated_distance_bucket'], dropna=False)
      .agg(
          regular_riders=('rider_no', lambda x: x[(grouped.loc[x.index, 'is_early_starter'] == False) & (grouped.loc[x.index, 'is_late_starter'] == False)].nunique()),
          early_starters=('rider_no', lambda x: x[grouped.loc[x.index, 'is_early_starter'] == True].nunique()),
          late_starters=('rider_no', lambda x: x[grouped.loc[x.index, 'is_late_starter'] == True].nunique())
      )
      .reset_index()
)

# Save to CSV
rider_distribution.to_csv('./race_sim_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slimmed_frame['key'] = 1
  cross_joined[col] = pd.to_datetime(cross_joined[col], errors='coerce')
  cross_joined[col] = pd.to_datetime(cross_joined[col], errors='coerce')
  cross_joined[col] = pd.to_datetime(cross_joined[col], errors='coerce')
  cross_joined[col] = pd.to_datetime(cross_joined[col], errors='coerce')
  cross_joined[col] = pd.to_datetime(cross_joined[col], errors='coerce')
