# Calculating Scheduled Headways for CTA Buses - work in progress

### What This Does

- uses chi-hack-night ghost-buses team functions to take in GTFS data for CTA buses

- uses the make_trip_summary function from the ghost bus team as a starting point to determine which services are active on a specified route during a specified service day

- calculates the start and end of each service for every stop on the route that day, based on the scheduled arrival times

- calculates the overall in-service times for a given bus stop, route, and direction of travel (continuous timeframes when one or more service(s) is active)

- calculates scheduled headways ONLY for the times service is active on that route/stop/direction of travel.
This fixes an earlier issue where out-of-service times looked like long headways.

- calculates scheduled headway statistics for a given bus stop / route / direction of travel

## To Do Next
- Generate a summary table for an entire corridor with headway stats for each stop.

- Go back to the actual headway calcs and use the in-service times to eliminate "headways" that are actually out-of-service times there too

- investigate EWT calcs



In [14]:
import requests
from dotenv import load_dotenv
import pandas as pd
import geopandas as gpd
from shapely import Point, LineString
import datetime as dt
import numpy as np
from static_gtfs_analysis import *

# Get Scheduled Stop Times and Headways


In [15]:
# Use Laurie's code (download_extract_format, from the ghost-buses team) to get gtfs feed data.
# NOTE(review): the feed version is hard-coded here; a later cell repeats this
# call using the gtfs_version_id test value instead.
gtfs_feed = download_extract_format('20230105')

INFO:root:Downloading CTA data
INFO:root:Download complete
INFO:root:Extracting data from CTA zipfile version 20230105
Loading stops.txt:   0%|          | 0/7 [00:00<?, ?it/s]INFO:root:stops.txt loaded
Loading stop_times.txt:   0%|          | 0/7 [00:00<?, ?it/s]INFO:root:stop_times.txt loaded
Loading routes.txt:  29%|██▊       | 2/7 [00:04<00:12,  2.42s/it]    INFO:root:routes.txt loaded
Loading trips.txt:  29%|██▊       | 2/7 [00:04<00:12,  2.42s/it] INFO:root:trips.txt loaded
Loading calendar.txt:  29%|██▊       | 2/7 [00:04<00:12,  2.42s/it]INFO:root:calendar.txt loaded
Loading calendar_dates.txt:  29%|██▊       | 2/7 [00:04<00:12,  2.42s/it]INFO:root:calendar_dates.txt loaded
Loading shapes.txt:  29%|██▊       | 2/7 [00:04<00:12,  2.42s/it]        INFO:root:shapes.txt loaded
Loading shapes.txt: 100%|██████████| 7/7 [00:05<00:00,  1.27it/s]


In [16]:
# Values to use for testing
# GTFS feed version passed to download_extract_format()
gtfs_version_id = '20230105'
# route to analyze (per the get_stop_details docstring, '55' is the 55 Garfield bus)
route_id = '55'
# service day to analyze, in "YYYY-MM-DD" format
service_date_string = '2023-01-09'
# single bus stop on the route used to test the per-stop functions
stop_id = '14122'
# direction of travel at that stop: 'North', 'South', 'East', or 'West'
direction = 'East'


In [17]:
# Use Laurie's code to get gtfs feed data, this time for the feed version
# chosen in the test values above (gtfs_version_id).
gtfs_feed = download_extract_format(gtfs_version_id)    

INFO:root:Downloading CTA data
INFO:root:Download complete
INFO:root:Extracting data from CTA zipfile version 20230105
Loading stops.txt:   0%|          | 0/7 [00:00<?, ?it/s]INFO:root:stops.txt loaded
Loading stop_times.txt:   0%|          | 0/7 [00:00<?, ?it/s]INFO:root:stop_times.txt loaded
Loading routes.txt:  29%|██▊       | 2/7 [00:04<00:10,  2.19s/it]    INFO:root:routes.txt loaded
Loading trips.txt:  29%|██▊       | 2/7 [00:04<00:10,  2.19s/it] INFO:root:trips.txt loaded
Loading calendar.txt:  29%|██▊       | 2/7 [00:04<00:10,  2.19s/it]INFO:root:calendar.txt loaded
Loading calendar_dates.txt:  29%|██▊       | 2/7 [00:04<00:10,  2.19s/it]INFO:root:calendar_dates.txt loaded
Loading shapes.txt:  29%|██▊       | 2/7 [00:04<00:10,  2.19s/it]        INFO:root:shapes.txt loaded
Loading shapes.txt: 100%|██████████| 7/7 [00:05<00:00,  1.38it/s]


In [18]:
def string_to_datetime(date_string:str) -> pendulum.datetime:
    '''Convert a "YYYY-MM-DD" date string into a pendulum datetime.

    Parameters:

    date_string is a date in the format "YYYY-MM-DD". The year, month and
    day are read from fixed character positions (0-3, 5-6, and 8 onward),
    so the separator characters themselves are not inspected.

    Data returned:

    The result of pendulum.datetime(year, month, day) for the specified date.
    '''
    # Slice out the three fields by position, then convert in year/month/day order.
    year_text, month_text, day_text = date_string[:4], date_string[5:7], date_string[8:]
    return pendulum.datetime(int(year_text), int(month_text), int(day_text))


In [19]:
def get_stop_details(gtfs_feed:GTFSFeed, route_id:str, service_date_string:str) -> pd.DataFrame:
    '''Build the table of scheduled stop times for one route on one service day.

    Parameters:

    gtfs_feed is obtained using the download_extract_format() function from the ghost bus team.

    route_id is a route id as a string (for example, '55' for the 55 Garfield bus).

    service_date_string is the service date to be analyzed, in the format "YYYY-MM-DD".
    Note that a service date can spill over into the next calendar day for bus
    routes that run past midnight.

    Data returned:

    DataFrame of the feed's stop_times rows for the trips on this route and day,
    with route_id, service_id, direction and raw_date merged in from the trip
    summary, plus a 'stop_time' column holding raw_date + arrival_time.
    '''

    service_day = string_to_datetime(service_date_string)

    # Trip summary from the chn-ghost-buses make_trip_summary() function, using
    # the same date as both the start and the end of the range.
    all_trips = make_trip_summary(gtfs_feed, service_day, service_day)

    # Keep only the trips on the requested route and collect their ids.
    route_trips = all_trips[all_trips['route_id'] == route_id]
    route_trip_ids = route_trips['trip_id'].unique().tolist()

    # Stop times for those trips, with service id, route and direction attached.
    trip_columns = ['trip_id', 'route_id', 'service_id', 'direction', 'raw_date']
    feed_stop_times = gtfs_feed.stop_times
    on_route = feed_stop_times.loc[feed_stop_times['trip_id'].isin(route_trip_ids)]
    scheduled_stops = on_route.merge(route_trips[trip_columns], on='trip_id')

    # Filter the merged rows down to the relevant route.
    scheduled_stops = scheduled_stops.loc[scheduled_stops['route_id'] == route_id]

    # Eliminate duplicates - every line shows up twice.
    # TODO:  Investigate why. Is this related to the calendar_cross line in the
    # make_trip_summary() function? And/or the fact that the same date is used
    # as both start and end date when calling make_trip_summary()?
    scheduled_stops = scheduled_stops.drop_duplicates()

    # Scheduled arrival as a timestamp: service date plus the arrival time offset.
    arrival_offset = pd.to_timedelta(scheduled_stops['arrival_time'])
    scheduled_stops['stop_time'] = scheduled_stops['raw_date'] + arrival_offset

    return scheduled_stops


In [20]:
# Test: build the stop details for the test route and service date, then
# display the resulting dataframe.
stop_details = get_stop_details(gtfs_feed, route_id, service_date_string)
stop_details

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled,arrival_hour,departure_hour,route_id,service_id,direction,raw_date,stop_time
0,6530000010020,04:37:30,04:37:30,10603,1,St Louis,0,0,4,4,55,65301,West,2023-01-09 00:00:00+00:00,2023-01-09 04:37:30+00:00
1,6530000010020,04:37:55,04:37:55,10604,2,St Louis,0,639,4,4,55,65301,West,2023-01-09 00:00:00+00:00,2023-01-09 04:37:55+00:00
2,6530000010020,04:38:31,04:38:31,10605,3,St Louis,0,1375,4,4,55,65301,West,2023-01-09 00:00:00+00:00,2023-01-09 04:38:31+00:00
3,6530000010020,04:39:01,04:39:01,10606,4,St Louis,0,2061,4,4,55,65301,West,2023-01-09 00:00:00+00:00,2023-01-09 04:39:01+00:00
4,6530000010020,04:39:32,04:39:32,10607,5,St Louis,0,2826,4,4,55,65301,West,2023-01-09 00:00:00+00:00,2023-01-09 04:39:32+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26368,6530045537030,23:38:09,23:38:09,15366,63,Midway Orange Line,0,45089,23,23,55,65302,West,2023-01-09 00:00:00+00:00,2023-01-09 23:38:09+00:00
26369,6530045537030,23:38:42,23:38:42,10632,64,Midway Orange Line,0,45760,23,23,55,65302,West,2023-01-09 00:00:00+00:00,2023-01-09 23:38:42+00:00
26370,6530045537030,23:39:14,23:39:14,10633,65,Midway Orange Line,0,46457,23,23,55,65302,West,2023-01-09 00:00:00+00:00,2023-01-09 23:39:14+00:00
26371,6530045537030,23:39:44,23:39:44,17030,66,Midway Orange Line,0,47108,23,23,55,65302,West,2023-01-09 00:00:00+00:00,2023-01-09 23:39:44+00:00


In [21]:
def get_active_service_times(stop_details:pd.DataFrame, stop_id:str, direction:str) -> list:

    '''This is a helper function.

    Parameters:

    stop_details is a dataframe with information on bus stop times
    for a given route and service day.  This is generated by the get_stop_details function.
    It must contain the columns 'stop_id', 'direction', 'service_id' and 'stop_time'.

    stop_id is a string representing a single bus stop.

    direction is a string representing the direction of travel at this stop to be analyzed: 'North',
    'South', 'East', or 'West'.

    Data returned:

    List of lists:  each sub-list contains two timestamps representing the start and end of
    an in-service timeframe, ordered by start time. These are continuous time ranges when ANY
    service for this route and direction is running at the given bus stop.  A service is treated
    as running from its first to its last scheduled stop time at the stop; ranges of different
    services that overlap or touch are merged into one timeframe.  Identifying these allows
    out-of-service times to be skipped in the headway calcs, so those won't show up incorrectly
    as long headways.

    An empty list is returned when no rows match the stop and direction.
    '''

    # filter stop details to a single stop and direction of travel
    single_stop_details = stop_details.loc[
        (stop_details['stop_id'] == stop_id)
        & (stop_details['direction'] == direction)]

    if single_stop_details.empty:
        return []

    # first and last scheduled stop time of every service id at this stop,
    # ordered by when each service starts
    service_ranges = (
        single_stop_details
        .groupby('service_id')['stop_time']
        .agg(['min', 'max'])
        .sort_values('min'))

    # merge the per-service ranges into timeframes when ANY service is active
    active_service_times = []
    for start_time, end_time in zip(service_ranges['min'], service_ranges['max']):
        current = active_service_times[-1] if active_service_times else None

        if current is not None and start_time <= current[1]:
            # This service begins before (or exactly when) the current timeframe
            # ends, so there is no gap.  Only extend the end if this service
            # runs later: an earlier, longer service may still be active.
            if end_time > current[1]:
                current[1] = end_time
        else:
            # first service of the day, or a service gap precedes this range:
            # start a new in-service timeframe
            active_service_times.append([start_time, end_time])

    return active_service_times


In [22]:
%%capture --no-display

# Test: list the in-service timeframes for the test stop and direction
# (the result is displayed as the cell output).
get_active_service_times(stop_details, stop_id, direction)

[[Timestamp('2023-01-09 00:05:30+0000', tz='Timezone('UTC')'),
  Timestamp('2023-01-09 02:01:00+0000', tz='Timezone('UTC')')],
 (Timestamp('2023-01-09 05:00:00+0000', tz='Timezone('UTC')'),
  Timestamp('2023-01-09 23:42:30+0000', tz='Timezone('UTC')'))]

In [23]:

# Get headways
def get_headways(stop_details:pd.DataFrame , stop_id:str, direction:str) -> pd.DataFrame:

    '''
    Parameters:

    stop_details is a dataframe with information on bus stop times
    for a given route and service day.  This is generated by the get_stop_details function.

    stop_id is a string representing a single bus stop.

    direction is a string representing the direction of travel at this stop to be analyzed: 'North',
    'South', 'East', or 'West'.

    Data returned:

    DataFrame containing the rows of stop_details for this stop and direction, sorted
    chronologically by 'stop_time', with two added columns:
    'previous_stop_time' (the stop time of the preceding row) and
    'headway' ('stop_time' minus 'previous_stop_time').
    Both added columns are empty (NaT) for the first arrival of each in-service
    timeframe, since there is no previous arrival in that timeframe to compare with.
    '''

    # get the start and end of all timeframes when one or more service is actively
    # running for the specified stop and direction of travel
    active_service_times = get_active_service_times(stop_details, stop_id, direction)

    # stop details filtered to one stop_id and direction
    df = stop_details.loc[
        (stop_details['stop_id'] == stop_id)
        & (stop_details['direction'] == direction)]

    # sort chronologically using the timestamp rather than the arrival_time
    # string, so the order does not depend on how the time text is formatted
    df = df.sort_values('stop_time')

    # add previous stop times to each row; shift() leaves the first row empty
    # instead of wrapping the last arrival of the day around to the top
    df['previous_stop_time'] = df['stop_time'].shift(1)

    # Calculate headways
    df['headway'] = df['stop_time'] - df['previous_stop_time']

    # Remove the headway and previous stop time for the first
    # bus in each active service period (no previous arrival time to compare with)
    active_service_starts = [start for (start, end) in active_service_times]

    # Only the first row at a period's start time opens the period; a second
    # arrival scheduled at the very same time keeps its (zero) headway.
    start_filter = (
        df['stop_time'].isin(active_service_starts)
        & ~df['stop_time'].duplicated())

    df.loc[start_filter, 'headway'] = pd.NaT
    df.loc[start_filter, 'previous_stop_time'] = pd.NaT

    return df


In [24]:
    
%%capture --no-display

# Test
# NOTE(review): .head(50) keeps only the first 50 rows, so anything computed
# from `headways` afterwards (e.g. get_headway_stats) describes just those
# rows rather than the whole service day - confirm this is intended.
headways = get_headways(stop_details, stop_id, direction).head(50)

In [25]:
# Copied from actual_headways.ipynb

def get_headway_stats(headways:pd.DataFrame, headway_column_name:str) -> dict:
    '''Summarize the headways in one column of a dataframe.

    Parameters:

    headways is a dataframe containing a column of headways, for example the
    dataframe returned by get_headways().

    headway_column_name is the name of the column containing the headways
    (for example 'headway' for the output of get_headways() in this notebook).

    Data returned:

    Statistics on the headways are returned as a dictionary with the keys
    'mean', 'max', 'min', '25th_percentile', 'median' and '75th_percentile'.
    '''
    headway_values = headways[headway_column_name]
    stats = {
        'mean': headway_values.mean(),
        'max': headway_values.max(),
        'min': headway_values.min(),
        '25th_percentile': headway_values.quantile(0.25),  # 25th percentile
        'median': headway_values.median(),  # 50th percentile
        '75th_percentile': headway_values.quantile(0.75),
    }
    return stats

# Summarize the 'headway' column of the `headways` dataframe built in the
# previous cell (which was truncated there with .head(50)).
get_headway_stats(headways, 'headway')

{'mean': Timedelta('0 days 00:14:50.625000'),
 'max': Timedelta('0 days 00:30:00'),
 'min': Timedelta('0 days 00:07:00'),
 '25th_percentile': Timedelta('0 days 00:12:00'),
 'median': Timedelta('0 days 00:12:00'),
 '75th_percentile': Timedelta('0 days 00:15:52.500000')}

In [26]:
# NEXT:  Generate a summary of the entire bus route, all stops, with scheduled headways
