# Calculating Scheduled Headways for CTA Buses - work in progress

### What This Does

TBD

## In progress

- Calculate scheduled headways at each stop, making sure they make sense given different patterns that
run at different times of day for each route


## To Do Next

TBD

## Notes

TBD


## Strategy for Scheduled Headways (Tentative)

1. Which service IDs serve a given stop?

2. When are the earliest/latest times those service IDs are in effect?

3. What are the headways within each timeframe?

## Needs Fixing
 
TBD

In [1]:
import requests
from dotenv import load_dotenv
import pandas as pd
import geopandas as gpd
from shapely import Point, LineString
import datetime as dt
import numpy as np
from static_gtfs_analysis import *

In [2]:
# # Get API key from the .env file
# load_dotenv()
# API_KEY = os.getenv('API_KEY')

# Get Scheduled Stop Times and Headways


In [3]:
# Use Laurie's code to get gtfs feed data

gtfs_feed = download_extract_format('20230105')

INFO:root:Downloading CTA data
INFO:root:Download complete
INFO:root:Extracting data from CTA zipfile version 20230105
Loading stops.txt:   0%|          | 0/7 [00:00<?, ?it/s]INFO:root:stops.txt loaded
Loading stop_times.txt:   0%|          | 0/7 [00:00<?, ?it/s]INFO:root:stop_times.txt loaded
Loading routes.txt:  29%|██▊       | 2/7 [00:03<00:09,  1.97s/it]    INFO:root:routes.txt loaded
Loading trips.txt:  29%|██▊       | 2/7 [00:03<00:09,  1.97s/it] INFO:root:trips.txt loaded
Loading calendar.txt:  29%|██▊       | 2/7 [00:04<00:09,  1.97s/it]INFO:root:calendar.txt loaded
Loading calendar_dates.txt:  29%|██▊       | 2/7 [00:04<00:09,  1.97s/it]INFO:root:calendar_dates.txt loaded
Loading shapes.txt:  29%|██▊       | 2/7 [00:04<00:09,  1.97s/it]        INFO:root:shapes.txt loaded
Loading shapes.txt: 100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


In [4]:
# Get stop data from gtfs feed
df_stops = gtfs_feed.stops

# Eliminate stops that don't have public facing stop codes
df_stops = df_stops.loc[df_stops['stop_code'].notnull()]

df_stops

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,location_type,parent_station,wheelchair_boarding
0,1,1,Jackson & Austin Terminal,"Jackson & Austin Terminal, Northeastbound, Bus...",41.87632184,-87.77410482,0,,1
1,2,2,5900 W Jackson,"5900 W Jackson, Eastbound, Southside of the St...",41.87706679,-87.77131794,0,,1
2,4,4,5700 W Jackson,"5700 W Jackson, Eastbound, Southside of the St...",41.87702418,-87.76745055,0,,1
3,6,6,Jackson & Lotus,"Jackson & Lotus, Eastbound, Southeast Corner",41.876513,-87.761446,0,,1
4,7,7,5351 W Jackson,"5351 W Jackson, Eastbound, Southside of the St...",41.87655197,-87.75892544,0,,1
...,...,...,...,...,...,...,...,...,...
10763,18644,18644,Grand & Rockwell,"Grand & Rockwell, Eastbound, Southside of the ...",41.89215007,-87.69141601,0,,1
10764,18645,18645,Grand & Western,"Grand & Western, Eastbound, Southeast Corner",41.89058595,-87.68636489,0,,1
10765,18646,18646,Grand & Oakley,"Grand & Oakley, Eastbound, Southeast Corner",41.89063726,-87.68397303,0,,1
11215,999990,999990,Rosemont Blue Line,Rosemont Blue Line Station Terminal,41.9840004,-87.860325,0,,1


In [25]:
def get_scheduled_headways(
        route_id:str, 
        service_date_string:str, 
        start_timedelta_string:str='02:30', 
        end_timedelta_string:str='26:30') -> pd.DataFrame:

        '''Parameters:\n
        route_id is a route id as a string (for example, '55' for the 55 Garfield bus)\n
        start_date_string and end_date_string are in 'YYYY-MM-DD'format\n

        start_timedelta_string in 'hh:mm' format.  (optional. Default is '02:30')\n
        end_timedelta_string in 'hh:mm' format. (optional. Default is '26:30)'\n
        start and end timedeltas are relative to midnight on the service date.
        So a start_timedelta of '06:46' would be 6:46am on the service date, 
        while an end_timedelta of '26:02' would be 2:02 am the next day.  \n

        # Data returned:\n
        # dataframe containing scheduled trip information including shceudled headways
        # for all bus stops running on the specified route between the times specified
        # in the start_timedelta_string and end_timedelta_string. Default timeframe
        # if no start/end times are specified is 2:30 am on the specified date to 2:30 am
        # on the following date.\n
        '''

        # Create dataframe to contain headway data
        df_output = pd.DataFrame()

        def string_to_datetime(date_string):
                year = int(date_string[:4])
                month = int(date_string[5:7])
                day = int(date_string[8:])
                return pendulum.datetime(year, month, day)

        # Capture service date plus or minus one day.  Service that
        # started the previous day neds to be captured at each stop,
        # plus service that spills into the next day for late night routes.
        service_date = string_to_datetime(service_date_string)
        start_date = service_date - dt.timedelta(days=1)
        end_date = service_date + dt.timedelta(days=1)

        # Add seconds to the timedelta strings and convert to timedeltas
        start_timedelta = pd.to_timedelta(start_timedelta_string+':00')
        end_timedelta = pd.to_timedelta(end_timedelta_string+':00')
        
        start_time = service_date + start_timedelta
        end_time = service_date + end_timedelta

        print(start_time)
        print(end_time)

        # Get trip summary for service date plus/minus a day using chn-ghost-buses make_trip_summary() function
        trip_summary = make_trip_summary(gtfs_feed, start_date, end_date)

        # Turn arrival times into timestamps
        trip_summary['arrival_time'] =  pd.to_timedelta(trip_summary['arrival_time'])
        trip_summary['arrival_time'] += trip_summary['raw_date']

        # filter to the specified route
        trip_summary = trip_summary.loc[trip_summary['route_id'] == route_id]

        # Filter to the specified start/end times
        trip_summary = trip_summary.loc[trip_summary['arrival_time'] >= start_time]
        trip_summary = trip_summary.loc[trip_summary['arrival_time'] < end_time]


        ############## Edits start here ##################
        # What patterns serve a given stop?
        


        # consider all buses stopping at a given stop moving in the same direction
        # (Is direction necessary?  Are there any cases where a stop serves two directions?)
        for stop_id, direction in set(zip(trip_summary['stop_id'], trip_summary['direction'])):

                # filter data
                trip_summary_stop_direction = trip_summary.loc[
                        (trip_summary['stop_id'] == stop_id) 
                        & (trip_summary['direction'] == direction)
                        ]

                # Sort chronologically
                trip_summary_stop_direction.sort_values(by='arrival_time',ascending=True, inplace=True)

                # list stop times in chronological order
                stop_times = trip_summary_stop_direction['arrival_time'].tolist()

                # calculate previous stop times
                prev_arrival_times = np.roll(stop_times,1)
                trip_summary_stop_direction['previous_arrival_time'] = prev_arrival_times

                # calculate headway
                trip_summary_stop_direction['scheduled_headway'] = (
                        trip_summary_stop_direction['arrival_time'] - trip_summary_stop_direction['previous_arrival_time']
                        )

                # drop previous arrrival time column, no longer needed
                trip_summary_stop_direction = trip_summary_stop_direction.drop('previous_arrival_time', axis=1)

                # Remove headway from the first bus in the dataset since we don't have the 
                # previous bus to compare with
                trip_summary_stop_direction['scheduled_headway'].iloc[0] = None
                
                # Add
                df_output = pd.concat([df_output, trip_summary_stop_direction])

        return df_output

# Scratch

In [5]:
%%capture --no-display
# turn off warnings

######### Scratch
gtfs_version_id = '20230105'
route_id = '55'
service_date_string = '2023-01-09'
# start_timedelta_string ='02:30'
# end_timedelta_string = '26:30'

gtfs_feed = download_extract_format(gtfs_version_id)



INFO:root:Downloading CTA data
INFO:root:Download complete
INFO:root:Extracting data from CTA zipfile version 20230105
INFO:root:stops.txt loaded
INFO:root:stop_times.txt loaded
INFO:root:routes.txt loaded
INFO:root:trips.txt loaded
INFO:root:calendar.txt loaded
INFO:root:calendar_dates.txt loaded
INFO:root:shapes.txt loaded


In [17]:
def string_to_datetime(date_string):
        year = int(date_string[:4])
        month = int(date_string[5:7])
        day = int(date_string[8:])
        return pendulum.datetime(year, month, day)


In [18]:
############
# Revised:  Capture one service day of data
############


# Capture service date 
service_date = string_to_datetime(service_date_string)

# # Add seconds to the timedelta strings and convert to timedeltas
# start_timedelta = pd.to_timedelta(start_timedelta_string+':00')
# end_timedelta = pd.to_timedelta(end_timedelta_string+':00')

# start_time = service_date + start_timedelta
# end_time = service_date + end_timedelta

# print(start_time)
# print(end_time)


# Get trip summary for service date using chn-ghost-buses make_trip_summary() function
trip_summary = make_trip_summary(gtfs_feed, service_date, service_date)

# Turn arrival times into timestamps
trip_summary['stop_time'] =  trip_summary['raw_date'] + pd.to_timedelta(trip_summary['arrival_time'])

# filter to the specified route
trip_summary = trip_summary.loc[trip_summary['route_id'] == route_id]

# # Filter to the specified start/end times
# trip_summary = trip_summary.loc[trip_summary['arrival_time'] >= start_time]
# trip_summary = trip_summary.loc[trip_summary['arrival_time'] < end_time]

trip_summary

# scheduled_headways = get_scheduled_headways(
#     route_id, 
#     service_date_string, 
#     start_timedelta_string, 
#     end_timedelta_string
#     )


Unnamed: 0,route_id,service_id,trip_id,direction_id,block_id,shape_id,direction,wheelchair_accessible,schd_trip_id,raw_date,...,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled,arrival_hour,departure_hour,headway_secs,stop_time
975487,55,65301,6530000010020,0,653000003086,65301295,West,1,10020,2023-01-09 00:00:00+00:00,...,04:37:30,10603,1,St Louis,0,0,4,4,,2023-01-09 04:37:30+00:00
975524,55,65301,6530000057020,0,653000003137,65305425,West,1,57020,2023-01-09 00:00:00+00:00,...,16:10:00,10565,1,Midway Orange Line,0,0,16,16,,2023-01-09 16:10:00+00:00
975525,55,65301,6530000057020,0,653000003137,65305425,West,1,57020,2023-01-09 00:00:00+00:00,...,17:00:25,15366,63,Midway Orange Line,0,45089,17,17,32.0,2023-01-09 17:00:25+00:00
975598,55,65301,6530000586020,0,653000003122,65305425,West,1,586020,2023-01-09 00:00:00+00:00,...,23:42:30,10565,1,Midway Orange Line,0,0,23,23,,2023-01-09 23:42:30+00:00
975599,55,65301,6530000586020,0,653000003122,65305425,West,1,586020,2023-01-09 00:00:00+00:00,...,24:00:05,10597,29,Midway Orange Line,0,21664,0,0,30.0,2023-01-10 00:00:05+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996217,55,65315,6530040471020,0,653000001185,65305425,West,1,40471020,2023-01-09 00:00:00+00:00,...,01:00:18,10628,60,Midway Orange Line,0,43278,1,1,1871.0,2023-01-09 01:00:18+00:00
996232,55,65315,6530042106020,0,653000001186,65301293,West,1,42106020,2023-01-09 00:00:00+00:00,...,02:40:00,10565,1,St Louis,0,0,2,2,,2023-01-09 02:40:00+00:00
996233,55,65315,6530042106020,0,653000001186,65301293,West,1,42106020,2023-01-09 00:00:00+00:00,...,03:00:17,10608,40,St Louis,0,29169,3,3,1078.0,2023-01-09 03:00:17+00:00
996248,55,65315,6530042199010,0,653000003768,65305425,West,1,42199010,2023-01-09 00:00:00+00:00,...,00:02:30,10565,1,Midway Orange Line,0,0,0,0,,2023-01-09 00:02:30+00:00


In [8]:
# For a given route and stop, what service ID / block ID combos are found?


In [9]:

# # Change data type so it can be merged with df_stops
# trip_summary['stop_id'] = trip_summary['stop_id'].astype(int)

# Add stop data to the trip summary.  Inner join since the stops data is already 
# narrowed down to stops with public facing stop codes.
trip_summary = trip_summary.merge(df_stops, on='stop_id', how='inner')

trip_summary

Unnamed: 0,route_id,service_id,trip_id,direction_id,block_id,shape_id,direction,wheelchair_accessible,schd_trip_id,raw_date,...,departure_hour,stop_time,stop_code,stop_name,stop_desc,stop_lat,stop_lon,location_type,parent_station,wheelchair_boarding
0,55,65301,6530000010020,0,653000003086,65301295,West,1,10020,2023-01-09 00:00:00+00:00,...,4,2023-01-09 04:37:30+00:00,10603,Garfield & Ashland,"Garfield & Ashland, Westbound, Northwest Corner",41.79407388,-87.6648838,0,,1
1,55,65301,6530001696020,0,653000003152,65301295,West,1,1696020,2023-01-09 00:00:00+00:00,...,5,2023-01-09 05:11:30+00:00,10603,Garfield & Ashland,"Garfield & Ashland, Westbound, Northwest Corner",41.79407388,-87.6648838,0,,1
2,55,65301,6530012872020,0,653000003134,65301295,West,1,12872020,2023-01-09 00:00:00+00:00,...,7,2023-01-09 07:00:30+00:00,10603,Garfield & Ashland,"Garfield & Ashland, Westbound, Northwest Corner",41.79407388,-87.6648838,0,,1
3,55,65301,6530023086020,0,653000003140,65305425,West,1,23086020,2023-01-09 00:00:00+00:00,...,5,2023-01-09 05:00:37+00:00,10603,Garfield & Ashland,"Garfield & Ashland, Westbound, Northwest Corner",41.79407388,-87.6648838,0,,1
4,55,65301,6530023418020,0,653000002878,65301295,West,1,23418020,2023-01-09 00:00:00+00:00,...,6,2023-01-09 06:42:30+00:00,10603,Garfield & Ashland,"Garfield & Ashland, Westbound, Northwest Corner",41.79407388,-87.6648838,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426,55,65302,6530016074030,0,653000001118,65305426,West,1,16074030,2023-01-09 00:00:00+00:00,...,20,2023-01-09 20:00:14+00:00,10608,Garfield & Hoyne,"Garfield & Hoyne, Westbound, Northwest Corner",41.79395538,-87.67688782,0,,1
427,55,65315,6530042106020,0,653000001186,65301293,West,1,42106020,2023-01-09 00:00:00+00:00,...,3,2023-01-09 03:00:17+00:00,10608,Garfield & Hoyne,"Garfield & Hoyne, Westbound, Northwest Corner",41.79395538,-87.67688782,0,,1
428,55,65302,6530019731030,1,653000001117,65305424,East,1,19731030,2023-01-09 00:00:00+00:00,...,20,2023-01-09 20:00:01+00:00,17563,55th Street & Kenwood,"55th Street & Kenwood, Eastbound, Southeast Co...",41.79503478,-87.59301755,0,,1
429,55,65302,6530038836020,1,653000001284,65303751,East,1,38836020,2023-01-09 00:00:00+00:00,...,15,2023-01-09 15:10:00+00:00,10538,Garfield & Lowe,"Garfield & Lowe, Eastbound, Southeast Corner",41.7939632,-87.64105011,0,,1


In [34]:

# For a given stop, find the service ids that use it
stop_id = '10520'

service_ids = trip_summary.loc[trip_summary['stop_id'] == stop_id]['service_id'].unique().tolist()

service_ids


['65301']

In [35]:
# for a given servie id, find the earliest and latest bus.

# For each service, list all buses arriving at the stop in order.

for service_id in service_ids:

    trip_summary_test = trip_summary.loc[trip_summary['stop_id'] == stop_id].loc[trip_summary['service_id'] == service_id]

    service_start = min(trip_summary_test['stop_time'])
    service_end = max(trip_summary_test['stop_time'])

    print(f'Service id {service_id} starts at {service_start} and ends at {service_end}.')
    ##########
    ### Need to identify service day using the service data here, not by a time we feed into 
    ### the function.  Split day into service hours (as many as there are) and analyze the 
    ### headways for each separate servide on that day individually.'

    ### Summarize route - stop - service ID - times service is in effect - headways
    ###########
    


Service id 65301 starts at 2023-01-09 06:37:30+00:00 and ends at 2023-01-09 07:17:30+00:00.


In [36]:
trip_summary_test = trip_summary.loc[trip_summary['service_id'] == service_id]
service_start = min(trip_summary_test['stop_time'])
service_end = max(trip_summary_test['stop_time'])

print(f'Service id {service_id} starts at {service_start} and ends at {service_end}.')



Service id 65301 starts at 2023-01-09 03:10:00+00:00 and ends at 2023-01-10 00:03:30+00:00.


In [24]:



df_output = pd.DataFrame()

for stop_id, service_id in set(zip(trip_summary['stop_id'], trip_summary['service_id'])):

            # filter data
            trip_summary_stop_service = trip_summary.loc[
                    (trip_summary['stop_id'] == stop_id) 
                    & (trip_summary['service_id'] == service_id)
                    ]

            # Sort chronologically
            trip_summary_stop_service.sort_values(by='stop_time',ascending=True, inplace=True)

            # list stop times in chronological order
            stop_times = trip_summary_stop_service['stop_time'].tolist()

            # calculate previous stop times
            prev_stop_times = np.roll(stop_times,1)
            trip_summary_stop_service['previous_stop_time'] = prev_stop_times

            # calculate headway
            trip_summary_stop_service['scheduled_headway'] = (
                    trip_summary_stop_service['stop_time'] - trip_summary_stop_service['previous_stop_time']
                    )

            # drop previous stop time column, no longer needed
            trip_summary_stop_service = trip_summary_stop_service.drop('previous_stop_time', axis=1)

            # Remove headway from the first bus in the dataset since we don't have the 
            # previous bus to compare with
            trip_summary_stop_service['scheduled_headway'].iloc[0] = None
            
            # Add
            df_output = pd.concat([df_output, trip_summary_stop_service])



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_summary_stop_service.sort_values(by='stop_time',ascending=True, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_summary_stop_service['previous_stop_time'] = prev_stop_times
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip_summary_stop_service['scheduled_headway'] = (
A value is trying to be set on a copy of a slice from a DataFr

In [None]:
df_output

## Try it out: Get Scheduled Stop Times and Headways

In [26]:
%%capture --no-display 
# turn off warnings

# Test:  Geet headways for the 55-Garfield bus on 1/9/2023 using the default times (2:30am 1/9 through 2:30am 1/10)
scheduled_headways = get_scheduled_headways('55', '2023-01-09')

In [48]:
### TODO:

# Note that some buses have very long scheduled headways.  Example below - Route 55, stop 10495.
# 55th and Kilpatrick eastbound.  Does this make sense?  It 

scheduled_headways.loc[['stop_id'] == '10495'].sort_values(['stop_id', 'arrival_time']).head(50)

KeyError: 'False: boolean label can not be used without a boolean index'

In [42]:
# Other stops do seem to make sense.  Example: Stop 10603, Garfield and Ashland WB
# on route 55:

scheduled_headways.loc[scheduled_headways['stop_id'] == '10603'].sort_values(['stop_id', 'arrival_time'])


Unnamed: 0,route_id,service_id,trip_id,direction_id,block_id,shape_id,direction,wheelchair_accessible,schd_trip_id,raw_date,...,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled,arrival_hour,departure_hour,scheduled_headway
985477,55,65301,6530028875020,0,653000003137,65301295,West,1,28875020,2023-01-09 00:00:00+00:00,...,2023-01-09 03:28:00+00:00,03:28:00,10603,1,St Louis,0,0,3,3,NaT
987031,55,65301,6530033660020,0,653000003140,65301295,West,1,33660020,2023-01-09 00:00:00+00:00,...,2023-01-09 03:55:30+00:00,03:55:30,10603,1,St Louis,0,0,3,3,0 days 00:27:30
986920,55,65301,6530033568020,0,653000003141,65301295,West,1,33568020,2023-01-09 00:00:00+00:00,...,2023-01-09 04:18:00+00:00,04:18:00,10603,1,St Louis,0,0,4,4,0 days 00:22:30
975487,55,65301,6530000010020,0,653000003086,65301295,West,1,10020,2023-01-09 00:00:00+00:00,...,2023-01-09 04:37:30+00:00,04:37:30,10603,1,St Louis,0,0,4,4,0 days 00:19:30
983665,55,65301,6530023086020,0,653000003140,65305425,West,1,23086020,2023-01-09 00:00:00+00:00,...,2023-01-09 05:00:37+00:00,05:00:37,10603,35,Midway Orange Line,0,25705,5,5,0 days 00:23:07
976116,55,65301,6530001696020,0,653000003152,65301295,West,1,1696020,2023-01-09 00:00:00+00:00,...,2023-01-09 05:11:30+00:00,05:11:30,10603,1,St Louis,0,0,5,5,0 days 00:10:53
985218,55,65301,6530027998020,0,653000002970,65301295,West,1,27998020,2023-01-09 00:00:00+00:00,...,2023-01-09 05:35:30+00:00,05:35:30,10603,1,St Louis,0,0,5,5,0 days 00:24:00
986476,55,65301,6530032163020,0,653000002965,65301295,West,1,32163020,2023-01-09 00:00:00+00:00,...,2023-01-09 05:57:00+00:00,05:57:00,10603,1,St Louis,0,0,5,5,0 days 00:21:30
988215,55,65301,6530038932020,0,653000003090,65301295,West,1,38932020,2023-01-09 00:00:00+00:00,...,2023-01-09 06:23:00+00:00,06:23:00,10603,1,St Louis,0,0,6,6,0 days 00:26:00
983849,55,65301,6530023418020,0,653000002878,65301295,West,1,23418020,2023-01-09 00:00:00+00:00,...,2023-01-09 06:42:30+00:00,06:42:30,10603,1,St Louis,0,0,6,6,0 days 00:19:30


In [39]:
# Get scheduled headway stats
scheduled_headway_stats = get_headway_stats(scheduled_headways, 'scheduled_headway')

scheduled_headway_stats 

{'mean': Timedelta('0 days 01:33:54.296969696'),
 'max': Timedelta('0 days 16:59:57'),
 'min': Timedelta('0 days 00:01:00'),
 '25th_percentile': Timedelta('0 days 00:12:00'),
 'median': Timedelta('0 days 00:18:00'),
 '75th_percentile': Timedelta('0 days 01:00:00')}

# Get acutal and scheduled headway stats for every stop on a route


In [90]:
# Latest GTFS feed date available from transitfeeds.com
gtfs_version_id = '20230105'

def get_stats_for_stops(
    route_id:str, 
    service_date_string:str, 
    start_timedelta_string:str='02:30', 
    end_timedelta_string:str='26:30') -> pd.DataFrame:

    def get_headway_stats_by_stop(headway_dataframe:pd.DataFrame, stop_id_column:str, headway_column:str) -> pd.DataFrame:

        df_output = pd.DataFrame()

        # df_output['stop_ids'] = scheduled_headways['stop_id'].groupby(by='stop_id')
        df_output[f'{headway_column}_mean'] = headway_dataframe.groupby(by=stop_id_column)[headway_column].mean()
        df_output[f'{headway_column}_min'] = headway_dataframe.groupby(by=stop_id_column)[headway_column].min()
        df_output[f'{headway_column}_25th_percentile'] = headway_dataframe.groupby(by=stop_id_column)[headway_column].quantile(0.25)
        df_output[f'{headway_column}_median'] = headway_dataframe.groupby(by=stop_id_column)[headway_column].median()
        df_output[f'{headway_column}_75th_percentile'] = headway_dataframe.groupby(by=stop_id_column)[headway_column].quantile(.75)
        df_output[f'{headway_column}_max'] = headway_dataframe.groupby(by=stop_id_column)[headway_column].max()

        # move stop id from index to a regular column named 'stop_id'
        df_output.reset_index(drop=True)
        df_output['stop_id'] = df_output.index
        return df_output

    # Scheduled data from gtfs
    gtfs_feed = download_extract_format(gtfs_version_id)

    scheduled_headways = get_scheduled_headways(
        route_id, 
        service_date_string, 
        start_timedelta_string, 
        end_timedelta_string
        )

    df_scheduled_headway_stops = get_headway_stats_by_stop(scheduled_headways,'stop_id', 'scheduled_headway')

    return df_scheduled_headway_stops

    # # Actual data from cta API
    # vehicles = get_chn_vehicles(service_date_string,start_timedelta_string, end_timedelta_string)

    # actual_headways = get_headways(route_id, vehicles)

    # df_actual_headway_stops = get_headway_stats_by_stop(actual_headways, 'stpid', 'est_headway')

    # # Merge scheduled and actual stats
    # output = df_scheduled_headway_stops.merge(df_actual_headway_stops, on='stop_id', how='outer')

    # return output



In [91]:
%%capture --no-display

get_stats_for_stops('55','2023-01-09')

INFO:root:Downloading CTA data
INFO:root:Download complete
INFO:root:Extracting data from CTA zipfile version 20230105
INFO:root:stops.txt loaded
INFO:root:stop_times.txt loaded
INFO:root:routes.txt loaded
INFO:root:trips.txt loaded
INFO:root:calendar.txt loaded
INFO:root:calendar_dates.txt loaded
INFO:root:shapes.txt loaded


Unnamed: 0_level_0,scheduled_headway_mean,scheduled_headway_min,scheduled_headway_25th_percentile,scheduled_headway_median,scheduled_headway_75th_percentile,scheduled_headway_max,stop_id
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10495,0 days 02:00:09.833333333,0 days 00:57:00,0 days 00:58:11.250000,0 days 01:02:15,0 days 01:02:26.250000,0 days 04:58:45,10495
10501,NaT,NaT,NaT,NaT,NaT,NaT,10501
10502,NaT,NaT,NaT,NaT,NaT,NaT,10502
10505,NaT,NaT,NaT,NaT,NaT,NaT,10505
10507,0 days 00:52:14.444444444,0 days 00:01:00,0 days 00:18:00,0 days 00:21:31,0 days 00:27:17,0 days 07:58:04,10507
...,...,...,...,...,...,...,...
17563,NaT,NaT,NaT,NaT,NaT,NaT,17563
18018,NaT,NaT,NaT,NaT,NaT,NaT,18018
18032,0 days 05:29:54.500000,0 days 05:00:39,-80064 days +01:24:42.108917760,0 days 05:00:39,-26688 days +03:48:40.036306432,0 days 05:59:10,18032
18448,NaT,NaT,NaT,NaT,NaT,NaT,18448


In [63]:
%%capture --no-display
# turn off warnings

# Scratch
gtfs_version_id = '20230105'
route_id = '55'
service_date_string = '2023-01-09'
start_timedelta_string ='02:30'
end_timedelta_string = '26:30'

gtfs_feed = download_extract_format(gtfs_version_id)



# scheduled_headways = get_scheduled_headways(
#     route_id, 
#     service_date_string, 
#     start_timedelta_string, 
#     end_timedelta_string
#     )


INFO:root:Downloading CTA data
INFO:root:Download complete
INFO:root:Extracting data from CTA zipfile version 20230105
INFO:root:stops.txt loaded
INFO:root:stop_times.txt loaded
INFO:root:routes.txt loaded
INFO:root:trips.txt loaded
INFO:root:calendar.txt loaded
INFO:root:calendar_dates.txt loaded
INFO:root:shapes.txt loaded


In [62]:

df_output = pd.DataFrame()

# df_output['stop_ids'] = scheduled_headways['stop_id'].groupby(by='stop_id')
df_output['sched_mean'] = scheduled_headways.groupby(by='stop_id')['scheduled_headway'].mean()
df_output['sched_min'] = scheduled_headways.groupby(by='stop_id')['scheduled_headway'].min()
df_output['sched_25th_percentile'] = scheduled_headways.groupby(by='stop_id')['scheduled_headway'].quantile(0.25)
df_output['sched_median'] = scheduled_headways.groupby(by='stop_id')['scheduled_headway'].median()
df_output['sched_75th_percentile'] = scheduled_headways.groupby(by='stop_id')['scheduled_headway'].quantile(.75)
df_output['sched_max'] = scheduled_headways.groupby(by='stop_id')['scheduled_headway'].max()

df_output




    

Unnamed: 0_level_0,sched_mean,sched_min,sched_25th_percentile,sched_median,sched_75th_percentile,sched_max
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10495,0 days 02:00:09.833333333,0 days 00:57:00,0 days 00:58:11.250000,0 days 01:02:15,0 days 01:02:26.250000,0 days 04:58:45
10501,NaT,NaT,NaT,NaT,NaT,NaT
10502,NaT,NaT,NaT,NaT,NaT,NaT
10505,NaT,NaT,NaT,NaT,NaT,NaT
10507,0 days 00:52:14.444444444,0 days 00:01:00,0 days 00:18:00,0 days 00:21:31,0 days 00:27:17,0 days 07:58:04
...,...,...,...,...,...,...
17563,NaT,NaT,NaT,NaT,NaT,NaT
18018,NaT,NaT,NaT,NaT,NaT,NaT
18032,0 days 05:29:54.500000,0 days 05:00:39,-80064 days +01:24:42.108917760,0 days 05:00:39,-26688 days +03:48:40.036306432,0 days 05:59:10
18448,NaT,NaT,NaT,NaT,NaT,NaT


In [54]:
scheduled_headways.groupby(by='stop_id')['scheduled_headway'].mean()


quantile(0.25)

stop_id
10495   0 days 02:00:09.833333333
10501                         NaT
10502                         NaT
10505                         NaT
10507   0 days 00:52:14.444444444
                   ...           
17563                         NaT
18018                         NaT
18032      0 days 05:29:54.500000
18448                         NaT
6524              0 days 07:59:52
Name: scheduled_headway, Length: 98, dtype: timedelta64[ns]

In [14]:
### ChatGPT code:

import pandas as pd
import requests

# URL for the GTFS data
url = "http://www.transitchicago.com/downloads/sch_data/google_transit.zip"

# Download the data
response = requests.get(url)
open("google_transit.zip", "wb").write(response.content)

# Load the data into pandas dataframes
dataframes = {
    "routes": pd.read_csv("google_transit/routes.txt"),
    "trips": pd.read_csv("google_transit/trips.txt"),
    "stop_times": pd.read_csv("google_transit/stop_times.txt"),
    "stops": pd.read_csv("google_transit/stops.txt"),
    "calendar": pd.read_csv("google_transit/calendar.txt"),
}

# Print the first 5 rows of the "routes" dataframe
print(dataframes["routes"].head())


FileNotFoundError: [Errno 2] No such file or directory: 'google_transit/routes.txt'

In [16]:
# ChatGPT code
 
import pandas as pd

# Load the GTFS data into a pandas dataframe
stop_times = gtfs_feed.stop_times

# My edit:  convert to timedelta so the code can run
stop_times['arrival_time']=stop_times['arrival_time'].apply(pd.to_timedelta)

# Calculate the time difference between consecutive scheduled arrival times for each trip
stop_times["headway_secs"] = stop_times.sort_values(["trip_id", "stop_sequence"]).groupby("trip_id")["arrival_time"].diff().dt.total_seconds()

# Calculate the average headway for each bus stop
headways = stop_times.groupby("stop_id")["headway_secs"].mean()

# Print the headways in minutes
print(headways / 60)


stop_id
1          0.699632
10         0.379508
100        0.580469
1000       0.397299
10000     -0.097356
            ...    
9996       0.450604
9998       0.356522
9999       0.085172
999990    13.500000
999991     8.500000
Name: headway_secs, Length: 11062, dtype: float64
