### Imports & Options

In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
from os import listdir
from pathlib import Path

In [3]:
import pandas as pd

### Data

In [7]:
# Directory holding the unpacked CTA GTFS feed (relative to the notebook).
filepath = Path(
    "data/us-illinois-chicago-transit-authority-cta-gtfs-389/"
)
# Show which GTFS tables the feed ships with.
listdir(filepath)

['transfers.txt',
 'agency.txt',
 'calendar_dates.txt',
 'stop_times.txt',
 'frequencies.txt',
 'shapes.txt',
 'trips.txt',
 'stops.txt',
 'calendar.txt',
 'routes.txt']

### Process
For reference, this [helpful diagram](https://xang1234.github.io/images/isochrone/gtfs.png) explains how the tables in GTFS data are linked together.

For both the train and the bus, I need to know the average travel time between stops for each route that stops at those stops. For example, if riding the bus on Ashland Ave, I need to know the average travel times between stops, both if taking the local 9 bus or the express X9 bus.

The table `stop_times.txt` contains all the times a bus or train stops at a stop, as well as a `stop_sequence` number that records the order of stops along a trip. I believe that a trip, denoted by `trip_id`, is a single journey made by one bus or train. For example, the 9 bus has 1838 unique trips.

From the trip that each bus takes, or is scheduled to take, I can determine the average travel time between stops. I could also determine the estimated travel time between stops depending on the time of day, but for this MVP let's start with just the average time. I say estimated travel time because this is not real-time bus data; it is the published schedules. Anyone who has ridden the bus in Chicago knows that the schedule is a joke. But while the bus may not actually arrive when it's supposed to, I think this data is a reliable estimate of how long it takes to get from stop to stop once the bus arrives. I know the CTA has taken great care to try and publish schedules that match congestion patterns, and so I think it's safe to trust their measurements of bus speeds. Regardless, this project is to test the effect of bus frequency on travel times, not bus speeds. That will be for a later project.

And so, let's use the `stop_times.txt` dataset to group stops by trip and determine average travel times.

In [15]:
# Needed tables
stop_times = pd.read_csv(filepath / Path("stop_times.txt"))
trips = pd.read_csv(filepath / Path("trips.txt"))
# routes = pd.read_csv(filepath / Path("routes.txt"))

  stop_times = pd.read_csv(filepath / Path("stop_times.txt"))
  trips = pd.read_csv(filepath / Path("trips.txt"))


In [16]:
# Preview the stop_times table loaded above.
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled
0,69227595419,06:31:00,06:31:00,30175,1,Skokie,0,0
1,69227595419,06:37:30,06:37:30,30297,2,Skokie,0,21220
2,69227595419,06:39:00,06:39:00,30026,3,Skokie,0,26031
3,69227595419,06:45:00,06:45:00,30027,4,Howard,0,26031
4,69227595419,06:47:00,06:47:00,30298,5,Howard,0,30842
...,...,...,...,...,...,...,...,...
5418664,RLS-901-0,00:18:00,00:18:00,5416,40,Wilson Red Line,1,12261
5418665,RLS-901-1,00:00:00,00:00:00,18580,10,Belmont Station,0,0
5418666,RLS-901-1,00:05:00,00:05:00,1056,20,Belmont Station,0,4458
5418667,RLS-901-1,00:11:00,00:11:00,12554,30,Belmont Station,0,9392


In [28]:
# How many unique `trips` are there for the Ashland local route?
ashland_trips = trips[trips["route_id"] == "9"]["trip_id"].value_counts()
ashland_trips.shape[0]

1838

In [35]:
# Keep only the scheduled stop events whose trip is one of the Ashland trips.
on_ashland_trip = stop_times["trip_id"].isin(ashland_trips.index)
ashland_stop_times = stop_times.loc[on_ashland_trip]
ashland_stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled
5379927,6490048291010,20:00:00,20:00:00,6131,1,Irving Park,0,0
5379928,6490048291010,20:00:15,20:00:15,6147,2,Irving Park,0,628
5379929,6490048291010,20:00:25,20:00:25,6148,3,Irving Park,0,1021
5379930,6490048291010,20:00:41,20:00:41,6149,4,Irving Park,0,1684
5379931,6490048291010,20:01:41,20:01:41,14781,5,Irving Park,0,3191
...,...,...,...,...,...,...,...,...
5417863,6500048639020,01:36:37,01:36:37,6162,12,74th,0,10360
5417864,6500048639020,01:36:58,01:36:58,6163,13,74th,0,11010
5417865,6500048639020,01:37:54,01:37:54,6165,14,74th,0,12366
5417866,6500048639020,01:38:09,01:38:09,6166,15,74th,0,13031


In [44]:
# Sanity check: how many distinct Ashland trips actually show up in the stop
# times? Far fewer than the number of trips listed for the route. Why?
# NOTE(review): possibly `trip_id` was parsed with mixed int/str types, so
# `isin` only matched ids whose type happened to agree in both tables -- TODO
# confirm by checking the dtypes of the two `trip_id` columns.
ashland_stop_times["trip_id"].nunique()

14

In [50]:
# I guess let's start with the trip that visits the most stops
# (value_counts sorts descending, so the first index entry is that trip).
stops_per_trip = ashland_stop_times["trip_id"].value_counts()
test_trip = stops_per_trip.index[0]
single_trip = ashland_stop_times.loc[ashland_stop_times["trip_id"] == test_trip]
single_trip

# This is a southbound local Ashland bus

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled
5403900,6500048509050,14:15:30,14:15:30,1931,1,104th/Vincennes,0,0
5403901,6500048509050,14:17:27,14:17:27,15275,2,104th/Vincennes,0,1594
5403902,6500048509050,14:18:19,14:18:19,5681,3,104th/Vincennes,0,2340
5403903,6500048509050,14:19:19,14:19:19,5995,4,104th/Vincennes,0,3000
5403904,6500048509050,14:21:28,14:21:28,5997,5,104th/Vincennes,0,4187
...,...,...,...,...,...,...,...,...
5404814,6500048509050,16:07:58,16:07:58,18079,85,104th/Vincennes,0,92370
5404815,6500048509050,16:09:18,16:09:18,6136,86,104th/Vincennes,0,93257
5404816,6500048509050,16:09:45,16:09:45,6137,87,104th/Vincennes,0,93765
5404817,6500048509050,16:10:44,16:10:44,6138,88,104th/Vincennes,0,95023


In [53]:
# How many distinct stops does this trip serve?
single_trip["stop_id"].nunique()

89

In [55]:
# As expected, it stops at each stop only once: every stop_id occurs exactly
# one time within the trip.
visits_per_stop = single_trip["stop_id"].value_counts()
all(n_visits == 1 for n_visits in visits_per_stop)

True