# Calculating Headways for CTA Buses

## Strategy

In [1]:
# 1. Start from the vehicle positions for an entire route (each with its pattern and its distance
#    along that pattern) and match them up to the pattern data (including distance along the pattern
#    and route) to get the stops each vehicle passed in each period.

# 2. For a given stop, list all vehicles with the times they passed the stop. (Possibly use the stop's
#    distance minus 100 ft or so, to account for buses not stopping EXACTLY at the stop point?)

# 3. For the given stop, sort the vehicles by time and calculate headways.

In [2]:
import os

import geopandas as gpd
import pandas as pd
import requests
from dotenv import load_dotenv
from shapely import Point, LineString

In [3]:
# Load the variables defined in the local .env file into the environment,
# then read the CTA bus tracker key stored under the name API_KEY.
load_dotenv()
API_KEY = os.environ.get('API_KEY')

### Get vehicle data scraped by chn-ghost-buses for a single day

In [4]:
def get_vehicles(datestring):
    """Return the chn-ghost-buses scraped data for all CTA buses on one day.

    Parameters
    ----------
    datestring : str
        Date to load, in YYYY-MM-DD format (e.g. '2023-01-08'). It is used
        verbatim as the name of the CSV file to download.

    Returns
    -------
    pandas.DataFrame
        One row per scraped vehicle position, with the column dtypes listed
        below.
    """

    chn_data_source = f'https://chn-ghost-buses-public.s3.us-east-2.amazonaws.com/bus_full_day_data_v2/{datestring}.csv'

    # Explicit dtypes so identifiers that look numeric (rt, tatripid, ...) stay
    # strings. The keys must match the CSV header exactly: the trip number
    # column is spelled 'origtatripno' in the data.
    column_dtypes = {
        'vid': 'int',
        'tmstmp': 'str',
        'lat': 'float',
        'lon': 'float',
        'hdg': 'int',
        'pid': 'int',
        'rt': 'str',
        'pdist': 'int',
        'des': 'str',
        'dly': 'bool',
        'tatripid': 'str',
        'origtatripno': 'int',
        'tablockid': 'str',
        'zone': 'str',
        'scrape_file': 'str',
        'data_hour': 'int',
        'data_date': 'str',
    }

    vehicles = pd.read_csv(chn_data_source, dtype=column_dtypes)

    return vehicles


# view test data: the scraped vehicle data for 2023-01-08
get_vehicles('2023-01-08')


Unnamed: 0,vid,tmstmp,lat,lon,hdg,pid,rt,des,pdist,dly,tatripid,origtatripno,tablockid,zone,scrape_file,data_time,data_hour,data_date
0,7993,20230108 00:02,41.894983,-87.624184,358,18414,3,Michigan/Chicago,68294,False,153717,235359439,3 -715,,bus_data/2023-01-08/00:02:56.json,2023-01-08 00:02:00,0,2023-01-08
1,1254,20230108 00:02,41.868729,-87.624199,179,18415,3,95th/RED LINE,13129,False,1051752,235359703,3 -720,,bus_data/2023-01-08/00:02:56.json,2023-01-08 00:02:00,0,2023-01-08
2,1321,20230108 00:02,41.827670,-87.617134,178,18415,3,95th/RED LINE,29661,False,1051751,235359447,3 -719,,bus_data/2023-01-08/00:02:56.json,2023-01-08 00:02:00,0,2023-01-08
3,7998,20230108 00:02,41.734243,-87.614495,179,18415,3,95th/RED LINE,63803,False,1051749,235357170,N4 -793,,bus_data/2023-01-08/00:02:56.json,2023-01-08 00:02:00,0,2023-01-08
4,1276,20230108 00:02,41.873702,-87.624313,159,19380,4,95th/RED LINE,7616,False,1088426,235359595,4 -710,,bus_data/2023-01-08/00:02:56.json,2023-01-08 00:02:00,0,2023-01-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100967,4358,20230108 23:57,41.999790,-87.670921,256,8102,151,Union Station,0,False,10001233,238776129,151 -510,,bus_data/2023-01-08/23:57:56.json,2023-01-08 23:57:00,23,2023-01-08
100968,1881,20230108 23:57,41.927083,-87.638303,137,8102,151,Union Station,35241,False,10001247,238776114,151 -509,,bus_data/2023-01-08/23:57:56.json,2023-01-08 23:57:00,23,2023-01-08
100969,1847,20230108 23:57,41.883390,-87.624527,172,8102,151,Union Station,53853,False,10001246,238776100,151 -508,,bus_data/2023-01-08/23:57:56.json,2023-01-08 23:57:00,23,2023-01-08
100970,1773,20230108 23:57,42.007931,-87.661964,95,904,155,Kedzie,1010,False,150,238775078,155 -502,,bus_data/2023-01-08/23:57:56.json,2023-01-08 23:57:00,23,2023-01-08


In [5]:
# get patterns for a specified route from the CTA's API

def get_patterns(route):
    '''Get patterns data from the CTA's bus tracker API for a specified route.

    Parameters
    ----------
    route : str or int
        Route designator, sent to the API as the ``rt`` parameter
        (e.g. 20 or '20').

    Returns
    -------
    pandas.DataFrame
        One row per pattern. The ``pt`` column holds, for each pattern, a
        dataframe of that pattern's points.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    ValueError
        If the response contains no patterns (for example because the API
        reported an error for the key or the route).
    '''

    # get data from CTA's feed; passing params lets requests URL-encode the
    # values, and the timeout keeps a stalled connection from hanging the kernel
    api_url = 'http://www.ctabustracker.com/bustime/api/v2/getpatterns'
    response = requests.get(
        api_url,
        params={'key': API_KEY, 'rt': route, 'format': 'json'},
        timeout=30,
    )
    response.raise_for_status()
    bustime_response = response.json().get('bustime-response', {})

    # fail with the API's own message instead of an opaque KeyError on 'ptr'
    if 'ptr' not in bustime_response:
        details = bustime_response.get('error', bustime_response)
        raise ValueError(
            f'CTA API returned no patterns for route {route!r}: {details}'
        )

    # convert json to dataframe
    df_patterns = pd.DataFrame(bustime_response['ptr'])

    # convert pt column values to dataframes for each pattern containing that pattern's points
    df_patterns['pt'] = df_patterns['pt'].apply(lambda x: pd.DataFrame(x))

    return df_patterns

# view test data: patterns for route 20
get_patterns(20)

Unnamed: 0,pid,ln,rtdir,pt
0,949,13656.0,Westbound,seq lat lon typ stpid ...
1,947,42604.0,Westbound,seq lat lon typ stpid ...
2,954,41419.0,Westbound,seq lat lon typ pdist st...
3,959,41475.0,Eastbound,seq lat lon typ stpid ...
4,956,13514.0,Eastbound,seq lat lon typ stpid ...
5,957,42743.0,Eastbound,seq lat lon typ stpid ...


In [6]:

def get_pattern_linestrings(route):
    '''Fetch the patterns for a route from the CTA's bus tracker API and
    return them as a geodataframe with one linestring geometry per pattern.'''

    patterns = get_patterns(route)

    def _as_linestring(points):
        # join the pattern's points, in seq order, as (lon, lat) pairs
        ordered = points.sort_values('seq')
        return LineString(list(zip(ordered['lon'], ordered['lat'])))

    lines = [_as_linestring(points) for points in patterns['pt']]

    # Attach the linestrings as the geometry, tagged as EPSG:4326
    gdf = gpd.GeoDataFrame(patterns, geometry=lines).set_crs(epsg=4326)

    # The raw points column is not part of the result
    return gdf.drop(columns=['pt'])


In [7]:
# same request as before, with the route passed as a string
get_patterns('20')


Unnamed: 0,pid,ln,rtdir,pt
0,949,13656.0,Westbound,seq lat lon typ stpid ...
1,947,42604.0,Westbound,seq lat lon typ stpid ...
2,954,41419.0,Westbound,seq lat lon typ pdist st...
3,959,41475.0,Eastbound,seq lat lon typ stpid ...
4,956,13514.0,Eastbound,seq lat lon typ stpid ...
5,957,42743.0,Eastbound,seq lat lon typ stpid ...


In [8]:
def get_pattern_stops(route):
    '''Get the stop points of every pattern on a route from the CTA's bus tracker API.

    Parameters
    ----------
    route : str or int
        Route designator, passed through to get_patterns.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per stop point (pattern points with typ 'S') of each pattern,
        in seq order within a pattern, with point geometry in EPSG:4326 and a
        ``pid`` column identifying the pattern the stop belongs to. Row labels
        are kept from each pattern's points table, so they repeat across
        patterns. An empty geodataframe is returned when the route has no
        patterns.
    '''

    df_patterns = get_patterns(route)

    # Collect one geodataframe per pattern and concatenate once at the end,
    # rather than growing a frame inside the loop.
    stops_by_pattern = []
    for pid, pattern_points in zip(df_patterns['pid'], df_patterns['pt']):
        ordered_points = pattern_points.sort_values('seq')
        # filter to only keep stop points; assign() returns a fresh copy, so the
        # geodataframe below is not built on a slice of the points table
        stops = ordered_points[ordered_points['typ'] == 'S'].assign(pid=pid)
        stops_by_pattern.append(
            gpd.GeoDataFrame(
                stops,
                geometry=gpd.points_from_xy(stops['lon'], stops['lat']),
                crs='EPSG:4326',
            )
        )

    # pd.concat raises on an empty list
    if not stops_by_pattern:
        return gpd.GeoDataFrame()

    return pd.concat(stops_by_pattern)


In [9]:
# view test data: draw the route 20 pattern linestrings in blue on an interactive map (m),
# then add the route's stop points in red to that same map
m = get_pattern_linestrings(20).explore(color='blue', tiles='CartoDB positron')
get_pattern_stops(20).explore(m=m, color='red')
