In [1]:
import csv
import os
import pickle
import shutil
import sys
import zipfile

import fiona
import geopandas
import numpy as np
import pandas as pd
import psycopg2
import requests
from sqlalchemy import URL, create_engine, text
from trino.auth import OAuth2Authentication

In [2]:
# Make the multiHook library importable: walk four directory levels up from the
# working directory and register each ancestor on sys.path (skipping duplicates).
current_dir = os.getcwd()
for x in range(4):
    current_dir = parent_dir = os.path.dirname(current_dir)
    if parent_dir in sys.path:
        continue
    sys.path.append(parent_dir)

import multihook.pycnxn.dbhook as dbhook

In [10]:
def _fetch_bundle_table(conn, sql, bundle):
    """Run one bundle-filtered query on ``conn`` and return its rows as a DataFrame.

    ``sql`` must refer to the bundle through the ``:bundle`` bind parameter.
    """
    result = conn.execute(text(sql), {"bundle": bundle})
    # Name the columns explicitly so that an empty result still has its columns
    return pd.DataFrame(result.fetchall(), columns=list(result.keys()))


def import_bundle(
    bundle,
    import_trips=True,
    import_shapes=True,
    import_stops=True,
    import_stoptimes=True,
    import_calendar=True,
    import_calendar_dates=True,
    import_routes=True,
    cache_bundle=False,
):
    """Load the GTFS tables of one bundle from the ``mtadatalake`` Trino catalog.

    Tables queried (all filtered on ``bundle``):
    - core.fact_gtfs_trips (joined to core.fact_gtfs_routes for ``agency_id``)
    - core.fact_gtfs_shapes
    - core.fact_gtfs_stops (left-joined to core.fact_gtfs_shape_reference)
    - core.fact_gtfs_stop_times
    - core.fact_gtfs_calendar
    - core.fact_gtfs_calendar_dates
    - core.fact_gtfs_routes

    Parameters
    ----------
    bundle : str
        Bundle name; also used as the stem of the cache file ``<bundle>.pickle``.
    import_trips, import_shapes, import_stops, import_stoptimes,
    import_calendar, import_calendar_dates, import_routes : bool
        Select which tables are queried. Each selected table is stored under the
        key ``trips``, ``shapes``, ``stops``, ``stoptimes``, ``calendar``,
        ``calendar_dates`` or ``routes`` respectively.
    cache_bundle : bool
        When true, the result is pickled to ``<bundle>.pickle`` after loading.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One DataFrame per selected table. If ``<bundle>.pickle`` already exists
        its content is returned as-is and the ``import_*`` flags are ignored.
    """
    # The bundle can take some time to load, so prefer an existing cache file.
    picklefile = bundle + ".pickle"
    if os.path.isfile(picklefile):
        # NOTE(review): pickle.load can execute arbitrary code; only keep cache
        # files that were written by this notebook.
        with open(picklefile, "rb") as handle:
            return pickle.load(handle)

    trip_sql = """
    with agency as ( -- get agency id for each route
        select distinct route_id, agency_id from mtadatalake.core.fact_gtfs_routes
        where bundle = :bundle
        )
    SELECT trips.route_id ,trip_id, service_id,trip_headsign,direction_id,block_id,shape_id,boarding_type,bundle, agency.agency_id
    FROM mtadatalake.core.fact_gtfs_trips trips
    join agency on trips.route_id = agency.route_id
    where trips.bundle = :bundle
    """

    shape_sql = """
    SELECT shape_id, shape_pt_sequence, shape_pt_lat, shape_pt_lon, bundle
    FROM mtadatalake.core.fact_gtfs_shapes
    where bundle = :bundle
    """

    stop_sql = """
    SELECT fact_gtfs_stops.stop_id, stop_name, stop_lat, stop_lon, shape_ref.revenue_stop, bundle
    FROM mtadatalake.core.fact_gtfs_stops
    left join (
        SELECT stop_id, MAX(revenue_stop) AS revenue_stop
        FROM mtadatalake.core.fact_gtfs_shape_reference
        where bundle = :bundle
        GROUP BY stop_id
    ) shape_ref
    on fact_gtfs_stops.stop_id = shape_ref.stop_id
    where bundle = :bundle
    """

    stoptime_sql = """
    SELECT trip_id, stop_id, arrival_time, departure_time, timepoint, stop_sequence, pickup_type, drop_off_type, bundle
    FROM mtadatalake.core.fact_gtfs_stop_times
    where bundle = :bundle
    """

    calendar_sql = """
    SELECT service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date, bundle, modified_time, loaded_time
    FROM mtadatalake.core.fact_gtfs_calendar
    where bundle = :bundle
    """

    calendar_dates_sql = """
    SELECT service_id, "date", exception_type
    FROM mtadatalake.core.fact_gtfs_calendar_dates
    where bundle = :bundle
    """

    routes_sql = """
    SELECT route_id, route_short_name, route_long_name
    FROM mtadatalake.core.fact_gtfs_routes
    where bundle = :bundle
    """

    # (selected?, output key, progress message, query) in load order
    requested = [
        (import_trips, "trips", "Loading trips", trip_sql),
        (import_shapes, "shapes", "Loading shapes", shape_sql),
        (import_stops, "stops", "Loading stops", stop_sql),
        (import_stoptimes, "stoptimes", "Loading stop times", stoptime_sql),
        (import_calendar, "calendar", "Loading calendar", calendar_sql),
        (import_calendar_dates, "calendar_dates", "Loading calendar_dates", calendar_dates_sql),
        (import_routes, "routes", "Loading routes", routes_sql),
    ]

    output = {}
    engine = create_engine(
        r"trino://trino-route-trino.apps.mtasiprod.eastus.aroapp.io:443/mtadatalake",
        connect_args={
            "auth": OAuth2Authentication(),
            "http_scheme": "https",
        },
    )
    try:
        # The context manager closes the connection even if a query fails
        with engine.connect() as conn:
            for wanted, key, message, sql in requested:
                if not wanted:
                    continue
                print(message)
                output[key] = _fetch_bundle_table(conn, sql, bundle)
    finally:
        engine.dispose()

    if cache_bundle:
        with open(picklefile, "wb") as handle:
            pickle.dump(output, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"GTFS data for {bundle} successfully loaded and cached")
    else:
        print(f"GTFS data for {bundle} successfully loaded")

    return output

### Generate Summary for each Route of Specified Bundle

In [78]:
# Load all GTFS tables of the 2025June bundle (every import_* flag defaults to True);
# import_bundle prints one progress line per table.
gtfs = import_bundle('2025June_Prod_r04_b01_PREDATE_SHUTTLES_SCHEDULED')

Loading trips
Loading shapes
Loading stops
Loading stop times
Loading calendar
Loading calendar_dates
Loading routes
GTFS data for 2025June_Prod_r04_b01_PREDATE_SHUTTLES_SCHEDULED successfully loaded


In [79]:
# Bind each loaded GTFS table to its own variable
(trips, stoptimes, stops, shapes,
 calendar, calendar_dates, routes) = (
    gtfs[table]
    for table in ('trips', 'stoptimes', 'stops', 'shapes', 'calendar', 'calendar_dates', 'routes')
)

In [80]:
def label_period(mins):
    """Return the service-period label for a time given in minutes after midnight.

    Periods (start inclusive, end exclusive):
    Overnight 12-4 AM, Early Morning 4-6 AM, AM 6-9 AM, Midday 9 AM-3 PM,
    PM 3-7 PM, Evening 7-9 PM. Any other value (1260 and above, negative
    numbers, NaN) is labelled 'Late Evening'.
    """
    # (exclusive upper bound in minutes, label), in ascending order
    period_ends = (
        (240, 'Overnight'),
        (360, 'Early Morning'),
        (540, 'AM'),
        (900, 'Midday'),
        (1140, 'PM'),
        (1260, 'Evening'),
    )
    if mins >= 0:
        for upper, name in period_ends:
            if mins < upper:
                return name
    return 'Late Evening'


In [85]:
# Set target date (YYYYMMDD); the calendar weekday column is derived from it so
# the two can never disagree.
target_date = 20250702
target_day = pd.to_datetime(str(target_date), format='%Y%m%d').day_name().lower()  # 'wednesday'

# Length in hours of each service period, in display order.
# Must stay in sync with the boundaries used by label_period().
period_hours = {
    'Overnight': 4,       # 12:00 AM – 4:00 AM
    'Early Morning': 2,   # 4:00 AM – 6:00 AM
    'AM': 3,              # 6:00 AM – 9:00 AM
    'Midday': 6,          # 9:00 AM – 3:00 PM
    'PM': 4,              # 3:00 PM – 7:00 PM
    'Evening': 2,         # 7:00 PM – 9:00 PM
    'Late Evening': 3,    # 9:00 PM – 12:00 AM
}

# Get active service_ids on that date
# NOTE(review): assumes start_date / end_date / date are stored as YYYYMMDD integers — confirm
base_services = calendar.loc[
    (calendar[target_day] == 1)
    & (calendar['start_date'] <= target_date)
    & (calendar['end_date'] >= target_date),
    'service_id',
]

# calendar_dates exceptions: type 1 adds a service on the date, type 2 removes it
exceptions = calendar_dates[calendar_dates['date'] == target_date]
removed = exceptions.loc[exceptions['exception_type'] == 2, 'service_id']
added = exceptions.loc[exceptions['exception_type'] == 1, 'service_id']

final_services = pd.concat([base_services[~base_services.isin(removed)], added]).drop_duplicates()

### Get valid trips for that date
valid_trips = trips[trips['service_id'].isin(final_services)]

# Join with routes to get route_long_name; one row per route_id so that the
# join can never duplicate (and double-count) trips
valid_trips = valid_trips.merge(
    routes[['route_id', 'route_long_name']].drop_duplicates('route_id'),
    on='route_id',
    how='left',
)

# Get the first stop per trip. Work on a narrow copy so the shared `stoptimes`
# frame is not modified by this cell.
first_stops = (
    stoptimes[['trip_id', 'stop_sequence', 'arrival_time']]
    .astype({'stop_sequence': int})
    .sort_values(['trip_id', 'stop_sequence'])
    .groupby('trip_id', as_index=False)
    .first()
)[['trip_id', 'arrival_time']]

# Now join with trips to get route_id and shape_id
trip_times = valid_trips[['trip_id', 'route_id', 'route_long_name', 'shape_id', 'direction_id']].merge(first_stops, on='trip_id')

# Convert 'arrival_time' (HH:MM:SS) to total minutes since midnight
split_time = trip_times['arrival_time'].str.split(':', expand=True).astype(int)
trip_times['minutes'] = split_time[0] * 60 + split_time[1]

# Filter out the trips that start on the next service day (24:00:00 or later);
# .copy() so the column added below is written to an independent frame
trip_times = trip_times[trip_times['minutes'] < 1440].copy()

# Group and summarize
trip_times['period'] = trip_times['minutes'].apply(label_period)

group_keys = ['route_id', 'route_long_name', 'direction_id']
summary = trip_times.groupby(group_keys + ['period']).agg(
    trips=('trip_id', 'count')
).reset_index()

# Pivot to wide format; reindex guarantees a column for every period even when
# no trip falls into it
pivot_summary = (
    summary.pivot_table(index=group_keys, columns='period', values='trips', fill_value=0)
    .reindex(columns=list(period_hours), fill_value=0)
    .astype(int)
)

# Add total trips and variants
route_groups = trip_times.groupby(group_keys)
pivot_summary['Total Trips'] = route_groups['trip_id'].count()
pivot_summary['Total Variants'] = route_groups['shape_id'].nunique()

# Convert counts to frequencies (trips per hour, one decimal); periods without
# service become NaN instead of 0
for period, hours in period_hours.items():
    counts = pivot_summary[period]
    pivot_summary[period] = (counts.where(counts > 0) / hours).round(1)

# Reorder columns
order = list(period_hours) + ['Total Trips', 'Total Variants']
pivot_summary1 = pivot_summary[order]

# Collapse the two directions of a route: busiest direction per period,
# totals summed over directions
collapse_rules = {period: 'max' for period in period_hours}
collapse_rules['Total Trips'] = 'sum'
collapse_rules['Total Variants'] = 'sum'
collapsed = pivot_summary1.groupby(['route_id', 'route_long_name']).agg(collapse_rules).reset_index()

collapsed = collapsed[['route_id', 'route_long_name'] + order]

collapsed.to_csv('route_summary_frequencies1.csv', index=False)



In [86]:
# Load all GTFS tables of the 2025March bundle; this rebinds `gtfs`, replacing the
# previously loaded bundle.
gtfs = import_bundle('2025March_Prod_r01_b05_SHUTTLES_PREDATE_SCHEDULED')

Loading trips
Loading shapes
Loading stops
Loading stop times
Loading calendar
Loading calendar_dates
Loading routes
GTFS data for 2025March_Prod_r01_b05_SHUTTLES_PREDATE_SCHEDULED successfully loaded


In [87]:
# Rebind every table variable to the bundle currently held in `gtfs`.
# `routes` must be refreshed as well, otherwise the analysis below would join
# this bundle's trips against the routes of the previously loaded bundle.
trips = gtfs['trips']
stoptimes = gtfs['stoptimes']
stops = gtfs['stops']
shapes = gtfs['shapes']
calendar = gtfs['calendar']
calendar_dates = gtfs['calendar_dates']
routes = gtfs['routes']

In [88]:
# Set target date (YYYYMMDD); the calendar weekday column is derived from it so
# the two can never disagree.
target_date = 20250401
target_day = pd.to_datetime(str(target_date), format='%Y%m%d').day_name().lower()  # 'tuesday'

# Length in hours of each service period, in display order.
# Must stay in sync with the boundaries used by label_period().
period_hours = {
    'Overnight': 4,       # 12:00 AM – 4:00 AM
    'Early Morning': 2,   # 4:00 AM – 6:00 AM
    'AM': 3,              # 6:00 AM – 9:00 AM
    'Midday': 6,          # 9:00 AM – 3:00 PM
    'PM': 4,              # 3:00 PM – 7:00 PM
    'Evening': 2,         # 7:00 PM – 9:00 PM
    'Late Evening': 3,    # 9:00 PM – 12:00 AM
}

# Get active service_ids on that date
# NOTE(review): assumes start_date / end_date / date are stored as YYYYMMDD integers — confirm
base_services = calendar.loc[
    (calendar[target_day] == 1)
    & (calendar['start_date'] <= target_date)
    & (calendar['end_date'] >= target_date),
    'service_id',
]

# calendar_dates exceptions: type 1 adds a service on the date, type 2 removes it
exceptions = calendar_dates[calendar_dates['date'] == target_date]
removed = exceptions.loc[exceptions['exception_type'] == 2, 'service_id']
added = exceptions.loc[exceptions['exception_type'] == 1, 'service_id']

final_services = pd.concat([base_services[~base_services.isin(removed)], added]).drop_duplicates()

### Get valid trips for that date
valid_trips = trips[trips['service_id'].isin(final_services)]

# Join with routes to get route_short_name; one row per route_id so that the
# join can never duplicate (and double-count) trips.
# NOTE(review): `routes` has to belong to the bundle currently held in `gtfs` —
# confirm it was rebound after the most recent import_bundle call.
valid_trips = valid_trips.merge(
    routes[['route_id', 'route_short_name']].drop_duplicates('route_id'),
    on='route_id',
    how='left',
)

# Get the first stop per trip. Work on a narrow copy so the shared `stoptimes`
# frame is not modified by this cell.
first_stops = (
    stoptimes[['trip_id', 'stop_sequence', 'arrival_time']]
    .astype({'stop_sequence': int})
    .sort_values(['trip_id', 'stop_sequence'])
    .groupby('trip_id', as_index=False)
    .first()
)[['trip_id', 'arrival_time']]

# Now join with trips to get route_id and shape_id
trip_times = valid_trips[['trip_id', 'route_id', 'route_short_name', 'shape_id', 'direction_id']].merge(first_stops, on='trip_id')

# Convert 'arrival_time' (HH:MM:SS) to total minutes since midnight
split_time = trip_times['arrival_time'].str.split(':', expand=True).astype(int)
trip_times['minutes'] = split_time[0] * 60 + split_time[1]

# Filter out the trips that start on the next service day (24:00:00 or later);
# .copy() so the column added below is written to an independent frame
trip_times = trip_times[trip_times['minutes'] < 1440].copy()

# Group and summarize
trip_times['period'] = trip_times['minutes'].apply(label_period)

group_keys = ['route_id', 'route_short_name', 'direction_id']
summary = trip_times.groupby(group_keys + ['period']).agg(
    trips=('trip_id', 'count')
).reset_index()

# Pivot to wide format; reindex guarantees a column for every period even when
# no trip falls into it
pivot_summary = (
    summary.pivot_table(index=group_keys, columns='period', values='trips', fill_value=0)
    .reindex(columns=list(period_hours), fill_value=0)
    .astype(int)
)

# Add total trips and variants
route_groups = trip_times.groupby(group_keys)
pivot_summary['Total Trips'] = route_groups['trip_id'].count()
pivot_summary['Total Variants'] = route_groups['shape_id'].nunique()

# Convert counts to frequencies (trips per hour, one decimal); periods without
# service become NaN instead of 0
for period, hours in period_hours.items():
    counts = pivot_summary[period]
    pivot_summary[period] = (counts.where(counts > 0) / hours).round(1)

# Reorder columns
order = list(period_hours) + ['Total Trips', 'Total Variants']
pivot_summary1 = pivot_summary[order]

# Collapse the two directions of a route: busiest direction per period,
# totals summed over directions
collapse_rules = {period: 'max' for period in period_hours}
collapse_rules['Total Trips'] = 'sum'
collapse_rules['Total Variants'] = 'sum'
collapsed = pivot_summary1.groupby(['route_id', 'route_short_name']).agg(collapse_rules).reset_index()

collapsed = collapsed[['route_id', 'route_short_name'] + order]

collapsed.to_csv('route_summary_frequencies2.csv', index=False)



In [89]:
# Columns that keep their original name in each per-bundle summary; every other
# column gets a bundle suffix so the two summaries can sit side by side.
# NOTE(review): 'hour' does not appear among the columns written by the summary
# cells, and the totals are suffixed only for the second bundle — confirm intent.
unsuffixed_b1 = ['route_id', 'route_long_name', 'Total Trips', 'Total Variants']
unsuffixed_b2 = ['route_id', 'hour']

df1 = pd.read_csv('route_summary_frequencies1.csv').rename(
    columns=lambda name: name if name in unsuffixed_b1 else f"{name}_b1"
)
df2 = pd.read_csv('route_summary_frequencies2.csv').rename(
    columns=lambda name: name if name in unsuffixed_b2 else f"{name}_b2"
)

# Outer-merge on route_id only (the summaries are already collapsed over
# direction), keeping routes that exist in just one of the bundles
merged = df1.merge(df2, on=['route_id'], how='outer', sort=True)

merged.to_csv("route_summary_frequencies_combined1.csv", index=False)
