In [96]:
%pip install IPython

Note: you may need to restart the kernel to use updated packages.


In [97]:
import importlib
from IPython.core.magic import register_cell_magic
from IPython import get_ipython
# Conditional skipping of https://kioku-space.com/en/jupyter-skip-execution/
@register_cell_magic
def skip_if(line, cell):
    if eval(line):
        return
    get_ipython().run_cell(cell)

get_ipython().register_magic_function(skip_if, 'cell')

In [98]:
%%skip_if importlib.util.find_spec('Jinja2') is not None
%pip install Jinja2

Note: you may need to restart the kernel to use updated packages.


In [99]:
%%skip_if importlib.util.find_spec('nbformat') is not None
%pip install nbformat

In [100]:
import numpy as np

In [101]:
# Python packages
import pandas as pd
import json
import plotly.graph_objects as go

In [102]:
try:
    with open('raw_data/yqrStops.json', 'r') as f:
        stop_data = json.load(f)
except json.decoder.JSONDecodeError as e:
    print("Invalid JSON", e)

stop_data

{'displayFieldName': 'STOP_NAME',
 'fieldAliases': {'OBJECTID': 'OBJECTID',
  'ONSTREET': 'ONSTREET',
  'ATSTREET': 'ATSTREET',
  'LON': 'LON',
  'LAT': 'LAT',
  'STOP_ID': 'STOP_ID',
  'STOP_NAME': 'STOP_NAME',
  'GLOBALID': 'GLOBALID'},
 'geometryType': 'esriGeometryPoint',
 'spatialReference': {'wkid': 26913, 'latestWkid': 26913},
 'fields': [{'name': 'OBJECTID',
   'type': 'esriFieldTypeOID',
   'alias': 'OBJECTID'},
  {'name': 'ONSTREET',
   'type': 'esriFieldTypeString',
   'alias': 'ONSTREET',
   'length': 200},
  {'name': 'ATSTREET',
   'type': 'esriFieldTypeString',
   'alias': 'ATSTREET',
   'length': 200},
  {'name': 'LON',
   'type': 'esriFieldTypeString',
   'alias': 'LON',
   'length': 200},
  {'name': 'LAT',
   'type': 'esriFieldTypeString',
   'alias': 'LAT',
   'length': 200},
  {'name': 'STOP_ID',
   'type': 'esriFieldTypeString',
   'alias': 'STOP_ID',
   'length': 200},
  {'name': 'STOP_NAME',
   'type': 'esriFieldTypeString',
   'alias': 'STOP_NAME',
   'length': 2

In [103]:
df_stops = pd.json_normalize(stop_data)
df_stops

Unnamed: 0,displayFieldName,geometryType,fields,features,exceededTransferLimit,fieldAliases.OBJECTID,fieldAliases.ONSTREET,fieldAliases.ATSTREET,fieldAliases.LON,fieldAliases.LAT,fieldAliases.STOP_ID,fieldAliases.STOP_NAME,fieldAliases.GLOBALID,spatialReference.wkid,spatialReference.latestWkid
0,STOP_NAME,esriGeometryPoint,"[{'name': 'OBJECTID', 'type': 'esriFieldTypeOI...","[{'attributes': {'OBJECTID': 59930, 'ONSTREET'...",True,OBJECTID,ONSTREET,ATSTREET,LON,LAT,STOP_ID,STOP_NAME,GLOBALID,26913,26913


In [104]:
df_stops = pd.json_normalize(stop_data['features'])
df_stops

Unnamed: 0,attributes.OBJECTID,attributes.ONSTREET,attributes.ATSTREET,attributes.LON,attributes.LAT,attributes.STOP_ID,attributes.STOP_NAME,attributes.GLOBALID,geometry.x,geometry.y
0,59930,University Park Dr,Quance St (NB),-104.54913,50.44416,0742,University Park Dr @ Quance St (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5.588113e+06
1,59931,University Park Dr,Vic Square (NB),-104.54915,50.44592,0743,University Park Dr @ Vic Square (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5.588309e+06
2,59932,Fleet St,North Service Rd (NB),-104.549126,50.448609,0744,Fleet St @ North Service Rd (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5.588608e+06
3,59933,Fleet St,Fines Dr (NB),-104.549111,50.449614,0745,Fleet St @ Fines Dr (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5.588720e+06
4,59934,Cambridge Ave,Milford Cres (WB),-104.5513,50.45041,0746,Cambridge Ave @ Milford Cres (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5.588807e+06
...,...,...,...,...,...,...,...,...,...,...
995,59519,Broad St,Lakeshore Dr (NB),-104.60238,50.42909,0277,Broad St @ Lakeshore Dr (NB),{6C99A3FE-1D49-4D9F-8CF4-47EB76D3C380},528241.7908,5.586416e+06
996,59520,Broad St,Quinn Dr (NB),-104.605029,50.43435,0279,Broad St @ Quinn Dr (NB),{FC736381-BD45-4215-9FAE-621438F00C32},528050.5326,5.587000e+06
997,59521,Broad St,Broadway Ave (NB),-104.60624,50.43833,0280,Broad St @ Broadway Ave (NB),{43F34201-A67B-463E-A50B-A53D6AA58C51},527962.1839,5.587442e+06
998,59522,Broad St,College Ave (NB),-104.60635,50.44131,0281,Broad St @ College Ave (NB),{37C536D7-5591-4DE9-8594-3D443E20D884},527952.6173,5.587773e+06


In [105]:
print("Bus Stop Data Types")
print(df_stops.dtypes[df_stops.columns[:8]])
print("---------------------------------")
print(f"\nSample LAT values: {df_stops['attributes.LAT'].head(2).tolist()}")
print("Bus Stop Missing Values")
missing = df_stops.isnull().sum()
print(missing[missing > 0])
print("---------------------------------")
print("Bus Stop Duplicates")
duplicates = df_stops.duplicated(subset=['attributes.STOP_ID']).sum()
print(f"Duplicate stop IDs: {duplicates}")
print("---------------------------------")

Bus Stop Data Types
attributes.OBJECTID      int64
attributes.ONSTREET     object
attributes.ATSTREET     object
attributes.LON          object
attributes.LAT          object
attributes.STOP_ID      object
attributes.STOP_NAME    object
attributes.GLOBALID     object
dtype: object
---------------------------------

Sample LAT values: ['50.44416', '50.44592']
Bus Stop Missing Values
attributes.ATSTREET    1
dtype: int64
---------------------------------
Bus Stop Duplicates
Duplicate stop IDs: 0
---------------------------------


In [106]:
print("======Sample Raw Bus Stop Data=======")
df_stops[['attributes.STOP_ID','attributes.ONSTREET', 'attributes.ATSTREET', 'attributes.LAT', 'attributes.LON']].head(3)



Unnamed: 0,attributes.STOP_ID,attributes.ONSTREET,attributes.ATSTREET,attributes.LAT,attributes.LON
0,742,University Park Dr,Quance St (NB),50.44416,-104.54913
1,743,University Park Dr,Vic Square (NB),50.44592,-104.54915
2,744,Fleet St,North Service Rd (NB),50.448609,-104.549126


 # Cleaning Bus Stop data using Data Wrangler extension

In [107]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df_stops):
    # Remove leading and trailing whitespace in column: 'attributes.ONSTREET'
    df_stops['attributes.ONSTREET'] = df_stops['attributes.ONSTREET'].str.strip()
    # Convert text to uppercase in column: 'attributes.ONSTREET'
    df_stops['attributes.ONSTREET'] = df_stops['attributes.ONSTREET'].str.upper()
    # Remove leading and trailing whitespace in column: 'attributes.ATSTREET'
    df_stops['attributes.ATSTREET'] = df_stops['attributes.ATSTREET'].str.strip()
    # Convert text to uppercase in column: 'attributes.ATSTREET'
    df_stops['attributes.ATSTREET'] = df_stops['attributes.ATSTREET'].str.upper()
    # Remove leading and trailing whitespace in column: 'attributes.LON'
    df_stops['attributes.LON'] = df_stops['attributes.LON'].str.strip()
    # Remove leading and trailing whitespace in column: 'attributes.LAT'
    df_stops['attributes.LAT'] = df_stops['attributes.LAT'].str.strip()
    # Remove leading and trailing whitespace in column: 'stop_id'
    df_stops['attributes.STOP_ID'] = df_stops['attributes.STOP_ID'].str.strip()
    # Remove leading and trailing whitespace in column: 'attributes.STOP_NAME'
    df_stops['attributes.STOP_NAME'] = df_stops['attributes.STOP_NAME'].str.strip()
    # Convert text to uppercase in column: 'attributes.STOP_NAME'
    df_stops['attributes.STOP_NAME'] = df_stops['attributes.STOP_NAME'].str.upper()
    # Remove leading and trailing whitespace in column: 'attributes.GLOBALID'
    df_stops['attributes.GLOBALID'] = df_stops['attributes.GLOBALID'].str.strip()
    # Replace missing values with "DOROTHY ST (SB)" in column: 'attributes.ATSTREET'
    df_stops = df_stops.fillna({'attributes.ATSTREET':"DOROTHY ST (SB)"})
    # Replace all instances of "1060 DOROTHY ST (SB)" with "DOROTHY ST" in column: 'attributes.ONSTREET'
    df_stops['attributes.ONSTREET'] = df_stops['attributes.ONSTREET'].str.replace("1060 DOROTHY ST (SB)", "DOROTHY ST", case=False, regex=False)
    # Rename column 'attributes.ONSTREET' to 'on_street'
    df_stops = df_stops.rename(columns={'attributes.ONSTREET': 'on_street'})
    # Rename column 'attributes.ATSTREET' to 'at_street'
    df_stops = df_stops.rename(columns={'attributes.ATSTREET': 'at_street'})
    # Rename column 'attributes.LON' to 'lon'
    df_stops = df_stops.rename(columns={'attributes.LON': 'lon'})
    # Rename column 'attributes.LAT' to 'lat'
    df_stops = df_stops.rename(columns={'attributes.LAT': 'lat'})
    # Rename column 'stop_id' to 'stop_id'
    df_stops = df_stops.rename(columns={'attributes.STOP_ID': 'stop_id'})
    # Rename column 'attributes.STOP_NAME' to 'stop_name'
    df_stops = df_stops.rename(columns={'attributes.STOP_NAME': 'stop_name'})
    # Rename column 'attributes.GLOBALID' to 'global_id'
    df_stops = df_stops.rename(columns={'attributes.GLOBALID': 'global_id'})
    # Rename column 'attributes.OBJECTID' to 'object_id'
    df_stops = df_stops.rename(columns={'attributes.OBJECTID': 'object_id'})
    return df_stops

clean_stops = clean_data(df_stops.copy())
clean_stops.head()

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y
0,59930,UNIVERSITY PARK DR,QUANCE ST (NB),-104.54913,50.44416,742,UNIVERSITY PARK DR @ QUANCE ST (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5588113.0
1,59931,UNIVERSITY PARK DR,VIC SQUARE (NB),-104.54915,50.44592,743,UNIVERSITY PARK DR @ VIC SQUARE (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5588309.0
2,59932,FLEET ST,NORTH SERVICE RD (NB),-104.549126,50.448609,744,FLEET ST @ NORTH SERVICE RD (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5588608.0
3,59933,FLEET ST,FINES DR (NB),-104.549111,50.449614,745,FLEET ST @ FINES DR (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5588720.0
4,59934,CAMBRIDGE AVE,MILFORD CRES (WB),-104.5513,50.45041,746,CAMBRIDGE AVE @ MILFORD CRES (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5588807.0


In [108]:
try:
    with open('raw_data/yqrRoutes.json', 'r') as f:
        routes_data = json.load(f)
    print("✓ Loaded routes data")
except json.decoder.JSONDecodeError as e:
    print("Invalid JSON", e)

df_routes = pd.json_normalize(routes_data['features'])
df_routes

✓ Loaded routes data


Unnamed: 0,attributes.OBJECTID,attributes.SHAPE.LEN,attributes.ROUTE_NAME,attributes.ROUTE_NUM,attributes.ROUTE_ID,attributes.SHAPE_ID,attributes.ROUTE_COLOR,attributes.ROUTE_TEXT_COLOR,geometry.paths
0,9601,184989.734238,RCMP - NORMANVIEW,10,10-44,100009,FF0FF9,,"[[[521564.20469999965, 5591834.3639], [521562...."
1,9602,47401.585527,HARBOUR LANDING - UNIVERSITY,18,18-44,180005,80FF00,,"[[[526923.1776999999, 5585099.4342], [526923.0..."
2,9603,180172.679776,UNIVERSITY - SHERWOOD ESTATES,3,3-44,30010,A8A800,,"[[[530186.9751000004, 5583983.2191], [530210.6..."
3,9604,92096.531471,ALBERT S EXPRESS - ALBERT N EXPRESS,40,40-44,400004,00CECE,,"[[[525147.5691, 5594036.3595], [525147.4778000..."
4,9605,57978.96154,ARCOLA E EXP - ARCOLA DWTN EXP,60,60-44,600017,808000,,"[[[533800.2019999996, 5585861.5229], [533799.1..."
5,9606,115065.916376,GLENCAIRN - WHITMORE,7,7-44,70002,0000FF,FFFFFF,"[[[526673.1995999999, 5583389.768200001], [526..."
6,9607,664900.169679,DIEPPE/WESTERRA - BROAD NORTH,1,1-44,10027,1AA68A,,"[[[527280.2152000004, 5592151.0746], [527305.1..."
7,9608,492246.798973,VARSITY PARK - MOUNT ROYAL,12,12-44,120008,00C100,,"[[[533014.9890999999, 5586423.8792], [533016.2..."
8,9609,24622.47965,AIRPORT - DOWNTOWN,24,24-44,240010,9D9D9D,,"[[[527114.4759999998, 5588534.237], [527114.12..."
9,9610,94968.614798,UNIVERSITY - ARCOLA EAST,22,22-44,220005,00A600,,"[[[528963.5651000002, 5584995.586999999], [528..."


In [109]:
print("Bus Route Data Types")
print(df_routes.dtypes[df_routes.columns[:8]])
print("---------------------------------")
print("Bus Route Missing Values")
missing = df_routes.isnull().sum()
print(missing[missing > 0])
print("---------------------------------")
print("Bus Route Duplicates")
duplicates = df_routes.duplicated(subset=['attributes.ROUTE_ID']).sum()
print(f"Duplicate stop IDs: {duplicates}")
print("---------------------------------")

Bus Route Data Types
attributes.OBJECTID              int64
attributes.SHAPE.LEN           float64
attributes.ROUTE_NAME           object
attributes.ROUTE_NUM            object
attributes.ROUTE_ID             object
attributes.SHAPE_ID             object
attributes.ROUTE_COLOR          object
attributes.ROUTE_TEXT_COLOR     object
dtype: object
---------------------------------
Bus Route Missing Values
attributes.ROUTE_TEXT_COLOR    16
dtype: int64
---------------------------------
Bus Route Duplicates
Duplicate stop IDs: 0
---------------------------------


 # Cleaning Bus Route Data using Data Wrangler

In [110]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df_routes):
    # Remove leading and trailing whitespace in columns: 'attributes.ROUTE_NAME', 'attributes.ROUTE_NUM' and 4 other columns
    df_routes['attributes.ROUTE_NAME'] = df_routes['attributes.ROUTE_NAME'].str.strip()
    df_routes['attributes.ROUTE_NUM'] = df_routes['attributes.ROUTE_NUM'].str.strip()
    df_routes['attributes.ROUTE_ID'] = df_routes['attributes.ROUTE_ID'].str.strip()
    df_routes['attributes.SHAPE_ID'] = df_routes['attributes.SHAPE_ID'].str.strip()
    # add a hashtag in front of the route colour hex values 
    df_routes['attributes.ROUTE_COLOR'] = '#' + (df_routes['attributes.ROUTE_COLOR'].str.strip()).astype(str)
    # Convert text to uppercase in column: 'attributes.ROUTE_NAME'
    df_routes['attributes.ROUTE_NAME'] = df_routes['attributes.ROUTE_NAME'].str.upper()
    # Replace missing values with "FFFFFF" in column: 'attributes.ROUTE_TEXT_COLOR'
    df_routes = df_routes.fillna({'attributes.ROUTE_TEXT_COLOR': "FFFFFF"})
    # add a hashtag in front of the route text colour hex values
    df_routes['attributes.ROUTE_TEXT_COLOR'] = "#"+ (df_routes['attributes.ROUTE_TEXT_COLOR'].str.strip()).astype(str)
     # Rename column 'attributes.SHAPE.LEN' to 'shape_length'
    df_routes = df_routes.rename(columns={'attributes.SHAPE.LEN': 'shape_length'})
    # Rename column 'attributes.ROUTE_NAME' to 'route_name'
    df_routes = df_routes.rename(columns={'attributes.ROUTE_NAME': 'route_name'})
    # Rename column 'attributes.ROUTE_NUM' to 'route_num'
    df_routes = df_routes.rename(columns={'attributes.ROUTE_NUM': 'route_num'})
    # Rename column 'attributes.ROUTE_ID' to 'route_id'
    df_routes = df_routes.rename(columns={'attributes.ROUTE_ID': 'route_id'})
    # Rename column 'attributes.SHAPE_ID' to 'shape_id'
    df_routes = df_routes.rename(columns={'attributes.SHAPE_ID': 'shape_id'})
    # Rename column 'attributes.ROUTE_COLOR' to 'route_color'
    df_routes = df_routes.rename(columns={'attributes.ROUTE_COLOR': 'route_color'})
    # Rename column 'attributes.ROUTE_TEXT_COLOR' to 'route_text_color'
    df_routes = df_routes.rename(columns={'attributes.ROUTE_TEXT_COLOR': 'route_text_color'})
    # Rename column 'geometry.paths' to 'geometry_paths'
    df_routes = df_routes.rename(columns={'geometry.paths': 'geometry_paths'})
    # Rename column 'attributes.OBJECTID' to 'object_id'
    df_routes = df_routes.rename(columns={'attributes.OBJECTID': 'object_id'})
    return df_routes

clean_routes = clean_data(df_routes.copy())
clean_routes.head()

Unnamed: 0,object_id,shape_length,route_name,route_num,route_id,shape_id,route_color,route_text_color,geometry_paths
0,9601,184989.734238,RCMP - NORMANVIEW,10,10-44,100009,#FF0FF9,#FFFFFF,"[[[521564.20469999965, 5591834.3639], [521562...."
1,9602,47401.585527,HARBOUR LANDING - UNIVERSITY,18,18-44,180005,#80FF00,#FFFFFF,"[[[526923.1776999999, 5585099.4342], [526923.0..."
2,9603,180172.679776,UNIVERSITY - SHERWOOD ESTATES,3,3-44,30010,#A8A800,#FFFFFF,"[[[530186.9751000004, 5583983.2191], [530210.6..."
3,9604,92096.531471,ALBERT S EXPRESS - ALBERT N EXPRESS,40,40-44,400004,#00CECE,#FFFFFF,"[[[525147.5691, 5594036.3595], [525147.4778000..."
4,9605,57978.96154,ARCOLA E EXP - ARCOLA DWTN EXP,60,60-44,600017,#808000,#FFFFFF,"[[[533800.2019999996, 5585861.5229], [533799.1..."


 # Loading GTFS Data

In [111]:
# load gtfs data
stops_gtfs = pd.read_csv('raw_data/gtfs_data/stops.txt')
routes_gtfs = pd.read_csv('raw_data/gtfs_data/routes.txt')
trips_gtfs = pd.read_csv('raw_data/gtfs_data/trips.txt')
times_gtfs = pd.read_csv('raw_data/gtfs_data/stop_times.txt')

In [112]:
stops_gtfs

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,2,,Courtney St @ Dewdney Ave (NB),,50.455930,-104.686840,,,0,
1,3,,Courtney St @ 8th Ave (NB),,50.457560,-104.686840,,,0,
2,4,,Courtney St @ 7th Ave (NB),,50.458620,-104.686840,,,0,
3,5,,Courtney St @ 6th Ave (NB),,50.459540,-104.686850,,,0,
4,6,,Courtney St @ Kelly Ave (NB),,50.460840,-104.686840,,,0,
...,...,...,...,...,...,...,...,...,...,...
1395,1671,,Chuka Blvd @ Buckingham Dr (SB),,50.429152,-104.512485,,,0,
1396,1672,,Saskatchewan Dr @ Hamilton St (EB),,50.452173,-104.609027,,,0,
1397,1673,,Woodland Grove Dr @ Arcola Ave (NB),,50.420358,-104.526300,,,0,
1398,1674,,Anaquod Rd @ Optimist (SB),,50.443629,-104.505364,,,0,


In [113]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(stops_gtfs):
    # Convert text to uppercase in column: 'stop_name'
    stops_gtfs['stop_name'] = stops_gtfs['stop_name'].str.upper()
    # Remove leading and trailing whitespace in column: 'stop_name'
    stops_gtfs['stop_name'] = stops_gtfs['stop_name'].str.strip()
    return stops_gtfs

stops_gtfs_clean = clean_data(stops_gtfs.copy())
stops_gtfs_clean.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,2,,COURTNEY ST @ DEWDNEY AVE (NB),,50.45593,-104.68684,,,0,
1,3,,COURTNEY ST @ 8TH AVE (NB),,50.45756,-104.68684,,,0,
2,4,,COURTNEY ST @ 7TH AVE (NB),,50.45862,-104.68684,,,0,
3,5,,COURTNEY ST @ 6TH AVE (NB),,50.45954,-104.68685,,,0,
4,6,,COURTNEY ST @ KELLY AVE (NB),,50.46084,-104.68684,,,0,


In [114]:
routes_gtfs

Unnamed: 0,route_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1-44,1,DIEPPE/WESTERRA - BROAD NORTH,,3,,1AA68A,
1,1-45,1,DIEPPE/WESTERRA - BROAD NORTH,,3,,1AA68A,
2,2-44,2,ARGYLE PARK - WOOD MEADOWS,,3,,FF8000,
3,2-45,2,ARGYLE PARK - WOOD MEADOWS,,3,,FF8000,
4,3-44,3,UNIVERSITY - SHERWOOD ESTATES,,3,,A8A800,
5,3-45,3,UNIVERSITY - SHERWOOD ESTATES,,3,,A8A800,
6,4-44,4,HILLSDALE - WALSH ACRES,,3,,EAEA00,
7,4-45,4,HILLSDALE - WALSH ACRES,,3,,EAEA00,
8,5-44,5,UPLANDS - DOWNTOWN,,3,,0000A0,FFFFFF
9,5-45,5,UPLANDS - DOWNTOWN,,3,,0000A0,FFFFFF


In [115]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(routes_gtfs):
    # Convert text to uppercase in column: 'route_long_name'
    routes_gtfs['route_long_name'] = routes_gtfs['route_long_name'].str.upper()
    # Remove leading and trailing whitespace in column: 'route_long_name'
    routes_gtfs['route_long_name'] = routes_gtfs['route_long_name'].str.strip()
    return routes_gtfs

routes_gtfs_clean = clean_data(routes_gtfs.copy())
routes_gtfs_clean.head()

Unnamed: 0,route_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1-44,1,DIEPPE/WESTERRA - BROAD NORTH,,3,,1AA68A,
1,1-45,1,DIEPPE/WESTERRA - BROAD NORTH,,3,,1AA68A,
2,2-44,2,ARGYLE PARK - WOOD MEADOWS,,3,,FF8000,
3,2-45,2,ARGYLE PARK - WOOD MEADOWS,,3,,FF8000,
4,3-44,3,UNIVERSITY - SHERWOOD ESTATES,,3,,A8A800,


In [116]:
trips_gtfs

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,18-44,202502-MVS-Weekday-01,218095-202502-MVS-Weekday-01,UNIVERSITY,0,12208,180008
1,18-44,202502-MVS-Weekday-01,218096-202502-MVS-Weekday-01,UNIVERSITY,0,12209,180008
2,18-44,202502-MVS-Weekday-01,218097-202502-MVS-Weekday-01,UNIVERSITY,0,12208,180008
3,18-44,202502-MVS-Weekday-01,218098-202502-MVS-Weekday-01,UNIVERSITY,0,12209,180008
4,18-44,202502-MVS-Weekday-01,218099-202502-MVS-Weekday-01,UNIVERSITY,0,12208,180008
...,...,...,...,...,...,...,...
5073,18-45,202503-MVS-Sunday-01,231838-202503-MVS-Sunday-01,HARBOUR LANDING,1,12705,180009
5074,18-45,202503-MVS-Sunday-01,231839-202503-MVS-Sunday-01,HARBOUR LANDING,1,12705,180009
5075,18-45,202503-MVS-Sunday-01,231840-202503-MVS-Sunday-01,HARBOUR LANDING,1,12705,180009
5076,18-45,202503-MVS-Sunday-01,231841-202503-MVS-Sunday-01,HARBOUR LANDING,1,12705,180009


In [117]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(trips_gtfs):
    # Remove leading and trailing whitespace in columns: 'route_id', 'service_id' and 2 other columns
    trips_gtfs['route_id'] = trips_gtfs['route_id'].str.strip()
    trips_gtfs['service_id'] = trips_gtfs['service_id'].str.strip()
    trips_gtfs['trip_id'] = trips_gtfs['trip_id'].str.strip()
    trips_gtfs['trip_headsign'] = trips_gtfs['trip_headsign'].str.strip()
    # Convert text to uppercase in columns: 'service_id', 'trip_id', 'trip_headsign'
    trips_gtfs['service_id'] = trips_gtfs['service_id'].str.upper()
    trips_gtfs['trip_id'] = trips_gtfs['trip_id'].str.upper()
    trips_gtfs['trip_headsign'] = trips_gtfs['trip_headsign'].str.upper()
    return trips_gtfs

trips_gtfs_clean = clean_data(trips_gtfs.copy())
trips_gtfs_clean.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id
0,18-44,202502-MVS-WEEKDAY-01,218095-202502-MVS-WEEKDAY-01,UNIVERSITY,0,12208,180008
1,18-44,202502-MVS-WEEKDAY-01,218096-202502-MVS-WEEKDAY-01,UNIVERSITY,0,12209,180008
2,18-44,202502-MVS-WEEKDAY-01,218097-202502-MVS-WEEKDAY-01,UNIVERSITY,0,12208,180008
3,18-44,202502-MVS-WEEKDAY-01,218098-202502-MVS-WEEKDAY-01,UNIVERSITY,0,12209,180008
4,18-44,202502-MVS-WEEKDAY-01,218099-202502-MVS-WEEKDAY-01,UNIVERSITY,0,12208,180008


In [118]:
times_gtfs

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled,timepoint
0,218095-202502-MVS-Weekday-01,06:10:00,06:10:00,1566,1,0,0,0.000,1
1,218095-202502-MVS-Weekday-01,06:11:00,06:11:00,1403,2,0,0,0.263,0
2,218095-202502-MVS-Weekday-01,06:12:00,06:12:00,1418,3,0,0,0.485,0
3,218095-202502-MVS-Weekday-01,06:13:00,06:13:00,1411,4,0,0,0.717,0
4,218095-202502-MVS-Weekday-01,06:14:00,06:14:00,1412,5,0,0,0.904,0
...,...,...,...,...,...,...,...,...,...
285717,231842-202503-MVS-Sunday-01,19:06:00,19:06:00,1561,32,0,0,8.908,0
285718,231842-202503-MVS-Sunday-01,19:07:00,19:07:00,1562,33,0,0,9.275,0
285719,231842-202503-MVS-Sunday-01,19:08:00,19:08:00,1406,34,0,0,9.514,0
285720,231842-202503-MVS-Sunday-01,19:09:00,19:09:00,1407,35,0,0,9.732,0


In [119]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(times_gtfs):
    # Convert text to uppercase in column: 'trip_id'
    times_gtfs['trip_id'] = times_gtfs['trip_id'].str.upper()
    # Replace '24' in hour position with '00'
    times_gtfs['arrival_time'] = times_gtfs['arrival_time'].str.replace(r'^24', '00', regex=True)
    times_gtfs['departure_time'] = times_gtfs['departure_time'].str.replace(r'^24', '00', regex=True)
    # Remove leading and trailing whitespace in columns: 'trip_id', 'arrival_time', 'departure_time'
    times_gtfs['trip_id'] = times_gtfs['trip_id'].str.strip()
    times_gtfs['arrival_time'] = times_gtfs['arrival_time'].str.strip()
    times_gtfs['departure_time'] = times_gtfs['departure_time'].str.strip()
    return times_gtfs

times_gtfs_clean = clean_data(times_gtfs.copy())
times_gtfs_clean.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled,timepoint
0,218095-202502-MVS-WEEKDAY-01,06:10:00,06:10:00,1566,1,0,0,0.0,1
1,218095-202502-MVS-WEEKDAY-01,06:11:00,06:11:00,1403,2,0,0,0.263,0
2,218095-202502-MVS-WEEKDAY-01,06:12:00,06:12:00,1418,3,0,0,0.485,0
3,218095-202502-MVS-WEEKDAY-01,06:13:00,06:13:00,1411,4,0,0,0.717,0
4,218095-202502-MVS-WEEKDAY-01,06:14:00,06:14:00,1412,5,0,0,0.904,0


In [120]:
# Parse time columns
times_gtfs_clean['arrival_datetime'] = pd.to_datetime(
    times_gtfs_clean['arrival_time'], 
    format='%H:%M:%S',
    errors='coerce'
)

times_gtfs_clean['departure_datetime'] = pd.to_datetime(
    times_gtfs_clean['departure_time'], 
    format='%H:%M:%S',
    errors='coerce'
)

# Derive hour of day
times_gtfs_clean['arrival_hour'] = times_gtfs_clean['arrival_datetime'].dt.hour
times_gtfs_clean['arrival_minute'] = times_gtfs_clean['arrival_datetime'].dt.minute
times_gtfs_clean['departure_hour'] = times_gtfs_clean['departure_datetime'].dt.hour
times_gtfs_clean['departure_minute'] = times_gtfs_clean['departure_datetime'].dt.minute

print("Sample parsed times:")
times_gtfs_clean[['arrival_time', 'arrival_datetime', 'arrival_hour', 'arrival_minute', 'departure_datetime', 'departure_time', 'departure_hour', 'departure_minute']].head(10)

Sample parsed times:


Unnamed: 0,arrival_time,arrival_datetime,arrival_hour,arrival_minute,departure_datetime,departure_time,departure_hour,departure_minute
0,06:10:00,1900-01-01 06:10:00,6,10,1900-01-01 06:10:00,06:10:00,6,10
1,06:11:00,1900-01-01 06:11:00,6,11,1900-01-01 06:11:00,06:11:00,6,11
2,06:12:00,1900-01-01 06:12:00,6,12,1900-01-01 06:12:00,06:12:00,6,12
3,06:13:00,1900-01-01 06:13:00,6,13,1900-01-01 06:13:00,06:13:00,6,13
4,06:14:00,1900-01-01 06:14:00,6,14,1900-01-01 06:14:00,06:14:00,6,14
5,06:15:00,1900-01-01 06:15:00,6,15,1900-01-01 06:15:00,06:15:00,6,15
6,06:15:00,1900-01-01 06:15:00,6,15,1900-01-01 06:15:00,06:15:00,6,15
7,06:16:00,1900-01-01 06:16:00,6,16,1900-01-01 06:16:00,06:16:00,6,16
8,06:17:00,1900-01-01 06:17:00,6,17,1900-01-01 06:17:00,06:17:00,6,17
9,06:18:00,1900-01-01 06:18:00,6,18,1900-01-01 06:18:00,06:18:00,6,18


 # Imputation
 ## Find missing bus stops

In [121]:
print(len(clean_stops))
print(len(stops_gtfs_clean))
if len(stops_gtfs_clean) > len(clean_stops):
    print(f"There are {len(stops_gtfs_clean)-len(clean_stops)} missing stops")

1000
1400
There are 400 missing stops


In [122]:
# make sure the join keys have the same data type
clean_stops['stop_id'] = clean_stops['stop_id'].astype(str)
stops_gtfs_clean['stop_id'] = stops_gtfs_clean['stop_id'].astype(str)

In [123]:
# find the missing stops
missing_stops = stops_gtfs_clean[~stops_gtfs_clean['stop_id'].isin(clean_stops['stop_id'])]
len(missing_stops)

1224

In [124]:
merged_stops = pd.concat([clean_stops, missing_stops], ignore_index=True, sort=False)
merged_stops

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,stop_code,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,59930.0,UNIVERSITY PARK DR,QUANCE ST (NB),-104.54913,50.44416,0742,UNIVERSITY PARK DR @ QUANCE ST (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5.588113e+06,,,,,,,,
1,59931.0,UNIVERSITY PARK DR,VIC SQUARE (NB),-104.54915,50.44592,0743,UNIVERSITY PARK DR @ VIC SQUARE (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5.588309e+06,,,,,,,,
2,59932.0,FLEET ST,NORTH SERVICE RD (NB),-104.549126,50.448609,0744,FLEET ST @ NORTH SERVICE RD (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5.588608e+06,,,,,,,,
3,59933.0,FLEET ST,FINES DR (NB),-104.549111,50.449614,0745,FLEET ST @ FINES DR (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5.588720e+06,,,,,,,,
4,59934.0,CAMBRIDGE AVE,MILFORD CRES (WB),-104.5513,50.45041,0746,CAMBRIDGE AVE @ MILFORD CRES (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5.588807e+06,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2219,,,,,,1671,CHUKA BLVD @ BUCKINGHAM DR (SB),,,,,,50.429152,-104.512485,,,0.0,
2220,,,,,,1672,SASKATCHEWAN DR @ HAMILTON ST (EB),,,,,,50.452173,-104.609027,,,0.0,
2221,,,,,,1673,WOODLAND GROVE DR @ ARCOLA AVE (NB),,,,,,50.420358,-104.526300,,,0.0,
2222,,,,,,1674,ANAQUOD RD @ OPTIMIST (SB),,,,,,50.443629,-104.505364,,,0.0,


In [125]:
print(len(missing_stops), "new stops added")
merged_stops.head()

1224 new stops added


Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,stop_code,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,59930.0,UNIVERSITY PARK DR,QUANCE ST (NB),-104.54913,50.44416,742,UNIVERSITY PARK DR @ QUANCE ST (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5588113.0,,,,,,,,
1,59931.0,UNIVERSITY PARK DR,VIC SQUARE (NB),-104.54915,50.44592,743,UNIVERSITY PARK DR @ VIC SQUARE (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5588309.0,,,,,,,,
2,59932.0,FLEET ST,NORTH SERVICE RD (NB),-104.549126,50.448609,744,FLEET ST @ NORTH SERVICE RD (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5588608.0,,,,,,,,
3,59933.0,FLEET ST,FINES DR (NB),-104.549111,50.449614,745,FLEET ST @ FINES DR (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5588720.0,,,,,,,,
4,59934.0,CAMBRIDGE AVE,MILFORD CRES (WB),-104.5513,50.45041,746,CAMBRIDGE AVE @ MILFORD CRES (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5588807.0,,,,,,,,


In [126]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(merged_stops):
    for index, stop in merged_stops.iterrows():
        if pd.isna(stop['on_street']):
            merged_stops.at[index, 'on_street'] = (
                str(merged_stops.at[index, 'stop_name']).split(' @')[0]
            )
        if pd.isna(stop['at_street']):
            merged_stops.at[index, 'at_street'] = (
                str(merged_stops.at[index, 'stop_name']).split('@ ')[-1]
            )
    return merged_stops

merged_stops_clean = clean_data(merged_stops.copy())
merged_stops_clean.head()

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,stop_code,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,59930.0,UNIVERSITY PARK DR,QUANCE ST (NB),-104.54913,50.44416,742,UNIVERSITY PARK DR @ QUANCE ST (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5588113.0,,,,,,,,
1,59931.0,UNIVERSITY PARK DR,VIC SQUARE (NB),-104.54915,50.44592,743,UNIVERSITY PARK DR @ VIC SQUARE (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5588309.0,,,,,,,,
2,59932.0,FLEET ST,NORTH SERVICE RD (NB),-104.549126,50.448609,744,FLEET ST @ NORTH SERVICE RD (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5588608.0,,,,,,,,
3,59933.0,FLEET ST,FINES DR (NB),-104.549111,50.449614,745,FLEET ST @ FINES DR (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5588720.0,,,,,,,,
4,59934.0,CAMBRIDGE AVE,MILFORD CRES (WB),-104.5513,50.45041,746,CAMBRIDGE AVE @ MILFORD CRES (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5588807.0,,,,,,,,


In [127]:
#Adapted from the Plotly documentation https://plotly.com/python/tile-scatter-maps/#multiple-markers
stop_fig = go.Figure(go.Scattermap(
    lat=clean_stops['lat'],
    lon=clean_stops['lon'],
    mode='markers',
    marker=go.scattermap.Marker(size=9),
    text=clean_stops['stop_name'],
))

stop_fig.update_layout(
    autosize=True,
    hovermode='closest',
    map=dict(
        bearing=0,
        center=dict(lat=50.447992743219615, lon=-104.61228441057489),
        pitch=0,
        zoom=10
    ),
)

In [128]:
%%skip_if importlib.util.find_spec('pyproj') is not None
%pip install pyproj

In [129]:
from pyproj import Transformer

In [130]:
# Create transformer to convert from UTM to lat/lon
transformer = Transformer.from_crs("EPSG:26913", "EPSG:4326", always_xy=True)

route_idx = 0

route_name = clean_routes['route_name'].iloc[route_idx]
route_geometry = clean_routes['geometry_paths'].iloc[route_idx]
route_colour = clean_routes['route_color'].iloc[route_idx]
route_text_colour = clean_routes['route_text_color'].iloc[route_idx]

# Combine all paths into single lists
all_lons = []
all_lats = []

for path in route_geometry:
    for coordinate in path:
        lon, lat = transformer.transform(coordinate[0], coordinate[1])
        all_lons.append(lon)
        all_lats.append(lat)
    
    # Add None to separate path segments (prevents connecting them)
    all_lons.append(None)
    all_lats.append(None)

# Add as ONE trace
stop_fig.add_trace(go.Scattermap(
    lon=all_lons,
    lat=all_lats,
    mode='lines',
    line=dict(width=3, color=route_colour),
    name=route_name,
    hovertemplate=f'<b>{route_name}</b><extra></extra>'
))

stop_fig.show()

In [131]:
# Transformation: Bus Stops by Region

city_center_lon = -104.618
city_center_lat = 50.447
clean_stops['region'] = ''

for stop in range(len(clean_stops)):
    if float(clean_stops['lat'].iloc[stop]) > city_center_lat:
        if float(clean_stops['lon'].iloc[stop]) > city_center_lon:
            clean_stops.at[stop, 'region'] = "NE"
        else:
            clean_stops.at[stop, 'region'] = "NW"
    else:
        if float(clean_stops['lon'].iloc[stop]) > city_center_lon:
            clean_stops.at[stop, 'region'] = "SE"
        else:
            clean_stops.at[stop, 'region'] = "SW"

clean_stops

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,region
0,59930,UNIVERSITY PARK DR,QUANCE ST (NB),-104.54913,50.44416,0742,UNIVERSITY PARK DR @ QUANCE ST (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5.588113e+06,SE
1,59931,UNIVERSITY PARK DR,VIC SQUARE (NB),-104.54915,50.44592,0743,UNIVERSITY PARK DR @ VIC SQUARE (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5.588309e+06,SE
2,59932,FLEET ST,NORTH SERVICE RD (NB),-104.549126,50.448609,0744,FLEET ST @ NORTH SERVICE RD (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5.588608e+06,NE
3,59933,FLEET ST,FINES DR (NB),-104.549111,50.449614,0745,FLEET ST @ FINES DR (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5.588720e+06,NE
4,59934,CAMBRIDGE AVE,MILFORD CRES (WB),-104.5513,50.45041,0746,CAMBRIDGE AVE @ MILFORD CRES (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5.588807e+06,NE
...,...,...,...,...,...,...,...,...,...,...,...
995,59519,BROAD ST,LAKESHORE DR (NB),-104.60238,50.42909,0277,BROAD ST @ LAKESHORE DR (NB),{6C99A3FE-1D49-4D9F-8CF4-47EB76D3C380},528241.7908,5.586416e+06,SE
996,59520,BROAD ST,QUINN DR (NB),-104.605029,50.43435,0279,BROAD ST @ QUINN DR (NB),{FC736381-BD45-4215-9FAE-621438F00C32},528050.5326,5.587000e+06,SE
997,59521,BROAD ST,BROADWAY AVE (NB),-104.60624,50.43833,0280,BROAD ST @ BROADWAY AVE (NB),{43F34201-A67B-463E-A50B-A53D6AA58C51},527962.1839,5.587442e+06,SE
998,59522,BROAD ST,COLLEGE AVE (NB),-104.60635,50.44131,0281,BROAD ST @ COLLEGE AVE (NB),{37C536D7-5591-4DE9-8594-3D443E20D884},527952.6173,5.587773e+06,SE


In [132]:
nwStops = clean_stops[clean_stops['region'] == 'NW']
neStops = clean_stops[clean_stops['region'] == 'NE']
swStops = clean_stops[clean_stops['region'] == 'SW']
seStops = clean_stops[clean_stops['region'] == 'SE']

nwStops

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,region
51,59649,MCINTOSH ST,SHERWOOD DR (NB),-104.652451,50.47761,0426,MCINTOSH ST @ SHERWOOD DR (NB),{ED2D7610-F019-4346-864E-344FC3EAA727},524660.1668,5.591793e+06,NW
52,59650,MCINTOSH ST,ALCOVE PL (NB),-104.652491,50.47925,0427,MCINTOSH ST @ ALCOVE PL (NB),{7D6E76FB-E90F-4361-831B-A18E257267DA},524656.4754,5.591976e+06,NW
53,59651,MCINTOSH ST,RUPERT PL (NB),-104.652471,50.48256,0428,MCINTOSH ST @ RUPERT PL (NB),{71D3A1A2-6A4C-4CF4-83D8-213999B1730A},524656.1723,5.592344e+06,NW
54,59652,MCINTOSH ST,DALGLIESH DR (NB),-104.6525,50.48542,0429,MCINTOSH ST @ DALGLIESH DR (NB),{DD2D0B11-2CAD-44A7-94F3-B7CE3446E1F1},524652.6269,5.592662e+06,NW
55,59653,DALGLIESH DR,COOPER CR (EB),-104.64778,50.48578,0430,DALGLIESH DR @ COOPER CR (EB),{6821311C-21F6-40B8-BE18-6DA7163F3C36},524987.2864,5.592703e+06,NW
...,...,...,...,...,...,...,...,...,...,...,...
956,59480,ELPHINSTONE ST,6TH AVE (SB),-104.63119,50.45954,0229,ELPHINSTONE ST @ 6TH AVE (SB),{74ACED50-2482-4A7B-A4AE-B623E61810EB},526178.7021,5.589791e+06,NW
957,59481,7TH AVE,ELPHINSTONE ST (EB),-104.63062,50.45825,0230,7TH AVE @ ELPHINSTONE ST (EB),{215995C6-861C-4AF8-B925-BBF2C0F431CC},526219.8746,5.589648e+06,NW
958,59482,7TH AVE,ATHOL ST (EB),-104.62784,50.45824,0231,7TH AVE @ ATHOL ST (EB),{C5A3E6E9-D02F-41D9-A0A2-19FF3BD1147F},526417.2137,5.589648e+06,NW
959,59483,7TH AVE,CAMERON ST (EB),-104.624839,50.458243,0232,7TH AVE @ CAMERON ST (EB),{B8E4135D-B760-44CA-AAFD-56801D3B2499},526630.2329,5.589650e+06,NW


In [133]:
neStops

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,region
2,59932,FLEET ST,NORTH SERVICE RD (NB),-104.549126,50.448609,0744,FLEET ST @ NORTH SERVICE RD (NB),{AF32D4D7-5833-4BEB-B7B0-3DD5491E1885},532011.0863,5.588608e+06,NE
3,59933,FLEET ST,FINES DR (NB),-104.549111,50.449614,0745,FLEET ST @ FINES DR (NB),{633EBFA2-7015-478B-B732-6FC74F963853},532011.4732,5.588720e+06,NE
4,59934,CAMBRIDGE AVE,MILFORD CRES (WB),-104.5513,50.45041,0746,CAMBRIDGE AVE @ MILFORD CRES (WB),{E157B3F1-6F6F-4D21-98A0-4A928B9822A4},531855.5283,5.588807e+06,NE
5,59935,CAMBRIDGE AVE,GLENCAIRN RD (WB),-104.555519,50.450435,0747,CAMBRIDGE AVE @ GLENCAIRN RD (WB),{2F4D0044-C4D7-4424-B1C9-4B78B45A3348},531555.9842,5.588808e+06,NE
6,59936,CAMBRIDGE AVE,CAVENDISH ST (WB),-104.558621,50.45041,0748,CAMBRIDGE AVE @ CAVENDISH ST (WB),{7C30EF11-BFAF-4162-89C3-3768C41F2A8A},531335.7747,5.588804e+06,NE
...,...,...,...,...,...,...,...,...,...,...,...
912,59436,11TH AVE,CORNWALL ST (WB),-104.61238,50.45038,0180,11TH AVE @ CORNWALL ST (WB),{FEBDCE3D-E6DC-4249-89DB-227D1B6E591E},527519.1747,5.588780e+06,NE
913,59437,ALBERT ST,DEWDNEY AVE (NB),-104.61793,50.45542,0181,ALBERT ST @ DEWDNEY AVE (NB),{EA7048AC-5F34-4F61-BA13-1BE6BF660F8D},527122.2706,5.589338e+06,NE
914,59438,ALBERT ST,7TH AVE (NB),-104.617969,50.45869,0183,ALBERT ST @ 7TH AVE (NB),{F4711F94-389B-40C7-A02E-C47CFA33179F},527117.6325,5.589702e+06,NE
916,59440,ALBERT ST,4TH AVE (NB),-104.61798,50.46351,0186,ALBERT ST @ 4TH AVE (NB),{00BB03A1-EC04-41EF-B716-78829501929B},527114.0958,5.590238e+06,NE


In [134]:
swStops

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,region
11,59941,PARLIAMENT AVE,RAE ST (WB),-104.621511,50.4155,0753,PARLIAMENT AVE @ RAE ST (WB),{87A3B8CC-044D-44F7-9499-319EBDEB51D1},526890.6694,5.584898e+06,SW
12,59942,PARLIAMENT AVE,ROBINSON ST (WB),-104.62433,50.41548,0754,PARLIAMENT AVE @ ROBINSON ST (WB),{8A33130A-69B5-4670-A2B1-BE09B819B1D9},526690.3984,5.584895e+06,SW
13,59943,PARLIAMENT AVE,GARNET ST (WB),-104.62681,50.41549,0755,PARLIAMENT AVE @ GARNET ST (WB),{D630D9D6-B932-4BBA-A5E9-A1BB2B7DE274},526514.1956,5.584895e+06,SW
14,59944,MONTAGUE ST,PARLIAMENT AVE (SB),-104.62976,50.41518,0756,MONTAGUE ST @ PARLIAMENT AVE (SB),{FB1174AB-119F-4614-BFF2-8EBA29A30733},526304.7779,5.584859e+06,SW
15,59945,MONTAGUE ST,28TH AVE (SB),-104.629721,50.41352,0757,MONTAGUE ST @ 28TH AVE (SB),{D92048BC-ABE8-4756-9AAD-E49AD278455E},526308.4681,5.584675e+06,SW
...,...,...,...,...,...,...,...,...,...,...,...
790,59630,ALBERT ST,13TH AVE (NB),-104.618069,50.44611,0403,ALBERT ST @ 13TH AVE (NB),{F9DAB11D-71E1-44B1-8CDB-9DF73BA8C3AF},527117.7243,5.588303e+06,SW
981,59505,PARLIAMENT AVE,ALBERT ST (WB),-104.618919,50.41548,0259,PARLIAMENT AVE @ ALBERT ST (WB),{CF4B73BC-0B06-4328-95F9-023E9D875E70},527074.8352,5.584897e+06,SW
982,59506,PARLIAMENT AVE,RAE ST (WB),-104.62071,50.41548,0260,PARLIAMENT AVE @ RAE ST (WB),{3C5B3732-BF14-48EF-85CB-9563CC320E10},526947.5895,5.584896e+06,SW
983,59507,RAE ST,GOLDEN MILE (NB),-104.62102,50.41775,0261,RAE ST @ GOLDEN MILE (NB),{4DB85702-86EE-40E8-8C49-C30986291FA9},526924.2782,5.585148e+06,SW


In [135]:
seStops

Unnamed: 0,object_id,on_street,at_street,lon,lat,stop_id,stop_name,global_id,geometry.x,geometry.y,region
0,59930,UNIVERSITY PARK DR,QUANCE ST (NB),-104.54913,50.44416,0742,UNIVERSITY PARK DR @ QUANCE ST (NB),{05092908-E821-4704-86E0-4AA2BA573409},532013.8038,5.588113e+06,SE
1,59931,UNIVERSITY PARK DR,VIC SQUARE (NB),-104.54915,50.44592,0743,UNIVERSITY PARK DR @ VIC SQUARE (NB),{1CA33E59-8211-476E-86F5-EDEB4E8F6F76},532011.1964,5.588309e+06,SE
32,59962,GORDON RD,ALBERT ST (EB),-104.61693,50.40453,0774,GORDON RD @ ALBERT ST (EB),{27197D67-891D-4974-BAC3-CE855B0C16C5},527222.4214,5.583680e+06,SE
33,59963,GRANT RD,GORDON RD (SB),-104.61334,50.40424,0775,GRANT RD @ GORDON RD (SB),{B3CBA124-2B75-482D-956B-8B179FB18D96},527477.7075,5.583649e+06,SE
34,59964,GRANT RD,MARSH CRES (SB),-104.61335,50.4029,0776,GRANT RD @ MARSH CRES (SB),{05DD5E90-5275-4BFB-8992-F105B3983317},527477.7717,5.583500e+06,SE
...,...,...,...,...,...,...,...,...,...,...,...
995,59519,BROAD ST,LAKESHORE DR (NB),-104.60238,50.42909,0277,BROAD ST @ LAKESHORE DR (NB),{6C99A3FE-1D49-4D9F-8CF4-47EB76D3C380},528241.7908,5.586416e+06,SE
996,59520,BROAD ST,QUINN DR (NB),-104.605029,50.43435,0279,BROAD ST @ QUINN DR (NB),{FC736381-BD45-4215-9FAE-621438F00C32},528050.5326,5.587000e+06,SE
997,59521,BROAD ST,BROADWAY AVE (NB),-104.60624,50.43833,0280,BROAD ST @ BROADWAY AVE (NB),{43F34201-A67B-463E-A50B-A53D6AA58C51},527962.1839,5.587442e+06,SE
998,59522,BROAD ST,COLLEGE AVE (NB),-104.60635,50.44131,0281,BROAD ST @ COLLEGE AVE (NB),{37C536D7-5591-4DE9-8594-3D443E20D884},527952.6173,5.587773e+06,SE


In [136]:
if len(clean_stops) == len(nwStops)+len(neStops) + len(swStops) + len(seStops):
    print("True")
else:
    print("False")

True


In [137]:
# Total distance of each bus route

# convert shape_length into km
clean_routes['route_distance_km'] = clean_routes['shape_length'] / 1000 

In [138]:
clean_routes[['route_num', 'route_distance_km']]

Unnamed: 0,route_num,route_distance_km
0,10,184.989734
1,18,47.401586
2,3,180.17268
3,40,92.096531
4,60,57.978962
5,7,115.065916
6,1,664.90017
7,12,492.246799
8,24,24.62248
9,22,94.968615


In [139]:
# Approximate conversion: ~111 km per degree latitude, ~85 km per degree longitude at this latitude
# Euclidean distance
clean_stops['distance_from_center_km'] = np.sqrt(
    ((clean_stops['lat'].astype(float) - city_center_lat) * 111)**2 +  # 111 km per degree lat
    ((clean_stops['lon'].astype(float) - city_center_lon) * 85)**2     # ~85 km per degree lon at this latitude
)

print("Distance statistics (km):")
print(clean_stops['distance_from_center_km'].describe())

Distance statistics (km):
count    1000.000000
mean        3.775980
std         1.791905
min         0.098964
25%         2.319224
50%         3.785285
75%         5.068146
max         8.381130
Name: distance_from_center_km, dtype: float64


In [140]:
# Aggregation - Region Summary

region_summary = clean_stops.groupby('region').agg({
    'stop_id': 'count',
    'distance_from_center_km': ['mean', 'max']
}).reset_index()

region_summary.columns = ['region', 'num_stops', 'avg_distance_km', 'max_distance_km']
print("Stops by region:")
print(region_summary)

Stops by region:
  region  num_stops  avg_distance_km  max_distance_km
0     NE        224         4.113674         7.650399
1     NW        333         3.994900         8.381130
2     SE        264         3.926796         7.536191
3     SW        179         2.723696         5.336768


In [141]:
# Outliers
print("=== OUTLIERS ===")
print(f"Distance from center - outliers beyond 15km:")
outliers = clean_stops[clean_stops['distance_from_center_km'] > 15]
print(f"Found {len(outliers)} stops beyond 15km")
print(outliers[['stop_name', 'distance_from_center_km']].head())

=== OUTLIERS ===
Distance from center - outliers beyond 15km:
Found 0 stops beyond 15km
Empty DataFrame
Columns: [stop_name, distance_from_center_km]
Index: []


In [142]:
# Cardinalities
print("\n=== FIX - CARDINALITIES ===")
print(f"Unique stops: {merged_stops_clean['stop_id'].nunique()}")
print(f"Unique routes: {routes_gtfs_clean['route_id'].nunique()}")
print(f"Unique regions: {clean_stops['region'].nunique()}")


=== FIX - CARDINALITIES ===
Unique stops: 2224
Unique routes: 44
Unique regions: 4


In [143]:
# Reshape: Routes by Region
times_gtfs_clean['trip_id'] = times_gtfs_clean['trip_id'].astype(str)
trips_gtfs_clean['trip_id'] = trips_gtfs_clean['trip_id'].astype(str)
times_gtfs_clean['stop_id'] = times_gtfs_clean['stop_id'].astype(str)
clean_stops['stop_id'] = clean_stops['stop_id'].astype(str)
trips_gtfs_clean['route_id'] = trips_gtfs_clean['route_id'].astype(str)

route_stops = (
    times_gtfs_clean
    .merge(trips_gtfs_clean[['trip_id', 'route_id']], on='trip_id')
    .merge(clean_stops[['stop_id', 'region']], left_on='stop_id', right_on='stop_id')
    .groupby(['route_id', 'region'])
    .size()
    .reset_index(name='stop_count')
)

# Pivot wider
route_region_pivot = route_stops.pivot(
    index='route_id', 
    columns='region', 
    values='stop_count'
).fillna(0)

print("FIX - Routes by region (pivoted):")
print(route_region_pivot.head())

FIX - Routes by region (pivoted):
region       NE      NW      SE    SW
route_id                             
10-44     222.0  1966.0     0.0   0.0
10-45     210.0  1862.0     0.0   0.0
12-44       0.0  1400.0  1536.0   0.0
12-45       0.0  1400.0  1536.0   0.0
15-44      28.0     0.0   316.0  52.0


In [144]:
# Before and After Evidence

print("=" * 50)
print("BEFORE (Raw Data)")
print("=" * 50)
print(f"Stop rows: {len(df_stops)}")
print(f"Route rows: {len(df_routes)}")
print(f"Missing ATSTREET: {df_stops['attributes.ATSTREET'].isnull().sum()}")
print(f"String coordinates: {df_stops['attributes.LAT'].dtype}")

print("\n" + "=" * 50)
print("AFTER (Cleaned & Transformed)")
print("=" * 50)
print(f"Stop rows: {len(clean_stops)} (added {len(missing_stops)} from GTFS)")
print(f"Route rows: {len(clean_routes)}")
print(f"Missing ATSTREET: {clean_stops['at_street'].isnull().sum()}")
print(f"Numeric coordinates: {clean_stops['lat'].dtype}")
print(f"New features: region, distance_from_center_km")

BEFORE (Raw Data)
Stop rows: 1000
Route rows: 22
Missing ATSTREET: 1
String coordinates: object

AFTER (Cleaned & Transformed)
Stop rows: 1000 (added 1224 from GTFS)
Route rows: 22
Missing ATSTREET: 0
Numeric coordinates: object
New features: region, distance_from_center_km
