In [None]:
%pip install IPython

In [None]:
import importlib
from IPython.core.magic import register_cell_magic
from IPython import get_ipython
# Conditional cell skipping, adapted from
# https://kioku-space.com/en/jupyter-skip-execution/
@register_cell_magic
def skip_if(line, cell):
    """Cell magic: run the cell body unless the magic-line expression is truthy.

    Usage (must be the first line of a cell)::

        %%skip_if <python expression>

    Args:
        line: Text following ``%%skip_if``; evaluated as a Python expression.
        cell: Source text of the remainder of the cell.

    Returns:
        None. When the expression is falsy the cell body is executed through
        the running IPython shell.
    """
    # NOTE(review): eval() runs whatever expression is written on the magic
    # line. That is fine for hand-written notebook cells, but never feed it
    # text that comes from outside the notebook.
    if eval(line):
        return
    get_ipython().run_cell(cell)

# No explicit register_magic_function() call is needed here: the
# @register_cell_magic decorator has already registered skip_if as a cell magic.

In [None]:
%%skip_if importlib.util.find_spec('Jinja2') is not None
%pip install Jinja2

In [None]:
%%skip_if importlib.util.find_spec('nbformat') is not None
%pip install nbformat

In [None]:
import numpy as np

In [None]:
# Python packages
import pandas as pd
import json
import plotly.graph_objects as go

In [None]:
# Load the raw bus-stop export. JSON text is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
try:
    with open('raw_data/yqrStops.json', 'r', encoding='utf-8') as f:
        stop_data = json.load(f)
except json.decoder.JSONDecodeError as e:
    print("Invalid JSON", e)
    # Re-raise: swallowing the error would leave stop_data undefined and the
    # notebook would fail a line later with an unrelated NameError.
    raise

stop_data

In [None]:
# Flatten the top-level JSON object into a table. This is exploratory: the
# per-stop records are under stop_data['features'], and df_stops is rebuilt
# from that key in the following cell.
df_stops = pd.json_normalize(stop_data)
df_stops

In [None]:
# One row per entry of stop_data['features']; nested keys become dotted column
# names such as 'attributes.STOP_ID'.
df_stops = pd.json_normalize(stop_data['features'])
df_stops

In [None]:
# Quick profile of the raw stop table: dtypes of the first eight columns,
# a sample of the LAT values, per-column null counts and duplicate stop IDs.
rule = "---------------------------------"
lat_sample = df_stops['attributes.LAT'].head(2).tolist()
missing = df_stops.isnull().sum()
duplicates = df_stops.duplicated(subset=['attributes.STOP_ID']).sum()

print("Bus Stop Data Types")
print(df_stops.dtypes[df_stops.columns[:8]])
print(rule)
print(f"\nSample LAT values: {lat_sample}")
print("Bus Stop Missing Values")
# Only columns that actually contain nulls are listed.
print(missing[missing > 0])
print(rule)
print("Bus Stop Duplicates")
print(f"Duplicate stop IDs: {duplicates}")
print(rule)

In [None]:
# Show the first three raw rows for the identifying, street and coordinate columns.
print("======Sample Raw Bus Stop Data=======")
sample_columns = [
    'attributes.STOP_ID',
    'attributes.ONSTREET',
    'attributes.ATSTREET',
    'attributes.LAT',
    'attributes.LON',
]
df_stops[sample_columns].head(3)

 # Cleaning Bus Stop data using Data Wrangler extension

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df_stops):
    """Tidy the raw bus-stop table produced by ``pd.json_normalize``.

    Steps, in order:
      * trim surrounding whitespace from the text columns and upper-case the
        street and stop-name columns;
      * trim ``attributes.LAT`` / ``attributes.LON`` and convert them from
        text to numbers;
      * fill missing ``attributes.ATSTREET`` with "DOROTHY ST (SB)" and rewrite
        the ON street "1060 DOROTHY ST (SB)" to "DOROTHY ST";
      * rename the ``attributes.*`` columns to short snake_case names.

    Note: this notebook defines several functions called ``clean_data``; each
    later definition replaces the earlier one.

    Args:
        df_stops: DataFrame holding the ``attributes.*`` columns used below,
            all stored as text. The argument is not modified; the cleaning is
            done on a copy.

    Returns:
        A new DataFrame with the cleaned values and renamed columns.

    Raises:
        ValueError: if a latitude or longitude cannot be parsed as a number.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_stops = df_stops.copy()

    # Identifier columns: remove surrounding whitespace only.
    for column in ('attributes.STOP_ID', 'attributes.GLOBALID'):
        df_stops[column] = df_stops[column].str.strip()

    # Street and stop names: trim, then upper-case.
    for column in ('attributes.ONSTREET', 'attributes.ATSTREET', 'attributes.STOP_NAME'):
        df_stops[column] = df_stops[column].str.strip().str.upper()

    # Coordinates are stored as text; convert them so later cells can do
    # arithmetic on them and so the reported dtype really is numeric.
    for column in ('attributes.LON', 'attributes.LAT'):
        df_stops[column] = pd.to_numeric(df_stops[column].str.strip())

    # Replace missing values with "DOROTHY ST (SB)" in column: 'attributes.ATSTREET'
    df_stops = df_stops.fillna({'attributes.ATSTREET': "DOROTHY ST (SB)"})
    # Replace all instances of "1060 DOROTHY ST (SB)" with "DOROTHY ST" in column: 'attributes.ONSTREET'
    df_stops['attributes.ONSTREET'] = df_stops['attributes.ONSTREET'].str.replace(
        "1060 DOROTHY ST (SB)", "DOROTHY ST", case=False, regex=False
    )

    # Short snake_case names; columns absent from the frame are simply ignored.
    return df_stops.rename(columns={
        'attributes.ONSTREET': 'on_street',
        'attributes.ATSTREET': 'at_street',
        'attributes.LON': 'lon',
        'attributes.LAT': 'lat',
        'attributes.STOP_ID': 'stop_id',
        'attributes.STOP_NAME': 'stop_name',
        'attributes.GLOBALID': 'global_id',
        'attributes.OBJECTID': 'object_id',
    })

clean_stops = clean_data(df_stops.copy())
clean_stops.head()

In [None]:
# Load the raw bus-route export. JSON text is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
try:
    with open('raw_data/yqrRoutes.json', 'r', encoding='utf-8') as f:
        routes_data = json.load(f)
    print("✓ Loaded routes data")
except json.decoder.JSONDecodeError as e:
    print("Invalid JSON", e)
    # Re-raise: swallowing the error would leave routes_data undefined and the
    # normalize call below would fail with an unrelated NameError.
    raise

# One row per entry of routes_data['features'].
df_routes = pd.json_normalize(routes_data['features'])
df_routes

In [None]:
# Quick profile of the raw route table: dtypes of the first eight columns,
# per-column null counts and duplicate route IDs.
print("Bus Route Data Types")
print(df_routes.dtypes[df_routes.columns[:8]])
print("---------------------------------")
print("Bus Route Missing Values")
missing = df_routes.isnull().sum()
# Only columns that actually contain nulls are listed.
print(missing[missing > 0])
print("---------------------------------")
print("Bus Route Duplicates")
duplicates = df_routes.duplicated(subset=['attributes.ROUTE_ID']).sum()
# The check is on ROUTE_ID, so the label says routes (it previously said stops).
print(f"Duplicate route IDs: {duplicates}")
print("---------------------------------")

 # Cleaning Bus Route Data using Data Wrangler

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df_routes):
    """Tidy the raw bus-route table produced by ``pd.json_normalize``.

    Trims the route name/number/id/shape-id text columns, upper-cases the
    route name, prefixes both colour columns with '#', fills a missing text
    colour with "FFFFFF" and renames the columns to short snake_case names.

    Args:
        df_routes: DataFrame with the ``attributes.*`` / ``geometry.paths``
            columns used below. The trimmed, upper-cased and route-colour
            columns are written back onto this frame.

    Returns:
        A DataFrame with the cleaned values and renamed columns.
    """
    # Text columns that only need surrounding whitespace removed.
    for column in ('attributes.ROUTE_NAME', 'attributes.ROUTE_NUM',
                   'attributes.ROUTE_ID', 'attributes.SHAPE_ID'):
        df_routes[column] = df_routes[column].str.strip()

    # Route colour: trimmed hex value with a leading '#'.
    trimmed_colour = df_routes['attributes.ROUTE_COLOR'].str.strip()
    df_routes['attributes.ROUTE_COLOR'] = '#' + trimmed_colour.astype(str)

    df_routes['attributes.ROUTE_NAME'] = df_routes['attributes.ROUTE_NAME'].str.upper()

    # Text colour: default to "FFFFFF" when missing, then the same '#' prefix.
    df_routes = df_routes.fillna({'attributes.ROUTE_TEXT_COLOR': "FFFFFF"})
    trimmed_text_colour = df_routes['attributes.ROUTE_TEXT_COLOR'].str.strip()
    df_routes['attributes.ROUTE_TEXT_COLOR'] = "#" + trimmed_text_colour.astype(str)

    short_names = {
        'attributes.SHAPE.LEN': 'shape_length',
        'attributes.ROUTE_NAME': 'route_name',
        'attributes.ROUTE_NUM': 'route_num',
        'attributes.ROUTE_ID': 'route_id',
        'attributes.SHAPE_ID': 'shape_id',
        'attributes.ROUTE_COLOR': 'route_color',
        'attributes.ROUTE_TEXT_COLOR': 'route_text_color',
        'geometry.paths': 'geometry_paths',
        'attributes.OBJECTID': 'object_id',
    }
    return df_routes.rename(columns=short_names)

clean_routes = clean_data(df_routes.copy())
clean_routes.head()

 # Loading GTFS Data

In [None]:
# Read the four GTFS text tables used below; all live in the same directory.
gtfs_dir = 'raw_data/gtfs_data'
stops_gtfs = pd.read_csv(f'{gtfs_dir}/stops.txt')
routes_gtfs = pd.read_csv(f'{gtfs_dir}/routes.txt')
trips_gtfs = pd.read_csv(f'{gtfs_dir}/trips.txt')
times_gtfs = pd.read_csv(f'{gtfs_dir}/stop_times.txt')

In [None]:
stops_gtfs

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(stops_gtfs):
    """Upper-case and trim ``stop_name`` in place, then return the same frame.

    Args:
        stops_gtfs: DataFrame with a text ``stop_name`` column.

    Returns:
        The frame that was passed in, with ``stop_name`` normalized.
    """
    normalized_names = stops_gtfs['stop_name'].str.upper().str.strip()
    stops_gtfs['stop_name'] = normalized_names
    return stops_gtfs

stops_gtfs_clean = clean_data(stops_gtfs.copy())
stops_gtfs_clean.head()

In [None]:
routes_gtfs

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(routes_gtfs):
    """Upper-case and trim ``route_long_name`` in place, then return the same frame.

    Args:
        routes_gtfs: DataFrame with a text ``route_long_name`` column.

    Returns:
        The frame that was passed in, with ``route_long_name`` normalized.
    """
    normalized_names = routes_gtfs['route_long_name'].str.upper().str.strip()
    routes_gtfs['route_long_name'] = normalized_names
    return routes_gtfs

routes_gtfs_clean = clean_data(routes_gtfs.copy())
routes_gtfs_clean.head()

In [None]:
trips_gtfs

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(trips_gtfs):
    """Normalize the text columns of the trips table in place.

    ``route_id`` is only trimmed; ``service_id``, ``trip_id`` and
    ``trip_headsign`` are trimmed and then upper-cased.

    Args:
        trips_gtfs: DataFrame with those four text columns.

    Returns:
        The frame that was passed in, with the columns normalized.
    """
    uppercased = ('service_id', 'trip_id', 'trip_headsign')
    for column in ('route_id',) + uppercased:
        trimmed = trips_gtfs[column].str.strip()
        trips_gtfs[column] = trimmed.str.upper() if column in uppercased else trimmed
    return trips_gtfs

trips_gtfs_clean = clean_data(trips_gtfs.copy())
trips_gtfs_clean.head()

In [None]:
times_gtfs

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(times_gtfs):
    """Normalize ``trip_id`` and the time-of-day text columns of stop_times.

    ``trip_id`` is upper-cased and trimmed. ``arrival_time`` and
    ``departure_time`` are trimmed first, and then any hour field of 24 or
    more is wrapped onto a 0-23 clock (24 -> 00, 25 -> 01, ...). GTFS allows
    such hours for service that runs past midnight, and a plain ``%H:%M:%S``
    parse would otherwise reject them. Hours below 24 are left exactly as
    written.

    Args:
        times_gtfs: DataFrame with text ``trip_id``, ``arrival_time`` and
            ``departure_time`` columns. These columns are rewritten in place.

    Returns:
        The frame that was passed in, with the three columns normalized.
    """
    def _wrap_hour(match):
        # Leave ordinary hours byte-for-byte alone; wrap 24+ into 00-23.
        hour_text = match.group(0)
        hour = int(hour_text)
        return hour_text if hour < 24 else f"{hour % 24:02d}"

    times_gtfs['trip_id'] = times_gtfs['trip_id'].str.upper().str.strip()

    for column in ('arrival_time', 'departure_time'):
        # Strip before matching so a padded value such as ' 24:05:00' is
        # still recognized by the start-of-string anchor.
        trimmed = times_gtfs[column].str.strip()
        times_gtfs[column] = trimmed.str.replace(r'^\d+(?=:)', _wrap_hour, regex=True)

    return times_gtfs

times_gtfs_clean = clean_data(times_gtfs.copy())
times_gtfs_clean.head()

In [None]:
# Parse the cleaned time strings. Values that do not match HH:MM:SS are
# coerced to NaT rather than raising.
for event in ('arrival', 'departure'):
    times_gtfs_clean[f'{event}_datetime'] = pd.to_datetime(
        times_gtfs_clean[f'{event}_time'],
        format='%H:%M:%S',
        errors='coerce',
    )

# Derive hour and minute of day from the parsed values.
for event in ('arrival', 'departure'):
    parsed = times_gtfs_clean[f'{event}_datetime'].dt
    times_gtfs_clean[f'{event}_hour'] = parsed.hour
    times_gtfs_clean[f'{event}_minute'] = parsed.minute

print("Sample parsed times:")
preview_columns = [
    'arrival_time', 'arrival_datetime', 'arrival_hour', 'arrival_minute',
    'departure_datetime', 'departure_time', 'departure_hour', 'departure_minute',
]
times_gtfs_clean[preview_columns].head(10)

 # Imputation
 ## Find missing bus stops

In [None]:
# Compare the number of stops in the city export with the GTFS feed.
n_city_stops, n_gtfs_stops = len(clean_stops), len(stops_gtfs_clean)
print(n_city_stops)
print(n_gtfs_stops)
if n_gtfs_stops > n_city_stops:
    print(f"There are {n_gtfs_stops - n_city_stops} missing stops")

In [None]:
# Cast stop_id to str in both tables so the membership test below compares
# like with like.
for frame in (clean_stops, stops_gtfs_clean):
    frame['stop_id'] = frame['stop_id'].astype(str)

In [None]:
# GTFS stops whose stop_id does not appear in the city stop table.
already_listed = stops_gtfs_clean['stop_id'].isin(clean_stops['stop_id'])
missing_stops = stops_gtfs_clean[~already_listed]
len(missing_stops)

In [None]:
# Append the GTFS-only stops beneath the city stops with a fresh 0..n-1 index.
# Columns that exist in only one source are kept and are NaN for the other's rows.
# NOTE(review): if the GTFS table names its coordinates differently from
# 'lat'/'lon', the appended rows will have no values in those columns - confirm.
merged_stops = pd.concat((clean_stops, missing_stops), sort=False, ignore_index=True)
merged_stops

In [None]:
# Report how many GTFS-only stops were appended, then preview the merged table.
print(f"{len(missing_stops)} new stops added")
merged_stops.head()

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(merged_stops):
    """Impute missing ``on_street`` / ``at_street`` from ``stop_name``.

    A stop name of the form "ON STREET @ AT STREET" is split on '@': the part
    before the first '@' fills a missing ``on_street`` and the part after the
    last '@' fills a missing ``at_street`` (both trimmed). A name without '@'
    fills both columns with the whole name. Existing street values are never
    overwritten, and a row whose ``stop_name`` is itself missing stays missing
    instead of being filled with the text "nan".

    Args:
        merged_stops: DataFrame with ``stop_name``, ``on_street`` and
            ``at_street`` columns. The two street columns are rewritten in place.

    Returns:
        The frame that was passed in, with the gaps filled where possible.
    """
    # Vectorized split replaces the row-by-row iterrows() loop.
    name_parts = merged_stops['stop_name'].str.split('@')
    on_from_name = name_parts.str[0].str.strip()
    at_from_name = name_parts.str[-1].str.strip()

    merged_stops['on_street'] = merged_stops['on_street'].fillna(on_from_name)
    merged_stops['at_street'] = merged_stops['at_street'].fillna(at_from_name)
    return merged_stops

merged_stops_clean = clean_data(merged_stops.copy())
merged_stops_clean.head()

In [None]:
# Adapted from the Plotly documentation https://plotly.com/python/tile-scatter-maps/#multiple-markers
# One marker per cleaned stop; the stop name is attached as the marker text.
stop_markers = go.Scattermap(
    mode='markers',
    lat=clean_stops['lat'],
    lon=clean_stops['lon'],
    text=clean_stops['stop_name'],
    marker=go.scattermap.Marker(size=9),
)
stop_fig = go.Figure(stop_markers)

# Initial map view: the centre coordinates and zoom level the map opens at.
map_view = dict(
    center=dict(lat=50.447992743219615, lon=-104.61228441057489),
    zoom=10,
    bearing=0,
    pitch=0,
)
stop_fig.update_layout(autosize=True, hovermode='closest', map=map_view)

In [None]:
%%skip_if importlib.util.find_spec('pyproj') is not None
%pip install pyproj

In [None]:
from pyproj import Transformer

In [None]:
# Transformer from the routes' UTM coordinates (EPSG:26913) to lon/lat
# (EPSG:4326); always_xy=True keeps the (x, y) = (lon, lat) axis order.
transformer = Transformer.from_crs("EPSG:26913", "EPSG:4326", always_xy=True)

# Pick one route (by position) and pull out the fields needed for the trace.
route_idx = 0
route_name, route_geometry, route_colour, route_text_colour = (
    clean_routes[column].iloc[route_idx]
    for column in ('route_name', 'geometry_paths', 'route_color', 'route_text_color')
)

# Flatten every path of the route into one lon list and one lat list.
all_lons, all_lats = [], []
for path in route_geometry:
    projected = [transformer.transform(point[0], point[1]) for point in path]
    all_lons.extend(lon for lon, _ in projected)
    all_lats.extend(lat for _, lat in projected)
    # A None entry ends the segment so consecutive paths are not joined by a line.
    all_lons.append(None)
    all_lats.append(None)

# The whole route is added as a single line trace on top of the stop markers.
route_trace = go.Scattermap(
    mode='lines',
    lon=all_lons,
    lat=all_lats,
    name=route_name,
    line=dict(width=3, color=route_colour),
    hovertemplate=f'<b>{route_name}</b><extra></extra>',
)
stop_fig.add_trace(route_trace)

stop_fig.show()

In [None]:
# Transformation: Bus Stops by Region
# Each stop is labelled with the quadrant (NE / NW / SE / SW) it falls in
# relative to a fixed city-centre point.

city_center_lon = -104.618
city_center_lat = 50.447

# Vectorized comparison. The previous loop read rows by position (.iloc) but
# wrote them by label (.at), which is only correct on a 0..n-1 index.
is_north = clean_stops['lat'].astype(float) > city_center_lat
is_east = clean_stops['lon'].astype(float) > city_center_lon

# Stops exactly on a dividing line (or with a missing coordinate) fail the
# strict '>' test and therefore fall on the south / west side, as before.
clean_stops['region'] = np.select(
    [is_north & is_east, is_north & ~is_east, ~is_north & is_east],
    ["NE", "NW", "SE"],
    default="SW",
)

clean_stops

In [None]:
# One sub-table per quadrant, selected on the 'region' label.
stop_region = clean_stops['region']
nwStops = clean_stops.loc[stop_region.eq('NW')]
neStops = clean_stops.loc[stop_region.eq('NE')]
swStops = clean_stops.loc[stop_region.eq('SW')]
seStops = clean_stops.loc[stop_region.eq('SE')]

nwStops

In [None]:
neStops

In [None]:
swStops

In [None]:
seStops

In [None]:
# Sanity check: the four quadrant tables together should account for every stop.
rows_in_regions = sum(len(part) for part in (nwStops, neStops, swStops, seStops))
print("True" if len(clean_stops) == rows_in_regions else "False")

In [None]:
# Total distance of each bus route

# shape_length divided by 1000 to express it in km
# (assumes shape_length is stored in metres - TODO confirm against the source data).
clean_routes['route_distance_km'] = clean_routes['shape_length'].div(1000)

In [None]:
clean_routes[['route_num', 'route_distance_km']]

In [None]:
# Straight-line distance of every stop from the city centre, using a flat-earth
# approximation: degree differences are scaled to km and combined with Pythagoras.
# A degree of latitude is ~111 km everywhere. A degree of longitude shrinks with
# the cosine of the latitude, so it is derived from city_center_lat instead of
# being hard-coded (at ~50.4 degrees north it is about 71 km, not 85 km).
km_per_deg_lat = 111
km_per_deg_lon = 111.32 * np.cos(np.radians(city_center_lat))

clean_stops['distance_from_center_km'] = np.sqrt(
    ((clean_stops['lat'].astype(float) - city_center_lat) * km_per_deg_lat)**2 +
    ((clean_stops['lon'].astype(float) - city_center_lon) * km_per_deg_lon)**2
)

print("Distance statistics (km):")
print(clean_stops['distance_from_center_km'].describe())

In [None]:
# Aggregation - Region Summary
# Per region: number of stops plus mean and maximum distance from the centre.

stops_by_region = clean_stops.groupby('region')
region_summary = stops_by_region.agg(
    num_stops=('stop_id', 'count'),
    avg_distance_km=('distance_from_center_km', 'mean'),
    max_distance_km=('distance_from_center_km', 'max'),
).reset_index()

print("Stops by region:")
print(region_summary)

In [None]:
# Outliers
# Stops further than the threshold distance from the city centre.
outlier_threshold_km = 15
is_far = clean_stops['distance_from_center_km'] > outlier_threshold_km
outliers = clean_stops[is_far]

print("=== OUTLIERS ===")
print("Distance from center - outliers beyond 15km:")
print(f"Found {len(outliers)} stops beyond 15km")
print(outliers[['stop_name', 'distance_from_center_km']].head())

In [None]:
# Cardinalities
# Number of distinct stops, routes and regions in the cleaned tables.
print("\n=== FIX - CARDINALITIES ===")
# Previously written as ['stop_id'].nunique(), which calls nunique() on a plain
# list and raises AttributeError; the column has to be taken from clean_stops.
print(f"Unique stops: {clean_stops['stop_id'].nunique()}")
print(f"Unique routes: {routes_gtfs_clean['route_id'].nunique()}")
print(f"Unique regions: {clean_stops['region'].nunique()}")

In [None]:
# Reshape: Routes by Region
# Join keys are cast to str on every table so the merges compare like with like.
times_gtfs_clean['trip_id'] = times_gtfs_clean['trip_id'].astype(str)
trips_gtfs_clean['trip_id'] = trips_gtfs_clean['trip_id'].astype(str)
times_gtfs_clean['stop_id'] = times_gtfs_clean['stop_id'].astype(str)
clean_stops['stop_id'] = clean_stops['stop_id'].astype(str)
trips_gtfs_clean['route_id'] = trips_gtfs_clean['route_id'].astype(str)

# stop_times -> trips gives the route of every scheduled stop event. Each
# (route, stop) pair is then kept once, so stop_count is the number of distinct
# stops a route serves in a region rather than the number of scheduled visits
# (which grows with how many trips the route runs).
route_stops = (
    times_gtfs_clean[['trip_id', 'stop_id']]
    .merge(trips_gtfs_clean[['trip_id', 'route_id']], on='trip_id')
    .drop_duplicates(subset=['route_id', 'stop_id'])
    .merge(clean_stops[['stop_id', 'region']], on='stop_id')
    .groupby(['route_id', 'region'])
    .size()
    .reset_index(name='stop_count')
)

# Pivot wider: one row per route, one column per region; a route with no
# stops in a region gets 0.
route_region_pivot = route_stops.pivot(
    index='route_id',
    columns='region',
    values='stop_count'
).fillna(0)

print("FIX - Routes by region (pivoted):")
print(route_region_pivot.head())

In [None]:
# Before and After Evidence
# Row counts, missing-value counts and coordinate dtype for the raw tables
# versus the cleaned ones.

print("=" * 50)
print("BEFORE (Raw Data)")
print("=" * 50)
print(f"Stop rows: {len(df_stops)}")
print(f"Route rows: {len(df_routes)}")
print(f"Missing ATSTREET: {df_stops['attributes.ATSTREET'].isnull().sum()}")
print(f"String coordinates: {df_stops['attributes.LAT'].dtype}")

print("\n" + "=" * 50)
print("AFTER (Cleaned & Transformed)")
print("=" * 50)
# The stops added from GTFS live in merged_stops_clean, not clean_stops, so the
# row count and the remaining-missing figure are taken from the merged table.
print(f"Stop rows: {len(merged_stops_clean)} (added {len(missing_stops)} from GTFS)")
print(f"Route rows: {len(clean_routes)}")
print(f"Missing ATSTREET: {merged_stops_clean['at_street'].isnull().sum()}")
print(f"Numeric coordinates: {clean_stops['lat'].dtype}")
print("New features: region, distance_from_center_km")