In [None]:
# Install IPython into the kernel's environment; the next cell imports its magic-registration API.
%pip install IPython

In [None]:
import importlib
from IPython.core.magic import register_cell_magic
from IPython import get_ipython
# Conditional cell skipping, adapted from
# https://kioku-space.com/en/jupyter-skip-execution/
@register_cell_magic
def skip_if(line, cell):
    """Run ``cell`` unless the expression on the magic line is truthy.

    Usage::

        %%skip_if <python expression>
        <cell body>

    The ``@register_cell_magic`` decorator already registers this function
    under its own name, so no separate ``register_magic_function`` call is
    needed.

    Args:
        line: Python expression written after ``%%skip_if``.
        cell: Source text of the cell body.

    Returns:
        None. The cell body is executed through ``run_cell`` when the
        expression is falsy and is silently skipped otherwise.
    """
    shell = get_ipython()
    # SECURITY NOTE: eval() executes arbitrary code. That is acceptable here
    # because the expression is typed by the notebook author; never feed it
    # text from an external source.
    # Evaluate in the interactive namespace so names imported in earlier cells
    # (e.g. ``importlib``) resolve no matter where this function is defined.
    if eval(line, shell.user_ns):
        return
    shell.run_cell(cell)

In [None]:
%%skip_if importlib.util.find_spec('Jinja2') is not None
%pip install Jinja2

In [None]:
%%skip_if importlib.util.find_spec('nbformat') is not None
# Runs only when `nbformat` cannot be found in the kernel's environment.
%pip install nbformat

In [None]:
%%skip_if importlib.util.find_spec('pyproj') is not None
# Runs only when `pyproj` cannot be found in the kernel's environment.
%pip install pyproj

In [None]:
import mercury as mr
import nbformat

##PythonLibraries

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import json
import plotly.graph_objects as go
from pyproj import Transformer

In [None]:
# Mercury app metadata; show_code=False is passed so the app is created with code display turned off.
app = mr.App(
    show_code=False,
    title="CS 365: Transit Regina Data Wrangling",
    description="Cleaning and transforming Transit Regina data for analysis",
)

 # CS 365 Final Project: Transit Regina Data Wrangling

 **Data Source:** City of Regina Open Data Portal
 **Dataset:** Transit Stops, Routes, and GTFS Schedule Data
 **Date:** December 2025

 ## Dataset Source & License

 **Source:** [City of Regina Open Data Portal](https://open.regina.ca)

 **Files:**
 - Bus Stop Locations (yqrStops20251120.json) - 1,000 stops
 - Transit Routes (yqrRoutes20251120.json) - 22 routes
 - GTFS Schedule Data (stops.txt, routes.txt, trips.txt, stop_times.txt)

 **License:** Open Government License - Regina
 ✅ Allows educational and commercial use
 ✅ No PII - only public infrastructure data

 **Why this matters:** Understanding transit accessibility and service patterns supports urban planning and public transit analysis.

 ## Raw Data Snapshot: Bus Stops

In [None]:
# Render the stop-loading code as a fenced Markdown snippet; nothing inside the string is executed.
# NOTE(review): the snippet opens 'raw_data/yqrStops20251120.json', while the cell that actually
# loads the data opens 'raw_data/yqrStops.json' — confirm which file name is correct.
mr.Markdown("""
```python
# Load JSON data
with open('raw_data/yqrStops20251120.json', 'r') as f:
    stop_data = json.load(f)

# Normalize nested JSON structure
df_stops = pd.json_normalize(stop_data['features'])
```
""")

In [None]:
# Load stop data.
# A malformed file is reported and then re-raised: carrying on would only fail
# a few lines later with a NameError, because `stop_data` was never assigned.
try:
    with open('raw_data/yqrStops.json', 'r', encoding='utf-8') as f:
        stop_data = json.load(f)
except json.JSONDecodeError as e:
    print("Invalid JSON", e)
    raise

# Flatten each entry of the 'features' list into one row; nested keys become
# dotted column names such as 'attributes.STOP_ID'.
df_stops = pd.json_normalize(stop_data['features'])

mr.Markdown(f"**Loaded {len(df_stops)} bus stops**")
df_stops[['attributes.STOP_ID','attributes.ONSTREET', 'attributes.ATSTREET', 'attributes.LAT', 'attributes.LON']].head(3)

 ## Data Profiling: Quality Assessment

In [None]:
# Render the checklist of data-quality questions that the following profiling cells go through.
mr.Markdown("""
### Checking data quality issues:
- **Data types** - Are coordinates stored correctly?
- **Missing values** - Which columns have gaps?
- **Duplicates** - Any duplicate stop IDs?
- **Outliers** - Any stops in unexpected locations?
- **Cardinalities** - How many unique values?
""")

In [None]:
# Report the dtypes of the first eight columns of the raw stop frame.
leading_columns = df_stops.columns[:8]
print("=== BUS STOP DATA TYPES ===")
print(df_stops.dtypes[leading_columns])

In [None]:
# Count nulls per column and print only the columns that have at least one.
missing = df_stops.isna().sum()
print("\n=== MISSING VALUES ===")
print(missing.loc[missing.gt(0)])

In [None]:
# Count rows whose STOP_ID repeats an earlier row's STOP_ID.
print("\n=== DUPLICATES ===")
repeated_stop_id = df_stops['attributes.STOP_ID'].duplicated()
duplicates = repeated_stop_id.sum()
print(f"Duplicate stop IDs: {duplicates}")

In [None]:
# Show the first three raw rows for the columns the cleaning step will touch.
print("\n=== SAMPLE RAW DATA (showing quality issues) ===")
print("Notice: Mixed case, whitespace, coordinates stored as strings")
df_stops.loc[
    :,
    ['attributes.STOP_ID', 'attributes.ONSTREET', 'attributes.ATSTREET', 'attributes.LAT'],
].head(3)

 ## Cleaning Step 1: Text Standardization & Type Conversions

In [None]:
# Render the problems found during profiling and an excerpt of the cleaning code.
# NOTE(review): the snippet shows only part of what the executed `clean_data` does; it omits the
# stripping of LON/LAT/STOP_ID/STOP_NAME/GLOBALID and the column renames.
mr.Markdown("""
**Problems identified:**
- Inconsistent text formatting (mixed case, whitespace)
- Coordinates stored as strings instead of numeric
- Missing values in street names

**Solutions:**
```python
# Text standardization
df_stops['attributes.ONSTREET'] = df_stops['attributes.ONSTREET'].str.strip().str.upper()
df_stops['attributes.ATSTREET'] = df_stops['attributes.ATSTREET'].str.strip().str.upper()

# Type conversion
df_stops = df_stops.astype({'attributes.STOP_ID': 'int32'})

# Missing value imputation
df_stops = df_stops.fillna({'attributes.ATSTREET': "DOROTHY ST (SB)"})

# Data correction
df_stops['attributes.ONSTREET'] = df_stops['attributes.ONSTREET'].str.replace(
    "1060 DOROTHY ST (SB)", "DOROTHY ST", regex=False
)
```
""")

In [None]:
# Cleaning function for the geographic bus-stop frame
def clean_data(df_stops):
    """Standardise the raw bus-stop frame and give it snake_case column names.

    Steps, in order: strip whitespace from the text columns (street and stop
    names are also upper-cased), fill missing ``ATSTREET`` values with
    "DOROTHY ST (SB)", rewrite the "1060 DOROTHY ST (SB)" on-street value to
    "DOROTHY ST", cast ``STOP_ID`` to int32 and rename the
    ``attributes.*`` columns.

    Args:
        df_stops: Frame produced by ``pd.json_normalize`` whose
            ``attributes.*`` text columns hold strings. The stripped and
            upper-cased columns are written back onto this object, so pass a
            copy to keep the raw data untouched.

    Returns:
        A new DataFrame with the cleaned, renamed columns.
    """
    uppercase_columns = {
        'attributes.ONSTREET',
        'attributes.ATSTREET',
        'attributes.STOP_NAME',
    }
    text_columns = (
        'attributes.ONSTREET',
        'attributes.ATSTREET',
        'attributes.LON',
        'attributes.LAT',
        'attributes.STOP_ID',
        'attributes.STOP_NAME',
        'attributes.GLOBALID',
    )
    for column in text_columns:
        trimmed = df_stops[column].str.strip()
        df_stops[column] = trimmed.str.upper() if column in uppercase_columns else trimmed

    filled = df_stops.fillna({'attributes.ATSTREET': "DOROTHY ST (SB)"})
    filled['attributes.ONSTREET'] = filled['attributes.ONSTREET'].str.replace(
        "1060 DOROTHY ST (SB)", "DOROTHY ST", case=False, regex=False
    )
    typed = filled.astype({'attributes.STOP_ID': 'int32'})

    # Rename for clarity
    friendly_names = {
        'attributes.ONSTREET': 'on_street',
        'attributes.ATSTREET': 'at_street',
        'attributes.LON': 'lon',
        'attributes.LAT': 'lat',
        'attributes.STOP_ID': 'stop_id',
        'attributes.STOP_NAME': 'stop_name',
        'attributes.GLOBALID': 'global_id',
        'attributes.OBJECTID': 'object_id',
    }
    return typed.rename(columns=friendly_names)

# Clean a copy so the raw `df_stops` frame stays available for the before/after comparison.
clean_stops = clean_data(df_stops.copy())

mr.Markdown(f"✅ **Cleaned {len(clean_stops)} bus stops**")
clean_stops.loc[:, ['stop_id', 'on_street', 'at_street', 'lat', 'lon']].head(3)

 ## Loading & Cleaning Routes Data

In [None]:
# Render an excerpt of the route loading and cleaning code; nothing inside the string is executed.
mr.Markdown("""
```python
# Load routes data
with open('raw_data/yqrRoutes.json', 'r') as f:
    routes_data = json.load(f)

df_routes = pd.json_normalize(routes_data['features'])

# Clean and standardize
df_routes['attributes.ROUTE_NAME'] = df_routes['attributes.ROUTE_NAME'].str.strip().str.upper()
df_routes['attributes.ROUTE_COLOR'] = '#' + df_routes['attributes.ROUTE_COLOR'].str.strip()
```
""")

In [None]:
# Load routes.
# A malformed file is reported and then re-raised: carrying on would only fail
# a few lines later with a NameError, because `routes_data` was never assigned.
try:
    with open('raw_data/yqrRoutes.json', 'r', encoding='utf-8') as f:
        routes_data = json.load(f)
except json.JSONDecodeError as e:
    print("Invalid JSON", e)
    raise

# Flatten each entry of the 'features' list into one row; nested keys become
# dotted column names such as 'attributes.ROUTE_ID'.
df_routes = pd.json_normalize(routes_data['features'])

# Cleaning function for the routes frame
def clean_data(df_routes):
    """Standardise the raw routes frame and give it snake_case column names.

    Route names are stripped and upper-cased, route numbers and ids are
    stripped, both colour columns are stripped and prefixed with '#', and a
    missing text colour falls back to "FFFFFF".

    Args:
        df_routes: Frame produced by ``pd.json_normalize`` whose route text
            columns hold strings. Name, number, id and route colour are
            written back onto this object, so pass a copy to keep the raw
            data untouched.

    Returns:
        A new DataFrame with the cleaned, renamed columns.
    """
    name_column = 'attributes.ROUTE_NAME'
    df_routes[name_column] = df_routes[name_column].str.strip().str.upper()
    for id_column in ('attributes.ROUTE_NUM', 'attributes.ROUTE_ID'):
        df_routes[id_column] = df_routes[id_column].str.strip()

    line_colour = df_routes['attributes.ROUTE_COLOR'].str.strip().astype(str)
    df_routes['attributes.ROUTE_COLOR'] = '#' + line_colour

    # Default the text colour before the '#' prefix is added.
    defaulted = df_routes.fillna({'attributes.ROUTE_TEXT_COLOR': "FFFFFF"})
    text_colour = defaulted['attributes.ROUTE_TEXT_COLOR'].str.strip().astype(str)
    defaulted['attributes.ROUTE_TEXT_COLOR'] = "#" + text_colour

    friendly_names = {
        'attributes.SHAPE.LEN': 'shape_length',
        'attributes.ROUTE_NAME': 'route_name',
        'attributes.ROUTE_NUM': 'route_num',
        'attributes.ROUTE_ID': 'route_id',
        'attributes.ROUTE_COLOR': 'route_color',
        'attributes.ROUTE_TEXT_COLOR': 'route_text_color',
        'geometry.paths': 'geometry_paths',
        'attributes.OBJECTID': 'object_id',
    }
    return defaulted.rename(columns=friendly_names)

# Clean a copy so the raw `df_routes` frame stays available for the before/after comparison.
clean_routes = clean_data(df_routes.copy())

mr.Markdown(f"✅ **Cleaned {len(clean_routes)} routes**")
clean_routes.loc[:, ['route_num', 'route_name', 'route_color']].head(3)

 ## Profiling Routes Data

In [None]:
# Profile the cleaned routes: dtypes of the first six columns, null counts, duplicate route ids.
print("=== ROUTE DATA TYPES ===")
leading_route_columns = clean_routes.columns[:6]
print(clean_routes.dtypes[leading_route_columns])

print("\n=== MISSING VALUES ===")
missing = clean_routes.isna().sum()
columns_with_gaps = missing[missing > 0]
if len(columns_with_gaps) > 0:
    print(columns_with_gaps)
else:
    print("No missing values")

print("\n=== DUPLICATES ===")
duplicates = clean_routes['route_id'].duplicated().sum()
print(f"Duplicate route IDs: {duplicates}")

 ## Transformation 1: Loading GTFS Schedule Data

In [None]:
# Render a short introduction to GTFS and an excerpt of the loading/cleaning code (not executed).
mr.Markdown("""
**GTFS (General Transit Feed Specification)** provides detailed schedule information:

```python
# Load GTFS files
stops_gtfs = pd.read_csv('raw_data/gtfs_data/stops.txt')
routes_gtfs = pd.read_csv('raw_data/gtfs_data/routes.txt')
trips_gtfs = pd.read_csv('raw_data/gtfs_data/trips.txt')
times_gtfs = pd.read_csv('raw_data/gtfs_data/stop_times.txt')

# Clean and standardize
stops_gtfs['stop_name'] = stops_gtfs['stop_name'].str.upper().str.strip()
routes_gtfs['route_long_name'] = routes_gtfs['route_long_name'].str.upper().str.strip()
```
""")

In [None]:
# Load GTFS data
# Each file is read as CSV with pandas' default dtype inference; no dtypes are forced here.
stops_gtfs = pd.read_csv('raw_data/gtfs_data/stops.txt')
routes_gtfs = pd.read_csv('raw_data/gtfs_data/routes.txt')
trips_gtfs = pd.read_csv('raw_data/gtfs_data/trips.txt')
times_gtfs = pd.read_csv('raw_data/gtfs_data/stop_times.txt')

# Cleaning function for GTFS stops
def clean_data(stops_gtfs):
    """Upper-case and strip ``stop_name`` and cast ``stop_id`` to int32.

    Args:
        stops_gtfs: GTFS ``stops`` frame. ``stop_name`` is rewritten on this
            object, so pass a copy to keep the raw data untouched.

    Returns:
        A new DataFrame in which ``stop_id`` has dtype int32.
    """
    tidy_names = stops_gtfs['stop_name'].str.upper().str.strip()
    stops_gtfs['stop_name'] = tidy_names
    return stops_gtfs.astype({'stop_id': 'int32'})
# Clean a copy so the raw `stops_gtfs` frame is left untouched.
stops_gtfs_clean = clean_data(stops_gtfs.copy())

# Cleaning function for GTFS routes
def clean_data(routes_gtfs):
    """Upper-case and strip ``route_long_name`` in place.

    Args:
        routes_gtfs: GTFS ``routes`` frame; modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    column = 'route_long_name'
    long_names = routes_gtfs[column]
    routes_gtfs[column] = long_names.str.upper().str.strip()
    return routes_gtfs
# Clean a copy so the raw `routes_gtfs` frame is left untouched.
routes_gtfs_clean = clean_data(routes_gtfs.copy())

# Cleaning function for GTFS trips
def clean_data(trips_gtfs):
    """Strip the id and headsign columns of a GTFS ``trips`` frame in place.

    ``route_id`` is only stripped; ``service_id``, ``trip_id`` and
    ``trip_headsign`` are stripped and upper-cased.

    Args:
        trips_gtfs: GTFS ``trips`` frame whose four columns hold strings;
            modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    trips_gtfs['route_id'] = trips_gtfs['route_id'].str.strip()
    for column in ('service_id', 'trip_id', 'trip_headsign'):
        trimmed = trips_gtfs[column].str.strip()
        trips_gtfs[column] = trimmed.str.upper()
    return trips_gtfs
# Clean a copy so the raw `trips_gtfs` frame is left untouched.
trips_gtfs_clean = clean_data(trips_gtfs.copy())

# Clean GTFS stop times
def clean_data(times_gtfs):
    """Normalise trip ids and clock times of a GTFS ``stop_times`` frame.

    GTFS allows times past midnight ("24:10:00", "25:30:00", ...) for trips
    that started on the previous service day. Hours of 24 or more are wrapped
    modulo 24 so that every value can later be parsed with ``%H:%M:%S``; the
    "next day" information is lost by this wrap.

    Args:
        times_gtfs: DataFrame with string columns ``trip_id``,
            ``arrival_time`` and ``departure_time``; modified in place.

    Returns:
        The same DataFrame object with the three columns cleaned.
    """
    def _wrap_hour(match):
        hour = int(match.group(1))
        # Leave ordinary hours exactly as written; only rewrite hours >= 24.
        return match.group(0) if hour < 24 else f"{hour % 24:02d}:"

    times_gtfs['trip_id'] = times_gtfs['trip_id'].str.upper().str.strip()
    for column in ('arrival_time', 'departure_time'):
        # Strip first: the hour pattern is anchored at the start of the
        # string, so leading whitespace would otherwise hide a late hour.
        times_gtfs[column] = (
            times_gtfs[column]
            .str.strip()
            .str.replace(r'^(\d+):', _wrap_hour, regex=True)
        )
    return times_gtfs
# Clean a copy of the stop-times frame, then report the row count of each cleaned GTFS table.
times_gtfs_clean = clean_data(times_gtfs.copy())

mr.Markdown(f"""
**GTFS Data Loaded:**
- {len(stops_gtfs_clean)} stops
- {len(routes_gtfs_clean)} routes
- {len(trips_gtfs_clean)} trips
- {len(times_gtfs_clean)} stop times
""")

 ## Transformation 2: Parsing Date/Time Data

In [None]:
# Render the motivation for time parsing and an excerpt of the parsing code (not executed).
mr.Markdown("""
**Problem:** Time data stored as strings (HH:MM:SS)

**Solution:** Parse to datetime and derive time-based features

```python
# Parse time columns
times_gtfs_clean['arrival_datetime'] = pd.to_datetime(
    times_gtfs_clean['arrival_time'], 
    format='%H:%M:%S',
    errors='coerce'
)

# Derive hour and minute features
times_gtfs_clean['arrival_hour'] = times_gtfs_clean['arrival_datetime'].dt.hour
times_gtfs_clean['arrival_minute'] = times_gtfs_clean['arrival_datetime'].dt.minute
```
""")

In [None]:
# Parse the HH:MM:SS strings into datetimes; values that do not match the format become NaT.
time_columns = (
    ('arrival_time', 'arrival_datetime'),
    ('departure_time', 'departure_datetime'),
)
for text_column, parsed_column in time_columns:
    times_gtfs_clean[parsed_column] = pd.to_datetime(
        times_gtfs_clean[text_column], format='%H:%M:%S', errors='coerce'
    )

# Derive hour and minute of arrival
arrival_parts = times_gtfs_clean['arrival_datetime'].dt
times_gtfs_clean['arrival_hour'] = arrival_parts.hour
times_gtfs_clean['arrival_minute'] = arrival_parts.minute

mr.Markdown("✅ **Parsed time data and derived hour/minute features**")
times_gtfs_clean.loc[:, ['arrival_time', 'arrival_datetime', 'arrival_hour', 'arrival_minute']].head(3)

 ## Transformation 3: Merge/Join Operations

In [None]:
# Render an excerpt of the merge and street-name imputation code (not executed).
mr.Markdown("""
**Identifying missing stops:** GTFS data contains more stops than geographic data

```python
# Find stops in GTFS but not in geographic data
missing_stops = stops_gtfs_clean[
    ~stops_gtfs_clean['stop_id'].isin(clean_stops['stop_id'])
]

# Merge datasets
merged_stops = pd.concat([clean_stops, missing_stops], ignore_index=True)

# Impute missing street names from stop_name
for index, stop in merged_stops.iterrows():
    if pd.isna(stop['on_street']):
        merged_stops.at[index, 'on_street'] = stop['stop_name'].split(' @')[0]
    if pd.isna(stop['at_street']):
        merged_stops.at[index, 'at_street'] = stop['stop_name'].split('@ ')[-1]
```
""")

In [None]:
# Make sure join keys have same type: both stop_id columns are cast to str in place.
for frame in (clean_stops, stops_gtfs_clean):
    frame['stop_id'] = frame['stop_id'].astype(str)

# Find stops that appear in GTFS but not in the geographic data
geographic_ids = clean_stops['stop_id']
already_known = stops_gtfs_clean['stop_id'].isin(geographic_ids)
missing_stops = stops_gtfs_clean[~already_known]

mr.Markdown(f"**Found {len(missing_stops)} stops in GTFS not in geographic data**")

# Merge: append the GTFS-only stops below the geographic stops with a fresh index
merged_stops = pd.concat(
    [clean_stops, missing_stops],
    ignore_index=True,
    sort=False,
)

# Impute missing values
def clean_data(merged_stops):
    """Fill missing ``on_street`` / ``at_street`` values from ``stop_name``.

    The text before the first ' @' of the stop name becomes the on-street and
    the text after the last '@ ' becomes the at-street; a name without '@' is
    used unchanged for both. Rows whose ``stop_name`` is itself missing stay
    missing instead of receiving the literal text 'nan'.

    Args:
        merged_stops: DataFrame with ``stop_name``, ``on_street`` and
            ``at_street`` columns; modified in place.

    Returns:
        The same DataFrame object with the two street columns filled where
        possible.
    """
    def _piece(name, separator, position):
        # Propagate missing names rather than stringifying them.
        if pd.isna(name):
            return name
        return str(name).split(separator)[position]

    names = merged_stops['stop_name']
    fallbacks = {
        'on_street': names.map(lambda name: _piece(name, ' @', 0)),
        'at_street': names.map(lambda name: _piece(name, '@ ', -1)),
    }
    for column, fallback in fallbacks.items():
        present = merged_stops[column].notna()
        # Whole-column assignment keeps existing values and lets pandas pick
        # a dtype that can hold the imputed strings.
        merged_stops[column] = merged_stops[column].where(present, fallback)
    return merged_stops

# Impute on a copy so `merged_stops` keeps its un-imputed street columns.
merged_stops_clean = clean_data(merged_stops.copy())

mr.Markdown(f"✅ **Total stops after merge: {len(merged_stops_clean)} ({len(missing_stops)} added)**")

 ## Transformation 4: Feature Derivation - Geographic Regions

In [None]:
# Render the quadrant-classification code as a snippet (not executed).
mr.Markdown("""
**Derive regional classification** based on city center coordinates:

```python
city_center_lon = -104.618
city_center_lat = 50.447

# Assign quadrants (NE, NW, SE, SW)
for stop in range(len(clean_stops)):
    if float(clean_stops['lat'].iloc[stop]) > city_center_lat:
        if float(clean_stops['lon'].iloc[stop]) > city_center_lon:
            clean_stops.at[stop, 'region'] = "NE"
        else:
            clean_stops.at[stop, 'region'] = "NW"
    else:
        if float(clean_stops['lon'].iloc[stop]) > city_center_lon:
            clean_stops.at[stop, 'region'] = "SE"
        else:
            clean_stops.at[stop, 'region'] = "SW"
```
""")

In [None]:
# Derive regions
city_center_lon = -104.618
city_center_lat = 50.447

# Vectorised quadrant assignment. The previous row loop read values by
# position (.iloc) but wrote them by label (.at), which only lines up while
# the index is the default 0..n-1 range; whole-column comparisons do not
# depend on the index at all.
# A stop exactly on a dividing line falls on the south / west side.
is_north = clean_stops['lat'].astype(float) > city_center_lat
is_east = clean_stops['lon'].astype(float) > city_center_lon
clean_stops['region'] = np.where(
    is_north,
    np.where(is_east, "NE", "NW"),
    np.where(is_east, "SE", "SW"),
)

mr.Markdown("✅ **Derived regional classifications for all stops**")

# Show distribution
region_counts = clean_stops['region'].value_counts()
mr.Markdown(f"""
**Stop distribution by region:**
- NW: {region_counts.get('NW', 0)} stops
- NE: {region_counts.get('NE', 0)} stops
- SW: {region_counts.get('SW', 0)} stops
- SE: {region_counts.get('SE', 0)} stops
""")

 ## Transformation 5: Feature Derivation - Distance Calculations

In [None]:
# Render the distance-from-centre formula as a snippet (not executed).
# NOTE(review): the snippet uses 85 km per degree of longitude; at about 50.4°N one degree of
# longitude is roughly 111 * cos(50.4°) ≈ 71 km — confirm the constant.
mr.Markdown("""
**Calculate distance from city center** using coordinate geometry:

```python
# Approximate conversion: ~111 km per degree latitude, ~85 km per degree longitude at this latitude
clean_stops['distance_from_center_km'] = np.sqrt(
    ((clean_stops['lat'].astype(float) - city_center_lat) * 111)**2 + 
    ((clean_stops['lon'].astype(float) - city_center_lon) * 85)**2
)
```
""")

In [None]:
# Flat-earth (equirectangular) approximation of the distance to the city centre.
# One degree of latitude is ~111 km everywhere, but a degree of longitude
# shrinks with cos(latitude): at 50.447°N it is ~70.7 km, not 85 km, so the
# east-west scale is derived from the centre latitude instead of hard-coded.
km_per_deg_lat = 111
km_per_deg_lon = km_per_deg_lat * np.cos(np.radians(city_center_lat))

north_south_km = (clean_stops['lat'].astype(float) - city_center_lat) * km_per_deg_lat
east_west_km = (clean_stops['lon'].astype(float) - city_center_lon) * km_per_deg_lon
clean_stops['distance_from_center_km'] = np.sqrt(north_south_km**2 + east_west_km**2)

mr.Markdown("✅ **Calculated distance from city center for all stops**")
print("\nDistance statistics (km):")
print(clean_stops['distance_from_center_km'].describe())

 ## Transformation 6: Aggregation

In [None]:
# Render the per-region aggregation code as a snippet (not executed).
mr.Markdown("""
**Aggregate stops by region** to understand service distribution:

```python
region_summary = clean_stops.groupby('region').agg({
    'stop_id': 'count',
    'distance_from_center_km': ['mean', 'max']
}).reset_index()
```
""")

In [None]:
# Per-region summary: number of stops plus mean and maximum distance from the city centre.
region_summary = (
    clean_stops
    .groupby('region')
    .agg(
        num_stops=('stop_id', 'count'),
        avg_distance_km=('distance_from_center_km', 'mean'),
        max_distance_km=('distance_from_center_km', 'max'),
    )
    .reset_index()
)

mr.Markdown("**Regional Summary Statistics:**")
region_summary

 ## Transformation 7: Reshape (Pivot)

In [None]:
# Render the join-and-pivot code as a snippet (not executed).
# NOTE(review): `.size()` counts joined stop_times rows, so 'stop_count' in this snippet is a
# count of scheduled stop visits rather than of distinct stops — confirm the intended measure.
mr.Markdown("""
**Pivot analysis:** How many stops does each route serve in each region?

```python
# Join stop times → trips → stops to get route-region relationships
route_stops = (
    times_gtfs_clean
    .merge(trips_gtfs_clean[['trip_id', 'route_id']], on='trip_id')
    .merge(clean_stops[['stop_id', 'region']], left_on='stop_id', right_on='stop_id')
    .groupby(['route_id', 'region'])
    .size()
    .reset_index(name='stop_count')
)

# Pivot to wide format
route_region_pivot = route_stops.pivot(
    index='route_id', 
    columns='region', 
    values='stop_count'
).fillna(0)
```
""")

In [None]:
# Ensure join keys have the same dtype to avoid merge mismatches:
# cast stop_id/trip_id/route_id to string on both sides of the joins.
times_gtfs_clean['trip_id'] = times_gtfs_clean['trip_id'].astype(str)
trips_gtfs_clean['trip_id'] = trips_gtfs_clean['trip_id'].astype(str)
times_gtfs_clean['stop_id'] = times_gtfs_clean['stop_id'].astype(str)
clean_stops['stop_id'] = clean_stops['stop_id'].astype(str)
trips_gtfs_clean['route_id'] = trips_gtfs_clean['route_id'].astype(str)

# Join stop times → trips → stops (inner joins), then count DISTINCT stops per
# route and region. Counting joined rows with .size() would count every
# scheduled visit of every trip, which answers "how many stop events", not
# "how many stops does the route serve".
route_stops = (
    times_gtfs_clean
    .merge(trips_gtfs_clean[['trip_id', 'route_id']], on='trip_id')
    .merge(clean_stops[['stop_id', 'region']], on='stop_id')
    .groupby(['route_id', 'region'])['stop_id']
    .nunique()
    .reset_index(name='stop_count')
)

# Wide format: one row per route, one column per region. Route/region pairs
# that never occur become 0, and the counts are kept as integers.
route_region_pivot = (
    route_stops
    .pivot(index='route_id', columns='region', values='stop_count')
    .fillna(0)
    .astype(int)
)

mr.Markdown("**Routes by Region (Pivoted):**")
route_region_pivot.head(10)

 ## Profiling: Outliers & Cardinalities

In [None]:
# Introduce the outlier and cardinality checks printed by the next cell.
mr.Markdown("""
**Outlier Detection:** Identify stops unusually far from city center

**Cardinality Analysis:** Count unique values in key dimensions
""")

In [None]:
# Outliers: stops whose approximate distance from the city centre exceeds 15 km.
print("=== OUTLIERS ===")
print("Stops beyond 15km from city center:")
beyond_limit = clean_stops['distance_from_center_km'] > 15
outliers = clean_stops[beyond_limit]
print(f"Found {len(outliers)} outlier stops")
if not outliers.empty:
    print(outliers[['stop_name', 'distance_from_center_km']].head())

# Cardinalities: number of distinct values in each key column.
print("\n=== CARDINALITIES ===")
key_columns = (
    ("stops", clean_stops['stop_id']),
    ("routes", routes_gtfs_clean['route_id']),
    ("regions", clean_stops['region']),
    ("trips", trips_gtfs_clean['trip_id']),
)
for label, values in key_columns:
    print(f"Unique {label}: {values.nunique()}")

 ## Before/After Evidence

In [None]:
# Heading for the before/after report printed by the next cell.
mr.Markdown("""
### Quantitative comparison of data quality improvements:
""")

In [None]:
# Before/after report. Every figure is computed from the frames themselves so
# the printed labels cannot drift away from the data.
print("=" * 60)
print("BEFORE (Raw Data)")
print("=" * 60)
print(f"Stop rows: {len(df_stops)}")
print(f"Route rows: {len(df_routes)}")
print(f"Missing ATSTREET: {df_stops['attributes.ATSTREET'].isnull().sum()}")
print(f"Coordinate type: {df_stops['attributes.LAT'].dtype} (string)")
print(f"Stop ID type: {df_stops['attributes.STOP_ID'].dtype} (string)")
print(f"Features: {len(df_stops.columns)} columns")

print("\n" + "=" * 60)
print("AFTER (Cleaned & Transformed)")
print("=" * 60)
# The GTFS-only stops live in `merged_stops_clean`; `clean_stops` holds only
# the geographic stops, so it must not be reported next to "+N from GTFS".
print(f"Stop rows: {len(merged_stops_clean)} (+{len(missing_stops)} from GTFS)")
print(f"Route rows: {len(clean_routes)}")
print(f"Missing ATSTREET: {clean_stops['at_street'].isnull().sum()}")
print(f"Coordinate type: {clean_stops['lat'].dtype} (numeric-ready)")
# stop_id is int32 straight after cleaning but is cast to string before the
# joins, so the label describes both instead of claiming int32.
print(f"Stop ID type: {clean_stops['stop_id'].dtype} (int32 after cleaning, cast to string for joins)")
print(f"Features: {len(clean_stops.columns)} columns")
print("New derived features: region, distance_from_center_km")
print("Parsed time features: arrival_hour, arrival_minute")

 ## Visualization: Interactive Map

In [None]:
# Caption for the stop map drawn by the next cell.
mr.Markdown("""
**Interactive map** showing all bus stops with hover information:
""")

In [None]:
# Adapted from the Plotly documentation https://plotly.com/python/tile-scatter-maps/#multiple-markers
# lat/lon are still stored as text at this point, so they are cast to float
# here (as the region and distance steps do) instead of relying on Plotly to
# coerce strings.
stop_fig = go.Figure(go.Scattermap(
    lat=clean_stops['lat'].astype(float),
    lon=clean_stops['lon'].astype(float),
    mode='markers',
    marker=go.scattermap.Marker(size=9, color='blue'),
    text=clean_stops['stop_name'],
    hovertemplate='<b>%{text}</b><extra></extra>'
))

# Centre the view on the city and fix the figure height.
stop_fig.update_layout(
    title="Regina Transit Stops",
    autosize=True,
    hovermode='closest',
    map=dict(
        bearing=0,
        center=dict(lat=50.447992743219615, lon=-104.61228441057489),
        pitch=0,
        zoom=11
    ),
    height=600
)

stop_fig.show()

 ## Visualization: Route Overlay

In [None]:
# Render an excerpt of the coordinate-transformation code used for the route overlay (not executed).
mr.Markdown("""
**Adding route geometry** with coordinate transformation from UTM to lat/lon:

```python
# Transform UTM coordinates to lat/lon
transformer = Transformer.from_crs("EPSG:26913", "EPSG:4326", always_xy=True)

for coordinate in path:
    lon, lat = transformer.transform(coordinate[0], coordinate[1])
```
""")

In [None]:
# Build a transformer from EPSG:26913 to EPSG:4326; always_xy=True keeps (x, y) / (lon, lat) order.
transformer = Transformer.from_crs("EPSG:26913", "EPSG:4326", always_xy=True)

# Pick the first cleaned route and read its name, geometry and colour.
route_idx = 0
selected_route = clean_routes.iloc[route_idx]
route_name = selected_route['route_name']
route_geometry = selected_route['geometry_paths']
route_colour = selected_route['route_color']

# Transform coordinates. A None is appended after every path so that separate
# paths are not joined into one line.
all_lons = []
all_lats = []

for path in route_geometry:
    converted = [transformer.transform(point[0], point[1]) for point in path]
    all_lons.extend(lon_value for lon_value, _ in converted)
    all_lats.extend(lat_value for _, lat_value in converted)
    all_lons.append(None)
    all_lats.append(None)

# Add route to map
route_trace = go.Scattermap(
    lon=all_lons,
    lat=all_lats,
    mode='lines',
    line=dict(width=3, color=route_colour),
    name=route_name,
    hovertemplate=f'<b>{route_name}</b><extra></extra>'
)
stop_fig.add_trace(route_trace)

stop_fig.show()

 ## Summary of Transformations

In [None]:
# Render the closing summary of the transformations.
# NOTE(review): the heading says "8 operations across 5 categories" while the list below has eight
# numbered categories — confirm the intended wording.
# NOTE(review): "Joined stop times → trips → routes → stops" names a routes join, but the pivot
# cell's merge chain only joins stop times → trips → stops — confirm.
mr.Markdown("""
### Completed Transformations (8 operations across 5 categories):

**1. Type Fixes & Parsing**
- ✅ Converted stop_id from string to int32
- ✅ Parsed arrival/departure times to datetime
- ✅ Derived hour and minute features

**2. Text Cleanup**
- ✅ Stripped whitespace from all text columns
- ✅ Converted to uppercase for consistency
- ✅ Fixed malformed addresses

**3. Missing Data Handling**
- ✅ Imputed missing ATSTREET values
- ✅ Generated street names from stop names for GTFS stops

**4. Join/Merge**
- ✅ Merged geographic stops with GTFS schedule data
- ✅ Joined stop times → trips → routes → stops

**5. Feature Derivation**
- ✅ Created regional classifications (NE, NW, SE, SW)
- ✅ Calculated distance from city center

**6. Aggregation**
- ✅ Summarized stops by region with statistics

**7. Reshape**
- ✅ Pivoted route-region stop counts to wide format

**8. Coordinate Transformation**
- ✅ Converted UTM to lat/lon for visualization
""")

 ## Reproducibility

In [None]:
# Render the reproduction instructions.
# The install line lists every third-party package this notebook imports or
# installs on the fly (matplotlib, nbformat and Jinja2 were missing).
# NOTE(review): the directory tree names the JSON files with a 20251120
# suffix, while the loading cells open 'raw_data/yqrStops.json' and
# 'raw_data/yqrRoutes.json' — confirm which names are correct.
mr.Markdown("""
### How to reproduce this analysis:

**1. Install dependencies:**
```bash
pip install pandas plotly pyproj numpy matplotlib nbformat Jinja2 jupyter mercury
```

**2. Directory structure:**
```
project/
├── presentation.ipynb
├── raw_data/
│   ├── yqrStops20251120.json
│   ├── yqrRoutes20251120.json
│   └── gtfs_data/
│       ├── stops.txt
│       ├── routes.txt
│       ├── trips.txt
│       └── stop_times.txt
└── README.md
```

**3. Run notebook:**
- Execute all cells sequentially
- Or run with Mercury: `mercury run presentation.ipynb`

**Tool versions:** Python 3.14, pandas 2.2.3, plotly 5.24.1, pyproj 3.7.0
""")