# GTFS Data Exploration
Explore routes, stops, and schedule data from the GTFS feed.

In [None]:
import pandas as pd
import folium
from pathlib import Path
import matplotlib.pyplot as plt


In [None]:
# Directory holding the unzipped GTFS feed, relative to the working directory.
gtfs_path = Path("../data/raw_gtfs")

# Read identifier columns as text. Left to type inference, an all-digit ID
# column becomes an integer column (dropping leading zeros), and the same key
# can end up with a different dtype in another table, which makes the joins on
# trip_id / route_id / stop_id unreliable. Keys absent from a given file are
# simply not applied.
gtfs_id_dtypes = {
    "route_id": str,
    "stop_id": str,
    "trip_id": str,
    "service_id": str,
}

routes = pd.read_csv(gtfs_path / "routes.txt", dtype=gtfs_id_dtypes)
stops = pd.read_csv(gtfs_path / "stops.txt", dtype=gtfs_id_dtypes)
stop_times = pd.read_csv(gtfs_path / "stop_times.txt", dtype=gtfs_id_dtypes)
trips = pd.read_csv(gtfs_path / "trips.txt", dtype=gtfs_id_dtypes)
calendar = pd.read_csv(gtfs_path / "calendar.txt", dtype=gtfs_id_dtypes)


In [None]:
# routes, stops, stop_times, trips, and calendar were read into DataFrames in
# cell 2, so nothing is re-read here; this only lists the feed directory.
feed_dir_entries = list(gtfs_path.iterdir())
print("GTFS files loaded:", feed_dir_entries)

In [None]:
# Join every stop_times row to its trip, route, and stop. These are inner
# joins, so rows without a matching trip/route/stop are dropped.
merged = (
    stop_times
    .merge(trips, on="trip_id")
    .merge(routes, on="route_id")
    .merge(stops, on="stop_id")
)

# GTFS stop times are "H:MM:SS" offsets into the service day and run past
# 24:00:00 for trips that continue after midnight. pd.to_datetime rejects
# hours >= 24 (coerced to NaT), silently dropping those arrivals, so parse the
# values as durations and anchor them on today's date instead. The column stays
# datetime64, so `.dt` accessors keep working; missing or unparseable values
# still become NaT.
service_day_start = pd.Timestamp.today().normalize()
merged["arrival_time"] = service_day_start + pd.to_timedelta(
    merged["arrival_time"], errors="coerce"
)


In [None]:
# 'merged' was built in cell 4 by joining stop_times with trips, routes, and
# stops; show its first rows as a quick sanity check.
merged_preview = merged.head()
print(merged_preview)

In [None]:
# Derive time features from the parsed arrival timestamps.
# NOTE(review): day_of_week is taken from whatever calendar date is attached
# to arrival_time when it is parsed; stop_times.txt presumably carries only a
# time of day, so this may not reflect an actual service day -- confirm.
arrival_dt = merged["arrival_time"].dt
merged["hour"] = arrival_dt.hour
merged["minute"] = arrival_dt.minute
merged["day_of_week"] = arrival_dt.dayofweek

In [None]:
# Centre the map on the mean latitude/longitude over all rows of 'merged'.
map_center = [merged["stop_lat"].mean(), merged["stop_lon"].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# 'merged' repeats a stop's coordinates once for every stop_times row joined to
# it, so draw each distinct coordinate pair a single time instead of stacking
# identical markers. Rows with a missing coordinate are skipped because they
# cannot be placed on the map.
stop_locations = merged[["stop_lat", "stop_lon"]].dropna().drop_duplicates()
for lat, lon in stop_locations.itertuples(index=False, name=None):
    folium.CircleMarker(
        location=[lat, lon],
        radius=2,
        color="red",
    ).add_to(m)
m

In [None]:
from folium.plugins import HeatMap

# Weight each distinct stop coordinate by the number of rows in 'merged' that
# share it; the heat layer receives [lat, lon, weight] triples.
stop_freq = (
    merged.groupby(["stop_lat", "stop_lon"])
    .size()
    .rename("count")
    .reset_index()
)
heat_columns = ["stop_lat", "stop_lon", "count"]
heat_data = stop_freq[heat_columns].to_numpy().tolist()

heatmap = folium.Map(location=map_center, zoom_start=12)
heat_layer = HeatMap(heat_data, radius=8, max_zoom=13)
heat_layer.add_to(heatmap)
heatmap

In [None]:
# Write the enriched GTFS table to CSV, leaving out the DataFrame index.
merged_csv_name = "merged.csv"
merged.to_csv(merged_csv_name, index=False)

# Write the stops heatmap to an HTML file.
heatmap_html_name = "stops_heatmap.html"
heatmap.save(heatmap_html_name)

def _busiest_stops_at_hour(frame, hour):
    """Count rows per (stop_id, stop_name) whose 'hour' equals *hour*, busiest first."""
    in_hour = frame[frame["hour"] == hour]
    per_stop = in_hour.groupby(["stop_id", "stop_name"]).size()
    return per_stop.reset_index(name="count").sort_values("count", ascending=False)


# Time-of-day filtering: rank stops by number of arrivals in the 07:00 hour
# and in the 17:00 hour.
busiest_7am = _busiest_stops_at_hour(merged, 7)
busiest_5pm = _busiest_stops_at_hour(merged, 17)

print("Top 10 busiest stops at 7am:")
print(busiest_7am.head(10))
print("\nTop 10 busiest stops at 5pm:")
print(busiest_5pm.head(10))

# Trip frequency per route: number of distinct trip_ids seen for each
# route_id, with the most frequent routes first.
trips_per_route = merged.groupby("route_id")["trip_id"].nunique()
route_trip_freq = trips_per_route.rename("trip_count").reset_index()
route_trip_freq = route_trip_freq.sort_values(by="trip_count", ascending=False)

print("\nRoute-wise trip frequency:")
print(route_trip_freq.head(10))

# Route density per stop: number of distinct route_ids serving each stop,
# sorted ascending so the least-served stops (candidate underserved areas)
# come first.
stop_keys = ["stop_id", "stop_name", "stop_lat", "stop_lon"]
routes_per_stop = merged.groupby(stop_keys)["route_id"].nunique()
stop_route_density = routes_per_stop.rename("unique_routes").reset_index()
stop_route_density = stop_route_density.sort_values(by="unique_routes")

print("\nStops with lowest route density (potentially underserved):")
print(stop_route_density.head(10))