In [1]:
import datetime

import altair
import altair_saver
import geopandas
import pandas
import partridge
import toolz
from shapely.ops import orient, clip_by_rect


# Turn off Altair's max-rows check on chart datasets -- presumably because the
# per-route, per-day frames charted in this notebook exceed the default limit.
altair.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Get trips, stop times, routes, and shapes from the GTFS feeds.
# The per-feed tables are stacked into one frame each so that service before
# and after a schedule change can be compared from a single lookup.
service_ids_by_date = {}

# Collect the per-feed tables and concatenate once after the loop instead of
# re-copying an ever-growing frame on every iteration.
trips_parts = []
stop_times_parts = []
routes_parts = []
shapes_parts = []

for f in ["gtfs_bus_2021_12.zip", "gtfs_bus_2022_02.zip", "gtfs_bus_2022_06.zip"]:
    feed = partridge.load_geo_feed(f)
    service = partridge.read_service_ids_by_date(f)

    # dict.update: when two feeds cover the same date, the feed listed later
    # replaces the earlier feed's service ids for that date.
    service_ids_by_date.update(service)

    trips_parts.append(feed.trips)
    stop_times_parts.append(feed.stop_times)
    routes_parts.append(feed.routes)
    shapes_parts.append(feed.shapes)

feed_trips = pandas.concat(trips_parts)
feed_stop_times = pandas.concat(stop_times_parts)
feed_routes = pandas.concat(routes_parts)
feed_shapes = pandas.concat(shapes_parts)

# Flatten {date: service ids} into a long table with one row per
# (date, service_id) pair so it can be filtered by date later.
service_by_date = pandas.DataFrame.from_records(
    [
        (service_date, service_id)
        for service_date, service_ids in service_ids_by_date.items()
        for service_id in service_ids
    ],
    columns=["date", "service_id"],
)

  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))


In [3]:
# Days to analyse, newest first: every date from yesterday back to
# 2022-01-19 inclusive, i.e. the period since GTFS-RT updates were collected.
first_collected = datetime.date(2022, 1, 19)
today = datetime.date.today()
dates = [
    today - datetime.timedelta(days=offset)
    for offset in range(1, (today - first_collected).days + 1)
]

In [4]:
# Build `agg`: one row per (route, service day) with scheduled vs. canceled
# trip counts and service durations, obtained by joining the static GTFS
# schedule to that day's GTFS-RT trip updates.

def _column_or_zero(frame, column, zero):
    """Return frame[column] with missing values replaced by `zero`.

    If the column does not exist at all (no trip had that schedule
    relationship on the day), return the scalar `zero` so that `assign`
    broadcasts it to every row.
    """
    if column in frame.columns:
        return frame[column].fillna(zero)
    return zero


# Route data. Independent of the date, so build it once. Exact duplicate rows
# (the same route appearing in several stacked feeds) are dropped because they
# would multiply the matching trips in the left merge below.
routes = (
    feed_routes.assign(
        route=lambda x: x.route_short_name.combine_first(x.route_long_name)
    )[["route_id", "route"]]
    .drop_duplicates()
)

# Timing data for trips, which we use to compute scheduled service hours:
# last arrival and first departure per trip. Also independent of the date, so
# the groupby is done once rather than once per day.
trip_spans = (
    feed_stop_times
    .groupby("trip_id")
    .agg({"arrival_time": "max", "departure_time": "min"})
)

daily_aggs = []

for date in dates:
    updates = pandas.read_parquet(f"trip-updates/trip-updates-{date.isoformat()}.parquet")
    # Service patterns for the day
    service = service_by_date[service_by_date.date == date]
    # Trips for the service patterns
    trips = feed_trips[feed_trips.service_id.isin(service.service_id)]

    # The stop times are treated as second offsets (unit="s") and are added to
    # midnight of the service date to obtain absolute timestamps.
    midnight = pandas.to_datetime(date).normalize()
    trip_timings = trip_spans.assign(
        duration=pandas.to_timedelta(
            trip_spans.arrival_time - trip_spans.departure_time,
            unit="s"
        ),
        arrival_time=midnight + pandas.to_timedelta(trip_spans.arrival_time, unit="s"),
        departure_time=midnight + pandas.to_timedelta(trip_spans.departure_time, unit="s"),
    )

    # Merge the datasets into a single trips dataframe with timing, route, and updates data.
    # Assume that if a trip does not get an update, it is "scheduled". This assumption may
    # not be 100%, but seems to work okay.
    trips = trips.merge(trip_timings, left_on="trip_id", right_index=True)
    trips = trips.merge(routes, on="route_id", how="left")
    trips = (
        trips.merge(updates, on="trip_id", how="left")
        .assign(
            schedule_relationship=lambda x: x.schedule_relationship.fillna("scheduled")
        )
    )

    # Total duration and number of trips per route and schedule relationship,
    # pivoted so each relationship becomes its own pair of columns.
    trips_agg = (
        trips
        .assign(count=1)   # helper column; only its per-group count is used
        .groupby(["route", "schedule_relationship"], dropna=False)
        .agg({
            "duration": "sum",
            "count": "count",
        })
        .unstack(level=1)
    )

    # Flatten ("duration", "canceled") -> "duration_canceled", etc.
    # https://stackoverflow.com/questions/45878333/merge-multiindex-columns-together-into-1-level
    trips_agg.columns = ['_'.join(col) for col in trips_agg.columns.values]

    # A route with no canceled trips (or no scheduled trips) has NaN in the
    # corresponding columns after unstack, and the columns are absent entirely
    # if no route had such trips that day. Normalise both cases to zero so the
    # integer cast and the percentages below are well defined.
    trips_agg = trips_agg.assign(
        duration_canceled=_column_or_zero(trips_agg, "duration_canceled", pandas.Timedelta(0)),
        count_canceled=_column_or_zero(trips_agg, "count_canceled", 0),
        duration_scheduled=_column_or_zero(trips_agg, "duration_scheduled", pandas.Timedelta(0)),
        count_scheduled=_column_or_zero(trips_agg, "count_scheduled", 0),
    ).astype({"count_canceled": int, "count_scheduled": int})

    trips_agg = trips_agg.assign(
        percent_duration_canceled=(
            100*trips_agg.duration_canceled/
            (trips_agg.duration_canceled + trips_agg.duration_scheduled)
        ),
        percent_trips_canceled=(
            100*trips_agg.count_canceled/
            (trips_agg.count_canceled + trips_agg.count_scheduled)
        ),
        date=pandas.Timestamp(date),
    )
    daily_aggs.append(trips_agg)

# Concatenate once; fall back to an empty frame when there were no dates.
agg = pandas.concat(daily_aggs) if daily_aggs else pandas.DataFrame()

In [5]:
# The full set of routes is a bit much to show all at once in some charts,
# so we split them up by route type, and allow the user to filter by that.

def route_type(route):
    """Classify a route label into the category used to filter the charts.

    Labels containing "Orange" or "Silver" are bus rapid transit. Otherwise
    the text before the first "/" is parsed as the route number and mapped to
    a category by its numeric range.

    Returns "Unknown" when the label is not a string (e.g. NaN or None), when
    it has no leading route number, or when the number falls outside every
    listed range (0 and below, 300-399, 800 and above).
    """
    try:
        if "Orange" in route or "Silver" in route:
            return "Bus Rapid Transit"
        number = int(route.split("/")[0])
    except (TypeError, ValueError, AttributeError):
        # Only the errors a malformed label can raise are treated as
        # "Unknown"; anything else (e.g. KeyboardInterrupt) propagates.
        return "Unknown"

    if 0 < number < 100:
        return "Downtown Routes (1-99)"
    if 100 <= number < 200:
        return "East-West Routes (100-199)"
    if 200 <= number < 300:
        return "North-South Routes (200-299)"
    if 400 <= number < 600:
        return "Freeway Express Routes (400-599)"
    if 600 <= number < 700:
        return "Neighborhood Circulators (600-699)"
    if 700 <= number < 800:
        return "Rapid Lines (700-799)"
    return "Unknown"

# Label every row with its route category (the index holds the route label),
# then discard the rows whose category could not be determined.
route_types = agg.index.to_series().apply(route_type)
agg = agg.assign(route_type=route_types)
agg = agg.loc[agg.route_type != "Unknown"]

In [6]:
# Order routes from most to least canceled: add up each route's daily
# percent-of-service-hours-canceled over all days and sort descending.
# Only the resulting route ordering (the index) is kept.
canceled_by_route = agg.groupby("route")["percent_duration_canceled"].sum()
most_canceled = canceled_by_route.sort_values(ascending=False).index

In [7]:
# 2022 holidays that the weekday charts leave out:
# Memorial Day (May 30), Independence Day (Jul 4) and Labor Day (Sep 5).
federal_holidays = [
    datetime.date(2022, month, day)
    for month, day in ((5, 30), (7, 4), (9, 5))
]

In [8]:
# Heat map of cancellations: one cell per route (rows, ordered by
# `most_canceled`) per day (columns), colored by the percent of service hours
# canceled, with a vertical rule at each service-change ("shakeup") date.

def _shakeup_rule(chart, field):
    """Return a maroon vertical rule positioned at the date held in `field`.

    `field` is constant across rows, so the mean() aggregate yields a single
    x position.
    """
    # NOTE(review): the mark sets strokeWidth=0 while the size encoding is 5;
    # confirm which one governs the rendered width.
    return (
        chart
        .mark_rule(color='maroon', strokeWidth=0)
        .encode(x=f"mean({field}):T", size=altair.value(5))
    )


base = altair.Chart(
    (
        agg
        [["date", "percent_duration_canceled"]]
        .reset_index()   # bring the route label out of the index as a column
        .assign(
            shakeup_date1=pandas.Timestamp("2022-02-19"),
            shakeup_date2=pandas.Timestamp("2022-06-27"),
        )
    ),
    width=800,
)
chart = (
    base
    .mark_rect()
    .encode(
        y=altair.Y("route:N", title="Route", sort=most_canceled.to_list()),
        x=altair.X("date:T", title="Date"),
        color=altair.Color("percent_duration_canceled:Q", title="Percent of Service Hours Canceled"),
        tooltip=[
            altair.Tooltip("date:T", title="Date"),
            altair.Tooltip("route:N", title="Route"),
            altair.Tooltip("percent_duration_canceled:Q", title="Percent of Service Hours Canceled"),
        ]
    )
)

rule1 = _shakeup_rule(base, "shakeup_date1")
rule2 = _shakeup_rule(base, "shakeup_date2")

# Layer both shakeup markers over the heat map. Previously rule2 was built but
# never added, so the second shakeup date was not drawn.
heatmap = (
    (chart + rule1 + rule2)
    .properties(title="Cancellations Heat Map")
    .configure_title(
        fontSize=20,
        anchor='start',
    )
)

In [9]:
# Per-route chart data. "Scheduled" hours are everything that was planned
# (delivered + canceled); "delivered" hours are the trips that were not
# canceled. Only non-holiday weekdays are kept.
hours_scheduled = (agg.duration_scheduled + agg.duration_canceled).dt.total_seconds()/60/60
hours_delivered = agg.duration_scheduled.dt.total_seconds()/60/60

to_chart = agg.assign(
    duration_scheduled=hours_scheduled,
    duration_delivered=hours_delivered,
    date=pandas.to_datetime(agg.date),
    shakeup_date=pandas.to_datetime("2022-02-19"),
)[["duration_delivered", "duration_scheduled", "date", "route_type", "shakeup_date"]]

is_weekday = to_chart.date.dt.dayofweek <= 4   # Monday=0 ... Friday=4
is_holiday = to_chart.date.dt.date.isin(federal_holidays)
to_chart = to_chart[is_weekday & ~is_holiday].reset_index()

# Interactive controls: a dropdown selecting one route type (defaulting to the
# first one present in the data), a legend-bound multi-selection of routes,
# and a mouseover selection of a single route.
route_type_options = agg.route_type.unique().tolist()
default_route_type = route_type_options[0]
route_type_dropdown = altair.binding_select(options=route_type_options, name="Route Type    ")
route_type_selection = altair.selection_single(
    fields=["route_type"],
    bind=route_type_dropdown,
    init={"route_type": default_route_type},
)
label_selection = altair.selection_multi(fields=['route'], bind='legend')
route_selection = altair.selection_single(fields=['route'], on='mouseover')

base = altair.Chart(to_chart, width=600, height=600)

area_color = altair.Color(
    "route:N",
    scale=altair.Scale(scheme="tableau20", reverse=False),
    legend=altair.Legend(symbolLimit=40),
)
area_tooltip = [
    altair.Tooltip("route:N", title="Route"),
    altair.Tooltip("route_type:N", title="Route Type"),
    altair.Tooltip("date:T", title="Date"),
    altair.Tooltip("service_percent:Q", title="Percent of Service Hours"),
]

# The filled band spans from delivered hours (y2) up to scheduled hours (y),
# so its thickness is the canceled service. Routes selected in the legend are
# drawn opaque; the others are faded.
area_layer = (
    base
    .transform_calculate(
        service_percent="round(datum.duration_delivered/datum.duration_scheduled*100)"
    )
    .mark_area(
        interpolate='basis',
        stroke='gray',
        strokeWidth=2.0,
        fillOpacity=0.8,
        strokeOpacity=0.1,
    )
    .encode(
        x=altair.X("date:T", title="Date"),
        y=altair.Y("duration_scheduled:Q", title="Service Hours"),
        y2="duration_delivered:Q",
        color=area_color,
        fillOpacity=altair.condition(label_selection, altair.value(0.8), altair.value(0.1)),
        strokeOpacity=altair.condition(label_selection, altair.value(1.0), altair.value(0.1)),
        tooltip=area_tooltip,
    )
)

# Vertical marker at the shakeup date.
rule = (
    base
    .mark_rule(color='maroon')
    .encode(x="mean(shakeup_date):T", size=altair.value(5))
)

layered = area_layer + rule
area = (
    layered
    .add_selection(label_selection)
    .add_selection(route_type_selection)
    .transform_filter(route_type_selection)
)

In [10]:
# Systemwide chart data: total scheduled vs. delivered service hours per day
# across all routes, for non-holiday weekdays. "Scheduled" hours are
# delivered + canceled; "delivered" hours are the trips that were not canceled.
to_chart =  (
    agg
    .assign(
        duration_scheduled = (agg.duration_scheduled + agg.duration_canceled).dt.total_seconds()/60/60,
        duration_delivered = agg.duration_scheduled.dt.total_seconds()/60/60,
        date=pandas.to_datetime(agg.date)
    )
    # Keep only the columns being totalled before grouping, so the sum is not
    # also applied to the text, timedelta and percentage columns of `agg`.
    [["date", "duration_delivered", "duration_scheduled"]]
    .groupby("date")
    .sum()
    .reset_index()
    [["duration_delivered", "duration_scheduled", "date"]]
    .loc[lambda df: df.date.dt.dayofweek <= 4]   # Monday=0 ... Friday=4
    .loc[lambda df: ~df.date.dt.date.isin(federal_holidays)]
    .assign(shakeup_date=pandas.Timestamp("2022-02-19"))
)

base = altair.Chart(to_chart, width=600, height=600)

# The filled band spans from delivered hours (y2) up to scheduled hours (y),
# so its thickness is the canceled service.
chart = (
    base
    .transform_calculate(service_percent="round(datum.duration_delivered/datum.duration_scheduled*100)")
    .mark_area(
        interpolate='basis',
        stroke='gray',
        strokeWidth=2.0,
        fillOpacity=0.8,
        strokeOpacity=0.1,
    )
    .encode(
        y=altair.Y("duration_scheduled:Q", title="Service Hours"),
        y2="duration_delivered:Q",
        x=altair.X("date:T", title="Date"),
        tooltip=[altair.Tooltip("date:T", title="Date"), altair.Tooltip("service_percent:Q", title="Percent of Service Hours")]
    )
)

# Vertical marker at the shakeup date.
rule = (
    base
    .mark_rule(color='maroon')
    .encode(x="mean(shakeup_date):T", size=altair.value(5))
)

system = (
    (chart + rule)
    .properties(title="Systemwide Cancellations")
    .configure_title(anchor="start", fontSize=20)
)

In [11]:
# Choose the most common shape_id of each route to draw on the map, attach its
# geometry from the feed shapes, and keep only routes with a known route type.
# (geopandas is imported with the other dependencies in the first cell.)
# NOTE(review): `trips` is whatever the aggregation loop over `dates` assigned
# last, so the shapes come from a single service day's trips only -- confirm
# that this is intended.
most_common_shape = (
    trips.groupby("route")
    .agg({"shape_id": lambda g: g.value_counts().index[0]})
    .reset_index()
)
geoms = geopandas.GeoDataFrame(
    most_common_shape.merge(feed_shapes, how="left", on="shape_id"),
    crs="EPSG:4326",
    geometry="geometry",
)
geoms = (
    geoms.assign(
        route_type=geoms.route.apply(route_type)
    )
    .loc[lambda x: x.route_type != "Unknown"]
)

In [12]:
# Route map: one outline per route, colored like the area chart and driven by
# the same legend selection and route-type dropdown.
map_color = altair.Color(
    "route:N",
    legend=altair.Legend(symbolLimit=40),
    scale=altair.Scale(scheme="tableau20", reverse=False),
)
route_map = (
    altair.Chart(geoms, width=600, height=600)
    .mark_geoshape(filled=False)
    .encode(
        color=map_color,
        # Routes selected in the legend are drawn bolder and more opaque.
        strokeOpacity=altair.condition(label_selection, altair.value(1.0), altair.value(0.3)),
        strokeWidth=altair.condition(label_selection, altair.value(2), altair.value(1)),
        tooltip=[altair.Tooltip("route:N", title="Route")],
    )
    .add_selection(label_selection)
    .add_selection(route_type_selection)
    .transform_filter(route_type_selection)
)

# Background shape for the map.
# NOTE(review): row 8 is presumably Los Angeles County; this positional lookup
# depends on the row order of the GeoJSON file -- confirm.
counties = geopandas.read_file("County_Boundaries.geojson")
la = counties.iloc[8]
# Clip the boundary to a bounding box given as (xmin, ymin, xmax, ymax), then
# orient its rings with sign -1 (presumably the winding order the map
# renderer expects).
xmin, ymin, xmax, ymax = -119.1, 33.70, -117.9, 34.355
clipped = clip_by_rect(la.geometry, xmin, ymin, xmax, ymax)
geom = orient(clipped, -1)
background = altair.Chart(geom).mark_geoshape(fill="whitesmoke")

# Area chart on the left, map (background plus routes) on the right.
map_panel = background + route_map
detailed = (
    (area | map_panel)
    .properties(title="Cancellations")
    .configure_title(anchor="start", fontSize=20)
)

In [13]:
# Write each finished chart to its HTML file via altair_saver.
for finished_chart, html_path in (
    (heatmap, "heatmap.html"),
    (system, "system_cancellations.html"),
    (detailed, "route_cancellations.html"),
):
    altair_saver.save(finished_chart, html_path)