# Visualizing the Data

In [None]:
# Polars is the DataFrame library used throughout this notebook.
import polars as pl
# Report the installed Polars version and related packages, so the environment
# is recorded alongside the results.
pl.show_versions()
# IPython magic: switch exception reporting to the "minimal" traceback mode.
%xmode minimal

## NYC Bike Trips

In [None]:
# Read every Parquet file matching the glob pattern into one DataFrame.
trips = pl.read_parquet("data/biketrips/*.parquet")
# A bare last expression makes the notebook display the DataFrame.
trips

### Remove non-trips

In [None]:
# A "non-trip" starts and ends at the same station AND lasts under five
# minutes. Keeping everything else is, by De Morgan, the same as keeping rows
# where the stations differ OR the duration is at least five minutes.
trips = trips.filter(
    (pl.col("station_start") != pl.col("station_end"))
    | (pl.col("duration").dt.total_seconds() >= 5 * 60)
)
# Number of rows that remain after the filter.
trips.height

## hvPlot

![](images/hvplot-overview.png)

### A First Plot

In [None]:
# Distance and duration (in hours) of every trip that starts at one station,
# together with the bike type and the borough where the trip ends.
trips_speed = trips.filter(
    pl.col("station_start") == "W 21 St & 6 Ave"
).select(
    "distance",
    # Seconds divided by 3600 gives fractional hours (better than
    # total_minutes()); the result keeps the column name "duration".
    pl.col("duration").dt.total_seconds() / 3600,
    "bike_type",
    "borough_end",
)
trips_speed

In [None]:
# Mean trip distance and mean trip duration (converted to hours) per
# starting borough.
trips_agg = (
    trips
    .group_by("borough_start")
    .agg(
        pl.col("distance").mean(),
        pl.col("duration").mean().dt.total_seconds() / 3600,
    )
)

In [None]:
# Display the per-borough averages computed in the previous cell.
trips_agg

In [None]:
# One point per starting borough: mean distance against mean duration.
trips_agg.plot.scatter(
    x="distance",
    y="duration",
    xlabel="distance (km)",
    ylabel="duration (h)",
    size=20,
    ylim=(0, 2),
)

In [None]:
# Combine the hexbin of individual trips with the per-borough averages (red).
# NOTE(review): in HoloViews `+` lays plots out next to each other while `*`
# overlays them on shared axes -- confirm that `+` is what is intended here.
(
    trips_speed.plot.hexbin(
        x="distance",
        y="duration",
        color="bike_type",
        xlabel="distance (km)",
        ylabel="duration (h)",
        ylim=(0, 2),
    )
    + trips_agg.plot.scatter(
        x="distance",
        y="duration",
        color="red",
        xlabel="distance (km)",
        ylabel="duration (h)",
        size=20,
        ylim=(0, 2),
    )
)

### GroupBy widget

In [None]:
# Scatter plot coloured by bike type; `groupby` is set to the destination
# borough column.
trips_speed.plot.scatter(
    x="distance",
    y="duration",
    color="bike_type",
    xlabel="distance (km)",
    ylabel="duration (h)",
    ylim=(0, 2),
    groupby="borough_end",
)

### Methods in the Plot Namespace

* `df.plot.area()`: Plots an area chart similar to a line chart except for filling the area under the curve and optionally stacking.
* `df.plot.bar()`: Plots a bar chart that can be stacked or grouped.
* `df.plot.bivariate()`: Plots 2D density of a set of points.
* `df.plot.box()`: Plots a box-whisker chart comparing the distribution of one or more variables.
* `df.plot.density()`: Plots the kernel density estimate of one or more variables.
* `df.plot.heatmap()`: Plots a heatmap to visualize a variable across two independent dimensions.
* `df.plot.hexbin()`: Plots hex bins.
* `df.plot.hist()`: Plots the distribution of one or more variables as a histogram (a set of bins).
* `df.plot.line()`: Plots a line chart (such as for a time series).
* `df.plot.scatter()`: Plots a scatter chart comparing two variables.
* `df.plot.violin()`: Plots a violin plot comparing the distribution of one or more variables using the kernel density estimate.


In [None]:
# trips.plot.<TAB>

### Getting Help for a Method

In [None]:
# IPython help syntax: a leading `?` shows the documentation of the object.
?trips.plot.scatter

In [None]:
import hvplot

# Print hvPlot's own help for the "scatter" plot type, with the generic and
# style option sections switched off.
hvplot.help("scatter", generic=False, style=False)

### Pandas as Backup

In [None]:
# Count trips per hour: sort by start time, bucket the rows into 1-hour
# windows on "datetime_start", and count the rows in each window ("len").
trips_per_day_hour = (
    trips
    .sort("datetime_start")
    .group_by_dynamic("datetime_start", every="1h")
    .agg(pl.len())
)

# Calling the heatmap directly on the Polars DataFrame produces an error:
# trips_per_day_hour.plot.heatmap(x='datetime_start.hour', y='datetime_start.day', C='len', cmap='reds')

In [None]:
# Importing hvplot.pandas makes the `.hvplot` accessor used below available
# on pandas objects.
import hvplot.pandas

# Convert to pandas first, then draw the hour-by-day heatmap of trip counts.
(
    trips_per_day_hour
    .to_pandas()
    .hvplot.heatmap(
        x="datetime_start.hour",
        y="datetime_start.day",
        C="len",
        cmap="reds",
    )
)

### Manual Transformations

In [None]:
# Number of trips for every (rider type, bike type) combination; the count
# column is named "len".
trips_type_counts = (
    trips
    .group_by("rider_type", "bike_type")
    .agg(pl.len())
)
trips_type_counts

In [None]:
# Stacked bars: one bar per rider type, split by bike type.
trips_type_counts.plot.bar(
    x="rider_type",
    y="len",
    by="bike_type",
    ylabel="count",
    stacked=True,
    color=["orange", "green"],
)

### Changing the Plotting Backend

In [None]:
# hvplot was already imported in an earlier cell; repeating the import keeps
# this cell runnable on its own.
import hvplot
# Select matplotlib as the plotting backend for the following cells.
hvplot.extension("matplotlib")

In [None]:
# Same stacked bar chart as before, now drawn after the backend switch.
trips_type_counts.plot.bar(
    x="rider_type",
    y="len",
    by="bike_type",
    ylabel="count",
    stacked=True,
    color=["orange", "green"],
)

In [None]:
# Keep the plot object in a variable instead of displaying it, so it can be
# inspected and rendered explicitly in the next cells.
plot = trips_type_counts.plot.bar(
    x="rider_type",
    y="len",
    by="bike_type",
    ylabel="count",
    stacked=True,
    color=["orange", "green"],
)

In [None]:
# Inspect which class the plotting call returned.
type(plot)

In [None]:
# Render the stored plot object with the matplotlib backend.
hvplot.render(plot, backend="matplotlib")

In [None]:
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
?hvplot.render?

In [None]:
# Render the same plot again, now that `%matplotlib inline` has been run.
hvplot.render(plot, backend="matplotlib")

In [None]:
# Switch the plotting backend to bokeh for the cells that follow.
hvplot.extension("bokeh")

### Plotting Points on a Map

In [None]:
# Plain scatter of start coordinates, coloured by starting borough.
trips.plot.scatter(
    x="lon_start",
    y="lat_start",
    color="borough_start",
    width=600,
    height=600,
)

In [None]:
# Start coordinates as geographic points on top of the "CartoLight" map
# tiles, with datashading switched on.
trips.plot.points(
    x="lon_start",
    y="lat_start",
    datashade=True,
    geo=True,
    tiles="CartoLight",
    width=800,
    height=600,
)

## Alternatives

### HoloViews

In [None]:
# HoloViews is used directly in this section, with bokeh as its backend.
import holoviews as hv
hv.extension("bokeh")

In [None]:
# Lookup table of borough names: distinct starting boroughs, sorted by name,
# with a row index column added.
boroughs = (
    trips
    .select(name="borough_start")
    .unique()
    .sort("name")
    .with_row_index()
)
boroughs

In [None]:
# Trips between boroughs as (source, target, value) rows, where source and
# target are the row indices from `boroughs` and value is the trip count.
trips_flow = (
    trips
    .group_by("borough_start", "borough_end")
    .len()
    # First join looks up the index of the start borough ("index"), the
    # second the index of the end borough ("index_right").
    .join(boroughs, left_on="borough_start", right_on="name")
    .join(boroughs, left_on="borough_end", right_on="name")
    .select(
        source="index",
        target="index_right",
        value="len",
    )
    # Drop flows that stay within the same borough.
    .filter(pl.col("source") != pl.col("target"))
    .sort("target")
)
trips_flow

In [None]:
# Edges of the chord diagram: flows keyed by (source, target) with the trip
# count as the value dimension.
ds_trips_flow = hv.Dataset(
    trips_flow,
    kdims=["source", "target"],
    vdims=["value"],
)

# Nodes of the chord diagram: borough index as key, borough name as value.
ds_boroughs = hv.Dataset(
    boroughs,
    kdims=["index"],
    vdims=["name"],
)

ds_trips_flow

In [None]:
# Chord diagram built from the (edges, nodes) pair of datasets.
hv.Chord((ds_trips_flow, ds_boroughs)).opts(
    hv.opts.Chord(cmap='glasbey', edge_cmap='glasbey',
                  # Colour nodes by their index and edges by their source,
                  # both converted to strings.
                  node_color=hv.dim('index').str(),
                  edge_color=hv.dim('source').str(),
                  # Label each node with the borough name.
                  labels='name',
                  toolbar=None,
                  width=600, height=600))

### Altair

In [None]:
import altair as alt

In [None]:
# Trips per hour and starting borough: sort by start time, then count rows in
# 1-hour windows separately for each "borough_start".
trips_per_hour = (
    trips
    .sort("datetime_start")
    .group_by_dynamic("datetime_start", group_by="borough_start", every="1h")
    .agg(pl.len())
)
trips_per_hour

In [None]:
# Axis labels and ticks are drawn in black only where the hour of the axis
# value equals 12; everywhere else the colour is None.
color_condition = alt.condition(
    "hours(datum.value) == 12",
    alt.value("black"),
    alt.value(None),
)

# Rectangle (heatmap-style) chart: one column per hour, one row per borough,
# colour encoding the number of trips on a log scale.
alt.Chart(trips_per_hour, width=800, height=250).mark_rect().encode(
    alt.X("yearmonthdatehours(datetime_start):O")
         .title("Day in March")
         .axis(
             # "%-d" formats the label as the day of the month.
             format="%-d",
             labelAngle=0,
             labelOverlap=False,
             labelColor=color_condition,
             tickColor=color_condition,
         ),
    alt.Y("borough_start:N").title(None),
    alt.Color("len").title("Number of Trips").scale(type="log")
)

### Plotnine

In [None]:
# Import only the plotnine names this notebook uses instead of `import *`,
# so it stays clear where each plotting function comes from and no other
# names in the notebook namespace are overwritten.
from plotnine import (
    aes,
    facet_wrap,
    geom_point,
    geom_smooth,
    ggplot,
    labs,
    scale_color_brewer,
    theme,
    theme_linedraw,
)

In [None]:
# Median duration (hours) and distance for every pair of start and end
# neighbourhoods, plus the speed derived from those two medians.
# NOTE: this rebinds `trips_speed`, which earlier in the notebook held the
# per-trip data for a single station; re-running the earlier plotting cells
# after this one will use this aggregated table instead.
trips_speed = (
    trips
    .group_by("neighborhood_start", "neighborhood_end")
    .agg(
        pl.col("duration").dt.total_seconds().median() / 3600,
        pl.col("distance").median(),
        pl.col("borough_start").first(),
        pl.col("borough_end").first(),
        pl.len(),
    )
    # Several predicates passed to filter() are combined with AND: keep pairs
    # with more than 30 trips, a median distance above 0.2, and different
    # start and end neighbourhoods.
    .filter(
        pl.col("len") > 30,
        pl.col("distance") > 0.2,
        pl.col("neighborhood_start") != pl.col("neighborhood_end"),
    )
    .with_columns(speed=pl.col("distance") / pl.col("duration"))
    .sort("borough_start")
)
trips_speed

In [None]:
# Points for every neighbourhood pair plus a LOWESS trend line without a
# confidence band.
(
    ggplot(trips_speed, aes(x="distance", y="duration"))
    + geom_point(size=0.25, alpha=0.5)
    + geom_smooth(method="lowess", size=2, se=False, color="blue")
)

In [None]:
# Distance against duration for neighbourhood pairs that stay inside one
# borough, with one colour and one LOWESS trend line per borough.
(
    ggplot(trips_speed
        .filter(pl.col("borough_start") == pl.col("borough_end")),
        aes(x="distance", y="duration", color="borough_end")) +
    geom_point(size=0.25, alpha=0.5) +
    geom_smooth(method="lowess", size=2, se=False, alpha=0.8) +
    scale_color_brewer(type="qualitative", palette="Set1") +
    # `duration` was computed as seconds / 3600, so the axis unit is hours.
    labs(title="Trip distance and duration within each borough",
         x="Distance (km)", y="Duration (h)", color="Borough") +
    theme_linedraw() +
    theme(figure_size=(8, 6))
)

In [None]:
# Distance against duration for neighbourhood pairs that cross a borough
# boundary: one facet per start borough, coloured by destination borough.
(
    ggplot(trips_speed
        .filter(pl.col("borough_start") != pl.col("borough_end"))
        # Prefix the start borough with "From " so the facet titles read
        # naturally next to the "To Borough" colour legend.
        .with_columns(
            ("From " + pl.col("borough_start")).alias("borough_start")),
        aes(x="distance", y="duration", color="borough_end")) +
    geom_point(size=0.25, alpha=0.5) +
    geom_smooth(method="lowess", size=2, se=False, alpha=0.8) +
    scale_color_brewer(type="qualitative", palette="Set1") +
    facet_wrap("borough_start") +
    # `duration` was computed as seconds / 3600, so the axis unit is hours.
    labs(title="Trip distance and duration cross borough",
         x="Distance (km)", y="Duration (h)", color="To Borough") +
    theme_linedraw() +
    theme(figure_size=(8, 6))
)

### Great Tables

![](images/great-tables-compontents.png)

In [None]:
# Imported here so that this cell runs on its own; selectors replace the
# deprecated `pl.NUMERIC_DTYPES` constant below.
import polars.selectors as cs

# The three busiest start stations per borough, with mean daily statistics
# and the day-by-day ride counts joined into a space-separated string.
busiest_stations = (
    trips
    # Step 1: one row per station and calendar day.
    .group_by(   # <1>
        station=pl.col("station_start"),
        date=pl.col("datetime_start").dt.date()
    )
    .agg(
        borough=pl.col("borough_start").first(),
        neighborhood=pl.col("neighborhood_start").first(),
        num_rides=pl.len(),
        # The mean of a boolean column is the fraction of True values.
        percent_member=(pl.col("rider_type") == "member").mean(),
        percent_electric=(pl.col("bike_type") == "electric").mean()
    )
    # Sort by date so the per-station list of daily counts is chronological.
    .sort("date")
    # Step 2: collapse the daily rows into one row per station.
    .group_by("station")
    .agg(
        pl.col(pl.String).first(),
        # Average every numeric column over the days.
        cs.numeric().mean(),
        # Also keep the individual daily counts, as strings.
        pl.col("num_rides").cast(pl.String).alias("rides_per_day")  #<2>
    )
    .with_columns(pl.col("rides_per_day").list.join(" "))
    # Step 3: busiest first, then the top three stations of each borough.
    .sort("num_rides", descending=True)
    .group_by("borough", maintain_order=True).head(3)
)
busiest_stations

In [None]:
import polars.selectors as cs
from great_tables import GT, style, md

### First Try

In [None]:
# First attempt: a table with default settings, leaving out the
# "rides_per_day" column.
GT(busiest_stations.drop("rides_per_day"))

### Add grouping

In [None]:
# Use the station as the row label and group the rows by borough.
GT(
    busiest_stations.drop("rides_per_day"),
    rowname_col="station",
    groupname_col="borough",
)

### Improve Column Names 

In [None]:
# Same grouped table, now with readable column labels and a header for the
# row-label (stub) column.
(
    GT(
        busiest_stations.drop("rides_per_day"),
        rowname_col="station",
        groupname_col="borough",
    )
    .cols_label(
        neighborhood="Neighborhood",
        num_rides="Mean Daily Rides",
        percent_member="Members",
        percent_electric="E-Bikes",
    )
    .tab_stubhead(label="Station")
)

### Titles

In [None]:
# Adds a title and subtitle on top of the labelled, grouped table.
(
    GT(
        busiest_stations.drop("rides_per_day"),
        rowname_col="station",
        groupname_col="borough",
    )
    .cols_label(
        neighborhood="Neighborhood",
        num_rides="Mean Daily Rides",
        percent_member="Members",
        percent_electric="E-Bikes",
    )
    .tab_header(
        title="Busiest Bike Stations in NYC",
        subtitle="In March 2024, Per Borough",
    )
    .tab_stubhead(label="Station")
)

### Format numbers

In [None]:
# Adds number formatting: one decimal for the ride counts, whole percentages
# for the "percent_" columns, and a "Blues" colour scale on the ride counts.
(
    GT(
        busiest_stations.drop("rides_per_day"),
        rowname_col="station",
        groupname_col="borough",
    )
    .cols_label(
        neighborhood="Neighborhood",
        num_rides="Mean Daily Rides",
        percent_member="Members",
        percent_electric="E-Bikes",
    )
    .tab_header(
        title="Busiest Bike Stations in NYC",
        subtitle="In March 2024, Per Borough",
    )
    .tab_stubhead(label="Station")
    .fmt_number(columns="num_rides", decimals=1)
    .fmt_percent(columns=cs.starts_with("percent_"), decimals=0)
    .data_color(columns="num_rides", palette="Blues")
)

### Add Nano Plots

In [None]:
# Keeps the "rides_per_day" column this time and turns it into a nanoplot
# with a reference line at the mean.
(
    GT(
        busiest_stations,
        rowname_col="station",
        groupname_col="borough",
    )
    .cols_label(
        neighborhood="Neighborhood",
        num_rides="Mean Daily Rides",
        percent_member="Members",
        percent_electric="E-Bikes",
        rides_per_day="Rides Per Day",
    )
    .tab_header(
        title="Busiest Bike Stations in NYC",
        subtitle="In March 2024, Per Borough",
    )
    .tab_stubhead(label="Station")
    .fmt_number(columns="num_rides", decimals=1)
    .fmt_percent(columns=cs.starts_with("percent_"), decimals=0)
    .fmt_nanoplot(columns="rides_per_day", reference_line="mean")
)

### Some final touches

In [None]:
# Final table: labels, header, number formats and nanoplot, plus a colour
# scale on the ride counts, bold row-group labels and a Markdown source note.
(
    GT(
        busiest_stations,
        rowname_col="station",
        groupname_col="borough",
    )
    .cols_label(
        neighborhood="Neighborhood",
        num_rides="Mean Daily Rides",
        percent_member="Members",
        percent_electric="E-Bikes",
        rides_per_day="Rides Per Day",
    )
    .tab_header(
        title="Busiest Bike Stations in NYC",
        subtitle="In March 2024, Per Borough",
    )
    .tab_stubhead(label="Station")
    .fmt_number(columns="num_rides", decimals=1)
    .fmt_percent(columns=cs.starts_with("percent_"), decimals=0)
    .fmt_nanoplot(columns="rides_per_day", reference_line="mean")
    .data_color(columns="num_rides", palette="Blues")
    .tab_options(row_group_font_weight="bold")
    .tab_source_note(
        source_note=md(
            "Source: [NYC Citi Bike](https://citibikenyc.com/system-data)"
        )
    )
)

## Fin.