# Chapter 16: Visualizing Data

In [None]:
import polars as pl
# Show the installed Polars version information so it can be compared with
# the version noted below.
pl.show_versions()  # The book is built with Polars version 1.13.1

## NYC Bike Trips

In [None]:
# Read every Parquet file matching the glob into one DataFrame.
trips = pl.read_parquet("data/citibike/*.parquet")

# Print the frame in four consecutive column windows: 0-3, 4-6, 7-10, 11+.
for column_window in (
    slice(None, 4),
    slice(4, 7),
    slice(7, 11),
    slice(11, None),
):
    print(trips[:, column_window])

## Built-in Plotting with Altair

### Introducing Altair

In [None]:
%pip install altair

In [None]:
import altair as alt

### Some Basic Plots

In [None]:
# Trips that started at one station: keep distance, duration (seconds
# converted to hours) and bike type, then derive speed = distance / duration.
trips_speed = (
    trips.filter(pl.col("station_start").eq("W 70 St & Amsterdam Ave"))
    .select(
        "distance",
        pl.col("duration").dt.total_seconds() / 3600,
        "bike_type",
    )
    .with_columns((pl.col("distance") / pl.col("duration")).alias("speed"))
)

trips_speed

In [None]:
# Scatter plot of distance against duration, coloured by bike type.
trips_speed.plot.scatter(x="distance", y="duration", color="bike_type:N")

In [None]:
# Kernel density plot of the distance column.
trips_speed.get_column("distance").plot.kde()

In [None]:
# Per one-hour window: number of trips and median speed (distance divided by
# duration in hours). Only windows starting after 2024-03-26 are kept.
trips_hour_num_speed = (
    trips.sort("datetime_start")
    .group_by_dynamic("datetime_start", every="1h")
    .agg(
        pl.len().alias("num_trips"),
        (pl.col("distance") / (pl.col("duration").dt.total_seconds() / 3600))
        .median()
        .alias("speed"),
    )
    .filter(pl.col("datetime_start").gt(pl.date(2024, 3, 26)))
)

trips_hour_num_speed

In [None]:
# Line chart of the hourly trip count over time.
trips_hour_num_speed.plot.line(
    x="datetime_start",
    y="num_trips",
)

In [None]:
import altair as alt

# Turn off Altair's max-rows check for the charts drawn after this cell.
alt.data_transformers.disable_max_rows()

In [None]:
# Count trips for each (rider type, bike type) pair; the count column is "len".
trips_type_counts = trips.group_by("rider_type", "bike_type").agg(pl.len())
trips_type_counts

In [None]:
# Bar chart of trip counts per rider type, filled by bike type, width 300.
(
    trips_type_counts.plot.bar(x="rider_type", y="len", fill="bike_type:N")
    .properties(width=300)
)

## Pandas-like Plotting With hvPlot

### Introducing hvPlot

In [None]:
%pip install hvplot

In [None]:
import hvplot.polars

### A First Plot

In [None]:
# hvPlot scatter of distance against duration, coloured by bike type, with
# explicit axis labels and the y-axis limited to the range 0 to 2.
trips_speed.hvplot.scatter(
    x="distance",
    y="duration",
    xlabel="distance (km)",
    ylabel="duration (h)",
    ylim=(0, 2),
    color="bike_type",
)

### Methods in the Plot Namespace

### Getting Help for a Method

In [None]:
import hvplot

# Show hvPlot's help for the scatter method with the `generic` and `style`
# flags switched off.
hvplot.help(
    "scatter",
    generic=False,
    style=False,
)

### Pandas as Backup

In [None]:
# Number of trips in each one-hour window; the count column is named "len".
trips_per_day_hour = (
    trips.sort("datetime_start")
    .group_by_dynamic("datetime_start", every="1h")
    .agg(pl.len().alias("len"))
)

In [None]:
# This raises a ValueError:
# trips_per_day_hour.hvplot.heatmap(
#     x="datetime_start.hour", y="datetime_start.day", C="len", cmap="reds"
# )

In [None]:
import hvplot.pandas

# Convert to pandas first, then draw a heatmap of trip counts with the hour
# on the x-axis and the day on the y-axis.
trips_per_day_hour.to_pandas().hvplot.heatmap(
    x="datetime_start.hour",
    y="datetime_start.day",
    C="len",
    cmap="reds",
)

### Manual Transformations

In [None]:
# Stacked bar chart: one bar per rider type, split by bike type, using an
# explicit two-colour list.
trips_type_counts.hvplot.bar(
    x="rider_type",
    y="len",
    by="bike_type",
    stacked=True,
    color=["orange", "green"],
    ylabel="count",
)

### Changing the Plotting Backend

In [None]:
import hvplot

# Select Matplotlib as the hvPlot plotting backend for the cells that follow.
hvplot.extension("matplotlib")

In [None]:
# The same stacked bar chart call as under the previous backend, repeated
# after the backend switch so the rendering can be compared.
trips_type_counts.hvplot.bar(
    x="rider_type",
    y="len",
    by="bike_type",
    stacked=True,
    color=["orange", "green"],
    ylabel="count",
)

In [None]:
# Select Bokeh as the hvPlot plotting backend for the cells that follow.
hvplot.extension("bokeh")

### Plotting Points on a Map

In [None]:
# Plot every trip's start coordinates as points on a tiled map, with
# datashading enabled and a fixed 800 x 600 size.
trips.hvplot.points(
    x="lon_start",
    y="lat_start",
    geo=True,
    tiles="CartoLight",
    datashade=True,
    width=800,
    height=600,
)

### Composing Plots

In [None]:
# Combine two line charts (trip count and speed) with `+` and arrange the
# result in a single column.
(
    trips_hour_num_speed.hvplot.line(x="datetime_start", y="num_trips")
    + trips_hour_num_speed.hvplot.line(x="datetime_start", y="speed")
).cols(1)

In [None]:
# Overlay (`*`) red markers on the hourly trip-count line for every hour
# with more than 9000 trips.
(
    trips_hour_num_speed.hvplot.line(x="datetime_start", y="num_trips")
    * trips_hour_num_speed.filter(pl.col("num_trips").gt(9000)).hvplot.scatter(
        x="datetime_start",
        y="num_trips",
        c="red",
        s=50,
    )
)

### Adding Interactive Widgets

In [None]:
# Hourly trip counts per start borough, plus a `date` column derived from
# the window start for use as a grouping key.
trips_per_hour = (
    trips.sort("datetime_start")
    .group_by_dynamic("datetime_start", every="1h", group_by="borough_start")
    .agg(pl.len())
    .with_columns(pl.col("datetime_start").dt.date().alias("date"))
)
trips_per_hour

In [None]:
# One line per start borough; `groupby="date"` adds a date selector widget,
# which is placed at the top left.
trips_per_hour.hvplot.line(
    x="datetime_start",
    groupby="date",
    by="borough_start",
    widget_location="left_top",
)

## Publication-Quality Graphics with Plotnine

In [None]:
%pip install plotnine[all]

In [None]:
from plotnine import *

In [None]:
# Aggregate per (start neighborhood, end neighborhood) pair: median duration
# in hours, median distance, the first start/end borough, and the trip count.
# Keep pairs with more than 30 trips, a median distance above 0.2, and
# different start and end neighborhoods; then add speed and sort by borough.
trips_speed = (
    trips.group_by("neighborhood_start", "neighborhood_end")
    .agg(
        pl.col("duration").dt.total_seconds().median() / 3600,
        pl.col("distance").median(),
        pl.col("borough_start", "borough_end").first(),
        pl.len(),
    )
    .filter(
        pl.col("len") > 30,
        pl.col("distance") > 0.2,
        pl.col("neighborhood_start") != pl.col("neighborhood_end"),
    )
    .with_columns((pl.col("distance") / pl.col("duration")).alias("speed"))
    .sort("borough_start")
)
trips_speed

In [None]:
# Distance against duration for neighborhood pairs whose start and end lie in
# the same borough: faint points plus one LOWESS trend line per borough.
(
    ggplot(
        data=trips_speed.filter(
            pl.col("borough_start") == pl.col("borough_end")
        ),
        mapping=aes(x="distance", y="duration", color="borough_end"),
    )
    + geom_point(size=0.25, alpha=0.5)
    + geom_smooth(method="lowess", size=2, se=False, alpha=0.8)
    + scale_color_brewer(type="qualitative", palette="Set1")
    + labs(
        title="Trip distance and duration within each borough",
        x="Distance (km)",
        # `duration` is computed as total seconds / 3600, so the unit is hours.
        y="Duration (h)",
        color="Borough",
    )
    + theme_linedraw()
    + theme(figure_size=(8, 6))
)

In [None]:
# Distance against duration for neighborhood pairs that cross boroughs. The
# start borough is prefixed with "From " and used as the facet, and the end
# borough sets the colour.
(
    ggplot(
        data=trips_speed.filter(
            pl.col("borough_start") != pl.col("borough_end")
        ).with_columns(
            ("From " + pl.col("borough_start")).alias("borough_start")
        ),
        mapping=aes(x="distance", y="duration", color="borough_end"),
    )
    + geom_point(size=0.25, alpha=0.5)
    + geom_smooth(method="lowess", size=2, se=False, alpha=0.8)
    + scale_color_brewer(type="qualitative", palette="Set1")
    + facet_wrap("borough_start")
    + labs(
        title="Trip distance and duration cross borough",
        x="Distance (km)",
        # `duration` is computed as total seconds / 3600, so the unit is hours.
        y="Duration (h)",
        color="To Borough",
    )
    + theme_linedraw()
    + theme(figure_size=(8, 6))
)

## Bonus: Styling DataFrames With Great Tables

In [None]:
%pip install great_tables

In [None]:
import polars.selectors as cs

# Step 1: per (station, date), take the borough and neighborhood, the ride
#         count, and the share of member rides and electric-bike rides.
# Step 2: per station, keep the first value of each string column, average
#         the numeric columns, and collect the daily ride counts as a list.
# Step 3: sort by mean daily rides and keep the top three stations per borough.
busiest_stations = (
    trips.group_by(
        pl.col("station_start").alias("station"),
        pl.col("datetime_start").dt.date().alias("date"),
    )
    .agg(
        pl.col("borough_start").first().alias("borough"),
        pl.col("neighborhood_start").first().alias("neighborhood"),
        pl.len().alias("num_rides"),
        pl.col("rider_type").eq("member").mean().alias("percent_member"),
        pl.col("bike_type").eq("electric").mean().alias("percent_electric"),
    )
    .sort("date")
    .group_by("station")
    .agg(
        cs.string().first(),
        cs.numeric().mean(),
        rides_per_day=pl.col("num_rides"),
    )
    .sort("num_rides", descending=True)
    .group_by("borough", maintain_order=True)
    .head(3)
)

busiest_stations

In [None]:
# `GT` is not imported anywhere earlier in the notebook, so import it here;
# without this the cell raises NameError on a fresh kernel.
from great_tables import GT

# Render the frame as a Great Tables table with default styling.
GT(busiest_stations)

In [None]:
# `GT` is imported here as well so the cell does not depend on another cell
# having brought it into scope.
from great_tables import GT, md, style

# Styled table of the busiest stations, built step by step:
(
    GT(busiest_stations)
    # Station names become row labels; rows are grouped by borough.
    .tab_stub(rowname_col="station", groupname_col="borough")
    # Human-readable column headers.
    .cols_label(
        neighborhood="Neighborhood",
        num_rides="Mean Daily Rides",
        percent_member="Members",
        percent_electric="E-Bikes",
        rides_per_day="Rides Per Day",
    )
    .tab_header(
        title="Busiest Bike Stations in NYC",
        subtitle="In March 2024, Per Borough",
    )
    .tab_stubhead(label="Station")
    # Number formats: one decimal for the mean, whole percentages for shares.
    .fmt_number(columns="num_rides", decimals=1)
    .fmt_percent(columns=cs.starts_with("percent_"), decimals=0)
    # Show the list of daily ride counts as a nanoplot with a mean reference line.
    .fmt_nanoplot(columns="rides_per_day", reference_line="mean")
    .data_color(columns="num_rides", palette="Blues")
    .tab_options(row_group_font_weight="bold")
    .tab_source_note(
        source_note=md(
            "Source: [NYC Citi Bike](https://citibikenyc.com/system-data)"
        )
    )
)

## Takeaways