In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from h3 import int_to_str, str_to_int
from h3ronpy.pandas import grid_disk_aggregate_k
from lonboard import viz
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_continuous_cmap
from palettable.scientific.diverging import Roma_20
from palettable.scientific.sequential import Hawaii_6, Hawaii_16
from sklearn.preprocessing import StandardScaler
from srai.embedders import ContextualCountEmbedder
from srai.h3 import h3_to_shapely_geometry
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer

In [None]:
# Load the example dataset as a GeoDataFrame (path is relative to the notebook)
# and display it as the cell output.
bike_data = gpd.read_parquet("../data/bikes_spain_example.parquet")
bike_data

In [None]:
# Number of rows per city; the same "city" values are used further down to
# split the data into Madrid / Valencia / Seville subsets.
bike_data["city"].value_counts()

In [None]:
# Plot every row of bike_data as an opaque black point on the Voyager basemap.
station_point_style = {"radius_min_pixels": 2, "get_fill_color": [0, 0, 0, 255]}

viz(
    bike_data,
    scatterplot_kwargs=station_point_style,
    map_kwargs={"basemap_style": CartoBasemap.Voyager},
)

In [None]:
# --- Grid configuration ------------------------------------------------------
# H3 resolution used for every region in this notebook.
H3_RESOLUTION = 11
# Ring distance used both as the embedder neighbourhood size and as the
# training cut-off for the regression target.
H3_NEIGHBOURS = 5
# Extra rings generated beyond H3_NEIGHBOURS; the prediction maps later keep
# only cells whose distance is <= this value.
H3_PREDICTION_BUFFER = 10

# --- Build the hexagon grid around the points of every city ------------------
h3_regionalizer = H3Regionalizer(resolution=H3_RESOLUTION)
h3_regions_gdfs = []
for city_name in bike_data["city"].unique():
    # Hexagons that directly contain this city's points.
    city_h3_regions = h3_regionalizer.transform(
        bike_data[bike_data["city"] == city_name]
    )

    # Expand each hexagon by H3_NEIGHBOURS + H3_PREDICTION_BUFFER rings and keep,
    # per resulting cell, the minimum ring index "k" — stored below as
    # "distance_to_station". h3ronpy works on integer cell ids, hence the
    # str -> int -> str round trip.
    expanded_city_h3_regions = grid_disk_aggregate_k(
        city_h3_regions.index.map(str_to_int), H3_NEIGHBOURS + H3_PREDICTION_BUFFER, "min"
    ).rename(columns={"k": "distance_to_station", "cell": "region_id"})
    expanded_city_h3_regions["region_id"] = expanded_city_h3_regions["region_id"].map(
        int_to_str
    )
    expanded_city_h3_regions = expanded_city_h3_regions.set_index("region_id")
    expanded_city_h3_regions["city"] = city_name
    expanded_city_h3_regions = gpd.GeoDataFrame(
        expanded_city_h3_regions,
        geometry=h3_to_shapely_geometry(expanded_city_h3_regions.index),
        crs=4326,
    )
    h3_regions_gdfs.append(expanded_city_h3_regions)

# Use the pandas module imported by this notebook instead of reaching into
# geopandas' internal `gpd.pd` attribute, which is an implementation detail.
h3_regions = pd.concat(h3_regions_gdfs)

# --- Visualise the distance rings --------------------------------------------
# Min-max scale the distance to [0, 1] so it can be fed to a continuous colormap.
min_bound = h3_regions["distance_to_station"].min()
max_bound = h3_regions["distance_to_station"].max()
normalized_distance_to_station = (h3_regions["distance_to_station"] - min_bound) / (
    max_bound - min_bound
)

viz(
    [h3_regions, bike_data],
    polygon_kwargs=dict(
        opacity=0.7,
        stroked=False,
        get_fill_color=apply_continuous_cmap(
            normalized_distance_to_station, Hawaii_16,
        ),
    ),
    scatterplot_kwargs=dict(radius_min_pixels=2, get_fill_color=[0, 0, 0, 255]),
    map_kwargs=dict(basemap_style=CartoBasemap.Voyager),
)

In [None]:
# Load OpenStreetMap features covering the hexagon grid, restricted to the
# GEOFABRIK_LAYERS tag filter, and display the result.
osm_loader = OSMPbfLoader()
features = osm_loader.load(area=h3_regions, tags=GEOFABRIK_LAYERS)
features

In [None]:
# Join the hexagon regions with the OSM features via IntersectionJoiner;
# the result is passed to the embedder as `joint_gdf`.
intersection_joiner = IntersectionJoiner()
joint = intersection_joiner.transform(regions=h3_regions, features=features)
joint

In [None]:
# Configure the embedder: counts per GEOFABRIK_LAYERS (sub)category, using an
# H3 neighbourhood of H3_NEIGHBOURS rings, without concatenating the vectors.
contextual_embedder = ContextualCountEmbedder(
    neighbourhood=H3Neighbourhood(),
    neighbourhood_distance=H3_NEIGHBOURS,
    concatenate_vectors=False,
    expected_output_features=GEOFABRIK_LAYERS,
    count_subcategories=True,
)

# Only the geometry column of the regions is handed to the embedder.
embeddings = contextual_embedder.transform(
    regions_gdf=h3_regions[["geometry"]],
    features_gdf=features,
    joint_gdf=joint,
)

In [None]:
# Display the embedding table returned by ContextualCountEmbedder.transform.
embeddings

In [None]:
# Name of the regression target column: the minimum grid-disk ring index that
# was renamed to "distance_to_station" when the hexagon grid was built.
target = "distance_to_station"

In [None]:
# Build one modelling frame per city: the city's hexagons (with target and
# geometry) merged on the region index with their embedding vectors.
city_frames = {
    city: h3_regions[h3_regions["city"] == city].merge(
        embeddings, left_index=True, right_index=True
    )
    for city in ("Madrid", "Valencia", "Seville")
}

madrid_data = city_frames["Madrid"]
valencia_data = city_frames["Valencia"]
seville_data = city_frames["Seville"]

# Quick sanity check of the resulting sizes.
madrid_data.shape, valencia_data.shape, seville_data.shape

In [None]:
# Peek at the first rows of the merged Madrid frame (region columns + embedding columns).
madrid_data.head()

In [None]:
# Feature matrix = every embedding column; label = the target column.
# NOTE(review): each city is standardised with its OWN freshly fitted scaler,
# while the model below is trained on Madrid only. If per-city normalisation is
# not intentional, fit one scaler on Madrid and reuse it with `.transform` for
# the other cities — confirm with the author before changing, as it alters results.
feature_columns = list(embeddings.columns)

x_madrid = StandardScaler().fit_transform(madrid_data[feature_columns])
y_madrid = madrid_data[target]

x_seville = StandardScaler().fit_transform(seville_data[feature_columns])
y_seville = seville_data[target]

x_valencia = StandardScaler().fit_transform(valencia_data[feature_columns])
y_valencia = valencia_data[target]

# Shapes of every feature matrix / label vector, for a quick consistency check.
(
    x_madrid.shape,
    y_madrid.shape,
    x_seville.shape,
    y_seville.shape,
    x_valencia.shape,
    y_valencia.shape,
)

In [None]:
# Train an XGBoost regressor (default hyper-parameters) on Madrid only, using
# just the hexagons whose distance is within H3_NEIGHBOURS rings.
model = xgb.XGBRegressor()
train_mask = y_madrid <= H3_NEIGHBOURS
model.fit(x_madrid[train_mask], y_madrid[train_mask])

In [None]:
# Evaluation figure: one column per city.
#   top row    – jittered scatter of true vs. predicted distance with a 2nd-order fit
#   bottom row – distribution of predictions for each true distance value
fig, axes = plt.subplots(2, 3, figsize=(12, 8), sharey=True, sharex=True, dpi=600)

# Label the shared y axis before plotting so the plotting calls below do not
# have to provide one.
for row in (0, 1):
    axes[row, 0].set_ylabel("Predicted distance to station")

city_datasets = [
    ("Madrid", x_madrid, y_madrid),
    ("Valencia", x_valencia, y_valencia),
    ("Seville", x_seville, y_seville),
]

for column, (city_name, city_features, city_target) in enumerate(city_datasets):
    scatter_ax = axes[0, column]
    violin_ax = axes[1, column]

    # Evaluate only on the distance range the model was trained on.
    in_range = city_target <= H3_NEIGHBOURS
    true_distance = city_target[in_range]
    predicted_distance = model.predict(city_features[in_range])

    # One Hawaii_6 colour per true distance value (the distance indexes the palette).
    point_colors = [Hawaii_6.mpl_colors[distance] for distance in true_distance]

    sns.regplot(
        x=true_distance,
        y=predicted_distance,
        ax=scatter_ax,
        scatter=True,
        order=2,
        scatter_kws={"alpha": 0.02, "color": point_colors},
        line_kws={"color": "black"},
        x_jitter=0.1,
    )
    sns.violinplot(
        x=true_distance,
        y=predicted_distance,
        ax=violin_ax,
        fill=True,
        palette=Hawaii_6.mpl_colors,
        hue=true_distance,
        legend=False,
    )

    scatter_ax.set_title(city_name)
    scatter_ax.set_xlabel(None)
    violin_ax.set_xlabel("Distance to station")

plt.tight_layout()

In [None]:
# Stack all cities (Madrid, Seville, Valencia — the prediction order below must
# match this order) and keep only the target and geometry columns.
concatenated_regions = pd.concat([madrid_data, seville_data, valencia_data])[
    [target, "geometry"]
]

# Predict for every hexagon of every city, rounded to two decimals.
city_predictions = [
    model.predict(city_features)
    for city_features in (x_madrid, x_seville, x_valencia)
]
concatenated_regions["predicted_distance_to_station"] = np.concatenate(
    city_predictions
).round(2)

# Keep only hexagons within H3_PREDICTION_BUFFER rings.
concatenated_regions = concatenated_regions[
    concatenated_regions[target] <= H3_PREDICTION_BUFFER
]

# Min-max scale the predictions to [0, 1] for the continuous colormap.
predicted = concatenated_regions["predicted_distance_to_station"]
lowest_prediction = predicted.min()
highest_prediction = predicted.max()
normalized_predicted_distance_to_station = (predicted - lowest_prediction) / (
    highest_prediction - lowest_prediction
)

viz(
    [concatenated_regions, bike_data],
    polygon_kwargs={
        "opacity": 0.7,
        "stroked": False,
        "get_fill_color": apply_continuous_cmap(
            normalized_predicted_distance_to_station, Hawaii_6
        ),
    },
    scatterplot_kwargs={"radius_min_pixels": 2, "get_fill_color": [0, 0, 0, 255]},
    map_kwargs={"basemap_style": CartoBasemap.Voyager},
)

In [None]:
# Signed prediction error. The true distance is clipped to [0, H3_NEIGHBOURS]
# because the model was trained only on hexagons within H3_NEIGHBOURS rings.
prediction_error = (
    concatenated_regions["predicted_distance_to_station"]
    - concatenated_regions[target].clip(0, H3_NEIGHBOURS)
)
concatenated_regions["prediction_error"] = prediction_error

# Column extremes are computed ONCE (the previous per-row lambda re-scanned the
# whole column for every element, i.e. quadratic work).
min_error = prediction_error.min()
max_error = prediction_error.max()

# Negative errors are scaled by |min| and non-negative errors by max, so the
# result lies in [-1, 1] with zero error staying at 0. Fall back to a scale of
# 1.0 when a side has no spread, avoiding 0 / 0 = NaN for all-zero errors.
negative_scale = -min_error if min_error < 0 else 1.0
positive_scale = max_error if max_error > 0 else 1.0
signed_relative_error = (prediction_error / positive_scale).where(
    prediction_error >= 0, prediction_error / negative_scale
)

# Shift [-1, 1] to [0, 1] for the diverging colormap (0.5 == no error).
normalized_prediction_error = (signed_relative_error + 1) / 2

viz(
    [concatenated_regions, bike_data],
    polygon_kwargs=dict(
        opacity=1,
        stroked=False,
        get_line_width=0,
        get_fill_color=apply_continuous_cmap(
            normalized_prediction_error,
            Roma_20,
            # Fade out hexagons with small errors: alpha grows with |error|.
            alpha=(normalized_prediction_error - 0.5).abs() * 2,
        ),
    ),
    scatterplot_kwargs=dict(radius_min_pixels=2, get_fill_color=[0, 0, 0, 255]),
    map_kwargs=dict(basemap_style=CartoBasemap.PositronNoLabels),
)