In [None]:
import contextily as cx
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import pyarrow as pa
import seaborn as sns
import xgboost as xgb
from h3 import int_to_str, str_to_int
from h3ronpy import grid_disk_aggregate_k
from pypalettes import load_cmap
from pytorch_lightning import seed_everything
from sklearn.preprocessing import StandardScaler

from srai.embedders import ContextualCountEmbedder
from srai.h3 import h3_to_shapely_geometry
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMOnlineLoader
from srai.loaders.overturemaps_loader import OvertureMapsLoader
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf

In [None]:
# TODO: finish this notebook based on: https://github.com/kraina-ai/srai-tutorial/blob/osm-deep-dive/tutorial/06_use_osm_data_in_ml_model.ipynb

In [None]:
# Single seed for the whole notebook: passed to seed_everything here and reused as the
# "seed" entry of the XGBoost parameters further down.
SEED = 71
seed_everything(SEED)

In [None]:
# Cities analysed in this notebook.
cities_names = ["Madrid", "Seville", "Valencia"]
# Geocode every city name into a region geometry.
regions = geocode_to_region_gdf(cities_names)
# Replace the index with the plain city names; later cells select rows by city name.
# NOTE(review): assumes the geocoder returns rows in the same order as cities_names — confirm.
regions.index = cities_names

In [None]:
# Display the geocoded city regions.
regions

In [None]:
# Load OpenStreetMap features tagged amenity=bicycle_rental for the city regions.
bicycle_stations = OSMOnlineLoader().load(area=regions, tags={"amenity": "bicycle_rental"})
# Display the loaded stations.
bicycle_stations

In [None]:
# Intersect the stations with the city regions, keeping the geometry in the result.
bicycle_stations_in_city = IntersectionJoiner().transform(
    regions,
    bicycle_stations,
    return_geom=True,
)

# One frame of stations per city, selected from the joined result by city name.
bicycle_stations_per_city = {
    city_name: bicycle_stations_in_city.loc[city_name] for city_name in cities_names
}

In [None]:
# Preview Madrid's bike stations on a CartoDB Positron basemap.
bicycle_stations_per_city["Madrid"].explore(tiles="CartoDB Positron")

In [None]:
# H3 resolution used by the H3Regionalizer when turning stations into cells.
H3_RESOLUTION = 11
# H3_RESOLUTION = 10
# Neighbourhood distance given to the ContextualCountEmbedder; it is also added to the
# prediction buffer when expanding cells around the stations.
H3_NEIGHBOURS = 5
# Largest distance_to_station value kept for model training and evaluation.
H3_PREDICTION_BUFFER = 10


def buffer_h3_cells_with_aggregation(h3_regions):
    """Grow H3 cells outwards and keep the minimal ring distance to the input cells.

    The index of ``h3_regions`` is converted with ``str_to_int`` and expanded by
    ``H3_NEIGHBOURS + H3_PREDICTION_BUFFER`` rings using the ``"min"`` aggregation.

    Returns:
        A pandas DataFrame where the ``cell`` column is renamed to ``region_id`` and
        the ``k`` column is renamed to ``distance_to_station``.
    """
    origin_cells = h3_regions.index.map(str_to_int)
    max_rings = H3_NEIGHBOURS + H3_PREDICTION_BUFFER

    aggregated_cells = grid_disk_aggregate_k(origin_cells, max_rings, "min")
    expanded_cells = pa.table(aggregated_cells).to_pandas()

    return expanded_cells.rename(columns={"k": "distance_to_station", "cell": "region_id"})


h3_regionalizer = H3Regionalizer(resolution=H3_RESOLUTION)
h3_regions_gdfs = []
for city_name, bicycle_stations_data in bicycle_stations_per_city.items():
    # H3 cells produced by the regionalizer for this city's bike stations.
    city_h3_regions = h3_regionalizer.transform(bicycle_stations_data)

    # Expand around the station cells; every cell carries its minimal ring distance.
    expanded_city_h3_regions = buffer_h3_cells_with_aggregation(city_h3_regions)
    # Convert the integer cell ids back to strings and use them as the index.
    expanded_city_h3_regions["region_id"] = expanded_city_h3_regions["region_id"].map(int_to_str)
    expanded_city_h3_regions = expanded_city_h3_regions.set_index("region_id")
    expanded_city_h3_regions["city"] = city_name
    # Attach the cell polygons so the result can be plotted and spatially joined.
    expanded_city_h3_regions = gpd.GeoDataFrame(
        expanded_city_h3_regions,
        geometry=h3_to_shapely_geometry(expanded_city_h3_regions.index),
        crs=4326,
    )
    h3_regions_gdfs.append(expanded_city_h3_regions)

# Use the directly imported pandas instead of reaching it through geopandas' `gpd.pd` attribute.
h3_regions = pd.concat(h3_regions_gdfs)

# Min-max normalize the distance over all cities together.
min_bound = h3_regions["distance_to_station"].min()
max_bound = h3_regions["distance_to_station"].max()

h3_regions["normalized_distance_to_station"] = (h3_regions["distance_to_station"] - min_bound) / (
    max_bound - min_bound
)

h3_regions

In [None]:
cmap = load_cmap("Temps", cmap_type="continuous", reverse=False)

madrid_h3_regions = h3_regions[h3_regions.city == "Madrid"]
madrid_station_points = bicycle_stations_per_city["Madrid"].representative_point()

# Cells coloured by normalized distance, with the stations drawn on top as black points.
ax = madrid_h3_regions.plot(
    column="normalized_distance_to_station",
    cmap=cmap,
    alpha=0.6,
    figsize=(20, 20),
)
madrid_station_points.plot(ax=ax, color="black", markersize=3)

cx.add_basemap(
    ax,
    crs=h3_regions.crs,
    source=cx.providers.CartoDB.PositronNoLabels,
    zoom=13,
)
ax.set_axis_off()
ax.set_title("Distance to the nearest bike station in Madrid", fontsize=20)

In [None]:
# Overture Maps release is pinned so the loaded features stay the same between runs.
overture_loader = OvertureMapsLoader(
    include_all_possible_columns=False,
    release="2024-12-18.0",
)
features = overture_loader.load(area=h3_regions)
features

In [None]:
# Join the H3 regions with the loaded features using the IntersectionJoiner.
joint = IntersectionJoiner().transform(regions=h3_regions, features=features)
# Display the join result.
joint

In [None]:
# The embedder uses the same neighbourhood distance (H3_NEIGHBOURS) that was added to the
# prediction buffer when the regions were expanded.
embedder = ContextualCountEmbedder(
    neighbourhood=H3Neighbourhood(),
    neighbourhood_distance=H3_NEIGHBOURS,
    concatenate_vectors=False,
    count_subcategories=False,
)
embeddings = embedder.transform(
    regions_gdf=h3_regions[["geometry"]],
    features_gdf=features,
    joint_gdf=joint,
)
embeddings

In [None]:
# Column used as the regression label for the model.
target = "distance_to_station"

In [None]:
# Attach the embeddings to the regions of each city (merge on the shared index).
madrid_data, valencia_data, seville_data = (
    h3_regions[h3_regions["city"] == city].merge(embeddings, left_index=True, right_index=True)
    for city in ("Madrid", "Valencia", "Seville")
)

madrid_data.shape, valencia_data.shape, seville_data.shape

In [None]:
# Peek at the first rows of the merged Madrid data.
madrid_data.head()

In [None]:
# Model inputs are the embedding columns, standardized separately for every city.
# NOTE(review): each city gets its own freshly fitted StandardScaler, so Seville and Valencia
# are not scaled with the statistics of Madrid (the training city) — confirm this is intended.
feature_columns = list(embeddings.columns)

x_madrid = StandardScaler().fit_transform(madrid_data[feature_columns])
x_seville = StandardScaler().fit_transform(seville_data[feature_columns])
x_valencia = StandardScaler().fit_transform(valencia_data[feature_columns])

# Labels: the target column of every city.
y_madrid = madrid_data[target]
y_seville = seville_data[target]
y_valencia = valencia_data[target]

x_madrid.shape, y_madrid.shape, x_seville.shape, y_seville.shape, x_valencia.shape, y_valencia.shape

In [None]:
# XGBoost hyper-parameters; the notebook-wide SEED is reused here.
params = dict(
    objective="reg:squarederror",
    max_depth=6,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=SEED,
)
num_rounds = 1000

# Train on Madrid only, restricted to cells within the prediction buffer.
mask = y_madrid <= H3_PREDICTION_BUFFER
dtrain = xgb.DMatrix(x_madrid[mask], label=y_madrid[mask])

bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

In [None]:
# NOTE(review): the returned palette is neither assigned nor used by later cells;
# presumably a leftover palette preview — confirm whether this cell can be removed.
load_cmap("Avedon")

In [None]:
# One palette entry per distance value 0..H3_PREDICTION_BUFFER.
cmap = load_cmap("pal12", reverse=True, keep_first_n=H3_PREDICTION_BUFFER + 1)

_, axs = plt.subplots(2, 3, figsize=(12, 8), sharey=True, sharex=True, dpi=600)

axs[0, 0].set_ylabel("Predicted distance to station")
axs[1, 0].set_ylabel("Predicted distance to station")

# One column per city: regression scatter on top, violin plot below.
city_datasets = [
    (x_madrid, y_madrid, "Madrid"),
    (x_valencia, y_valencia, "Valencia"),
    (x_seville, y_seville, "Seville"),
]

for idx, (x, y, city_name) in enumerate(city_datasets):
    # Evaluate only on cells within the prediction buffer, as in training.
    mask = y <= H3_PREDICTION_BUFFER
    true_values = y[mask]
    predicted_values = bst.predict(xgb.DMatrix(x[mask]))

    scatter_ax = axs[0, idx]
    violin_ax = axs[1, idx]

    sns.regplot(
        x=true_values,
        y=predicted_values,
        ax=scatter_ax,
        scatter=True,
        order=2,
        scatter_kws={
            "alpha": 0.02,
            # Colour every point by its true distance value.
            "color": [cmap.colors[_y] for _y in true_values],
        },
        line_kws={"color": "black"},
        x_jitter=0.1,
    )
    sns.violinplot(
        x=true_values,
        y=predicted_values,
        ax=violin_ax,
        fill=True,
        palette=cmap.colors,
        hue=true_values,
        legend=False,
    )

    scatter_ax.set_title(city_name)
    scatter_ax.set_xlabel(None)
    violin_ax.set_xlabel("Distance to station")

plt.tight_layout()

In [None]:
# City order here (Madrid, Seville, Valencia) must match the order of the predictions below,
# because the predictions are attached positionally.
selected_columns = [target, "city", "geometry"]
concatenated_regions = pd.concat([madrid_data, seville_data, valencia_data])[selected_columns]

city_predictions = [
    bst.predict(xgb.DMatrix(city_x)) for city_x in (x_madrid, x_seville, x_valencia)
]
concatenated_regions["predicted_distance_to_station"] = np.concatenate(city_predictions).round(2)

# Keep only the cells within the prediction buffer.
within_buffer = concatenated_regions[target] <= H3_PREDICTION_BUFFER
concatenated_regions = concatenated_regions[within_buffer]

# Signed error: positive when the model predicts a larger distance than the true one.
clipped_target = concatenated_regions[target].clip(0, H3_PREDICTION_BUFFER)
concatenated_regions["prediction_error"] = (
    concatenated_regions["predicted_distance_to_station"] - clipped_target
)

concatenated_regions

In [None]:
cmap = load_cmap("Temps", cmap_type="continuous", reverse=False)

# One map per city: cells coloured by the predicted distance, stations drawn on top.
for city_name in cities_names:
    city_data = concatenated_regions[concatenated_regions.city == city_name]
    ax = city_data.plot(
        column="predicted_distance_to_station",
        figsize=(20, 20),
        cmap=cmap,
        alpha=0.8,
        legend=True,
        legend_kwds={
            "shrink": 0.3,
            "location": "bottom",
            "label": "Predicted distance to station",
            "pad": -0.05,
        },
        vmin=0,
        # The colour scale spans at least the prediction buffer (previously a hard-coded 10)
        # and grows when the model predicts beyond it.
        vmax=max(H3_PREDICTION_BUFFER, city_data["predicted_distance_to_station"].max()),
    )
    bicycle_stations_per_city[city_name].representative_point().plot(
        ax=ax, color="black", markersize=3, alpha=0.4
    )

    cx.add_basemap(ax, crs=h3_regions.crs, source=cx.providers.CartoDB.PositronNoLabels, zoom=13)
    ax.set_axis_off()
    ax.set_title(f"Predicted distance to the nearest bike station in {city_name}", fontsize=20)

    plt.show()

In [None]:
cmap = load_cmap("TangerineBlues", cmap_type="continuous")

# One map per city: cells coloured by the signed prediction error, stations drawn on top.
for city_name in cities_names:
    city_data = concatenated_regions[concatenated_regions.city == city_name].copy()

    # Per-city extremes, computed once; evaluating them inside the per-row lambda
    # rescanned the whole column for every row.
    min_error = city_data["prediction_error"].min()
    max_error = city_data["prediction_error"].max()

    # Negative errors are scaled by the most negative error and positive errors by the
    # largest one, giving values in [-1, 1]; these are then shifted to [0, 1] so that a
    # zero error lands on 0.5.
    city_data["normalized_prediction_error"] = (
        city_data["prediction_error"].apply(
            lambda x, min_error=min_error, max_error=max_error: (
                -x / min_error if x < 0 else x / max_error
            )
        )
        + 1
    ) / 2

    # Opacity grows with the error magnitude: 0 at zero error, 1 at either extreme.
    city_data["normalized_prediction_error_alpha"] = (
        city_data["normalized_prediction_error"] - 0.5
    ).abs() * 2

    ax = city_data.plot(
        column="normalized_prediction_error",
        figsize=(20, 20),
        cmap=cmap,
        alpha=city_data["normalized_prediction_error_alpha"],
        # Pin the colour scale to [0, 1] so the fixed ticks below and the "0" label at 0.5
        # stay correct even when all errors in a city share the same sign.
        vmin=0,
        vmax=1,
        legend=True,
        legend_kwds={
            "shrink": 0.3,
            "location": "bottom",
            "label": "Distance prediction error",
            "pad": -0.05,
            "ticks": [0, 0.5, 1],
            # Label the normalized ticks with the original error values.
            "format": mticker.FixedFormatter(
                [
                    min_error.round(2),
                    "0",
                    max_error.round(2),
                ]
            ),
        },
    )
    bicycle_stations_per_city[city_name].representative_point().plot(
        ax=ax, color="black", markersize=3, alpha=0.4
    )

    cx.add_basemap(ax, crs=h3_regions.crs, source=cx.providers.CartoDB.PositronNoLabels, zoom=13)
    ax.set_axis_off()
    ax.set_title(f"Distance prediction error in {city_name}", fontsize=20)

    plt.show()