In [None]:
from srai.embedders import GeoVexEmbedder, CountEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
from srai.plotting import plot_regions, plot_numeric_data
from pytorch_lightning import seed_everything

In [None]:
# Fix one seed up front; it is reused further down as KMeans' `random_state`.
SEED = 71
seed_everything(seed=SEED)

### Load data from OSM


First, use geocoding to get the area.


In [None]:
# Geocode the place name into `area_gdf`, then preview it on a map.
area_name = "Greater London, UK"
area_gdf = geocode_to_region_gdf(area_name)
plot_regions(
    area_gdf,
    tiles_style="CartoDB positron",
)

### Buffer the Area

The GeoVex embedder requires a buffer around the area of interest, because every hexagon must have all of its neighbours within radius k present in the dataset as well.
The buffer is defined in hexagon-ring units, so a buffer of 1 means that every hexagon in the area will also have its 1-neighbourhood in the dataset.


In [None]:
# Peek at the first rows of the geocoded area before regionalizing it.
area_gdf.head(n=5)

In [None]:
from srai.h3 import ring_buffer_h3_regions_gdf

# H3 resolution of the regions and the width of the buffer, in hexagon rings.
# `k_ring_buffer_radius` is reused later as the GeoVex embedder's `neighbourhood_radius`.
resolution = 9
k_ring_buffer_radius = 4

# Regionalize the geocoded area with the H3 regionalizer.
regionalizer = H3Regionalizer(resolution=resolution)
base_h3_regions = regionalizer.transform(area_gdf)

# Extend the base regions by the buffer, then merge them into a single geometry
# that is passed to the loader when downloading the data.
# NOTE(review): `unary_union` is deprecated in geopandas >= 1.0 in favour of
# `union_all()`; switch once the minimum supported geopandas version allows it.
buffered_h3_regions = ring_buffer_h3_regions_gdf(
    base_h3_regions,
    distance=k_ring_buffer_radius,
)
buffered_h3_geometry = buffered_h3_regions.unary_union

print(f"Base regions: {len(base_h3_regions)}")
print(f"Buffered regions: {len(buffered_h3_regions)}")

### Download the Data


Next, download the data for the selected region and the specified tags. We're using `OSMPbfLoader` here with the full `HEX2VEC_FILTER` tag set, as it is better suited to large numbers of tags than `OSMOnlineLoader`.


In [None]:
from srai.loaders.osm_loaders.filters import HEX2VEC_FILTER

# Load the OSM features selected by the Hex2Vec tag filter for the buffered geometry.
loader = OSMPbfLoader()
tags = HEX2VEC_FILTER

features_gdf = loader.load(
    buffered_h3_geometry,
    tags,
)

## Prepare the data for embedding


After downloading the data, we need to prepare it for embedding. In the previous steps we regionalized the selected area and buffered it; now we have to join the features with the prepared regions.


In [None]:
# Show the buffered set of H3 regions that the features will be joined to.
plot_regions(
    buffered_h3_regions,
    tiles_style="CartoDB positron",
)

In [None]:
# Join the downloaded features to the buffered regions with the intersection joiner.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(
    buffered_h3_regions,
    features_gdf,
)
joint_gdf

## GeoVex-Embedding


After preparing the data we can proceed with generating embeddings for the regions.


In [None]:
import warnings

neighbourhood = H3Neighbourhood(buffered_h3_regions)

# One "<key>_<value>" feature name for every key / value pair in the Hex2Vec filter.
target_features = [
    f"{tag_key}_{tag_value}"
    for tag_key, tag_values in HEX2VEC_FILTER.items()
    for tag_value in tag_values
]

embedder = GeoVexEmbedder(
    target_features=target_features,
    neighbourhood=neighbourhood,
    batch_size=8,
    # Same radius that was used to buffer the regions.
    neighbourhood_radius=k_ring_buffer_radius,
    convolutional_layers=2,
    embedding_size=50,
)

# Short CPU run to keep the example quick; raise "max_epochs" (e.g. to 20)
# for a longer training.
geovex_trainer_kwargs = {
    "max_epochs": 2,
    "accelerator": "cpu",
}

# All warnings raised during fitting are silenced for this block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embeddings = embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        trainer_kwargs=geovex_trainer_kwargs,
        learning_rate=0.001,
    )

embeddings.head()

## Hex2Vec Embedding


In [None]:
from srai.embedders import Hex2VecEmbedder
import warnings

neighbourhood = H3Neighbourhood(buffered_h3_regions)

# Encoder layer sizes; the last one (50) matches the GeoVex `embedding_size` used above.
hex2vec_encoder_sizes = [300, 150, 50]
hex2vec_embedder = Hex2VecEmbedder(encoder_sizes=hex2vec_encoder_sizes)

# Short CPU run to keep the example quick; raise "max_epochs" (e.g. to 50)
# for a longer training.
hex2vec_trainer_kwargs = {
    "max_epochs": 5,
    "accelerator": "cpu",
}

# All warnings raised during fitting are silenced for this block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    hex2vec_embeddings = hex2vec_embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        negative_sample_k_distance=2,
        batch_size=64,
        learning_rate=0.001,
        trainer_kwargs=hex2vec_trainer_kwargs,
    )

hex2vec_embeddings.head()

## Comparing the Embeddings


### GeoVex Embedding


#### PCA


In [None]:
# do pca with three components and then cast to RGB
from sklearn.decomposition import PCA
from srai.plotting import plot_numeric_data
import pandas as pd

# Project the embedding dimensions only. The clustering cell further down adds a
# "cluster" label column to `embeddings` in place, so drop it when it is present;
# otherwise re-running this cell after clustering would feed the label into the PCA.
geovex_features = embeddings.drop(columns="cluster", errors="ignore")

pca = PCA(n_components=3)

# make the projected embeddings into a dataframe (columns 0, 1, 2) on the same index
pca_embeddings = pd.DataFrame(
    pca.fit_transform(geovex_features), index=geovex_features.index
)

# convert to RGB: min-max scale every component to 0-255 and truncate to integers
component_min = pca_embeddings.min()
component_range = pca_embeddings.max() - component_min
pca_embeddings = ((pca_embeddings - component_min) / component_range * 255).astype(int)

# make the rgb array into a string (vectorized instead of a row-wise apply)
pca_embeddings["rgb"] = (
    "rgb("
    + pca_embeddings[0].astype(str)
    + ", "
    + pca_embeddings[1].astype(str)
    + ", "
    + pca_embeddings[2].astype(str)
    + ")"
)

# The double `reset_index()` below adds an "index" column holding each region's
# row number; `explore` colours by that column, so map row number -> rgb string.
color_dict = dict(
    enumerate(buffered_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list())
)
buffered_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

#### Clustering


In [None]:
from sklearn.cluster import KMeans

# Cluster on the embedding dimensions only. The labels are written back into
# `embeddings` below, so without this guard a second run of the cell would fit
# KMeans on its own previous "cluster" labels as an extra feature.
geovex_features = embeddings.drop(columns="cluster", errors="ignore")

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(geovex_features)

# Name the index "region_id" and attach the labels; the next cell plots the
# "cluster" column of this frame.
embeddings.index.name = "region_id"
embeddings["cluster"] = clusterizer.labels_
embeddings

In [None]:
# Map the GeoVex cluster label of every region.
plot_numeric_data(
    buffered_h3_regions,
    "cluster",
    embeddings,
    tiles_style="CartoDB positron",
)

### Hex2Vec


#### PCA


In [None]:
# do pca with three components and then cast to RGB
from sklearn.decomposition import PCA
from srai.plotting import plot_numeric_data
import pandas as pd

# Project the embedding dimensions only. The clustering cell further down adds a
# "cluster" label column to `hex2vec_embeddings` in place, so drop it when it is
# present; otherwise re-running this cell after clustering would feed the label
# into the PCA.
hex2vec_features = hex2vec_embeddings.drop(columns="cluster", errors="ignore")

pca = PCA(n_components=3)

# make the projected embeddings into a dataframe (columns 0, 1, 2) on the same index
pca_embeddings = pd.DataFrame(
    pca.fit_transform(hex2vec_features), index=hex2vec_features.index
)

# convert to RGB: min-max scale every component to 0-255 and truncate to integers
component_min = pca_embeddings.min()
component_range = pca_embeddings.max() - component_min
pca_embeddings = ((pca_embeddings - component_min) / component_range * 255).astype(int)

# make the rgb array into a string (vectorized instead of a row-wise apply)
pca_embeddings["rgb"] = (
    "rgb("
    + pca_embeddings[0].astype(str)
    + ", "
    + pca_embeddings[1].astype(str)
    + ", "
    + pca_embeddings[2].astype(str)
    + ")"
)

# The double `reset_index()` below adds an "index" column holding each region's
# row number; `explore` colours by that column, so map row number -> rgb string.
color_dict = dict(
    enumerate(buffered_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list())
)
buffered_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

#### Clustering


In [None]:
from sklearn.cluster import KMeans

# Cluster on the embedding dimensions only. The labels are written back into
# `hex2vec_embeddings` below, so without this guard a second run of the cell
# would fit KMeans on its own previous "cluster" labels as an extra feature.
hex2vec_features = hex2vec_embeddings.drop(columns="cluster", errors="ignore")

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(hex2vec_features)

# Attach the labels; the next cell plots the "cluster" column of this frame.
hex2vec_embeddings["cluster"] = clusterizer.labels_
hex2vec_embeddings

In [None]:
# Map the Hex2Vec cluster label of every region.
plot_numeric_data(
    buffered_h3_regions,
    "cluster",
    hex2vec_embeddings,
    tiles_style="CartoDB positron",
)