In [None]:
import copy
import logging
import os
import warnings
from typing import Optional

import geopandas as gpd
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset
from shapely.geometry import Polygon
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from tqdm import tqdm

from srai.benchmark import HexRegressionEvaluator
from srai.datasets import AirbnbMulticityDataset
from srai.embedders import Hex2VecEmbedder  # noqa: F401
from srai.h3 import h3_to_geoseries
from srai.joiners import IntersectionJoiner
from srai.loaders.osm_loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import HEX2VEC_FILTER
from srai.neighbourhoods.h3_neighbourhood import H3Neighbourhood
from srai.plotting import plot_numeric_data
from srai.regionalizers import H3Regionalizer

In [None]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"
# H3 resolution shared by the regionalizer and the requested dataset version.
resolution = 9
# Layer widths handed to the Hex2Vec embedder.
embedder_hidden_sizes = [150, 75, 50]

In [None]:
# Hexagon generator working at the configured H3 resolution.
regionalizer = H3Regionalizer(resolution=resolution)
# Standardizer that is later fitted on the numerical columns of the train split.
scaler = StandardScaler()

In [None]:
# Download the Airbnb dataset version that matches the chosen H3 resolution.
# The Hugging Face token is read from the environment (None when unset).
airbnb = AirbnbMulticityDataset()
ds = airbnb.load(hf_token=os.getenv("HF_TOKEN"), version=str(resolution))
train = ds["train"]
test = ds["test"]

Create dev split from train split

In [None]:
# Rebind `train` and create `dev` from the dataset's bucketed split helper
# (called with test_size=0.1 and dev=True); the previous `train` binding is replaced.
train, dev = airbnb.train_test_split_bucket_regression(test_size=0.1, dev=True)

Get information about available categorical and numerical columns

In [None]:
# Display the dataset's categorical and numerical column lists as a tuple.
airbnb.categorical_columns, airbnb.numerical_columns

In [None]:
# Work on copies so the splits returned by the dataset object are left untouched.
train_, dev_, test_ = (split.copy() for split in (train, dev, test))

Get h3 indexes for data points

In [None]:
def join_points_to_regions(
    points_gdf: gpd.GeoDataFrame, regions_gdf: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Attach the containing region index to every point as an ``h3_index`` column.

    Args:
        points_gdf (gpd.GeoDataFrame): point geometries to annotate
        regions_gdf (gpd.GeoDataFrame): regions whose index identifies each hexagon

    Returns:
        gpd.GeoDataFrame: left spatial join of the points with the regions, where the
            column carrying the region index is named ``h3_index``
    """
    joined = gpd.sjoin(points_gdf, regions_gdf, how="left", predicate="within")  # noqa: E501
    # Older GeoPandas always calls the right-index column "index_right"; GeoPandas >= 1.0
    # keeps the name of the right frame's index instead. Support both so that the
    # downstream groupby("h3_index") does not break after an upgrade.
    right_index_column = (
        "index_right" if "index_right" in joined.columns else regions_gdf.index.name
    )
    return joined.rename(columns={right_index_column: "h3_index"})


regions_train = regionalizer.transform(train_)
joined_train = join_points_to_regions(train_, regions_train)

regions_dev = regionalizer.transform(dev_)
joined_dev = join_points_to_regions(dev_, regions_dev)

regions_test = regionalizer.transform(test_)
joined_test = join_points_to_regions(test_, regions_test)

In [None]:
# Preview the first regions produced by the regionalizer for the train split.
regions_train.head()

In [None]:
# Number of regions produced by the regionalizer for the test split.
len(regions_test)

Scale numerical data

In [None]:
# Columns that are averaged per hexagon: every numerical feature plus the target.
columns_to_add = airbnb.numerical_columns + [airbnb.target]

# Fit the scaler on the train split only, then reuse the fitted statistics for dev/test.
scaled_train = scaler.fit_transform(joined_train[airbnb.numerical_columns])
joined_train[airbnb.numerical_columns] = scaled_train
train_averages_hex = joined_train.groupby("h3_index")[columns_to_add].mean()

for held_out in (joined_dev, joined_test):
    held_out[airbnb.numerical_columns] = scaler.transform(held_out[airbnb.numerical_columns])

# Aggregate the listings of each hexagon into one row of mean values.
dev_averages_hex = joined_dev.groupby("h3_index")[columns_to_add].mean()
test_averages_hex = joined_test.groupby("h3_index")[columns_to_add].mean()

Embed H3 regions into vectors. Use the srai library to train spatial embeddings on the train split with a chosen embedder (e.g. Hex2Vec or GeoVex), then use the fitted embedder to produce embeddings for the hexagons in the train, dev and test splits.

In [None]:
embedder = Hex2VecEmbedder(embedder_hidden_sizes)

# Inputs required for fitting: OSM features of the train regions, the
# region/feature intersections and the neighbourhood of the train regions.
osm_features = OSMPbfLoader().load(regions_train, HEX2VEC_FILTER)
region_intersect_train = IntersectionJoiner().transform(regions_train, osm_features)
neighbourhood = H3Neighbourhood(regions_train)

fit_arguments = {
    "regions_gdf": regions_train,
    "features_gdf": osm_features,
    "joint_gdf": region_intersect_train,
    "neighbourhood": neighbourhood,
    "trainer_kwargs": {"max_epochs": 10, "accelerator": device},
}

# Every warning raised while fitting is silenced inside this context.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embedder.fit(**fit_arguments)

In [None]:
# Train split: reuse the OSM features and intersections computed for fitting.
embeddings_train = embedder.transform(
    regions_gdf=regions_train,
    features_gdf=osm_features,
    joint_gdf=region_intersect_train,
).assign(h3=lambda frame: frame.index)

# Dev split: load its own OSM features, intersect, then embed with the fitted embedder.
osm_features_dev = OSMPbfLoader().load(regions_dev, HEX2VEC_FILTER)
region_intersect_dev = IntersectionJoiner().transform(regions_dev, osm_features_dev)
embeddings_dev = embedder.transform(
    regions_gdf=regions_dev,
    features_gdf=osm_features_dev,
    joint_gdf=region_intersect_dev,
).assign(h3=lambda frame: frame.index)

# Test split: same procedure as for the dev split.
osm_features_test = OSMPbfLoader().load(regions_test, HEX2VEC_FILTER)
region_intersect_test = IntersectionJoiner().transform(regions_test, osm_features_test)
embeddings_test = embedder.transform(
    regions_gdf=regions_test,
    features_gdf=osm_features_test,
    joint_gdf=region_intersect_test,
).assign(h3=lambda frame: frame.index)

In [None]:
# Inner-join each split's embeddings with its per-hexagon averages
# (embedding key "region_id" against averages key "h3_index").
merge_options = {"how": "inner", "left_on": "region_id", "right_on": "h3_index"}

merged_train = embeddings_train.merge(train_averages_hex, **merge_options)
merged_dev = embeddings_dev.merge(dev_averages_hex, **merge_options)
merged_test = embeddings_test.merge(test_averages_hex, **merge_options)

# Model inputs: every merged column except the hexagon id and the target.
excluded_columns = {"h3", airbnb.target}
merge_columns = [col for col in merged_train.columns if col not in excluded_columns]

Combine numerical columns with the embedding vector

In [None]:
def concat_columns(row: pd.Series) -> np.ndarray:
    """
    Concatenate all values of a row into one flat feature vector.

    Scalars become one-element arrays and array-like values are flattened, so a row
    mixing numbers and arrays yields a single 1-D array in column order.

    Args:
        row (pd.Series): one row as passed by ``DataFrame.apply(..., axis=1)``

    Returns:
        np.ndarray: flat 1-D array with all values of the row; empty for an empty row
    """
    parts = [np.ravel(val) for val in row.to_numpy()]
    if not parts:
        # np.concatenate raises on an empty sequence; an empty row maps to an empty vector.
        return np.empty(0)
    return np.concatenate(parts)

Get final version of data splits (X - embedding vector, X_h3_idx - h3 index, y - target value)

In [None]:
def _to_torch_dataset(merged) -> Dataset:
    """
    Build a torch-formatted dataset from one merged split.

    Args:
        merged: merged frame of a split holding the feature columns, "h3" and the target

    Returns:
        Dataset: dataset with columns "X", "X_h3_idx" and "y" set to torch format
    """
    dataset = Dataset.from_dict(
        {
            "X": merged[merge_columns].apply(concat_columns, axis=1).values,
            "X_h3_idx": merged["h3"].values,
            "y": merged[airbnb.target].values,
        }
    )
    dataset.set_format(type="torch", columns=["X", "X_h3_idx", "y"])
    return dataset


train_dataset = _to_torch_dataset(merged_train)
dev_dataset = _to_torch_dataset(merged_dev)
test_dataset = _to_torch_dataset(merged_test)

In [None]:
# Show the first formatted training sample.
train_dataset[0]

In [None]:
# Length of one input vector. It is read from a single sample, which avoids
# materialising the whole "X" column and does not rely on column access
# returning a stacked 2-D tensor.
embedding_size = train_dataset[0]["X"].shape[0]
embedding_size

Model definition


In [None]:
"""
Regression model

Contains implementation of base model of regression.
"""


class RegressionBaseModel(nn.Module):  # type: ignore
    """
    Regression base model.

    A stack of linear layers, each followed by an activation, with dropout (p=0.2)
    inserted after every odd-indexed layer and a final linear layer producing one value.
    """

    def __init__(
        self,
        embeddings_size: int,
        linear_sizes: Optional[list[int]] = None,
        activation_function: Optional[nn.Module] = None,
    ):
        """
        Initialization of regression module.

        Args:
            embeddings_size (int): size of input embedding
            linear_sizes (Optional[list[int]], optional): sizes of linear layers inside module. \
                Defaults to [500, 1000].
            activation_function (Optional[nn.Module], optional): activation function from torch.nn \
                Defaults to ReLU. Each layer receives its own copy of the given module.
        """
        super().__init__()
        if linear_sizes is None:
            linear_sizes = [500, 1000]
        if activation_function is None:
            activation_function = nn.ReLU()
        self.model = torch.nn.Sequential()
        previous_size = embeddings_size
        for cnt, size in enumerate(linear_sizes):
            self.model.add_module(f"linear_{cnt}", nn.Linear(previous_size, size))
            # Register an independent copy per layer: reusing one instance would tie the
            # parameters of a parametrised activation (e.g. PReLU) across all layers.
            # The "ReLU_" prefix is kept so existing state_dict keys stay valid.
            self.model.add_module(f"ReLU_{cnt}", copy.deepcopy(activation_function))
            previous_size = size
            if cnt % 2:
                # Dropout only follows odd-indexed layers (1, 3, ...).
                self.model.add_module(f"dropout_{cnt}", nn.Dropout(p=0.2))
        self.model.add_module("linear_final", nn.Linear(previous_size, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the model.

        Args:
            x (torch.Tensor): Vector data

        Returns:
            torch.Tensor: target value
        """
        return self.model(x)

Training parameters

In [None]:
# The batches are moved to `device` in the training loop, so the model has to live
# there too; it is moved before the optimizer is created from its parameters.
regression_model = RegressionBaseModel(embedding_size).to(device)
loss_fn = nn.L1Loss()
optimizer = optim.Adam(regression_model.parameters(), lr=0.001)
epochs = 50
batch_size = 32
# The trained weights are written to the current working directory.
save_dir = os.getcwd()

In [None]:
# Only the training data is shuffled. The dev loss is an average of per-batch losses,
# so a fixed batch composition keeps it comparable between epochs for early stopping.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Evaluator used for the per-batch dev metrics and the final test evaluation.
evaluator = HexRegressionEvaluator()

In [None]:
# Batches are moved to `device` below, so the model must be there as well
# (this is a no-op when the model already lives on that device).
regression_model.to(device)

early_stopping_patience = 5  # epochs without dev-loss improvement before stopping
stop_counter = 0
prev_eval_loss = np.inf  # best (lowest) dev loss seen so far
best_weights = copy.deepcopy(regression_model.state_dict())
loss_eval = []
loss_train = []
metrics_results = []

for epoch in range(epochs):
    batch_loss_list = []
    regression_model.train()
    for batch in tqdm(
        train_dataloader,
        desc=f"Epoch: {epoch}",
        total=len(train_dataloader),
    ):
        inputs = batch["X"].to(device)
        labels = batch["y"].to(device).reshape(-1, 1)

        outputs = regression_model(inputs)
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_loss_list.append(loss.item())

    logging.info(f"Epoch [{epoch+1}/{epochs}], avg_loss: {np.mean(batch_loss_list):.4f}")
    loss_train.append(np.mean(batch_loss_list))

    regression_model.eval()
    metrics_per_batch = []
    batch_eval_loss = []
    with torch.no_grad():
        for i, batch in tqdm(
            enumerate(dev_dataloader),
            desc="Evaluation",
            total=len(dev_dataloader),
        ):
            inputs = batch["X"].to(device)
            labels = batch["y"].to(device).reshape(-1, 1)

            outputs = regression_model(inputs)
            loss = loss_fn(outputs, labels)
            batch_eval_loss.append(float(loss.item()))

            # Tensors must be on the CPU before they can be converted to NumPy arrays.
            metrics = evaluator._compute_metrics(outputs.cpu().numpy(), labels.cpu().numpy())
            metrics_per_batch.append({"Batch": i, **metrics})

    # Average every metric over the dev batches (nothing to average for an empty loader).
    mean_metrics = (
        {
            key: np.mean([batch_metrics[key] for batch_metrics in metrics_per_batch])
            for key in metrics_per_batch[0].keys()
            if key != "Batch"
        }
        if metrics_per_batch
        else {}
    )
    metrics_results.append(mean_metrics)
    loss_eval.append(np.mean(batch_eval_loss))
    logging.info(f"Evaluation loss: {loss_eval[-1]:.4f}")

    # Early stopping: remember the weights of the best epoch and stop once the dev loss
    # has not improved for `early_stopping_patience` consecutive epochs.
    if loss_eval[-1] < prev_eval_loss:
        prev_eval_loss = loss_eval[-1]
        best_weights = copy.deepcopy(regression_model.state_dict())
        stop_counter = 0
    else:
        stop_counter += 1
        if stop_counter >= early_stopping_patience:
            logging.info(f"Early stopping at epoch {epoch}")
            break

# Restore the best epoch's weights so the saved file matches its "best model" name.
regression_model.load_state_dict(best_weights)

torch.save(regression_model.state_dict(), os.path.join(save_dir, "airbnb_best_model.pkl"))

In [None]:
# Inference over the test split: collect hexagon ids, optional points and predictions.
regression_model.eval()
h3_indexes, xy_points, all_predictions = [], [], []

with torch.no_grad():
    progress = tqdm(test_dataloader, desc="Predicting...", total=len(test_dataloader))
    for batch in progress:
        indexes = batch["X_h3_idx"]
        # Batches without a "point" entry get an empty-string placeholder per sample.
        if "point" in batch:
            points = batch["point"]
        else:
            points = ["" for _ in indexes]

        outputs = regression_model(batch["X"].to(device))

        h3_indexes.extend(indexes)
        xy_points.extend(points)
        all_predictions.extend(outputs.cpu().numpy())

In [None]:
# Evaluate the collected test predictions; `region_ids` carries the H3 index of each
# prediction in the same order. The returned value is displayed as the cell output.
evaluator.evaluate(
    dataset=airbnb, predictions=all_predictions, region_ids=h3_indexes, log_metrics=False
)

Results visualisation

In [None]:
# Collect the ground-truth target and hexagon id of every test sample, in dataset order.
original_label, original_hexes = [], []
for position in range(len(test_dataset)):
    sample = test_dataset[position]
    original_label.append(sample["y"])
    original_hexes.append(sample["X_h3_idx"])

In [None]:
def build_price_gdf(hex_ids: list[str], prices: list) -> gpd.GeoDataFrame:
    """
    Build a GeoDataFrame of hexagons with a price value, indexed by region id.

    Args:
        hex_ids (list[str]): H3 indexes, one per price
        prices (list): one-element values exposing ``.item()`` (tensors or arrays),
            in the same order as ``hex_ids``

    Returns:
        gpd.GeoDataFrame: hexagon geometries in EPSG:4326 with "price" and "region_id"
            columns, indexed by "region_id"
    """
    hex_ids = list(hex_ids)
    price_gdf = gpd.GeoDataFrame(geometry=list(h3_to_geoseries(hex_ids)))
    # Use the authority string: the {"init": "epsg:4326"} dict form is deprecated.
    price_gdf = price_gdf.set_crs("EPSG:4326", allow_override=True)
    price_gdf["price"] = [value.item() for value in prices]
    price_gdf["region_id"] = hex_ids
    price_gdf.index = price_gdf["region_id"]
    return price_gdf


# Predicted and ground-truth prices share the same structure, hence the same builder.
preds_gdf = build_price_gdf(h3_indexes, all_predictions)
original_gdf = build_price_gdf(original_hexes, original_label)

In [None]:
# Rebuild the regionalizer, derive regions from the ground-truth hexagon polygons and
# plot the ground-truth "price" on them.
regionalizer = H3Regionalizer(resolution=resolution)
regions = regionalizer.transform(original_gdf)
plot_numeric_data(regions, "price", original_gdf)
# TODO(review): the original author flagged this plot as looking wrong ("what is wrong??").
# The cause is not evident from this cell; investigate before relying on the figure.

In [None]:
# Plot the predicted "price" on the same regions as the ground-truth plot.
plot_numeric_data(regions, "price", preds_gdf)