In [None]:
import os

import geopandas as gpd
import torch
import torch.nn as nn
import torch.optim as optim
from shapely.geometry import Polygon
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

from srai.datasets import AirbnbMulticityDataset
from srai.h3 import h3_to_geoseries
from srai.models import Evaluator, Predictor, RegressionBaseModel, Trainer, Vectorizer
from srai.plotting import plot_numeric_data
from srai.regionalizers import H3Regionalizer

In [None]:
# Load the multi-city Airbnb dataset; the Hugging Face token is read from the
# HF_TOKEN environment variable (None is passed when it is unset).
gdf_airbnb = AirbnbMulticityDataset().load(os.getenv("HF_TOKEN"))

# Keep only the rows whose "city" value is "paris".
is_paris = gdf_airbnb["city"].isin(["paris"])
gdf_airbnb = gdf_airbnb.loc[is_paris]

In [None]:
# H3 resolution used throughout the notebook: it is passed to the Vectorizer,
# to Predictor.predict and to the H3Regionalizer, so all three stay on the
# same hexagon grid.
resolution = 8

In [None]:
# Candidate numeric feature columns.
# NOTE(review): this list is not passed to the Vectorizer below (no
# `numerical_columns=` argument is given); it is kept so that any cell relying
# on the name keeps working.
numerical_columns = [
    "number_of_reviews",
]

# Embedder used to produce the per-hexagon feature vectors.
# Only ONE Vectorizer is built: constructing a "Hex2VecEmbedder" vectorizer and
# then immediately rebinding `vectorizer` to a "GeoVexEmbedder" one throws the
# first result away. Change this value to "Hex2VecEmbedder" to compare embedders.
embedder_type = "GeoVexEmbedder"

vectorizer = Vectorizer(
    gdf_dataset=gdf_airbnb,
    HF_dataset_object=AirbnbMulticityDataset(),
    target_column_name="price",
    embedder_type=embedder_type,
    h3_resolution=resolution,
)

In [None]:
# Fetch the dataset object prepared by the vectorizer.
dataset_airbnb = vectorizer.get_dataset()
# Size of the second dimension of the "X" column; this value is handed to
# RegressionBaseModel further down (presumably as the model's input width —
# TODO confirm against the srai API).
embedding_size = dataset_airbnb["X"].shape[1]

In [None]:
# Split the row positions of the dataset 70% / 30% into training and hold-out.
# `random_state` is fixed so that Restart & Run All reproduces the same split
# (without it every run trains and evaluates on different samples).
train_indices, test_indices = train_test_split(
    range(len(dataset_airbnb)),
    test_size=0.3,
    random_state=42,
)

# Lazy views over the dataset restricted to the selected positions.
train_split = Subset(dataset_airbnb, train_indices)
test_split = Subset(dataset_airbnb, test_indices)

In [None]:
# Split the hold-out subset in half: one half for validation, one for testing.
# The indices here are positions INSIDE the current `test_split`, not positions
# in the full dataset. `random_state` is fixed for reproducibility.
#
# NOTE: this cell rebinds `test_split` (and `test_indices`) from its own
# previous value, so it is not idempotent — re-running it on its own halves
# the test set again. Re-run the preceding split cell first.
val_indices, test_indices = train_test_split(
    range(len(test_split)),
    test_size=0.5,
    random_state=42,
)
val_split = Subset(test_split, val_indices)
test_split = Subset(test_split, test_indices)

In [None]:
# Seed torch's global RNG before the model is created so that anything drawing
# from it afterwards (presumably weight initialisation and any torch-based
# shuffling during training — confirm) is the same on every
# Restart & Run All.
torch.manual_seed(42)

# Regression model constructed from the embedding size computed above.
regression_model = RegressionBaseModel(embedding_size)

In [None]:
# Train on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# L1 (mean absolute error) objective, optimised with Adam.
loss_fn = nn.L1Loss()
optimizer = optim.Adam(regression_model.parameters(), lr=0.001)

# Remaining Trainer options, kept in one dict and unpacked into the call.
# "metric2look4" is presumably the metric the Trainer monitors — TODO confirm.
args = dict(
    batch_size=32,
    task="regression",
    epochs=50,
    device=device,
    metric2look4="MAE",
)

trainer = Trainer(
    model=regression_model,
    train_dataset=train_split,
    eval_dataset=val_split,
    optimizer=optimizer,
    loss_fn=loss_fn,
    **args,
)

In [None]:
# Run training. The call returns three values; only the first is kept (as
# `model`) and the other two are discarded.
model, _, _ = trainer.train()

In [None]:
# Evaluator configured for the regression task on the same device as training.
evaluator = Evaluator(task="regression", device=device)

In [None]:
# Evaluate the trained model on the held-out test split.
# NOTE(review): with return_metrics=False the call presumably reports the
# metrics instead of returning them — confirm against the srai Evaluator docs.
evaluator.evaluate(model, test_split, return_metrics=False)

In [None]:
# Predictor for the regression task, running on the same device as training.
predictor = Predictor("regression", device=device)

In [None]:
# Predict on the test split. The first returned value is discarded; `hexes`
# is later converted to polygons with h3_to_geoseries and `values` holds
# the per-hex predictions (each element is read with `.item()` below).
_, hexes, values = predictor.predict(model, test_split, resolution=resolution)

In [None]:
# Collect the ground-truth target ("y") and the H3 cell id ("X_h3_idx") of
# every test sample. Each sample is fetched from the dataset exactly once;
# indexing `test_split[i]` separately for each field would load every sample
# twice.
original_label = []
original_hexes = []
for sample_idx in range(len(test_split)):
    sample = test_split[sample_idx]
    original_label.append(sample["y"])
    original_hexes.append(sample["X_h3_idx"])

In [None]:
# --- Predicted prices -------------------------------------------------------
# One hexagon polygon per predicted H3 cell.
polygons = h3_to_geoseries(
    hexes,
)
# Declare the coordinates as WGS84. `set_crs(..., allow_override=True)` is used
# instead of assigning `.crs = {"init": "epsg:4326"}`: the "init" dict form is
# deprecated in pyproj, and overriding an existing CRS through the attribute is
# discouraged by geopandas.
preds_gdf = gpd.GeoDataFrame(geometry=polygons).set_crs("EPSG:4326", allow_override=True)
preds_gdf["price"] = [tensor.item() for tensor in values]
preds_gdf["region_id"] = hexes
# Index by region id while also keeping it available as a regular column.
preds_gdf.index = preds_gdf["region_id"]

# --- Ground-truth prices ----------------------------------------------------
# Built exactly like the predictions frame: the geometries returned by
# h3_to_geoseries are used directly (re-wrapping each one in Polygon(...) is
# unnecessary).
original_polygons = h3_to_geoseries(original_hexes)
original_gdf = gpd.GeoDataFrame(geometry=original_polygons).set_crs(
    "EPSG:4326", allow_override=True
)
original_gdf["price"] = [tensor.item() for tensor in original_label]
original_gdf["region_id"] = original_hexes
original_gdf.index = original_gdf["region_id"]

In [None]:
# Derive H3 regions (at the notebook-wide resolution) from the ground-truth
# frame, then plot the true "price" values on those regions.
regionalizer = H3Regionalizer(resolution=resolution)
regions = regionalizer.transform(original_gdf)
# Last expression of the cell: its return value is what the notebook displays.
plot_numeric_data(regions, "price", original_gdf)

In [None]:
# Plot the predicted "price" values on the same regions as the ground-truth
# map above, so the two figures can be compared side by side.
plot_numeric_data(regions, "price", preds_gdf)