In [None]:
import geopandas as gpd
from srai.embedders import ContextualCountEmbedder
from srai.h3 import ring_buffer_h3_regions_gdf
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.regionalizers import H3Regionalizer
from srai.neighbourhoods import H3Neighbourhood

In [None]:
# Load the example datasets from ../data.
bike_data = gpd.read_parquet('../data/bikes_spain_example.geoparquet')
# The price analysis in the cells below reads `airbnb_data`, so it has to be
# loaded here for the notebook to run top-to-bottom on a fresh kernel.
airbnb_data = gpd.read_parquet('../data/airbnb_spain_example.geoparquet')
bike_data

In [None]:
# Number of bike records per value of the `city` column.
city_column = bike_data["city"]
city_column.value_counts()

In [None]:
from lonboard import viz
from lonboard.colormap import apply_continuous_cmap
from palettable.colorbrewer.sequential import YlOrRd_9

In [None]:
# Min-max scale the `price` column so it can drive both colour and radius.
price_column = airbnb_data["price"]
min_bound, max_bound = price_column.min(), price_column.max()
normalized_price_data = (price_column - min_bound) / (max_bound - min_bound)

# Point styling: colour from the YlOrRd_9 palette, radius proportional to the
# normalised price (in meters, never drawn smaller than 2 pixels).
price_point_style = {
    "get_fill_color": apply_continuous_cmap(
        normalized_price_data, YlOrRd_9, alpha=0.7
    ),
    "get_radius": normalized_price_data * 10,
    "radius_units": "meters",
    "radius_min_pixels": 2,
}

viz(airbnb_data, scatterplot_kwargs=price_point_style)

In [None]:
# Regionalize the data into H3 cells at resolution 10 and keep the index of
# the cells produced directly from the data, before any buffering.
listing_regions = H3Regionalizer(resolution=10).transform(airbnb_data)
original_h3_cells = listing_regions.index
# Extend the regions with a ring buffer (distance argument: 10).
h3_regions = ring_buffer_h3_regions_gdf(listing_regions, 10)

# Layer styling: flat blue polygons for the cells, price-coloured points on top.
region_style = dict(get_fill_color="#66CCFF", opacity=0.4, get_line_width=0)
price_point_style = dict(
    get_fill_color=apply_continuous_cmap(
        normalized_price_data, YlOrRd_9, alpha=0.7
    ),
    get_radius=normalized_price_data * 10,
    radius_units="meters",
    radius_min_pixels=2,
)

viz(
    [h3_regions, airbnb_data],
    polygon_kwargs=region_style,
    scatterplot_kwargs=price_point_style,
)

In [None]:
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS

# Load OSM features for the buffered H3 area, filtered by the GEOFABRIK_LAYERS tags.
osm_loader = OSMPbfLoader()
features = osm_loader.load(area=h3_regions, tags=GEOFABRIK_LAYERS)

In [None]:
# Inspect the OSM features returned by the loader.
features

In [None]:
# Match every H3 region with the OSM features it intersects.
joiner = IntersectionJoiner()
joint = joiner.transform(regions=h3_regions, features=features)
joint

In [None]:
# Configure the embedder first, then run it on the regions/features/joint
# tables prepared in the earlier cells.
# NOTE(review): presumably neighbours up to distance 5 contribute context and
# their vectors are concatenated rather than averaged - confirm in srai docs.
embedder = ContextualCountEmbedder(
    neighbourhood=H3Neighbourhood(),
    neighbourhood_distance=5,
    concatenate_vectors=True,
    expected_output_features=GEOFABRIK_LAYERS,
    count_subcategories=False,
)
embeddings = embedder.transform(
    regions_gdf=h3_regions,
    features_gdf=features,
    joint_gdf=joint,
)

In [None]:
# Name of the column the regression models are trained to predict.
target = "price"
# Disabled: extra tabular columns that could be added next to the embeddings.
# They are currently not used by any later cell.
# features_to_add = [
#     "number_of_reviews",
#     "minimum_nights",
#     "availability_365",
#     "calculated_host_listings_count",
#     "number_of_reviews_ltm",
# ]

In [None]:
from h3 import latlng_to_cell

def _h3_cell_for_point(point):
    """Return the resolution-10 H3 cell for a point, passing (y, x) as (lat, lng)."""
    return latlng_to_cell(point.y, point.x, 10)


# Tag every row with the H3 cell of its geometry so rows can be grouped per cell.
airbnb_data["h3"] = airbnb_data["geometry"].apply(_h3_cell_for_point)

airbnb_data

In [None]:
def _mean_price_per_cell(city_name):
    """Build the per-cell table for one city.

    Rows of `airbnb_data` whose `city` equals `city_name` are grouped by their
    `h3` cell, the mean `price` is taken per cell (median would be the
    alternative aggregate), and the result is merged with `h3_regions` and
    `embeddings` on the cell id using pandas' default merge mode.
    """
    city_rows = airbnb_data[airbnb_data["city"] == city_name]
    cell_prices = city_rows.groupby("h3")["price"].mean().reset_index()
    with_regions = cell_prices.merge(h3_regions, left_on="h3", right_index=True)
    return with_regions.merge(embeddings, left_on="h3", right_index=True)


barcelona_data = _mean_price_per_cell("barcelona")
madrid_data = _mean_price_per_cell("madrid")

barcelona_data.shape, madrid_data.shape

In [None]:
# Inspect the per-cell table built for the rows whose city is "barcelona".
barcelona_data

In [None]:
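# NOTE: disabled cell - map of per-cell prices. It relies on `median_prices_per_h3`,
# which is not defined in the preceding cells, so it cannot simply be uncommented.
# normalized_price_data_h3 = (median_prices_per_h3["price"] - min_bound) / (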
#     max_bound - min_bound
# )

# viz(
#     [median_prices_per_h3, airbnb_data],
#     polygon_kwargs=dict(
#         get_fill_color=apply_continuous_cmap(
#             normalized_price_data_h3, YlOrRd_9, alpha=0.7
#         ),
#         opacity=0.4,
#         get_line_width=0,
#     ),
#     scatterplot_kwargs=dict(
#         get_fill_color=apply_continuous_cmap(
#             normalized_price_data, YlOrRd_9, alpha=0.7
#         ),
#         get_radius=normalized_price_data * 10,
#         radius_units="meters",
#         radius_min_pixels=0.5,
#     ),
# )

In [None]:
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()

In [None]:
# Train on Barcelona, evaluate on Madrid, using the embedding columns as features.
feature_columns = [*embeddings.columns]

# Fit the scaler on the training data only and reuse its statistics for the
# test data. Fitting a second scaler on the test set would standardise the two
# cities with different means/variances, so the same raw feature value would
# map to different model inputs in train and test.
scaler = StandardScaler()
x_train = scaler.fit_transform(barcelona_data[feature_columns])
y_train = barcelona_data[target]

x_test = scaler.transform(madrid_data[feature_columns])
y_test = madrid_data[target]

x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Inspect the scaled feature matrix derived from `madrid_data`.
x_test

In [None]:
import xgboost as xgb

# XGBoost regressor with library-default hyperparameters, fitted on the training split.
model = xgb.XGBRegressor()
model.fit(X=x_train, y=y_train)

In [None]:
import seaborn as sns

# Predicted vs. actual target on the training split for the XGBoost model.
# Title and axis labels are set so the figure can be told apart from the other
# prediction plots in this notebook.
train_predictions = model.predict(x_train)
ax = sns.scatterplot(x=y_train, y=train_predictions)
ax.set(
    title="XGBoost: predicted vs. actual (train)",
    xlabel=f"actual {target}",
    ylabel=f"predicted {target}",
);

In [None]:
# Predicted vs. actual target on the test split for the XGBoost model.
# Title and axis labels are set so the figure can be told apart from the other
# prediction plots in this notebook.
test_predictions = model.predict(x_test)
ax = sns.scatterplot(x=y_test, y=test_predictions)
ax.set(
    title="XGBoost: predicted vs. actual (test)",
    xlabel=f"actual {target}",
    ylabel=f"predicted {target}",
);

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline. The forest is built with randomness (bootstrap
# samples), so a fixed seed is required for the fitted model and the plots
# below to be identical across Restart & Run All.
regr = RandomForestRegressor(random_state=42)

regr.fit(x_train, y_train)

In [None]:
# Predicted vs. actual target on the training split for the random forest.
# Title and axis labels are set so the figure can be told apart from the other
# prediction plots in this notebook.
rf_train_predictions = regr.predict(x_train)
ax = sns.scatterplot(x=y_train, y=rf_train_predictions)
ax.set(
    title="Random forest: predicted vs. actual (train)",
    xlabel=f"actual {target}",
    ylabel=f"predicted {target}",
);

In [None]:
# Predicted vs. actual target on the test split for the random forest.
# Title and axis labels are set so the figure can be told apart from the other
# prediction plots in this notebook.
rf_test_predictions = regr.predict(x_test)
ax = sns.scatterplot(x=y_test, y=rf_test_predictions)
ax.set(
    title="Random forest: predicted vs. actual (test)",
    xlabel=f"actual {target}",
    ylabel=f"predicted {target}",
);