### Airbnb Dataset

This dataset consists of approximately 3.1 million Airbnb listings collected between June 2022 and May 2023 across 80 cities worldwide. It includes geographic location, property characteristics, host activity, and review metrics. For the benchmark, a cleaned subset from six cities—Paris, Rome, London, Amsterdam, Melbourne, and New York City—was selected.

In [None]:
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# dataset import
from srai.datasets import AirbnbMulticityDataset

In [None]:
# Instantiate the dataset object; load() is called on it in a later cell.
airbnb_multicity = AirbnbMulticityDataset()

In [None]:
# Inspect the split attributes before load() has been called
# (presumably not populated yet at this point — confirm).
type(airbnb_multicity.train_gdf), type(airbnb_multicity.test_gdf)

Loading default version

In [None]:
# Load with default arguments and show which keys the returned mapping exposes.
ds = airbnb_multicity.load()
ds.keys()

In [None]:
# Same check as before, repeated now that load() has run, to compare the attribute types.
type(airbnb_multicity.train_gdf), type(airbnb_multicity.test_gdf)

In [None]:
# Resolution attribute currently set on the dataset object.
print("Aggregation H3 resolution:", airbnb_multicity.resolution)

In [None]:
# Target attribute of the dataset; it is used as a column name in the plotting cells below.
print("Prediction target:", airbnb_multicity.target)

In [None]:
# Pull the two splits out of the loaded mapping under short names.
gdf_train = ds["train"]
gdf_test = ds["test"]

In [None]:
# Distinct values of the "city" column in the training split, alphabetically ordered.
city_names = sorted(gdf_train["city"].unique())
print("Available cities:", city_names)

In [None]:
# Preview the first rows of the training split.
gdf_train.head()

In [None]:
# One figure row per city: a point map of both splits on the left (three times
# wider) and the target distribution of both splits on the right.
cities = [("Amsterdam", 0.05), ("London", 0.01)]  # (display name, marker size)
split_colors = {"train": "orange", "test": "royalblue"}

fig, axes = plt.subplots(
    nrows=2,
    ncols=2,
    sharex=False,
    sharey=False,
    figsize=(12, 15),
    width_ratios=[3, 1],
)

for (city_name, marker_size), (ax_map, ax_dist) in zip(cities, axes):
    # The "city" column is compared against the lower-cased display name.
    city_key = city_name.lower()
    splits = {
        "train": gdf_train[gdf_train["city"] == city_key],
        "test": gdf_test[gdf_test["city"] == city_key],
    }

    # Split sizes and their share of the city's points, shown in the map title.
    train_points = len(splits["train"])
    test_points = len(splits["test"])
    total_points = train_points + test_points
    train_pct = 100 * train_points / total_points
    test_pct = 100 * test_points / total_points

    # Left panel: train points first, test points drawn on top.
    for split_name, split_gdf in splits.items():
        split_gdf.plot(
            color=split_colors[split_name],
            markersize=marker_size,
            ax=ax_map,
            label=split_name,
        )
    ax_map.set_title(
        f"{city_name} data - points on a map"
        f" (Train: {train_points} ({train_pct:.2f}%),"
        f" Test: {test_points} ({test_pct:.2f}%))"
    )
    # Explicit proxy handles are passed so the legend does not depend on the
    # plotted collections.
    ax_map.legend(
        handles=[
            Line2D([], [], marker="o", color=split_color, linestyle="None")
            for split_color in split_colors.values()
        ],
        labels=["Train", "Test"],
    )
    cx.add_basemap(
        ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=12
    )
    ax_map.set_axis_off()

    # Right panel: kernel density estimate of the target for each split.
    for split_name, split_gdf in splits.items():
        sns.kdeplot(
            x=split_gdf[airbnb_multicity.target],
            label=split_name,
            color=split_colors[split_name],
            ax=ax_dist,
            fill=False,
            cut=0,
        )
    ax_dist.set_title(f"{city_name} data - target distribution")
    ax_dist.legend()

plt.tight_layout()
plt.show()

Getting target values aggregated to H3 hexagons

In [None]:
# get_h3_with_labels() returns three values; only the first and last are kept
# (the discarded middle one is presumably a validation split — confirm).
train_h3, _, test_h3 = airbnb_multicity.get_h3_with_labels()

In [None]:
# First rows of the training frame returned by get_h3_with_labels().
train_h3.head()

In [None]:
# First rows of the test frame returned by get_h3_with_labels().
test_h3.head()

In [None]:
# Coordinate window (x range, y range) used to cut the London area out of the
# H3 frames; the basemap below is requested with crs=4326.
LONDON_X_RANGE = slice(-1.04, 0.65)
LONDON_Y_RANGE = slice(51.09, 51.84)

aggregated_train_data = train_h3.cx[LONDON_X_RANGE, LONDON_Y_RANGE]
aggregated_test_data = test_h3.cx[LONDON_X_RANGE, LONDON_Y_RANGE]

target_column = airbnb_multicity.target

# Both layers must share one colour scale: without explicit limits each
# .plot() call normalises the colormap to its own data range, so the colorbar
# (built from the train layer) would not describe the test hexagons.
color_min = min(
    aggregated_train_data[target_column].min(),
    aggregated_test_data[target_column].min(),
)
color_max = max(
    aggregated_train_data[target_column].max(),
    aggregated_test_data[target_column].max(),
)

with plt.rc_context({"hatch.linewidth": 0.4}):
    # Train hexagons, coloured by target value; this call also creates the colorbar.
    ax = aggregated_train_data.plot(
        target_column,
        cmap="spring_r",
        vmin=color_min,
        vmax=color_max,
        legend=True,
        legend_kwds=dict(location="right", shrink=0.9, pad=0.02, label=target_column),
        figsize=(15, 9),
        alpha=0.5,
    )

    # Test hexagons on the same colour scale as the train layer.
    aggregated_test_data.plot(
        target_column,
        cmap="spring_r",
        vmin=color_min,
        vmax=color_max,
        alpha=0.5,
        ax=ax,
    )

    # Transparent overlay whose only purpose is to hatch the test hexagons.
    aggregated_test_data.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("London data aggregated to H3 cells")
    # Proxy patches: plain outline for train, hatched outline for test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
    # Hide the axes once, after the basemap has been added.
    ax.set_axis_off()

    plt.show()

Loading raw, full data

In [None]:
# Reload with version="all" (rebinding `ds`) and show the keys of the returned mapping.
ds = airbnb_multicity.load(version="all")
ds.keys()

In [None]:
# Re-check the split attribute types after the version="all" load.
type(airbnb_multicity.train_gdf), type(airbnb_multicity.test_gdf)

In [None]:
# First rows of the "train" entry returned by the version="all" load.
ds["train"].head()

Create your own train-test split using spatial splitting with bucket stratification

In [None]:
# Custom split on the "price" column: test_size=0.2, resolution=8, n_bins=10,
# with a fixed random_state so the split can be reproduced.
# NOTE(review): resolution is presumably the H3 resolution used for the spatial
# split and n_bins the number of stratification buckets — confirm against the srai docs.
train, test = airbnb_multicity.train_test_split(
    target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)

In [None]:
# Re-check the split attribute types after the custom train_test_split() call.
type(airbnb_multicity.train_gdf), type(airbnb_multicity.test_gdf)

In [None]:
# Resolution attribute after the custom split, which was called with resolution=8.
airbnb_multicity.resolution

In [None]:
# First rows of the training part returned by train_test_split().
train.head()

In [None]:
# First rows of the test part returned by train_test_split().
test.head()