### House Sales in King County Dataset

This dataset contains house sale prices for King County, which includes Seattle, covering approximately 21,000 residential property sales recorded between May 2014 and May 2015. It provides geographic coordinates, physical property attributes (such as size, number of rooms, and condition), as well as contextual features like proximity to waterfronts.

In [None]:
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# dataset import
from srai.datasets import HouseSalesInKingCountyDataset

In [None]:
# Create the dataset object; load() is called on it in a later cell.
hskc_dataset = HouseSalesInKingCountyDataset()

In [None]:
# Inspect the types of the split attributes before load() has been called.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

Load the default version of the dataset

In [None]:
# Call load() with its default arguments; the result is used as a mapping
# (its "train" and "test" entries are read in a later cell).
ds = hskc_dataset.load()
ds.keys()

In [None]:
# Same type check as before, now that load() has run.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

In [None]:
# Report the resolution attribute currently set on the dataset object.
print("Aggregation H3 resolution:", hskc_dataset.resolution)

In [None]:
# Report the target attribute; later cells use it as the column to plot.
print("Prediction target:", hskc_dataset.target)

In [None]:
# Pull the two splits out of the loaded mapping under short names.
train_gdf = ds["train"]
test_gdf = ds["test"]

In [None]:
# Preview the head of the train split.
train_gdf.head()

In [None]:
# Number of rows in the test split.
len(test_gdf)

In [None]:
# Two stacked panels: a tall map of the sale locations on top and a short
# panel with the target distribution underneath.
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# (label, frame, colour) for each split, in drawing order.
splits = [
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
]

# Split sizes and their share of all points, shown in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
all_points = train_points + test_points
train_pct = 100 * train_points / all_points
test_pct = 100 * test_points / all_points

# --- Map panel -------------------------------------------------------------
for split_label, split_gdf, split_color in splits:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    f"King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Hand-built legend markers, one per split, in the split colours.
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _label, _gdf, marker_color in splits
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])

cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- Target distribution panel ----------------------------------------------
for split_label, split_gdf, split_color in splits:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )

ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()

plt.show()

Getting the H3 cells with target values

In [None]:
# Display the resolution attribute of the dataset object (the same value an
# earlier cell printed as the aggregation H3 resolution).
hskc_dataset.resolution

In [None]:
# The call returns three values; only the first and the third are kept here.
# NOTE(review): the discarded middle value is presumably a validation split — confirm.
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()

In [None]:
# Preview the head of the train H3 frame.
train_h3.head()

In [None]:
# Preview the head of the test H3 frame.
test_h3.head()

In [None]:
# Draw the train and test H3 cells on one map, coloured by the target column.
# Test cells get an extra hatched overlay so the two splits can be told apart.
# The rc_context narrows the hatch line width for everything drawn inside it.
with plt.rc_context({"hatch.linewidth": 0.4}):
    # No `ax` is passed here, so this call supplies the Axes that every later
    # layer in this cell is drawn on.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )

    # Fix: this used to be `axes[0].set_axis_off()`, which addressed the Axes
    # array left over from the earlier point-map cell, so the axis of this map
    # was never hidden.
    ax.set_axis_off()

    # Test cells, coloured with the same colormap as the train cells.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)

    # Transparent fill plus a hatch pattern marks the test cells.
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Hand-built legend: plain patch for train, hatched patch for test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)

    # Fix: this used to be `fig.tight_layout()`, where `fig` was the figure of
    # the earlier point-map cell. Use the figure that owns `ax` instead, so the
    # cell no longer depends on state left behind by another cell.
    ax.figure.tight_layout()

    plt.show()

Load the raw version of the dataset

In [None]:
# Reload with version="all"; this rebinds `ds`, replacing the mapping loaded
# by the earlier default load() call.
ds = hskc_dataset.load(version="all")
ds.keys()

In [None]:
# Same type check, now after loading with version="all".
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

In [None]:
# Preview the head of the "train" entry of the version="all" load.
ds["train"].head()

Creating your own train-test split: bucket regression (this works similarly for spatial regression)

In [None]:
# Settings for the custom split, gathered in one place so they are easy to tweak.
# NOTE(review): random_state presumably seeds the split for reproducibility — confirm.
split_options = dict(
    target_column="price",
    test_size=0.2,
    resolution=8,
    n_bins=10,
    random_state=42,
)
train, test = hskc_dataset.train_test_split(**split_options)

In [None]:
# Keep the dataset's resolution attribute under a local name. The cell ends
# with an assignment, so it displays nothing.
resolution = hskc_dataset.resolution

In [None]:
# Same type check, now after the custom train_test_split call.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

In [None]:
# Preview the head of the custom train split.
train.head()

In [None]:
# Preview the head of the custom test split.
test.head()