In [None]:
%pip install srai[overturemaps] openpyxl contextily seaborn

# Prepare London population dataset

We will download the 2021 geometries of London's MSOA (Middle layer Super Output Area) regions and combine them with the mid-2022 population estimates.

Based on this data, we will calculate the population density of each region and try to predict it.

In [None]:
import zipfile
from pathlib import Path

import contextily as cx
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pooch import retrieve
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from srai.embedders import CountEmbedder
from srai.loaders import OvertureMapsLoader

In [None]:
# Zip archive with the MSOA 2021 boundary shapefiles (data.london.gov.uk).
msoa_url = "https://data.london.gov.uk/download/statistical-gis-boundary-files-london/f6d9340a-2ccb-46ad-846b-c9122b4b5d1f/LB_MSOA2021_shp.zip"

# Fetch the archive into the working directory, stored under the file name
# taken from the end of the URL. `known_hash=None`: no checksum is supplied.
destination_file = retrieve(
    msoa_url,
    known_hash=None,
    fname=Path(msoa_url).name,
    path=".",
)

In [None]:
zip_path = destination_file

# Walk the archive listing and build one "zip:<archive>!<member>" path for
# every member whose name ends in ".shp".
zip_paths = []
with zipfile.ZipFile(zip_path, mode="r") as archive:
    for member in archive.namelist():
        if member.endswith(".shp"):
            zip_paths.append(f"zip:{zip_path}!{member}")

zip_paths

In [None]:
# Load the MSOA boundaries from every shapefile found in the downloaded archive
# and stack them into a single GeoDataFrame. Only the MSOA code, the MSOA name
# and the geometry are read from each file.
msoa_gdf = pd.concat(
    [gpd.read_file(p, columns=["msoa21cd", "msoa21nm", "geometry"]) for p in zip_paths],
    # Every file is read with its own row numbering; renumber the combined
    # frame so that row labels stay unique.
    ignore_index=True,
)
msoa_gdf

In [None]:
# ONS workbook with the MSOA mid-year population estimates (mid-2021 and mid-2022).
stats = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/middlesuperoutputareamidyearpopulationestimatesnationalstatistics/mid2021andmid2022/sapemsoaquinaryagetablefinal.xlsx"

# Fetch the workbook into the working directory under a fixed local name.
destination_file = retrieve(
    stats,
    known_hash=None,
    fname="msoa_density.xlsx",
    path=".",
)

# Read the mid-2022 sheet, skipping the three rows above the header row, and
# keep only the MSOA code and the total population.
population_columns = ["MSOA 2021 Code", "Total"]
x = pd.read_excel(
    destination_file,
    sheet_name="Mid-2022 MSOA 2021",
    skiprows=3,
)[population_columns]
x

In [None]:
# Attach the population totals to the boundaries by matching MSOA codes.
merged = msoa_gdf.merge(x, left_on="msoa21cd", right_on="MSOA 2021 Code")

# The area is measured while the geometries are still in the CRS they were
# loaded with, i.e. before the reprojection to EPSG:4326 below.
# NOTE(review): assumes that source CRS is a projected one, so the density is
# people per squared CRS unit - confirm against the shapefile.
merged["area"] = merged.area
merged["population_density"] = merged["Total"] / merged["area"]

# Reproject to EPSG:4326 and use the MSOA code as the index.
msoa_stats_gdf = merged.to_crs(epsg=4326).set_index("msoa21cd")
msoa_stats_gdf

In [None]:
f, ax = plt.subplots(figsize=(12, 8))

# Choropleth of the observed density, drawn slightly transparent so that the
# label-free basemap added underneath remains visible.
msoa_stats_gdf.plot(column="population_density", alpha=0.8, legend=True, ax=ax)
cx.add_basemap(
    ax,
    source=cx.providers.CartoDB.PositronNoLabels,
    crs=msoa_stats_gdf.crs,
)
ax.set_axis_off()

plt.show()

# Generate embeddings

In [None]:
# Load Overture Maps features for the area covered by the MSOA regions.
# NOTE(review): `hierarchy_depth=1` presumably limits the feature categories to
# their top level - confirm against the srai documentation.
overture_loader = OvertureMapsLoader(hierarchy_depth=1)
london_features = overture_loader.load(msoa_stats_gdf)
london_features

In [None]:
# Find every (region, feature) pair whose geometries intersect, using the
# spatial index of the features. The query result is unpacked into two
# parallel arrays of integer positions: the first is used to index the MSOA
# regions, the second to index `london_features`.
region_idx, features_idx = london_features.sindex.query(
    msoa_stats_gdf.geometry, predicate="intersects"
)

# Translate the positions into index labels and store the pairs as a frame
# that has no columns, only a (region, feature) MultiIndex.
region_labels = msoa_stats_gdf.index[region_idx]
feature_labels = london_features.index[features_idx]
joint_index = pd.MultiIndex.from_arrays(
    (region_labels, feature_labels),
    names=(msoa_stats_gdf.index.name, london_features.index.name),
)
features_per_msoa = pd.DataFrame(index=joint_index)
features_per_msoa

In [None]:
# Build one embedding row per MSOA from the regions, the features and the
# region/feature pairing computed above.
count_embedder = CountEmbedder(count_subcategories=False)
embeddings = count_embedder.transform(
    msoa_stats_gdf,
    london_features,
    features_per_msoa,
)
embeddings

In [None]:
# Target values: the observed density, looked up in the order of the embedding rows.
density_target = msoa_stats_gdf.loc[embeddings.index, "population_density"]

# Hold out 33% of the regions for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, density_target, test_size=0.33, random_state=42
)

In [None]:
# Fit a random forest on the training regions.
# The forest is seeded so that the fitted trees - and with them the score, the
# feature importances and the prediction maps - are the same on every run,
# in line with the fixed seed already used for the train/test split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
model

In [None]:
# Score the model on the held-out regions with the R² metric.
y_true = y_test
y_pred = model.predict(X_test)

r2_score(y_true=y_true, y_pred=y_pred)

In [None]:
f, ax = plt.subplots(figsize=(8, 8))

# Scatter of predicted against true density with a dashed regression line.
sns.regplot(
    x=y_true,
    y=y_pred,
    ax=ax,
    scatter_kws={"alpha": 0.5, "s": 10},
    line_kws={"color": ".2", "linestyle": "--"},
)

# Red y = x reference line spanning the range of the true values.
min_density = y_true.min()
max_density = y_true.max()
diagonal_ends = [min_density, max_density]
sns.lineplot(x=diagonal_ends, y=diagonal_ends, color="red", ax=ax)

ax.set_xlabel("True population density")
ax.set_ylabel("Predicted population density")

plt.show()

In [None]:
# Pair every embedding column with its importance in the fitted forest and
# keep the 20 most important ones.
top_importances = pd.DataFrame(
    {
        "feature_importance": model.feature_importances_,
        "feature_names": embeddings.columns,
    }
).nlargest(20, "feature_importance")

# Horizontal bars: one per feature, bar length = importance.
ax = sns.barplot(
    data=top_importances,
    x="feature_importance",
    y="feature_names",
)
ax.set_title("Feature importances")
ax.set_xlabel("Feature importance")
ax.set_ylabel("Features")
plt.show()

In [None]:
# Predict the density for every MSOA, with the embedding rows taken in the
# order of the regions frame.
# NOTE(review): this covers the regions the model was trained on as well as
# the held-out ones, so the values are not purely out-of-sample.
all_region_embeddings = embeddings.loc[msoa_stats_gdf.index]
msoa_stats_gdf["predicted_population_density"] = model.predict(all_region_embeddings)

In [None]:
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 16), sharex=True, sharey=True)

# Top panel: observed density; bottom panel: the model's prediction.
panels = (
    (ax1, "population_density", "Population density"),
    (ax2, "predicted_population_density", "Predicted population density"),
)
for panel_ax, column, title in panels:
    msoa_stats_gdf.plot(column, ax=panel_ax, legend=True)
    panel_ax.set_title(title)

plt.show()

In [None]:
# Signed prediction error: positive where the model predicts less than the
# observed density, negative where it predicts more.
error = (
    msoa_stats_gdf["population_density"]
    - msoa_stats_gdf["predicted_population_density"]
)
msoa_stats_gdf["error"] = error

# Opacity for the error map: each error is divided by the extreme error of its
# own sign, so the largest positive and the largest negative error both map
# to 1 and errors close to zero map to values close to 0.
# The two extremes are computed once and the division is done on the whole
# column, instead of re-scanning the column for every row.
max_error = error.max()
min_error = error.min()
msoa_stats_gdf["alpha"] = (error / max_error).where(error >= 0, error / min_error)

In [None]:
f, ax = plt.subplots(figsize=(12, 8))

# Thin black outlines of every region, drawn before the coloured fill.
msoa_stats_gdf.boundary.plot(ax=ax, color="black", lw=0.1, alpha=0.8)

# Fill coloured by the signed error; the per-region opacity comes from the
# "alpha" column.
msoa_stats_gdf.plot(
    column="error",
    cmap="bwr_r",
    alpha=msoa_stats_gdf["alpha"],
    legend=True,
    ax=ax,
)

cx.add_basemap(
    ax,
    source=cx.providers.CartoDB.PositronNoLabels,
    crs=msoa_stats_gdf.crs,
    zoom=12,
)
ax.set_axis_off()

plt.show()