In [1]:
import os

import geopandas as gpd
import duckdb
from dotenv import load_dotenv
from shapely import from_wkb

In [23]:
load_dotenv()
connection_string = os.getenv("BLOB_STORAGE_CONNECTION_STRING")

con = duckdb.connect(":memory:")
con.execute("INSTALL spatial; LOAD spatial;")
con.execute("INSTALL azure; LOAD azure;")
con.execute("""
CREATE SECRET secret (
    TYPE azure,
    PROVIDER config,
    ACCOUNT_NAME 'doppablobstorage'
);
""")
con.execute("SET azure_transport_option_type = curl")

<_duckdb.DuckDBPyConnection at 0x7f40dcd6c8f0>

In [28]:
release_1 = "az://raw/release/2025-10-27.19/dataset=osm/theme=buildings/region=*/*.parquet"
release_2 = "az://raw/release/2025-11-12.0/dataset=osm/theme=buildings/region=*/*.parquet"

In [29]:
count_release_1 = con.execute(f"SELECT count(*) AS count FROM '{release_1}'").fetchone()[0]
count_release_2 = con.execute(f"SELECT count(*) AS count FROM '{release_2}'").fetchone()[0]
print("Number of rows release 1:", count_release_1)
print("Number of rows release 2:", count_release_2)

Number of rows release 1: 4163561
Number of rows release 2: 4163473


In [5]:
query = f"""
WITH old AS (
    SELECT id, md5(ST_AsWKB(ST_Normalize(geometry))) AS geom_hash
    FROM read_parquet('{release_1}')
),
new AS (
    SELECT id, md5(ST_AsWKB(ST_Normalize(geometry))) AS geom_hash
    FROM read_parquet('{release_2}')
)
SELECT
    COALESCE(n.id, o.id) AS id,
    n.id AS new_id,
    o.id AS old_id
FROM new n
FULL OUTER JOIN old o ON n.id = o.id
WHERE n.geom_hash IS DISTINCT FROM o.geom_hash;
"""

df = con.execute(query).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
query = f"""
WITH old AS (
    SELECT id, ST_Normalize(geometry) AS geom
    FROM read_parquet('{release_1}')
),
new AS (
    SELECT id, ST_Normalize(geometry) AS geom
    FROM read_parquet('{release_2}')
)
SELECT
    COALESCE(n.id, o.id) AS id,
    n.id AS new_id,
    o.id AS old_id
FROM new n
FULL OUTER JOIN old o ON n.id = o.id
WHERE NOT ST_Equals(n.geom, o.geom);
"""

df = con.execute(query).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
df.head()

Unnamed: 0,id,new_id,old_id
0,104875574,104875574,104875574
1,179263722,179263722,179263722
2,251003450,251003450,251003450
3,301843694,301843694,301843694
4,301905936,301905936,301905936


In [8]:
df.shape

(94, 3)

In [9]:
changed_ids = df["id"].to_numpy()

In [10]:
query = f"""
SELECT
   id, ST_AsWKB(geometry) AS geometry
FROM '{release_1}'
WHERE id IN ({",".join(map(str, changed_ids))})
ORDER BY id;
"""
release_1_diff = con.execute(query).fetchdf()
release_1_diff.head()

Unnamed: 0,id,geometry
0,20711907,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
1,20712057,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 4, ..."
2,39748960,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
3,52715334,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
4,58567004,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."


In [11]:
query = f"""
SELECT
    id, ST_AsWKB(geometry) AS geometry
FROM '{release_2}'
WHERE id IN ({",".join(map(str, changed_ids))})
ORDER BY id;
"""
release_2_diff = con.execute(query).fetchdf()
release_2_diff.head()

Unnamed: 0,id,geometry
0,20711907,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
1,20712057,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 4, ..."
2,39748960,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
3,52715334,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
4,58567004,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."


In [12]:
release_1_diff["geometry"] = release_1_diff["geometry"].apply(
    lambda g: bytes(g) if isinstance(g, (memoryview, bytearray)) else g)
release_2_diff["geometry"] = release_2_diff["geometry"].apply(
    lambda g: bytes(g) if isinstance(g, (memoryview, bytearray)) else g)

In [13]:
release_1_diff.head()

Unnamed: 0,id,geometry
0,20711907,"b""\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03..."
1,20712057,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
2,39748960,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
3,52715334,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
4,58567004,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...


In [14]:
release_2_diff.head()

Unnamed: 0,id,geometry
0,20711907,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
1,20712057,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
2,39748960,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
3,52715334,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...
4,58567004,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...


In [15]:
release_1_diff["geometry"] = release_1_diff["geometry"].apply(from_wkb)
release_2_diff["geometry"] = release_2_diff["geometry"].apply(from_wkb)

In [16]:
release_1_gdf = gpd.GeoDataFrame(release_1_diff, geometry=release_1_diff["geometry"], crs="EPSG:4326")
release_2_gdf = gpd.GeoDataFrame(release_2_diff, geometry=release_2_diff["geometry"], crs="EPSG:4326")

In [17]:
release_1_gdf.head()

Unnamed: 0,id,geometry
0,20711907,"MULTIPOLYGON (((10.27004 59.56295, 10.2701 59...."
1,20712057,"MULTIPOLYGON (((9.59243 59.19178, 9.59252 59.1..."
2,39748960,"MULTIPOLYGON (((10.69602 59.45891, 10.69622 59..."
3,52715334,"MULTIPOLYGON (((9.6007 59.19671, 9.60074 59.19..."
4,58567004,"MULTIPOLYGON (((10.68892 59.4641, 10.68899 59...."


In [18]:
release_2_gdf.head()

Unnamed: 0,id,geometry
0,20711907,"MULTIPOLYGON (((10.26999 59.56277, 10.27009 59..."
1,20712057,"MULTIPOLYGON (((9.59243 59.19178, 9.59252 59.1..."
2,39748960,"MULTIPOLYGON (((10.69602 59.45891, 10.69622 59..."
3,52715334,"MULTIPOLYGON (((9.6007 59.19671, 9.60074 59.19..."
4,58567004,"MULTIPOLYGON (((10.6889 59.4641, 10.68914 59.4..."


In [19]:
# release_1_gdf.to_parquet("release_1.parquet", schema_version="1.1.0")
# release_2_gdf.to_parquet("release_2.parquet", schema_version="1.1.0")