In [1]:
import geopandas as gpd
import duckdb
from shapely import from_wkb

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# Open an in-memory DuckDB database and load the two extensions this notebook
# relies on: `spatial` (ST_* functions) and `azure` (az:// paths).
con = duckdb.connect(":memory:")
for extension_setup_sql in (
    "INSTALL spatial; LOAD spatial;",
    "INSTALL azure; LOAD azure;",
):
    con.execute(extension_setup_sql)

# Azure secret for public access: only the storage account name is configured,
# no credentials are supplied.
create_secret_sql = """
CREATE SECRET secret (
    TYPE azure,
    PROVIDER config,
    ACCOUNT_NAME 'doppablobstorage'
);
"""
con.execute(create_secret_sql)

# Switch the Azure extension's transport option to curl.
con.execute("SET azure_transport_option_type = curl")

In [None]:
# Both releases use the same path layout and differ only in the release tag,
# so the glob is built from one template.
_BUILDINGS_GLOB_TEMPLATE = "az://raw/release/{release}/dataset=osm/theme=buildings/region=*/*.parquet"
release_1 = _BUILDINGS_GLOB_TEMPLATE.format(release="2025-10-27.19")
release_2 = _BUILDINGS_GLOB_TEMPLATE.format(release="2025-11-12.0")

In [None]:
# Row count of each release, printed as a quick sanity check before diffing.
count_sql_template = "SELECT count(*) AS count FROM '{path}'"
(count_release_1,) = con.execute(count_sql_template.format(path=release_1)).fetchone()
(count_release_2,) = con.execute(count_sql_template.format(path=release_2)).fetchone()
print("Number of rows release 1:", count_release_1)
print("Number of rows release 2:", count_release_2)

In [None]:
# Compare geometries directly (not hashes).
#
# Returns one row per id whose geometry differs between the two releases,
# including ids that exist in only one of them (new_id or old_id is then NULL).
#
# The FULL OUTER JOIN produces NULLs on the missing side for added/removed ids.
# A bare `NOT ST_Equals(...)` evaluates to NULL for those rows and WHERE would
# discard them, so the predicate checks the one-sided cases explicitly before
# falling back to the geometric comparison. Rows where both geometries are NULL
# are treated as unchanged, matching the `IS DISTINCT FROM` semantics of the
# hash-based query.
query_geo = f"""
WITH old AS (
    SELECT id, ST_Normalize(geometry) AS geom
    FROM read_parquet('{release_1}')
),
new AS (
    SELECT id, ST_Normalize(geometry) AS geom
    FROM read_parquet('{release_2}')
)
SELECT
    COALESCE(n.id, o.id) AS id,
    n.id AS new_id,
    o.id AS old_id
FROM new n
FULL OUTER JOIN old o ON n.id = o.id
WHERE n.id IS NULL
   OR o.id IS NULL
   OR (n.geom IS NULL) <> (o.geom IS NULL)
   OR NOT ST_Equals(n.geom, o.geom);
"""

# `.df()` converts the DuckDB result to a pandas DataFrame
# (the previous `.df_geo()` is not a DuckDB method and raised AttributeError).
df_geo = con.execute(query_geo).df()

In [None]:
# Compare geometries by canonical hash (faster, less precise).
#
# Each geometry is normalized, serialized to WKB and reduced to an md5 digest;
# ids are reported when the digests differ. `IS DISTINCT FROM` is NULL-safe, so
# ids present in only one release (NULL hash on the other side of the
# FULL OUTER JOIN) are reported too.
query_hash = f"""
WITH old AS (
    SELECT id, md5(ST_AsWKB(ST_Normalize(geometry))) AS geom_hash
    FROM read_parquet('{release_1}')
),
new AS (
    SELECT id, md5(ST_AsWKB(ST_Normalize(geometry))) AS geom_hash
    FROM read_parquet('{release_2}')
)
SELECT
    COALESCE(n.id, o.id) AS id,
    n.id AS new_id,
    o.id AS old_id
FROM new n
FULL OUTER JOIN old o ON n.id = o.id
WHERE n.geom_hash IS DISTINCT FROM o.geom_hash;
"""
# `.df()` converts the DuckDB result to a pandas DataFrame
# (the previous `.df_geo()` is not a DuckDB method and raised AttributeError).
df_hash = con.execute(query_hash).df()

In [None]:
# Show (rows, columns) of the direct comparison and of the hash comparison,
# one per line, so the two approaches can be compared at a glance.
print(df_geo.shape, df_hash.shape, sep="\n")

In [None]:
# Ids flagged by the direct geometry comparison, as a NumPy array.
changed_ids = df_geo.loc[:, "id"].to_numpy()

In [None]:
# Fetch changed geometries for release 1 (as WKB), ordered by id.
#
# The ids are rendered as SQL literals: string ids are single-quoted with any
# embedded quote doubled; every other value is rendered with str(), as before.
changed_id_literals = [
    "'" + changed_id.replace("'", "''") + "'" if isinstance(changed_id, str) else str(changed_id)
    for changed_id in changed_ids
]
# An empty `IN ()` list is a SQL syntax error, so fall back to a predicate that
# matches no rows when nothing changed.
id_filter = f"id IN ({','.join(changed_id_literals)})" if changed_id_literals else "FALSE"
# NOTE(review): a very large id list makes for a very large SQL string; joining
# against a registered table of ids may scale better — confirm expected sizes.
query = f"""
SELECT id, ST_AsWKB(geometry) AS geometry
FROM '{release_1}'
WHERE {id_filter}
ORDER BY id;
"""
release_1_diff = con.execute(query).fetchdf()

In [None]:
# Fetch changed geometries for release 2 (as WKB), ordered by id.
#
# The ids are rendered as SQL literals: string ids are single-quoted with any
# embedded quote doubled; every other value is rendered with str(), as before.
changed_id_literals = [
    "'" + changed_id.replace("'", "''") + "'" if isinstance(changed_id, str) else str(changed_id)
    for changed_id in changed_ids
]
# An empty `IN ()` list is a SQL syntax error, so fall back to a predicate that
# matches no rows when nothing changed.
id_filter = f"id IN ({','.join(changed_id_literals)})" if changed_id_literals else "FALSE"
# NOTE(review): a very large id list makes for a very large SQL string; joining
# against a registered table of ids may scale better — confirm expected sizes.
query = f"""
SELECT id, ST_AsWKB(geometry) AS geometry
FROM '{release_2}'
WHERE {id_filter}
ORDER BY id;
"""
release_2_diff = con.execute(query).fetchdf()

In [None]:
# Ensure geometry columns hold plain `bytes`: memoryview/bytearray values are
# copied into `bytes`, anything else is passed through untouched.
def _as_bytes(blob):
    """Return `blob` as `bytes` when it is a memoryview or bytearray, else unchanged."""
    if isinstance(blob, (memoryview, bytearray)):
        return bytes(blob)
    return blob


for wkb_frame in (release_1_diff, release_2_diff):
    wkb_frame['geometry'] = wkb_frame['geometry'].apply(_as_bytes)

In [None]:
# Decode the WKB payloads into shapely geometries, overwriting the `geometry`
# column of each diff frame in place.
# NOTE: this cell is not re-runnable on its own, because a second run would
# feed already-decoded geometries to `from_wkb`; re-run the fetch cells first.
for diff_frame in (release_1_diff, release_2_diff):
    diff_frame['geometry'] = diff_frame['geometry'].apply(from_wkb)

In [None]:
# Wrap both diff frames as GeoDataFrames. They share the same settings: the
# active geometry column is `geometry` and the CRS is declared as EPSG:4326.
geodataframe_options = dict(geometry='geometry', crs='EPSG:4326')
release_1_gdf = gpd.GeoDataFrame(release_1_diff, **geodataframe_options)
release_2_gdf = gpd.GeoDataFrame(release_2_diff, **geodataframe_options)

In [None]:
# Optionally save to Parquet
# release_1_gdf.to_parquet('release_1.parquet', schema_version='1.1.0')
# release_2_gdf.to_parquet('release_2.parquet', schema_version='1.1.0')