In [1]:
from shapely import from_wkb
import geopandas as gpd

from src.infra.persistence.context import create_duckdb_context

In [2]:
db = create_duckdb_context()

osm_release_1 = "az://raw/release/2025-10-27.19/dataset=osm/theme=buildings/region=*/*.parquet"
osm_release_2 = "az://raw/release/2025-11-12.0/dataset=osm/theme=buildings/region=*/*.parquet"
fkb_release_1 = "az://raw/release/2025-11-12.0/dataset=fkb/theme=buildings/region=*/*.parquet"

### Count rows in each dataset

In [3]:
db.sql(
    f"""
    CREATE OR REPLACE TABLE osm_release_count_1 AS (
        SELECT 'osm_release_1' AS name, COUNT(*) AS count FROM read_parquet('{osm_release_1}')
    );

    CREATE OR REPLACE TABLE osm_release_count_2 AS (
        SELECT 'osm_release_2' AS name, COUNT(*) AS count FROM read_parquet('{osm_release_2}')
    );

    CREATE OR REPLACE TABLE fkb_release_count_1 AS (
        SELECT 'fkb_release_1' AS name, COUNT(*) AS count FROM read_parquet('{fkb_release_1}')
    );

    SELECT * FROM osm_release_count_1
    UNION
    SELECT * FROM osm_release_count_2
    UNION
    SELECT * FROM fkb_release_count_1
    ORDER BY name;
    """
).show()

┌───────────────┬─────────┐
│     name      │  count  │
│    varchar    │  int64  │
├───────────────┼─────────┤
│ fkb_release_1 │   10882 │
│ osm_release_1 │ 4163561 │
│ osm_release_2 │ 4163473 │
└───────────────┴─────────┘



### Describe columns and view columns

In [4]:
db.sql(f"DESCRIBE SELECT * FROM '{osm_release_2}'").show()

┌────────────────┬────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
│  column_name   │                        column_type                         │  null   │   key   │ default │  extra  │
│    varchar     │                          varchar                           │ varchar │ varchar │ varchar │ varchar │
├────────────────┼────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ type           │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ ref:bygningsnr │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ id             │ BIGINT                                                     │ YES     │ NULL    │ NULL    │ NULL    │
│ geometry       │ GEOMETRY                                                   │ YES     │ NULL    │ NULL    │ NULL    │
│ partition_key  │ VARCHAR              

In [5]:
db.sql(
    f'''
    SELECT 'Non-null' as name, SUM(CASE WHEN "ref:bygningsnr" IS NOT NULL THEN 1 ELSE 0 END) as count FROM '{osm_release_2}'
    UNION ALL
    SELECT 'Null' as name, SUM(CASE WHEN "ref:bygningsnr" IS NULL THEN 1 ELSE 0 END) as count FROM '{osm_release_2}'
    '''
).show()

┌──────────┬─────────┐
│   name   │  count  │
│ varchar  │ int128  │
├──────────┼─────────┤
│ Non-null │ 3980028 │
│ Null     │  183445 │
└──────────┴─────────┘



In [6]:
osm_preview = db.execute(f"SELECT * FROM '{osm_release_2}' LIMIT 10").fetchdf()
osm_preview

Unnamed: 0,type,ref:bygningsnr,id,geometry,partition_key,bbox,dataset,region,theme
0,unspecified,81410631.0,8737738,"[5, 4, 0, 0, 0, 0, 0, 0, 116, 188, 43, 65, 4, ...",u4x,"{'xmin': 10.7335099, 'ymin': 59.9140781, 'xmax...",osm,3,buildings
1,commercial,80753756.0,8781748,"[5, 4, 0, 0, 0, 0, 0, 0, 240, 14, 44, 65, 48, ...",u4x,"{'xmin': 10.7536473, 'ymin': 59.9113187, 'xmax...",osm,3,buildings
2,train_station,,8781750,"[5, 4, 0, 0, 0, 0, 0, 0, 152, 4, 44, 65, 202, ...",u4x,"{'xmin': 10.7511223, 'ymin': 59.9099536, 'xmax...",osm,3,buildings
3,unspecified,81433453.0,8789076,"[5, 4, 0, 0, 0, 0, 0, 0, 228, 244, 43, 65, 90,...",u4x,"{'xmin': 10.7472885, 'ymin': 59.9085476, 'xmax...",osm,3,buildings
4,office,81066884.0,8969088,"[5, 4, 0, 0, 0, 0, 0, 0, 233, 180, 46, 65, 36,...",u4x,"{'xmin': 10.919168, 'ymin': 59.940569, 'xmax':...",osm,3,buildings
5,transportation,81042020.0,9515420,"[5, 4, 0, 0, 0, 0, 0, 0, 218, 19, 44, 65, 225,...",u4x,"{'xmin': 10.7548475, 'ymin': 59.9110183, 'xmax...",osm,3,buildings
6,university,81410208.0,10008720,"[5, 4, 0, 0, 0, 0, 0, 0, 86, 188, 43, 65, 60, ...",u4x,"{'xmin': 10.7334811, 'ymin': 59.9152681, 'xmax...",osm,3,buildings
7,university,80487835.0,10008732,"[5, 4, 0, 0, 0, 0, 0, 0, 56, 193, 43, 65, 45, ...",u4x,"{'xmin': 10.7346732, 'ymin': 59.9152116, 'xmax...",osm,3,buildings
8,church,80259069.0,10021678,"[5, 4, 0, 0, 0, 0, 0, 0, 165, 139, 46, 65, 220...",u4x,"{'xmin': 10.9090938, 'ymin': 59.9442016, 'xmax...",osm,3,buildings
9,university,81166838.0,10023894,"[5, 4, 0, 0, 0, 0, 0, 0, 10, 127, 43, 65, 239,...",u4x,"{'xmin': 10.7185158, 'ymin': 59.938414, 'xmax'...",osm,3,buildings


In [7]:
db.sql(f"DESCRIBE SELECT * FROM '{fkb_release_1}'").show()

┌──────────────────┬────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name    │                        column_type                         │  null   │   key   │ default │  extra  │
│     varchar      │                          varchar                           │ varchar │ varchar │ varchar │ varchar │
├──────────────────┼────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ geometry         │ GEOMETRY                                                   │ YES     │ NULL    │ NULL    │ NULL    │
│ gml_id           │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ lokalId          │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ navnerom         │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ versjonId        │ VAR

In [8]:
db.sql(
    f'''
    SELECT 'Non-null' as name, SUM(CASE WHEN bygningsnummer IS NOT NULL THEN 1 ELSE 0 END) as count FROM '{fkb_release_1}'
    UNION ALL
    SELECT 'Null' as name, SUM(CASE WHEN bygningsnummer IS NULL THEN 1 ELSE 0 END) as count FROM '{fkb_release_1}'
    '''
).show()

┌──────────┬────────┐
│   name   │ count  │
│ varchar  │ int128 │
├──────────┼────────┤
│ Non-null │   9088 │
│ Null     │   1794 │
└──────────┴────────┘



In [9]:
fkb_preview = db.execute(
    f'''
    SELECT lokalId, bygningsnummer, bygningstype, layer, geometry FROM '{fkb_release_1}'
    WHERE bygningsnummer IS NOT NULL
    LIMIT 10
    ''').fetchdf()

fkb_preview

Unnamed: 0,lokalId,bygningsnummer,bygningstype,layer,geometry
0,856aeb88-88e9-4075-ae03-f499ed36d472,300195537.0,212.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 40, 171, 98, 65, 91, ..."
1,46c9eef1-a9cc-45e8-93c0-b49ab0477452,300195539.0,311.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 243, 173, 98, 65, 115..."
2,63c85c5d-0da2-4279-88e8-50e92b6aae20,11163548.0,219.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 177, 174, 98, 65, 119..."
3,c3cee0d6-a70a-4e40-a9f6-5059fcfe2d36,11144403.0,219.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 62, 178, 98, 65, 80, ..."
4,5ea71c95-80b8-4d10-9f89-4a4a088c7e6f,11162460.0,311.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 97, 178, 98, 65, 96, ..."
5,e19951e4-422b-4cc6-9544-1de3932d4b7f,11163521.0,219.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 55, 186, 98, 65, 35, ..."
6,3164555b-4692-4d20-936e-62995ff41ecd,11163513.0,212.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 225, 192, 98, 65, 106..."
7,1d51da7f-c92b-431d-998d-ba4e329e6d25,300207494.0,212.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 15, 195, 98, 65, 57, ..."
8,5cdca7ed-65d6-4384-a14a-873dab4289c9,11164064.0,219.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 171, 133, 98, 65, 138..."
9,5876ce33-4320-49c8-bef3-06a750ff333a,18785684.0,219.0,Bygning,"[2, 4, 0, 0, 0, 0, 0, 0, 4, 150, 98, 65, 254, ..."


### Merge datasets on IDs

In [10]:
fkb_osm_join_df = db.sql(
    f'''
    CREATE OR REPLACE TABLE osm AS (
        SELECT id, geometry, TRY_CAST("ref:bygningsnr" AS INTEGER) as bygningsnummer, region FROM '{osm_release_2}'
    );

    CREATE OR REPLACE TABLE fkb AS (
        SELECT lokalId, geometry, TRY_CAST(bygningsnummer AS INTEGER) AS bygningsnummer, region FROM '{fkb_release_1}'
    );

    SELECT id AS osm_id, lokalId AS fkb_id, osm.bygningsnummer AS building_id FROM osm
    JOIN fkb ON osm.bygningsnummer = fkb.bygningsnummer
    LIMIT 10;
    '''
).show()

┌────────────┬──────────────────────────────────────┬─────────────┐
│   osm_id   │                fkb_id                │ building_id │
│   int64    │               varchar                │    int32    │
├────────────┼──────────────────────────────────────┼─────────────┤
│ 1877822650 │ 18051539-fa0c-4612-b692-3a84b7745e20 │    20320710 │
│ 1877822744 │ 876560b9-43de-4464-bcdd-22d9f2a0a631 │    20317183 │
│ 1877823000 │ 8743e499-311f-4751-89b6-2e4d2e298bd9 │    20320184 │
│ 1877823186 │ 211a9be6-8550-44d6-9985-d887c6da38c6 │    20321695 │
│ 1877823386 │ c1b126cd-fa67-4055-8c6e-e753aaee8a0b │    20311045 │
│ 1877823390 │ 7b760c00-0571-4e79-861e-9c346ea96d02 │    20311053 │
│ 1877823408 │ 05aab06a-cb8c-4b8b-ad7a-84bdcbc6f857 │    20311096 │
│ 1877823416 │ 84fb4d5c-6e66-4af4-9921-27f967ecfc8d │    20311118 │
│ 1877823424 │ 9362ed7a-547e-4e40-a1a0-b3c28fb229cb │    20311061 │
│ 1877823430 │ d3d6fe86-e075-42d4-86be-1b9359df948d │    20311088 │
├────────────┴──────────────────────────────────

### Merge merge OSM datasets between releases
#### Comparing geometries with `CROSS JOIN`
Not feasible as there are too many geometires
```
db.execute(
    f'''
    CREATE OR REPLACE TABLE old_release AS (
        SELECT
            id,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            ST_Normalize(ST_Force2D(ST_Simplify(geometry, 0.0000449))) AS geom
        FROM '{osm_release_1}'
    );

    CREATE OR REPLACE TABLE new_release AS (
        SELECT
            id,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            ST_Normalize(ST_Force2D(ST_Simplify(geometry, 0.0000449))) AS geom
        FROM '{osm_release_2}'
    );

    SELECT DISTINCT
        COALESCE(new_release.id, old_release.id) AS id,
        old_release.id as old_id,
        new_release.id as new_id,
        old_release.geom as old_geom,
        new_release.geom as new_geom
    FROM old_release, new_release
    WHERE NOT ST_Equals(old_geom, new_geom)
    '''
)
```

#### Compare geometries by finding similar centroids

In [11]:
limit = 10_000_000

In [None]:
compare_centroids_df = db.execute(
    f'''
    WITH old_release AS (
        SELECT
            id,
            ST_Force2D(geometry) AS geom,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Area(geometry) AS area
        FROM '{osm_release_1}'
        LIMIT {limit}
    ),
    new_release AS (
        SELECT
            id,
            ST_Force2D(geometry) AS geom,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Area(geometry) AS area
        FROM '{osm_release_2}'
        LIMIT {limit}
    ),
    overlapping_buildings AS (
        SELECT
            o.id AS old_id,
            n.id AS new_id,
            ST_AsWKB(o.geom) AS old_geom,
            ST_AsWKB(n.geom) AS new_geom,
            COALESCE(n.building_id, o.building_id) AS building_id,
            ST_Area(ST_Intersection(o.geom, n.geom)) / ST_Area(ST_Union(o.geom, n.geom)) AS iou
        FROM old_release o
        JOIN new_release n ON o.grid_x = n.grid_x AND o.grid_y = n.grid_y
        WHERE ST_Intersects(o.geom, n.geom)
        AND NOT ST_Equals(o.geom, n.geom)
    ),
    changed_buildings AS (
        SELECT * FROM overlapping_buildings WHERE iou > 0.90
    )

    SELECT * FROM changed_buildings;
    '''
).fetchdf()

In [13]:
print(f"Number of matches:", compare_centroids_df.shape[0])
compare_centroids_df.head()

Number of matches: 3


Unnamed: 0,old_id,new_id,old_geom,new_geom,building_id,iou
0,10023918,24007443,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 2, ...",81213496,0.944875
1,171576910,171576910,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",80953941,0.999994
2,24007443,10023918,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 2, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",81213496,0.944875


In [14]:
compare_centroids_df["old_geom"] = compare_centroids_df["old_geom"].apply(
    lambda g: bytes(g) if isinstance(g, (bytearray, memoryview)) else g)
compare_centroids_df["new_geom"] = compare_centroids_df["new_geom"].apply(
    lambda g: bytes(g) if isinstance(g, (bytearray, memoryview)) else g)
compare_centroids_df.head()

Unnamed: 0,old_id,new_id,old_geom,new_geom,building_id,iou
0,10023918,24007443,"b""\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...","b""\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...",81213496,0.944875
1,171576910,171576910,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...,b'\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...,80953941,0.999994
2,24007443,10023918,"b""\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...","b""\x01\x06\x00\x00\x00\x01\x00\x00\x00\x01\x03...",81213496,0.944875


In [15]:
compare_centroids_df["old_geom"] = compare_centroids_df["old_geom"].apply(from_wkb)
compare_centroids_df["new_geom"] = compare_centroids_df["new_geom"].apply(from_wkb)

In [16]:
compare_centroids_df.head()

Unnamed: 0,old_id,new_id,old_geom,new_geom,building_id,iou
0,10023918,24007443,"MULTIPOLYGON (((10.720639 59.938739, 10.720785...","MULTIPOLYGON (((10.720639 59.938739, 10.720785...",81213496,0.944875
1,171576910,171576910,"MULTIPOLYGON (((10.845068 59.906434, 10.845084...","MULTIPOLYGON (((10.845068 59.906434, 10.845084...",80953941,0.999994
2,24007443,10023918,"MULTIPOLYGON (((10.720639 59.938739, 10.720785...","MULTIPOLYGON (((10.720639 59.938739, 10.720785...",81213496,0.944875


In [17]:
"""
gdf_1 = gpd.GeoDataFrame(
    compare_centroids_df[["old_id", "new_id", "building_id", "iou", "old_geom"]],
    geometry="old_geom", crs="EPSG:4326"
)
gdf_2 = gpd.GeoDataFrame(
    compare_centroids_df[["old_id", "new_id", "building_id", "iou", "new_geom"]],
    geometry="new_geom", crs="EPSG:4326"
)
gdf_1.to_parquet("matched_old.parquet", schema_version="1.1.0")
gdf_2.to_parquet("matched_new.parquet", schema_version="1.1.0")
"""

'\ngdf_1 = gpd.GeoDataFrame(\n    compare_centroids_df[["old_id", "new_id", "building_id", "iou", "old_geom"]],\n    geometry="old_geom", crs="EPSG:4326"\n)\ngdf_2 = gpd.GeoDataFrame(\n    compare_centroids_df[["old_id", "new_id", "building_id", "iou", "new_geom"]],\n    geometry="new_geom", crs="EPSG:4326"\n)\ngdf_1.to_parquet("matched_old.parquet", schema_version="1.1.0")\ngdf_2.to_parquet("matched_new.parquet", schema_version="1.1.0")\n'

#### Find geometries that have been added in the new release, but wasn't there in the old one

In [12]:
new_buildings_df = db.execute(
    f'''
    WITH old_release AS (
        SELECT
            id AS old_id,
            ST_Force2D(geometry) AS old_geom,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Area(geometry) AS area
        FROM '{osm_release_1}'
        LIMIT {limit}
    ),
    new_release AS (
        SELECT
            id AS new_id,
            ST_Force2D(geometry) AS new_geom,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Area(geometry) AS area
        FROM '{osm_release_2}'
        LIMIT {limit}
    ),

    -- Your original CTE
    overlapping_buildings AS (
        SELECT
            o.old_id,
            n.new_id,
            ST_AsWKB(o.old_geom) AS old_geom,
            ST_AsWKB(n.new_geom) AS new_geom,
            COALESCE(n.building_id, o.building_id) AS building_id,
            ST_Area(ST_Intersection(o.old_geom, n.new_geom)) /
            ST_Area(ST_Union(o.old_geom, n.new_geom)) AS iou
        FROM old_release o
        JOIN new_release n
          ON o.grid_x = n.grid_x AND o.grid_y = n.grid_y
        WHERE ST_Intersects(o.old_geom, n.new_geom)
          AND NOT ST_Equals(o.old_geom, n.new_geom)
    ),

    -- Your original logic for changed buildings
    changed_buildings AS (
        SELECT *
        FROM overlapping_buildings
        WHERE iou > 0.90
    ),

    -- NEW: best-match logic to find geometry-based "no match"
    candidate_matches AS (
        SELECT
            o.old_id,
            n.new_id,
            ST_Area(ST_Intersection(o.old_geom, n.new_geom)) /
            ST_Area(ST_Union(o.old_geom, n.new_geom)) AS iou
        FROM old_release o
        JOIN new_release n
          ON o.grid_x = n.grid_x
         AND o.grid_y = n.grid_y
        WHERE ST_Intersects(o.old_geom, n.new_geom)
    ),

    best_match AS (
        SELECT *
        FROM (
            SELECT *,
                   ROW_NUMBER() OVER (PARTITION BY new_id ORDER BY iou DESC) AS rn
            FROM candidate_matches
        )
        WHERE rn = 1
    ),

    -- NEW: buildings in new_release without a valid old match
    added_buildings AS (
        SELECT
            NULL AS old_id,
            n.new_id,
            NULL AS old_geom,
            ST_AsWKB(n.new_geom) AS new_geom,
            n.building_id AS building_id,
            NULL AS iou
        FROM new_release n
        LEFT JOIN best_match b ON n.new_id = b.new_id
        WHERE b.iou IS NULL OR b.iou < 0.90
    )

    -- Final output
    SELECT * FROM changed_buildings
    UNION ALL
    SELECT * FROM added_buildings;
    '''
).fetchdf()

In [13]:
new_buildings_df["new_geom"] = new_buildings_df["new_geom"].apply(
    lambda g: bytes(g) if isinstance(g, (bytearray, memoryview)) else g
)

In [14]:
new_buildings_df["new_geom"] = new_buildings_df["new_geom"].apply(from_wkb)

In [15]:
print("Number of matches:", new_buildings_df.shape[0])
new_buildings_df.head()

Number of matches: 204


Unnamed: 0,old_id,new_id,old_geom,new_geom,building_id,iou
0,32507535,56081036,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 2, ...","MULTIPOLYGON (((5.738607 58.853286, 5.738796 5...",,0.914468
1,2058073680,2058070166,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...","MULTIPOLYGON (((11.447818 59.6089149, 11.44791...",147187130.0,0.930274
2,744553906,1951656038,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...","MULTIPOLYGON (((10.775699 59.3627161, 10.77600...",6516742.0,0.965423
3,744553892,1951657086,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...","MULTIPOLYGON (((10.7758939 59.3630227, 10.7759...",148265011.0,0.950142
4,744553816,1951656710,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...","MULTIPOLYGON (((10.7759662 59.362833, 10.77610...",6516750.0,0.952053


In [16]:
print(f"Number of new buildings added:", new_buildings_df["old_id"].isna().sum())

Number of new buildings added: 73
