In [1]:
import geopandas as gpd
from shapely import from_wkb

from src.infra.persistence.context import create_duckdb_context

In [2]:
# Single DuckDB handle used by every query in this notebook.
# NOTE(review): the queries below read az:// parquet files and call ST_* functions —
# presumably create_duckdb_context() configures the required extensions and
# credentials; confirm in src.infra.persistence.context.
db = create_duckdb_context()

In [3]:
# Row cap applied to each dataset inside the comparison queries. It is larger than
# both row counts reported by the count cell (4,163,473 OSM / 10,882 FKB), so it
# currently acts as "no limit"; lower it for quick experiments.
LIMIT = 10_000_000

# Release snapshot shared by both datasets — change the version in one place only.
RELEASE = "2025-11-12.0"
_RELEASE_GLOB = "az://raw/release/{release}/dataset={dataset}/theme=buildings/region=*/*.parquet"

# Glob patterns covering every region partition of each dataset.
osm_release = _RELEASE_GLOB.format(release=RELEASE, dataset="osm")
fkb_release = _RELEASE_GLOB.format(release=RELEASE, dataset="fkb")

In [4]:
# Materialise one row count per dataset as tiny tables, then show them side by side.
# NOTE: this creates tables named `osm` and `fkb` in the DuckDB context; the later
# comparison queries define CTEs with the same names.
db.sql(
    f"""
    CREATE OR REPLACE TABLE osm AS (
        SELECT 'osm' AS name, COUNT(*) AS count FROM read_parquet('{osm_release}')
    );

    CREATE OR REPLACE TABLE fkb AS (
        SELECT 'fkb' AS name, COUNT(*) AS count FROM read_parquet('{fkb_release}')
    );

    -- The two rows carry different names and can never be duplicates of each other,
    -- so UNION ALL skips the de-duplication step that plain UNION performs.
    SELECT * FROM osm
    UNION ALL
    SELECT * FROM fkb
    ORDER BY name;
    """
).show()

┌─────────┬─────────┐
│  name   │  count  │
│ varchar │  int64  │
├─────────┼─────────┤
│ fkb     │   10882 │
│ osm     │ 4163473 │
└─────────┴─────────┘



### Describe columns and view head
#### OSM-dataset

In [5]:
# Column names and types of the OSM buildings parquet files (schema only, no rows).
db.sql(f"DESCRIBE SELECT * FROM '{osm_release}'").show()

┌────────────────┬────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
│  column_name   │                        column_type                         │  null   │   key   │ default │  extra  │
│    varchar     │                          varchar                           │ varchar │ varchar │ varchar │ varchar │
├────────────────┼────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ type           │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ ref:bygningsnr │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ id             │ BIGINT                                                     │ YES     │ NULL    │ NULL    │ NULL    │
│ geometry       │ GEOMETRY                                                   │ YES     │ NULL    │ NULL    │ NULL    │
│ partition_key  │ VARCHAR              

In [6]:
# Peek at the first 10 OSM rows as a pandas DataFrame; the geometry column comes
# back as raw bytes, not as shapely objects.
db.execute(f"SELECT * FROM '{osm_release}' LIMIT 10").fetchdf()

Unnamed: 0,type,ref:bygningsnr,id,geometry,partition_key,bbox,dataset,region,theme
0,unspecified,81410631.0,8737738,"[5, 4, 0, 0, 0, 0, 0, 0, 116, 188, 43, 65, 4, ...",u4x,"{'xmin': 10.7335099, 'ymin': 59.9140781, 'xmax...",osm,3,buildings
1,commercial,80753756.0,8781748,"[5, 4, 0, 0, 0, 0, 0, 0, 240, 14, 44, 65, 48, ...",u4x,"{'xmin': 10.7536473, 'ymin': 59.9113187, 'xmax...",osm,3,buildings
2,train_station,,8781750,"[5, 4, 0, 0, 0, 0, 0, 0, 152, 4, 44, 65, 202, ...",u4x,"{'xmin': 10.7511223, 'ymin': 59.9099536, 'xmax...",osm,3,buildings
3,unspecified,81433453.0,8789076,"[5, 4, 0, 0, 0, 0, 0, 0, 228, 244, 43, 65, 90,...",u4x,"{'xmin': 10.7472885, 'ymin': 59.9085476, 'xmax...",osm,3,buildings
4,office,81066884.0,8969088,"[5, 4, 0, 0, 0, 0, 0, 0, 233, 180, 46, 65, 36,...",u4x,"{'xmin': 10.919168, 'ymin': 59.940569, 'xmax':...",osm,3,buildings
5,transportation,81042020.0,9515420,"[5, 4, 0, 0, 0, 0, 0, 0, 218, 19, 44, 65, 225,...",u4x,"{'xmin': 10.7548475, 'ymin': 59.9110183, 'xmax...",osm,3,buildings
6,university,81410208.0,10008720,"[5, 4, 0, 0, 0, 0, 0, 0, 86, 188, 43, 65, 60, ...",u4x,"{'xmin': 10.7334811, 'ymin': 59.9152681, 'xmax...",osm,3,buildings
7,university,80487835.0,10008732,"[5, 4, 0, 0, 0, 0, 0, 0, 56, 193, 43, 65, 45, ...",u4x,"{'xmin': 10.7346732, 'ymin': 59.9152116, 'xmax...",osm,3,buildings
8,church,80259069.0,10021678,"[5, 4, 0, 0, 0, 0, 0, 0, 165, 139, 46, 65, 220...",u4x,"{'xmin': 10.9090938, 'ymin': 59.9442016, 'xmax...",osm,3,buildings
9,university,81166838.0,10023894,"[5, 4, 0, 0, 0, 0, 0, 0, 10, 127, 43, 65, 239,...",u4x,"{'xmin': 10.7185158, 'ymin': 59.938414, 'xmax'...",osm,3,buildings


#### FKB-dataset

In [7]:
# Column names and types of the FKB buildings parquet files (schema only, no rows).
db.sql(f"DESCRIBE SELECT * FROM '{fkb_release}'").show()

┌──────────────────┬────────────────────────────────────────────────────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name    │                        column_type                         │  null   │   key   │ default │  extra  │
│     varchar      │                          varchar                           │ varchar │ varchar │ varchar │ varchar │
├──────────────────┼────────────────────────────────────────────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ geometry         │ GEOMETRY                                                   │ YES     │ NULL    │ NULL    │ NULL    │
│ gml_id           │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ lokalId          │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ navnerom         │ VARCHAR                                                    │ YES     │ NULL    │ NULL    │ NULL    │
│ versjonId        │ VAR

In [8]:
# Peek at the first 10 FKB rows as a pandas DataFrame; the geometry column comes
# back as raw bytes, not as shapely objects.
db.execute(f"SELECT * FROM '{fkb_release}' LIMIT 10").fetchdf()

Unnamed: 0,geometry,gml_id,lokalId,navnerom,versjonId,oppdateringsdato,datafangstdato,bygningsnummer,bygningstype,kommunenummer,...,H-MÅLEMETODE,treDNivå,medium,TRE_D_NIVÅ,layer,partition_key,bbox,dataset,region,theme
0,"[2, 4, 0, 0, 0, 0, 0, 0, 170, 164, 98, 65, 48,...",AnnenBygning.4879,faed7c3e-eb80-47f0-8da9-fbdd3efef2ec,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.165201577039001, 'ymin': 66.307011...",fkb,18,buildings
1,"[2, 4, 0, 0, 0, 0, 0, 0, 233, 163, 98, 65, 35,...",AnnenBygning.4952,d4e851fc-c033-4011-8aca-75cf0bca0014,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.165017786222235, 'ymin': 66.306909...",fkb,18,buildings
2,"[2, 4, 0, 0, 0, 0, 0, 0, 40, 171, 98, 65, 91, ...",Bygning.4769,856aeb88-88e9-4075-ae03-f499ed36d472,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,300195537.0,212.0,1833.0,...,,,,,Bygning,u7f,"{'xmin': 14.16678672524374, 'ymin': 66.3073404...",fkb,18,buildings
3,"[2, 4, 0, 0, 0, 0, 0, 0, 243, 173, 98, 65, 115...",Bygning.4981,46c9eef1-a9cc-45e8-93c0-b49ab0477452,http://data.geonorge.no/SFKB/FKB-Bygning/so,2019-01-31 10:59:21.846000,2019-01-31T10:59:21,,300195539.0,311.0,1833.0,...,,,,,Bygning,u7f,"{'xmin': 14.167468087761144, 'ymin': 66.307523...",fkb,18,buildings
4,"[2, 4, 0, 0, 0, 0, 0, 0, 177, 174, 98, 65, 119...",Bygning.4529,63c85c5d-0da2-4279-88e8-50e92b6aae20,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,11163548.0,219.0,1833.0,...,,,,,Bygning,u7f,"{'xmin': 14.167649353986613, 'ymin': 66.307548...",fkb,18,buildings
5,"[2, 4, 0, 0, 0, 0, 0, 0, 156, 176, 98, 65, 32,...",AnnenBygning.4930,9a7c2354-5e1c-4e1f-9265-a7178d116f3e,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.168118157129847, 'ymin': 66.306887...",fkb,18,buildings
6,"[2, 4, 0, 0, 0, 0, 0, 0, 192, 175, 98, 65, 243...",AnnenBygning.4864,ce340d77-1f35-4fd9-8827-52755cbefef5,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.167908457324565, 'ymin': 66.306544...",fkb,18,buildings
7,"[2, 4, 0, 0, 0, 0, 0, 0, 130, 177, 98, 65, 91,...",AnnenBygning.4816,3719df71-15e0-4133-baf1-7430a5370ec3,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.168337698634899, 'ymin': 66.305383...",fkb,18,buildings
8,"[2, 4, 0, 0, 0, 0, 0, 0, 117, 178, 98, 65, 82,...",AnnenBygning.4873,e5ffd195-5eed-474c-9c92-b8edeba28540,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.168569028625843, 'ymin': 66.305315...",fkb,18,buildings
9,"[2, 4, 0, 0, 0, 0, 0, 0, 23, 180, 98, 65, 76, ...",AnnenBygning.4838,7af07caa-6a13-49b5-8f43-2a1b1bad6483,http://data.geonorge.no/SFKB/FKB-Bygning/so,2017-05-16 15:03:21.742000,2017-05-16T15:03:21,,,,,...,,,,,AnnenBygning,u7f,"{'xmin': 14.168967802279756, 'ymin': 66.305270...",fkb,18,buildings


### Query from workshop 1
- Runs in 2 minutes and 30 seconds

In [9]:
# Build the table `filtered_buildings`: every FKB footprint, plus the OSM footprints
# that either intersect no FKB footprint or whose best IoU against an intersecting
# FKB footprint is below 0.70. OSM footprints whose best IoU is 0.70 or higher are
# left out of the table.
# Candidate pairs are only compared when both centroids fall into the same
# FLOOR(coordinate * 100) grid cell.
# NOTE(review): footprints that intersect but whose centroids land in different grid
# cells are never paired, so such OSM rows are reported as 'osm_no_overlap' —
# confirm this is acceptable.
# NOTE(review): only geometries are carried through, no source ids; the IoU division
# has no zero-area guard.
db.execute(
    f"""
    CREATE OR REPLACE TABLE filtered_buildings AS (
        WITH fkb AS (
            SELECT
                geometry,
                CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) as grid_x,
                CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) as grid_y,
                ST_Area(geometry) as area
            FROM read_parquet('{fkb_release}')
            LIMIT {LIMIT}
        ),

        osm AS (
            SELECT
                geometry,
                CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) as grid_x,
                CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) as grid_y,
                ST_Area(geometry) as area
            FROM read_parquet('{osm_release}')
            LIMIT {LIMIT}
        ),

        -- Find OSM buildings that intersect with FKB buildings (prefiltered by grid cell)
        -- Calculate IoU (Intersection over Union) for robust similarity measure
        -- Group by OSM geometry to get MAX IoU when one OSM building overlaps multiple FKB buildings
        osm_with_overlap AS (
            SELECT
                o.geometry as osm_geom,
                MAX(
                    ST_Area(ST_Intersection(o.geometry, f.geometry)) / ST_Area(ST_Union(o.geometry, f.geometry))
                ) as max_iou_ratio
            FROM osm o
            JOIN fkb f
                ON o.grid_x = f.grid_x
                AND o.grid_y = f.grid_y  -- Prefilter using grid
            WHERE ST_Intersects(o.geometry, f.geometry)
            GROUP BY o.geometry -- Critical: aggregate per OSM building to handle multiple FKB overlaps
        )
        -- Keep all FKB buildings (red)
        SELECT
            geometry,
            'fkb' as source,
            1 as color,
            NULL as iou_ratio
        FROM fkb

        UNION ALL

        -- Keep OSM buildings with no overlap (green)
        SELECT
            o.geometry,
            'osm_no_overlap' as source,
            2 as color,
            0.0 as iou_ratio
        FROM osm o
        WHERE NOT EXISTS (
            SELECT 1 FROM fkb f
            WHERE f.grid_x = o.grid_x AND f.grid_y = o.grid_y
            AND ST_Intersects(o.geometry, f.geometry)
        )

        UNION ALL

        -- Keep OSM buildings with significant difference (IoU < 70%, yellow)
        SELECT
            osm_geom as geometry,
            'osm_different' as source,
            3 as color,
            max_iou_ratio as iou_ratio
        FROM osm_with_overlap
        WHERE max_iou_ratio < 0.70
    )
    """
)

<_duckdb.DuckDBPyConnection at 0x1b2b8c85ef0>

#### Show what the final distribution of buildings should look like (note: this method does not handle IDs correctly)

In [10]:
# Per-source summary of `filtered_buildings`: row count plus mean/min/max IoU rounded
# to 3 decimals, with a human-readable label for each source. Rows of source 'fkb'
# were inserted with a NULL iou_ratio, so their IoU aggregates are NULL.
db.sql(
    """
    SELECT source,
           COUNT(*) as count,
        ROUND(AVG(iou_ratio), 3) as avg_iou,
        ROUND(MIN(iou_ratio), 3) as min_iou,
        ROUND(MAX(iou_ratio), 3) as max_iou,
        CASE source
            WHEN 'fkb' THEN 'FKB buildings (red)'
            WHEN 'osm_no_overlap' THEN 'OSM buildings - no FKB overlap (green)'
            WHEN 'osm_different' THEN 'OSM buildings - significantly different from FKB (yellow)'
    END
    as description
    FROM filtered_buildings
    GROUP BY source
    ORDER BY count DESC
    """
).show()

┌────────────────┬─────────┬─────────┬─────────┬─────────┬───────────────────────────────────────────────────────────┐
│     source     │  count  │ avg_iou │ min_iou │ max_iou │                        description                        │
│    varchar     │  int64  │ double  │ double  │ double  │                          varchar                          │
├────────────────┼─────────┼─────────┼─────────┼─────────┼───────────────────────────────────────────────────────────┤
│ osm_no_overlap │ 4154345 │     0.0 │     0.0 │     0.0 │ OSM buildings - no FKB overlap (green)                    │
│ fkb            │   10882 │    NULL │    NULL │    NULL │ FKB buildings (red)                                       │
│ osm_different  │     313 │   0.153 │     0.0 │   0.698 │ OSM buildings - significantly different from FKB (yellow) │
└────────────────┴─────────┴─────────┴─────────┴─────────┴───────────────────────────────────────────────────────────┘



### Select all FKB polygons by default, use centroids to find overlaps and return the OSM dataset as a diff
It seems that this algorithm struggles to relate the roughly 10 000 FKB buildings to their corresponding OSM buildings.

In [11]:
# Id-based diff between the two datasets. The result has two columns
# (fkb_id, osm_id) and is the concatenation of:
#   * fkb_only     – FKB rows that are not the best match of any OSM geometry
#   * osm_only     – OSM rows that intersect no FKB row
#   * fkb_osm_diff – best-match pairs whose IoU is below 0.70
# Pairs are only compared when both centroids share a FLOOR(coordinate * 100) cell.
#
# The best FKB match per OSM geometry is picked with arg_max(fkb_id, iou), so the
# reported fkb_id and iou always come from the same pair (independent MAX() calls
# returned the lexicographically largest id, unrelated to the best IoU).
# NOTE(review): the next cell reassigns `query` before anything executes this text.
# NOTE(review): an FKB row that overlaps an OSM row without being its best match is
# still listed under fkb_only — confirm that this is the intended meaning.
query = f'''
    WITH fkb AS (
        SELECT
            lokalId AS fkb_id,
            TRY_CAST("bygningsnummer" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Force2D(geometry) AS geom,
            ST_Area(geometry) AS area
        FROM '{fkb_release}'
        LIMIT {LIMIT}
    ),

    osm AS (
        SELECT
            id AS osm_id,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Force2D(geometry) AS geom,
            ST_Area(geometry) AS area
        FROM '{osm_release}'
        LIMIT {LIMIT}
    ),

    -- Every intersecting (OSM, FKB) pair with its IoU; NULLIF guards a zero-area union.
    pair_iou AS (
        SELECT
            o.osm_id,
            o.geom AS osm_geom,
            f.fkb_id,
            ST_Area(ST_Intersection(o.geom, f.geom))
                / NULLIF(ST_Area(ST_Union(o.geom, f.geom)), 0) AS iou
        FROM osm o
        JOIN fkb f
            ON o.grid_x = f.grid_x
            AND o.grid_y = f.grid_y
        WHERE ST_Intersects(o.geom, f.geom)
    ),

    -- Best FKB match per OSM geometry; fkb_id and iou are taken from the same pair.
    fkb_osm_overlap AS (
        SELECT
            arg_max(fkb_id, iou) AS fkb_id,
            MAX(osm_id) AS osm_id,
            MAX(iou) AS iou
        FROM pair_iou
        GROUP BY osm_geom
    ),

    fkb_only AS (
        SELECT
            fkb_id,
            NULL AS osm_id
        FROM fkb f
        WHERE NOT EXISTS (
            SELECT 1
            FROM fkb_osm_overlap foo
            WHERE f.fkb_id = foo.fkb_id
        )
    ),

    osm_only AS (
        SELECT
            NULL AS fkb_id,
            osm_id
        FROM osm o
        WHERE NOT EXISTS (
            SELECT 1
            FROM fkb_osm_overlap foo
            WHERE o.osm_id = foo.osm_id
        )
    ),

    fkb_osm_diff AS (
        SELECT
            fkb_id,
            osm_id
        FROM fkb_osm_overlap
        WHERE iou < 0.70
    )

    SELECT * FROM fkb_only
    UNION ALL
    SELECT * FROM osm_only
    UNION ALL
    SELECT * FROM fkb_osm_diff
    '''

### Match all polygons without any special filters
- Display data and group by IDs

In [12]:
# Lists every intersecting (FKB, OSM) pair without any IoU threshold: both ids, both
# building numbers (TRY_CAST to INTEGER) and both 2D geometries exported as WKB.
# Pairs are only compared when both centroids share a FLOOR(coordinate * 100) cell.
# The CTE also computes `iou` and a coalesced `building_id`, but the final SELECT
# does not return them.
query = f'''
    WITH fkb AS (
        SELECT
            lokalId AS fkb_id,
            TRY_CAST("bygningsnummer" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Force2D(geometry) AS geom,
        FROM '{fkb_release}'
        LIMIT {LIMIT}
    ),

    osm AS (
        SELECT
            id AS osm_id,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Force2D(geometry) AS geom,
        FROM '{osm_release}'
        LIMIT {LIMIT}
    ),

    fkb_osm_overlap AS (
        SELECT
            f.fkb_id AS fkb_id,
            o.osm_id AS osm_id,
            ST_AsWKB(f.geom) AS fkb_geom,
            ST_AsWKB(o.geom) AS osm_geom,
            COALESCE(f.building_id, o.building_id) AS building_id,
            f.building_id AS fkb_building_id,
            o.building_id AS osm_building_id,
            ST_Area(ST_Intersection(f.geom, o.geom)) / ST_Area(ST_Union(f.geom, o.geom)) AS iou,
        FROM fkb f
        JOIN osm o
            ON f.grid_x = o.grid_x
            AND f.grid_y = o.grid_y
        WHERE ST_Intersects(f.geom, o.geom)
    )

    SELECT
        fkb_id,
        osm_id,
        fkb_building_id,
        osm_building_id,
        fkb_geom,
        osm_geom
    FROM fkb_osm_overlap
    '''

In [13]:
# Execute whatever SQL text `query` currently holds and load the result into pandas;
# the geometry columns arrive as raw WKB bytes.
df = db.execute(query).fetchdf()
df

Unnamed: 0,fkb_id,osm_id,fkb_building_id,osm_building_id,fkb_geom,osm_geom
0,0e029fa7-83d7-4a5b-8bf1-d56d00f2537b,395638970,11163122,11163122,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 13, 0, 0, 0, 234, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
1,335e04f8-5bde-4eed-bb7f-1bf53a3b638f,396950452,11163971,11163971,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 23, 0, 0, 0, 180, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
2,a58f279d-ec72-4a2f-b173-6d65b4b95c44,395638962,20298081,11164064,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 14, 0, 0, 0, 8, 0,...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
3,fe811362-afa0-4214-928e-748b781c5efc,395638954,11164048,11164048,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 96, 0, 0, 0, 56, 1...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
4,6ff8d582-9621-48c7-a835-28ea43b0a413,395638944,11164080,11164099,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 12, 0, 0, 0, 124, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
...,...,...,...,...,...,...
12531,9609cbe4-59ad-48a0-a8a0-31f86d87101c,2728177642,300722828,,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 32, 0, 0, 0, 24, 1...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
12532,9609cbe4-59ad-48a0-a8a0-31f86d87101c,2728177644,300722828,,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 32, 0, 0, 0, 24, 1...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
12533,39ba73b3-9d5f-40e6-b458-41b7351b5758,2309981050,301013061,,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 135, 1...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."
12534,c07861ac-5d97-4e32-a41d-1f43b72220a8,2728177646,300769118,,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 224, 1...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ..."


In [14]:
def _decode_wkb_cell(value):
    """Turn one WKB cell into a shapely geometry.

    bytearray/memoryview values are normalised to bytes first. Empty input maps to
    None. Anything that is not WKB input (None, or a geometry decoded by an earlier
    run of this cell) is returned unchanged, which makes re-running the cell safe.
    """
    if isinstance(value, (bytearray, memoryview)):
        value = bytes(value)
    if isinstance(value, (bytes, str)):
        return from_wkb(value) if value else None
    return value


# Decode both geometry columns in place with the same rule.
for _geom_col in ("fkb_geom", "osm_geom"):
    df[_geom_col] = df[_geom_col].map(_decode_wkb_cell)

**Show entries where the `building_id` is not matching**

In [15]:
# A pair counts as a mismatch only when BOTH sides carry a building number and the
# numbers differ. Rows with a missing id on either side are excluded explicitly, so
# the result does not depend on whether pandas stores the ids as NaN-backed floats
# (where NaN != NaN evaluates to True) or as a nullable integer dtype.
_both_ids_present = df["fkb_building_id"].notna() & df["osm_building_id"].notna()
_ids_differ = df["fkb_building_id"].ne(df["osm_building_id"])
mismatching_building_ids = df[_both_ids_present & _ids_differ]
mismatching_building_ids

Unnamed: 0,fkb_id,osm_id,fkb_building_id,osm_building_id,fkb_geom,osm_geom
2,a58f279d-ec72-4a2f-b173-6d65b4b95c44,395638962,20298081,11164064,"POLYGON ((14.16578363522423 66.3102316356353, ...","MULTIPOLYGON (((14.157634 66.310996, 14.157742..."
4,6ff8d582-9621-48c7-a835-28ea43b0a413,395638944,11164080,11164099,"POLYGON ((14.16540100149745 66.31213090397907,...","MULTIPOLYGON (((14.1657096 66.3125321, 14.1667..."
8,5876ce33-4320-49c8-bef3-06a750ff333a,395638962,18785684,11164064,POLYGON ((14.169750834405177 66.31086384644966...,"MULTIPOLYGON (((14.157634 66.310996, 14.157742..."
26,a2b8ec03-5519-4c00-9bb2-a8ac66468ce3,1877820414,300009975,300009987,"POLYGON ((14.17255448341043 66.31794018261229,...","MULTIPOLYGON (((14.1724807 66.3178978, 14.1726..."
29,d6495220-dbab-4729-9ac6-25d1c736d361,1877820416,300009987,300009975,POLYGON ((14.172693854849413 66.31790198589066...,"MULTIPOLYGON (((14.1725543 66.3179404, 14.1726..."
...,...,...,...,...,...,...
12481,5c51ed94-410c-496a-9be9-d7106a3ad7d3,2047392822,190695816,190872114,"POLYGON ((19.00301678182121 69.64633819413618,...","MULTIPOLYGON (((19.0030164 69.6463383, 19.0030..."
12488,2518ecd5-01a6-4557-afc9-0ee8a574c8d1,2047392874,190860108,190860116,"POLYGON ((19.007751479661188 69.6466128132472,...","MULTIPOLYGON (((19.0077136 69.6464984, 19.0077..."
12497,0569f83f-3bc0-4d23-9191-f2ebf480226c,2047392670,190875024,190686477,"POLYGON ((18.9943729412624 69.64802316660285, ...","MULTIPOLYGON (((18.9943283 69.6481259, 18.9943..."
12504,942d43c1-5fe7-4880-9e71-03befb20a48f,2047392710,16536245,16529796,POLYGON ((19.008802439832184 69.64516468565921...,"MULTIPOLYGON (((19.008596 69.645209, 19.008614..."


**Group by `fkb_id`**

In [16]:
# Keep only the pairs whose fkb_id occurs at least twice, attach the number of
# occurrences as `count`, and list the most frequently matched FKB buildings first.
_fkb_pair_counts = df["fkb_id"].value_counts()
_repeated_fkb_ids = _fkb_pair_counts[_fkb_pair_counts >= 2].index

grouped_by_fkb_id_df = (
    df[df["fkb_id"].isin(_repeated_fkb_ids)]
    .assign(count=lambda pairs: pairs["fkb_id"].map(_fkb_pair_counts))
    .sort_values(["count", "fkb_id"], ascending=[False, True])
)
grouped_by_fkb_id_df.head()

Unnamed: 0,fkb_id,osm_id,fkb_building_id,osm_building_id,fkb_geom,osm_geom,count
7606,820931ce-d82f-49f0-99e6-67817d0fb221,985128338,139838963,139277775,POLYGON ((5.317873312205446 60.389304176489176...,"MULTIPOLYGON (((5.31764 60.389242, 5.317764 60...",8
7607,820931ce-d82f-49f0-99e6-67817d0fb221,985128340,139838963,139277791,POLYGON ((5.317873312205446 60.389304176489176...,"MULTIPOLYGON (((5.317852 60.389181, 5.317947 6...",8
7608,820931ce-d82f-49f0-99e6-67817d0fb221,985128342,139838963,139838963,POLYGON ((5.317873312205446 60.389304176489176...,"MULTIPOLYGON (((5.317763 60.389339, 5.31782 60...",8
7609,820931ce-d82f-49f0-99e6-67817d0fb221,985128344,139838963,139277813,POLYGON ((5.317873312205446 60.389304176489176...,"MULTIPOLYGON (((5.318113 60.389105, 5.318235 6...",8
7610,820931ce-d82f-49f0-99e6-67817d0fb221,985128346,139838963,139277848,POLYGON ((5.317873312205446 60.389304176489176...,"MULTIPOLYGON (((5.318258 60.389064, 5.318391 6...",8


**Group by `osm_id`**

In [17]:
# Keep only the pairs whose osm_id occurs at least twice, attach the number of
# occurrences as `count`, and list the most frequently matched OSM buildings first.
_osm_pair_counts = df["osm_id"].value_counts()
_repeated_osm_ids = _osm_pair_counts[_osm_pair_counts >= 2].index

grouped_by_osm_id_df = (
    df[df["osm_id"].isin(_repeated_osm_ids)]
    .assign(count=lambda pairs: pairs["osm_id"].map(_osm_pair_counts))
    .sort_values(["count", "osm_id"], ascending=[False, True])
)
grouped_by_osm_id_df.head()

Unnamed: 0,fkb_id,osm_id,fkb_building_id,osm_building_id,fkb_geom,osm_geom,count
7137,cc610abe-ce98-4cf3-bcab-0983e599ac90,832862678,139281373,9423141,"POLYGON ((5.322272800587236 60.3905027148841, ...","MULTIPOLYGON (((5.322091 60.390299, 5.322132 6...",8
7140,44cd744f-9c24-4027-9b9e-e6c8bc48e8b3,832862678,139282604,9423141,"POLYGON ((5.322210697800086 60.39047367315272,...","MULTIPOLYGON (((5.322091 60.390299, 5.322132 6...",8
7142,c789b0e1-23ab-48bd-8add-7b1a88cf1748,832862678,23586827,9423141,"POLYGON ((5.32213518879636 60.39035105282214, ...","MULTIPOLYGON (((5.322091 60.390299, 5.322132 6...",8
7147,e6ada906-4fb8-4c9b-8c42-65c8f3896f59,832862678,9423141,9423141,"POLYGON ((5.322132044749497 60.39028400366596,...","MULTIPOLYGON (((5.322091 60.390299, 5.322132 6...",8
7149,38d96821-f1db-45d7-a2c7-d8be717880df,832862678,139282981,9423141,"POLYGON ((5.322741067206036 60.39022552520583,...","MULTIPOLYGON (((5.322091 60.390299, 5.322132 6...",8


**Create GeoDataFrames and save files**

In [18]:
# Each grouped frame is split into two projections that share the id/count columns
# and differ only in which geometry column they keep.
_id_and_count_cols = ["fkb_id", "osm_id", "fkb_building_id", "osm_building_id", "count"]
_with_fkb_geom = _id_and_count_cols + ["fkb_geom"]
_with_osm_geom = _id_and_count_cols + ["osm_geom"]

fkb_grouped_by_fkb_id = grouped_by_fkb_id_df[_with_fkb_geom]
osm_grouped_by_fkb_id = grouped_by_fkb_id_df[_with_osm_geom]
fkb_grouped_by_osm_id = grouped_by_osm_id_df[_with_fkb_geom]
osm_grouped_by_osm_id = grouped_by_osm_id_df[_with_osm_geom]

In [19]:
# Wrap the four projections as GeoDataFrames; every frame is tagged with the same
# CRS and uses its single geometry column as the active geometry.
_grouped_crs = "EPSG:4326"

fkb_grouped_by_fkb_id_gdf = gpd.GeoDataFrame(
    fkb_grouped_by_fkb_id, geometry="fkb_geom", crs=_grouped_crs
)
fkb_grouped_by_osm_id_gdf = gpd.GeoDataFrame(
    fkb_grouped_by_osm_id, geometry="fkb_geom", crs=_grouped_crs
)
osm_grouped_by_fkb_id_gdf = gpd.GeoDataFrame(
    osm_grouped_by_fkb_id, geometry="osm_geom", crs=_grouped_crs
)
osm_grouped_by_osm_id_gdf = gpd.GeoDataFrame(
    osm_grouped_by_osm_id, geometry="osm_geom", crs=_grouped_crs
)

In [20]:
# Exporting the grouped comparison frames is opt-in, so that "Run All" does not
# write files as a side effect. Set the flag to True to produce the parquet files.
SAVE_GROUPED_OUTPUTS = False

if SAVE_GROUPED_OUTPUTS:
    fkb_grouped_by_fkb_id_gdf.to_parquet("fkb_osm_compare_fkb_geom_group_by_fkb.parquet")
    osm_grouped_by_fkb_id_gdf.to_parquet("fkb_osm_compare_osm_geom_group_by_fkb.parquet")
    fkb_grouped_by_osm_id_gdf.to_parquet("fkb_osm_compare_fkb_geom_group_by_osm.parquet")
    osm_grouped_by_osm_id_gdf.to_parquet("fkb_osm_compare_osm_geom_group_by_osm.parquet")

'\nfkb_grouped_by_fkb_id_gdf.to_parquet("fkb_osm_compare_fkb_geom_group_by_fkb.parquet")\nosm_grouped_by_fkb_id_gdf.to_parquet("fkb_osm_compare_osm_geom_group_by_fkb.parquet")\nfkb_grouped_by_osm_id_gdf.to_parquet("fkb_osm_compare_fkb_geom_group_by_osm.parquet")\nosm_grouped_by_osm_id_gdf.to_parquet("fkb_osm_compare_osm_geom_group_by_osm.parquet")\n'

### Extend previous query
- When a single `fkb_id` matches multiple `osm_id`s, emit one row per match. The final output can therefore contain several rows with the same `fkb_id`, but only one row with the same `osm_id`.

In [50]:
# Classify the buildings into three groups and return them as one result set:
#   'fkb-only'        – FKB rows for which the LEFT JOIN found no intersecting OSM row
#   'osm-only'        – OSM rows for which the LEFT JOIN found no intersecting FKB row
#   'fkb-osm-overlap' – one row per intersecting (FKB, OSM) pair
# Pairs are only compared when both centroids share a FLOOR(coordinate * 100) cell.
# NOTE(review): the two "-only" groups are detected through `max_iou IS NULL`; because
# of the NULLIF guard, a matched pair whose union area is 0 would also yield NULL and
# be classified as "-only" — confirm whether testing the joined id for NULL is what is
# actually intended.
# NOTE(review): 'fkb-osm-overlap' keeps every intersecting pair, so the same osm_id
# can appear on several rows, while the heading above asks for at most one row per
# osm_id — TODO confirm.
query = f'''
    WITH fkb AS (
        SELECT
            lokalId AS fkb_id,
            TRY_CAST("bygningsnummer" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Force2D(geometry) AS geom,
        FROM '{fkb_release}'
        LIMIT {LIMIT}
    ),

    osm AS (
        SELECT
            id AS osm_id,
            TRY_CAST("ref:bygningsnr" AS INTEGER) AS building_id,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * 100) AS INTEGER) AS grid_y,
            ST_Force2D(geometry) AS geom,
        FROM '{osm_release}'
        LIMIT {LIMIT}
    ),

    candidate_fkb_only_buildings AS (
        SELECT
            f.fkb_id AS fkb_id,
            o.osm_id AS osm_id,
            ST_AsWKB(f.geom) AS fkb_geom,
            ST_AsWKB(o.geom) AS osm_geom,
            COALESCE(f.building_id, o.building_id) AS building_id,
            f.building_id AS fkb_building_id,
            o.building_id AS osm_building_id,
            CAST(MAX (
                ST_Area(ST_Intersection(f.geom, o.geom)) / NULLIF(ST_Area(ST_Union(f.geom, o.geom)), 0)
            ) AS DECIMAl) AS max_iou
        FROM fkb f
        LEFT JOIN osm o
            ON f.grid_x = o.grid_x
            AND f.grid_y = o.grid_y
            AND ST_Intersects(f.geom, o.geom)
        GROUP BY f.fkb_id, o.osm_id, f.geom, o.geom, f.building_id, o.building_id
    ),

    candidate_osm_only_buildings AS (
        SELECT
            f.fkb_id AS fkb_id,
            o.osm_id AS osm_id,
            ST_AsWKB(f.geom) AS fkb_geom,
            ST_AsWKB(o.geom) AS osm_geom,
            COALESCE(f.building_id, o.building_id) AS building_id,
            f.building_id AS fkb_building_id,
            o.building_id AS osm_building_id,
            CAST(MAX (
                ST_Area(ST_Intersection(f.geom, o.geom)) / NULLIF(ST_Area(ST_Union(f.geom, o.geom)), 0)
            ) AS DECIMAl) AS max_iou
        FROM osm o
        LEFT JOIN fkb f
            ON f.grid_x = o.grid_x
            AND f.grid_y = o.grid_y
            AND ST_Intersects(f.geom, o.geom)
        GROUP BY f.fkb_id, o.osm_id, f.geom, o.geom, f.building_id, o.building_id
    ),

    fkb_only AS (
        SELECT
            fkb_id,
            osm_id,
            fkb_geom,
            osm_geom,
            building_id,
            fkb_building_id,
            osm_building_id,
            max_iou AS iou,
        FROM candidate_fkb_only_buildings
        WHERE max_iou IS NULL
    ),

    osm_only AS (
        SELECT
            fkb_id,
            osm_id,
            fkb_geom,
            osm_geom,
            building_id,
            fkb_building_id,
            osm_building_id,
            max_iou AS iou,
        FROM candidate_osm_only_buildings
        WHERE max_iou IS NULL
    ),

    fkb_osm_overlap AS (
        SELECT
            f.fkb_id AS fkb_id,
            o.osm_id AS osm_id,
            ST_AsWKB(f.geom) AS fkb_geom,
            ST_AsWKB(o.geom) AS osm_geom,
            COALESCE(f.building_id, o.building_id) AS building_id,
            f.building_id AS fkb_building_id,
            o.building_id AS osm_building_id,
            ST_Area(ST_Intersection(f.geom, o.geom)) / ST_Area(ST_Union(f.geom, o.geom)) AS iou,
        FROM fkb f
        JOIN osm o
            ON f.grid_x = o.grid_x
            AND f.grid_y = o.grid_y
        WHERE ST_Intersects(f.geom, o.geom)
    ),

    merged AS (
        SELECT
            fkb_id,
            osm_id,
            fkb_building_id,
            osm_building_id,
            fkb_geom,
            osm_geom,
            'fkb-only' AS description
        FROM fkb_only
        UNION
        SELECT
            fkb_id,
            osm_id,
            fkb_building_id,
            osm_building_id,
            fkb_geom,
            osm_geom,
            'osm-only' AS description
        FROM osm_only
        UNION
        SELECT
            fkb_id,
            osm_id,
            fkb_building_id,
            osm_building_id,
            fkb_geom,
            osm_geom,
            'fkb-osm-overlap' AS description
        FROM fkb_osm_overlap
    )

    SELECT * FROM merged
    '''

In [51]:
# Run the classification query and load the full result into pandas.
# NOTE: this rebinds `df`, which earlier cells used for the pair listing.
df = db.execute(query).fetchdf()

In [52]:
# Display the combined frame; with millions of rows pandas shows only head and tail.
df

Unnamed: 0,fkb_id,osm_id,fkb_building_id,osm_building_id,fkb_geom,osm_geom,description
0,c33cbab2-c01c-4cfa-acf6-3e89523b40df,1877855860,300469559,300469559,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 21, 0, 0, 0, 100, ...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",fkb-osm-overlap
1,5b7f4b4c-4062-431d-abaa-c6d2b253cb7c,1877856982,300174681,300174681,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 19, 0, 0, 0, 50, 2...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",fkb-osm-overlap
2,47f3e254-f859-479a-82f2-f25e4dd95f3a,1877857714,300112823,300112823,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 139, 6...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",fkb-osm-overlap
3,b047c8d5-1994-41fc-b0f7-0d9994a2dba3,1877857194,300154987,300154987,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 12, 0, 0, 0, 63, 2...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",fkb-osm-overlap
4,20f9c325-fd10-42c6-939a-212868474d21,1877856082,12009682,300429025,"[1, 3, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 5, 225...","[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",fkb-osm-overlap
...,...,...,...,...,...,...,...
4168768,,2017402964,,192702607,,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",osm-only
4168769,,2017404980,,192405815,,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",osm-only
4168770,,2017406676,,192404053,,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",osm-only
4168771,,2017406758,,192433215,,"[1, 6, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 1, ...",osm-only


In [69]:
# Split the combined result into its three groups using the `description` label.
_group_label = df["description"]

fkb_only = df.loc[_group_label.eq("fkb-only")]
osm_only = df.loc[_group_label.eq("osm-only")]
overlap = df.loc[_group_label.eq("fkb-osm-overlap")]

In [70]:
# Reduce each group to the id column(s) and the single geometry it is exported with;
# the overlap group keeps the FKB geometry only.
_fkb_only_cols = ["fkb_id", "fkb_geom"]
_osm_only_cols = ["osm_id", "osm_geom"]
_overlap_cols = ["fkb_id", "osm_id", "fkb_geom"]

fkb_only = fkb_only.loc[:, _fkb_only_cols]
osm_only = osm_only.loc[:, _osm_only_cols]
overlap = overlap.loc[:, _overlap_cols]

In [71]:
def _wkb_to_geometry(value):
    """Turn one WKB cell into a shapely geometry.

    bytearray/memoryview values are normalised to bytes first. Empty input maps to
    None. Anything that is not WKB input (None, or a geometry decoded by an earlier
    run of this cell) is returned unchanged, which makes re-running the cell safe.
    """
    if isinstance(value, (bytearray, memoryview)):
        value = bytes(value)
    if isinstance(value, (bytes, str)):
        return from_wkb(value) if value else None
    return value


# The three frames are slices of `df`; `.assign` builds new frames instead of writing
# into those slices, which avoids pandas' SettingWithCopyWarning.
fkb_only = fkb_only.assign(fkb_geom=fkb_only["fkb_geom"].map(_wkb_to_geometry))
osm_only = osm_only.assign(osm_geom=osm_only["osm_geom"].map(_wkb_to_geometry))
overlap = overlap.assign(fkb_geom=overlap["fkb_geom"].map(_wkb_to_geometry))

In [72]:
# Wrap the FKB-only rows as a GeoDataFrame with `fkb_geom` as the active geometry,
# tagged as EPSG:4326, and preview the first rows.
fkb_only_gdf = gpd.GeoDataFrame(fkb_only, geometry="fkb_geom", crs="EPSG:4326")
fkb_only_gdf.head()

Unnamed: 0,fkb_id,fkb_geom
1544,20a4956c-2e52-4488-9ca7-560466512c84,"POLYGON ((14.16911 66.31392, 14.16914 66.31396..."
1545,83204b37-4084-4453-98d4-7e38431ace71,"POLYGON ((14.16555 66.31339, 14.16553 66.3134,..."
1546,9bd05e5f-a32b-46d8-bd0d-83df477b7ed1,"POLYGON ((14.16003 66.3077, 14.16002 66.30774,..."
1547,499ee8b2-c80d-4cf5-b456-553d472bb6b9,"POLYGON ((10.52522 59.89063, 10.52523 59.89064..."
1548,a243ef05-3777-4211-bcda-73b399470167,"POLYGON ((8.05893 58.15828, 8.05895 58.15831, ..."


In [73]:
# Wrap the OSM-only rows as a GeoDataFrame with `osm_geom` as the active geometry,
# tagged as EPSG:4326, and preview the first rows.
osm_only_gdf = gpd.GeoDataFrame(osm_only, geometry="osm_geom", crs="EPSG:4326")
osm_only_gdf.head()

Unnamed: 0,osm_id,osm_geom
1780,227718908,"MULTIPOLYGON (((10.76922 59.92524, 10.76932 59..."
1781,227755486,"MULTIPOLYGON (((10.7613 59.92139, 10.7613 59.9..."
1782,229458164,"MULTIPOLYGON (((10.72773 59.941, 10.72778 59.9..."
1783,229951500,"MULTIPOLYGON (((10.73492 59.93811, 10.73519 59..."
1784,230036720,"MULTIPOLYGON (((10.72851 59.92842, 10.72855 59..."


In [74]:
# Wrap the overlapping pairs as a GeoDataFrame; only the FKB geometry is kept for
# these rows, so `fkb_geom` is the active geometry (tagged as EPSG:4326).
overlap_gdf = gpd.GeoDataFrame(overlap, geometry="fkb_geom", crs="EPSG:4326")
overlap_gdf.head()

Unnamed: 0,fkb_id,osm_id,fkb_geom
0,c33cbab2-c01c-4cfa-acf6-3e89523b40df,1877855860,"POLYGON ((14.1558 66.30556, 14.15581 66.30561,..."
1,5b7f4b4c-4062-431d-abaa-c6d2b253cb7c,1877856982,"POLYGON ((14.15491 66.317, 14.15501 66.31698, ..."
2,47f3e254-f859-479a-82f2-f25e4dd95f3a,1877857714,"POLYGON ((14.15769 66.31807, 14.15763 66.31809..."
3,b047c8d5-1994-41fc-b0f7-0d9994a2dba3,1877857194,"POLYGON ((14.15676 66.31507, 14.15675 66.31508..."
4,20f9c325-fd10-42c6-939a-212868474d21,1877856082,"POLYGON ((14.1653 66.31634, 14.16537 66.31639,..."


In [75]:
# Persist the three result groups as GeoParquet files in the working directory.
for _frame, _target in (
    (fkb_only_gdf, "conflated_fkb_only.parquet"),
    (osm_only_gdf, "conflated_osm_only.parquet"),
    (overlap_gdf, "conflated_overlapping.parquet"),
):
    _frame.to_parquet(_target)