In [None]:
import os
import duckdb
from dotenv import load_dotenv
import keplergl

In [None]:
# Load environment variables (including any .env file) and read the Azure Blob
# Storage connection string that DuckDB's azure extension will use.
load_dotenv()
connection_string = os.getenv("BLOB_STORAGE_CONNECTION_STRING")
if not connection_string:
    # Fail here with a clear message instead of handing None to DuckDB and
    # getting an opaque error from the first az:// read.
    raise RuntimeError(
        "BLOB_STORAGE_CONNECTION_STRING is not set; "
        "define it in the environment or in a .env file."
    )
# Never echo the value itself: connection strings typically embed credentials,
# and cell outputs are stored in the notebook file.
print("Connection string loaded (value hidden).")

# In-memory DuckDB database with the spatial and azure extensions available.
con = duckdb.connect(":memory:")
con.execute("INSTALL spatial; LOAD spatial;")
con.execute("INSTALL azure; LOAD azure;")
# Reuse the value read above rather than querying the environment a second time.
con.execute("SET azure_storage_connection_string = ?;", [connection_string])
con.execute("SET azure_transport_option_type = curl")

# Verify that connection string is set in duckdb

In [None]:
# Confirm the setting reached DuckDB without rendering the secret itself in the
# cell output: show only whether the value is non-empty.
con.sql("""
    SELECT current_setting('azure_storage_connection_string') <> '' AS connection_string_is_set
""")

In [None]:
# Inspect the Parquet schema of the FKB buildings files.
# NOTE(review): this reads release 2025-11-06.3 / region=18, whereas the map and
# filtering cells read release 2025-11-06.4 / region=46 - confirm this is intended.
fkb_schema_query = """
    SELECT * FROM parquet_schema('az://raw/release/2025-11-06.3/dataset=fkb/theme=buildings/region=18/**.parquet')
"""
con.sql(fkb_schema_query)

In [None]:
# Inspect the Parquet schema of the OSM buildings files.
# NOTE(review): this reads release 2025-11-06.3 / region=18, whereas the map and
# filtering cells read release 2025-11-06.4 / region=46 - confirm this is intended.
osm_schema_query = """
    SELECT * FROM parquet_schema('az://raw/release/2025-11-06.3/dataset=osm/theme=buildings/region=18/**.parquet')
"""
con.sql(osm_schema_query)

In [None]:
# Preview the first 10 rows of the OSM buildings data.
osm_preview_query = """
    SELECT * FROM read_parquet('az://raw/release/2025-11-06.3/dataset=osm/theme=buildings/region=18/**.parquet') LIMIT 10
"""
con.sql(osm_preview_query)

In [None]:
# Build a KeplerGL map with one layer per source dataset (FKB and OSM buildings).
# Geometry is converted to WKT text in SQL before the rows are handed to Kepler
# as pandas DataFrames.
# NOTE(review): the original comment promised FKB=red / OSM=green, but no colour
# configuration is passed to KeplerGl here - confirm how colours are assigned.
fkb_wkt_query = """
        SELECT * EXCLUDE(geometry), 
               ST_AsText(geometry) as geometry 
        FROM read_parquet('az://raw/release/2025-11-06.4/dataset=fkb/theme=buildings/region=46/**.parquet')
    """
osm_wkt_query = """
        SELECT * EXCLUDE(geometry), 
               ST_AsText(geometry) as geometry 
        FROM read_parquet('az://raw/release/2025-11-06.4/dataset=osm/theme=buildings/region=46/**.parquet')
    """

kart = keplergl.KeplerGl(height=700)

# Layers are added in the same order as before: FKB first, then OSM.
fkb_buildings_df = con.sql(fkb_wkt_query).df()
kart.add_data(fkb_buildings_df, name='FKB Buildings')

osm_buildings_df = con.sql(osm_wkt_query).df()
kart.add_data(osm_buildings_df, name='OSM Buildings')

# Last expression of the cell: renders the map widget.
kart


In [None]:
# Build the `filtered_buildings` table (columns: geometry, source, color):
#   * every FKB building                                    -> source 'fkb',            color 1
#   * OSM buildings that intersect no FKB building          -> source 'osm_no_overlap', color 2
#   * OSM buildings whose BEST overlap with any FKB building
#     is below OVERLAP_THRESHOLD                            -> source 'osm_different',  color 3
# OSM buildings that match some FKB building at or above the threshold are dropped.
#
# Overlap ratio = intersection area / area of the smaller of the two footprints.
#
# Candidate pairs are prefiltered with a centroid grid. Coordinates are assumed to
# be lon/lat degrees (the original comment says the data is in EPSG:4326 - TODO
# confirm), so a cell is 1 / GRID_CELLS_PER_DEGREE degrees wide.
#
# Changes compared with the previous version of this cell:
#   1. Each OSM building is classified once, by its maximum overlap over all
#      intersecting FKB buildings. Previously every (OSM, FKB) pair below the
#      threshold emitted a row, so an OSM building touching several FKB buildings
#      was duplicated, and one that matched an FKB building well was still kept if
#      it also grazed a neighbour.
#   2. The grid prefilter also checks the 8 neighbouring cells. Previously two
#      intersecting buildings whose centroids fell on opposite sides of a cell
#      border were never compared, and the OSM one was reported as 'no overlap'.
#      This is exact as long as no footprint spans more than half a cell per axis.
#   3. Zero-area footprints no longer cause a division by zero (NULLIF).
OVERLAP_THRESHOLD = 0.70      # OSM is considered a duplicate of FKB at or above this ratio
GRID_CELLS_PER_DEGREE = 100   # 0.01-degree grid cells

con.execute(f"""
    CREATE OR REPLACE TABLE filtered_buildings AS
    -- MATERIALIZED: the sources are referenced several times; this keeps the
    -- row_number() ids stable and avoids re-reading the remote Parquet files.
    WITH fkb AS MATERIALIZED (
        SELECT
            row_number() OVER () AS fkb_id,
            geometry,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * {GRID_CELLS_PER_DEGREE}) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * {GRID_CELLS_PER_DEGREE}) AS INTEGER) AS grid_y,
            ST_Area(geometry) AS area
        FROM read_parquet('az://raw/release/2025-11-06.4/dataset=fkb/theme=buildings/region=46/**.parquet')
    ),
    osm AS MATERIALIZED (
        SELECT
            row_number() OVER () AS osm_id,
            geometry,
            CAST(FLOOR(ST_X(ST_Centroid(geometry)) * {GRID_CELLS_PER_DEGREE}) AS INTEGER) AS grid_x,
            CAST(FLOOR(ST_Y(ST_Centroid(geometry)) * {GRID_CELLS_PER_DEGREE}) AS INTEGER) AS grid_y,
            ST_Area(geometry) AS area
        FROM read_parquet('az://raw/release/2025-11-06.4/dataset=osm/theme=buildings/region=46/**.parquet')
    ),
    -- Register each FKB building in its own cell and the 8 surrounding cells so the
    -- prefilter stays a plain equi-join. The 9 cells are distinct, so an (OSM, FKB)
    -- pair can match at most once.
    fkb_cells AS (
        SELECT
            f.fkb_id,
            f.grid_x + dx.v AS grid_x,
            f.grid_y + dy.v AS grid_y
        FROM fkb f
        CROSS JOIN (VALUES (-1), (0), (1)) AS dx(v)
        CROSS JOIN (VALUES (-1), (0), (1)) AS dy(v)
    ),
    -- One row per OSM building that intersects at least one FKB building, carrying
    -- its best overlap ratio. NULL when every candidate ratio had a zero denominator.
    osm_best_overlap AS (
        SELECT
            o.osm_id,
            MAX(
                ST_Area(ST_Intersection(o.geometry, f.geometry))
                / NULLIF(LEAST(o.area, f.area), 0)
            ) AS max_overlap_ratio
        FROM osm o
        JOIN fkb_cells c ON o.grid_x = c.grid_x AND o.grid_y = c.grid_y
        JOIN fkb f ON f.fkb_id = c.fkb_id
        WHERE ST_Intersects(o.geometry, f.geometry)
        GROUP BY o.osm_id
    )
    -- Keep all FKB buildings
    SELECT
        geometry,
        'fkb' AS source,
        1 AS color
    FROM fkb

    UNION ALL

    -- Keep OSM buildings that intersect no FKB building
    SELECT
        o.geometry,
        'osm_no_overlap' AS source,
        2 AS color
    FROM osm o
    LEFT JOIN osm_best_overlap b ON b.osm_id = o.osm_id
    WHERE b.osm_id IS NULL

    UNION ALL

    -- Keep OSM buildings whose best FKB match is still below the threshold
    SELECT
        o.geometry,
        'osm_different' AS source,
        3 AS color
    FROM osm o
    JOIN osm_best_overlap b ON b.osm_id = o.osm_id
    WHERE b.max_overlap_ratio < {OVERLAP_THRESHOLD}
""")

# Show summary: number of buildings per source category, largest first.
con.sql("""
    SELECT 
        source,
        COUNT(*) as count,
        CASE source
            WHEN 'fkb' THEN 'FKB buildings (red)'
            WHEN 'osm_no_overlap' THEN 'OSM buildings - no FKB overlap (green)'
            WHEN 'osm_different' THEN 'OSM buildings - significantly different from FKB (yellow)'
        END as description
    FROM filtered_buildings
    GROUP BY source
    ORDER BY count DESC
""").show()


In [None]:
# Show the merged result on a second KeplerGL map. Geometry is converted to WKT
# text in SQL; the remaining columns of filtered_buildings are passed through.
filtered_wkt_query = """
        SELECT * EXCLUDE(geometry), 
               ST_AsText(geometry) as geometry 
        FROM filtered_buildings
    """
filtered_buildings_df = con.sql(filtered_wkt_query).df()

kart2 = keplergl.KeplerGl(height=700)
kart2.add_data(filtered_buildings_df, name='Filtered Buildings')

# Last expression of the cell: renders the map widget.
kart2
