In [None]:
import duckdb
import geopandas as gpd
import numpy as np
from shapely import box, from_wkb, wkb

from src import Config

In [None]:
def decode_geometry(x):
    """Decode a WKB-encoded value into a shapely geometry.

    Accepted inputs:
      * ``list``: converted with ``bytes(x)`` and parsed as WKB.
      * ``bytes`` / ``bytearray``: parsed as WKB as-is.
      * ``str`` starting with ``"0106"``: treated as hex-encoded WKB.

    Every other value (including strings with a different prefix) gives ``None``.
    """
    if isinstance(x, list):
        payload = bytes(x)
    elif isinstance(x, (bytes, bytearray)):
        payload = x
    elif isinstance(x, str) and x.startswith("0106"):
        payload = bytes.fromhex(x)
    else:
        # Unsupported type or unrecognised string prefix.
        return None
    return from_wkb(payload)

In [None]:
# Corner coordinates of the area of interest; the same values appear in the
# Bergen WKT polygon used further down in the notebook.
minx, miny, maxx, maxy = 5.15, 60.25, 5.45, 60.50
# Rectangle geometry built from those four corner values.
bbox = box(minx, miny, maxx, maxy)

In [None]:
# Open a DuckDB connection (default arguments) used by every query below.
con = duckdb.connect()
# Install and load the "spatial" extension; the Bergen query below relies on
# ST_Within / ST_GeomFromText.
con.install_extension("spatial")
con.load_extension("spatial")

### Explore one partition

In [None]:
# Count the rows in the buildings parquet file and report the total.
count_sql = f"SELECT count(*) FROM '{Config.OSM_BUILDINGS_PARQUET_PATH}'"
count = con.execute(count_sql).fetchone()[0]
print(f"There are {count} entries in the Norway OSM-dataset")

In [None]:
# Preview the geometry column. The LIMIT bounds the fetch so that displaying a
# preview does not pull the geometry of every row in the file into pandas.
con.execute(
    f"SELECT geometry FROM read_parquet('{Config.OSM_BUILDINGS_PARQUET_PATH}') LIMIT 5"
).fetchdf()

### Fetch data within Bergen bounding box

In [None]:
# WKT polygon using the same corner coordinates as minx/miny/maxx/maxy above.
bergen_wkt = "POLYGON((5.15 60.25, 5.45 60.25, 5.45 60.50, 5.15 60.50, 5.15 60.25))"

# Query only features within Bergen bounding box
query = f"""
SELECT * FROM read_parquet('{Config.OSM_BUILDINGS_PARQUET_PATH}')
WHERE ST_Within(
    geometry,
    ST_GeomFromText('{bergen_wkt}')
)
"""

# NOTE(review): the filtered query is only built here; its execution is
# disabled below, so this cell does not restrict any data to Bergen.
# bergen_df = con.execute(query).fetchdf()

In [None]:
def safe_load_wkb(x):
    """Parse WKB bytes into a shapely geometry, tolerating bad input.

    Returns ``None`` when ``x`` is not ``bytes``/``bytearray`` or when
    ``wkb.loads`` raises for the given payload.
    """
    if isinstance(x, (bytes, bytearray)):
        try:
            geometry = wkb.loads(bytes(x))
        except Exception:
            # skip invalid geometries
            geometry = None
        return geometry
    return None

In [None]:
# Load a capped number of rows from the buildings parquet file.
query = f"""
SELECT *
FROM read_parquet('{Config.OSM_BUILDINGS_PARQUET_PATH}')
LIMIT 1_000_000
"""
# Execute the query. Without this step `df` is never defined and the cell
# fails with a NameError on a fresh kernel (Restart & Run All).
df = con.execute(query).fetchdf()

# Decode the raw geometry values; entries that cannot be decoded become None.
df["geometry"] = df["geometry"].apply(safe_load_wkb)

# NOTE(review): this query has no spatial filter, so despite its name
# `bergen_df` holds the first rows of the whole file, not only Bergen.
bergen_df = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

### Explore shapes and columns of the dataset

In [14]:
# Report (rows, columns) of the loaded frame.
print(f"Shape of dataframe {bergen_df.shape}")

Shape of dataframe (1000000, 1268)


In [15]:
# Show the dtype of every column.
bergen_df.dtypes

building              object
ref:bygningsnr        object
id                     int64
building:levels       object
access                object
                       ...  
portal:type           object
levels:underground    object
bbq                   object
developer             object
ref:kulturminneid     object
Length: 1268, dtype: object

#### Find columns that can be removed and remove them

In [16]:
# Alphabetically sorted names of the columns in which every value is missing.
none_columns = sorted(
    name for name in bergen_df.columns if bergen_df[name].isna().all()
)

# List them one per line.
for name in none_columns:
    print(name)

Layer_1
abandoned:landuse
abandoned:power
abandoned:shop
abandoned:tourism
access:disabled
aerialway:station
appointment
architect:wikipedia
artist:wikidata
artist:wikimedia_commons
artist:wikipedia
basin
bbq
bike_ride
blind
boat:sails
building:floor_area
building:layer
building:prefabricated
building:units
capacity:cargo_bike
capacity:note
closed
communication:Telephone
communication:amateur_radio
communication:amateur_radio:callsign
communication:amateur_radio:repeater:ctcss
communication:amateur_radio:repeater:frequency_out
communication:amateur_radio:repeater:modulation
communication:amateur_radio:repeater:shift
community
community:gender
compensator
construction:leisure
contact:tiktok
contact:tripadvisor
contact:whatsapp
cooling:method
denomination:wikidata
destroyed
developer
diet:dairy_free
diet:fruitarian
diet:healthy
diet:lacto_vegetarian
diet:local
diet:ovo_vegetarian
diet:pescetarian
diet:raw
dispensing
disused:craft
disused:man_made
disused:military
disused:religion
disused

In [17]:
# Remove the all-missing columns by rebinding instead of mutating in place.
# errors="ignore" keeps the cell re-runnable: running it a second time no
# longer raises KeyError for columns that were already removed.
bergen_df = bergen_df.drop(columns=none_columns, errors="ignore")

In [18]:
# Whitelist of columns to retain, grouped by theme. The order here is the
# column order of the resulting frame.
keep_columns = [
    # Geometry
    "geometry",
    "geom_wkb",
    "area",
    # Identifiers
    "id",
    "osm_id",
    "osmid",
    "osm_way_id",
    "osm_relation_id",
    "ref:bygningsnr",
    "bygningsnr",
    "matrikkelnummer",
    "gnr",
    "bnr",
    # Descriptive
    "building",
    "name",
    "amenity",
    "shop",
    "office",
    "tourism",
    "leisure",
    "man_made",
    "industrial",
    "historic",
    # Physical
    "height",
    "building:levels",
    "building:levels:underground",
    # Construction
    "year_built",
    "build_year",
    "construction_year",
    "start_date",
    # Optional detail (include only if present)
    "roof:shape",
    "roof:height",
    "roof:material",
    "roof:colour",
    "building:material",
    "facade_material",
    "facade_color",
]

# Keep only the whitelisted columns that exist in bergen_df, in whitelist order.
available = set(bergen_df.columns)
bergen_df = bergen_df[[name for name in keep_columns if name in available]]
bergen_df

Unnamed: 0,geometry,area,id,ref:bygningsnr,building,name,amenity,shop,office,tourism,...,building:levels:underground,year_built,build_year,construction_year,start_date,roof:shape,roof:height,roof:material,roof:colour,building:material
0,,,8265818,12247613,farm,,,,,,...,,,,,,,,,,
1,,,8292478,173892403,warehouse,,,,,,...,,,,,,,,,,
2,,,8295912,17874764,cabin,,,,,,...,,,,,,,,,,
3,,,8295920,9609636,cabin,,,,,,...,,,,,,,,,,
4,,,8302768,9346694,cabin,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,,,1595880882,,yes,,,,,,...,,,,,,,,,,
999996,,,1595880980,300330908,garages,,,,,,...,,,,,,,,,,
999997,,,1595880982,300301030,house,,,,,,...,,,,,,,,,,
999998,,,1595880984,171933854,barn,,,,,,...,,,,,,,,,,


In [19]:
# Drop columns whose share of missing values exceeds `threshold`.
threshold = 0.95
none_ratio = bergen_df.isna().mean()  # per-column fraction of missing values
high_none_columns = none_ratio[none_ratio > threshold].index.tolist()

# Rebind rather than drop in place: the frame displayed by the previous cell
# is the same object, and an in-place drop would silently alter it.
# NOTE(review): the recorded output of this cell shows only id, ref:bygningsnr,
# building and building:levels remaining, i.e. `geometry` is removed here as
# well -- confirm that losing the geometry column is intended.
bergen_df = bergen_df.drop(columns=high_none_columns)
bergen_df

Unnamed: 0,id,ref:bygningsnr,building,building:levels
0,8265818,12247613,farm,
1,8292478,173892403,warehouse,
2,8295912,17874764,cabin,
3,8295920,9609636,cabin,
4,8302768,9346694,cabin,
...,...,...,...,...
999995,1595880882,,yes,
999996,1595880980,300330908,garages,
999997,1595880982,300301030,house,
999998,1595880984,171933854,barn,


In [20]:
# Sorted array of the distinct values found in the `building` column.
types = np.unique(bergen_df["building"].to_numpy())
types

array(['Transformer_building', 'air_to_cable_transition',
       'airplane_mockup', 'airport_tower', 'airstrip_control_tower',
       'allotment_house', 'ambulance_station', 'anchor_block', 'annexe',
       'apartments', 'avalanche_protector', 'bakehouse', 'barn',
       'barracks', 'basilica', 'bell_tower', 'bf', 'bike_shed', 'boat',
       'boat_shed', 'boathouse', 'brewery', 'bridge', 'bungalow',
       'bunker', 'bus_shelter', 'cab', 'cabin', 'cafe', 'canon_house',
       'car_port', 'car_wash', 'caravan', 'carport', 'cathedral',
       'cell_base_station', 'chapel', 'chimney', 'church', 'church_tower',
       'cinema', 'civic', 'clinic', 'club_house', 'collapsed', 'college',
       'commercial', 'concrete_block', 'connector', 'conservatory',
       'construction', 'container', 'contour', 'covered_footbridge',
       'covered_walkway', 'cowshed', 'cultural', 'depot', 'detached',
       'dock', 'dome', 'dormitory', 'elevator_shaft', 'factory', 'farm',
       'farm_auxiliary', 'fire_

In [21]:
# Replace the value "yes" in the `building` column with "building".
bergen_df["building"] = bergen_df["building"].replace("yes", "building")
bergen_df

Unnamed: 0,id,ref:bygningsnr,building,building:levels
0,8265818,12247613,farm,
1,8292478,173892403,warehouse,
2,8295912,17874764,cabin,
3,8295920,9609636,cabin,
4,8302768,9346694,cabin,
...,...,...,...,...
999995,1595880882,,building,
999996,1595880980,300330908,garages,
999997,1595880982,300301030,house,
999998,1595880984,171933854,barn,


In [22]:
# Recompute the distinct `building` values after the "yes" -> "building" replacement.
types = np.unique(bergen_df["building"].to_numpy())
types

array(['Transformer_building', 'air_to_cable_transition',
       'airplane_mockup', 'airport_tower', 'airstrip_control_tower',
       'allotment_house', 'ambulance_station', 'anchor_block', 'annexe',
       'apartments', 'avalanche_protector', 'bakehouse', 'barn',
       'barracks', 'basilica', 'bell_tower', 'bf', 'bike_shed', 'boat',
       'boat_shed', 'boathouse', 'brewery', 'bridge', 'building',
       'bungalow', 'bunker', 'bus_shelter', 'cab', 'cabin', 'cafe',
       'canon_house', 'car_port', 'car_wash', 'caravan', 'carport',
       'cathedral', 'cell_base_station', 'chapel', 'chimney', 'church',
       'church_tower', 'cinema', 'civic', 'clinic', 'club_house',
       'collapsed', 'college', 'commercial', 'concrete_block',
       'connector', 'conservatory', 'construction', 'container',
       'contour', 'covered_footbridge', 'covered_walkway', 'cowshed',
       'cultural', 'depot', 'detached', 'dock', 'dome', 'dormitory',
       'elevator_shaft', 'factory', 'farm', 'farm_auxili