In [1]:
import duckdb
import geopandas as gpd
from shapely import box, from_wkb, wkb

from src import Config

In [2]:
def decode_geometry(x):
    """Decode one WKB-encoded value into a shapely geometry.

    Parameters
    ----------
    x : list of int, bytes, bytearray, str, or any other object
        - list / bytes / bytearray: treated as raw WKB and normalized to
          immutable ``bytes`` before decoding.
        - str starting with ``"0106"``: treated as hex-encoded WKB.
        - anything else: not decoded.

    Returns
    -------
    shapely geometry or None
        Whatever ``from_wkb`` returns for the decoded bytes, or ``None`` when
        ``x`` is not in one of the recognized forms.
    """
    # Normalize every binary container to bytes first. A bytearray must not be
    # handed to from_wkb as-is: it is a mutable sequence of ints, so array
    # coercion can treat it as many integer elements instead of one WKB blob.
    if isinstance(x, (list, bytes, bytearray)):
        return from_wkb(bytes(x))
    # NOTE(review): in hex WKB "01" is the little-endian byte-order marker and
    # "06" the low byte of the MultiPolygon type code, so hex strings for other
    # geometry types fall through to None -- confirm this is intended.
    if isinstance(x, str) and x.startswith("0106"):
        return from_wkb(bytes.fromhex(x))
    return None

In [3]:
# Corner coordinates of the area of interest and the matching shapely rectangle.
# NOTE(review): presumably longitude (x) / latitude (y) in degrees, since the
# decoded geometries are later tagged with EPSG:4326 -- confirm.
minx, miny, maxx, maxy = 5.15, 60.25, 5.45, 60.50
bbox = box(minx, miny, maxx, maxy)

In [4]:
# Open a DuckDB connection (no database path is given) and install + load the
# "spatial" extension on it; the ST_* SQL functions used in later cells
# presumably come from that extension.
con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")

### Explore one partition

In [None]:
# Run ST_Read_Meta on the first temporary partition file and display the
# result as a DataFrame.
(
    con
    .execute(f"""
SELECT *
FROM ST_Read_Meta('{Config.OSM_TEMP_PARQUET_DIR / 'part_00000.parquet'}');
""")
    .fetchdf()
)

In [None]:
# count(*) yields exactly one row with one column, so unpack it directly.
(count,) = con.execute(
    f"SELECT count(*) FROM '{Config.OSM_BUILDINGS_PARQUET_PATH}'"
).fetchone()
print(f"There are {count} entries in the Norway OSM-dataset")

In [None]:
# Fetch only the `geometry` column of the buildings file and display it.
(
    con
    .execute(f"SELECT geometry FROM read_parquet('{Config.OSM_BUILDINGS_PARQUET_PATH}')")
    .fetchdf()
)

### Fetch data within Bergen bounding box

In [None]:
# Derive the WKT from the bounding box defined earlier in the notebook rather
# than keeping a second, hand-typed copy of the same coordinates; the two can
# no longer drift apart when the area of interest is changed.
bergen_wkt = bbox.wkt

# Keep only the rows whose geometry (decoded from the `geom_wkb` column) lies
# within the bounding-box polygon.
query = f"""
SELECT * FROM read_parquet('{Config.OSM_BUILDINGS_PARQUET_PATH}')
WHERE ST_Within(
    ST_GeomFromWKB(geom_wkb),
    ST_GeomFromText('{bergen_wkt}')
)
"""

bergen_df = con.execute(query).fetchdf()

In [None]:
# Turn each raw WKB blob into a shapely geometry; any value that is not
# bytes/bytearray becomes None.
bergen_df["geometry"] = bergen_df["geom_wkb"].apply(
    lambda raw: None
    if not isinstance(raw, (bytes, bytearray))
    else wkb.loads(bytes(raw))
)

# Promote the frame to a GeoDataFrame backed by the decoded column.
bergen_df = gpd.GeoDataFrame(
    bergen_df,
    geometry="geometry",
    crs="EPSG:4326",
)

### Explore the shape and columns of the dataset

In [None]:
# Report the (rows, columns) tuple of the Bergen subset.
print(f"Shape of dataframe {bergen_df.shape}")

In [None]:
# Display the dtype of every column.
bergen_df.dtypes

#### Find columns that can be removed and remove them

In [None]:
# Alphabetical list of the columns in which every single value is missing.
none_columns = sorted(
    name for name in bergen_df.columns if bergen_df[name].isna().all()
)

# One column name per line.
for col in none_columns:
    print(col)

In [None]:
# Remove the all-missing columns by rebinding instead of mutating in place.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
bergen_df = bergen_df.drop(columns=none_columns, errors="ignore")

In [None]:
# Candidate columns worth keeping, grouped by theme. Names that do not occur in
# the frame are simply skipped by the selection below.
keep_columns = [
    # --- geometry ---
    "geometry",
    "geom_wkb",
    "area",
    # --- identifiers ---
    "id",
    "osm_id",
    "osmid",
    "osm_way_id",
    "osm_relation_id",
    "ref:bygningsnr",
    "bygningsnr",
    "matrikkelnummer",
    "gnr",
    "bnr",
    # --- descriptive tags ---
    "building",
    "name",
    "amenity",
    "shop",
    "office",
    "tourism",
    "leisure",
    "man_made",
    "industrial",
    "historic",
    # --- physical dimensions ---
    "height",
    "building:levels",
    "building:levels:underground",
    # --- construction date ---
    "year_built",
    "build_year",
    "construction_year",
    "start_date",
    # --- optional detail (kept only if present) ---
    "roof:shape",
    "roof:height",
    "roof:material",
    "roof:colour",
    "building:material",
    "facade_material",
    "facade_color",
]

# Select the candidates that exist in bergen_df, in the order listed above.
bergen_df = bergen_df[
    [name for name in keep_columns if name in bergen_df.columns]
]
bergen_df

In [None]:
# Drop every column whose share of missing values exceeds `threshold`.
threshold = 0.99
none_ratio = bergen_df.isna().mean()  # per-column fraction of missing values
high_none_columns = none_ratio[none_ratio > threshold].index.tolist()
# Rebind instead of drop(inplace=True): bergen_df was produced by a column
# selection in the preceding cell, and mutating such a result in place can
# emit SettingWithCopyWarning.
bergen_df = bergen_df.drop(columns=high_none_columns)
bergen_df