In [7]:
import geopandas as gpd
import pandas as pd
import requests
import yaml
from shapely import to_wkb, from_wkb

from src import Config
from src.persistence import create_duckdb_context
from src.utils import convert_fgb_bytes_to_gdf, download_fgb_zip_file, unzip_flat_geobuf, cast_to_string

## Pre-processing of data
This is often domain-specific, and the data owner is free to include attributes as they please as long as they include the required attributes in the schema.
### Download data from Hugging Face and store it in a GeoDataFrame

In [2]:
fkb_dataframes: list[gpd.GeoDataFrame] = []
for utm_32n_path in Config.HUGGING_FACE_UTM32N_PATHS:
    zip_file = download_fgb_zip_file(path=utm_32n_path)
    fgb_file = unzip_flat_geobuf(zip_file, *Config.FKB_LAYERS)
    wgs_84_gdf = convert_fgb_bytes_to_gdf(fgb_file, crs_in=25832, crs_out=4326)

    fkb_dataframes.append(wgs_84_gdf)

for utm_33n_path in Config.HUGGING_FACE_UTM33N_PATHS:
    zip_file = download_fgb_zip_file(path=utm_33n_path)
    fgb_file = unzip_flat_geobuf(zip_file, *Config.FKB_LAYERS)
    wgs_84_gdf = convert_fgb_bytes_to_gdf(fgb_file, crs_in=25833, crs_out=4326)

    fkb_dataframes.append(wgs_84_gdf)

In [3]:
fkb_dataset = gpd.GeoDataFrame(
    data=pd.concat(fkb_dataframes, ignore_index=True),
    crs=4326
)

fkb_dataset["layer"] = ([gml_id.split(".", 1)[0] for gml_id in fkb_dataset["gml_id"].to_numpy()])
fkb_dataset["geometry"] = fkb_dataset["geometry"].apply(to_wkb)

  data=pd.concat(fkb_dataframes, ignore_index=True),
  fkb_dataset["geometry"] = fkb_dataset["geometry"].apply(to_wkb)


In [4]:
db = create_duckdb_context()
db.register("fkb_data", fkb_dataset)

<_duckdb.DuckDBPyConnection at 0x1a4eb00beb0>

### Create building polygons and rename to match convention defined in the schema

In [17]:
output_path = str(Config.OUTPUT_DATA_DIR / "fkb.parquet")
query = f'''
COPY (
    WITH polygons AS (
        WITH polygon_structs AS (
            SELECT
                UNNEST(
                    ST_Dump(ST_Polygonize([
                        ST_Union_Agg(ST_GeomFromWKB(geometry))
                    ])),
                    recursive := true
                ) AS geometry
            FROM fkb_data
            WHERE layer IN ('Takkant', 'FiktivBygningsavgrensning', 'Bygningsdelelinje') AND geometry IS NOT NULL
        )

        SELECT ST_Force2D(geom) AS geometry FROM polygon_structs
    ),

    points AS (
        SELECT
            lokalId AS external_id,
            ST_Force2D(ST_GeomFromWKB(geometry)) AS point,
            CAST(bygningsnummer AS INT) AS building_id,
            gml_id,
            CAST(bygningstype AS INT) AS building_type,
            oppdateringsdato AS feature_update_time,
            datafangstdato AS feature_capture_time,
            'building' AS theme,
            'fkb' AS dataset
        FROM fkb_data
        WHERE layer IN ('Bygning', 'AnnenBygning')
    ),

    fkb AS (
        SELECT * EXCLUDE(p.point) FROM polygons poly, points p
        WHERE ST_Contains(poly.geometry, p.point)
    )

    SELECT * FROM fkb
) TO '{output_path}'
'''

db.sql(query)