In [1]:
import pandas as pd
from shapely import to_wkb
import pyarrow as pa

from src import Config
from src.persistence import create_duckdb_context, create_blob_storage_context
from src.utils import download_osm_pbf_file, make_unique_columns, upload_parquet_to_container
from src.utils.osm_utils import process_osm

In [2]:
# Presumably downloads the OSM .pbf extract that the later cells process.
# NOTE(review): the source URL and destination path are decided inside the
# project helper, not in this notebook — confirm there.
download_osm_pbf_file()

In [3]:
# Run the project's OSM processing helper and keep its result. The next cell
# iterates over `batches` and calls `.reset_index(...)` on every item, so each
# item is expected to be DataFrame-like.
batches = process_osm()



In [4]:
# Normalise every batch: drop its existing index and pass it through the
# project's `make_unique_columns` helper (presumably de-duplicates column
# names — confirm in src.utils).
batches = [make_unique_columns(batch.reset_index(drop=True)) for batch in batches]

# Stack all batches into one frame. Every batch was just reset to a 0..n-1
# index, so without `ignore_index=True` the combined frame would carry repeated
# index labels; ask pandas for a fresh, unique 0..N-1 index instead.
df = pd.concat(batches, ignore_index=True)

In [5]:
# Replace the geometry objects with their WKB (bytes) encoding; presumably this
# is what lets the column be handed to pyarrow in the next cell — confirm.
# `shapely.to_wkb` accepts whole arrays, so the column is converted in a single
# vectorised call instead of once per row through `.apply`.
# NOTE(review): this overwrites the column in place, so the cell is not
# idempotent — re-run the cell that builds `df` before re-running this one.
df["geometry"] = to_wkb(df["geometry"].to_numpy())

  df["geometry"] = df["geometry"].apply(to_wkb)


In [6]:
# Arrow copy of the combined frame; preserve_index=True also keeps the pandas
# index as a column of the table.
table = pa.Table.from_pandas(df, preserve_index=True)

# Destination of the exported file, under the configured output directory.
output_path = Config.OUTPUT_DATA_DIR / "osm.parquet"

In [7]:
# Connection obtained from the project's DuckDB factory.
db = create_duckdb_context()

# Expose the Arrow table to SQL under the name "osm".
db.register("osm", table)

# Select the exported columns, rename several of them, and let COPY write the
# result to `output_path`.
export_query = f'''
    COPY (
        SELECT
            id AS external_id,
            geometry,
            name,
            "ref:bygningsnr" AS building_id,
            height,
            type AS building_type,
            "building:levels" AS building_levels,
            "roof:shape" AS roof_shape,
            "roof:levels" AS roof_levels
        FROM osm
    ) TO '{output_path}'
    '''
db.execute(export_query)

<_duckdb.DuckDBPyConnection at 0x24f091b4eb0>

In [8]:
# Client obtained from the project's blob-storage factory.
storage_client = create_blob_storage_context()

# Hand the exported file to the project's upload helper, targeting the blob
# "osm.parquet" in the "contributions" container.
upload_parquet_to_container(
    container_name="contributions",
    blob_name="osm.parquet",
    parquet_path=output_path,
    blob_service_client=storage_client,
)