In [41]:
import duckdb
import numpy as np
from shapely import from_wkb

from src import Config

### Helper methods

In [42]:
def quote_col(col: str) -> str:
    """Return ``col`` wrapped in double quotes for use as a SQL identifier.

    Any double quote inside the name is doubled (``"`` -> ``""``), which is
    the SQL way of escaping it inside a quoted identifier. Without this, a
    column name containing ``"`` would end the identifier early and corrupt
    the generated query.

    Args:
        col: Raw column name.

    Returns:
        The quoted (and, where needed, escaped) identifier.
    """
    escaped = col.replace('"', '""')
    return f'"{escaped}"'


def to_geometry(value):
    """Decode a WKB payload into a geometry via ``from_wkb``.

    Supported inputs:
      * ``bytes`` / ``bytearray`` - passed to ``from_wkb`` unchanged,
      * ``list``                  - converted with ``bytes(value)`` first,
      * ``numpy.ndarray``         - its raw buffer (``value.tobytes()``) is used.

    Any other input (including ``None``) yields ``None``.
    """
    if isinstance(value, np.ndarray):
        payload = value.tobytes()
    elif isinstance(value, list):
        payload = bytes(value)
    elif isinstance(value, (bytes, bytearray)):
        payload = value
    else:
        # Unsupported type: nothing to decode.
        return None
    return from_wkb(payload)

In [43]:
# Parquet file location from the project Config (per its name, the FKB buildings
# dataset). It is interpolated into the read_parquet('...') calls of the queries
# in this notebook.
path = Config.FKB_BUILDINGS_PARQUET_PATH

### Load the spatial extension for DuckDB

In [44]:
# DuckDB connection shared by every query in this notebook.
con = duckdb.connect()

# The ST_* functions used by the queries further down come from the "spatial"
# extension: install it, then load it into this connection.
spatial_extension = "spatial"
con.install_extension(spatial_extension)
con.load_extension(spatial_extension)

# Sanity check: list the extensions that are currently loaded.
con.execute(
    "SELECT * FROM duckdb_extensions() WHERE loaded = true;"
).fetchdf()

Unnamed: 0,extension_name,loaded,installed,install_path,description,aliases,extension_version,install_mode,installed_from
0,core_functions,True,True,(BUILT-IN),Core function library,[],v1.4.0,STATICALLY_LINKED,
1,icu,True,True,(BUILT-IN),Adds support for time zones and collations usi...,[],v1.4.0,STATICALLY_LINKED,
2,json,True,True,(BUILT-IN),Adds support for JSON operations,[],v1.4.0,STATICALLY_LINKED,
3,parquet,True,True,(BUILT-IN),Adds support for reading and writing parquet f...,[],v1.4.0,STATICALLY_LINKED,
4,spatial,True,True,C:\Users\Jathavaan Shankarr\.duckdb\extensions...,Geospatial extension that adds support for wor...,[],a6a607f,REPOSITORY,core


### EDA
#### Row count

In [45]:
# COUNT(*) comes back as a single one-value row, so unpack that value directly.
row_count_sql = f"SELECT COUNT(*) AS n FROM read_parquet('{path}');"
(total_rows,) = con.execute(row_count_sql).fetchone()
print(f"Total number of rows: {total_rows}")

Total number of rows: 56111


#### Describe columns

In [46]:
# DESCRIBE returns one row per column of the Parquet file (name, type, ...).
describe_sql = f"DESCRIBE SELECT * FROM read_parquet('{path}');"
columns = con.execute(describe_sql).fetchdf()
columns

Unnamed: 0,column_name,column_type,null,key,default,extra
0,gml_id,VARCHAR,YES,,,
1,lokalId,VARCHAR,YES,,,
2,navnerom,VARCHAR,YES,,,
3,versjonId,VARCHAR,YES,,,
4,produkt,VARCHAR,YES,,,
5,versjon,VARCHAR,YES,,,
6,oppdateringsdato,VARCHAR,YES,,,
7,datafangstdato,VARCHAR,YES,,,
8,geometry,GEOMETRY,YES,,,
9,dataset,VARCHAR,YES,,,


#### Count `NULL` values per column (geometry column excluded)

In [47]:
# Columns to inspect: everything listed by DESCRIBE except the geometry
# column, which is left out to avoid a heavy scan.
cols = [name for name in columns["column_name"].tolist() if name.lower() != "geometry"]

# One NULL-counting expression per column, combined into a single query.
null_exprs = [
    f"SUM(CASE WHEN {quote_col(name)} IS NULL THEN 1 ELSE 0 END) AS {quote_col(name + '_nulls')}"
    for name in cols
]
null_count_query = "SELECT {} FROM read_parquet('{}');".format(", ".join(null_exprs), path)

# The query yields one wide row; transpose it so every column gets its own row.
null_counts = con.execute(null_count_query).fetchdf().T
null_counts.columns = ["null_count"]

# Express each NULL count as a percentage of the total number of rows.
(total_rows,) = con.execute(f"SELECT COUNT(*) FROM read_parquet('{path}');").fetchone()
null_fraction = null_counts["null_count"] / total_rows
null_counts["null_percent"] = (null_fraction * 100).round(2)
null_counts.sort_values(by="null_percent", ascending=False)

Unnamed: 0,null_count,null_percent
REGISTRERINGSKRETSNUMMER_nulls,56111.0,100.0
HUSLØPENUMMER_nulls,56111.0,100.0
KOMM_nulls,56111.0,100.0
TRE_D_NIVÅ_nulls,55808.0,99.46
bygningstype_nulls,54533.0,97.19
bygningsnummer_nulls,54533.0,97.19
kommunenummer_nulls,54533.0,97.19
BYGGSTAT_nulls,54533.0,97.19
medium_nulls,53134.0,94.69
informasjon_nulls,52763.0,94.03


#### Describe geometry feature

In [48]:
# Overview of the `geometry` column: row count, NULL count, number of distinct
# geometry types, and the overall bounding box (min/max of X and Y).
geom_stats_sql = f"""
    SELECT
        COUNT(*) AS total,
        SUM(CASE WHEN geometry IS NULL THEN 1 ELSE 0 END) AS null_geometries,
        COUNT(DISTINCT ST_GeometryType(geometry)) AS geom_types,
        MIN(ST_XMin(geometry)) AS xmin,
        MIN(ST_YMin(geometry)) AS ymin,
        MAX(ST_XMax(geometry)) AS xmax,
        MAX(ST_YMax(geometry)) AS ymax
    FROM read_parquet('{path}');
"""
geom_stats = con.execute(geom_stats_sql).fetchdf()
geom_stats

Unnamed: 0,total,null_geometries,geom_types,xmin,ymin,xmax,ymax
0,56111,0.0,2,296275.0,6699405.09,298415.86,6701763.81


#### Count number of entries for each geometry type

In [49]:
# Number of rows per geometry type, with geometries decoded from `geom_wkb`.
geometry_types_sql = f"""
SELECT ST_GeometryType(ST_GeomFromWKB(geom_wkb)) AS geom_type, COUNT(*)
FROM read_parquet('{path}')
GROUP BY geom_type
"""
geometry_types = con.execute(geometry_types_sql).fetchdf()
geometry_types

Unnamed: 0,geom_type,count_star()
0,POINT,2212
1,LINESTRING,53899


#### Compute length, height range and start/end height difference for `LINESTRING` geometries

In [56]:
# Per-LINESTRING metrics: length, lowest/highest Z, Z range, Z of the first and
# last vertex, and the Z difference between those two vertices.
#
# The WKB is decoded once in the `lines` CTE and reused as `geom`, instead of
# repeating ST_GeomFromWKB(geom_wkb) in every expression. The derived columns
# (height_range, z_diff_start_end) are computed from the already-named values
# in `measured`, so each ST_* expression is written only once. Output columns,
# their order, the filters and the sort order are unchanged.
#
# NOTE(review): the NULL filter is applied to `geometry` while all metrics are
# computed from `geom_wkb`; this is kept as in the original - confirm that the
# two columns are meant to be used together.
geometry_stats = con.execute(f"""
    WITH lines AS (
        SELECT
            gml_id,
            lokalId,
            dataset,
            ST_GeomFromWKB(geom_wkb) AS geom
        FROM read_parquet('{path}')
        WHERE geometry IS NOT NULL
    ),
    measured AS (
        SELECT
            gml_id,
            lokalId,
            dataset,
            ST_Length(geom) AS length,
            ST_ZMin(geom) AS min_height,
            ST_ZMax(geom) AS max_height,
            ST_Z(ST_PointN(geom, 1)) AS start_z,
            ST_Z(ST_PointN(geom, CAST(ST_NPoints(geom) AS INTEGER))) AS end_z
        FROM lines
        WHERE ST_GeometryType(geom) = 'LINESTRING'
          AND ST_Length(geom) > 0
    )
    SELECT
        gml_id,
        lokalId,
        dataset,
        length,
        min_height,
        max_height,
        max_height - min_height AS height_range,
        start_z,
        end_z,
        end_z - start_z AS z_diff_start_end
    FROM measured
    ORDER BY length DESC;
""").fetchdf()

geometry_stats

Unnamed: 0,gml_id,lokalId,dataset,length,min_height,max_height,height_range,start_z,end_z,z_diff_start_end
0,Fasadeliv.52624,082abe66-166f-449a-9f1e-6a7d47211072,Fasadeliv,1005.965900,1.40,5.24,3.84,1.94,1.86,-0.08
1,Fasadeliv.53280,f8000036-8630-4513-8134-4f6b6c94b5c7,Fasadeliv,818.090375,1.16,3.72,2.56,2.99,3.00,0.01
2,Fasadeliv.51798,0f2c836d-099f-4ec9-83c0-adeedd536473,Fasadeliv,730.258608,1.94,4.24,2.30,3.77,3.77,0.00
3,Fasadeliv.52870,654ae770-7b23-473c-9a04-46f93777a6d2,Fasadeliv,623.379854,7.15,9.08,1.93,7.92,7.92,0.00
4,Fasadeliv.52731,2981b9d8-d68c-4965-9a4e-da14abc56f69,Fasadeliv,531.325404,3.39,9.09,5.70,4.76,4.76,0.00
...,...,...,...,...,...,...,...,...,...,...
53893,Taksprang.49219,dd6aceb8-cbec-4467-a695-813b17dd6792,Taksprang,0.042426,35.91,35.95,0.04,35.95,35.91,-0.04
53894,Takkant.8353,425e81d1-32ea-42de-8e2a-2c1e71551dea,Takkant,0.041231,17.47,17.47,0.00,17.47,17.47,0.00
53895,TaksprangBunn.43597,1b1fc550-2ac4-483b-a1f2-9733eda59ff5,TaksprangBunn,0.036056,10.10,10.10,0.00,10.10,10.10,0.00
53896,Takkant.47268,9939a5fa-052e-43b8-af31-f74bfb74c84e,Takkant,0.028284,23.95,23.95,0.00,23.95,23.95,0.00
