### CBS data for the whole of NL
- Need to:
    - Keep only the necessary attributes (columns)
    - Find a way to filter for the city data (extents?) from CityPy

In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
import folium
from pathlib import Path

In [4]:
# Lift pandas' display limits (columns, rows, width) by setting each to None,
# so frames are shown without truncation.
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, None)

In [5]:
def load_cbs(cbs_data):
    """Read the CBS dataset at *cbs_data* with ``gpd.read_file`` and return it."""
    return gpd.read_file(cbs_data)

# Path to the CBS GeoPackage used by the cells below.
data1 = "../data/cbs/cbs_vk500_2023_v1.gpkg"

In [None]:
# Check which layers the gpkg contains.
layers = gpd.list_layers(data1)

print(f"There is {len(layers)} layer(s) in the gpkg file.")

# Keep the table as the final expression of the cell: a notebook only renders
# the last expression, so a bare `layers` placed before the print is not shown.
layers

In [7]:
# Load the CBS GeoPackage at `data1` into `cbs_gdf` via load_cbs.
cbs_gdf = load_cbs(data1)

In [None]:
# Check the basics: report the table dimensions.
# The print covers both values, so no separate bare `cbs_gdf.shape` expression
# is kept (mid-cell expressions are not displayed by the notebook anyway).
n_rows, n_cols = cbs_gdf.shape
print(f"This cbs file has {n_rows} rows and {n_cols} columns.")

In [None]:
# Report the column labels. A bare `cbs_gdf.columns` expression ahead of the
# print would not be displayed (only a cell's last expression is rendered),
# so the print alone carries the information.
print(f"This cbs file has the following columns: {cbs_gdf.columns}")

In [None]:
# Preview the first rows of cbs_gdf.
cbs_gdf.head()

In [None]:
# Inspect the dtype of every column in cbs_gdf.
cbs_gdf.dtypes

In [None]:
# Show the descriptive statistics that describe() reports for cbs_gdf.
cbs_gdf.describe()

In [None]:
# Count the null values in each column of cbs_gdf.
cbs_gdf.isnull().sum()

In [None]:
# Report every column of cbs_gdf that contains null values.
# The per-column counts are computed once for the whole frame instead of
# re-running isnull().sum() for each column inside the loop.
null_counts = cbs_gdf.isnull().sum()
for col, null_count in null_counts[null_counts > 0].items():
    print(f"Column {col} has {null_count} null values.")

# TODO: Can't make a quick decision to remove all rows with null values.
# The cells would mostly be in inhabited areas. Need to check first. But how?
# (Kept as comments: a bare string literal as the last expression of a cell
# is echoed as the cell's output.)

In [None]:
"""TODO:
- load the .csv
- check which data overlaps with the gpkg (GeoDataFrame) data
- create a data inventory of what to use and what not to use
"""

In [None]:
# Drop sparse rows: a row is kept only when it has at least (num_cols - 5)
# non-null values.
min_non_null = cbs_gdf.shape[1] - 5
cbs_clean = cbs_gdf.dropna(thresh=min_non_null)

# Export to CSV, storing each geometry as WKT text in a `wkt` column
# (missing geometries stay None) in place of the `geometry` column.
out_csv = "../data/cbs/cbs_gdf_clean.csv"
Path(out_csv).parent.mkdir(parents=True, exist_ok=True)

wkt_column = cbs_clean.geometry.apply(lambda g: g.wkt if g is not None else None)
csv_frame = cbs_clean.assign(wkt=wkt_column).drop(columns="geometry")
csv_frame.to_csv(out_csv, index=False)

print(f"Saved: {out_csv} (rows kept: {len(cbs_clean)} of {len(cbs_gdf)})")
cbs_clean.shape[0]


In [None]:
# Number of rows in cbs_clean.
cbs_clean.shape[0]

In [None]:
# Read the HH_500m_grid_2023 CSV into a plain pandas DataFrame.
csv_2023 = pd.read_csv("../data/cbs/HH_500m_grid_2023.csv")



In [None]:
# Summarise the CSV's dimensions and column names, then preview its first rows.
csv_rows, csv_cols = csv_2023.shape
csv_column_names = csv_2023.columns.tolist()

print(f"This csv file has {csv_rows} rows and {csv_cols} columns.")
print(f"This csv file has {len(csv_column_names)} following columns: {csv_column_names}")
csv_2023.head()

In [None]:
# Inspect the dtype of every column in csv_2023.
csv_2023.dtypes

In [None]:
# Show the descriptive statistics that describe() reports for csv_2023.
csv_2023.describe()

In [None]:
# Count the null values in each column of csv_2023.
csv_2023.isnull().sum()

In [None]:
# List the column names of cbs_clean.
cbs_clean.columns.tolist()

In [None]:
from shapely import wkt
from shapely.geometry.base import BaseGeometry
import geopandas as gpd

def to_geom(val):
    """Coerce *val* to a geometry object.

    A ``BaseGeometry`` instance is returned unchanged, a string is parsed
    with ``wkt.loads``, and any other value yields ``None``.
    """
    if isinstance(val, BaseGeometry):
        return val
    return wkt.loads(val) if isinstance(val, str) else None

# Normalise the geometry column: geometry objects pass through, WKT strings
# are parsed, anything else becomes None (see to_geom).
cbs_clean["geometry"] = cbs_clean["geometry"].apply(to_geom)

left_key = "VRLVIERKANT500M"  # join key column in csv_2023
right_key = "crs28992res500m"  # join key column in cbs_clean

# Two-column lookup (key -> geometry), with the key renamed so that both
# frames share the same column name for the merge.
geometry_lookup = cbs_clean[[right_key, "geometry"]].rename(columns={right_key: left_key})

# Left join: every csv_2023 row is kept; rows whose key has no counterpart in
# cbs_clean get a missing geometry.
merged = pd.merge(csv_2023, geometry_lookup, on=left_key, how="left")

# Surface unmatched rows instead of writing null geometries silently.
n_missing_geom = int(merged["geometry"].isna().sum())
print(f"Rows without a matching geometry: {n_missing_geom} of {len(merged)}")

# NOTE(review): the CRS is hard-coded; presumably it equals the CRS of the
# source GeoPackage (cbs_gdf.crs) -- confirm.
gdf = gpd.GeoDataFrame(merged, geometry="geometry", crs="EPSG:28992")
gdf.to_file("../data/cbs/cbs_gdf_clean.gpkg", driver="GPKG")

In [None]:
# Peek at the first 20 column names of cbs_clean in sorted order.
print(sorted(cbs_clean.columns)[:20])  # peek

# Column names containing "vierkant", "500" or "grid" (case-insensitive).
[
    name
    for name in cbs_clean.columns
    if any(token in name.lower() for token in ("vierkant", "500", "grid"))
]

In [None]:
# Read ../data/cbs/cbs_gdf_clean.gpkg back in to inspect what was written.
gpd.read_file("../data/cbs/cbs_gdf_clean.gpkg")