# Clean Buildings Data
This script cleans the buildings data, which was downloaded from https://smart-zwolle.opendata.arcgis.com/datasets/digitale-tweelingstad-zwolle-adressen-2d/about.

The column names are as follows:
['IDENTIFICA', 'ADRES', 'POSTCODE', 'STATUS', 'GEBRUIKSDO', 'OPPERVLAKT', 'PAND_IDENT', 'BOUWJAAR', 'ACTIVITEIT', 'WOZ_ONDERD', 'BUURTNAAM', 'WIJKNAAM', 'GEMEENTE', 'VEILIGHEID', 'WATERSCHAP', 'NETBEHEERD', 'DRINKWATER', 'NAAM', 'MONUMENT', 'LRK', 'KVK', 'LEEFBAARHE', 'GEBOUWTYPE', 'GEBOUWSUBT', 'ENERGIELAB', 'BEREKENING', 'AANDEELHER', 'ENERGIEBEH', 'GRONDHOOGT', 'HOOGTE', 'BOUWLAAG_G', 'BOUWLAAG', 'GASVERBRUI', 'ELECTRAVER', 'DAKVORM', 'DAKVORM_LA', 'ZONNEPANEL', 'BEREIKBAAR', 'BEREIKBA_1', 'BVB_LI', 'BVB_VI', 'BVB_FI', 'BVB_SI', 'IS_BIJEENK', 'IS_GEZONDH', 'IS_INDUSTR', 'IS_KANTOOR', 'IS_LOGIES', 'IS_ONDERWI', 'IS_SPORT', 'IS_WINKEL', 'IS_WOON', 'IS_KAS', 'geometry']

Processing and output:

The downloaded dataset was saved as a shapefile; the script below reads it, cleans it, and writes the cleaned result as a new shapefile.

## Before running:
1. Update directories and file names as necessary under CONFIGURATION

In [6]:
import geopandas as gpd
import pandas as pd
import os
import re

# -------------------------------
# CONFIGURATION
# -------------------------------
# Folder holding the downloaded shapefile and folder receiving the cleaned copy.
RAW_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../clean_data/"

# Create the output folder up front so the final write cannot fail on a missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Input shapefile and the cleaned shapefile written at the end of this cell.
shapefile_path = os.path.join(RAW_DATA_DIR, "Digitale_Tweelingstad_Zwolle_Adressen_2D.shp")
output_shapefile_path = os.path.join(OUTPUT_DIR, "Digitale_Tweelingstad_Zwolle_Adressen_2D_cleaned.shp")

# Aliases to unify inconsistent column names: every spelling listed on the
# right is mapped to the canonical column name on the left.
COLUMN_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("buurt_naam", ("buurtname", "buurtnaam", "buurt")),
        ("wijk_naam", ("wijknaam", "wijk")),
    )
    for alias in aliases
}

# -------------------------------
# FUNCTIONS
# -------------------------------
def normalize_name(name):
    """Return a canonical spelling of a neighborhood/district name.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Every other value is converted to ``str``, trimmed,
    lower-cased, has runs of whitespace collapsed to a single space and has
    all periods removed.
    """
    if not pd.isna(name):
        lowered = str(name).strip().lower()
        single_spaced = re.sub(r"\s+", " ", lowered)
        return single_spaced.replace(".", "")
    return name

def clean_column_names(df):
    """Standardize the column labels of ``df`` and report what changed.

    The labels are replaced on ``df`` itself (the frame is modified in
    place) and the same object is returned. Each renamed column is printed
    as an ``old ➜ new`` pair.
    """
    def _standardize(label):
        # Drop bracketed "[...]" and parenthesised "(...)" fragments.
        without_groups = re.sub(r"\[.*?\]|\(.*?\)", "", label)
        lowered = without_groups.strip().lower()
        # Keep only word characters and whitespace, then turn whitespace
        # into single underscores with none left at either end.
        word_chars_only = re.sub(r"[^\w\s]", "", lowered)
        underscored = re.sub(r"\s+", "_", word_chars_only)
        tidy = re.sub(r"_+", "_", underscored).strip("_")
        # Map known alternative spellings onto the canonical column name.
        return COLUMN_ALIASES.get(tidy, tidy)

    before = df.columns.tolist()
    df.columns = list(map(_standardize, before))

    print("🧾 Standardized column names:")
    renamed_pairs = [(old, new) for old, new in zip(before, df.columns) if old != new]
    for old, new in renamed_pairs:
        print(f"  - '{old}' ➜ '{new}'")
    return df

def strip_whitespace(df):
    """Return a copy of ``df`` with surrounding whitespace removed from every string cell.

    Non-string cells are passed through unchanged.
    """
    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use the new name when it exists and fall back to
    # applymap on older pandas versions that do not provide it.
    elementwise = getattr(df, "map", None)
    if elementwise is None:
        elementwise = df.applymap
    return elementwise(_strip)

def clean_data(df):
    """Return ``df`` with string cells stripped, empty rows and duplicate rows removed.

    Rows are dropped only when every column is missing (``how='all'``);
    duplicates are dropped after the whitespace stripping, so rows that
    differ only by surrounding whitespace collapse into one.
    """
    stripped = strip_whitespace(df)
    without_empty_rows = stripped.dropna(how='all')
    return without_empty_rows.drop_duplicates()

# -------------------------------
# PROCESSING SHAPEFILE
# -------------------------------
print(f"\n🔄 Processing shapefile: {shapefile_path}")
try:
    # Load the raw shapefile and standardize its column labels first, so the
    # checks below can rely on the canonical 'buurt_naam' / 'wijk_naam' names.
    gdf = gpd.read_file(shapefile_path)
    gdf = clean_column_names(gdf)

    # Normalize buurt_naam and wijk_naam (warn instead of failing when absent)
    if "buurt_naam" not in gdf.columns:
        print("⚠️ 'buurt_naam' column not found.")
    else:
        gdf["buurt_naam"] = gdf["buurt_naam"].map(normalize_name)

    if "wijk_naam" not in gdf.columns:
        print("⚠️ 'wijk_naam' column not found.")
    else:
        gdf["wijk_naam"] = gdf["wijk_naam"].map(normalize_name)

    # Strip string cells, then drop all-empty and duplicate rows.
    gdf = clean_data(gdf)

    print(f"✅ Total rows after cleaning: {len(gdf)}")

    # Summaries per neighborhood and per district, when those columns exist.
    if "buurt_naam" in gdf.columns:
        print(f"📌 Unique neighborhoods: {gdf['buurt_naam'].nunique()}")
        print(gdf['buurt_naam'].value_counts())

    if "wijk_naam" in gdf.columns:
        print(f"📍 Unique districts: {gdf['wijk_naam'].nunique()}")
        print(gdf['wijk_naam'].value_counts())

    # Persist the cleaned data for the next notebook step.
    gdf.to_file(output_shapefile_path)
    print(f"📁 Saved cleaned shapefile to: {output_shapefile_path}")
except Exception as e:
    # Any failure is reported as a message instead of a traceback.
    print(f"❌ Failed to process shapefile: {e}")



🔄 Processing shapefile: ../raw_data/Digitale_Tweelingstad_Zwolle_Adressen_2D.shp
🧾 Standardized column names:
  - 'IDENTIFICA' ➜ 'identifica'
  - 'ADRES' ➜ 'adres'
  - 'POSTCODE' ➜ 'postcode'
  - 'STATUS' ➜ 'status'
  - 'GEBRUIKSDO' ➜ 'gebruiksdo'
  - 'OPPERVLAKT' ➜ 'oppervlakt'
  - 'PAND_IDENT' ➜ 'pand_ident'
  - 'BOUWJAAR' ➜ 'bouwjaar'
  - 'ACTIVITEIT' ➜ 'activiteit'
  - 'WOZ_ONDERD' ➜ 'woz_onderd'
  - 'BUURTNAAM' ➜ 'buurt_naam'
  - 'WIJKNAAM' ➜ 'wijk_naam'
  - 'GEMEENTE' ➜ 'gemeente'
  - 'VEILIGHEID' ➜ 'veiligheid'
  - 'WATERSCHAP' ➜ 'waterschap'
  - 'NETBEHEERD' ➜ 'netbeheerd'
  - 'DRINKWATER' ➜ 'drinkwater'
  - 'NAAM' ➜ 'naam'
  - 'MONUMENT' ➜ 'monument'
  - 'LRK' ➜ 'lrk'
  - 'KVK' ➜ 'kvk'
  - 'LEEFBAARHE' ➜ 'leefbaarhe'
  - 'GEBOUWTYPE' ➜ 'gebouwtype'
  - 'GEBOUWSUBT' ➜ 'gebouwsubt'
  - 'ENERGIELAB' ➜ 'energielab'
  - 'BEREKENING' ➜ 'berekening'
  - 'AANDEELHER' ➜ 'aandeelher'
  - 'ENERGIEBEH' ➜ 'energiebeh'
  - 'GRONDHOOGT' ➜ 'grondhoogt'
  - 'HOOGTE' ➜ 'hoogte'
  - 'BOUWLAAG_G

  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


✅ Total rows after cleaning: 71888
📌 Unique neighborhoods: 78
buurt_naam
milligen                             2977
frankhuis                            2411
oud-assendorp                        2361
aa-landen-midden                     2355
hogenkamp                            2318
                                     ... 
langenholte                            56
bedrijventerrein voorst-d              30
stadsbroek                             28
bedrijventerrein marslanden-noord      24
mastenbroek                            23
Name: count, Length: 78, dtype: int64
📍 Unique districts: 16
wijk_naam
stadshagen               11924
assendorp                 7771
diezerpoort               7600
aa-landen                 6893
ittersum                  6739
schelle                   6328
holtenbroek               6091
binnenstad                3961
wipstrik                  3449
berkum                    2997
westenholte               2600
kamperpoort-veerallee     2263
marsweteringlanden    

## Minimize Data

This process selects columns that will be used for future modeling.

In [23]:
import geopandas as gpd
import os

# -------------------------------
# CONFIGURATION
# -------------------------------
# Folder with the cleaned shapefile and folder receiving the reduced copy.
CLEAN_DATA_DIR = "../clean_data/"
OUTPUT_DIR = "../minimized_data/"

# Make sure the destination folder exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

shapefile_path = os.path.join(CLEAN_DATA_DIR, "Digitale_Tweelingstad_Zwolle_Adressen_2D_cleaned.shp")
output_shapefile_path = os.path.join(OUTPUT_DIR, "Digitale_Tweelingstad_Zwolle_Selected_minimized.shp")

# Variables to select (the only columns kept in the minimized shapefile)
selected_columns = [
    "status",
    "bouwjaar",
    "buurt_naam",
    "wijk_naam",
    "energielab",
    "geometry",
    "adres",
]

# -------------------------------
# PROCESSING SHAPEFILE
# -------------------------------
print(f"\n🔄 Processing shapefile: {shapefile_path}")
try:
    gdf = gpd.read_file(shapefile_path)

    # Refuse to continue when any requested column is absent from the file.
    missing_columns = set(selected_columns).difference(gdf.columns)
    if missing_columns:
        raise ValueError(f"Missing columns in the shapefile: {missing_columns}")

    # Keep only the requested columns and write them to the new shapefile.
    gdf_selected = gdf[selected_columns]
    gdf_selected.to_file(output_shapefile_path)
    print(f"✅ Selected columns saved to: {output_shapefile_path}")

except Exception as e:
    # Any failure (including the missing-column error above) is reported as
    # a message instead of a traceback.
    print(f"❌ Failed to process shapefile: {e}")



🔄 Processing shapefile: ../clean_data/Digitale_Tweelingstad_Zwolle_Adressen_2D_cleaned.shp
✅ Selected columns saved to: ../minimized_data/Digitale_Tweelingstad_Zwolle_Selected_minimized.shp


## Aggregation based on neighbourhood

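The following code aggregates the values of selected columns per neighborhood. It reads the minimized shapefile Digitale_Tweelingstad_Zwolle_Selected_minimized.shp, which was derived from Digitale_Tweelingstad_Zwolle_Adressen_2D_cleaned.shp in the previous step.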

In [26]:
import geopandas as gpd
import pandas as pd
import os

# Configuration paths
MINIMIZED_DATA_DIR = "../minimized_data/"
OUTPUT_DIR = "../aggregated_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load minimized shapefile
gdf = gpd.read_file(os.path.join(MINIMIZED_DATA_DIR, "Digitale_Tweelingstad_Zwolle_Selected_minimized.shp"))

# Drop rows with missing crucial data
gdf = gdf.dropna(subset=["bouwjaar", "energielab", "status", "buurt_naam", "geometry"])

# Create building age categories.
# The last label must be exactly "2011-2024": the aggregation further down
# counts rows whose category equals "2011-2024" and stores them in the
# "2011_2024" column. The previous "2011-2025" label never matched that
# comparison, so the column was always 0. The upper edge is 2024 so the label
# describes the bin truthfully and matches the district aggregation.
# With right=True every bin includes its upper edge (1945 falls in "Pre-1945").
# NOTE(review): bouwjaar values outside (0, 2024] get no category; if newer
# buildings must be counted, extend bins, labels and the aggregation column
# together.
bins = [0, 1945, 1960, 1980, 2000, 2010, 2024]
labels = ["Pre-1945", "1945-1960", "1961-1980", "1981-2000", "2001-2010", "2011-2024"]
gdf["bouwjaar_cat"] = pd.cut(gdf["bouwjaar"], bins=bins, labels=labels, right=True)

# Simplify occupancy status: any status text containing "in gebruik"
# (case-insensitive) counts as occupied, everything else as unoccupied.
gdf["occupied"] = gdf["status"].apply(lambda x: "occupied" if "in gebruik" in x.lower() else "unoccupied")

# Categorize energy labels
def categorize_energy_label(label):
    """Map an energy label to a coarse efficiency class.

    Returns 'high_efficiency' for every A-class label ('A' followed by any
    number of '+', so 'A+++' and 'A++++' are included rather than falling
    through to 'unknown'), 'moderate_efficiency' for B/C/D,
    'low_efficiency' for E/F/G and 'unknown' for anything else, including
    non-string values. Surrounding whitespace and letter case are ignored.
    """
    if not isinstance(label, str):
        return 'unknown'

    normalized = label.strip().upper()

    # Strip the plus signs only for the A test; "B+" etc. are not valid
    # labels and must stay 'unknown'.
    if normalized.rstrip('+') == 'A':
        return 'high_efficiency'
    if normalized in ('B', 'C', 'D'):
        return 'moderate_efficiency'
    if normalized in ('E', 'F', 'G'):
        return 'low_efficiency'
    return 'unknown'

gdf["energy_category"] = gdf["energielab"].apply(categorize_energy_label)

# Aggregate data at neighborhood level.
# Only the three derived columns the summary needs are handed to apply().
# Selecting them explicitly keeps the grouping column out of the sub-frames,
# which avoids the pandas deprecation warning about apply() operating on
# grouping columns and works on older pandas versions as well.
# NOTE(review): each bouwjaar_cat comparison string must match a label passed
# to pd.cut above, otherwise that column silently counts 0.
agg_df = gdf.groupby("buurt_naam")[["occupied", "bouwjaar_cat", "energy_category"]].apply(
    lambda group: pd.Series({
        "total_buildings": len(group),
        "occupied_pct": (group["occupied"] == "occupied").mean() * 100,
        "unoccupied_pct": (group["occupied"] == "unoccupied").mean() * 100,
        "pre_1945": (group["bouwjaar_cat"] == "Pre-1945").sum(),
        "1945_1960": (group["bouwjaar_cat"] == "1945-1960").sum(),
        "1961_1980": (group["bouwjaar_cat"] == "1961-1980").sum(),
        "1981_2000": (group["bouwjaar_cat"] == "1981-2000").sum(),
        "2001_2010": (group["bouwjaar_cat"] == "2001-2010").sum(),
        "2011_2024": (group["bouwjaar_cat"] == "2011-2024").sum(),
        "high_efficiency": (group["energy_category"] == "high_efficiency").sum(),
        "moderate_efficiency": (group["energy_category"] == "moderate_efficiency").sum(),
        "low_efficiency": (group["energy_category"] == "low_efficiency").sum(),
        "unknown_efficiency": (group["energy_category"] == "unknown").sum(),
    })
).reset_index()

# Merge aggregated data with geometry: dissolve combines the geometries of all
# rows sharing a buurt_naam, and the statistics are joined on that name.
geometry_df = gdf.dissolve(by="buurt_naam").geometry.reset_index()
final_gdf = geometry_df.merge(agg_df, on="buurt_naam")

# Save aggregated data to shapefile.
# NOTE(review): the ESRI Shapefile format limits field names to 10 characters,
# so longer keys such as "moderate_efficiency" are presumably truncated on
# write (the likely source of the ogr_write warnings) — confirm.
output_path = os.path.join(OUTPUT_DIR, "Zwolle_Neighborhood_Aggregated_Buildings.shp")
final_gdf.to_file(output_path)

print(f"✅ Aggregated shapefile saved successfully to: {output_path}")


✅ Aggregated shapefile saved successfully to: ../aggregated_data/Zwolle_Neighborhood_Aggregated_Buildings.shp


  agg_df = gdf.groupby("buurt_naam").apply(lambda df: pd.Series({
  final_gdf.to_file(output_path)
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


## Aggregation based on district

The following code aggregates the values of selected columns per district. It reads the minimized shapefile Digitale_Tweelingstad_Zwolle_Selected_minimized.shp, which was derived from Digitale_Tweelingstad_Zwolle_Adressen_2D_cleaned.shp in an earlier step.

In [27]:
import geopandas as gpd
import pandas as pd
import os

# Configuration paths: the minimized input folder and the aggregated output folder
MINIMIZED_DATA_DIR = "../minimized_data/"
OUTPUT_DIR = "../aggregated_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load minimized shapefile
gdf = gpd.read_file(
    os.path.join(MINIMIZED_DATA_DIR, "Digitale_Tweelingstad_Zwolle_Selected_minimized.shp")
)

# Drop rows with missing crucial data (any of these columns being empty
# removes the row from the district statistics)
gdf = gdf.dropna(
    subset=["bouwjaar", "energielab", "status", "wijk_naam", "geometry"]
)

# Create building age categories. With right=True every bin includes its
# upper edge, so a bouwjaar of 1945 is labelled "Pre-1945".
bins = [0, 1945, 1960, 1980, 2000, 2010, 2024]
labels = ["Pre-1945", "1945-1960", "1961-1980", "1981-2000", "2001-2010", "2011-2024"]
gdf["bouwjaar_cat"] = pd.cut(gdf["bouwjaar"], bins=bins, labels=labels, right=True)

# Simplify occupancy status: a status text containing "in gebruik"
# (case-insensitive) counts as occupied, everything else as unoccupied.
gdf["occupied"] = gdf["status"].map(
    lambda status: "unoccupied" if "in gebruik" not in status.lower() else "occupied"
)

# Categorize energy labels
def categorize_energy_label(label):
    """Map an energy label to a coarse efficiency class.

    Returns 'high_efficiency' for every A-class label ('A' followed by any
    number of '+', so 'A+++' and 'A++++' are included rather than falling
    through to 'unknown'), 'moderate_efficiency' for B/C/D,
    'low_efficiency' for E/F/G and 'unknown' for anything else, including
    non-string values. Surrounding whitespace and letter case are ignored.
    """
    if not isinstance(label, str):
        return 'unknown'

    normalized = label.strip().upper()

    # Strip the plus signs only for the A test; "B+" etc. are not valid
    # labels and must stay 'unknown'.
    if normalized.rstrip('+') == 'A':
        return 'high_efficiency'
    if normalized in ('B', 'C', 'D'):
        return 'moderate_efficiency'
    if normalized in ('E', 'F', 'G'):
        return 'low_efficiency'
    return 'unknown'

gdf["energy_category"] = gdf["energielab"].apply(categorize_energy_label)

# Aggregate data at district level.
# Only the three derived columns the summary needs are handed to apply().
# Selecting them explicitly keeps the grouping column out of the sub-frames,
# which avoids the pandas deprecation warning about apply() operating on
# grouping columns and works on older pandas versions as well.
# NOTE(review): each bouwjaar_cat comparison string must match a label passed
# to pd.cut above, otherwise that column silently counts 0.
agg_df = gdf.groupby("wijk_naam")[["occupied", "bouwjaar_cat", "energy_category"]].apply(
    lambda group: pd.Series({
        "total_buildings": len(group),
        "occupied_pct": (group["occupied"] == "occupied").mean() * 100,
        "unoccupied_pct": (group["occupied"] == "unoccupied").mean() * 100,
        "pre_1945": (group["bouwjaar_cat"] == "Pre-1945").sum(),
        "1945_1960": (group["bouwjaar_cat"] == "1945-1960").sum(),
        "1961_1980": (group["bouwjaar_cat"] == "1961-1980").sum(),
        "1981_2000": (group["bouwjaar_cat"] == "1981-2000").sum(),
        "2001_2010": (group["bouwjaar_cat"] == "2001-2010").sum(),
        "2011_2024": (group["bouwjaar_cat"] == "2011-2024").sum(),
        "high_efficiency": (group["energy_category"] == "high_efficiency").sum(),
        "moderate_efficiency": (group["energy_category"] == "moderate_efficiency").sum(),
        "low_efficiency": (group["energy_category"] == "low_efficiency").sum(),
        "unknown_efficiency": (group["energy_category"] == "unknown").sum(),
    })
).reset_index()

# Merge aggregated data with geometry: dissolve combines the geometries of all
# rows sharing a wijk_naam, and the statistics are joined on that name.
geometry_df = gdf.dissolve(by="wijk_naam").geometry.reset_index()
final_gdf = geometry_df.merge(agg_df, on="wijk_naam")

# Save aggregated data to shapefile.
# NOTE(review): the ESRI Shapefile format limits field names to 10 characters,
# so longer keys such as "moderate_efficiency" are presumably truncated on
# write (the likely source of the ogr_write warnings) — confirm.
output_path = os.path.join(OUTPUT_DIR, "Zwolle_District_Aggregated_Buildings.shp")
final_gdf.to_file(output_path)

print(f"✅ Aggregated shapefile saved successfully to: {output_path}")


✅ Aggregated shapefile saved successfully to: ../aggregated_data/Zwolle_District_Aggregated_Buildings.shp


  agg_df = gdf.groupby("wijk_naam").apply(lambda df: pd.Series({
  final_gdf.to_file(output_path)
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
