# Clean Energy Consumption Data
This script cleans Cijfers data downloaded from cijfersoverzwolle.nl

About the raw data:

The Cijfers data was originally download as XLSX files. Upon initial inspection, it was noted that the files contained extra informational columns and rows that needed to be deleted. The information included a header. The first column was given a header of Buurt_name and is listed with the following original columns:
* Buurt_naam

* Totaal aardgasverbruik woningen (temperatuurgecorrigeerd) [TJ]

* Aardgasverbruik woningen [miljoen m3]

* Elektriciteitsverbruik woningen [GWh (miljoen kWh)]

* Aardgasverbruik woningen (temperatuurgecorrigeerd) [miljoen m3]

* CO2-uitstoot Woningen (elektriciteit) [kton]

* CO2-uitstoot Woningen (aardgas, temperatuurgecorrigeerd) [kton]

* CO2-uitstoot Woningen (aardgas) [kton]

* Consumentenprijs gas (variabele kosten incl. energiebelasting en BTW) [€]

* Consumentenprijs elektriciteit (variabele kosten incl. energiebelasting en BTW) [€]

* Aantal woningen met verwarming op basis van aardgas (benadering) [aantal]

* Aantal woningen met elektrische verwarming (benadering) [aantal]

* Aantal geregistreerde installaties met zonnepanelen woningen [aantal]

* Vermogen geregistreerde zonnepanelen woningen [kW]

* Vermogen geregistreerde zonnepanelen per woning [Wattpiek]

* Wind op land fysiek opgesteld vermogen [MW]

* Percentage woningen met geregistreerde zonnepanelen [%]

* Huishoudens [aantal]

* Bevolking totaal [aantal]

* Gemiddeld inkomen per inwoner [€]

* Gemiddeld inkomen per huishouden [€]
Processing and output:

The file was saved as a CSV and used in the script below. The final output is generates a CSV file.

## Before running:
1. Update directories and file names as necessary under CONFIGURATION


In [25]:
import pandas as pd
import os

# -------------------------------
# CONFIGURATION
# -------------------------------
RAW_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../clean_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only process CSVs that match this naming pattern
target_files = [
    f for f in os.listdir(RAW_DATA_DIR)
    if f.startswith("energyconsumption-buurt") and f.endswith(".csv")
]

# -------------------------------
# FUNCTIONS
# -------------------------------
def clean_column_names_preserve_units(df):
    # Strip whitespace only; keep full original column names
    df.columns = [col.strip() for col in df.columns]
    return df

def strip_whitespace(df):
    return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

def clean_data(df):
    df = strip_whitespace(df)
    df = df.dropna(how='all')
    df = df.drop_duplicates()
    return df

def validate_columns(df):
    columns_to_validate = [
        "Aardgasverbruik woningen [miljoen m3]",
        "Elektriciteitsverbruik woningen [GWh (miljoen kWh)]",
        "CO2-uitstoot Woningen (elektriciteit) [kton]",
        "CO2-uitstoot Woningen (aardgas) [kton]",
        "Huishoudens [aantal]",
        "Bevolking totaal [aantal]"
    ]
    for col in columns_to_validate:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            print(f"🔢 '{col}' converted to numeric.")
        else:
            print(f"⚠️ Column '{col}' not found.")
    return df

def summarize_neighborhoods(df):
    col = "Buurt_naam"
    if col in df.columns:
        print(f"📌 Unique neighborhoods: {df[col].nunique()}")
        print("🏘️ Rows per neighborhood:\n", df[col].value_counts())
    else:
        print(f"⚠️ Column '{col}' not found.")

# -------------------------------
# BATCH PROCESSING
# -------------------------------
for filename in target_files:
    print(f"\n🔄 Processing file: {filename}")
    input_path = os.path.join(RAW_DATA_DIR, filename)
    output_filename = filename.replace(".csv", "_clean_validated.csv")
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    try:
        df = pd.read_csv(input_path, sep=",", encoding="utf-8")
        df = clean_column_names_preserve_units(df)
        print("🧾 Column names:")
        print(df.columns.tolist())

        df = clean_data(df)
        df = validate_columns(df)
        df = df.fillna(0)  # ⬅️ Replace *all* remaining NaNs with 0

        print(f"✅ Total rows after cleaning and validation: {len(df)}")
        summarize_neighborhoods(df)

        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"📁 Saved cleaned file to: {output_path}")
    except Exception as e:
        print(f"❌ Failed to process {filename}: {e}")




🔄 Processing file: energyconsumption-buurt-2015.csv
🧾 Column names:
['Buurt_naam', 'Totaal aardgasverbruik woningen (temperatuurgecorrigeerd) [TJ]', 'Aardgasverbruik woningen [miljoen m3]', 'Elektriciteitsverbruik woningen [GWh (miljoen kWh)]', 'Aardgasverbruik woningen (temperatuurgecorrigeerd) [miljoen m3]', 'CO2-uitstoot Woningen (elektriciteit) [kton]', 'CO2-uitstoot Woningen (aardgas, temperatuurgecorrigeerd) [kton]', 'CO2-uitstoot Woningen (aardgas) [kton]', 'Consumentenprijs gas (variabele kosten incl. energiebelasting en BTW) [€]', 'Consumentenprijs elektriciteit (variabele kosten incl. energiebelasting en BTW) [€]', 'Aantal woningen met verwarming op basis van aardgas  (benadering) [aantal]', 'Aantal woningen met elektrische verwarming (benadering) [aantal]', 'Aantal geregistreerde installaties met zonnepanelen woningen [aantal]', 'Vermogen geregistreerde zonnepanelen woningen [kW]', 'Vermogen geregistreerde zonnepanelen per woning [Wattpiek]', 'Wind op land fysiek opgesteld 

  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


## Minimize and Combine all cleaned files

The following script will combine all the files from the output above and add a year column extracted from the filename.capitalize .It also filters and minimizes the clomuns to only keep specific ones

In [26]:
import pandas as pd
import os
import re

# -------------------------------
# CONFIGURATION
# -------------------------------
CLEAN_DATA_DIR = "../clean_data/"
OUTPUT_FILE = os.path.join(CLEAN_DATA_DIR, "energyconsumption_buurt_combined_filtered.csv")

# Match filenames like energyconsumption-buurt-2021_clean_validated.csv
pattern = re.compile(r"energyconsumption-buurt-(\d{4})_clean_validated\.csv")
files = [f for f in os.listdir(CLEAN_DATA_DIR) if pattern.match(f)]

# -------------------------------
# COLUMNS TO KEEP
# -------------------------------
columns_to_keep = [
    "Buurt_naam",
    "Totaal aardgasverbruik woningen (temperatuurgecorrigeerd) [TJ]",
    "Elektriciteitsverbruik woningen [GWh (miljoen kWh)]",
    "CO2-uitstoot Woningen (elektriciteit) [kton]",
    "CO2-uitstoot Woningen (aardgas, temperatuurgecorrigeerd) [kton]",
    "Consumentenprijs gas (variabele kosten incl. energiebelasting en BTW) [€]",
    "Consumentenprijs elektriciteit (variabele kosten incl. energiebelasting en BTW) [€]",
    "Aantal woningen met elektrische verwarming (benadering) [aantal]",
    "Aantal geregistreerde installaties met zonnepanelen woningen [aantal]"
]

# -------------------------------
# PROCESSING
# -------------------------------
combined_df = []

for filename in files:
    match = pattern.match(filename)
    if match:
        year = int(match.group(1))
        file_path = os.path.join(CLEAN_DATA_DIR, filename)

        try:
            df = pd.read_csv(file_path)

            # Check which columns are actually present
            available_cols = [col for col in columns_to_keep if col in df.columns]
            missing_cols = set(columns_to_keep) - set(available_cols)
            if missing_cols:
                print(f"⚠️ {filename} is missing columns: {missing_cols}")

            df_filtered = df[available_cols].copy()
            df_filtered['year'] = year
            combined_df.append(df_filtered)

            print(f"✅ Loaded {filename} with {len(df_filtered)} rows.")
        except Exception as e:
            print(f"❌ Failed to load {filename}: {e}")

# Combine all dataframes
if combined_df:
    merged_df = pd.concat(combined_df, ignore_index=True)
    merged_df.to_csv(OUTPUT_FILE, index=False)
    print(f"\n📁 Combined filtered file saved to: {OUTPUT_FILE}")
    print(f"🧮 Total combined rows: {len(merged_df)}")
else:
    print("⚠️ No valid files were found or loaded.")



✅ Loaded energyconsumption-buurt-2015_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2016_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2017_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2018_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2019_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2020_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2021_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2022_clean_validated.csv with 78 rows.
✅ Loaded energyconsumption-buurt-2023_clean_validated.csv with 78 rows.

📁 Combined filtered file saved to: ../clean_data/energyconsumption_buurt_combined_filtered.csv
🧮 Total combined rows: 702


## Map to neighbourhood_shapefile

The following code maps the demographic data by buurt_naam to the neighborhood shapefile Buurtgrenzen_Zwolle.shp.

In [27]:
import geopandas as gpd
import pandas as pd
import os
import re
import unicodedata

# -------------------------------
# CONFIGURATION
# -------------------------------
CLEAN_DATA_DIR = "../clean_data/"
OUTPUT_DIR = "../mapped_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SHAPEFILE_PATH = "../raw_data/Buurtgrenzen_Zwolle.shp"
RAW_SHAPE_COLUMN = "OFFICIËLE"
DATA_KEY = "buurt_naam"

# -------------------------------
# FUNCTIONS
# -------------------------------
def normalize_name(name):
    if pd.isna(name):
        return ""
    name = str(name).strip().lower()
    name = unicodedata.normalize("NFKD", name)
    return "".join(c for c in name if not unicodedata.combining(c))

# -------------------------------
# LOAD SHAPEFILE
# -------------------------------
gdf_buurten = gpd.read_file(SHAPEFILE_PATH)

if RAW_SHAPE_COLUMN not in gdf_buurten.columns:
    print(f"❌ ERROR: Column '{RAW_SHAPE_COLUMN}' not found in shapefile.")
    print("📋 Available columns:", gdf_buurten.columns.tolist())
    raise SystemExit

gdf_buurten["buurt_naam_normalized"] = gdf_buurten[RAW_SHAPE_COLUMN].apply(normalize_name)

# -------------------------------
# PROCESS EACH CLEANED ENERGY FILE
# -------------------------------
pattern = re.compile(r"energyconsumption-buurt-(\d{4})_clean_validated\.csv")
files = [f for f in os.listdir(CLEAN_DATA_DIR) if pattern.match(f)]

print("📂 Found cleaned ENERGY CSV files:")
print(files)

for filename in files:
    match = pattern.match(filename)
    if not match:
        continue

    year = match.group(1)
    csv_path = os.path.join(CLEAN_DATA_DIR, filename)
    output_path = os.path.join(OUTPUT_DIR, f"energy_buurt_{year}_mapped.geojson")

    try:
        df = pd.read_csv(csv_path)

        # Normalize column names
        df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
        print(f"🧪 Columns in {filename}: {df.columns.tolist()}")

        # Normalize buurt name column
        if "buurt_naam" in df.columns:
            df["buurt_naam"] = df["buurt_naam"].apply(normalize_name)
        elif "buurtnaam" in df.columns:
            df["buurt_naam"] = df["buurtnaam"].apply(normalize_name)
        else:
            print(f"⚠️ No 'buurt_naam' or 'buurtnaam' column found in {filename}. Skipping.")
            continue

        # Merge with shapefile
        merged_gdf = gdf_buurten.merge(df, how="left", left_on="buurt_naam_normalized", right_on="buurt_naam")

        if merged_gdf.empty:
            print(f"⚠️ Merged GeoDataFrame is empty for {filename}. Skipping export.")
            continue

        # Optional: check match rate
        matched = merged_gdf["buurt_naam"].notna().sum()
        print(f"🔗 Match rate: {matched}/{len(merged_gdf)}")

        # Export
        merged_gdf.to_file(output_path, driver="GeoJSON")
        print(f"🌍 Exported energy GeoJSON for {year}: {output_path}")

    except Exception as e:
        print(f"❌ Failed to map {filename}: {e}")


📂 Found cleaned ENERGY CSV files:
['energyconsumption-buurt-2015_clean_validated.csv', 'energyconsumption-buurt-2016_clean_validated.csv', 'energyconsumption-buurt-2017_clean_validated.csv', 'energyconsumption-buurt-2018_clean_validated.csv', 'energyconsumption-buurt-2019_clean_validated.csv', 'energyconsumption-buurt-2020_clean_validated.csv', 'energyconsumption-buurt-2021_clean_validated.csv', 'energyconsumption-buurt-2022_clean_validated.csv', 'energyconsumption-buurt-2023_clean_validated.csv']
🧪 Columns in energyconsumption-buurt-2015_clean_validated.csv: ['buurt_naam', 'totaal_aardgasverbruik_woningen_(temperatuurgecorrigeerd)_[tj]', 'aardgasverbruik_woningen_[miljoen_m3]', 'elektriciteitsverbruik_woningen_[gwh_(miljoen_kwh)]', 'aardgasverbruik_woningen_(temperatuurgecorrigeerd)_[miljoen_m3]', 'co2-uitstoot_woningen_(elektriciteit)_[kton]', 'co2-uitstoot_woningen_(aardgas,_temperatuurgecorrigeerd)_[kton]', 'co2-uitstoot_woningen_(aardgas)_[kton]', 'consumentenprijs_gas_(variabele_