# Clean Demographic Data
This script cleans Cijfers data downloaded from cijfersoverzwolle.nl

About the raw data:

The Cijfers data was originally download as XLSX files. Upon initial inspection, it was noted that the files contained extra informational columns and rows that needed to be deleted. The information included a header. The first column was given a header of Buurt_name and is listed with the following original columns:
 - Buurt_naam
 - woningen [woonruimten]
 - woonruimten totaal [woonruimten] (1)
 - inwoners op 1 januari [personen] (1)
 - inwoners in de leeftijd van 0-3 jaar [personen] (1)
 - inwoners in de leeftijd van 4-11 jaar [personen] (1)
 - inwoners in de leeftijd van 12-17 jaar [personen] (1)
 - inwoners in de leeftijd van 18-24 jaar [personen] (1)
 - inwoners in de leeftijd van 25-39 jaar [personen] (1)
 - inwoners in de leeftijd van 40-54 jaar [personen] (1)
 - inwoners in de leeftijd van 55-64 jaar [personen] (1)
 - inwoners in de leeftijd van 65-74 jaar [personen] (1)
 - inwoners in de leeftijd van 75-84 jaar [personen] (1)
 - inwoners in de leeftijd van 85 jaar en ouder [personen] (1)
 - inwoners in de leeftijd van 0-17 jaar [personen] (1)
 - inwoners in de leeftijd van 18-64 jaar [personen] (1)
 - inwoners in de leeftijd van 65 jaar en ouder [personen] (1)
 - gemiddeld gestandaardiseerd inkomen per huishouden [euro]
 - % huishoudens onder 105% van sociaal minimum [%]
 
Processing and output:

The file was saved as a CSV and used in the script below. The final output is generates a CSV file.

## Before running:
1. Update directories and file names as necessary under CONFIGURATION


In [3]:
import pandas as pd 
import os
import re

# -------------------------------
# CONFIGURATION
# -------------------------------
RAW_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../clean_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only process CSVs that match this naming pattern
target_files = [f for f in os.listdir(RAW_DATA_DIR) if f.startswith("demographics-buurt") and f.endswith(".csv")]

# Aliases to unify inconsistent column names
COLUMN_ALIASES = {
    "buurtname": "buurt_naam",
    "buurtnaam": "buurt_naam",
    "buurt": "buurt_naam",
    "wijknaam": "wijk_naam",
    "wijk": "wijk_naam",
}

# -------------------------------
# FUNCTIONS
# -------------------------------
def normalize_name(name):
    """Normalize neighborhood or district names."""
    if pd.isna(name):
        return name
    name = str(name).strip().lower()
    name = re.sub(r"\s+", " ", name)
    name = name.replace(".", "")
    return name

def clean_column_names(df):
    """Standardize and alias column headers."""
    def clean(col):
        col = re.sub(r"\[.*?\]|\(.*?\)", "", col)
        col = col.strip().lower()
        col = re.sub(r"[^\w\s]", "", col)
        col = re.sub(r"\s+", "_", col)
        col = re.sub(r"_+", "_", col)
        col = col.strip("_")
        col = COLUMN_ALIASES.get(col, col)  # Apply alias if available
        return col
    original_columns = df.columns.tolist()
    df.columns = [clean(col) for col in df.columns]
    print("🧾 Standardized column names:")
    for old, new in zip(original_columns, df.columns):
        if old != new:
            print(f"  - '{old}' ➜ '{new}'")
    return df

def strip_whitespace(df):
    return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

def clean_data(df):
    df = strip_whitespace(df)
    df = df.dropna(how='all')
    df = df.drop_duplicates()
    return df

def convert_to_integer(df, column_name):
    if column_name in df.columns:
        original_len = len(df)
        df[column_name] = (
            df[column_name]
            .astype(str)
            .str.extract(r"(\d+)", expand=False)
            .dropna()
            .astype("Int64")
        )
        cleaned_len = df[column_name].notna().sum()
        print(f"🔢 Converted '{column_name}' to integer. Valid entries: {cleaned_len}/{original_len}")
    else:
        print(f"⚠️ Column '{column_name}' not found.")
    return df

def validate_columns(df):
    columns_to_validate = [
        "woonruimten_totaal",
        "inwoners_op_1_januari",
        "inwoners_in_de_leeftijd_van_65_jaar_en_ouder",
        "inwoners_in_de_leeftijd_van_1864_jaar",
        "inwoners_in_de_leeftijd_van_017_jaar",
        "gemiddeld_gestandaardiseerd_inkomen_per_huishouden"
    ]
    for col in columns_to_validate:
        df = convert_to_integer(df, col)
    return df

def drop_invalid_woonruimten(df):
    if "woonruimten_totaal" in df.columns:
        original_len = len(df)
        df = df[df["woonruimten_totaal"].notna()]
        print(f"🧹 Dropped {original_len - len(df)} rows with invalid 'woonruimten_totaal'")
    else:
        print("⚠️ Column 'woonruimten_totaal' not found.")
    return df

def summarize_neighborhoods(df):
    if "buurt_naam" in df.columns:
        print(f"📌 Unique neighborhoods: {df['buurt_naam'].nunique()}")
        print("🏘️ Rows per neighborhood:\n", df['buurt_naam'].value_counts()) 
    if "wijk_naam" in df.columns:
        print(f"📍 Unique districts: {df['wijk_naam'].nunique()}")
        print("📊 Rows per district:\n", df['wijk_naam'].value_counts()) 

# -------------------------------
# BATCH PROCESSING
# -------------------------------
for filename in target_files:
    print(f"\n🔄 Processing file: {filename}")
    input_path = os.path.join(RAW_DATA_DIR, filename)
    output_filename = filename.replace(".csv", "_clean_validated.csv")
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    try:
        df = pd.read_csv(input_path, sep=";", encoding="utf-8")
        df = clean_column_names(df)

        # Normalize buurt_naam and wijk_naam
        if "buurt_naam" in df.columns:
            df["buurt_naam"] = df["buurt_naam"].apply(normalize_name)
        else:
            print(f"⚠️ No 'buurt_naam' column found in {filename}. Skipping.")
            continue

        if "wijk_naam" in df.columns:
            df["wijk_naam"] = df["wijk_naam"].apply(normalize_name)
        else:
            print(f"⚠️ No 'wijk_naam' column found in {filename}. Proceeding without it.")

        df = clean_data(df)
        df = validate_columns(df)
        df = drop_invalid_woonruimten(df)

        print(f"✅ Total rows after cleaning and validation: {len(df)}")
        summarize_neighborhoods(df)

        df.to_csv(output_path, index=False)
        print(f"📁 Saved cleaned file to: {output_path}")
    except Exception as e:
        print(f"❌ Failed to process {filename}: {e}")



🔄 Processing file: demographics-buurt-2015.csv
🧾 Standardized column names:
  - 'Buurt_naam' ➜ 'buurt_naam'
  - 'woningen [woonruimten]' ➜ 'woningen'
  - 'woonruimten totaal [woonruimten] (1)' ➜ 'woonruimten_totaal'
  - 'inwoners op 1 januari [personen] (1)' ➜ 'inwoners_op_1_januari'
  - 'inwoners in de leeftijd van 0-3 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_03_jaar'
  - 'inwoners in de leeftijd van 4-11 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_411_jaar'
  - 'inwoners in de leeftijd van 12-17 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_1217_jaar'
  - 'inwoners in de leeftijd van 18-24 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_1824_jaar'
  - 'inwoners in de leeftijd van 25-39 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_2539_jaar'
  - 'inwoners in de leeftijd van 40-54 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_4054_jaar'
  - 'inwoners in de leeftijd van 55-64 jaar [personen] (1)' ➜ 'inwoners_in_de_leeftijd_van_5564_jaar'
  - 'i

  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


## Minimize Data

This process selects columns that will be used for future modeling.

In [4]:
import pandas as pd
import os

# -------------------------------
# CONFIGURATION
# -------------------------------
INPUT_DIR = "../clean_data/"
OUTPUT_DIR = "../minimized_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Only process cleaned and validated files
target_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("demographics-buurt") and f.endswith("_clean_validated.csv")]

# Define essential columns to keep
MINIMAL_COLUMNS = [
    "buurt_naam",
    "woonruimten_totaal",
    "inwoners_op_1_januari",
    "inwoners_in_de_leeftijd_van_017_jaar",
    "inwoners_in_de_leeftijd_van_1864_jaar",
    "inwoners_in_de_leeftijd_van_65_jaar_en_ouder",
    "gemiddeld_gestandaardiseerd_inkomen_per_huishouden"
]

# -------------------------------
# PROCESSING LOOP
# -------------------------------
for filename in target_files:
    print(f"\n📂 Minimizing file: {filename}")
    input_path = os.path.join(INPUT_DIR, filename)
    output_filename = filename.replace("_clean_validated.csv", "_minimized.csv")
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    try:
        df = pd.read_csv(input_path)

        # Keep only minimal columns (if present)
        available_cols = [col for col in MINIMAL_COLUMNS if col in df.columns]
        missing = [col for col in MINIMAL_COLUMNS if col not in df.columns]

        if missing:
            print(f"⚠️ Missing columns in {filename}: {missing}")

        minimized_df = df[available_cols]

        print(f"✅ Keeping columns: {available_cols}")
        print(f"🔢 Rows retained: {len(minimized_df)}")

        minimized_df.to_csv(output_path, index=False)
        print(f"💾 Saved minimized file to: {output_path}")

    except Exception as e:
        print(f"❌ Failed to minimize {filename}: {e}")



📂 Minimizing file: demographics-buurt-2015_clean_validated.csv
✅ Keeping columns: ['buurt_naam', 'woonruimten_totaal', 'inwoners_op_1_januari', 'inwoners_in_de_leeftijd_van_017_jaar', 'inwoners_in_de_leeftijd_van_1864_jaar', 'inwoners_in_de_leeftijd_van_65_jaar_en_ouder', 'gemiddeld_gestandaardiseerd_inkomen_per_huishouden']
🔢 Rows retained: 78
💾 Saved minimized file to: ../minimized_data/demographics-buurt-2015_minimized.csv

📂 Minimizing file: demographics-buurt-2016_clean_validated.csv
✅ Keeping columns: ['buurt_naam', 'woonruimten_totaal', 'inwoners_op_1_januari', 'inwoners_in_de_leeftijd_van_017_jaar', 'inwoners_in_de_leeftijd_van_1864_jaar', 'inwoners_in_de_leeftijd_van_65_jaar_en_ouder', 'gemiddeld_gestandaardiseerd_inkomen_per_huishouden']
🔢 Rows retained: 78
💾 Saved minimized file to: ../minimized_data/demographics-buurt-2016_minimized.csv

📂 Minimizing file: demographics-buurt-2017_clean_validated.csv
✅ Keeping columns: ['buurt_naam', 'woonruimten_totaal', 'inwoners_op_1_jan

## Combine all cleaned files

The following script will combine all the files from the output above and add a year column extracted from the filename.capitalize 

In [5]:
import pandas as pd
import os
import re

# -------------------------------
# CONFIGURATION
# -------------------------------
CLEAN_DATA_DIR = "../clean_data/"
OUTPUT_FILE = os.path.join(CLEAN_DATA_DIR, "demographics_buurt_combined.csv")

# Match filenames like demographics_buurt_2021_clean_validated.csv
pattern = re.compile(r"demographics-buurt-(\d{4})_clean_validated\.csv")
files = [f for f in os.listdir(CLEAN_DATA_DIR) if pattern.match(f)]

# -------------------------------
# PROCESSING
# -------------------------------
combined_df = []

for filename in files:
    match = pattern.match(filename)
    if match:
        year = int(match.group(1))
        file_path = os.path.join(CLEAN_DATA_DIR, filename)

        try:
            df = pd.read_csv(file_path)
            df['year'] = year
            combined_df.append(df)
            print(f"✅ Loaded {filename} with {len(df)} rows.")
        except Exception as e:
            print(f"❌ Failed to load {filename}: {e}")

# Combine all dataframes
if combined_df:
    merged_df = pd.concat(combined_df, ignore_index=True)
    merged_df.to_csv(OUTPUT_FILE, index=False)
    print(f"\n📁 Combined file saved to: {OUTPUT_FILE}")
    print(f"🧮 Total combined rows: {len(merged_df)}")
else:
    print("⚠️ No valid files were found or loaded.")


✅ Loaded demographics-buurt-2015_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2016_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2017_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2018_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2019_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2020_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2021_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2023_clean_validated.csv with 78 rows.
✅ Loaded demographics-buurt-2024_clean_validated.csv with 78 rows.

📁 Combined file saved to: ../clean_data/demographics_buurt_combined.csv
🧮 Total combined rows: 702


## Map to Neighbourhood_shapefile

The following code maps the demographic data by buurt_naam to the neighborhood shapefile Buurtgrenzen_Zwolle.shp.



In [6]:
import geopandas as gpd
import pandas as pd
import os
import re
import unicodedata

# -------------------------------
# CONFIGURATION
# -------------------------------
CLEAN_DATA_DIR = "../minimized_data/"
OUTPUT_DIR = "../mapped_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

SHAPEFILE_PATH = "../raw_data/Buurtgrenzen_Zwolle.shp"
RAW_SHAPE_COLUMN = "OFFICIËLE"  # original column with special characters
DATA_KEY = "buurt_naam"

# -------------------------------
# FUNCTIONS
# -------------------------------
def normalize_name(name):
    if pd.isna(name):
        return ""
    name = str(name).strip().lower()
    name = unicodedata.normalize("NFKD", name)
    return "".join(c for c in name if not unicodedata.combining(c))

# -------------------------------
# LOAD SHAPEFILE
# -------------------------------
gdf_buurten = gpd.read_file(SHAPEFILE_PATH)

# Check for correct shapefile column
if RAW_SHAPE_COLUMN not in gdf_buurten.columns:
    print(f"❌ ERROR: Column '{RAW_SHAPE_COLUMN}' not found in shapefile.")
    print("📋 Available columns:", gdf_buurten.columns.tolist())
    raise SystemExit

gdf_buurten["buurt_naam_normalized"] = gdf_buurten[RAW_SHAPE_COLUMN].apply(normalize_name)

# -------------------------------
# PROCESS EACH CLEANED FILE
# -------------------------------
pattern = re.compile(r"demographics-buurt-(\d{4})_minimized\.csv")
files = [f for f in os.listdir(CLEAN_DATA_DIR) if pattern.match(f)]

print("📂 Found cleaned CSV files:")
print(files)

for filename in files:
    match = pattern.match(filename)
    if not match:
        continue

    year = match.group(1)
    csv_path = os.path.join(CLEAN_DATA_DIR, filename)
    output_path = os.path.join(OUTPUT_DIR, f"demographics_buurt_{year}_mapped.geojson")

    try:
        df = pd.read_csv(csv_path)

        # Merge with shapefile
        merged_gdf = gdf_buurten.merge(df, how="left", left_on="buurt_naam_normalized", right_on="buurt_naam")

        if merged_gdf.empty:
            print(f"⚠️ Merged GeoDataFrame is empty for {filename}. Skipping export.")
            continue

        merged_gdf.to_file(output_path, driver="GeoJSON")
        print(f"🌍 Exported GeoJSON for {year}: {output_path}")

    except Exception as e:
        print(f"❌ Failed to map {filename}: {e}")



📂 Found cleaned CSV files:
['demographics-buurt-2015_minimized.csv', 'demographics-buurt-2016_minimized.csv', 'demographics-buurt-2017_minimized.csv', 'demographics-buurt-2018_minimized.csv', 'demographics-buurt-2019_minimized.csv', 'demographics-buurt-2020_minimized.csv', 'demographics-buurt-2021_minimized.csv', 'demographics-buurt-2023_minimized.csv', 'demographics-buurt-2024_minimized.csv']
🌍 Exported GeoJSON for 2015: ../mapped_data/demographics_buurt_2015_mapped.geojson
🌍 Exported GeoJSON for 2016: ../mapped_data/demographics_buurt_2016_mapped.geojson
🌍 Exported GeoJSON for 2017: ../mapped_data/demographics_buurt_2017_mapped.geojson
🌍 Exported GeoJSON for 2018: ../mapped_data/demographics_buurt_2018_mapped.geojson
🌍 Exported GeoJSON for 2019: ../mapped_data/demographics_buurt_2019_mapped.geojson
🌍 Exported GeoJSON for 2020: ../mapped_data/demographics_buurt_2020_mapped.geojson
🌍 Exported GeoJSON for 2021: ../mapped_data/demographics_buurt_2021_mapped.geojson
🌍 Exported GeoJSON for