# Clean and GeoCode Energy Coaching Data from HOOMdossier
This script cleans and geocodes energy coaching data from HOOMdossier.

About the raw data:

The energy coaching data from HOOMdossier was originally provided as an XLSX file. Upon initial inspection, it was noted that the file contained the following columns:
 - Datum afspraak coachgesprek: Appointment date
 - Postcode: 6-character postal code (4 digits followed by 2 letters)
 - Telling: Count (or ID/index)

Processing and output:

The file was saved as a CSV and used in the script below. Nominatim (OpenStreetMap) is used to look up the latitude and longitude for each postcode. The final output is a pair of shapefiles with the coach visits aggregated per neighbourhood and per district.

## Before running:
1. Update directories and file names as necessary under CONFIGURATION




In [3]:
import pandas as pd
import re
import os

# -------------------------------
# CONFIGURATION
# -------------------------------
RAW_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../clean_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

input_file = RAW_DATA_DIR + "Coachgesprekken HOOMdossier.csv"
output_file = OUTPUT_DIR + "coachgesprekken_clean_validated.csv"

# -------------------------------
# Load and clean data
# -------------------------------
df = pd.read_csv(input_file, sep=";")

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(".", "_")

# Strip whitespace from string fields
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# -------------------------------
# Format postcode
# -------------------------------
if 'postcode' in df.columns:
    df['postcode'] = df['postcode'].astype(str).str.replace(" ", "").str.upper()

# -------------------------------
# Validate postcode format
# Dutch postcode format: 4 digits + 2 uppercase letters (no space)
# Example: 1234AB
# -------------------------------
# First digit 1-9, three more digits, two uppercase letters. No anchors are
# needed because the check below uses fullmatch().
postcode_pattern = re.compile(r"[1-9][0-9]{3}[A-Z]{2}")


def is_valid_postcode(postcode):
    """Return True when *postcode* is exactly a Dutch postcode such as ``1234AB``.

    The value is converted with ``str()`` first, so ``None`` and NaN are
    simply reported as invalid. ``fullmatch`` is used instead of ``match``
    with ``$`` because ``$`` also matches just before a trailing newline,
    which would let ``"1234AB\\n"`` through.
    """
    return postcode_pattern.fullmatch(str(postcode)) is not None

if 'postcode' in df.columns:
    # Boolean flag per row; rows are flagged, not removed.
    df['valid_postcode'] = df['postcode'].apply(is_valid_postcode)

# -------------------------------
# Convert date columns
# -------------------------------
date_cols = ['datum_afspraak_coachgesprek']
for col in date_cols:
    if col in df.columns:
        # NOTE(review): the export presumably uses Dutch day-first dates
        # (DD-MM-YYYY) - confirm against the raw file. With the month-first
        # default, pandas warns for every date whose day is > 12 and would
        # read the remaining dates with day and month swapped.
        # Unparseable values become NaT because of errors='coerce'.
        df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)

# -------------------------------
# Remove empty rows and duplicates
# -------------------------------
df = df.dropna(how='all')  # Entirely empty rows
df = df.drop_duplicates()

# -------------------------------
# Summary stats (printed to console)
# -------------------------------
total_rows = len(df)
if 'valid_postcode' in df.columns:
    # Cast to a plain int so the arithmetic and the printed value do not
    # depend on numpy scalar types.
    valid_postcodes = int(df['valid_postcode'].sum())
    invalid_postcodes = total_rows - valid_postcodes
else:
    valid_postcodes = "N/A"
    invalid_postcodes = "N/A"

print(f"✅ Total rows: {total_rows}")
print(f"✅ Valid postcodes: {valid_postcodes}")
print(f"❌ Invalid postcodes: {invalid_postcodes}")

# -------------------------------
# Save cleaned and validated file
# -------------------------------
df.to_csv(output_file, index=False)

✅ Total rows: 312
✅ Valid postcodes: 312
❌ Invalid postcodes: 0


  df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(".", "_")
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, f

## Minimize Data

This process selects columns that will be used for future modeling.

In [4]:
import pandas as pd
import os

# -------------------------------
# CONFIGURATION
# -------------------------------
INPUT_DIR = "../clean_data/"
OUTPUT_DIR = "../minimized_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

input_file = f"{INPUT_DIR}coachgesprekken_clean_validated.csv"
output_file = f"{OUTPUT_DIR}coachgesprekken_minimized.csv"

# -------------------------------
# Load cleaned data
# -------------------------------
df = pd.read_csv(input_file)

# -------------------------------
# Select required columns
# -------------------------------
# The order of this list is also the column order of the output file.
required_columns = ['telling', 'datum_afspraak_coachgesprek', 'postcode']

# Collect every required column the cleaned file lacks, in list order,
# and stop before writing anything if there are any.
missing_columns = []
for name in required_columns:
    if name not in df.columns:
        missing_columns.append(name)
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")

# Keep only the required columns, in the order given above
df_selected = df.loc[:, required_columns]

# -------------------------------
# Save selected columns to new file
# -------------------------------
df_selected.to_csv(output_file, index=False)
print(f"✅ Selected columns saved to: {output_file}")

✅ Selected columns saved to: ../minimized_data/coachgesprekken_minimized.csv


## Before Running:
1. Update directory and file names as necessary
1. Uses Nominatim open-source geocoding with OpenStreetMap data. View the license here: https://nominatim.org

In [5]:
import pandas as pd
import os
from geopy.geocoders import Nominatim
import time 
from geopy.exc import GeocoderTimedOut

#-----------------------------------------
# CONFIGURATION
#----------------------------------------

CLEAN_DATA_DIR = "../minimized_data/"
OUTPUT_DIR = "../geocoded_data/"

# Make sure the destination folder exists before anything is written to it
os.makedirs(OUTPUT_DIR, exist_ok=True)

input_file = f"{CLEAN_DATA_DIR}coachgesprekken_minimized.csv"
output_file = f"{OUTPUT_DIR}coachgesprekken_geocoded.csv"

#----------------------------------------------
# Load CSV file
#----------------------------------------------
df = pd.read_csv(input_file)

#----------------------------------------------
# Ensure there's a 'postcode' column
#----------------------------------------------
# Geocoding is keyed on the postcode, so stop early when it is absent.
has_postcode_column = 'postcode' in df.columns
if not has_postcode_column:
    raise ValueError("CSV file must contain a 'postcode' column")

#-----------------------------------------
# Use Nominatim open-source geocoder with OpenStreetMap data
#------------------------------------------
# One shared geocoder client, identified to the service by its user agent
geolocator = Nominatim(user_agent="geo_script")

# Function to get latitude and longitude with retry logic
def get_lat_lon(postcode, retries=3, delay=1):
    """Look up a postcode in the Netherlands and return ``(latitude, longitude)``.

    Uses the module-level ``geolocator``. Returns ``(None, None)`` when the
    postcode is blank or missing, when the service finds no match, or when
    every attempt timed out.

    Args:
        postcode: Postcode to look up; converted with ``str()`` and trimmed.
        retries: Number of extra attempts after a ``GeocoderTimedOut``.
        delay: Seconds to wait before the first retry; doubled on each
            further retry (exponential backoff).
    """
    query = "" if postcode is None else str(postcode).strip()
    # "nan" is what a missing value turns into after astype(str); never send
    # it (or an empty query) to the geocoding service.
    if not query or query.lower() == "nan":
        return None, None

    backoff = delay
    max_retries = max(retries, 0)
    # A loop rather than recursion: recursive retries ran one extra
    # rate-limit sleep per nesting level while unwinding.
    for attempt in range(max_retries + 1):
        try:
            # Restrict the search to the Netherlands
            location = geolocator.geocode(query, country_codes="NL")
        except GeocoderTimedOut:
            if attempt == max_retries:
                return None, None
            time.sleep(backoff)
            backoff *= 2  # Exponential backoff
        else:
            if location:
                return location.latitude, location.longitude
            return None, None
        finally:
            time.sleep(1)  # Respect rate limits: pause once after every request
    return None, None

#----------------------------------------------
# Apply geocoding function
#-----------------------------------------------
# Geocode every distinct postcode only once: repeated postcodes would
# otherwise trigger identical (rate-limited) requests to the service.
postcode_keys = df['postcode'].astype(str)
distinct_postcodes = df['postcode'].dropna().astype(str).unique()
coordinates_by_postcode = {code: get_lat_lon(code) for code in distinct_postcodes}

# Map the results back onto every row; rows with a missing postcode get
# empty coordinates.
row_coordinates = [
    coordinates_by_postcode.get(code, (None, None)) for code in postcode_keys
]
coordinates_df = pd.DataFrame(
    row_coordinates,
    index=df.index,
    columns=['latitude', 'longitude'],
    dtype=float,  # None becomes NaN so both columns stay numeric
)
df['latitude'] = coordinates_df['latitude']
df['longitude'] = coordinates_df['longitude']

# Make failed lookups visible instead of letting them pass silently
unresolved_rows = int(df['latitude'].isna().sum())
if unresolved_rows:
    print(f"⚠️ {unresolved_rows} of {len(df)} rows could not be geocoded")

#-----------------------------------------------
# Save GeoCoded results to a new CSV file
#-----------------------------------------------
df.to_csv(output_file, index=False)

print("Geocoding complete. Saved as coachgesprekken_geocoded.csv")

Geocoding complete. Saved as coachgesprekken_geocoded.csv


# Aggregation based on neighbourhood

This code aggregates energy coach visits by performing a spatial join between geocoded visit locations (coachgesprekken_geocoded.csv) and neighborhood boundaries (Buurtgrenzen_Zwolle.shp). It calculates the total number of coach visits within each neighborhood.

In [1]:
import geopandas as gpd
import pandas as pd
import os

GEOCODED_DATA_DIR = "../geocoded_data/"
NEIGHBORHOOD_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../aggregated_data/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define paths explicitly
input_file = os.path.join(GEOCODED_DATA_DIR, "coachgesprekken_geocoded.csv")
neighborhood_file = os.path.join(NEIGHBORHOOD_DATA_DIR, "Buurtgrenzen_Zwolle.shp")

# Load coach visit data
coaches_df = pd.read_csv(input_file)

# Rows with an empty latitude or longitude cannot be placed on the map;
# leave them out explicitly instead of building points from NaN values.
has_coordinates = coaches_df[['latitude', 'longitude']].notna().all(axis=1)
missing_coordinates = int((~has_coordinates).sum())
if missing_coordinates:
    print(f"⚠️ Skipped {missing_coordinates} visits without coordinates")
coaches_df = coaches_df[has_coordinates]

# Convert to GeoDataFrame (longitude is x, latitude is y; WGS84)
coaches_gdf = gpd.GeoDataFrame(
    coaches_df,
    geometry=gpd.points_from_xy(coaches_df.longitude, coaches_df.latitude),
    crs='EPSG:4326'
)

# Load neighborhoods shapefile and reproject it to the CRS of the points
neighborhoods_gdf = gpd.read_file(neighborhood_file).to_crs('EPSG:4326')

# Spatial join: assign visits to neighborhoods
joined_gdf = gpd.sjoin(coaches_gdf, neighborhoods_gdf, predicate='within')

# The join keeps matching points only, so report visits that fall outside
# every neighborhood rather than dropping them unnoticed.
outside_boundaries = int((~coaches_gdf.index.isin(joined_gdf.index)).sum())
if outside_boundaries:
    print(f"⚠️ {outside_boundaries} visits fall outside all neighborhood boundaries")

# Aggregate number of coach visits per official neighborhood (one row = one visit)
aggregated_visits = joined_gdf.groupby('OFFICIËLE').size().reset_index(name='coach_visits')

# Merge aggregation results back with neighborhood geometry;
# neighborhoods without any visit get 0.
result_gdf = neighborhoods_gdf.merge(aggregated_visits, on='OFFICIËLE', how='left')
result_gdf['coach_visits'] = result_gdf['coach_visits'].fillna(0).astype(int)

# Save aggregated data to shapefile
# NOTE: ESRI Shapefile field names are limited to 10 characters, so
# 'coach_visits' is shortened on write (presumably to 'coach_visi').
output_path = os.path.join(OUTPUT_DIR, "Zwolle_Neighbourhood_Aggregated_EnergyCoaching.shp")
result_gdf.to_file(output_path)

print(f"✅ Aggregated shapefile saved successfully to: {output_path}")


✅ Aggregated shapefile saved successfully to: ../aggregated_data/Zwolle_Neighbourhood_Aggregated_EnergyCoaching.shp


  result_gdf.to_file(output_path)
  ogr_write(


# Aggregation based on district

This code aggregates energy coach visits by performing a spatial join between geocoded visit locations (coachgesprekken_geocoded.csv) and district boundaries (Wijkgrenzen_Zwolle.shp). It calculates the total number of coach visits within each district.

In [2]:
import geopandas as gpd
import pandas as pd
import os

GEOCODED_DATA_DIR = "../geocoded_data/"
DISTRICT_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../aggregated_data/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define paths explicitly
input_file = os.path.join(GEOCODED_DATA_DIR, "coachgesprekken_geocoded.csv")
district_file = os.path.join(DISTRICT_DATA_DIR, "Wijkgrenzen_Zwolle.shp")

# Load coach visit data
coaches_df = pd.read_csv(input_file)

# Rows with an empty latitude or longitude cannot be placed on the map;
# leave them out explicitly instead of building points from NaN values.
has_coordinates = coaches_df[['latitude', 'longitude']].notna().all(axis=1)
missing_coordinates = int((~has_coordinates).sum())
if missing_coordinates:
    print(f"⚠️ Skipped {missing_coordinates} visits without coordinates")
coaches_df = coaches_df[has_coordinates]

# Convert to GeoDataFrame (longitude is x, latitude is y; WGS84)
coaches_gdf = gpd.GeoDataFrame(
    coaches_df,
    geometry=gpd.points_from_xy(coaches_df.longitude, coaches_df.latitude),
    crs='EPSG:4326'
)

# Load districts shapefile and reproject it to the CRS of the points
districts_gdf = gpd.read_file(district_file).to_crs('EPSG:4326')

# Spatial join: assign visits to districts
joined_gdf = gpd.sjoin(coaches_gdf, districts_gdf, predicate='within')

# The join keeps matching points only, so report visits that fall outside
# every district rather than dropping them unnoticed.
outside_boundaries = int((~coaches_gdf.index.isin(joined_gdf.index)).sum())
if outside_boundaries:
    print(f"⚠️ {outside_boundaries} visits fall outside all district boundaries")

# Aggregate number of coach visits per official district (one row = one visit)
aggregated_visits = joined_gdf.groupby('OFFICIËLE').size().reset_index(name='coach_visits')

# Merge aggregation results back with district geometry;
# districts without any visit get 0.
result_gdf = districts_gdf.merge(aggregated_visits, on='OFFICIËLE', how='left')
result_gdf['coach_visits'] = result_gdf['coach_visits'].fillna(0).astype(int)

# Save aggregated data to shapefile
# NOTE: ESRI Shapefile field names are limited to 10 characters, so
# 'coach_visits' is shortened on write (presumably to 'coach_visi').
output_path = os.path.join(OUTPUT_DIR, "Zwolle_Districts_Aggregated_EnergyCoaching.shp")
result_gdf.to_file(output_path)

print(f"✅ Aggregated shapefile saved successfully to: {output_path}")

✅ Aggregated shapefile saved successfully to: ../aggregated_data/Zwolle_Districts_Aggregated_EnergyCoaching.shp


  result_gdf.to_file(output_path)
  ogr_write(
