# Clean ISDE Subsidy Data
This script cleans ISDE subsidy data downloaded from RVO.nl 

About the raw data:

The ISDE subsidy data from RVO.nl was originally downloaded as XLSX files. Upon initial inspection, it was noted that the files contained extra informational columns and rows that needed to be deleted. The information included images and plain text introduction material. The following original columns remain:
 - POSTCODE
 - PLAATS
 - GEMEENTENAAM
 - GEMEENTECODE
 - Realisatiejaar
 - WIJKNAAM
 - BUURTNAAM
 - DEELPROGRAMMA
 - SUBSIDIEJAAR
 - TECHNIEK
 - SUBCATEGORIE
 - AANTAL_ADRESSEN
 - TOTALE_VERMOGEN
 - AANTAL_M2
 - AANTAL_APP
 

Processing and output:

The file was saved as a CSV and used in the script below. The final output is a CSV file.

## Before running:
1. Update directories and file names as necessary under CONFIGURATION




In [3]:
import pandas as pd
import os
import re

# -------------------------------
# CONFIGURATION
# -------------------------------
# Input and output folders; relative paths resolve against the current
# working directory.
RAW_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../clean_data/"
# Create the output folder up front; exist_ok=True makes re-runs a no-op.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Raw CSV that is read, and the cleaned CSV that is written at the end.
input_file = os.path.join(RAW_DATA_DIR, "ISDE_subsidies.csv")
output_file = os.path.join(OUTPUT_DIR, "isde_subsidies_clean.csv")

# -------------------------------
# FUNCTIONS
# -------------------------------
def clean_column_names(df):
    """Normalise the column headers of *df* in place and return *df*.

    Headers are stripped of surrounding whitespace, lowercased, and every
    space or dot is replaced with an underscore.
    """
    headers = df.columns.str.strip().str.lower()
    for separator in (" ", "."):
        # Literal replacement: "." must not be treated as a regex wildcard.
        headers = headers.str.replace(separator, "_", regex=False)
    df.columns = headers
    return df

def strip_whitespace(df):
    """Return a copy of *df* with surrounding whitespace removed from strings.

    Non-string cells (numbers, NaN, ...) are passed through unchanged.
    """
    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map; check the
    # class (not the instance) so a column literally named "map" cannot
    # shadow the method on older pandas versions.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip)
    return df.applymap(_strip)

def clean_data(df):
    """Return *df* with strings stripped, empty rows and duplicates removed.

    Steps run in this order: strip whitespace from string cells, drop rows
    in which every value is missing, then drop exact duplicate rows.
    """
    return (
        strip_whitespace(df)
        .dropna(how='all')
        .drop_duplicates()
    )

def remove_non_zwolle(df):
    """Keep only the rows whose 'plaats' value is Zwolle (case-insensitive).

    Prints how many rows were removed. When the 'plaats' column is absent a
    warning is printed and *df* is returned unchanged.
    """
    if 'plaats' not in df.columns:
        print("⚠️ Column 'plaats' not found in the dataset.")
        return df

    is_zwolle = df['plaats'].str.lower().eq('zwolle')
    kept = df[is_zwolle]
    print(f"🏙️ Removed {len(df) - len(kept)} rows not from Zwolle.")
    return kept

def validate_by_neighborhood(df):
    """Print the number of rows per 'buurtnaam' value and return *df* as-is.

    This is a reporting step only: the frame is never modified. When the
    'buurtnaam' column is absent a warning is printed instead.
    """
    if 'buurtnaam' not in df.columns:
        print("⚠️ Column 'buurtnaam' not found. Skipping neighborhood validation.")
        return df

    print("🏘️ Subsidy counts per neighborhood:\n", df['buurtnaam'].value_counts())
    return df

def validate_postcodes(df):
    """Drop rows whose 'postcode_agg' is not a Dutch postcode (1234AB).

    Values are upper-cased and stripped of spaces for the check only; the
    rows that are kept retain their original 'postcode_agg' text. The number
    of rejected rows is printed. The input frame is never modified: the
    validity mask is kept in a local Series instead of a temporary column.

    When the 'postcode_agg' column is absent a warning is printed and *df*
    is returned unchanged.
    """
    if 'postcode_agg' not in df.columns:
        print("⚠️ Column 'postcode_agg' not found. Skipping postcode validation.")
        return df

    normalized = (
        df['postcode_agg']
        .astype(str)
        .str.upper()
        .str.replace(" ", "", regex=False)
    )
    # Spaces are already removed above, so the pattern needs no optional gap.
    # na=False treats any value that is still missing as invalid.
    is_valid = normalized.str.fullmatch(r"\d{4}[A-Z]{2}", na=False).astype(bool)

    invalid_count = int((~is_valid).sum())
    print(f"🔎 Invalid postcode entries: {invalid_count}")

    return df[is_valid]

# -------------------------------
# PROCESSING
# -------------------------------
# Load the raw CSV and run the cleaning steps in order. Headers are
# normalised first because every later step looks columns up by their
# lowercase name.
df = (
    pd.read_csv(input_file, sep=",", encoding="cp1252")
    .pipe(clean_column_names)
    .pipe(clean_data)
    .pipe(remove_non_zwolle)
    .pipe(validate_postcodes)
    .pipe(validate_by_neighborhood)
)

# -------------------------------
# SUMMARY
# -------------------------------
print(f"✅ Total rows after cleaning, filtering, and validation: {len(df)}")
if 'buurtnaam' in df.columns:
    print(f"📌 Unique neighborhoods: {df['buurtnaam'].nunique()}")

# -------------------------------
# SAVE OUTPUT
# -------------------------------
# index=False keeps the pandas row index out of the CSV.
df.to_csv(output_file, index=False)
print(f"📁 Cleaned and validated file saved to: {output_file}")


  return df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


🏙️ Removed 811495 rows not from Zwolle.
🔎 Invalid postcode entries: 135
🏘️ Subsidy counts per neighborhood:
 buurtnaam
Berkum                           507
Aa-landen-Noord                  323
Wipstrik-Noord                   299
Aa-landen-Oost                   286
Gerenlanden                      281
                                ... 
Bedrijventerrein Floresstraat      6
Breecamp                           6
Oude Mars                          4
Noordereiland                      3
Bedrijventerrein Voorst-D          2
Name: count, Length: 67, dtype: int64
✅ Total rows after cleaning, filtering, and validation: 6326
📌 Unique neighborhoods: 67
📁 Cleaned and validated file saved to: ../clean_data/isde_subsidies_clean.csv


## Minimize ISDE Data
The following code saves a minimized CSV file with the following columns:
 - postcode_agg (renamed to postcode in the output)
 - wijknaam
 - buurtnaam
 - subsidiejaar
 - techniek
 - subcategorie

## Before running:
1. Update directories and file names as necessary under CONFIGURATION

In [4]:
import pandas as pd
import os

# -------------------------------
# CONFIGURATION
# -------------------------------
# Folders: the cleaned CSV is read from one, the minimized CSV goes to the other.
CLEAN_DATA_DIR = "../clean_data/"
OUTPUT_DIR = "../minimized_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

input_file = os.path.join(CLEAN_DATA_DIR, "isde_subsidies_clean.csv")
output_file = os.path.join(OUTPUT_DIR, "isde_subsidies_minimized.csv")

# -------------------------------
# COLUMNS TO KEEP
# -------------------------------
# Listed in the order they appear in the output file.
COLUMNS_TO_KEEP = [
    "postcode_agg",
    "wijknaam",
    "buurtnaam",
    "subsidiejaar",
    "techniek",
    "subcategorie",
]

# -------------------------------
# LOAD & FILTER
# -------------------------------
df = pd.read_csv(input_file)

# Fail fast, naming every required column the input lacks.
missing_cols = [name for name in COLUMNS_TO_KEEP if name not in df.columns]
if missing_cols:
    raise ValueError(f"❌ Missing columns in input file: {missing_cols}")

# Select the required columns, then expose postcode_agg as plain "postcode".
df_minimized = df.loc[:, COLUMNS_TO_KEEP]
df_minimized = df_minimized.rename(columns={"postcode_agg": "postcode"})

# -------------------------------
# OUTPUT
# -------------------------------
print(f"✅ Final dataset: {len(df_minimized)} rows")
df_minimized.to_csv(output_file, index=False)

print(f"📁 Output saved to: {output_file}")



✅ Final dataset: 6326 rows
📁 Output saved to: ../minimized_data/isde_subsidies_minimized.csv


## Before Running:
1. Update directory and file names as necessary
1. This step uses Nominatim, an open-source geocoder built on OpenStreetMap data. View the license here: https://nominatim.org

In [6]:
import pandas as pd
import os
from geopy.geocoders import Nominatim
import time 
from geopy.exc import GeocoderTimedOut

#-----------------------------------------
# CONFIGURATION
#----------------------------------------

# The minimized CSV is the input; the geocoded CSV is written to OUTPUT_DIR.
CLEAN_DATA_DIR = "../minimized_data/"
OUTPUT_DIR = "../geocoded_data/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

input_file = os.path.join(CLEAN_DATA_DIR, "isde_subsidies_minimized.csv")
output_file = os.path.join(OUTPUT_DIR, "isde_subsidies_geocoded.csv")

#----------------------------------------------
# Load the CSV and check it has the column that gets geocoded
#----------------------------------------------
df = pd.read_csv(input_file)

if 'postcode' not in df.columns:
    raise ValueError("CSV file must contain a 'postcode' column")

#-----------------------------------------
# Nominatim: open-source geocoder backed by OpenStreetMap data
#------------------------------------------
# user_agent is the identifier sent to the Nominatim service with each request.
geolocator = Nominatim(user_agent="geo_script")

# Function to get latitude and longitude with retry logic
def get_lat_lon(postcode, retries=3, delay=1):
    """Geocode one postcode within the Netherlands.

    Args:
        postcode: Query string passed to the geocoder.
        retries: Number of extra attempts after a transient failure.
        delay: Seconds to wait before the first retry; doubled on each
            further retry (exponential backoff).

    Returns:
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when the
        geocoder finds nothing or every attempt fails transiently.
    """
    # Local import: the file-level import only brings in GeocoderTimedOut.
    # An unreachable service is as transient as a timeout and must not abort
    # a run that has already geocoded many rows.
    from geopy.exc import GeocoderUnavailable

    backoff = delay
    # A loop instead of recursion, so the 1-second pause follows every
    # request rather than piling up after the last retry.
    for attempt in range(max(retries, 0) + 1):
        try:
            location = geolocator.geocode(postcode, country_codes="NL")
        except (GeocoderTimedOut, GeocoderUnavailable):
            if attempt < retries:
                time.sleep(backoff)
                backoff *= 2
        else:
            if location:
                return location.latitude, location.longitude
            return None, None
        finally:
            time.sleep(1)  # Respect rate limits
    return None, None

#----------------------------------------------
# Apply geocoding function
#-----------------------------------------------
# Rows can share a postcode, and every lookup costs at least one rate-limited
# request, so each distinct postcode is geocoded once and the result is
# reused for all rows that carry it.
postcodes = df['postcode'].astype(str).tolist()
coords_by_postcode = {pc: get_lat_lon(pc) for pc in dict.fromkeys(postcodes)}

# One (latitude, longitude) pair per row, in row order; failed lookups are
# (None, None) and end up as empty cells in the CSV.
df[['latitude', 'longitude']] = pd.DataFrame(
    [coords_by_postcode[pc] for pc in postcodes],
    columns=['latitude', 'longitude'],
    index=df.index,
)

#-----------------------------------------------
# Save GeoCoded results to a new CSV file
#-----------------------------------------------
df.to_csv(output_file, index=False)

print("Geocoding complete. Saved as isde_subsidies_geocoded.csv")


Geocoding complete. Saved as isde_subsidies_geocoded.csv


# Aggregation based on neighbourhood

This code aggregates subsidy occurrences by performing a spatial join between the geocoded subsidy data (isde_subsidies_geocoded.csv) and neighborhood boundaries (Buurtgrenzen_Zwolle.shp). It calculates the total number of applied subsidies within each neighborhood.

In [7]:
import geopandas as gpd
import pandas as pd
import os

GEOCODED_DATA_DIR = "../geocoded_data/"
NEIGHBORHOOD_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../aggregated_data/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define paths explicitly
input_file = os.path.join(GEOCODED_DATA_DIR, "isde_subsidies_geocoded.csv")
neighborhood_file = os.path.join(NEIGHBORHOOD_DATA_DIR, "Buurtgrenzen_Zwolle.shp")

# Load ISDE subsidy data. Rows without coordinates (geocoding found nothing)
# cannot fall inside any polygon, so drop them instead of building points
# from missing values.
subsidy_df = pd.read_csv(input_file)
subsidy_df = subsidy_df.dropna(subset=['latitude', 'longitude'])

# Geocoded coordinates are WGS84 longitude/latitude (EPSG:4326)
subsidy_gdf = gpd.GeoDataFrame(
    subsidy_df,
    geometry=gpd.points_from_xy(subsidy_df['longitude'], subsidy_df['latitude']),
    crs='EPSG:4326'
)

# Load neighborhoods shapefile; its CRS is read from the file itself
# (expected to be EPSG:28992 - confirm against the .prj)
neighborhoods_gdf = gpd.read_file(neighborhood_file)

# Reproject the subsidy points into the neighborhoods' CRS so both layers
# share one coordinate system for the join
subsidy_gdf = subsidy_gdf.to_crs(neighborhoods_gdf.crs)

# Spatial join: keep each subsidy point that lies within a neighborhood polygon
joined_gdf = gpd.sjoin(subsidy_gdf, neighborhoods_gdf, predicate='within')

# Count occurrences of subsidies per neighborhood
aggregated_subsidies = joined_gdf.groupby('OFFICIËLE').size().reset_index(name='subsidy_count')

# Merge aggregation results back with neighborhood geometry; the left merge
# keeps neighborhoods without any subsidy, which then get a count of 0
result_gdf = neighborhoods_gdf.merge(aggregated_subsidies, on='OFFICIËLE', how='left')
result_gdf['subsidy_count'] = result_gdf['subsidy_count'].fillna(0).astype(int)

# Save aggregated data to shapefile. Shapefile field names are limited to
# 10 characters, so 'subsidy_count' would be truncated on write; name the
# field explicitly so the stored name is deliberate.
output_path = os.path.join(OUTPUT_DIR, "Zwolle_Neighbourhood_Aggregated_ISDESubsidy.shp")
result_gdf.rename(columns={'subsidy_count': 'subsidy_co'}).to_file(output_path)

print(f"✅ Aggregated shapefile saved successfully to: {output_path}")


✅ Aggregated shapefile saved successfully to: ../aggregated_data/Zwolle_Neighbourhood_Aggregated_ISDESubsidy.shp


  result_gdf.to_file(output_path)
  ogr_write(


# Aggregation based on district

This code aggregates subsidy occurrences by performing a spatial join between the geocoded subsidy data (isde_subsidies_geocoded.csv) and district boundaries (Wijkgrenzen_Zwolle.shp). It calculates the total number of applied subsidies within each district.

In [8]:
import geopandas as gpd
import pandas as pd
import os

GEOCODED_DATA_DIR = "../geocoded_data/"
DISTRICT_DATA_DIR = "../raw_data/"
OUTPUT_DIR = "../aggregated_data/"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define paths explicitly
input_file = os.path.join(GEOCODED_DATA_DIR, "isde_subsidies_geocoded.csv")
district_file = os.path.join(DISTRICT_DATA_DIR, "Wijkgrenzen_Zwolle.shp")

# Load ISDE subsidy data. Rows without coordinates (geocoding found nothing)
# cannot fall inside any polygon, so drop them instead of building points
# from missing values.
subsidy_df = pd.read_csv(input_file)
subsidy_df = subsidy_df.dropna(subset=['latitude', 'longitude'])

# Geocoded coordinates are WGS84 longitude/latitude (EPSG:4326)
subsidy_gdf = gpd.GeoDataFrame(
    subsidy_df,
    geometry=gpd.points_from_xy(subsidy_df['longitude'], subsidy_df['latitude']),
    crs='EPSG:4326'
)

# Load districts shapefile; its CRS is read from the file itself
# (expected to be EPSG:28992 - confirm against the .prj)
districts_gdf = gpd.read_file(district_file)

# Reproject the subsidy points into the districts' CRS so both layers share
# one coordinate system for the join
subsidy_gdf = subsidy_gdf.to_crs(districts_gdf.crs)

# Spatial join: keep each subsidy point that lies within a district polygon
joined_gdf = gpd.sjoin(subsidy_gdf, districts_gdf, predicate='within')

# Count occurrences of subsidies per district
aggregated_subsidies = joined_gdf.groupby('OFFICIËLE').size().reset_index(name='subsidy_count')

# Merge aggregation results back with district geometry; the left merge keeps
# districts without any subsidy, which then get a count of 0
result_gdf = districts_gdf.merge(aggregated_subsidies, on='OFFICIËLE', how='left')
result_gdf['subsidy_count'] = result_gdf['subsidy_count'].fillna(0).astype(int)

# Save aggregated data to shapefile. Shapefile field names are limited to
# 10 characters, so 'subsidy_count' would be truncated on write; name the
# field explicitly so the stored name is deliberate.
output_path = os.path.join(OUTPUT_DIR, "Zwolle_District_Aggregated_ISDESubsidy.shp")
result_gdf.rename(columns={'subsidy_count': 'subsidy_co'}).to_file(output_path)

print(f"✅ Aggregated shapefile saved successfully to: {output_path}")


✅ Aggregated shapefile saved successfully to: ../aggregated_data/Zwolle_District_Aggregated_ISDESubsidy.shp


  result_gdf.to_file(output_path)
  ogr_write(
