# Create a set of urban regions with names and bounding boxes for the data collection

To create the set, the Urban Centre database is taken as a base: 
https://human-settlement.emergency.copernicus.eu/ucdb2024visual.php#

Specifically, the GHSL thematic data layer of the database is downloaded and processed.

## Import libraries

In [1]:
## Import libraries
# system
import os
import multiprocessing as mp
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import RLock
from dotenv import load_dotenv

# data manipulation
import json
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import box

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Ask git for the root of the working tree; `repo_dir` is used afterwards as
# the base directory for the relative data paths.
# The context manager closes the pipe even if reading from it raises.
# NOTE: when git is unavailable or this is not a git checkout, nothing is
# written to stdout and `repo_dir` ends up as an empty string.
with os.popen('git rev-parse --show-toplevel') as git_pipe:
    repo_dir = git_pipe.read().strip()

## Process the GHSL data

Read the GHSL data from the downloaded GeoPackage.

In [2]:
# Work from the repository root so that the relative path below resolves.
os.chdir(repo_dir)

# Load the GHSL thematic layer (GeoPackage) into a GeoDataFrame.
ghsl_df = gpd.read_file(
    "data/ghsl/GHS_UCDB_THEME_GHSL_GLOBE_R2024A_V1_0/"
    "GHS_UCDB_THEME_GHSL_GLOBE_R2024A.gpkg"
)

  result = read_func(


Save the data as Parquet so that it can be read faster in the future.

In [4]:
# Cache the loaded table as Parquet; the row index is not written.
ghsl_df.to_parquet(
    "data/ghsl/ghsl_data.parquet",
    index=False,
)

In [7]:
# Reload the cached Parquet copy and preview its first two rows.
ghsl_df = gpd.read_parquet(
    "data/ghsl/ghsl_data.parquet"
)
ghsl_df.iloc[:2]

Unnamed: 0,ID_UC_G0,GC_UCN_MAI_2025,GC_CNT_GAD_2025,GC_UCA_KM2_2025,GC_POP_TOT_2025,GC_DEV_WIG_2025,GC_DEV_USR_2025,GH_BUS_TOT_1975,GH_BUS_TOT_1980,GH_BUS_TOT_1985,...,GH_XST_D11_2025,GH_XST_D12_2025,GH_XST_D13_2025,GH_XST_D21_2025,GH_XST_D22_2025,GH_XST_D23_2025,GH_XST_D30_2025,GH_L30_2025,GH_W30_2025,geometry
0,1,Apia,Samoa,35,60041.65661,Lower Middle,Oceania,1984866,2284807,2600248,...,0,0,0,0,0,0,2296.099859,26.14941,8.85059,"MULTIPOLYGON (((-16906000 -1703000, -16905000 ..."
1,2,Nuku'alofa,Tonga,20,51990.7662,Upper Middle,Oceania,1700175,1921455,2142709,...,0,0,0,0,0,0,3941.202447,13.1916,6.8084,"MULTIPOLYGON (((-16819000 -2590000, -16817000 ..."


Plot the first German city in the dataset on an interactive map.

In [12]:
# Select the rows whose country is "Germany", keep only the first one and
# show it on an interactive map.
(
    ghsl_df.loc[ghsl_df["GC_CNT_GAD_2025"].eq("Germany")]
    .head(1)
    .explore()
)

Create a new dataset that contains only the region ID, the region and country names, the geometry, and a bounding box per region.

In [27]:
#copy the relevant columns and rename
ghsl_df_new = ghsl_df[["ID_UC_G0", "GC_UCN_MAI_2025", "GC_CNT_GAD_2025", "geometry"]].copy()
ghsl_df_new.rename(columns={
    "GC_UCN_MAI_2025": "region_name",
    "GC_CNT_GAD_2025": "country_name"
}, inplace=True)

# reproject to EPSG:4326
ghsl_df_new = ghsl_df_new.to_crs(epsg=4326)

# create bounding boxes
ghsl_df_new["bbox"] = ghsl_df_new["geometry"].apply(lambda geom: box(*geom.bounds))

ghsl_df_new.head(2)

Unnamed: 0,ID_UC_G0,region_name,country_name,geometry,bbox
0,1,Apia,Samoa,"MULTIPOLYGON (((-171.77356 -13.8248, -171.7633...","POLYGON ((-171.73291 -13.85752, -171.73291 -13..."
1,2,Nuku'alofa,Tonga,"MULTIPOLYGON (((-175.19374 -21.13139, -175.172...","POLYGON ((-175.16856 -21.16461, -175.16856 -21..."


In [28]:
# Draw the first region (red) on top of its bounding box (white) in a single
# interactive map.
first_region = ghsl_df_new.head(1)

# Named `bbox_map` rather than `map` so the builtin `map` is not shadowed.
# The CRS is passed explicitly because the frame is built from a bare
# geometry column; the data was reprojected to EPSG:4326 beforehand.
bbox_map = gpd.GeoDataFrame(
    geometry=first_region["bbox"],
    crs="EPSG:4326",
).explore(color="#ffffff")

# NOTE(review): `add_to_map` does not appear among the documented `explore`
# parameters; it is kept unchanged here - confirm whether it is needed.
first_region.explore(m=bbox_map, color="#ff0000", add_to_map=True)

Save the processed regions as Parquet.

In [33]:
# Create the output folder first: writing the Parquet file fails when the
# parent directory is missing (e.g. on a fresh checkout).
os.makedirs("data/processed", exist_ok=True)

# Store the processed regions; the row index is not written.
ghsl_df_new.to_parquet("data/processed/ghsl_regions.parquet", index=False)

In [35]:
# Read the processed regions back from disk and preview the first two rows.
ghsl_df_new = gpd.read_parquet(
    "data/processed/ghsl_regions.parquet"
)
ghsl_df_new.iloc[:2]

Unnamed: 0,ID_UC_G0,region_name,country_name,geometry,bbox
0,1,Apia,Samoa,"MULTIPOLYGON (((-171.77356 -13.8248, -171.7633...","POLYGON ((-171.73291 -13.85752, -171.73291 -13..."
1,2,Nuku'alofa,Tonga,"MULTIPOLYGON (((-175.19374 -21.13139, -175.172...","POLYGON ((-175.16856 -21.16461, -175.16856 -21..."


Plot the bounding box for Leipzig.

In [36]:
# Wrap the bounding box of the region named "Leipzig" in its own
# GeoDataFrame (declared as EPSG:4326) and show it on an interactive map.
bbox_gdf = gpd.GeoDataFrame(
    geometry=ghsl_df_new.loc[ghsl_df_new["region_name"].eq("Leipzig"), "bbox"],
    crs="EPSG:4326",
)

bbox_gdf.explore()