# Config

In [4]:
# Libraries
import pandas as pd
import geopandas as gpd
import tobler as tb
from pathlib import Path
import sys
repo_root = Path.cwd() if (Path.cwd() / "src").exists() else Path.cwd().parent
sys.path.append(str(repo_root / "src"))
from config import INTERIM_DATA_DIR,PROCESSED_DATA_DIR

In [5]:
# Params
CITY = "barcelona"
RES = 10

# Load data

In [3]:
gdf_h3 = gpd.read_parquet(INTERIM_DATA_DIR/f"{CITY}_h3_res{RES}.parquet")
gdf_h3.head()

Unnamed: 0,h3_id,geometry
0,8a394461b98ffff,"POLYGON ((430592.392 4576915.43, 430577.5 4576..."
1,8a394461b92ffff,"POLYGON ((430681.742 4577368.251, 430666.851 4..."
2,8a394461b91ffff,"POLYGON ((430637.067 4577141.842, 430622.176 4..."
3,8a394461b90ffff,"POLYGON ((430722.225 4577245.969, 430707.333 4..."
4,8a39446f4b1ffff,"POLYGON ((430607.72 4574700.8, 430592.828 4574..."


# Data management (Code)

## Points of Interest

In [4]:
poi = gpd.read_parquet(INTERIM_DATA_DIR / "overture_places.parquet")
poi.to_crs(gdf_h3.crs, inplace=True)

poi = gpd.sjoin(gdf_h3[[ "h3_id", "geometry"]],poi, how="left", predicate="contains")
results_poi = poi.groupby(["h3_id","category","geometry"]).size().reset_index(name="count")
results_poi = gpd.GeoDataFrame(results_poi, geometry="geometry", crs=gdf_h3.crs)
results_poi.head()

#

Unnamed: 0,h3_id,category,geometry,count
0,8a3944600007fff,architectural_design_service,"POLYGON ((427657.851 4583973.849, 427642.959 4...",2
1,8a3944600007fff,bakery,"POLYGON ((427657.851 4583973.849, 427642.959 4...",1
2,8a3944600007fff,beauty_salon,"POLYGON ((427657.851 4583973.849, 427642.959 4...",2
3,8a3944600007fff,christian_place_of_worship,"POLYGON ((427657.851 4583973.849, 427642.959 4...",3
4,8a3944600007fff,clothing_store,"POLYGON ((427657.851 4583973.849, 427642.959 4...",1


In [5]:
results_poi_wide = results_poi.pivot_table(
    index='h3_id', 
    columns='category', 
    values='count', 
    aggfunc='sum',
    fill_value=0
).reset_index()

results_poi_wide = gpd.GeoDataFrame(
    results_poi_wide.merge(results_poi[['h3_id', 'geometry']].drop_duplicates(), on='h3_id'),
    geometry='geometry',
    crs=results_poi.crs
)



## NDVI

In [6]:
# --- Read NDVI data and align CRS with H3 grid ---
ndvi = gpd.read_parquet(INTERIM_DATA_DIR / "ndvi_data.parquet")
ndvi.to_crs(gdf_h3.crs, inplace=True)

# --- Spatial join with H3 cells ---
ndvi = gpd.sjoin(
    gdf_h3[['h3_id', 'geometry']],
    ndvi[['NDVI', 'geometry']],
    how='left',
    predicate='contains'
)

# --- Define NDVI categories ---
def ndvi_category(ndvi_value):
    if ndvi_value < 0:
        return 'No vegetation'
    elif ndvi_value < 0.2:
        return 'Bare soil'
    elif ndvi_value < 0.5:
        return 'Low vegetation'
    elif ndvi_value < 0.7:
        return 'Moderate vegetation'
    else:
        return 'Green'

ndvi['category'] = ndvi['NDVI'].apply(ndvi_category)

# --- Compute mean NDVI per H3 cell ---
mean_ndvi = ndvi.groupby(['h3_id', 'geometry'])['NDVI'].mean().reset_index(name='mean_ndvi')

# --- Compute category percentages per H3 cell ---
total_counts = ndvi.groupby('h3_id').size().rename('total')
category_counts = ndvi.groupby(['h3_id', 'category']).size().unstack(fill_value=0)
category_percent = (category_counts.div(total_counts, axis=0) * 100).reset_index()

# --- Merge mean NDVI and category percentages ---
results_ndvi = mean_ndvi.merge(category_percent, on='h3_id', how='left')

# Make geodataframe
results_ndvi = gpd.GeoDataFrame(results_ndvi, geometry='geometry', crs=gdf_h3.crs)

results_ndvi.head()


Unnamed: 0,h3_id,geometry,mean_ndvi,Bare soil,Green,Low vegetation,Moderate vegetation,No vegetation
0,8a3944600007fff,"POLYGON ((427657.851 4583973.849, 427642.959 4...",0.078996,100.0,0.0,0.0,0.0,0.0
1,8a394460000ffff,"POLYGON ((427783.424 4583955.746, 427768.532 4...",0.078697,94.610778,0.0,5.389222,0.0,0.0
2,8a3944600017fff,"POLYGON ((427572.728 4583869.754, 427557.836 4...",0.119628,83.435583,0.0,16.564417,0.0,0.0
3,8a394460001ffff,"POLYGON ((427698.3 4583851.65, 427683.408 4583...",0.076534,96.987952,0.0,2.409639,0.0,0.60241
4,8a3944600027fff,"POLYGON ((427617.404 4584096.046, 427602.512 4...",0.071437,98.795181,0.0,1.204819,0.0,0.0


## Buildings


In [6]:
#read buildings data
buildings = gpd.read_parquet(INTERIM_DATA_DIR / "overture_buildings.parquet")
buildings.to_crs(gdf_h3.crs, inplace=True)

print(buildings["subtype"].value_counts(dropna=False))

#Spatial join with H3 grid
buildings_h3 = gpd.sjoin(buildings, gdf_h3[['h3_id', 'geometry']], how='left', predicate='intersects')

FileNotFoundError: [Errno 2] Failed to open local file '/Users/jamesmurphy/Documents/GitHub/reallocate_AI/data/interim/overture_buildings.parquet'. Detail: [errno 2] No such file or directory

In [8]:
# First, compute total area per H3
total_area = buildings_h3.groupby('h3_id')['geometry'].apply(lambda x: x.area.sum()).rename('total_area')

# Then compute subtype-specific stats
agg = buildings_h3.groupby(['h3_id', 'subtype']).agg(
    num_buildings=('geometry', 'size'),
    subtype_area=('geometry', lambda x: x.area.sum())
).reset_index()

# Merge total area to compute percentage
agg = agg.merge(total_area, on='h3_id', how='left')
agg['pct_area'] = agg['subtype_area'] / agg['total_area'] * 100

agg_pivot = agg.pivot(index='h3_id', columns='subtype', 
                      values=['num_buildings','pct_area'])

# Flatten multiindex columns
agg_pivot.columns = ['_'.join(col).strip() for col in agg_pivot.columns.values]
agg_pivot.reset_index(inplace=True)

#Drop columns that all all NaN
print(agg_pivot.shape)
for col in agg_pivot.columns:
    if agg_pivot[col].isna().all():
        agg_pivot.drop(columns=col, inplace=True)
        print(f"Dropped column {col} because all values are NaN")

#Fill NaN with 0
agg_pivot.fillna(0, inplace=True)

results_buildings = pd.merge(gdf_h3[['h3_id', 'geometry']], agg_pivot, on='h3_id', how='left')
results_buildings = gpd.GeoDataFrame(results_buildings, geometry='geometry', crs=gdf_h3.crs)
results_buildings.head()

(4252, 27)


Unnamed: 0,h3_id,geometry,num_buildings_agricultural,num_buildings_civic,num_buildings_commercial,num_buildings_education,num_buildings_entertainment,num_buildings_industrial,num_buildings_medical,num_buildings_military,...,pct_area_education,pct_area_entertainment,pct_area_industrial,pct_area_medical,pct_area_military,pct_area_outbuilding,pct_area_religious,pct_area_residential,pct_area_service,pct_area_transportation
0,8a394461b98ffff,"POLYGON ((430592.392 4576915.43, 430577.5 4576...",,,,,,,,,...,,,,,,,,,,
1,8a394461b92ffff,"POLYGON ((430681.742 4577368.251, 430666.851 4...",,,,,,,,,...,,,,,,,,,,
2,8a394461b91ffff,"POLYGON ((430637.067 4577141.842, 430622.176 4...",,,,,,,,,...,,,,,,,,,,
3,8a394461b90ffff,"POLYGON ((430722.225 4577245.969, 430707.333 4...",,,,,,,,,...,,,,,,,,,,
4,8a39446f4b1ffff,"POLYGON ((430607.72 4574700.8, 430592.828 4574...",,,,,,,,,...,,,,,,,,,,


## Land Use

In [9]:
#read buildings data
landuse = gpd.read_parquet(INTERIM_DATA_DIR / "overture_landuse.parquet")
landuse.to_crs(gdf_h3.crs, inplace=True)

#Spatial join with H3 grid
landuse_h3 = gpd.sjoin(landuse, gdf_h3[['h3_id','geometry']], how='left', predicate='intersects')


In [10]:
# Total H3 area (use gdf_h3.geometry.area)
h3_area = gdf_h3.set_index('h3_id')['geometry'].area.rename('h3_area')

# Group by H3 and subtype
agg = landuse_h3.groupby(['h3_id','confidence']).agg(
    num_polygons=('geometry','size'),
    area=('geometry', lambda x: x.area.sum()),
    avg_polygon_area=('geometry', lambda x: x.area.mean()),
    max_polygon_area=('geometry', lambda x: x.area.max())
).reset_index()

# Merge total H3 area to calculate percentage
agg = agg.merge(h3_area, on='h3_id', how='left')
agg['pct_area'] = agg['area'] / agg['h3_area'] * 100


agg_pivot = agg.pivot(index='h3_id', columns='confidence', 
                      values=['num_polygons','area','pct_area','avg_polygon_area','max_polygon_area'])

# Flatten multiindex columns
agg_pivot.columns = ['_'.join(col).strip() for col in agg_pivot.columns.values]
agg_pivot.reset_index(inplace=True)

#Drop columns that all all NaN
print(agg_pivot.shape)
for col in agg_pivot.columns:
    if agg_pivot[col].isna().all():
        agg_pivot.drop(columns=col, inplace=True)
        print(f"Dropped column {col} because all values are NaN")

#Fill NaN with 0
agg_pivot.fillna(0, inplace=True)

results_landuse = pd.merge(gdf_h3[['h3_id', 'geometry']], agg_pivot, on='h3_id', how='left')
results_landuse = gpd.GeoDataFrame(results_landuse, geometry='geometry', crs=gdf_h3.crs)
results_landuse.head()

(4173, 86)


Unnamed: 0,h3_id,geometry,num_polygons_agriculture,num_polygons_cemetery,num_polygons_construction,num_polygons_developed,num_polygons_education,num_polygons_entertainment,num_polygons_golf,num_polygons_horticulture,...,max_polygon_area_horticulture,max_polygon_area_managed,max_polygon_area_medical,max_polygon_area_military,max_polygon_area_park,max_polygon_area_pedestrian,max_polygon_area_recreation,max_polygon_area_religious,max_polygon_area_residential,max_polygon_area_transportation
0,8a394461b98ffff,"POLYGON ((430592.392 4576915.43, 430577.5 4576...",,,,,,,,,...,,,,,,,,,,
1,8a394461b92ffff,"POLYGON ((430681.742 4577368.251, 430666.851 4...",,,,,,,,,...,,,,,,,,,,
2,8a394461b91ffff,"POLYGON ((430637.067 4577141.842, 430622.176 4...",,,,,,,,,...,,,,,,,,,,
3,8a394461b90ffff,"POLYGON ((430722.225 4577245.969, 430707.333 4...",,,,,,,,,...,,,,,,,,,,
4,8a39446f4b1ffff,"POLYGON ((430607.72 4574700.8, 430592.828 4574...",,,,,,,,,...,,,,,,,,,,


## Open Data

In [None]:
if CITY == "barcelona":
    


    #Spatial join with H3 grid
    opendata_h3 = gpd.sjoin(opendata, gdf_h3[['h3_id','geometry']], how='left', predicate='intersects')


In [34]:
from tobler.area_weighted import area_interpolate
if CITY == "barcelona":
    #read opendata data
    opendata = gpd.read_parquet(INTERIM_DATA_DIR / "opendata.parquet")
    opendata.to_crs(gdf_h3.crs, inplace=True)
    # --- List your variables ---
    extensive_vars = [
        'Altres vehicles','Camions','Ciclomotors','Furgonetes','Motos','Turismes',
        'Africa','America','Asia','Europe','Oceania',
        'Female','Male',
        '5-9 years','10-14 years','15-19 years','20-24 years','25-29 years',
        '30-34 years','35-39 years','40-44 years','45-49 years','50-54 years','55-59 years',
        '60-64 years','65-69 years','70-74 years','75-79 years','80-84 years','85-89 years',
        '90-94 years','95-99 years','100+'
    ]

    # Assuming 'Import_Renda_Bruta_€' is a rate/intensive variable
    intensive_vars = ['<5 years','Import_Renda_Bruta_€']

    # --- Transfer attributes using Tobler ---
    opendata_h3 = area_interpolate(
        source_df=opendata,
        target_df=gdf_h3,
        extensive_variables=extensive_vars,
        intensive_variables=intensive_vars
    )

    # --- Result ---
    print(opendata_h3.head())

   Altres vehicles   Camions  Ciclomotors  Furgonetes     Motos  Turismes  \
0         0.368064  0.211602     0.006768    0.109284  0.055239  0.602060   
1         2.359317  1.356384     0.043384    0.700522  0.354089  3.859251   
2         0.866650  0.498242     0.015936    0.257323  0.130068  1.417622   
3         3.328435  1.913535     0.061204    0.988270  0.499535  5.444485   
4         1.531068  0.880219     0.028154    0.454601  0.229784  2.504443   

     Africa   America      Asia    Europe  ...  70-74 years  75-79 years  \
0  0.005176  0.010849  0.005773  0.121328  ...     0.008361     0.003981   
1  0.033176  0.069542  0.037004  0.777720  ...     0.053592     0.025520   
2  0.012187  0.025545  0.013593  0.285680  ...     0.019686     0.009374   
3  0.046803  0.098107  0.052204  1.097178  ...     0.075605     0.036003   
4  0.021529  0.045129  0.024014  0.504698  ...     0.034778     0.016561   

   80-84 years  85-89 years  90-94 years  95-99 years  100+  <5 years  \
0     0

## L

In [66]:
# #Main body of code
# for dataset,format in datasets.items():
#     print(f"Aggregating {dataset} into H3 resolution {RES}...")
#     # gdf = gpd.read_parquet(INTERIM_DATA_DIR/f"{CITY}_{dataset}.parquet")
#     gdf = gpd.read_parquet(INTERIM_DATA_DIR/f"{dataset}.parquet")
#     # Ensure same CRS
#     if gdf.crs is None:
#         raise ValueError(f"{dataset} has no CRS defined")
#     if gdf.crs != gdf_h3.crs:
#             gdf = gdf.to_crs(gdf_h3.crs)
#     print(gdf.crs)
#     print(gdf_h3.crs)
#     if format == "Points":
#         # Spatial join points -> H3 polygons to assign h3_id
#         try:
#             gdf_pts = gpd.sjoin(gdf, gdf_h3[['h3_id', 'geometry']], how='left', predicate='within')        
#         except TypeError:
#             gdf_pts = gpd.sjoin(gdf_h3[['h3_id', 'geometry']],gdf, how='left', op='within')

#         # Drop points not matched to any H3 cell and count points per h3_id
#         gdf_pts = gdf_pts.dropna(subset=['h3_id'])
#         gdf_agg = gdf_pts.groupby('h3_id').size().reset_index(name='count')
#     elif format == "Polygons":
#         gdf_poly = gpd.overlay(gdf_h3, gdf, how='intersection')
#         gdf_agg = gdf_poly.groupby('h3_id').agg({'some_field': 'sum'}).reset_index()
    
#     gdf_h3 = gdf_h3.merge(gdf_agg, on="h3_id", how="left")
#     # gdf_h3['count'] = gdf_h3['count'].fillna(0)
    
#     print(f"Completed aggregation for {dataset}.")

# Plots

In [3]:
# Plots and visualizations

# Save results

In [35]:
#Save results and figures

#Points of interest
results_poi_wide.to_parquet(PROCESSED_DATA_DIR/f"{CITY}_h3_res{RES}_poi.parquet", index=False)

# Results buildings
results_buildings.to_parquet(PROCESSED_DATA_DIR/f"{CITY}_h3_res{RES}_building_aggregated.parquet", index=False)

# Results buildings
results_landuse.to_parquet(PROCESSED_DATA_DIR/f"{CITY}_h3_res{RES}_landuse_aggregated.parquet", index=False)

#NDVI
results_ndvi.to_parquet(PROCESSED_DATA_DIR/f"{CITY}_h3_res{RES}_ndvi_aggregated.parquet", index=False)

#Open data
if CITY == "barcelona":
    opendata_h3.to_parquet(PROCESSED_DATA_DIR/f"{CITY}_h3_res{RES}_opendata_aggregated.parquet", index=False)