# Education data

**Objective:** <br> 
The following notebook shows how to aggregate the data on school centers by building for one or more countries, specified by their ISO alpha-3 codes. It applies different summary functions depending on the variable type (text, binary, numeric, or logical).
The process may take some time depending on dataset size. <br>

The data is available upon request in the SCL Data Lake [Geospatial Infrastructure Repository](https://scldata.iadb.org/app/folder/768E6E2A-DEFF-4BF9-B62D-E32518B15180).

Author(s): 
* Sofía Karsaclian, Consultant EDU, sofiakarsaclian@gmail.com <br>

Created: November 3, 2025

In [None]:
# Libraries
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Define base directory (adjust if needed)
# By default, the current working directory is used, so the notebook is expected to be
# run from the folder that holds one sub-folder per country.
# The expected folder structure follows the data lake structure:
#   <base_dir>/
#       ├── SLV/
#       │    ├── SLV_total.geojson
#       │    ├── SLV_total.csv
#       │    └── SLV_quality_report.html
#       ├── SUR/
#       │    ├── SUR_total.geojson
#       │    ├── SUR_total.csv
#       │    └── SUR_quality_report.html
#       └── (a 'buildings' sub-folder is created inside each country folder for the outputs)

# Point 'base_dir' at any other location if your data lives elsewhere.
base_dir = Path.cwd()  # Current working directory is the default base

In [None]:
# Define one or more countries using ISO ALPHA-3 codes.
# Example: ["ECU", "CHL", "BRA", "ARG"]
# Each code must match a folder name under 'base_dir' and the prefix of its
# '<code>_total.geojson' file.
# If you are working with the merged regional dataset, set it to ["LAC"] instead.

countries = ["SLV", "SUR"]  # Change to ["LAC"] for regional processing

In [None]:
# Columns grouped by the kind of summary they receive when rows are aggregated.
# Only the columns actually present in a country's dataset are used.

# Text columns: distinct values are combined by combine_text_values
col_text = [
    "adm0_pcode",
    "internet_tipo",
    "electricidad_tipo",
]

# Binary columns: the maximum value within the group is kept
col_binary = [
    "area",
    "internet",
    "electricidad",
    "nivel_inicial",
    "nivel_primaria",
    "nivel_secbaja",
    "nivel_secalta",
    "turno_manana",
    "turno_tarde",
    "turno_otros",
    "turno_junica",
    "turno_diurno",
]

# Count columns: values are added up within the group
col_sum = [
    "docentes_total",
    "matricula_total",
    "matricula_manana",
    "matricula_tarde",
    "matricula_otros",
    "matricula_junica",
    "matricula_inicial",
    "matricula_primaria",
    "matricula_secbaja",
    "matricula_secalta",
    "matricula_diurna",
]

# Numeric columns: values are averaged within the group
col_mean = [
    "longitud",
    "latitud",
    "tasa_promocion",
    "tasa_abandono",
    "tasa_repeticion",
    "nse_escuela",
    "calidad_escuela",
    "ideb_prim",
    "ideb_sec",
    "ideb_media",
    "calidad_secalta",
    "calidad_basica",
]

# Logical columns: True when any row in the group is True
col_logical = ["flag_manual"]

# Aggregation helper for text columns
def combine_text_values(series):
    """Collapse the values of a group into a single entry.

    Missing values are ignored. Returns NaN when nothing is left, the value
    itself when only one distinct value remains, and otherwise the distinct
    values converted to strings, sorted and joined with ", ".
    """
    distinct = series.dropna().unique().tolist()
    if not distinct:
        return np.nan
    if len(distinct) == 1:
        # A single value is returned unchanged (not converted to str)
        return distinct[0]
    as_text = [str(value) for value in distinct]
    as_text.sort()
    return ", ".join(as_text)

In [None]:
# Aggregate each country's records to one record per building.
#
# For every code in `countries` the loop reads <base_dir>/<code>/<code>_total.geojson,
# groups the rows by 'id_edificio', summarises each known column according to the
# type list it belongs to (text / binary / sum / mean / logical) and writes the
# result to <base_dir>/<code>/buildings/ as GeoJSON and CSV.


def _max_or_nan(values):
    """Return the largest non-missing value, or NaN when every value is missing."""
    return np.nanmax(values) if not values.isna().all() else np.nan


def _sum_or_nan(values):
    """Sum the non-missing values; return NaN (not 0) when every value is missing."""
    # The plain "sum" aggregation yields 0 for an all-missing group, which would
    # report missing counts as zero. min_count=1 keeps them missing instead.
    return values.sum(min_count=1)


# Summary applied to each column-type list, in the order columns are added
aggregators = [
    (col_text, combine_text_values),
    (col_binary, _max_or_nan),
    (col_sum, _sum_or_nan),
    (col_mean, "mean"),
    (col_logical, "any"),
]

for country in countries:
    print(f"Processing country: {country}")

    # Paths for input and output
    country_dir = base_dir / country
    input_path = country_dir / f"{country}_total.geojson"
    output_dir = country_dir / "buildings"

    # Read GeoJSON
    base_total = gpd.read_file(input_path)

    # Build the aggregation dictionary from the columns this dataset actually has
    cols_present = set(base_total.columns)
    agg_dict = {}
    for columns, func in aggregators:
        for col in columns:
            if col in cols_present:
                agg_dict[col] = func

    # Aggregate by 'id_edificio'. The same grouping (dropna=False) is used for the
    # attributes and the geometry so that rows with a missing 'id_edificio' are
    # kept as one group instead of being lost in the merge below.
    grouped = base_total.groupby("id_edificio", dropna=False)
    base_aggregated = grouped.agg(agg_dict).reset_index()

    # Retain geometry: representative geometry per group (first non-null)
    geom_series = grouped["geometry"].first().reset_index()
    base_aggregated = geom_series.merge(base_aggregated, on="id_edificio")

    # Convert to GeoDataFrame, keeping the CRS of the input
    base_aggregated = gpd.GeoDataFrame(base_aggregated, geometry="geometry", crs=base_total.crs)

    # Define output paths inside the country's own folder
    output_geojson = output_dir / f"{country}_aggregated_by_building.geojson"
    output_csv = output_dir / f"{country}_aggregated_by_building.csv"

    # Create the output folder only once there is something to write, so a
    # failed read does not leave an empty 'buildings' folder behind
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save outputs (the CSV carries the attributes only, without geometry)
    base_aggregated.to_file(output_geojson, driver="GeoJSON")
    base_aggregated.drop(columns="geometry").to_csv(output_csv, index=False, encoding="latin1")

    print(f"Saved files for {country} in:\n- {output_geojson}\n- {output_csv}\n")
