<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/9_statistics_agbd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Subdirectories

In [None]:
# Define base directory
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive
from google.colab import drive
import os
import sys

# Work out which kind of drive base_dir points at before mounting anything
_on_shared_drive = base_dir.startswith('/gdrive/Shareddrives/')
_on_personal_drive = base_dir.startswith('/content/drive/MyDrive/')

if not (_on_shared_drive or _on_personal_drive):
    # Unsupported prefix: nothing is mounted, only a hint is printed
    print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")
elif _on_shared_drive:
    drive.mount('/gdrive', force_remount=True)
else:
    # Personal drive: the base directory is created after mounting if it is missing
    drive.mount('/content/drive', force_remount=True)
    os.makedirs(base_dir, exist_ok=True)

# Make the resolved base directory importable, without adding it twice
_path_to_add = os.path.realpath(base_dir)
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Imports and upgrades
!pip install geopandas
!pip install rasterio

In [None]:
# Imports
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import geopandas as gpd
from google.colab import runtime
import json
import ipywidgets as widgets
import numpy as np
from os.path import exists, join
from os import makedirs
from osgeo import gdal, ogr, osr
gdal.UseExceptions()
import pandas as pd
import plotly.graph_objects as go
import rasterio
from rasterio import mask as msk
import re
import requests
from shutil import copyfile
from sklearn.metrics import root_mean_squared_error, r2_score

In [None]:
# Define directories
# Directories read by this notebook, all located under base_dir
areas_dir = join(base_dir, "1_areas")
polygons_dir, masks_dir = (join(areas_dir, sub) for sub in ("polygons", "masks"))
targets_dir = join(base_dir, "2_targets")
gedi_raster_final_dir, targets_pkl_final_dir = (join(targets_dir, sub) for sub in ("gedi_raster_final", "pkl_final"))
models_dir = join(base_dir, "5_models")

scenarios_dir = join(base_dir, "6_scenarios")
uncertainty_dir = join(base_dir, "7_uncertainty")
differences_dir = join(base_dir, "8_differences")

# Directories written by this notebook
statistics_agbd_dir = join(base_dir, "9_statistics_agbd")
sample_polygons_dir = join(statistics_agbd_dir, "sample_polygons")

# Create directories
for _new_dir in (statistics_agbd_dir, sample_polygons_dir):
    makedirs(_new_dir, exist_ok=True)

# Cell area raster for accurate pixel-by-pixel area calculations
cell_area_path = join(areas_dir, "cell_area.tif")

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -11111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    """Write a 2D array to a single-band GeoTIFF that copies the template's grid.

    Args:
        input_array: Array written to band 1 of the output.
        output_tif: Path of the GeoTIFF to create.
        template: Raster whose band-1 size, geotransform and projection are copied.
        nodatavalue: Value registered as nodata on the output band.
        compress: When truthy, ZSTD compression (level 1) is requested.
        dtype: GDAL data type of the output; overridden with GDT_Int16 when
            input_array has dtype int16.
    """
    template_ds = gdal.Open(template)
    template_band = template_ds.GetRasterBand(1)
    geotransform = template_ds.GetGeoTransform()
    projection = template_ds.GetProjection()
    # Good speed / size ratio
    creation_options = ['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'] if compress else []
    if input_array.dtype == 'int16':
        dtype = gdal.GDT_Int16
    output_ds = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, 1, dtype, options=creation_options)
    output_band = output_ds.GetRasterBand(1)
    output_band.WriteArray(input_array)
    output_band.SetNoDataValue(nodatavalue)
    output_ds.SetGeoTransform(geotransform)
    output_ds.SetProjection(projection)
    # Drop the GDAL references so the datasets are closed
    output_band = output_ds = template_band = template_ds = None

# Global function: Sample raster values
def sample_raster_values(pd_dataframe, raster_path, geom_x, geom_y, feature=False, n_threads=1):
    """Sample band 1 of a raster at point coordinates and store the result as a dataframe column.

    The column is named after the raster file (without directory or extension),
    prefixed with 'fea_' when `feature` is true. Only the origin and pixel-size
    terms of the geotransform are used, so a north-up raster without rotation is
    assumed. Coordinates must already be in the raster's CRS.

    Args:
        pd_dataframe: Dataframe that receives the new column (modified in place).
        raster_path: Path of the raster to sample.
        geom_x: X coordinates of the points (array-like, one per dataframe row).
        geom_y: Y coordinates of the points (array-like, same length as geom_x).
        feature: Prefix the column name with 'fea_' when true.
        n_threads: Number of worker threads; values below 1 are treated as 1.

    Points outside the raster (or with non-finite coordinates) receive the
    raster's nodata value, or NaN when the raster declares none.
    """
    # Derive column name from filename
    raster_name = os.path.splitext(os.path.basename(raster_path))[0]
    if feature: raster_name = 'fea_' + raster_name
    # Load raster and extract metadata
    raster = gdal.Open(raster_path)
    band = raster.GetRasterBand(1)
    geotransform = raster.GetGeoTransform()
    raster_array = band.ReadAsArray()
    nodata = band.GetNoDataValue()
    rows, cols = raster_array.shape
    if nodata is not None:
        fill_value, out_dtype = nodata, raster_array.dtype
    else:
        # NaN cannot be stored in an integer array, so promote to float in that case
        fill_value = np.nan
        out_dtype = raster_array.dtype if np.issubdtype(raster_array.dtype, np.floating) else np.float64
    geom_x = np.asarray(geom_x, dtype=float)
    geom_y = np.asarray(geom_y, dtype=float)
    n_points = len(geom_x)
    # Initialise output array with nodata
    sampled_values = np.full(n_points, fill_value, dtype=out_dtype)
    # Worker function for threaded sampling (each worker writes a disjoint slice)
    def sample_chunk(start, end):
        # floor, not truncation: truncating toward zero would map points just
        # left of / above the raster onto row or column 0 and sample edge pixels
        col_f = np.floor((geom_x[start:end] - geotransform[0]) / geotransform[1])
        row_f = np.floor((geom_y[start:end] - geotransform[3]) / geotransform[5])
        valid = np.isfinite(col_f) & np.isfinite(row_f) & (col_f >= 0) & (col_f < cols) & (row_f >= 0) & (row_f < rows)
        x_idx = col_f[valid].astype(np.int64)
        y_idx = row_f[valid].astype(np.int64)
        local_values = np.full(end - start, fill_value, dtype=out_dtype)
        local_values[valid] = raster_array[y_idx, x_idx]
        sampled_values[start:end] = local_values
    # Split points into chunks and process in parallel
    if n_points > 0:
        n_threads = max(1, int(n_threads))
        chunk_size = (n_points + n_threads - 1) // n_threads
        chunk_ranges = [(i, min(i + chunk_size, n_points)) for i in range(0, n_points, chunk_size)]
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            # Consume the iterator so an exception raised in a worker is re-raised here
            list(executor.map(lambda r: sample_chunk(*r), chunk_ranges))
    # Assign to dataframe and release resources
    pd_dataframe[raster_name] = sampled_values
    raster = band = None

# Model comparison

In [None]:
# Define model comparison directories
model_comparison_dir = join(statistics_agbd_dir, "model_comparison")
products_dir = join(model_comparison_dir, "products")

# Create both directories (no error if they already exist)
for _comparison_dir in (model_comparison_dir, products_dir):
    makedirs(_comparison_dir, exist_ok=True)

In [None]:
# Model comparison directories
model_comparison_dir = join(statistics_agbd_dir, "model_comparison")
products_dir = join(model_comparison_dir, "products")
makedirs(model_comparison_dir, exist_ok=True)
makedirs(products_dir, exist_ok=True)

# Configurable raster patterns
l4d_prefix, l4d_suffix = "GEDI04_D_original_epsg32648_UTM", "_agbd.tif"
l4b_filename = "GEDI04_B_original_epsg6933_EASE-Grid_MU.tif"
veg_grid_filename = "GEDI_vegetation_grid_original_epsg6933_EASE-Grid_agbd-a0-qf_20190417_20230316.tif"

# L4D validation metrics URL
l4d_gpkg_url = "https://data.ornldaac.earthdata.nasa.gov/public/gedi/GEDI_L4D_Imputed_Waveforms/comp/GEDI_L4D_20190418_20230316_validation_metrics.gpkg"

# Template for clipping
template_gdf = gpd.read_file(join(polygons_dir, "template.gpkg"))

# Locate GEDI product rasters
# 'L4D' holds a list of tile paths; 'L4B' and 'vegetation_grid' hold one path or None
gedi_products = {'L4D': [], 'L4B': None, 'vegetation_grid': None}
gedi04d_dir = join(gedi_raster_final_dir, "GEDI04_D")
gedi04b_dir = join(gedi_raster_final_dir, "GEDI04_B")
veg_dir = join(gedi_raster_final_dir, "GEDI_vegetation_grid")

if exists(gedi04d_dir):
    # os.listdir returns entries in arbitrary order. Sort the tiles so that the
    # first tile (used later as the reference grid) and the order in which tiles
    # are sampled are the same on every run.
    gedi_products['L4D'] = sorted(join(gedi04d_dir, f) for f in os.listdir(gedi04d_dir) if f.startswith(l4d_prefix) and f.endswith(l4d_suffix))
print(f"# L4D grid: {len(gedi_products['L4D'])} UTM tiles.")

if exists(gedi04b_dir) and exists(join(gedi04b_dir, l4b_filename)):
    gedi_products['L4B'] = join(gedi04b_dir, l4b_filename)
print(f"# L4B grid: {'found' if gedi_products['L4B'] else 'not found'}")

if exists(veg_dir) and exists(join(veg_dir, veg_grid_filename)):
    gedi_products['vegetation_grid'] = join(veg_dir, veg_grid_filename)
print(f"# Vegetation grid (AGBD): {'found' if gedi_products['vegetation_grid'] else 'not found'}")

# Discover model predictions
# Result layout: {model_name: {category: [sorted unique 4-digit year strings]}}
# Scenario files are named '<year>__...tif'; uncertainty files 'mean__<year>__...tif'
model_predictions = {}
prediction_sources = [(scenarios_dir, 'scenario_predictions', 'scenarios'), (uncertainty_dir, 'uncertainty_predictions', 'uncertainty')]
for source_dir, subdir_name, category in prediction_sources:
    if not exists(source_dir): continue
    for model_name in os.listdir(source_dir):
        pred_dir = join(source_dir, model_name, subdir_name)
        if not exists(pred_dir): continue
        for f in os.listdir(pred_dir):
            if not f.endswith('.tif'): continue
            parts = f.split('__')
            # Pick the token that should hold the year for this category
            if category == 'scenarios':
                year = parts[0] if len(parts) >= 2 else None
            else:
                year = parts[1] if len(parts) >= 3 and parts[0] == 'mean' else None
            if year is None or len(year) != 4 or not year.isdigit(): continue
            model_predictions.setdefault(model_name, {}).setdefault(category, set()).add(year)
for model, model_categories in model_predictions.items():
    for cat in model_categories:
        model_categories[cat] = sorted(model_categories[cat])

# Print selectable dictionary
listing_lines = ["\n# Model predictions available (comment out to exclude):", "selected_predictions = {"]
for model, categories in model_predictions.items():
    listing_lines.append(f"    '{model}': {{")
    for cat, years in categories.items():
        listing_lines.append(f"        '{cat}': [")
        listing_lines.extend(f"            '{y}'," for y in years)
        listing_lines.append("        ],")
    listing_lines.append("    },")
listing_lines.append("}")
print("\n".join(listing_lines))

In [None]:
# Selection cell: the text below mirrors the output printed by the discovery cell.
# Comment out (or delete) a year, category or model to exclude it from the comparison.
# Structure: {model_name: {'scenarios' | 'uncertainty': [year strings]}}
# L4D grid: 1 UTM tiles.
# L4B grid: found
# Vegetation grid: found

# Model predictions available (comment out to exclude):
selected_predictions = {
    'agbd_251203_161707': {
        'scenarios': [
            '2018',
            '2019',
            '2020',
            '2021',
            '2022',
            '2023',
            '2024',
        ],
        'uncertainty': [
            '2021',
            '2024',
        ],
    },
    'agbd_alpha_earth_251216_144719': {
        'scenarios': [
            '2023',
            '2024',
        ],
    },
}

In [None]:
# Process existing GEDI data: L4D area-weighted polygon accuracy and mask all GEDI rasters
# Forest mask year: 2023 used for all products as this is the final GEDI data collection year
gedi_mask_year = 2023

gpkg_raw = join(products_dir, "GEDI_L4D_validation_metrics_raw.gpkg")
gpkg_clipped = join(products_dir, "GEDI_L4D_validation_metrics_clipped.gpkg")
l4d_accuracy_path = join(model_comparison_dir, "l4d_tile_accuracy.csv")

# Download validation gpkg
if not exists(gpkg_raw):
    print("Downloading L4D validation gpkg...")
    # Stream into a temporary '.part' file and rename only after success, so an
    # interrupted download never leaves a truncated file that the exists() check
    # above would accept as complete on the next run.
    gpkg_partial = gpkg_raw + ".part"
    # timeout=(connect, read) in seconds prevents the cell hanging indefinitely
    with requests.get(l4d_gpkg_url, stream=True, timeout=(10, 300)) as response:
        response.raise_for_status()
        with open(gpkg_partial, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192): f.write(chunk)
    os.replace(gpkg_partial, gpkg_raw)
    print("  Download complete")
else: print("L4D validation gpkg exists")

# Clip to template extent
if not exists(gpkg_clipped):
    print("Clipping gpkg to template extent...")
    # Tiles are reprojected to the template CRS before clipping
    tiles_gdf = gpd.read_file(gpkg_raw).to_crs(template_gdf.crs)
    tiles_clipped = gpd.clip(tiles_gdf, template_gdf)
    tiles_clipped.to_file(gpkg_clipped, driver="GPKG")
    print(f"  Clipped to {len(tiles_clipped)} tiles")
else: print("Clipped gpkg exists")

# Calculate weighted accuracy
# The result is cached as a one-row CSV; it is only recomputed when that file is missing.
if exists(l4d_accuracy_path):
    l4d_accuracy = pd.read_csv(l4d_accuracy_path).iloc[0].to_dict()
    print(f"L4D tile accuracy loaded: RMSE={l4d_accuracy['rmse']:.4f}, rRMSE={l4d_accuracy['rrmse']:.2f}%")
elif not gedi_products['L4D']:
    # Without an L4D raster there is no reference grid to count forest pixels on
    l4d_accuracy = None
    print("No L4D rasters found: skipping L4D area-weighted polygon accuracy")
else:
    print("Calculating L4D area-weighted polygon accuracy...")
    tiles_gdf = gpd.read_file(gpkg_clipped)
    rmse_col, mean_col = "rmse_agbd", "agbd_mean_valid"

    # Load reference L4D raster
    ref_raster = gedi_products['L4D'][0]
    ref_ds = gdal.Open(ref_raster)
    ref_proj, ref_gt = ref_ds.GetProjection(), ref_ds.GetGeoTransform()
    ref_xsize, ref_ysize = ref_ds.RasterXSize, ref_ds.RasterYSize
    # Bounds as [minx, miny, maxx, maxy] derived from the geotransform
    ref_bounds = [ref_gt[0], ref_gt[3] + ref_gt[5] * ref_ysize, ref_gt[0] + ref_gt[1] * ref_xsize, ref_gt[3]]
    ref_ds = None

    # Resample forest mask to reference grid
    gedi_mask_path = join(masks_dir, f"mask_forest_{gedi_mask_year}.tif")
    mask_aligned = gdal.Warp('', gedi_mask_path, options=gdal.WarpOptions(format='MEM', dstSRS=ref_proj, outputBounds=ref_bounds, width=ref_xsize, height=ref_ysize, resampleAlg='near', dstNodata=nodatavalue))
    mask_array = mask_aligned.GetRasterBand(1).ReadAsArray()
    mask_aligned = None

    # Reproject tiles to raster CRS
    # The raster's WKT is passed directly, so this also works when the CRS
    # carries no EPSG authority code.
    tiles_reproj = tiles_gdf.to_crs(ref_proj)

    # Calculate forest pixel counts per tile
    # NOTE: pixels are counted inside each tile's bounding box, not its exact outline
    forest_counts = []
    for tile_geom in tiles_reproj.geometry:
        minx, miny, maxx, maxy = tile_geom.bounds
        col_start, col_end = max(0, int((minx - ref_gt[0]) / ref_gt[1])), min(ref_xsize, int((maxx - ref_gt[0]) / ref_gt[1]))
        row_start, row_end = max(0, int((ref_gt[3] - maxy) / abs(ref_gt[5]))), min(ref_ysize, int((ref_gt[3] - miny) / abs(ref_gt[5])))
        if col_end > col_start and row_end > row_start:
            tile_mask = mask_array[row_start:row_end, col_start:col_end]
            forest_count = int(np.sum(tile_mask == 1))
        else: forest_count = 0
        forest_counts.append(forest_count)

    tiles_gdf['forest_pixels'] = forest_counts
    # A zero mean yields +/-inf, which dropna would keep; convert it to NaN first
    tiles_gdf['rrmse'] = ((tiles_gdf[rmse_col] / tiles_gdf[mean_col]) * 100).replace([np.inf, -np.inf], np.nan)

    # Filter and aggregate
    tiles_valid = tiles_gdf[(tiles_gdf['forest_pixels'] > 0)].dropna(subset=[rmse_col, 'rrmse'])
    if tiles_valid.empty:
        # np.average would fail on zero total weight; report instead of crashing
        l4d_accuracy = None
        print("  No tiles with forest pixels and valid RMSE: L4D area-weighted accuracy not saved")
    else:
        weights = tiles_valid['forest_pixels'].values
        # RMSE is aggregated through the weighted mean of squared errors
        weighted_rmse = np.sqrt(np.average(tiles_valid[rmse_col].values**2, weights=weights))
        weighted_rrmse = np.average(tiles_valid['rrmse'].values, weights=weights)

        # Save results
        l4d_accuracy = {'source': 'L4D grid (area-weighted polygons)', 'rmse': weighted_rmse, 'rrmse': weighted_rrmse, 'n_tiles': len(tiles_valid), 'n_forest_pixels': int(weights.sum())}
        pd.DataFrame([l4d_accuracy]).to_csv(l4d_accuracy_path, index=False)
        print(f"  Tiles: {len(tiles_valid)}, Forest pixels: {weights.sum():,}")
        print(f"  RMSE={weighted_rmse:.4f}, rRMSE={weighted_rrmse:.2f}%")
        print(f"  Saved: {l4d_accuracy_path}")

# Copy and mask GEDI rasters to forest extent
print("\nCopying and masking GEDI rasters")

# Mask raster to forest extent, resampling mask if needed
def apply_forest_mask(raster_path, mask_path, output_path):
    """Write a copy of band 1 of a raster in which every non-forest pixel is set to nodata.

    The mask is warped (nearest neighbour) onto the raster's grid; pixels where
    the warped mask equals 1 are kept. Pixels that are nodata in the source
    raster are also written as the module-level `nodatavalue`, so they do not
    turn into valid data when the output is tagged with a different nodata value.

    Args:
        raster_path: Raster to mask.
        mask_path: Forest mask raster (forest pixels have the value 1).
        output_path: GeoTIFF to create. Nothing is done when it already exists.
    """
    if exists(output_path):
        print(f"  Exists: {os.path.basename(output_path)}")
        return
    raster_ds = gdal.Open(raster_path)
    raster_proj = raster_ds.GetProjection()
    raster_gt = raster_ds.GetGeoTransform()
    raster_xsize, raster_ysize = raster_ds.RasterXSize, raster_ds.RasterYSize
    # Bounds as [minx, miny, maxx, maxy] derived from the geotransform
    raster_bounds = [raster_gt[0], raster_gt[3] + raster_gt[5] * raster_ysize, raster_gt[0] + raster_gt[1] * raster_xsize, raster_gt[3]]
    mask_resampled = gdal.Warp('', mask_path, options=gdal.WarpOptions(format='MEM', dstSRS=raster_proj, outputBounds=raster_bounds, width=raster_xsize, height=raster_ysize, resampleAlg='near', dstNodata=nodatavalue))
    mask_array = mask_resampled.GetRasterBand(1).ReadAsArray()
    src_band = raster_ds.GetRasterBand(1)
    src_nodata = src_band.GetNoDataValue()
    raster_array = src_band.ReadAsArray()
    keep = mask_array == 1
    if src_nodata is not None:
        # Exclude the source's own nodata pixels (NaN needs a dedicated test)
        if np.isnan(src_nodata): keep &= ~np.isnan(raster_array)
        else: keep &= raster_array != src_nodata
    raster_array = np.where(keep, raster_array, nodatavalue)
    driver = gdal.GetDriverByName('GTiff')
    out_ds = driver.Create(output_path, raster_xsize, raster_ysize, 1, src_band.DataType, options=['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'])
    out_ds.SetGeoTransform(raster_gt)
    out_ds.SetProjection(raster_proj)
    out_band = out_ds.GetRasterBand(1)
    out_band.WriteArray(raster_array)
    out_band.SetNoDataValue(nodatavalue)
    # Drop the GDAL references so the datasets are flushed and closed
    out_band = src_band = None
    out_ds = raster_ds = mask_resampled = None
    print(f"  Created: {os.path.basename(output_path)}")

# Forest mask applied to every GEDI product
gedi_mask_path = join(masks_dir, f"mask_forest_{gedi_mask_year}.tif")

# Mask L4D rasters
l4d_masked = []
if gedi_products['L4D']:
    print("L4D rasters:")
    for raster_path in gedi_products['L4D']:
        # Output name: drop '_original' and append the mask year before the extension
        filename = os.path.basename(raster_path).replace('_original', '')
        filename = filename.replace('.tif', f'_masked_{gedi_mask_year}.tif')
        output_path = join(products_dir, filename)
        apply_forest_mask(raster_path, gedi_mask_path, output_path)
        l4d_masked.append(output_path)

# Mask the single-raster products (L4B, then vegetation_grid) with the same naming rule
_single_masked = {}
for _product_key, _heading in (('L4B', "L4B raster:"), ('vegetation_grid', "Vegetation grid raster:")):
    _single_masked[_product_key] = None
    if gedi_products[_product_key]:
        print(_heading)
        filename = os.path.basename(gedi_products[_product_key]).replace('_original', '')
        filename = filename.replace('.tif', f'_masked_{gedi_mask_year}.tif')
        output_path = join(products_dir, filename)
        apply_forest_mask(gedi_products[_product_key], gedi_mask_path, output_path)
        _single_masked[_product_key] = output_path
l4b_masked = _single_masked['L4B']
veg_masked = _single_masked['vegetation_grid']

print(f"\nMasked rasters: L4D={len(l4d_masked)}, L4B={'1' if l4b_masked else '0'}, Vegetation grid={'1' if veg_masked else '0'}")

In [None]:
# Point-based accuracy comparison
# Select the final AGBD dataset used for modelling
# Each .pkl file is printed as a ready-to-copy assignment
print("Available datasets")
_pkl_names = [entry for entry in os.listdir(targets_pkl_final_dir) if entry.endswith(".pkl")]
for _pkl_name in _pkl_names:
    print(f'original_gedi4a_dataset = "{_pkl_name}"')

In [None]:
# Point-based accuracy comparison
# Sensitivity threshold for L4B only (beam sensitivity >= threshold)
l4b_sensitivity_threshold = 0.98

original_gedi4a_dataset = "GEDI04_A.pkl"

# Load accuracy points from GEDI L4A dataset
# Only the columns needed for the comparison are kept; 'year' is derived from the timestamp
original_gedi4a_df = pd.read_pickle(join(targets_pkl_final_dir, original_gedi4a_dataset))
_accuracy_columns = ['geometry', 'timestamp', 'agbd', 'sensitivity']
accuracy_df = original_gedi4a_df[_accuracy_columns].assign(year=lambda points: pd.to_datetime(points['timestamp']).dt.year)
print(f"Loaded {len(accuracy_df)} accuracy points, years: {sorted(accuracy_df['year'].unique())}")

# Get EPSG code from raster
def get_raster_epsg(raster_path):
    """Return the EPSG code (int) of a raster's coordinate reference system.

    Args:
        raster_path: Path of a raster readable by GDAL.

    Raises:
        ValueError: The raster's CRS carries no authority code, so no EPSG code
            can be derived (previously this surfaced as an opaque TypeError
            from int(None)).
    """
    raster_ds = gdal.Open(raster_path)
    srs = osr.SpatialReference()
    srs.ImportFromWkt(raster_ds.GetProjection())
    raster_ds = None
    authority_code = srs.GetAuthorityCode(None)
    if authority_code is None:
        raise ValueError(f"No EPSG authority code found in the CRS of {raster_path}")
    return int(authority_code)

# Sample raster at point locations, reprojecting to raster CRS
def sample_raster_at_points(raster_path, points_df, geom_col='geometry'):
    """Return the raster value at every point of points_df as a numpy array.

    The points are declared as EPSG:4326, reprojected to the raster's EPSG
    code and then sampled with sample_raster_values. The returned array is in
    the row order of points_df.
    """
    raster_epsg = get_raster_epsg(raster_path)
    points_wgs84 = gpd.GeoDataFrame(points_df, geometry=geom_col, crs='EPSG:4326')
    projected_geometry = points_wgs84.to_crs(epsg=raster_epsg).geometry
    # Scratch dataframe: sample_raster_values writes its result as a single column
    sampled = pd.DataFrame(index=points_df.index)
    sample_raster_values(sampled, raster_path, projected_geometry.x.values, projected_geometry.y.values)
    return sampled.iloc[:, 0].values

# Calculate accuracy metrics
def calculate_metrics(observed, predicted):
    """Compute accuracy metrics between observed and predicted arrays.

    Both inputs are cast to float32. Returns a dict with keys 'r2', 'me'
    (mean of observed minus predicted), 'rmse', 'rrmse' (RMSE as a percentage
    of the observed mean) and 'n_points'.
    """
    obs = observed.astype('float32')
    pred = predicted.astype('float32')
    rmse = root_mean_squared_error(obs, pred)
    metrics = {}
    metrics['r2'] = r2_score(obs, pred)
    metrics['me'] = np.mean(obs - pred)
    metrics['rmse'] = rmse
    metrics['rrmse'] = (rmse / np.mean(obs)) * 100
    metrics['n_points'] = len(obs)
    return metrics

# All products use 2019-2023 GEDI points
gedi_years = [2019, 2020, 2021, 2022, 2023]

# One dict per evaluated source; turned into the comparison table at the end of the cell
results = []

# Add L4D area-weighted polygon accuracy
# Read back from the CSV cache; skipped silently when the file does not exist
if exists(l4d_accuracy_path):
    l4d_tile = pd.read_csv(l4d_accuracy_path).iloc[0].to_dict()
    results.append(l4d_tile)
    print(f"L4D grid area-weighted: RMSE={l4d_tile['rmse']:.4f}, rRMSE={l4d_tile['rrmse']:.2f}%")

# L4D point-based accuracy (no sensitivity threshold)
print("\nL4D grid point-based accuracy")
l4d_points = accuracy_df[accuracy_df['year'].isin(gedi_years)].copy()
if len(l4d_points) > 0 and l4d_masked:
    # Mosaic the tiles: each point takes the first non-nodata value in l4d_masked order
    l4d_samples = np.full(len(l4d_points), nodatavalue, dtype=float)
    for raster_path in l4d_masked:
        values = sample_raster_at_points(raster_path, l4d_points)
        # Only fill points that are still nodata and have data in this tile
        mask = (l4d_samples == nodatavalue) & (values != nodatavalue)
        l4d_samples[mask] = values[mask]
    l4d_valid_mask = l4d_samples != nodatavalue
    if l4d_valid_mask.sum() > 0:
        metrics = calculate_metrics(l4d_points['agbd'].values[l4d_valid_mask], l4d_samples[l4d_valid_mask])
        metrics['source'] = 'L4D grid (point)'
        results.append(metrics)
        print(f"  R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")
else: print("  No points or rasters available")

# L4B point-based accuracy (sensitivity threshold applied)
print(f"\nL4B grid point-based accuracy (sensitivity >= {l4b_sensitivity_threshold})")
l4b_points = accuracy_df[(accuracy_df['year'].isin(gedi_years)) & (accuracy_df['sensitivity'] >= l4b_sensitivity_threshold)].copy()
if len(l4b_points) > 0 and l4b_masked:
    l4b_samples = sample_raster_at_points(l4b_masked, l4b_points)
    # Points whose sample equals nodatavalue are excluded from the metrics
    l4b_valid_mask = l4b_samples != nodatavalue
    if l4b_valid_mask.sum() > 0:
        metrics = calculate_metrics(l4b_points['agbd'].values[l4b_valid_mask], l4b_samples[l4b_valid_mask])
        metrics['source'] = 'L4B grid (point)'
        results.append(metrics)
        print(f"  R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")
else: print("  No points or raster available")

# Vegetation grid point-based accuracy (no sensitivity threshold)
print("\nVegetation grid point-based accuracy")
if veg_masked:
    veg_points = accuracy_df[accuracy_df['year'].isin(gedi_years)].copy()
    if len(veg_points) > 0:
        veg_samples = sample_raster_at_points(veg_masked, veg_points)
        # Points whose sample equals nodatavalue are excluded from the metrics
        veg_valid_mask = veg_samples != nodatavalue
        if veg_valid_mask.sum() > 0:
            metrics = calculate_metrics(veg_points['agbd'].values[veg_valid_mask], veg_samples[veg_valid_mask])
            metrics['source'] = 'Veg grid (2019-2023)'
            results.append(metrics)
            print(f"  R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")
    else: print("  No points available")
else: print("  No raster available")

# Model predictions point-based accuracy (no sensitivity threshold, sample year+1)
print("\nModel predictions point-based accuracy")
# Human-readable label used in the 'source' column for each prediction category
category_labels = {'scenarios': 'single prediction', 'uncertainty': 'uncertainty mean'}
for model_name, categories in selected_predictions.items():
    for category, years in categories.items():
        # Scenario and uncertainty predictions live in different directory trees
        pred_dir = join(scenarios_dir if category == 'scenarios' else uncertainty_dir, model_name, 'scenario_predictions' if category == 'scenarios' else 'uncertainty_predictions')
        for year in years:
            # File naming: '<year>__<model>.tif' (scenarios) or 'mean__<year>__<model>.tif' (uncertainty)
            raster_path = join(pred_dir, f"{year}__{model_name}.tif" if category == 'scenarios' else f"mean__{year}__{model_name}.tif")
            if not exists(raster_path):
                print(f"  {model_name} {category} {year}: not found")
                continue
            # A prediction for a given year is compared with GEDI points from the following year
            sample_year = int(year) + 1
            pred_points = accuracy_df[accuracy_df['year'] == sample_year].copy()
            if len(pred_points) == 0:
                print(f"  {model_name} {category} {year}: no points for {sample_year}")
                continue
            pred_samples = sample_raster_at_points(raster_path, pred_points)
            # Points whose sample equals nodatavalue are excluded from the metrics
            pred_valid_mask = pred_samples != nodatavalue
            if pred_valid_mask.sum() > 0:
                metrics = calculate_metrics(pred_points['agbd'].values[pred_valid_mask], pred_samples[pred_valid_mask])
                metrics['source'] = f"{model_name} ({category_labels[category]} {year})"
                results.append(metrics)
                print(f"  {metrics['source']}: R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")

# Model cross-validation metrics from training
# Read from each model's model_description.json; missing keys become NaN
print("\nModel cross-validation metrics")
for model_name in selected_predictions.keys():
    model_desc_path = join(models_dir, model_name, "model_description.json")
    if not exists(model_desc_path):
        print(f"  {model_name}: not found")
        continue
    with open(model_desc_path) as f:
        model_desc = json.load(f)
    results.append({'source': f'{model_name} (cross-validation)', 'r2': model_desc.get('score_validation (r2) mean', np.nan), 'me': model_desc.get('score_validation (me) mean', np.nan), 'rmse': model_desc.get('score_validation (rmse) mean', np.nan), 'rrmse': model_desc.get('score_validation (rrmse) mean', np.nan), 'n_points': np.nan})
    print(f"  {model_name}: R²={results[-1]['r2']:.4f}, RMSE={results[-1]['rmse']:.4f}, rRMSE={results[-1]['rrmse']:.2f}%")

# Build comparison table: products/models as rows, metrics as columns
comparison_df = pd.DataFrame(results)
# Drop bookkeeping columns if present (errors='ignore' tolerates absent ones)
comparison_df = comparison_df.drop(columns=['variate', 'n_tiles', 'n_forest_pixels'], errors='ignore')
# Fixed column order; columns that no source produced are left out
col_order = ['source', 'r2', 'me', 'rmse', 'rrmse', 'n_points']
comparison_df = comparison_df[[c for c in col_order if c in comparison_df.columns]]
comparison_path = join(model_comparison_dir, "accuracy_comparison.csv")
comparison_df.to_csv(comparison_path, index=False)
print(f"\nComparison saved: {comparison_path}")
print(comparison_df.to_string(index=False))

In [None]:
# Point-based accuracy comparison

# A sensitivity threshold of 0.98 was used in Tropical Evergreen Broadleaf Tree prediction strata for GEDI 4B
# Dubayah et al (2023). GEDI L4B Gridded Aboveground Biomass Density, Version 2.1.
# ORNL Distributed Active Archive Center. https://doi.org/10.3334/ORNLDAAC/2299
# GEDI 4D did not have any threshold "because the GEDI L4D k-NN algorithm did not have the same assumptions as the GEDI L4B algorithm"
# Seo et al (2025). GEDI L4D Imputed Waveforms, Version 2.
# ORNL Distributed Active Archive Center. https://doi.org/10.3334/ORNLDAAC/2455
# The documentation for GEDI gridded vegetation indices did not mention a sensitivity threshold either.
# Burns et al (2024). Gridded GEDI Vegetation Structure Metrics and Biomass Density at Multiple Resolutions (Version 1).
# ORNL Distributed Active Archive Center. https://doi.org/10.3334/ORNLDAAC/2339
sensitivity_threshold = 0.98

original_gedi4a_dataset = "GEDI04_A.pkl"

# Load accuracy points from GEDI L4A dataset
# In this cell the sensitivity threshold is applied to every point, for all products
original_gedi4a_df = pd.read_pickle(join(targets_pkl_final_dir, original_gedi4a_dataset))
_accuracy_columns = ['geometry', 'timestamp', 'agbd', 'sensitivity']
accuracy_df = original_gedi4a_df[_accuracy_columns].assign(year=lambda points: pd.to_datetime(points['timestamp']).dt.year)
accuracy_df = accuracy_df.loc[accuracy_df['sensitivity'] >= sensitivity_threshold].copy()
print(f"Loaded {len(accuracy_df)} accuracy points (sensitivity >= {sensitivity_threshold}), years: {sorted(accuracy_df['year'].unique())}")

# Get EPSG code from raster
def get_raster_epsg(raster_path):
    """Return the EPSG code (int) of a raster's coordinate reference system.

    Args:
        raster_path: Path of a raster readable by GDAL.

    Raises:
        ValueError: The raster's CRS carries no authority code, so no EPSG code
            can be derived (previously this surfaced as an opaque TypeError
            from int(None)).
    """
    raster_ds = gdal.Open(raster_path)
    srs = osr.SpatialReference()
    srs.ImportFromWkt(raster_ds.GetProjection())
    raster_ds = None
    authority_code = srs.GetAuthorityCode(None)
    if authority_code is None:
        raise ValueError(f"No EPSG authority code found in the CRS of {raster_path}")
    return int(authority_code)

# Sample raster at point locations, reprojecting to raster CRS
def sample_raster_at_points(raster_path, points_df, geom_col='geometry'):
    """Return the raster value at every point of points_df as a numpy array.

    The points are declared as EPSG:4326, reprojected to the raster's EPSG
    code and then sampled with sample_raster_values. The returned array is in
    the row order of points_df.
    """
    raster_epsg = get_raster_epsg(raster_path)
    points_wgs84 = gpd.GeoDataFrame(points_df, geometry=geom_col, crs='EPSG:4326')
    projected_geometry = points_wgs84.to_crs(epsg=raster_epsg).geometry
    # Scratch dataframe: sample_raster_values writes its result as a single column
    sampled = pd.DataFrame(index=points_df.index)
    sample_raster_values(sampled, raster_path, projected_geometry.x.values, projected_geometry.y.values)
    return sampled.iloc[:, 0].values

# Calculate accuracy metrics
def calculate_metrics(observed, predicted):
    """Compute accuracy metrics between observed and predicted values.

    Both inputs are cast to float32 before any metric is computed.

    Returns:
        dict with keys 'r2', 'me' (mean of observed minus predicted), 'rmse',
        'rrmse' (RMSE as a percentage of the observed mean) and 'n_points'.
    """
    obs = observed.astype('float32')
    pred = predicted.astype('float32')
    rmse = root_mean_squared_error(obs, pred)
    metrics = {}
    metrics['r2'] = r2_score(obs, pred)
    metrics['me'] = np.mean(obs - pred)
    metrics['rmse'] = rmse
    metrics['rrmse'] = (rmse / np.mean(obs)) * 100
    metrics['n_points'] = len(obs)
    return metrics

# All products use 2019-2023 GEDI points
gedi_years = list(range(2019, 2024))

results = []

# Add L4D area-weighted polygon accuracy (first row of the saved table)
if exists(l4d_accuracy_path):
    l4d_tile_table = pd.read_csv(l4d_accuracy_path)
    l4d_tile = l4d_tile_table.iloc[0].to_dict()
    results.append(l4d_tile)
    print(f"L4D grid area-weighted: RMSE={l4d_tile['rmse']:.4f}, rRMSE={l4d_tile['rrmse']:.2f}%")

# L4D point-based accuracy
print("\nL4D grid point-based accuracy")
in_gedi_years = accuracy_df['year'].isin(gedi_years)
l4d_points = accuracy_df[in_gedi_years].copy()
if len(l4d_points) == 0 or not l4d_masked:
    print("  No points or rasters available")
else:
    # Start from all-nodata and let each raster fill only the points still unfilled
    l4d_samples = np.full(len(l4d_points), nodatavalue, dtype=float)
    for raster_path in l4d_masked:
        values = sample_raster_at_points(raster_path, l4d_points)
        still_unfilled = l4d_samples == nodatavalue
        has_value = values != nodatavalue
        mask = still_unfilled & has_value
        l4d_samples[mask] = values[mask]
    l4d_valid_mask = l4d_samples != nodatavalue
    if l4d_valid_mask.sum() > 0:
        l4d_observed = l4d_points['agbd'].values[l4d_valid_mask]
        metrics = calculate_metrics(l4d_observed, l4d_samples[l4d_valid_mask])
        metrics['source'] = 'L4D grid (point)'
        results.append(metrics)
        print(f"  R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")

# L4B point-based accuracy
print("\nL4B grid point-based accuracy")
l4b_points = accuracy_df[accuracy_df['year'].isin(gedi_years)].copy()
if len(l4b_points) == 0 or not l4b_masked:
    print("  No points or raster available")
else:
    l4b_samples = sample_raster_at_points(l4b_masked, l4b_points)
    l4b_valid_mask = l4b_samples != nodatavalue
    if l4b_valid_mask.sum() > 0:
        l4b_observed = l4b_points['agbd'].values[l4b_valid_mask]
        metrics = calculate_metrics(l4b_observed, l4b_samples[l4b_valid_mask])
        metrics['source'] = 'L4B grid (point)'
        results.append(metrics)
        print(f"  R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")

# Vegetation grid point-based accuracy
print("\nVegetation grid point-based accuracy")
if not veg_masked:
    print("  No raster available")
else:
    veg_points = accuracy_df[accuracy_df['year'].isin(gedi_years)].copy()
    if len(veg_points) == 0:
        print("  No points available")
    else:
        veg_samples = sample_raster_at_points(veg_masked, veg_points)
        veg_valid_mask = veg_samples != nodatavalue
        if veg_valid_mask.sum() > 0:
            veg_observed = veg_points['agbd'].values[veg_valid_mask]
            metrics = calculate_metrics(veg_observed, veg_samples[veg_valid_mask])
            metrics['source'] = 'Veg grid (2019-2023)'
            results.append(metrics)
            print(f"  R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")

# Model predictions point-based accuracy (each prediction year is compared
# with GEDI points from the following year)
print("\nModel predictions point-based accuracy")
category_labels = {'scenarios': 'single prediction', 'uncertainty': 'uncertainty mean'}
for model_name, categories in selected_predictions.items():
    for category, years in categories.items():
        is_scenario = category == 'scenarios'
        if is_scenario:
            pred_dir = join(scenarios_dir, model_name, 'scenario_predictions')
        else:
            pred_dir = join(uncertainty_dir, model_name, 'uncertainty_predictions')
        for year in years:
            if is_scenario:
                raster_filename = f"{year}__{model_name}.tif"
            else:
                raster_filename = f"mean__{year}__{model_name}.tif"
            raster_path = join(pred_dir, raster_filename)
            if not exists(raster_path):
                print(f"  {model_name} {category} {year}: not found")
                continue
            sample_year = int(year) + 1
            pred_points = accuracy_df[accuracy_df['year'] == sample_year].copy()
            if len(pred_points) == 0:
                print(f"  {model_name} {category} {year}: no points for {sample_year}")
                continue
            pred_samples = sample_raster_at_points(raster_path, pred_points)
            pred_valid_mask = pred_samples != nodatavalue
            if pred_valid_mask.sum() == 0:
                continue
            pred_observed = pred_points['agbd'].values[pred_valid_mask]
            metrics = calculate_metrics(pred_observed, pred_samples[pred_valid_mask])
            metrics['source'] = f"{model_name} ({category_labels[category]} {year})"
            results.append(metrics)
            print(f"  {metrics['source']}: R²={metrics['r2']:.4f}, RMSE={metrics['rmse']:.4f}, rRMSE={metrics['rrmse']:.2f}%, n={metrics['n_points']}")

# Model cross-validation metrics read from each model's description file
print("\nModel cross-validation metrics")
for model_name in selected_predictions:
    model_desc_path = join(models_dir, model_name, "model_description.json")
    if not exists(model_desc_path):
        print(f"  {model_name}: not found")
        continue
    with open(model_desc_path) as f:
        model_desc = json.load(f)
    # Metrics missing from the description are recorded as NaN
    cv_metrics = {'source': f'{model_name} (cross-validation)'}
    for metric_key in ('r2', 'me', 'rmse', 'rrmse'):
        cv_metrics[metric_key] = model_desc.get(f'score_validation ({metric_key}) mean', np.nan)
    cv_metrics['n_points'] = np.nan
    results.append(cv_metrics)
    print(f"  {model_name}: R²={results[-1]['r2']:.4f}, RMSE={results[-1]['rmse']:.4f}, rRMSE={results[-1]['rrmse']:.2f}%")

# Build comparison table: products/models as rows, metrics as columns
comparison_df = pd.DataFrame(results).drop(columns=['variate', 'n_tiles', 'n_forest_pixels'], errors='ignore')
col_order = ['source', 'r2', 'me', 'rmse', 'rrmse', 'n_points']
kept_columns = [c for c in col_order if c in comparison_df.columns]
comparison_df = comparison_df[kept_columns]
comparison_path = join(model_comparison_dir, "accuracy_comparison.csv")
comparison_df.to_csv(comparison_path, index=False)
print(f"\nComparison saved: {comparison_path}")
print(comparison_df.to_string(index=False))

# Select model

In [None]:
# Select whether to source predictions from scenarios_dir or uncertainty_dir
source_dir = uncertainty_dir
# source_dir = scenarios_dir

# Short label built from the text after the last underscore of the source path
source_dir_suffix = source_dir.split('_')[-1]
source_dir_name = f"{source_dir_suffix}_dir"

# Print an assignment line for each candidate model directory
candidate_models = [entry for entry in os.listdir(source_dir) if 'scenario_masks' not in entry]
for subdir in candidate_models:
  print(f"selected_model = '{subdir}'")

In [None]:
selected_model = 'agbd_251203_161707'

# Define prediction, disturbance, intactness and restoration directories
selected_model_prediction_dir = join(source_dir, selected_model)
if source_dir == scenarios_dir: prediction_raster_dir = join(selected_model_prediction_dir, 'scenario_predictions')
elif source_dir == uncertainty_dir: prediction_raster_dir = join(selected_model_prediction_dir, 'uncertainty_predictions')
# Without this, prediction_raster_dir would be undefined (or stale from an earlier run)
else: raise ValueError("source_dir must be either scenarios_dir or uncertainty_dir")
model_differences_dir = join(differences_dir, f"{selected_model}_{source_dir_name}")
disturbance_dir = join(model_differences_dir, 'disturbance')
intactness_dir = join(model_differences_dir, 'intactness')
restoration_dir = join(model_differences_dir, 'restoration')

# Report whether each input directory exists and how many entries it holds
input_dirs_to_check = [
  ('Prediction', prediction_raster_dir),
  ('Disturbance', disturbance_dir),
  ('Intactness', intactness_dir),
  ('Restoration', restoration_dir),
]
for dir_label, checked_dir in input_dirs_to_check:
  if not exists(checked_dir):
    print(f"{dir_label} directory doesn't exist yet: {checked_dir}")
    print("Try changing source directory, or re-run previous notebooks")
  else: print(f"There are {len(os.listdir(checked_dir))} rasters in {checked_dir}")

# Define model stats directory
model_statistics_agbd_dir = join(statistics_agbd_dir, f"{selected_model}_{source_dir_name}")
makedirs(model_statistics_agbd_dir, exist_ok=True)

# Define and create converted AGB directories
agb_total_raster_dir = join(model_statistics_agbd_dir, 'agb_total_rasters')
agb_total_scenario_dir = join(agb_total_raster_dir, 'scenarios')
agb_total_disturbance_dir = join(agb_total_raster_dir, 'disturbance')
agb_total_restoration_dir = join(agb_total_raster_dir, 'restoration')

for agb_total_dir in (agb_total_raster_dir, agb_total_scenario_dir, agb_total_disturbance_dir, agb_total_restoration_dir):
  makedirs(agb_total_dir, exist_ok=True)

# AGBD to AGB total rasters

In [None]:
# Converts per-hectare measurements (Mg/ha) to per-pixel totals (Mg) using cell area

# Totals can use a higher precision than Mg/ha, as ~30 m pixels are ~0.09 ha
# (900 m2), so per-pixel totals are much smaller values. Precision = 3 = 1 kg
agb_mg_precision = 3
agb_ci_mg_precision = 3

def _collect_mean_and_ci_rasters(raster_dir):
    """Return (mean_rasters, ci_rasters): sorted .tif paths found in raster_dir.

    With uncertainty_dir as the source, files containing 'mean__' are mean
    rasters and files starting with 'ci_' are confidence-interval rasters
    (other files are ignored). Otherwise every .tif counts as a mean raster.
    A directory that does not exist yields two empty lists.
    """
    mean_rasters, ci_rasters = [], []
    if exists(raster_dir):
        for filename in os.listdir(raster_dir):
            if not filename.endswith('.tif'):
                continue
            full_path = join(raster_dir, filename)
            if source_dir != uncertainty_dir: mean_rasters.append(full_path)
            elif 'mean__' in filename: mean_rasters.append(full_path)
            elif filename.startswith('ci_'): ci_rasters.append(full_path)
    # Sorting by path orders year-prefixed filenames chronologically
    return sorted(mean_rasters), sorted(ci_rasters)

def _build_ci_lookup(ci_rasters):
    """Map the matching mean raster filename (ci_{value}__ -> mean__) to each CI raster path."""
    return {re.sub(r'ci_[^_]+__', 'mean__', os.path.basename(ci_raster)): ci_raster for ci_raster in ci_rasters}

# Collect scenario, disturbance and restoration rasters
scenario_mean_rasters, scenario_ci_rasters = _collect_mean_and_ci_rasters(prediction_raster_dir)
disturbance_mean_rasters, disturbance_ci_rasters = _collect_mean_and_ci_rasters(disturbance_dir)
restoration_mean_rasters, restoration_ci_rasters = _collect_mean_and_ci_rasters(restoration_dir)

# Create lookup dictionaries for CI matching
scenario_ci_lookup = _build_ci_lookup(scenario_ci_rasters)
disturbance_ci_lookup = _build_ci_lookup(disturbance_ci_rasters)
restoration_ci_lookup = _build_ci_lookup(restoration_ci_rasters)

# Load cell area raster
cell_area = gdal.Open(cell_area_path)
cell_area_array = cell_area.ReadAsArray().astype(np.float64)
cell_area = None  # release the GDAL dataset
# Convert cell area from m2 to ha
cell_area_ha = cell_area_array / 10000

# Convert per-hectare rasters to total values using cell area.
def _load_agbd_with_valid_mask(raster_path):
    """Read a mean raster and return (array, nodata, valid_mask).

    Raises:
        ValueError: If band 1 has no nodata value, since valid pixels cannot
            then be told apart from empty ones.
    """
    agbd = gdal.Open(raster_path)
    agbd_array = agbd.ReadAsArray()
    nodata_value = agbd.GetRasterBand(1).GetNoDataValue()
    agbd = None
    if nodata_value is None:
        raise ValueError(f"No nodata value is set on band 1 of {raster_path}")
    nodata = int(nodata_value)
    return agbd_array, nodata, (agbd_array != nodata)

def _per_hectare_to_total(per_ha_array, valid_mask, nodata, precision):
    """Multiply valid per-hectare cells by cell area (ha); other cells become nodata."""
    total = np.zeros_like(per_ha_array, dtype='float64')
    total[valid_mask] = per_ha_array[valid_mask] * cell_area_ha[valid_mask]
    total[~valid_mask] = nodata
    total = np.round(total, precision)
    # NOTE(review): int16 only holds -32768..32767; check totals and nodata fit
    # before setting a precision of 0.
    if precision == 0: total = total.astype(np.int16)
    return total

def process_rasters(raster_paths, ci_lookup, output_dir, is_disturbance=False, is_restoration=False):
    """Write total AGB (Mg) rasters, and CI totals when available, for each mean raster.

    Outputs that already exist in output_dir are not regenerated.

    Args:
        raster_paths: Paths of the per-hectare mean rasters to convert.
        ci_lookup: Mean raster filename -> path of its CI raster.
        output_dir: Directory receiving 'agb_total_mg__*' and 'agb_total_ci_95_mg__*' files.
        is_disturbance: Only changes the progress label.
        is_restoration: Only changes the progress label (takes priority over is_disturbance).

    Returns:
        Number of raster paths iterated (len(raster_paths)).

    Raises:
        ValueError: If a mean raster that must be read has no nodata value.
    """
    progress_index = 0
    progress_total = len(raster_paths)
    raster_type = "Restoration" if is_restoration else ("Disturbance" if is_disturbance else "Scenario")
    progress_label = widgets.Label(f"{raster_type} rasters progress: {progress_index}/{progress_total}")
    display(progress_label)
    print(f"Processing {progress_total} {raster_type.lower()} rasters...")
    for raster_path in raster_paths:
        base_filename = os.path.basename(raster_path)

        # Extract scenario name (uncertainty filenames carry a leading 'mean__' part)
        if source_dir == uncertainty_dir: raster_name = base_filename.split('__')[1].split('.')[0]
        else: raster_name = base_filename.split('__')[0].split('.')[0]
        output_agb_mg = join(output_dir, f"agb_total_mg__{raster_name}.tif")
        # The mean raster is read at most once per iteration, and only if needed
        loaded_mean = None

        # Create total AGB raster: AGB (Mg) = AGBD (Mg/ha) × area (ha)
        if not exists(output_agb_mg):
            loaded_mean = _load_agbd_with_valid_mask(raster_path)
            agbd_array, nodata, valid_mask = loaded_mean
            total_agb_mg = _per_hectare_to_total(agbd_array, valid_mask, nodata, agb_mg_precision)
            export_array_as_tif(total_agb_mg, output_agb_mg, template=raster_path)

        # Process CI rasters if available
        if source_dir == uncertainty_dir and base_filename in ci_lookup:
            output_agb_ci_mg = join(output_dir, f"agb_total_ci_95_mg__{raster_name}.tif")
            if not exists(output_agb_ci_mg):
                ci_raster = gdal.Open(ci_lookup[base_filename])
                ci_array = ci_raster.ReadAsArray().astype(np.float64)
                ci_raster = None
                # The nodata mask always comes from the mean raster
                if loaded_mean is None:
                    loaded_mean = _load_agbd_with_valid_mask(raster_path)
                _, nodata, valid_mask = loaded_mean
                # Total CI (Mg) = CI (Mg/ha) × area (ha)
                total_agb_ci_mg = _per_hectare_to_total(ci_array, valid_mask, nodata, agb_ci_mg_precision)
                export_array_as_tif(total_agb_ci_mg, output_agb_ci_mg, template=raster_path)

        progress_index += 1
        progress_label.value = f"{raster_type} rasters progress: {progress_index}/{progress_total}"
    return progress_total

# Convert each raster group into its own output directory
scenario_count = process_rasters(
    scenario_mean_rasters, scenario_ci_lookup, agb_total_scenario_dir)
disturbance_count = process_rasters(
    disturbance_mean_rasters, disturbance_ci_lookup, agb_total_disturbance_dir,
    is_disturbance=True)
restoration_count = process_rasters(
    restoration_mean_rasters, restoration_ci_lookup, agb_total_restoration_dir,
    is_restoration=True)
processed_summary = f"Processed {scenario_count} scenario rasters, {disturbance_count} disturbance rasters and {restoration_count} restoration rasters."
print(processed_summary)

# Select sample polygons

In [None]:
# Select sample area polygons. This should be a single .gpkg with the field 'name' differentiating polygons.
# Only .gpkg files are offered: later code strips a 5-character '.gpkg' suffix
# from the selection, and other files in the directory are not valid choices.
sample_polygons = [
  geopackage for geopackage in os.listdir(sample_polygons_dir)
  if geopackage.lower().endswith('.gpkg')
]
if len(sample_polygons) == 0:
  print(f"No sample areas found. Upload .gpkg polygons to {sample_polygons_dir}")
else:
  for sample_polygon in sample_polygons: print(f"selected_sample_polygons = '{sample_polygon}'")

In [None]:
selected_sample_polygons = 'tekai_sample_polygons.gpkg'

# Read the selected polygons (note that selected_sample_polygons_dir is the .gpkg file path)
selected_sample_polygons_dir = join(sample_polygons_dir, selected_sample_polygons)
selected_sample_polygons_gpkg = gpd.read_file(selected_sample_polygons_dir)

# Statistics for this polygon set live in a directory named after the file without its last 5 characters ('.gpkg')
sample_polygons_statistics_agbd_dir = join(model_statistics_agbd_dir, selected_sample_polygons[:-5])
makedirs(sample_polygons_statistics_agbd_dir, exist_ok=True)

# Define statistic .csv directories
land_and_forest_cover_by_area_dir = join(sample_polygons_statistics_agbd_dir, 'land_and_forest_cover_by_area')
land_and_forest_cover_by_scenario_dir = join(sample_polygons_statistics_agbd_dir, 'land_and_forest_cover_by_scenario')
scenario_stats_by_area_dir = join(sample_polygons_statistics_agbd_dir, 'scenario_stats_by_area')
scenario_stats_by_scenario_dir = join(sample_polygons_statistics_agbd_dir, 'scenario_stats_by_scenario')
disturbance_stats_by_area_dir = join(sample_polygons_statistics_agbd_dir, 'disturbance_stats_by_area')
disturbance_stats_by_disturbance_dir = join(sample_polygons_statistics_agbd_dir, 'disturbance_stats_by_disturbance')
restoration_stats_by_area_dir = join(sample_polygons_statistics_agbd_dir, 'restoration_stats_by_area')
restoration_stats_by_restoration_dir = join(sample_polygons_statistics_agbd_dir, 'restoration_stats_by_restoration')
intactness_stats_dir = join(sample_polygons_statistics_agbd_dir, 'intactness')
report_statistics_agbd_dir = join(sample_polygons_statistics_agbd_dir, 'report_statistics')

# Create the statistic .csv directories
statistic_csv_dirs = (
    land_and_forest_cover_by_area_dir,
    land_and_forest_cover_by_scenario_dir,
    scenario_stats_by_area_dir,
    scenario_stats_by_scenario_dir,
    disturbance_stats_by_area_dir,
    disturbance_stats_by_disturbance_dir,
    restoration_stats_by_area_dir,
    restoration_stats_by_restoration_dir,
    intactness_stats_dir,
    report_statistics_agbd_dir,
)
for statistic_csv_dir in statistic_csv_dirs:
    makedirs(statistic_csv_dir, exist_ok=True)

# Land area and forest cover

In [None]:
# List available forest masks, which will be matched with land masks.
# The scenario name is the mask filename without 'mask_forest_' and '.tif'.
forest_masks = sorted(
    mask_file.replace('mask_forest_', '').replace('.tif', '')
    for mask_file in os.listdir(masks_dir)
    if mask_file.startswith('mask_forest_') and mask_file.endswith('.tif')
)

# Print a ready-to-paste selection list
print('selected_forest_scenarios = [')
for scenario in forest_masks:
    print(f'  "{scenario}",')
print(']')

In [None]:
# Forest mask scenarios to include in the land and forest cover statistics.
# Paste from the list printed by the previous cell. The first four characters
# of each name are used as the year when matching a land mask.
selected_forest_scenarios = [
  "1990",
  "1991",
  "1992",
  "1993",
  "1994",
  "1995",
  "1996",
  "1997",
  "1998",
  "1999",
  "2000",
  "2001",
  "2002",
  "2003",
  "2004",
  "2005",
  "2006",
  "2007",
  "2008",
  "2009",
  "2010",
  "2011",
  "2012",
  "2013",
  "2014",
  "2015",
  "2016",
  "2017",
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_no_disturbance_since_oldgrowth",
  "2021_oldgrowth_recovery",
  "2022",
  "2023",
  "2024",
  "2024_no_disturbance_since_oldgrowth",
  "2024_oldgrowth_recovery",
  "2024_road_mat_daling_deforestation",
]

In [None]:
# Forest cover statistics

# Match forest masks to land masks by year (first four characters of the scenario name)
forest_to_land_mask = {}
missing_land_masks = []

for scenario in selected_forest_scenarios:
    land_mask_name = f"mask_land_{scenario[:4]}.tif"
    land_mask_path = join(masks_dir, land_mask_name)
    if not exists(land_mask_path):
        missing_land_masks.append((scenario, land_mask_name))
        continue
    forest_to_land_mask[scenario] = land_mask_path

# Stop here if any land mask is missing
if len(missing_land_masks) > 0:
    print("Missing land masks. Return to 3_features_lcluc.ipynb to create:")
    for scenario, mask_name in missing_land_masks:
        print(f"  {mask_name} (required for {scenario})")
    raise FileNotFoundError("Land masks missing")

# Pre-allocate arrays
polygon_names = selected_sample_polygons_gpkg["name"].tolist()
n_polygons = len(polygon_names)
n_scenarios = len(selected_forest_scenarios)

polygon_area_data = np.zeros(n_polygons)
land_area_data = np.zeros((n_scenarios, n_polygons))
forest_cover_data = np.zeros((n_scenarios, n_polygons))

def _mask_class_area_ha(mask_raster_path, polygons, cell_area_masked_ha):
    """Sum the cell area (ha) of pixels equal to 1 in a mask raster clipped to polygons."""
    with rasterio.open(mask_raster_path) as mask_src:
        mask_array, _ = msk.mask(mask_src, polygons, crop=True, filled=False)
        mask_array = mask_array.astype('float64')
    # Count only pixels inside the polygons whose mask value is 1
    class_mask = (~np.ma.getmaskarray(mask_array)) & (mask_array == 1)
    class_cell_areas_ha = np.ma.array(cell_area_masked_ha.data, mask=~class_mask)
    return np.ma.sum(class_cell_areas_ha, dtype='float64')

# Progress tracking
progress_total = n_polygons * n_scenarios
progress_index = 0
progress_label = widgets.Label(f"Mask / polygon pair progress: {progress_index}/{progress_total}")
display(progress_label)

# Open cell area raster (closed automatically, even on error)
with rasterio.open(cell_area_path) as cell_area_dataset:
    for poly_idx, (index, row) in enumerate(selected_sample_polygons_gpkg.iterrows()):
        sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
        # Multi-part geometries expose .geoms; a single Polygon does not, so wrap it
        geometry_parts = getattr(sample_polygon_geometry, 'geoms', None)
        polygons = list(geometry_parts) if geometry_parts is not None else [sample_polygon_geometry]

        # Mask cell area raster to polygon
        cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)
        cell_area_masked = cell_area_masked.astype('float64')

        # Calculate polygon area in km² (cell areas are in m²)
        pixel_area_sum_m2 = np.ma.sum(cell_area_masked, dtype='float64')
        polygon_area_data[poly_idx] = pixel_area_sum_m2 / 1e6

        # Convert cell areas to hectares
        cell_area_masked_ha = cell_area_masked / 10000

        for scenario_idx, scenario in enumerate(selected_forest_scenarios):
            # Land area from the land mask matched to this scenario
            land_mask_path = forest_to_land_mask[scenario]
            land_area_data[scenario_idx, poly_idx] = _mask_class_area_ha(land_mask_path, polygons, cell_area_masked_ha)

            # Forest cover from the scenario's forest mask
            forest_mask_path = join(masks_dir, f"mask_forest_{scenario}.tif")
            forest_cover_data[scenario_idx, poly_idx] = _mask_class_area_ha(forest_mask_path, polygons, cell_area_masked_ha)

            progress_index += 1
            progress_label.value = f"Mask / polygon pair progress: {progress_index}/{progress_total}"

# Calculate percentages for all scenario / polygon pairs at once.
# Cells whose denominator is not positive stay at 0.
area_ha = polygon_area_data * 100  # km² -> ha, one value per polygon
has_area = area_ha > 0
has_land = land_area_data > 0
pct_area_land = np.divide(land_area_data, area_ha, out=np.zeros((n_scenarios, n_polygons)), where=has_area) * 100
pct_area_forest = np.divide(forest_cover_data, area_ha, out=np.zeros((n_scenarios, n_polygons)), where=has_area) * 100
pct_land_forest = np.divide(forest_cover_data, land_area_data, out=np.zeros((n_scenarios, n_polygons)), where=has_land) * 100

def _land_forest_cover_table(index_labels, index_name, area_km2, land_ha, forest_ha, pct_land, pct_forest, pct_forest_of_land):
    """Assemble one land / forest cover table with the standard column set."""
    cover_table = pd.DataFrame({
        'Area (km^2)': area_km2,
        'Land area (ha)': land_ha,
        'Forest cover (ha)': forest_ha,
        '% of area that is land': pct_land,
        '% of area that is forest': pct_forest,
        '% of land that is forest': pct_forest_of_land,
    }, index=index_labels)
    return cover_table.rename_axis(index_name)

# Generate detailed stats by area (one .csv per polygon, scenarios as rows)
for poly_idx, polygon_name in enumerate(polygon_names):
    df = _land_forest_cover_table(
        selected_forest_scenarios, 'scenario',
        polygon_area_data[poly_idx],
        land_area_data[:, poly_idx], forest_cover_data[:, poly_idx],
        pct_area_land[:, poly_idx], pct_area_forest[:, poly_idx], pct_land_forest[:, poly_idx])
    df.to_csv(join(land_and_forest_cover_by_area_dir, f'{polygon_name}.csv'))

# Generate detailed stats by scenario (one .csv per scenario, polygons as rows)
for scenario_idx, scenario in enumerate(selected_forest_scenarios):
    df = _land_forest_cover_table(
        polygon_names, 'Name',
        polygon_area_data,
        land_area_data[scenario_idx, :], forest_cover_data[scenario_idx, :],
        pct_area_land[scenario_idx, :], pct_area_forest[scenario_idx, :], pct_land_forest[scenario_idx, :])
    df.to_csv(join(land_and_forest_cover_by_scenario_dir, f'{scenario}.csv'))

# Generate summary stats (polygons as rows, a land and a forest column per scenario)
summary_columns = {'Area (km^2)': polygon_area_data}
for scenario_idx, scenario in enumerate(selected_forest_scenarios):
    summary_columns[f'{scenario} land area (ha)'] = land_area_data[scenario_idx, :]
    summary_columns[f'{scenario} forest cover (ha)'] = forest_cover_data[scenario_idx, :]
summary_land_and_forest_cover = pd.DataFrame(summary_columns, index=polygon_names).rename_axis('Name')

summary_csv_path = join(sample_polygons_statistics_agbd_dir, 'summary_land_and_forest_cover_stats.csv')
summary_land_and_forest_cover.to_csv(summary_csv_path)

print("Forest cover statistics completed.")

# Scenario AGB

In [None]:
# Create sorted list of scenarios that have an AGB total raster.
# The scenario name is the text between 'agb_total_mg__' and the first '.'.
scenarios = sorted({
    agb_total_raster.split("agb_total_mg__")[1].split('.')[0]
    for agb_total_raster in os.listdir(agb_total_scenario_dir)
    if agb_total_raster.endswith('.tif') and 'agb_total_mg__' in agb_total_raster
})

# Print a ready-to-paste list for selecting scenario predictions to calculate statistics
print('selected_scenarios = [')
for scenario in scenarios:
    print(f'  "{scenario}",')
print(']\n')

In [None]:
# Scenario predictions to include in the AGB statistics. Paste from the list
# printed by the previous cell. Each entry should have a matching
# 'agb_total_mg__{scenario}.tif' raster in agb_total_scenario_dir.
selected_scenarios = [
  "2021",
  "2021_no_disturbance_since_1993",
  "2021_no_disturbance_since_oldgrowth",
  "2021_oldgrowth_recovery",
  "2024",
  "2024_no_disturbance_since_1996",
  "2024_no_disturbance_since_1997",
  "2024_no_disturbance_since_1998",
  "2024_no_disturbance_since_1999",
  "2024_no_disturbance_since_2000",
  "2024_no_disturbance_since_2001",
  "2024_no_disturbance_since_2002",
  "2024_no_disturbance_since_2003",
  "2024_no_disturbance_since_2004",
  "2024_no_disturbance_since_2005",
  "2024_no_disturbance_since_2006",
  "2024_no_disturbance_since_2007",
  "2024_no_disturbance_since_2008",
  "2024_no_disturbance_since_2009",
  "2024_no_disturbance_since_2010",
  "2024_no_disturbance_since_2011",
  "2024_no_disturbance_since_2012",
  "2024_no_disturbance_since_2013",
  "2024_no_disturbance_since_2014",
  "2024_no_disturbance_since_2015",
  "2024_no_disturbance_since_2016",
  "2024_no_disturbance_since_2017",
  "2024_no_disturbance_since_2018",
  "2024_no_disturbance_since_2019",
  "2024_no_disturbance_since_2020",
  "2024_no_disturbance_since_2021",
  "2024_no_disturbance_since_2022",
  "2024_no_disturbance_since_2023",
  "2024_no_disturbance_since_2024",
  "2024_no_disturbance_since_oldgrowth",
  "2024_oldgrowth_recovery",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
]

In [None]:
# Build list of AGB total rasters for selected scenarios.
# The list is kept in the same order as selected_scenarios (no re-sorting, no
# skipped entries) because the statistics arrays are filled per raster and are
# later labelled with selected_scenarios as their row index.
agb_total_rasters = []
missing_agb_total_rasters = []

for scenario in selected_scenarios:
    agb_total_path = join(agb_total_scenario_dir, f"agb_total_mg__{scenario}.tif")
    if exists(agb_total_path):
        agb_total_rasters.append(agb_total_path)
    else:
        missing_agb_total_rasters.append(agb_total_path)

# Fail before the expensive loop rather than with a shape mismatch afterwards
if missing_agb_total_rasters:
    print("Missing AGB total rasters. Re-run the AGBD to AGB total rasters section, or remove these from selected_scenarios:")
    for missing_path in missing_agb_total_rasters:
        print(f"  {missing_path}")
    raise FileNotFoundError("AGB total rasters missing for selected scenarios")

# Toggle whether to generate uncertainty stats (only possible with uncertainty_dir)
generate_uncertainty_stats = (source_dir == uncertainty_dir)

# Pre-allocate arrays for statistics (rows = rasters, columns = polygons)
polygon_names = selected_sample_polygons_gpkg["name"].tolist()
n_polygons = len(polygon_names)
n_scenarios = len(agb_total_rasters)

agbd_mean_data = np.zeros((n_scenarios, n_polygons), dtype='float64')
agbd_stdev_data = np.zeros((n_scenarios, n_polygons), dtype='float64')
agb_total_data = np.zeros((n_scenarios, n_polygons), dtype='float64')

if generate_uncertainty_stats:
    agbd_mean_ci95_data = np.zeros((n_scenarios, n_polygons), dtype='float64')
    agbd_mean_uncertainty_data = np.zeros((n_scenarios, n_polygons), dtype='float64')
    agb_total_ci95_data = np.zeros((n_scenarios, n_polygons), dtype='float64')

# Open AGB total raster datasets
agb_total_datasets = {path: rasterio.open(path) for path in agb_total_rasters}

# Open AGB total CI95 datasets only if uncertainty stats are generated.
# Keys are the paths of the matching AGB total rasters.
agb_total_ci95_datasets = {}
if generate_uncertainty_stats:
    for agb_total_raster in agb_total_rasters:
        scenario_name = os.path.basename(agb_total_raster).split('agb_total_mg__')[1].split('.')[0]
        agb_total_ci95_path = join(agb_total_scenario_dir, f"agb_total_ci_95_mg__{scenario_name}.tif")
        if exists(agb_total_ci95_path):
            agb_total_ci95_datasets[agb_total_raster] = rasterio.open(agb_total_ci95_path)

# Load cell area raster once for all calculations
cell_area_dataset = rasterio.open(cell_area_path)

# Progress tracking
progress_total = n_polygons * n_scenarios
progress_index = 0
progress_label = widgets.Label(f"Raster / polygon pair progress: {progress_index}/{progress_total}")
display(progress_label)

# Compute per-polygon AGB statistics for every raster; datasets are always closed afterwards
try:
    for poly_idx, (index, row) in enumerate(selected_sample_polygons_gpkg.iterrows()):
        sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
        # Multi-part geometries expose .geoms; a single Polygon does not, so wrap it
        geometry_parts = getattr(sample_polygon_geometry, 'geoms', None)
        polygons = list(geometry_parts) if geometry_parts is not None else [sample_polygon_geometry]

        # Mask cell area raster to polygon
        cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)
        cell_area_masked = cell_area_masked.astype('float64')
        cell_area_masked_ha = cell_area_masked / 10000

        for raster_idx, agb_total_raster in enumerate(agb_total_rasters):
            # Mask AGB total raster to polygon
            agb_total = agb_total_datasets[agb_total_raster]
            agb_total_array_masked, _ = msk.mask(agb_total, polygons, crop=True, filled=False)
            agb_total_array_masked = agb_total_array_masked.astype('float64')

            # Valid pixels mask
            valid_mask = ~np.ma.getmaskarray(agb_total_array_masked)

            # Calculate area of valid pixels
            valid_cell_areas_ha = np.ma.array(cell_area_masked_ha.data, mask=~valid_mask)
            valid_area_ha = np.ma.sum(valid_cell_areas_ha, dtype='float64')

            # Sum total AGB in Mg
            agb_total_mg = np.ma.sum(agb_total_array_masked, dtype='float64')

            # Calculate statistics (zeros when the polygon holds no valid pixels)
            has_valid_pixels = not (np.ma.is_masked(agb_total_mg) or valid_area_ha <= 0)
            if not has_valid_pixels:
                agbd_mean_mg_ha = 0.0
                agbd_stdev_mg_ha = 0.0
                agb_total_tg = 0.0
            else:
                # Area-weighted mean AGBD
                agbd_mean_mg_ha = agb_total_mg / valid_area_ha

                # Back-calculate individual AGBD values for standard deviation
                agbd_values = agb_total_array_masked / cell_area_masked_ha
                valid_agbd = agbd_values[valid_mask]
                valid_areas = cell_area_masked_ha[valid_mask]

                # Area-weighted standard deviation
                variance_weighted = np.sum(valid_areas * (valid_agbd - agbd_mean_mg_ha)**2) / valid_area_ha
                agbd_stdev_mg_ha = np.sqrt(variance_weighted)

                # Convert total AGB from Mg to Tg
                agb_total_tg = agb_total_mg / 1000000

            # Store results
            agbd_mean_data[raster_idx, poly_idx] = agbd_mean_mg_ha
            agbd_stdev_data[raster_idx, poly_idx] = agbd_stdev_mg_ha
            agb_total_data[raster_idx, poly_idx] = agb_total_tg

            if generate_uncertainty_stats and agb_total_raster in agb_total_ci95_datasets:
                agb_total_ci95_raster = agb_total_ci95_datasets[agb_total_raster]
                agb_total_ci95_array_masked, _ = msk.mask(agb_total_ci95_raster, polygons, crop=True, filled=False)
                agb_total_ci95_array_masked = agb_total_ci95_array_masked.astype('float64')

                agb_total_ci95_mg = abs(np.ma.sum(agb_total_ci95_array_masked, dtype='float64'))
                # A fully masked sum is stored as 0, matching the mean statistics above
                if np.ma.is_masked(agb_total_ci95_mg):
                    agb_total_ci95_mg = 0.0

                if has_valid_pixels and abs(agb_total_mg) > 0:
                    agbd_mean_ci95_mg_ha = agb_total_ci95_mg / valid_area_ha
                    agbd_uncertainty_pct = agb_total_ci95_mg / abs(agb_total_mg) * 100
                else:
                    agbd_mean_ci95_mg_ha = 0.0
                    agbd_uncertainty_pct = 0.0

                agb_total_ci95_tg = agb_total_ci95_mg / 1000000

                agbd_mean_ci95_data[raster_idx, poly_idx] = agbd_mean_ci95_mg_ha
                agbd_mean_uncertainty_data[raster_idx, poly_idx] = agbd_uncertainty_pct
                agb_total_ci95_data[raster_idx, poly_idx] = agb_total_ci95_tg

            progress_index += 1
            progress_label.value = f"Raster / polygon pair progress: {progress_index}/{progress_total}"

finally:
    cell_area_dataset.close()
    for dataset in agb_total_datasets.values():
        dataset.close()
    for dataset in agb_total_ci95_datasets.values():
        dataset.close()

# Label the pre-allocated statistic arrays: one row per scenario, one column per polygon.
# The index is named 'scenario' so it is written out under that header in the CSVs below.
frame_labels = dict(index=selected_scenarios, columns=polygon_names)

df_agbd_mean_mg_ha = pd.DataFrame(agbd_mean_data, **frame_labels).rename_axis('scenario')
df_agbd_stdev_mg_ha = pd.DataFrame(agbd_stdev_data, **frame_labels).rename_axis('scenario')
df_agb_total_tg = pd.DataFrame(agb_total_data, **frame_labels).rename_axis('scenario')

# The CI95 arrays are only allocated when uncertainty stats are generated
if generate_uncertainty_stats:
    df_agbd_mean_ci95_mg_ha = pd.DataFrame(agbd_mean_ci95_data, **frame_labels).rename_axis('scenario')
    df_agbd_uncertainty_pct = pd.DataFrame(agbd_mean_uncertainty_data, **frame_labels).rename_axis('scenario')
    df_agb_total_ci95_tg = pd.DataFrame(agb_total_ci95_data, **frame_labels).rename_axis('scenario')

# Create stats list; its order is the column order of the per-area CSVs
df_stats_list = [df_agbd_mean_mg_ha]
if generate_uncertainty_stats:
    df_stats_list += [df_agbd_mean_ci95_mg_ha, df_agbd_uncertainty_pct]
df_stats_list += [df_agbd_stdev_mg_ha, df_agb_total_tg]
if generate_uncertainty_stats:
    df_stats_list.append(df_agb_total_ci95_tg)

# Generate summary stats: polygons as rows, one "<scenario> AGB (Tg)" column per scenario
df_agb_total_tg_t = df_agb_total_tg.transpose().rename_axis(columns="Name").add_suffix(" AGB (Tg)")
summary_components = [df_agb_total_tg_t]

# Append the matching CI95 columns when uncertainty stats are available
if generate_uncertainty_stats:
    df_agb_total_ci95_tg_t = df_agb_total_ci95_tg.transpose().rename_axis(columns="Name").add_suffix(" AGB CI95 (Tg)")
    summary_components += [df_agb_total_ci95_tg_t]

summary_csv_path = join(sample_polygons_statistics_agbd_dir, 'summary_scenario_stats.csv')
summary_scenario_stats = pd.concat(summary_components, axis=1).rename_axis(columns="Name")
summary_scenario_stats.to_csv(summary_csv_path)

# Generate detailed stats by area: one CSV per polygon, rows are scenarios
df_base = pd.DataFrame(index=selected_scenarios)
df_base.rename_axis('scenario', inplace=True)

# Pair every statistics frame with its column label explicitly. Matching frames by value
# (DataFrame.equals) mislabels a statistic whenever two frames hold identical numbers,
# e.g. mean and stdev both all zero: one column is overwritten and another never written.
stat_labels = [
    (df_agbd_mean_mg_ha, "AGBD mean (Mg / ha)"),
    (df_agbd_stdev_mg_ha, "AGBD stdev (Mg / ha)"),
    (df_agb_total_tg, "AGB total (Tg)"),
]
if generate_uncertainty_stats:
    stat_labels += [
        (df_agbd_mean_ci95_mg_ha, "AGBD CI95 (Mg / ha)"),
        (df_agbd_uncertainty_pct, "AGBD uncertainty (%)"),
        (df_agb_total_ci95_tg, "AGB total CI95 (Tg)"),
    ]

for polygon_name in polygon_names:
    df_detailed = df_base.copy()
    for df_stats in df_stats_list:
        # Identity lookup; column order still follows df_stats_list
        stat_col = next(label for frame, label in stat_labels if frame is df_stats)
        df_detailed[stat_col] = df_stats[polygon_name]
    df_detailed.to_csv(join(scenario_stats_by_area_dir, f'{polygon_name}.csv'))

# Generate detailed stats by scenario: regroup the per-area CSVs so that each scenario
# gets one CSV with a row per polygon.
scenario_frames = {}
for stats_csv in os.listdir(scenario_stats_by_area_dir):
    # Ignore anything that is not a per-area CSV (stray or hidden files in the directory)
    if not stats_csv.endswith('.csv'):
        continue
    polygon_name = stats_csv[:-4]
    stats_csv_df = pd.read_csv(join(scenario_stats_by_area_dir, stats_csv))
    for scenario in stats_csv_df['scenario'].unique():
        scenario_df = stats_csv_df[stats_csv_df['scenario'] == scenario].copy()
        scenario_df.drop('scenario', axis=1, inplace=True)
        scenario_df.insert(0, 'Name', polygon_name)
        scenario_frames.setdefault(scenario, []).append(scenario_df)

# Concatenate once per scenario instead of growing a DataFrame inside the loop
scenarios = {scenario: pd.concat(frames, ignore_index=True) for scenario, frames in scenario_frames.items()}

for scenario, scenario_df in scenarios.items():
    scenario_df.to_csv(join(scenario_stats_by_scenario_dir, f'{scenario}.csv'), index=False)

# Disturbance AGB

In [None]:
# Create list of available AGB total disturbance rasters and extract disturbances.
# The disturbance name is the part of the filename between 'agb_total_mg__' and the first '.'.
disturbances = set()
for agb_total_disturbance_raster in os.listdir(agb_total_disturbance_dir):
    if agb_total_disturbance_raster.endswith('.tif') and 'agb_total_mg__' in agb_total_disturbance_raster:
        disturbance_name = agb_total_disturbance_raster.split("agb_total_mg__")[1].split('.')[0]
        disturbances.add(disturbance_name)

disturbances = sorted(list(disturbances))

# Print a ready-to-paste 'selected_disturbances' list for the next cell
print('selected_disturbances = [')
for disturbance in disturbances:
    print(f'  "{disturbance}",')
print(']')

In [None]:
# Disturbances to calculate statistics for, in the format printed by the cell above.
# Remove or comment out entries to exclude them.
selected_disturbances = [
  "2021_deforestation_since_oldgrowth",
  "2021_degradation_from_oldgrowth_to_1992",
  "2021_degradation_since_1993",
  "2021_degradation_since_oldgrowth",
  "2024_deforestation_of_road_mat_daling_2023_deforestation",
  "2024_deforestation_of_road_mat_daling_2023_degradation",
  "2024_deforestation_since_oldgrowth",
  "2024_degradation_from_oldgrowth_to_1995",
  "2024_degradation_since_1996",
  "2024_degradation_since_oldgrowth",
  "2024_effect_of_degradation_in_1996",
  "2024_effect_of_degradation_in_1997",
  "2024_effect_of_degradation_in_1998",
  "2024_effect_of_degradation_in_1999",
  "2024_effect_of_degradation_in_2000",
  "2024_effect_of_degradation_in_2001",
  "2024_effect_of_degradation_in_2002",
  "2024_effect_of_degradation_in_2003",
  "2024_effect_of_degradation_in_2004",
  "2024_effect_of_degradation_in_2005",
  "2024_effect_of_degradation_in_2006",
  "2024_effect_of_degradation_in_2007",
  "2024_effect_of_degradation_in_2008",
  "2024_effect_of_degradation_in_2009",
  "2024_effect_of_degradation_in_2010",
  "2024_effect_of_degradation_in_2011",
  "2024_effect_of_degradation_in_2012",
  "2024_effect_of_degradation_in_2013",
  "2024_effect_of_degradation_in_2014",
  "2024_effect_of_degradation_in_2015",
  "2024_effect_of_degradation_in_2016",
  "2024_effect_of_degradation_in_2017",
  "2024_effect_of_degradation_in_2018",
  "2024_effect_of_degradation_in_2019",
  "2024_effect_of_degradation_in_2020",
  "2024_effect_of_degradation_in_2021",
  "2024_effect_of_degradation_in_2022",
  "2024_effect_of_degradation_in_2023",
  "2024_effect_of_degradation_in_2024",
]

In [None]:
# Build lists of AGB total disturbance rasters for selected disturbances
agb_total_disturbance_rasters = []

for disturbance in selected_disturbances:
    agb_total_path = join(agb_total_disturbance_dir, f"agb_total_mg__{disturbance}.tif")
    if exists(agb_total_path):
        agb_total_disturbance_rasters.append(agb_total_path)
    else:
        # Report skipped selections instead of dropping them silently
        print(f"Skipping '{disturbance}': {agb_total_path} does not exist.")

# Sort rasters by path (names start with the year, so this is chronological)
agb_total_disturbance_rasters = sorted(agb_total_disturbance_rasters)

# Uncertainty stats are only generated when the source directory is uncertainty_dir
generate_uncertainty_stats = source_dir == uncertainty_dir

# Table dimensions: one row per disturbance raster, one column per sample polygon
polygon_names = selected_sample_polygons_gpkg["name"].tolist()
n_polygons, n_disturbances = len(polygon_names), len(agb_total_disturbance_rasters)
stats_shape = (n_disturbances, n_polygons)

# Pre-allocate arrays for statistics
agbd_mean_data = np.zeros(stats_shape, dtype='float64')
agbd_stdev_data = np.zeros(stats_shape, dtype='float64')
agb_total_data = np.zeros(stats_shape, dtype='float64')

if generate_uncertainty_stats:
    agbd_mean_ci95_data = np.zeros(stats_shape, dtype='float64')
    agbd_mean_uncertainty_data = np.zeros(stats_shape, dtype='float64')
    agb_total_ci95_data = np.zeros(stats_shape, dtype='float64')

# Open every AGB total disturbance raster once, keyed by its path
agb_total_disturbance_datasets = {path: rasterio.open(path) for path in agb_total_disturbance_rasters}

# CI95 rasters sit next to the AGB totals and are keyed by the AGB total raster path;
# they are only opened when uncertainty stats are generated and the file exists
agb_total_ci95_disturbance_datasets = {}
if generate_uncertainty_stats:
    for agb_total_disturbance_raster in agb_total_disturbance_rasters:
        raster_filename = os.path.basename(agb_total_disturbance_raster)
        disturbance_name = raster_filename.split('agb_total_mg__')[1].split('.')[0]
        agb_total_ci95_path = join(agb_total_disturbance_dir, f"agb_total_ci_95_mg__{disturbance_name}.tif")
        if not exists(agb_total_ci95_path):
            continue
        agb_total_ci95_disturbance_datasets[agb_total_disturbance_raster] = rasterio.open(agb_total_ci95_path)

# A single cell area dataset serves all calculations
cell_area_dataset = rasterio.open(cell_area_path)

# Progress label counts completed raster / polygon pairs
progress_index = 0
progress_total = n_polygons * n_disturbances
progress_label = widgets.Label(f"Raster / polygon pair progress: {progress_index}/{progress_total}")
display(progress_label)

try:
    # Polygons form the outer loop so each polygon's cell-area window is masked once and
    # reused for every disturbance raster.
    for poly_idx, (index, row) in enumerate(selected_sample_polygons_gpkg.iterrows()):
        sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
        # Multi-part geometries expose .geoms; a single-part Polygon does not, so it is
        # wrapped in a list instead of raising AttributeError.
        polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

        # Mask cell area raster to polygon and convert to hectares
        # NOTE(review): the / 10000 assumes cell areas are stored in m2 - confirm
        cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)
        cell_area_masked = cell_area_masked.astype('float64')
        cell_area_masked_ha = cell_area_masked / 10000

        for raster_idx, agb_total_disturbance_raster in enumerate(agb_total_disturbance_rasters):
            # Mask AGB total disturbance raster to polygon
            agb_total_disturbance = agb_total_disturbance_datasets[agb_total_disturbance_raster]
            agb_total_array_masked, _ = msk.mask(agb_total_disturbance, polygons, crop=True, filled=False)
            agb_total_array_masked = agb_total_array_masked.astype('float64')

            # Valid pixels mask (True where the AGB raster has data inside the polygon)
            valid_mask = ~np.ma.getmaskarray(agb_total_array_masked)

            # Calculate area of valid pixels
            valid_cell_areas_ha = np.ma.array(cell_area_masked_ha.data, mask=~valid_mask)
            valid_area_ha = np.ma.sum(valid_cell_areas_ha, dtype='float64')

            # Sum total AGB in Mg
            agb_total_mg = np.ma.sum(agb_total_array_masked, dtype='float64')

            # Calculate statistics; polygons without any valid pixel report zeros
            if np.ma.is_masked(agb_total_mg) or valid_area_ha <= 0:
                agbd_mean_mg_ha = 0.0
                agbd_stdev_mg_ha = 0.0
                agb_total_tg = 0.0
            else:
                # Area-weighted mean AGBD
                agbd_mean_mg_ha = agb_total_mg / valid_area_ha

                # Back-calculate individual AGBD values for standard deviation
                agbd_values = agb_total_array_masked / cell_area_masked_ha
                valid_agbd = agbd_values[valid_mask]
                valid_areas = cell_area_masked_ha[valid_mask]

                # Area-weighted standard deviation
                variance_weighted = np.sum(valid_areas * (valid_agbd - agbd_mean_mg_ha)**2) / valid_area_ha
                agbd_stdev_mg_ha = np.sqrt(variance_weighted)

                # Convert total AGB from Mg to Tg
                agb_total_tg = agb_total_mg / 1000000

            # Store results
            agbd_mean_data[raster_idx, poly_idx] = agbd_mean_mg_ha
            agbd_stdev_data[raster_idx, poly_idx] = agbd_stdev_mg_ha
            agb_total_data[raster_idx, poly_idx] = agb_total_tg

            if generate_uncertainty_stats and agb_total_disturbance_raster in agb_total_ci95_disturbance_datasets:
                agb_total_ci95_raster = agb_total_ci95_disturbance_datasets[agb_total_disturbance_raster]
                agb_total_ci95_array_masked, _ = msk.mask(agb_total_ci95_raster, polygons, crop=True, filled=False)
                agb_total_ci95_array_masked = agb_total_ci95_array_masked.astype('float64')

                # An all-masked window sums to np.ma.masked, which would be stored as NaN;
                # report 0.0 instead, matching the other statistics for empty polygons.
                agb_total_ci95_sum = np.ma.sum(agb_total_ci95_array_masked, dtype='float64')
                agb_total_ci95_mg = 0.0 if np.ma.is_masked(agb_total_ci95_sum) else abs(agb_total_ci95_sum)

                # Relative statistics need a non-masked, non-zero total and a positive area
                has_total = (not np.ma.is_masked(agb_total_mg)) and valid_area_ha > 0 and abs(agb_total_mg) > 0
                if has_total:
                    agbd_mean_ci95_mg_ha = agb_total_ci95_mg / valid_area_ha
                    agbd_uncertainty_pct = agb_total_ci95_mg / abs(agb_total_mg) * 100
                else:
                    agbd_mean_ci95_mg_ha = 0.0
                    agbd_uncertainty_pct = 0.0

                agb_total_ci95_tg = agb_total_ci95_mg / 1000000

                agbd_mean_ci95_data[raster_idx, poly_idx] = agbd_mean_ci95_mg_ha
                agbd_mean_uncertainty_data[raster_idx, poly_idx] = agbd_uncertainty_pct
                agb_total_ci95_data[raster_idx, poly_idx] = agb_total_ci95_tg

            progress_index += 1
            progress_label.value = f"Raster / polygon pair progress: {progress_index}/{progress_total}"

finally:
    # Always release the raster handles, including when a polygon fails part-way through
    cell_area_dataset.close()
    for dataset in agb_total_disturbance_datasets.values():
        dataset.close()
    for dataset in agb_total_ci95_disturbance_datasets.values():
        dataset.close()

# Create DataFrames from pre-allocated arrays.
# The arrays were filled in the order of agb_total_disturbance_rasters (sorted, and only
# rasters that exist), so the row labels are taken from those paths. Labelling with
# selected_disturbances mislabels rows when that list is unsorted and raises a shape error
# when a selected raster is missing.
disturbance_index = [
    os.path.basename(raster_path)[len("agb_total_mg__"):-len(".tif")]
    for raster_path in agb_total_disturbance_rasters
]

df_agbd_mean_mg_ha = pd.DataFrame(agbd_mean_data, index=disturbance_index, columns=polygon_names)
df_agbd_mean_mg_ha.rename_axis('dist', inplace=True)

df_agbd_stdev_mg_ha = pd.DataFrame(agbd_stdev_data, index=disturbance_index, columns=polygon_names)
df_agbd_stdev_mg_ha.rename_axis('dist', inplace=True)

df_agb_total_tg = pd.DataFrame(agb_total_data, index=disturbance_index, columns=polygon_names)
df_agb_total_tg.rename_axis('dist', inplace=True)

if generate_uncertainty_stats:
    df_agbd_mean_ci95_mg_ha = pd.DataFrame(agbd_mean_ci95_data, index=disturbance_index, columns=polygon_names)
    df_agbd_mean_ci95_mg_ha.rename_axis('dist', inplace=True)

    df_agbd_uncertainty_pct = pd.DataFrame(agbd_mean_uncertainty_data, index=disturbance_index, columns=polygon_names)
    df_agbd_uncertainty_pct.rename_axis('dist', inplace=True)

    df_agb_total_ci95_tg = pd.DataFrame(agb_total_ci95_data, index=disturbance_index, columns=polygon_names)
    df_agb_total_ci95_tg.rename_axis('dist', inplace=True)

# Create stats list; its order is the column order of the per-area CSVs
if generate_uncertainty_stats:
    df_stats_list = [df_agbd_mean_mg_ha, df_agbd_mean_ci95_mg_ha, df_agbd_uncertainty_pct,
                     df_agbd_stdev_mg_ha, df_agb_total_tg, df_agb_total_ci95_tg]
else:
    df_stats_list = [df_agbd_mean_mg_ha, df_agbd_stdev_mg_ha, df_agb_total_tg]

# Generate summary stats: polygons as rows, one "<disturbance> AGB (Tg)" column per disturbance
df_agb_total_tg_t = df_agb_total_tg.transpose().rename_axis(columns="Name").add_suffix(" AGB (Tg)")
summary_components = [df_agb_total_tg_t]

# Append the matching CI95 columns when uncertainty stats are available
if generate_uncertainty_stats:
    df_agb_total_ci95_tg_t = df_agb_total_ci95_tg.transpose().rename_axis(columns="Name").add_suffix(" AGB CI95 (Tg)")
    summary_components += [df_agb_total_ci95_tg_t]

summary_csv_path = join(sample_polygons_statistics_agbd_dir, 'summary_disturbance_stats.csv')
summary_disturbance_stats = pd.concat(summary_components, axis=1).rename_axis(columns="Name")
summary_disturbance_stats.to_csv(summary_csv_path)

# Generate detailed stats by area: one CSV per polygon, rows are disturbances
df_base = pd.DataFrame(index=selected_disturbances)
df_base.rename_axis('dist', inplace=True)

# Pair every statistics frame with its column label explicitly. Matching frames by value
# (DataFrame.equals) mislabels a statistic whenever two frames hold identical numbers,
# e.g. mean and stdev both all zero: one column is overwritten and another never written.
stat_labels = [
    (df_agbd_mean_mg_ha, "AGBD mean (Mg / ha)"),
    (df_agbd_stdev_mg_ha, "AGBD stdev (Mg / ha)"),
    (df_agb_total_tg, "AGB total (Tg)"),
]
if generate_uncertainty_stats:
    stat_labels += [
        (df_agbd_mean_ci95_mg_ha, "AGBD CI95 (Mg / ha)"),
        (df_agbd_uncertainty_pct, "AGBD uncertainty (%)"),
        (df_agb_total_ci95_tg, "AGB total CI95 (Tg)"),
    ]

for polygon_name in polygon_names:
    df_detailed = df_base.copy()
    for df_stats in df_stats_list:
        # Identity lookup; column order still follows df_stats_list
        stat_col = next(label for frame, label in stat_labels if frame is df_stats)
        df_detailed[stat_col] = df_stats[polygon_name]
    df_detailed.to_csv(join(disturbance_stats_by_area_dir, f'{polygon_name}.csv'))

# Generate detailed stats by disturbance: regroup the per-area CSVs so that each
# disturbance gets one CSV with a row per polygon.
disturbance_frames = {}
for stats_csv in os.listdir(disturbance_stats_by_area_dir):
    # Ignore anything that is not a per-area CSV (stray or hidden files in the directory)
    if not stats_csv.endswith('.csv'):
        continue
    polygon_name = stats_csv[:-4]
    stats_csv_df = pd.read_csv(join(disturbance_stats_by_area_dir, stats_csv))
    for disturbance in stats_csv_df['dist'].unique():
        disturbance_df = stats_csv_df[stats_csv_df['dist'] == disturbance].copy()
        disturbance_df.drop('dist', axis=1, inplace=True)
        disturbance_df.insert(0, 'Name', polygon_name)
        disturbance_frames.setdefault(disturbance, []).append(disturbance_df)

# Concatenate once per disturbance instead of growing a DataFrame inside the loop
disturbances = {
    disturbance: pd.concat(frames, ignore_index=True)
    for disturbance, frames in disturbance_frames.items()
}

# The filename must come from the loop variable so each disturbance gets its own CSV
for disturbance, disturbance_df in disturbances.items():
    disturbance_df.to_csv(join(disturbance_stats_by_disturbance_dir, f'{disturbance}.csv'), index=False)

# Restoration AGB

In [None]:
# Collect the restoration names of all AGB total rasters in the restoration directory.
# The name is the part of the filename between 'agb_total_mg__' and the first '.'.
restorations = sorted({
    raster_filename.split("agb_total_mg__")[1].split('.')[0]
    for raster_filename in os.listdir(agb_total_restoration_dir)
    if raster_filename.endswith('.tif') and 'agb_total_mg__' in raster_filename
})

# Print a ready-to-paste 'selected_restorations' list for the next cell
print('selected_restorations = [')
for rest in restorations:
    print(f'  "{rest}",')
print(']')

In [None]:
# Restorations to calculate statistics for, in the format printed by the cell above.
# Remove or comment out entries to exclude them.
selected_restorations = [
  "2021_recovery_potential",
  "2021_reforestation_and_recovery_potential",
  "2024_recovery_potential",
  "2024_reforestation_and_recovery_potential",
]

In [None]:
# Keep the AGB total raster path of every selected restoration whose file exists
agb_total_restoration_rasters = []

for rest in selected_restorations:
    agb_total_path = join(agb_total_restoration_dir, f"agb_total_mg__{rest}.tif")
    if not exists(agb_total_path):
        continue
    agb_total_restoration_rasters.append(agb_total_path)

# Order by path (names start with the year, so this is chronological)
agb_total_restoration_rasters.sort()

# Uncertainty stats are only generated when the source directory is uncertainty_dir
generate_uncertainty_stats = source_dir == uncertainty_dir

# Table dimensions: one row per restoration raster, one column per sample polygon
polygon_names = selected_sample_polygons_gpkg["name"].tolist()
n_polygons, n_restorations = len(polygon_names), len(agb_total_restoration_rasters)
stats_shape = (n_restorations, n_polygons)

# Pre-allocate arrays for statistics
agbd_mean_data = np.zeros(stats_shape, dtype='float64')
agbd_stdev_data = np.zeros(stats_shape, dtype='float64')
agb_total_data = np.zeros(stats_shape, dtype='float64')

if generate_uncertainty_stats:
    agbd_mean_ci95_data = np.zeros(stats_shape, dtype='float64')
    agbd_mean_uncertainty_data = np.zeros(stats_shape, dtype='float64')
    agb_total_ci95_data = np.zeros(stats_shape, dtype='float64')

# Open every AGB total restoration raster once, keyed by its path
agb_total_restoration_datasets = {path: rasterio.open(path) for path in agb_total_restoration_rasters}

# CI95 rasters sit next to the AGB totals and are keyed by the AGB total raster path;
# they are only opened when uncertainty stats are generated and the file exists
agb_total_ci95_restoration_datasets = {}
if generate_uncertainty_stats:
    for agb_total_restoration_raster in agb_total_restoration_rasters:
        raster_filename = os.path.basename(agb_total_restoration_raster)
        restoration_name = raster_filename.split('agb_total_mg__')[1].split('.')[0]
        agb_total_ci95_path = join(agb_total_restoration_dir, f"agb_total_ci_95_mg__{restoration_name}.tif")
        if not exists(agb_total_ci95_path):
            continue
        agb_total_ci95_restoration_datasets[agb_total_restoration_raster] = rasterio.open(agb_total_ci95_path)

# A single cell area dataset serves all calculations
cell_area_dataset = rasterio.open(cell_area_path)

# Progress label counts completed raster / polygon pairs
progress_index = 0
progress_total = n_polygons * n_restorations
progress_label = widgets.Label(f"Raster / polygon pair progress: {progress_index}/{progress_total}")
display(progress_label)

try:
    # Polygons form the outer loop so each polygon's cell-area window is masked once and
    # reused for every restoration raster.
    for poly_idx, (index, row) in enumerate(selected_sample_polygons_gpkg.iterrows()):
        sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
        # Multi-part geometries expose .geoms; a single-part Polygon does not, so it is
        # wrapped in a list instead of raising AttributeError.
        polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

        # Mask cell area raster to polygon and convert to hectares
        # NOTE(review): the / 10000 assumes cell areas are stored in m2 - confirm
        cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)
        cell_area_masked = cell_area_masked.astype('float64')
        cell_area_masked_ha = cell_area_masked / 10000

        for raster_idx, agb_total_restoration_raster in enumerate(agb_total_restoration_rasters):
            # Mask AGB total restoration raster to polygon
            agb_total_restoration = agb_total_restoration_datasets[agb_total_restoration_raster]
            agb_total_array_masked, _ = msk.mask(agb_total_restoration, polygons, crop=True, filled=False)
            agb_total_array_masked = agb_total_array_masked.astype('float64')

            # Valid pixels mask (True where the AGB raster has data inside the polygon)
            valid_mask = ~np.ma.getmaskarray(agb_total_array_masked)

            # Calculate area of valid pixels
            valid_cell_areas_ha = np.ma.array(cell_area_masked_ha.data, mask=~valid_mask)
            valid_area_ha = np.ma.sum(valid_cell_areas_ha, dtype='float64')

            # Sum total AGB in Mg (positive values for restoration)
            agb_total_mg = np.ma.sum(agb_total_array_masked, dtype='float64')

            # Calculate statistics; polygons without any valid pixel report zeros
            if np.ma.is_masked(agb_total_mg) or valid_area_ha <= 0:
                agbd_mean_mg_ha = 0.0
                agbd_stdev_mg_ha = 0.0
                agb_total_tg = 0.0
            else:
                # Area-weighted mean AGBD
                agbd_mean_mg_ha = agb_total_mg / valid_area_ha

                # Back-calculate individual AGBD values for standard deviation
                agbd_values = agb_total_array_masked / cell_area_masked_ha
                valid_agbd = agbd_values[valid_mask]
                valid_areas = cell_area_masked_ha[valid_mask]

                # Area-weighted standard deviation
                variance_weighted = np.sum(valid_areas * (valid_agbd - agbd_mean_mg_ha)**2) / valid_area_ha
                agbd_stdev_mg_ha = np.sqrt(variance_weighted)

                # Convert total AGB from Mg to Tg
                agb_total_tg = agb_total_mg / 1000000

            # Store results
            agbd_mean_data[raster_idx, poly_idx] = agbd_mean_mg_ha
            agbd_stdev_data[raster_idx, poly_idx] = agbd_stdev_mg_ha
            agb_total_data[raster_idx, poly_idx] = agb_total_tg

            if generate_uncertainty_stats and agb_total_restoration_raster in agb_total_ci95_restoration_datasets:
                agb_total_ci95_raster = agb_total_ci95_restoration_datasets[agb_total_restoration_raster]
                agb_total_ci95_array_masked, _ = msk.mask(agb_total_ci95_raster, polygons, crop=True, filled=False)
                agb_total_ci95_array_masked = agb_total_ci95_array_masked.astype('float64')

                # An all-masked window sums to np.ma.masked, which would be stored as NaN;
                # report 0.0 instead, matching the other statistics for empty polygons.
                agb_total_ci95_sum = np.ma.sum(agb_total_ci95_array_masked, dtype='float64')
                agb_total_ci95_mg = 0.0 if np.ma.is_masked(agb_total_ci95_sum) else abs(agb_total_ci95_sum)

                # Relative statistics need a non-masked, non-zero total and a positive area
                has_total = (not np.ma.is_masked(agb_total_mg)) and valid_area_ha > 0 and abs(agb_total_mg) > 0
                if has_total:
                    agbd_mean_ci95_mg_ha = agb_total_ci95_mg / valid_area_ha
                    agbd_uncertainty_pct = agb_total_ci95_mg / abs(agb_total_mg) * 100
                else:
                    agbd_mean_ci95_mg_ha = 0.0
                    agbd_uncertainty_pct = 0.0

                agb_total_ci95_tg = agb_total_ci95_mg / 1000000

                agbd_mean_ci95_data[raster_idx, poly_idx] = agbd_mean_ci95_mg_ha
                agbd_mean_uncertainty_data[raster_idx, poly_idx] = agbd_uncertainty_pct
                agb_total_ci95_data[raster_idx, poly_idx] = agb_total_ci95_tg

            progress_index += 1
            progress_label.value = f"Raster / polygon pair progress: {progress_index}/{progress_total}"

finally:
    # Always release the raster handles, including when a polygon fails part-way through
    cell_area_dataset.close()
    for dataset in agb_total_restoration_datasets.values():
        dataset.close()
    for dataset in agb_total_ci95_restoration_datasets.values():
        dataset.close()

# Create DataFrames from pre-allocated arrays.
# The arrays were filled in the order of agb_total_restoration_rasters (sorted, and only
# rasters that exist), so the row labels are taken from those paths. Labelling with
# selected_restorations mislabels rows when that list is unsorted and raises a shape error
# when a selected raster is missing.
restoration_index = [
    os.path.basename(raster_path)[len("agb_total_mg__"):-len(".tif")]
    for raster_path in agb_total_restoration_rasters
]

df_agbd_mean_mg_ha = pd.DataFrame(agbd_mean_data, index=restoration_index, columns=polygon_names)
df_agbd_mean_mg_ha.rename_axis('rest', inplace=True)

df_agbd_stdev_mg_ha = pd.DataFrame(agbd_stdev_data, index=restoration_index, columns=polygon_names)
df_agbd_stdev_mg_ha.rename_axis('rest', inplace=True)

df_agb_total_tg = pd.DataFrame(agb_total_data, index=restoration_index, columns=polygon_names)
df_agb_total_tg.rename_axis('rest', inplace=True)

if generate_uncertainty_stats:
    df_agbd_mean_ci95_mg_ha = pd.DataFrame(agbd_mean_ci95_data, index=restoration_index, columns=polygon_names)
    df_agbd_mean_ci95_mg_ha.rename_axis('rest', inplace=True)

    df_agbd_uncertainty_pct = pd.DataFrame(agbd_mean_uncertainty_data, index=restoration_index, columns=polygon_names)
    df_agbd_uncertainty_pct.rename_axis('rest', inplace=True)

    df_agb_total_ci95_tg = pd.DataFrame(agb_total_ci95_data, index=restoration_index, columns=polygon_names)
    df_agb_total_ci95_tg.rename_axis('rest', inplace=True)

# Create stats list; its order is the column order of the per-area CSVs
if generate_uncertainty_stats:
    df_stats_list = [df_agbd_mean_mg_ha, df_agbd_mean_ci95_mg_ha, df_agbd_uncertainty_pct,
                     df_agbd_stdev_mg_ha, df_agb_total_tg, df_agb_total_ci95_tg]
else:
    df_stats_list = [df_agbd_mean_mg_ha, df_agbd_stdev_mg_ha, df_agb_total_tg]

# Generate summary stats: polygons as rows, one "<restoration> AGB (Tg)" column per restoration
df_agb_total_tg_t = df_agb_total_tg.transpose().rename_axis(columns="Name").add_suffix(" AGB (Tg)")
summary_components = [df_agb_total_tg_t]

# Append the matching CI95 columns when uncertainty stats are available
if generate_uncertainty_stats:
    df_agb_total_ci95_tg_t = df_agb_total_ci95_tg.transpose().rename_axis(columns="Name").add_suffix(" AGB CI95 (Tg)")
    summary_components += [df_agb_total_ci95_tg_t]

summary_csv_path = join(sample_polygons_statistics_agbd_dir, 'summary_restoration_stats.csv')
summary_restoration_stats = pd.concat(summary_components, axis=1).rename_axis(columns="Name")
summary_restoration_stats.to_csv(summary_csv_path)

# Generate detailed stats by area: one CSV per polygon, rows are restorations
df_base = pd.DataFrame(index=selected_restorations)
df_base.rename_axis('rest', inplace=True)

# Pair every statistics frame with its column label explicitly. Matching frames by value
# (DataFrame.equals) mislabels a statistic whenever two frames hold identical numbers,
# e.g. mean and stdev both all zero: one column is overwritten and another never written.
stat_labels = [
    (df_agbd_mean_mg_ha, "AGBD mean (Mg / ha)"),
    (df_agbd_stdev_mg_ha, "AGBD stdev (Mg / ha)"),
    (df_agb_total_tg, "AGB total (Tg)"),
]
if generate_uncertainty_stats:
    stat_labels += [
        (df_agbd_mean_ci95_mg_ha, "AGBD CI95 (Mg / ha)"),
        (df_agbd_uncertainty_pct, "AGBD uncertainty (%)"),
        (df_agb_total_ci95_tg, "AGB total CI95 (Tg)"),
    ]

for polygon_name in polygon_names:
    df_detailed = df_base.copy()
    for df_stats in df_stats_list:
        # Identity lookup; column order still follows df_stats_list
        stat_col = next(label for frame, label in stat_labels if frame is df_stats)
        df_detailed[stat_col] = df_stats[polygon_name]
    df_detailed.to_csv(join(restoration_stats_by_area_dir, f'{polygon_name}.csv'))

# Generate detailed stats by restoration: regroup the per-area CSVs so that each
# restoration gets one CSV with a row per polygon.
restoration_frames = {}
for stats_csv in os.listdir(restoration_stats_by_area_dir):
    # Ignore anything that is not a per-area CSV (stray or hidden files in the directory)
    if not stats_csv.endswith('.csv'):
        continue
    polygon_name = stats_csv[:-4]
    stats_csv_df = pd.read_csv(join(restoration_stats_by_area_dir, stats_csv))
    for restoration in stats_csv_df['rest'].unique():
        restoration_df = stats_csv_df[stats_csv_df['rest'] == restoration].copy()
        restoration_df.drop('rest', axis=1, inplace=True)
        restoration_df.insert(0, 'Name', polygon_name)
        restoration_frames.setdefault(restoration, []).append(restoration_df)

# Concatenate once per restoration instead of growing a DataFrame inside the loop
restorations = {
    restoration: pd.concat(frames, ignore_index=True)
    for restoration, frames in restoration_frames.items()
}

# Name each file after the loop variable. The stale global 'rest' (last value of an
# earlier loop) sent every restoration to the same CSV, so only the last one survived.
for restoration, restoration_df in restorations.items():
    restoration_df.to_csv(join(restoration_stats_by_restoration_dir, f'{restoration}.csv'), index=False)

# Intactness

In [None]:
# Create list of available intactness rasters, as paths relative to intactness_dir
intactness_rasters = []
for root, dirs, files in os.walk(intactness_dir):
    for file in files:
        # Match the '.tif' extension itself, consistent with the other raster listings
        if "intactness__" in file and file.endswith('.tif'):
            relative_path = os.path.relpath(join(root, file), intactness_dir)
            intactness_rasters.append(relative_path)

# os.walk order is not guaranteed; sort so the printed list is reproducible
intactness_rasters.sort()

# Print a ready-to-paste 'intactness_rasters' list for the next cell
print("# Select intactness raster to calculate statistics")
print("intactness_rasters = [")
for raster in intactness_rasters:
    print(f"'{raster}',")
print("]")

In [None]:
# Select intactness raster to calculate statistics
# Paths are relative to intactness_dir, in the format printed by the cell above.
# The first path component is the subdirectory that the statistics cell also uses to
# locate the matching percentage change raster.
intactness_rasters = [
'2021_no_disturbance_since_1993__2021_degradation_since_1993/intactness__forest_reserves_10_quantiles__2021_no_disturbance_since_1993__2021_degradation_since_1993__agbd_251203_161707.tif',
'2021_no_disturbance_since_1993__2021_degradation_since_1993/intactness__gedi_area_10_quantiles__2021_no_disturbance_since_1993__2021_degradation_since_1993__agbd_251203_161707.tif',
'2021_no_disturbance_since_oldgrowth__2021_degradation_since_oldgrowth/intactness__forest_reserves_10_quantiles__2021_no_disturbance_since_oldgrowth__2021_degradation_since_oldgrowth__agbd_251203_161707.tif',
'2021_no_disturbance_since_oldgrowth__2021_degradation_since_oldgrowth/intactness__gedi_area_10_quantiles__2021_no_disturbance_since_oldgrowth__2021_degradation_since_oldgrowth__agbd_251203_161707.tif',
'2024_no_disturbance_since_1996__2024_degradation_since_1996/intactness__forest_reserves_10_quantiles__2024_no_disturbance_since_1996__2024_degradation_since_1996__agbd_251203_161707.tif',
'2024_no_disturbance_since_1996__2024_degradation_since_1996/intactness__gedi_area_10_quantiles__2024_no_disturbance_since_1996__2024_degradation_since_1996__agbd_251203_161707.tif',
'2024_no_disturbance_since_oldgrowth__2024_degradation_since_oldgrowth/intactness__forest_reserves_10_quantiles__2024_no_disturbance_since_oldgrowth__2024_degradation_since_oldgrowth__agbd_251203_161707.tif',
'2024_no_disturbance_since_oldgrowth__2024_degradation_since_oldgrowth/intactness__gedi_area_10_quantiles__2024_no_disturbance_since_oldgrowth__2024_degradation_since_oldgrowth__agbd_251203_161707.tif',
]

In [None]:
# Intactness statistics
# Calculates area-weighted intactness and percentage change statistics per polygon.
# Two metrics computed:
#   - Remaining forest: statistics for pixels with intactness > 0
#   - Landscape: includes deforested pixels (intactness == 0) as score 0 or -100% change

# Map every intactness raster path to the percentage change raster of the same subdirectory
intactness_percentage_paths = {}
for intactness_raster in intactness_rasters:
    # The subdirectory is the first component of the relative raster path
    subdir = intactness_raster.partition('/')[0]
    percentage_change_filename = f"percentage_change__{subdir}__{selected_model}.tif"

    intactness_raster_path = join(intactness_dir, intactness_raster)
    percentage_change_path = join(intactness_dir, subdir, percentage_change_filename)
    intactness_percentage_paths.update({intactness_raster_path: percentage_change_path})

def weighted_stats(values, weights):
    """Return the weighted mean and weighted (population) standard deviation.

    Args:
        values: Array of observations.
        weights: Array of weights, one per observation (cell areas in this notebook).

    Returns:
        (mean, std). (None, None) when `values` is empty and (0.0, 0.0) when the
        weights do not sum to a positive number.
    """
    if len(values) == 0:
        return None, None

    total_weight = np.sum(weights)
    if total_weight <= 0:
        return 0.0, 0.0

    mean = np.sum(values * weights) / total_weight
    deviations = values - mean
    spread = np.sqrt(np.sum(weights * deviations ** 2) / total_weight)
    return mean, spread

# Area in hectares for each unique intactness score
# Returns a dict keyed by the integer score; empty when every intactness pixel is masked.
def calculate_score_areas(intactness_masked, cell_area_masked_ha):
    areas_by_score = {}
    if np.ma.count(intactness_masked) == 0:
        return areas_by_score

    # Scores are taken from unmasked pixels only
    for score_value in np.unique(intactness_masked.compressed()):
        selection = (intactness_masked == score_value) & (~intactness_masked.mask)
        areas_by_score[int(score_value)] = (
            np.sum(cell_area_masked_ha[selection], dtype='float64') if np.any(selection) else 0.0
        )
    return areas_by_score

# Remaining-forest and landscape statistics for one raster variable.
#   forest_values / forest_weights: values and pixel areas (ha) at remaining-forest pixels
#   forest_area_ha / deforested_area_ha: total areas of the two pixel classes
#   deforested_value: constant assigned to the deforested area (0.0 for intactness, -100.0 for % change)
# Returns (forest_mean, forest_std, landscape_mean, landscape_std); the forest values are None
# when there is no remaining forest.
def forest_and_landscape_stats(forest_values, forest_weights, forest_area_ha, deforested_area_ha, deforested_value):
    if forest_area_ha > 0:
        forest_mean, forest_std = weighted_stats(forest_values, forest_weights)
        if deforested_area_ha > 0:
            # Deforested area enters the weighted mean / variance as a single constant-valued class
            landscape_area_ha = forest_area_ha + deforested_area_ha
            landscape_mean_num = np.sum(forest_values * forest_weights) + deforested_area_ha * deforested_value
            landscape_mean = landscape_mean_num / landscape_area_ha
            forest_var_contrib = np.sum(forest_weights * np.square(forest_values - landscape_mean))
            deforested_var_contrib = deforested_area_ha * np.square(deforested_value - landscape_mean)
            landscape_std = np.sqrt((forest_var_contrib + deforested_var_contrib) / landscape_area_ha)
        else:
            # No deforestation: landscape stats equal remaining forest stats
            landscape_mean, landscape_std = forest_mean, forest_std
    else:
        # No remaining forest: all deforested
        forest_mean = forest_std = None
        landscape_mean, landscape_std = deforested_value, 0.0
    return forest_mean, forest_std, landscape_mean, landscape_std

# Progress tracking
n_intactness_rasters = len(intactness_rasters)
n_polygons = len(selected_sample_polygons_gpkg)
progress_total = n_intactness_rasters * n_polygons
progress_index = 0
progress_label = widgets.Label(f"Raster / polygon pair progress: {progress_index}/{progress_total}")
display(progress_label)

# Pre-open cell area dataset (shared by every raster / polygon pair, closed in the finally clause)
cell_area_dataset = rasterio.open(cell_area_path)

try:
    # Outer loop: one CSV output per intactness raster
    for intactness_raster in intactness_rasters:
        intactness_raster_path = join(intactness_dir, intactness_raster)
        percentage_raster_path = intactness_percentage_paths[intactness_raster_path]

        # Extract metadata from filename
        # Format: intactness__{quantiles}__{baseline}__{disturbance}__{model}.tif
        filename = intactness_raster.split('/')[-1]
        parts = filename.split('__')
        polygon_quantiles = parts[1]
        baseline_name = parts[2]
        disturbance = parts[3]

        # Total score derived from quantiles (e.g. 'forest_reserves_10_quantiles' -> 10)
        total_score = int(polygon_quantiles.split('_')[-2])
        total_stdev = int(total_score / 2)

        # Output CSV path
        intactness_csv_name = f"{polygon_quantiles}__{baseline_name}__{disturbance}.csv"
        intactness_csv_path = join(intactness_stats_dir, intactness_csv_name)

        # Fixed statistics columns (score area columns are appended dynamically per polygon)
        stats_columns = [
            "Name",
            "Percentage change (remaining forest) mean",
            "Percentage change (remaining forest) stdev",
            "Percentage change (landscape) mean",
            "Percentage change (landscape) stdev",
            f"Intactness (remaining forest) mean / {total_score}",
            f"Intactness (remaining forest) stdev / {total_stdev}",
            f"Intactness (landscape) mean / {total_score}",
            f"Intactness (landscape) stdev / {total_stdev}"
        ]

        # Rows are collected in a list and converted to a dataframe once, instead of
        # concatenating a dataframe per polygon (quadratic, and warns on empty / all-NA frames)
        intactness_rows = []

        # Open both rasters once per intactness raster rather than once per polygon
        with rasterio.open(intactness_raster_path) as src, rasterio.open(percentage_raster_path) as percent_src:

            # Inner loop: one row per polygon
            for index, row in selected_sample_polygons_gpkg.iterrows():
                sample_polygon_geometry = row["geometry"]
                sample_polygon_name = row["name"]
                # Multi-part geometries expose .geoms; a single Polygon is wrapped in a list
                polygons = list(getattr(sample_polygon_geometry, 'geoms', [sample_polygon_geometry]))

                # Mask intactness raster to polygon
                intactness_masked, _ = msk.mask(src, polygons, crop=True, filled=False)
                intactness_masked = intactness_masked.astype('float64')

                # Every statistic defaults to None, so a polygon without valid
                # intactness data still gets a (blank) row in the CSV
                new_row_dict = dict.fromkeys(stats_columns)
                new_row_dict['Name'] = sample_polygon_name

                if np.ma.count(intactness_masked) > 0:
                    # Mask cell area raster to polygon, convert m2 to ha
                    cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)
                    cell_area_masked = cell_area_masked.astype('float64')
                    cell_area_masked_ha = cell_area_masked / 10000

                    # Forest mask: valid pixels with intactness > 0 (remaining forest)
                    forest_mask = (~intactness_masked.mask) & (intactness_masked > 0)

                    # Deforested mask: valid pixels with intactness == 0 (lost since baseline)
                    deforested_mask = (~intactness_masked.mask) & (intactness_masked == 0)

                    # Calculate areas in hectares
                    forest_area_ha = np.sum(cell_area_masked_ha.data[forest_mask], dtype='float64')
                    deforested_area_ha = np.sum(cell_area_masked_ha.data[deforested_mask], dtype='float64')

                    # Mask percentage change raster to polygon
                    percent_change_masked, _ = msk.mask(percent_src, polygons, crop=True, filled=False)
                    percent_change_masked = percent_change_masked.astype('float64')

                    # NOTE(review): the masks are applied to the raw .data of the cell area and
                    # percentage change arrays, so this presumably relies on all three rasters
                    # sharing one grid and having valid values wherever intactness is valid - confirm.
                    forest_weights = cell_area_masked_ha.data[forest_mask]

                    # Percentage change statistics (deforested pixels count as -100% change)
                    (percent_forest_mean, percent_forest_std,
                     percent_landscape_mean, percent_landscape_std) = forest_and_landscape_stats(
                        percent_change_masked.data[forest_mask], forest_weights,
                        forest_area_ha, deforested_area_ha, -100.0
                    )

                    # Intactness statistics (deforested pixels count as score 0)
                    (intactness_forest_mean, intactness_forest_std,
                     intactness_landscape_mean, intactness_landscape_std) = forest_and_landscape_stats(
                        intactness_masked.data[forest_mask], forest_weights,
                        forest_area_ha, deforested_area_ha, 0.0
                    )

                    # Calculate area per intactness score (includes score 0)
                    score_areas = calculate_score_areas(intactness_masked, cell_area_masked_ha)

                    # Build output row
                    new_row_dict.update({
                        'Percentage change (remaining forest) mean': percent_forest_mean,
                        'Percentage change (remaining forest) stdev': percent_forest_std,
                        'Percentage change (landscape) mean': percent_landscape_mean,
                        'Percentage change (landscape) stdev': percent_landscape_std,
                        f'Intactness (remaining forest) mean / {total_score}': intactness_forest_mean,
                        f'Intactness (remaining forest) stdev / {total_stdev}': intactness_forest_std,
                        f'Intactness (landscape) mean / {total_score}': intactness_landscape_mean,
                        f'Intactness (landscape) stdev / {total_stdev}': intactness_landscape_std,
                    })

                    # Append score area columns dynamically
                    for score, area in score_areas.items():
                        new_row_dict[f'Intactness score {score} area (ha)'] = area

                intactness_rows.append(new_row_dict)
                progress_index += 1
                progress_label.value = f"Raster / polygon pair progress: {progress_index}/{progress_total}"

        # Build the dataframe once; fixed columns first, then score area columns in order of appearance
        df_intactness_stats = pd.DataFrame(intactness_rows, dtype=object)
        score_area_columns = [col for col in df_intactness_stats.columns if col not in stats_columns]
        df_intactness_stats = df_intactness_stats.reindex(columns=stats_columns + score_area_columns)

        # Save CSV for this intactness raster
        df_intactness_stats = df_intactness_stats.set_index('Name')
        df_intactness_stats.to_csv(intactness_csv_path)
        print(f"Saved statistics to {intactness_csv_path}")

finally: cell_area_dataset.close()

print("Intactness statistics completed.")

# Report statistics

In [None]:
# Reduces statistics outputs to a more specific and intuitive format.
# This cell only prints Python list definitions (forest cover, AGB scenario and
# disturbance lists) intended to be copied into a configuration cell and edited.

# Load summary stats files
summary_scenario_stats_df = pd.read_csv(join(sample_polygons_statistics_agbd_dir, 'summary_scenario_stats.csv'))
summary_disturbance_stats_df = pd.read_csv(join(sample_polygons_statistics_agbd_dir, 'summary_disturbance_stats.csv'))
summary_forest_cover_stats_df = pd.read_csv(join(sample_polygons_statistics_agbd_dir, 'summary_forest_cover_stats.csv'))

# Extract forest cover scenarios (exclude percentage columns)
forest_cover_cols = [col for col in summary_forest_cover_stats_df.columns
                     if col.endswith('forest cover (ha)')]
forest_cover_scenarios = [col.replace(' forest cover (ha)', '') for col in forest_cover_cols]

print("# Forest cover scenarios")
print("forest_cover_list = [")
for scenario in forest_cover_scenarios:
    print(f"  '{scenario}',")
print("]\n")

# Extract scenario list from the per-scenario CSV files.
# Only '.csv' files are considered (same filter as the disturbance list below) so that
# stray files are not truncated into bogus names, and names are sorted because
# os.listdir returns entries in arbitrary order.
scenario_csv_files = sorted(f[:-4] for f in os.listdir(scenario_stats_by_scenario_dir) if f.endswith('.csv'))
print("# AGB scenarios (order matters)")
print("scenario_list = [")
for scenario in scenario_csv_files:
    print(f"  '{scenario}',")
print("]\n")

# Extract and sort disturbance list
disturbance_csv_files = [f[:-4] for f in os.listdir(disturbance_stats_by_disturbance_dir) if f.endswith('.csv')]

def get_disturbance_type(filename):
    # Rank used for ordering: 3 = disturbance, 2 = deforestation, 1 = anything else (degradation).
    # 'disturbance' is checked first, so it wins if both keywords are present.
    for keyword, rank in (('disturbance', 3), ('deforestation', 2)):
        if keyword in filename:
            return rank
    return 1

def get_ref_year_sort_key(ref_year):
    # Sort key placing numeric years first (most recent first), followed by
    # non-numeric reference labels in alphabetical order.
    try:
        numeric_year = int(ref_year)
    except ValueError:
        return (1, ref_year)
    return (0, -numeric_year)

# Bucket the disturbance files by (year prefix, disturbance type rank)
files_by_category = {}
for file in disturbance_csv_files:
    category = (file.split('_')[0], get_disturbance_type(file))
    files_by_category.setdefault(category, []).append(file)

print("# Disturbance scenarios")
print("disturbance_list = [")
# Categories are ordered by numeric year, then by disturbance type rank
for category in sorted(files_by_category, key=lambda k: (int(k[0]), k[1])):
    category_files = files_by_category[category]

    # '_total' files are listed first
    for file in category_files:
        if '_total' in file:
            print(f"    '{file}',")

    # The rest are grouped by reference year (the last underscore-separated token)
    ref_year_files = {}
    for file in category_files:
        if '_total' not in file:
            ref_year_files.setdefault(file.split('_')[-1], []).append(file)

    # Within each reference year: '_since_' entries, then '_before_' entries commented out
    for ref_year in sorted(ref_year_files, key=get_ref_year_sort_key):
        year_files = ref_year_files[ref_year]
        for file in year_files:
            if '_since_' in file:
                print(f"    '{file}',")
        for file in year_files:
            if '_before_' in file:
                print(f"    # '{file}',")
print("]")

In [None]:
# Report configuration lists, in the same format as the lists printed by the cell above.
# Entries are used as column-name prefixes / CSV file stems when the report CSVs are built.

# Forest cover scenarios
# Each entry is looked up as a '<entry> forest cover (ha)' column.
forest_cover_list = [
  '1993',
  '2021',
  '2024',
  '2024_oldgrowth_all_land',
]

# AGB scenarios (order matters)
# Each entry is looked up as a '<entry> forest AGB (Tg)' column and as '<entry>.csv'.
scenario_list = [
  '2021',
  '2021_no_disturbance_since_1993',
  '2021_oldgrowth_all_land',
  '2024',
]

# Disturbance scenarios
# Each entry is looked up as a '<entry> forest AGB (Tg)' column and as '<entry>.csv'.
disturbance_list = [
    '2021_degradation_since_1993',
    '2021_deforestation_since_1993',
]

In [None]:
# Read summary stats
summary_scenario_stats_df = pd.read_csv(join(sample_polygons_statistics_agbd_dir, 'summary_scenario_stats.csv'))
summary_disturbance_stats_df = pd.read_csv(join(sample_polygons_statistics_agbd_dir, 'summary_disturbance_stats.csv'))
summary_forest_cover_stats_df = pd.read_csv(join(sample_polygons_statistics_agbd_dir, 'summary_forest_cover_stats.csv'))

# CI95 columns are only exported when the statistics source is the uncertainty directory
include_ci95 = source_dir == uncertainty_dir

# Total AGB table: 'Name' plus one '<item> forest AGB (Tg)' column per item copied from
# summary_df, followed by the matching CI95 columns when requested.
def build_total_agb_table(summary_df, items, include_ci95):
    table = pd.DataFrame()
    table['Name'] = summary_df['Unnamed: 0']
    for item in items:
        table[f'{item} forest AGB (Tg)'] = summary_df[f'{item} forest AGB (Tg)']
    if include_ci95:
        for item in items:
            table[f'{item} forest AGB CI95 (Tg)'] = summary_df[f'{item} forest AGB CI95 (Tg)']
    return table

# AGBD table: 'Name' from summary_df plus one AGBD mean column per item, read from
# '<item>.csv' in stats_dir, followed by the CI95 columns when requested.
# Columns are assigned by row position - assumes each per-item CSV lists polygons in
# the same order as summary_df (TODO confirm).
def build_agbd_table(summary_df, items, stats_dir, include_ci95):
    table = pd.DataFrame()
    table['Name'] = summary_df['Unnamed: 0']
    # Each CSV is read once and reused for the mean and CI95 columns
    item_stats = {item: pd.read_csv(join(stats_dir, f'{item}.csv')) for item in items}
    for item in items:
        table[f'{item} forest AGBD (Mg / ha)'] = item_stats[item]['Forest AGBD mean (Mg / ha)']
    if include_ci95:
        for item in items:
            table[f'{item} forest AGBD CI95 (Mg / ha)'] = item_stats[item]['Forest AGBD CI95 (Mg / ha)']
    return table

# Create forest cover CSV (including total area km2)
# Scenarios without a matching column are skipped silently
forest_cover = pd.DataFrame()
forest_cover['Name'] = summary_forest_cover_stats_df['Name']
forest_cover['Area (km^2)'] = summary_forest_cover_stats_df['Area (km^2)']
for scenario in forest_cover_list:
    col_name = f'{scenario} forest cover (ha)'
    if col_name in summary_forest_cover_stats_df.columns:
        forest_cover[col_name] = summary_forest_cover_stats_df[col_name]
forest_cover.to_csv(join(report_statistics_agbd_dir, 'forest_cover.csv'), index=False)

# Create scenarios total AGB CSV
scenarios_total_agb = build_total_agb_table(summary_scenario_stats_df, scenario_list, include_ci95)
scenarios_total_agb.to_csv(join(report_statistics_agbd_dir, 'scenarios_total_agb.csv'), index=False)

# Create scenarios AGBD CSV
scenarios_agbd = build_agbd_table(summary_scenario_stats_df, scenario_list, scenario_stats_by_scenario_dir, include_ci95)
scenarios_agbd.to_csv(join(report_statistics_agbd_dir, 'scenarios_agbd.csv'), index=False)

# Create disturbance total AGB CSV
disturbance_total_agb = build_total_agb_table(summary_disturbance_stats_df, disturbance_list, include_ci95)
disturbance_total_agb.to_csv(join(report_statistics_agbd_dir, 'disturbance_total_agb.csv'), index=False)

# Create disturbance AGBD CSV
# Names come from the disturbance summary, consistent with the disturbance total AGB table
disturbance_agbd = build_agbd_table(summary_disturbance_stats_df, disturbance_list, disturbance_stats_by_disturbance_dir, include_ci95)
disturbance_agbd.to_csv(join(report_statistics_agbd_dir, 'disturbance_agbd.csv'), index=False)

print("Report statistics completed.")

# Sankey plots

In [None]:
# Define and create directories
sankey_labelled = join(sample_polygons_statistics_agbd_dir, 'sankey_labelled')
sankey_unlabelled = join(sample_polygons_statistics_agbd_dir, 'sankey_unlabelled')
sankey_labelled_svg = join(sample_polygons_statistics_agbd_dir, 'sankey_labelled_svg')
sankey_unlabelled_svg = join(sample_polygons_statistics_agbd_dir, 'sankey_unlabelled_svg')

# 'sankey_dir' rather than 'dir', which would shadow the builtin for the rest of the session
for sankey_dir in [sankey_labelled, sankey_unlabelled, sankey_labelled_svg, sankey_unlabelled_svg]:
    makedirs(sankey_dir, exist_ok=True)

# Load the CSV files
summary_scenario_stats = pd.read_csv(join(sample_polygons_statistics_agbd_dir,'summary_scenario_stats.csv'))
summary_disturbance_stats = pd.read_csv(join(sample_polygons_statistics_agbd_dir,'summary_disturbance_stats.csv'))

# Check that all rows in both .csv files have the same strings (polygon areas) in column A
polygon_areas_stats = summary_scenario_stats.iloc[:, 0]
polygon_areas_disturbance_stats = summary_disturbance_stats.iloc[:, 0]

# Compared as lists so that a different number of rows fails with this message
# instead of a pandas ValueError about differently-labelled Series
assert list(polygon_areas_stats) == list(polygon_areas_disturbance_stats), "Polygon areas do not match between the two CSV files."

# Print columns relevant for sankey diagram configuration

# Numbered listing of column names under a heading
def print_numbered_columns(heading, columns):
    print(heading)
    for i, col in enumerate(columns, 1):
        print(f"  {i:2d}. {col}")

# Filter for AGB columns only (exclude forest cover and CI95 for initial selection)
summary_agb_cols = [col for col in summary_scenario_stats.columns[1:] if 'forest AGB (Tg)' in col and 'CI95' not in col]
disturbance_agb_cols = [col for col in summary_disturbance_stats.columns[1:] if 'forest AGB (Tg)' in col and 'CI95' not in col]

print("=== summary_scenario_stats.csv AGB columns ===")
print("(for old_growth_agb_column and current_agb_column)\n")

# Group by category
current_year_cols = [col for col in summary_agb_cols if col.startswith('2024 ') or col.startswith('2023 ') or col.startswith('2022 ')]
oldgrowth_cols = [col for col in summary_agb_cols if 'oldgrowth' in col and not col.endswith('_1 forest AGB (Tg)') and not col.endswith('_2 forest AGB (Tg)')]
no_disturbance_cols = [col for col in summary_agb_cols if 'no_disturbance' in col]
no_degradation_cols = [col for col in summary_agb_cols if 'no_degradation' in col]

print_numbered_columns("Current year scenarios:", current_year_cols)
print_numbered_columns("\nOld-growth scenarios:", oldgrowth_cols)
print_numbered_columns("\nNo disturbance scenarios:", no_disturbance_cols)
print_numbered_columns("\nNo degradation scenarios:", no_degradation_cols)

print("\n" + "="*50)
print("=== summary_disturbance_stats.csv AGB columns ===")
print("(for degradation/deforestation/disturbance since/total columns)\n")

# Group disturbance columns
degradation_cols = [col for col in disturbance_agb_cols if 'degradation' in col]
deforestation_cols = [col for col in disturbance_agb_cols if 'deforestation' in col]
disturbance_cols = [col for col in disturbance_agb_cols if 'disturbance' in col and 'effect' not in col]

print_numbered_columns("Degradation columns:", degradation_cols)
print_numbered_columns("\nDeforestation columns:", deforestation_cols)
print_numbered_columns("\nDisturbance columns:", disturbance_cols)

In [None]:
# Plot degradation and deforestation separately
separate_disturbance = True
# Plot degradation before and since a date separately
separate_degradation = True

# DPI (default is 96, output image will scale accordingly)
dpi = 300
# Relative width modifier (ratio, e.g. 0.5 or 2)
width_modifier = 0.85

# Title (polygon area), density and label variables (weight of 800 ~ bold, 400 ~ normal)
show_title = True
show_density = True
show_labels = True
left_axis_label = True
svg_transparent_background = True
title_font_size = 20
title_font_weight = 600
density_font_size = 17
density_font_weight = 600
label_font_size = 17
label_font_weight = 600

# Base columns and year (summary_scenario_stats)
old_growth_agb_column = '2024_oldgrowth_all_land forest AGB (Tg)'
current_agb_column = '2024 forest AGB (Tg)'
# Year is the first token of the current AGB column name (e.g. '2024')
current_year = current_agb_column.split(' ')[0]

# Disturbance columns (summary_disturbance_stats)
degradation_since_column = '2024_degradation_since_1996 forest AGB (Tg)'
degradation_total_column = '2024_degradation_since_oldgrowth forest AGB (Tg)'
deforestation_total_column = '2024_deforestation_since_oldgrowth forest AGB (Tg)'
# Read by the plotting loop for its consistency checks, so it must be defined here.
# NOTE(review): name follows the pattern of the two columns above - confirm it matches a
# disturbance column of summary_disturbance_stats.csv.
disturbance_total_column = '2024_disturbance_since_oldgrowth forest AGB (Tg)'

# Node labels and colours
remaining_name = f'Remaining in {current_year}:'
remaining_colour = '#007fff'
degradation_before_name = 'Degradation < 1996'
degradation_before_colour = '#8dc00d'
degradation_since_name = 'Degradation > 1996'
degradation_since_colour = '#ffff00'
degradation_total_name = 'Degradation'
degradation_total_colour = '#ffff00'
deforestation_total_name = 'Deforestation'
deforestation_total_colour = '#ffffff'
disturbance_total_name = 'Disturbance'
disturbance_total_colour = '#ffffff'

# Validate separation settings
assert not separate_degradation or separate_disturbance, "separate_disturbance must be True if separate_degradation is True."

# Look up one cell of a statistics dataframe as a float.
# Null cells, and lookups that raise KeyError, both yield 0.0 (the latter with a printed notice).
def get_value(df, idx, column_name):
    try:
        cell = df.loc[idx, column_name]
    except KeyError:
        print(f"Column '{column_name}' not found in the dataframe.")
        return 0.0
    if pd.isnull(cell):
        return 0.0
    return float(cell)

# Build a sankey figure with the shared node / link styling and layout.
# 'annotations' is only passed to the layout when provided (labelled figures).
def build_sankey_figure(node_labels, node_colors, sources, targets, values, link_colors, annotations=None):
    figure = go.Figure(data=[go.Sankey(
        arrangement="freeform",
        node=dict(label=node_labels, color=node_colors, pad=15, thickness=20, line=dict(color="black", width=1)),
        link=dict(source=sources, target=targets, value=values, color=link_colors, line=dict(color="black", width=1))
    )])
    layout_options = dict(
        width=700 * width_modifier, height=500,
        font=dict(family="arial, sans serif", size=label_font_size, color="black", weight=label_font_weight),
        margin=dict(l=25, r=25, t=115, b=25)
    )
    if annotations is not None:
        layout_options['annotations'] = annotations
    figure.update_layout(**layout_options)
    return figure

# Save a figure as PNG, then as SVG (with transparent background if toggled on).
# The transparent background is left applied to the figure afterwards.
def save_sankey_figure(figure, png_dir, svg_dir, polygon_name):
    figure.write_image(join(png_dir, f'sankey_diagram_{polygon_name}.png'), scale=dpi / 96)
    if svg_transparent_background:
        figure.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
    figure.write_image(join(svg_dir, f'sankey_diagram_vector_{polygon_name}.svg'), scale=dpi / 96)

# Loop through each row (polygon area)
for idx in summary_scenario_stats.index:
    polygon_name = summary_scenario_stats.iloc[idx, 0]

    # Get old-growth and current AGB values
    old_growth_agb = get_value(summary_scenario_stats, idx, old_growth_agb_column)
    current_agb = get_value(summary_scenario_stats, idx, current_agb_column)

    # Get disturbance values and calculate before values
    degradation_since = get_value(summary_disturbance_stats, idx, degradation_since_column)
    degradation_total = get_value(summary_disturbance_stats, idx, degradation_total_column)
    degradation_before = degradation_total - degradation_since
    deforestation_total = get_value(summary_disturbance_stats, idx, deforestation_total_column)
    disturbance_total = get_value(summary_disturbance_stats, idx, disturbance_total_column)

    # Statistical checks (reported, not enforced)
    if separate_degradation:
        discrepancy = abs(degradation_before + degradation_since - degradation_total)
        if discrepancy >= 1e-6:
            print(f"{polygon_name}: degradation_before + degradation_since != degradation_total (discrepancy: {discrepancy:.6e})")
    if separate_disturbance:
        discrepancy = abs(degradation_total + deforestation_total - disturbance_total)
        if discrepancy >= 1e-6:
            print(f"{polygon_name}: degradation_total + deforestation_total != disturbance_total (discrepancy: {discrepancy:.6e})")
    discrepancy = abs(current_agb - disturbance_total - old_growth_agb)
    if discrepancy >= 1e-6:
        print(f"{polygon_name}: current_agb - disturbance_total != old_growth_agb (discrepancy: {discrepancy:.6e})")
        print("Note: Constraining degradation floor to disturbance or capping disturbances to 0 can break equality when amalgamating across areas")

    # Load detailed stats for AGBD and CI95 values
    # .item() raises unless exactly one row matches the scenario name
    stats_df = pd.read_csv(join(scenario_stats_by_area_dir, f"{polygon_name}.csv"))
    old_growth_index = stats_df.index[stats_df['scenario'] == f"{old_growth_agb_column.split(' ')[0]}"].item()
    current_index = stats_df.index[stats_df['scenario'] == f"{current_agb_column.split(' ')[0]}"].item()

    old_growth_mean_agbd = get_value(stats_df, old_growth_index, "Forest AGBD mean (Mg / ha)")
    current_mean_agbd = get_value(stats_df, current_index, "Forest AGBD mean (Mg / ha)")

    # Uncertainty values are shown only when the per-area stats contain CI95 columns
    uncertainty = 'Forest AGB total CI95 (Tg)' in stats_df.columns
    if uncertainty:
        old_growth_agb_ci95 = get_value(stats_df, old_growth_index, "Forest AGB total CI95 (Tg)")
        old_growth_mean_agbd_ci95 = get_value(stats_df, old_growth_index, "Forest AGBD CI95 (Mg / ha)")
        current_agb_ci95 = get_value(stats_df, current_index, "Forest AGB total CI95 (Tg)")
        current_mean_agbd_ci95 = get_value(stats_df, current_index, "Forest AGBD CI95 (Mg / ha)")

    # Build title and subtitle text
    title_name = f"{polygon_name}"

    if uncertainty:
        subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} ± {old_growth_mean_agbd_ci95:.1f} Mg / ha"
        subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} ± {current_mean_agbd_ci95:.1f} Mg / ha"
        left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} ± {old_growth_agb_ci95:.2f} Tg" if left_axis_label else ''
        remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} ± {current_agb_ci95:.2f} Tg"
    else:
        subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} Mg / ha"
        subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} Mg / ha"
        left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} Tg" if left_axis_label else ''
        remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} Tg"

    # Node 0 is the old-growth source; every other node receives one link from it.
    # Disturbance values are negated before being used as link values.
    if separate_disturbance and separate_degradation:
        nodes = [left_axis, degradation_before_name, degradation_since_name, deforestation_total_name, remaining_name_agb]
        sources, targets = [0, 0, 0, 0], [1, 2, 3, 4]
        values = [-degradation_before, -degradation_since, -deforestation_total, current_agb]
        colors = [degradation_before_colour, degradation_since_colour, deforestation_total_colour, remaining_colour]

    elif separate_disturbance and not separate_degradation:
        nodes = [left_axis, degradation_total_name, deforestation_total_name, remaining_name_agb]
        sources, targets = [0, 0, 0], [1, 2, 3]
        values = [-degradation_total, -deforestation_total, current_agb]
        colors = [degradation_total_colour, deforestation_total_colour, remaining_colour]

    else:
        nodes = [left_axis, disturbance_total_name, remaining_name_agb]
        sources, targets = [0, 0], [1, 2]
        values = [-(degradation_total + deforestation_total), current_agb]
        colors = [disturbance_total_colour, remaining_colour]

    node_colors = [remaining_colour] + colors

    # Add percentages (share of old-growth AGB) to node labels.
    # get_value returns 0.0 for null / missing values, so guard the division.
    if old_growth_agb != 0:
        for i, val in enumerate(values, start=1):
            nodes[i] += f" ({abs(val) / old_growth_agb * 100:.0f}%)"
    else:
        print(f"{polygon_name}: old-growth AGB is 0, percentages omitted from node labels.")

    # Configure title and density annotations
    title_and_density = [
        dict(x=0, y=1.28, xref='paper', yref='paper', text=title_name, showarrow=False, xanchor='left', align='left',
             font=dict(family="arial, sans serif", size=title_font_size, color="black", weight=title_font_weight)),
        dict(x=0, y=1.19, xref='paper', yref='paper', text=subtitle_1_name, showarrow=False, xanchor='left', align='left',
             font=dict(family="arial, sans serif", size=density_font_size, color="black", weight=density_font_weight)),
        dict(x=0, y=1.11, xref='paper', yref='paper', text=subtitle_2_name, showarrow=False, xanchor='left', align='left',
             font=dict(family="arial, sans serif", size=density_font_size, color="black", weight=density_font_weight))
    ]

    if show_title and not show_density:
        title_and_density = title_and_density[0:1]
    elif not show_title and show_density:
        title_and_density = title_and_density[1:3]
    elif not show_title and not show_density:
        title_and_density = []

    # Remove labels if toggled off
    if not show_labels:
        nodes = [''] * len(nodes)

    # Create and save labelled versions
    fig = build_sankey_figure(nodes, node_colors, sources, targets, values, colors, annotations=title_and_density)
    save_sankey_figure(fig, sankey_labelled, sankey_labelled_svg, polygon_name)

    # Create and save unlabelled versions (blank node labels, no annotations)
    fig_unlabelled = build_sankey_figure([''] * len(nodes), node_colors, sources, targets, values, colors)
    save_sankey_figure(fig_unlabelled, sankey_unlabelled, sankey_unlabelled_svg, polygon_name)

    # Display figure with white background
    fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
    fig.show()

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# 'runtime' is imported from google.colab in the imports cell; presumably this releases
# the Colab runtime once the cells above have finished - confirm before running interactively.
runtime.unassign()