<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/9_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Subdirectories

In [None]:
# Root directory for all inputs and outputs of this notebook.
# Personal drive: start the path with '/content/drive/MyDrive/'
# Shared drive:   start the path with '/gdrive/Shareddrives/' (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive at the mount point matching the base_dir prefix
import os
import sys
from google.colab import drive

_uses_shared_drive = base_dir.startswith('/gdrive/Shareddrives/')
_uses_personal_drive = base_dir.startswith('/content/drive/MyDrive/')
if _uses_shared_drive:
  drive.mount('/gdrive', force_remount=True)
elif _uses_personal_drive:
  drive.mount('/content/drive', force_remount=True)
  # Only the personal drive directory is created here
  os.makedirs(base_dir, exist_ok=True)
else:
  print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Add the resolved base directory to the module search path (once)
_path_to_add = os.path.realpath(base_dir)
if _path_to_add not in sys.path:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Imports and upgrades
!pip install geopandas
!pip install kaleido==0.2.1
!pip install rasterio

In [None]:
# Imports
from datetime import datetime
import geopandas as gpd
from google.colab import runtime
import json
import ipywidgets as widgets
import kaleido
import math
import numpy as np
from os.path import exists, join
from os import makedirs
from osgeo import gdal, ogr
gdal.UseExceptions()
import pandas as pd
import plotly.graph_objects as go
import rasterio
from rasterio import mask as msk
from sklearn.metrics import (
    root_mean_squared_error,
    r2_score,
    accuracy_score,
    log_loss,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
    balanced_accuracy_score,
    average_precision_score,
    cohen_kappa_score,
    confusion_matrix
)
from shutil import copyfile

In [None]:
# Project subdirectories, all located under base_dir
areas_dir = join(base_dir, "1_areas")
datasets_dir = join(base_dir, "4_datasets/final")
models_dir = join(base_dir, "5_models")
scenarios_dir = join(base_dir, "6_scenarios")
mask_dir = join(scenarios_dir, "scenario_masks")
uncertainty_dir = join(base_dir, "7_uncertainty")
differences_dir = join(base_dir, "8_differences")
statistics_dir = join(base_dir, "9_statistics")
sample_polygons_dir = join(statistics_dir, "sample_polygons")

# Only the directories owned by this notebook are created
for _new_dir in (statistics_dir, sample_polygons_dir):
    makedirs(_new_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    """Write an array to a single-band GeoTIFF that copies the grid of a template raster.

    Args:
        input_array: Array passed to the output band's WriteArray.
        output_tif: Path of the GeoTIFF to create.
        template: Path of the raster whose band 1 size, geotransform and projection are copied.
        nodatavalue: Value registered as nodata on the output band.
        compress: If truthy, write with DEFLATE compression (ZLEVEL=9) and a predictor
            chosen from `dtype`.
        dtype: GDAL data type of the output band.
    """
    template_ds = gdal.Open(template)
    template_band = template_ds.GetRasterBand(1)
    geotransform, projection = template_ds.GetGeoTransform(), template_ds.GetProjection()

    options = []
    if compress:
        options.append("COMPRESS=DEFLATE")
        # PREDICTOR=3 (floating point) is rejected for integer data, so pick the
        # predictor from the output type; other types are written without a predictor.
        if dtype in (gdal.GDT_Float32, gdal.GDT_Float64):
            options.append("PREDICTOR=3")
        elif dtype in (gdal.GDT_Byte, gdal.GDT_UInt16, gdal.GDT_Int16, gdal.GDT_UInt32, gdal.GDT_Int32):
            options.append("PREDICTOR=2")
        options.append("ZLEVEL=9")

    output_ds = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, 1, dtype, options=options)
    output_band = output_ds.GetRasterBand(1)
    output_band.WriteArray(input_array)
    output_band.SetNoDataValue(nodatavalue)
    output_ds.SetGeoTransform(geotransform)
    output_ds.SetProjection(projection)
    # Drop the GDAL references so the output is flushed and both datasets are closed
    output_band = template_band = None
    template_ds = output_ds = None

# Select model, area and sample polygons

In [None]:
# Choose whether predictions are read from scenarios_dir or uncertainty_dir
source_dir = uncertainty_dir
# source_dir = scenarios_dir
# Text after the last underscore of the path, suffixed with '_dir'
source_dir_name = source_dir.rsplit('_', 1)[-1] + "_dir"

# Print one ready-to-copy assignment per model found in the source directory
for subdir in os.listdir(source_dir):
  if 'scenario_masks' in subdir:
    continue
  print(f"selected_model = '{subdir}'")

In [None]:
selected_model = 'agbd_tekai_250625_003858'

# Directories of the selected model and of its predictions
selected_model_dir = join(models_dir, selected_model)
selected_model_prediction_dir = join(source_dir, selected_model)
if source_dir == scenarios_dir:
  prediction_raster_dir = join(selected_model_prediction_dir, 'scenario_predictions')
if source_dir == uncertainty_dir:
  prediction_raster_dir = join(selected_model_prediction_dir, 'uncertainty_predictions')

# Disturbance and intactness directories
model_differences_dir = join(differences_dir, f"{selected_model}_{source_dir_name}")
disturbance_dir = join(model_differences_dir, 'disturbance')
intactness_dir = join(model_differences_dir, 'intactness')

# Report the number of entries in each input directory, or that it is missing
for _dir_label, _checked_dir in (
    ("Prediction", prediction_raster_dir),
    ("Disturbance", disturbance_dir),
    ("Intactness", intactness_dir),
):
  if exists(_checked_dir):
    print(f"There are {len(os.listdir(_checked_dir))} rasters in {_checked_dir}")
  else:
    print(f"{_dir_label} directory doesn't exist yet: {_checked_dir}")
    print("Try changing source directory, or re-run previous notebooks")

# Statistics directory for this model / source combination
model_statistics_dir = join(statistics_dir, f"{selected_model}_{source_dir_name}")
makedirs(model_statistics_dir, exist_ok=True)

# List the files available as sample area polygons.
# This should be a single .gpkg with the field 'name' differentiating polygons.
sample_polygons = []
sample_polygons.extend(os.listdir(sample_polygons_dir))
if not sample_polygons:
  print(f"No sample areas found. Upload .gpkg polygons to {sample_polygons_dir}")
else:
  for sample_polygon in sample_polygons:
    print(f"selected_sample_polygons = '{sample_polygon}'")

In [None]:
selected_sample_polygons = 'tekai_sample_polygons.gpkg'

# Path of the cell area raster used for pixel-by-pixel area calculations
cell_area_path = join(areas_dir, "cell_area.tif")

# Read the selected sample polygons
selected_sample_polygons_dir = join(sample_polygons_dir, selected_sample_polygons)
selected_sample_polygons_gpkg = gpd.read_file(selected_sample_polygons_dir)

# Output tree: the last five characters ('.gpkg') are dropped from the file name
sample_polygons_statistics_dir = join(model_statistics_dir, selected_sample_polygons[:-5])
agb_total_raster_dir = join(model_statistics_dir, 'agb_total_rasters')
agb_total_scenario_dir = join(agb_total_raster_dir, 'scenarios')
agb_total_dist_dir = join(agb_total_raster_dir, 'disturbance')
detailed_stats_by_area_dir = join(sample_polygons_statistics_dir, 'detailed_stats_by_area')
detailed_stats_by_scenario_dir = join(sample_polygons_statistics_dir, 'detailed_stats_by_scenario')
detailed_dist_stats_by_area_dir = join(sample_polygons_statistics_dir, 'detailed_dist_stats_by_area')
detailed_dist_stats_by_scenario_dir = join(sample_polygons_statistics_dir, 'detailed_dist_stats_by_scenario')
intactness_stats_dir = join(sample_polygons_statistics_dir, 'intactness')

# More intuitive data structure
report_statistics_dir = join(sample_polygons_statistics_dir, 'report_statistics')

# Create every output directory in one pass
for _output_dir in (
    sample_polygons_statistics_dir,
    agb_total_raster_dir,
    agb_total_scenario_dir,
    agb_total_dist_dir,
    detailed_stats_by_area_dir,
    detailed_stats_by_scenario_dir,
    detailed_dist_stats_by_area_dir,
    detailed_dist_stats_by_scenario_dir,
    intactness_stats_dir,
    report_statistics_dir,
):
    makedirs(_output_dir, exist_ok=True)

# Convert AGBD to AGB total rasters

In [None]:
## Takes account of variation in pixel size across latitude and longitude

def _collect_mean_and_uncertainty_rasters(raster_dir):
    """Return (mean_paths, uncertainty_paths), both sorted, for the .tif files in raster_dir.

    With uncertainty_dir as source, files are split on 'mean__' / 'uncertainty__' in
    their name; otherwise every .tif is treated as a mean raster. A missing directory
    gives two empty lists.
    """
    mean_paths, uncertainty_paths = [], []
    if os.path.exists(raster_dir):
        for tif_name in os.listdir(raster_dir):
            if not tif_name.endswith('.tif'):
                continue
            tif_path = join(raster_dir, tif_name)
            if source_dir != uncertainty_dir:
                # scenarios_dir doesn't contain uncertainty rasters
                mean_paths.append(tif_path)
            elif 'mean__' in tif_name:
                mean_paths.append(tif_path)
            elif 'uncertainty__' in tif_name:
                uncertainty_paths.append(tif_path)
    return sorted(mean_paths), sorted(uncertainty_paths)

# Scenario and disturbance rasters, sorted by path
scenario_mean_rasters, scenario_uncertainty_rasters = _collect_mean_and_uncertainty_rasters(prediction_raster_dir)
dist_mean_rasters, dist_uncertainty_rasters = _collect_mean_and_uncertainty_rasters(disturbance_dir)

# Lookups keyed by the file name of the matching mean raster
scenario_uncertainty_lookup = {
    os.path.basename(uncertainty_path).replace('uncertainty__', 'mean__'): uncertainty_path
    for uncertainty_path in scenario_uncertainty_rasters
}
dist_uncertainty_lookup = {
    os.path.basename(uncertainty_path).replace('uncertainty__', 'mean__'): uncertainty_path
    for uncertainty_path in dist_uncertainty_rasters
}

# Cell area raster is read once and converted from m2 to ha
cell_area_ds = gdal.Open(cell_area_path)
cell_area_array = cell_area_ds.ReadAsArray().astype(np.float64)
cell_area_ha = cell_area_array / 10000

# Function to process a batch of rasters
def _read_agbd_with_mask(raster_path):
    """Read a mean raster and return (array, nodata as int, boolean mask of non-nodata pixels)."""
    raster = gdal.Open(raster_path)
    agbd_array = raster.ReadAsArray()
    nodata = int(raster.GetRasterBand(1).GetNoDataValue())
    return agbd_array, nodata, (agbd_array != nodata)


def _read_uncertainty_proportion(uncertainty_path):
    """Read an uncertainty raster stored as a percentage (0-100) and return it as a proportion (0-1)."""
    uncertainty_array = gdal.Open(uncertainty_path).ReadAsArray().astype(np.float64)
    return uncertainty_array / 100


def process_rasters(raster_paths, uncertainty_lookup, output_dir, is_disturbance=False):
    """Convert mean AGBD rasters (Mg/ha) into per-pixel total AGB rasters (Mg).

    For each path in `raster_paths` the following files are written to `output_dir`,
    each one only if it does not exist yet:
      - agb_total_mg__<name>.tif: AGBD x cell area (ha)
      - agb_total_ci95_mg__<name>.tif: |AGBD| x uncertainty proportion x cell area (ha)
      - agbd_ci95_mgha/agbd_ci95_mgha__<name>.tif: |AGBD| x uncertainty proportion
    The two CI95 rasters are only produced when `source_dir == uncertainty_dir` and the
    mean raster's file name is a key of `uncertainty_lookup`. Nodata pixels of the mean
    raster are written as that raster's nodata value.

    Reads the notebook globals `source_dir`, `uncertainty_dir` and `cell_area_ha`.

    Args:
        raster_paths: Paths of the mean rasters to convert.
        uncertainty_lookup: Dict mapping a mean raster file name to its uncertainty raster path.
        output_dir: Directory receiving the output rasters.
        is_disturbance: Only changes the wording of the progress messages.

    Returns:
        Number of paths in `raster_paths`.
    """
    progress_index = 0
    progress_total = len(raster_paths)
    raster_type = "Disturbance" if is_disturbance else "Scenario"
    progress_label = widgets.Label(f"{raster_type} rasters progress: {progress_index}/{progress_total}")
    display(progress_label)
    print(f"Processing {progress_total} {raster_type.lower()} rasters...")

    for raster_path in raster_paths:
        base_filename = os.path.basename(raster_path)
        # Extract scenario name based on source directory
        if source_dir == uncertainty_dir: raster_name = base_filename.split('__')[1].split('.')[0]
        else: raster_name = base_filename.split('__')[0].split('.')[0]
        output_agb_mg = join(output_dir, f"agb_total_mg__{raster_name}.tif")
        output_agb_ci95_mg = join(output_dir, f"agb_total_ci95_mg__{raster_name}.tif")

        # Per-raster caches. They are reset on every iteration so that data loaded
        # for a previous raster can never be reused for the current one.
        agbd_array = nodata = valid_mask = None
        uncertainty_proportion = None

        # Create total AGB raster if it doesn't exist
        if not os.path.exists(output_agb_mg):
            agbd_array, nodata, valid_mask = _read_agbd_with_mask(raster_path)
            # Total AGB (Mg) = AGBD (Mg/ha) x pixel area (ha)
            total_agb_mg = np.zeros_like(agbd_array, dtype='float64')
            total_agb_mg[valid_mask] = agbd_array[valid_mask] * cell_area_ha[valid_mask]
            total_agb_mg[~valid_mask] = nodata
            export_array_as_tif(total_agb_mg, output_agb_mg, template=raster_path)

        # Process uncertainty if available
        if source_dir == uncertainty_dir and base_filename in uncertainty_lookup:
            uncertainty_path = uncertainty_lookup[base_filename]

            # Total AGB CI95 raster
            if not os.path.exists(output_agb_ci95_mg):
                uncertainty_proportion = _read_uncertainty_proportion(uncertainty_path)
                if agbd_array is None or valid_mask is None:
                    agbd_array, nodata, valid_mask = _read_agbd_with_mask(raster_path)
                # Total AGB CI95 (Mg) = |AGBD| (Mg/ha) x uncertainty proportion x area (ha)
                total_agb_ci95_mg = np.zeros_like(agbd_array, dtype='float64')
                total_agb_ci95_mg[valid_mask] = np.abs(agbd_array[valid_mask]) * uncertainty_proportion[valid_mask] * cell_area_ha[valid_mask]
                total_agb_ci95_mg[~valid_mask] = nodata
                export_array_as_tif(total_agb_ci95_mg, output_agb_ci95_mg, template=raster_path)

            # CI95 per hectare raster, kept in its own subdirectory
            agbd_ci95_mgha_dir = join(output_dir, 'agbd_ci95_mgha')
            os.makedirs(agbd_ci95_mgha_dir, exist_ok=True)
            output_agbd_ci95_mgha = join(agbd_ci95_mgha_dir, f"agbd_ci95_mgha__{raster_name}.tif")

            if not os.path.exists(output_agbd_ci95_mgha):
                # Reuse the uncertainty of THIS raster if it was loaded above
                if uncertainty_proportion is None:
                    uncertainty_proportion = _read_uncertainty_proportion(uncertainty_path)
                if agbd_array is None or valid_mask is None:
                    agbd_array, nodata, valid_mask = _read_agbd_with_mask(raster_path)
                # CI95 per hectare: no area conversion
                agbd_ci95_mgha = np.zeros_like(agbd_array, dtype='float64')
                agbd_ci95_mgha[valid_mask] = np.abs(agbd_array[valid_mask]) * uncertainty_proportion[valid_mask]
                agbd_ci95_mgha[~valid_mask] = nodata
                export_array_as_tif(agbd_ci95_mgha, output_agbd_ci95_mgha, template=raster_path)

        # Update progress
        progress_index += 1
        progress_label.value = f"{raster_type} rasters progress: {progress_index}/{progress_total}"
    return progress_total

# Convert the scenario rasters, then the disturbance rasters
scenario_count = process_rasters(
    scenario_mean_rasters,
    scenario_uncertainty_lookup,
    agb_total_scenario_dir,
)
dist_count = process_rasters(
    dist_mean_rasters,
    dist_uncertainty_lookup,
    agb_total_dist_dir,
    is_disturbance=True,
)

print(f"Processed {scenario_count} scenario rasters and {dist_count} disturbance rasters")

# Scenario statistics

In [None]:
# Scenario names are read from the 'agb_total_mg__<scenario>.tif' file names
scenarios = sorted({
    tif_name.split("agb_total_mg__")[1].split('.')[0]
    for tif_name in os.listdir(agb_total_scenario_dir)
    if tif_name.endswith('.tif') and 'agb_total_mg__' in tif_name
})

# Print a ready-to-copy list of scenarios to calculate statistics for
print('selected_scenarios = [')
for scenario in scenarios:
    print(f'  "{scenario}",')
print(']\n')

In [None]:
# Scenarios to calculate statistics for. Only entries that have a matching
# 'agb_total_mg__<scenario>.tif' in agb_total_scenario_dir are used below.
selected_scenarios = [
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_no_degradation_since_1993",
  "2021_no_disturbance_since_1993",
  "2021_oldgrowth",
  "2021_oldgrowth_1",
  "2021_oldgrowth_2",
  "2021_oldgrowth_all_land",
  "2021_oldgrowth_all_land_1",
  "2021_oldgrowth_all_land_2",
  "2022",
  "2023",
  "2024",
  "2024_no_degradation_since_1996",
  "2024_no_disturbance_since_1996",
  "2024_no_disturbance_since_1997",
  "2024_no_disturbance_since_1998",
  "2024_no_disturbance_since_1999",
  "2024_no_disturbance_since_2000",
  "2024_no_disturbance_since_2001",
  "2024_no_disturbance_since_2002",
  "2024_no_disturbance_since_2003",
  "2024_no_disturbance_since_2004",
  "2024_no_disturbance_since_2005",
  "2024_no_disturbance_since_2006",
  "2024_no_disturbance_since_2007",
  "2024_no_disturbance_since_2008",
  "2024_no_disturbance_since_2009",
  "2024_no_disturbance_since_2010",
  "2024_no_disturbance_since_2011",
  "2024_no_disturbance_since_2012",
  "2024_no_disturbance_since_2013",
  "2024_no_disturbance_since_2014",
  "2024_no_disturbance_since_2015",
  "2024_no_disturbance_since_2016",
  "2024_no_disturbance_since_2017",
  "2024_no_disturbance_since_2018",
  "2024_no_disturbance_since_2019",
  "2024_no_disturbance_since_2020",
  "2024_no_disturbance_since_2021",
  "2024_no_disturbance_since_2022",
  "2024_no_disturbance_since_2023",
  "2024_no_disturbance_since_2024",
  "2024_oldgrowth",
  "2024_oldgrowth_1",
  "2024_oldgrowth_2",
  "2024_oldgrowth_all_land",
  "2024_oldgrowth_all_land_1",
  "2024_oldgrowth_all_land_2",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
]

# Keep the selected scenarios that have an AGB total raster on disk, sorted by path
agb_total_rasters = sorted(
    agb_total_path
    for agb_total_path in (
        join(agb_total_scenario_dir, f"agb_total_mg__{scenario}.tif")
        for scenario in selected_scenarios
    )
    if os.path.exists(agb_total_path)
)

# Uncertainty stats can only be generated when predictions come from uncertainty_dir
generate_uncertainty_stats = (source_dir == uncertainty_dir)

# Result arrays: one row per raster, one column per polygon
polygon_names = [row["name"] for _, row in selected_sample_polygons_gpkg.iterrows()]
n_polygons = len(polygon_names)
n_scenarios = len(agb_total_rasters)
_stats_shape = (n_scenarios, n_polygons)

forest_cover_data = np.zeros(_stats_shape)
agbd_mean_data = np.zeros(_stats_shape)
agbd_stdev_data = np.zeros(_stats_shape)
agb_total_data = np.zeros(_stats_shape)

if generate_uncertainty_stats:
    agbd_mean_ci95_data = np.zeros(_stats_shape)
    agbd_mean_uncertainty_data = np.zeros(_stats_shape)
    agb_total_ci95_data = np.zeros(_stats_shape)

# Open every AGB total raster, keyed by its path
agb_total_datasets = {}
for agb_total_raster_path in agb_total_rasters:
    agb_total_datasets[agb_total_raster_path] = rasterio.open(agb_total_raster_path)

# Open the matching CI95 rasters (where present), keyed by the AGB total raster path
agb_total_ci95_datasets = {}
if generate_uncertainty_stats:
    for agb_total_raster in agb_total_rasters:
        scenario_name = os.path.basename(agb_total_raster).split('agb_total_mg__')[1].split('.')[0]
        agb_total_ci95_path = join(agb_total_scenario_dir, f"agb_total_ci95_mg__{scenario_name}.tif")
        if not os.path.exists(agb_total_ci95_path):
            continue
        agb_total_ci95_datasets[agb_total_raster] = rasterio.open(agb_total_ci95_path)

# Cell area raster is opened once for all calculations
cell_area_dataset = rasterio.open(cell_area_path)

try:
    # One {'Name', 'Area (km^2)'} record per polygon; the dataframe is built once after the loop
    polygon_area_records = []

    # Loop through each polygon to generate statistics
    for poly_idx, (_, row) in enumerate(selected_sample_polygons_gpkg.iterrows()):

        # Define the polygon. Multi-part geometries expose .geoms; a single-part
        # geometry does not, so it is wrapped in a one-element list instead.
        sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
        polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

        # Mask the cell area raster to the polygon once
        cell_area_masked, transform_1 = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)

        # Total area of all pixels within the polygon (m2 -> ha)
        pixel_area_sum_m2 = np.ma.sum(cell_area_masked, dtype='float64')
        pixel_area_sum_ha = pixel_area_sum_m2 / 10000

        # Per-pixel cell areas in ha
        cell_area_masked_ha = cell_area_masked / 10000

        # Polygon area in km2 (ha / 100)
        polygon_area_records.append({'Name': sample_polygon_name, 'Area (km^2)': pixel_area_sum_ha / 100})

        # Loop through AGB total rasters
        for raster_idx, agb_total_raster in enumerate(agb_total_rasters):

            # Mask AGB total raster to polygon
            agb_total = agb_total_datasets[agb_total_raster]
            agb_total_array_masked, transform_2 = msk.mask(agb_total, polygons, crop=True, filled=False)

            # Forest pixels are the unmasked (valid) AGB total pixels
            forest_pixels_mask = ~np.ma.getmaskarray(agb_total_array_masked)

            # Forest area is the sum of the cell areas of the forest pixels
            forest_cell_areas_ha = np.ma.array(cell_area_masked_ha.data, mask=~forest_pixels_mask)
            forest_cover_ha = np.ma.sum(forest_cell_areas_ha, dtype='float64')
            # A fully masked sum means there is no forest pixel: record 0 ha
            if np.ma.is_masked(forest_cover_ha):
                forest_cover_ha = 0.0

            # Sum total AGB in Mg
            agb_total_mg = np.ma.sum(agb_total_array_masked, dtype='float64')

            has_forest = (not np.ma.is_masked(agb_total_mg)) and forest_cover_ha > 0
            if not has_forest:
                agbd_mean_mg_ha = 0.0
                agbd_mean_stdev_ha = 0.0
                agb_total_tg = 0.0
            else:
                # Area-weighted mean AGBD
                agbd_mean_mg_ha = agb_total_mg / forest_cover_ha

                # Back-calculate per-pixel AGBD values for the standard deviation
                agbd_values = agb_total_array_masked / cell_area_masked_ha
                valid_agbd = agbd_values[forest_pixels_mask]
                valid_areas = cell_area_masked_ha[forest_pixels_mask]

                # Area-weighted standard deviation
                variance_weighted = np.sum(valid_areas * (valid_agbd - agbd_mean_mg_ha)**2) / forest_cover_ha
                agbd_mean_stdev_ha = np.sqrt(variance_weighted)

                # Convert total AGB from Mg to Tg
                agb_total_tg = agb_total_mg / 1000000

            # Store results in pre-allocated arrays
            forest_cover_data[raster_idx, poly_idx] = forest_cover_ha
            agbd_mean_data[raster_idx, poly_idx] = agbd_mean_mg_ha
            agbd_stdev_data[raster_idx, poly_idx] = agbd_mean_stdev_ha
            agb_total_data[raster_idx, poly_idx] = agb_total_tg

            if generate_uncertainty_stats and agb_total_raster in agb_total_ci95_datasets:
                # Get total AGB CI95 from pre-calculated raster
                agb_total_ci95_raster = agb_total_ci95_datasets[agb_total_raster]
                agb_total_ci95_array_masked, _ = msk.mask(agb_total_ci95_raster, polygons, crop=True, filled=False)

                # Sum total AGB CI95 in Mg; a fully masked sum is recorded as 0
                agb_total_ci95_mg = abs(np.ma.sum(agb_total_ci95_array_masked, dtype='float64'))
                if np.ma.is_masked(agb_total_ci95_mg):
                    agb_total_ci95_mg = 0.0

                # Uncertainty statistics are only defined where there is forest with non-zero AGB
                if has_forest and abs(agb_total_mg) > 0:
                    agbd_mean_mg_ha_ci95 = agb_total_ci95_mg / forest_cover_ha
                    agbd_mean_mg_ha_uncertainty = (agb_total_ci95_mg / abs(agb_total_mg)) * 100
                else:
                    agbd_mean_mg_ha_ci95 = 0
                    agbd_mean_mg_ha_uncertainty = 0

                # Convert total AGB CI95 from Mg to Tg
                agb_total_tg_ci95 = agb_total_ci95_mg / 1000000

                # Store uncertainty results
                agbd_mean_ci95_data[raster_idx, poly_idx] = agbd_mean_mg_ha_ci95
                agbd_mean_uncertainty_data[raster_idx, poly_idx] = agbd_mean_mg_ha_uncertainty
                agb_total_ci95_data[raster_idx, poly_idx] = agb_total_tg_ci95

    # Polygon area dataframe, built once instead of being grown inside the loop
    df_polygon_area_km2 = pd.DataFrame(polygon_area_records, columns=["Name", "Area (km^2)"], dtype=object)

finally:
    # Close all opened datasets, including when an error interrupted the loop
    cell_area_dataset.close()
    for dataset in agb_total_datasets.values():
        dataset.close()
    for dataset in agb_total_ci95_datasets.values():
        dataset.close()

# Row labels come from the rasters that were actually processed, in the same (sorted)
# order as the rows of the result arrays. Using selected_scenarios directly would fail
# or mislabel rows whenever a selected scenario has no raster on disk.
processed_scenarios = [
    os.path.basename(raster_path).split('agb_total_mg__')[1].split('.')[0]
    for raster_path in agb_total_rasters
]

def _scenario_stats_frame(stat_array):
    """Wrap a (scenario x polygon) result array in a DataFrame indexed by scenario name."""
    return pd.DataFrame(stat_array, index=processed_scenarios, columns=polygon_names).rename_axis('scenario')

# Create DataFrames from pre-allocated arrays
df_forest_cover_ha = _scenario_stats_frame(forest_cover_data)
df_agbd_mean_mg_ha = _scenario_stats_frame(agbd_mean_data)
df_agbd_stdev_mg_ha = _scenario_stats_frame(agbd_stdev_data)
df_agb_total_tg = _scenario_stats_frame(agb_total_data)

if generate_uncertainty_stats:
    df_agbd_mean_mg_ha_ci95 = _scenario_stats_frame(agbd_mean_ci95_data)
    df_agbd_mean_mg_ha_uncertainty = _scenario_stats_frame(agbd_mean_uncertainty_data)
    df_agb_total_tg_ci95 = _scenario_stats_frame(agb_total_ci95_data)

# Create stats list
if generate_uncertainty_stats:
    df_stats_list = [df_forest_cover_ha, df_agbd_mean_mg_ha, df_agbd_mean_mg_ha_ci95,
                     df_agbd_mean_mg_ha_uncertainty, df_agbd_stdev_mg_ha, df_agb_total_tg, df_agb_total_tg_ci95]
else:
    df_stats_list = [df_forest_cover_ha, df_agbd_mean_mg_ha, df_agbd_stdev_mg_ha, df_agb_total_tg]

# Set index of the polygon area km2 dataframe to 'Name' of the polygon
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Generate summary stats: one row per polygon, one column per scenario and statistic
df_forest_cover_ha_t = df_forest_cover_ha.T.rename_axis("Name", axis=1).add_suffix(" forest cover (ha)")
df_agb_total_tg_t = df_agb_total_tg.T.rename_axis("Name", axis=1).add_suffix(" forest AGB (Tg)")

# Use list for efficient concatenation
summary_components = [df_polygon_area_km2, df_forest_cover_ha_t, df_agb_total_tg_t]
if generate_uncertainty_stats:
    df_agb_total_tg_ci95_t = df_agb_total_tg_ci95.T.rename_axis("Name", axis=1).add_suffix(" forest AGB CI95 (Tg)")
    summary_components.append(df_agb_total_tg_ci95_t)

summary_stats = pd.concat(summary_components, axis=1).rename_axis("Name", axis=1)
summary_stats.to_csv(join(sample_polygons_statistics_dir, 'summary_stats.csv'))

# Empty base dataframe sharing the scenario index of the statistics tables
df_base = pd.DataFrame(index=df_forest_cover_ha.index).rename_axis('scenario')

# Pair every statistics table with its column label explicitly. Identifying a table by
# comparing its values would give the wrong label whenever two tables hold equal values
# (for example several all-zero tables).
labelled_stats = [
    ("Forest cover (ha)", df_forest_cover_ha),
    ("Forest AGBD mean (Mg / ha)", df_agbd_mean_mg_ha),
]
if generate_uncertainty_stats:
    labelled_stats += [
        ("Forest AGBD CI95 (Mg / ha)", df_agbd_mean_mg_ha_ci95),
        ("Forest AGBD uncertainty (%)", df_agbd_mean_mg_ha_uncertainty),
    ]
labelled_stats += [
    ("Forest AGBD stdev (Mg / ha)", df_agbd_stdev_mg_ha),
    ("Forest AGB total (Tg)", df_agb_total_tg),
]
if generate_uncertainty_stats:
    labelled_stats.append(("Forest AGB total CI95 (Tg)", df_agb_total_tg_ci95))

# Generate detailed stats by area: one CSV per polygon, one row per scenario
for polygon_area in polygon_names:
    polygon_area_km2 = df_polygon_area_km2.loc[polygon_area]["Area (km^2)"]
    df_detailed_stats = df_base.copy()
    df_detailed_stats["Area (km^2)"] = polygon_area_km2
    for stat_col, df_stats in labelled_stats:
        # Take this polygon's column and append it under the statistic's label
        df_detailed_stats = pd.concat([df_detailed_stats, df_stats[polygon_area].rename(stat_col)], axis=1)
    df_detailed_stats.to_csv(join(detailed_stats_by_area_dir, f'{polygon_area}.csv'))

# Generate detailed stats by scenario - gather each polygon's rows per scenario, then join
_scenario_frames = {}
for stats_csv in os.listdir(detailed_stats_by_area_dir):
    # File name without its last four characters ('.csv') is the polygon name
    polygon_name = stats_csv[:-4]
    stats_csv_df = pd.read_csv(join(detailed_stats_by_area_dir, stats_csv))
    for scenario in stats_csv_df['scenario'].unique():
        is_scenario = stats_csv_df['scenario'] == scenario
        scenario_df = stats_csv_df.loc[is_scenario].drop('scenario', axis=1)
        scenario_df.insert(0, 'Name', polygon_name)
        _scenario_frames.setdefault(scenario, []).append(scenario_df)

# One dataframe per scenario, in order of first appearance
scenarios = {
    scenario: frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
    for scenario, frames in _scenario_frames.items()
}

# Write all scenario CSVs
for scenario, scenario_df in scenarios.items():
    scenario_df.to_csv(join(detailed_stats_by_scenario_dir, f'{scenario}.csv'), index=False)

# Disturbance statistics

In [None]:
# Disturbance names are read from the 'agb_total_mg__<disturbance>.tif' file names
dists = sorted({
    tif_name.split("agb_total_mg__")[1].split('.')[0]
    for tif_name in os.listdir(agb_total_dist_dir)
    if tif_name.endswith('.tif') and 'agb_total_mg__' in tif_name
})

# Print a ready-to-copy list of disturbance rasters to calculate statistics for
print('selected_dists = [')
for dist in dists:
    print(f'  "{dist}",')
print(']')

In [None]:
# Disturbances to calculate statistics for. Only entries that have a matching
# 'agb_total_mg__<disturbance>.tif' in agb_total_dist_dir are used below.
selected_dists = [
  "2021_deforestation_since_1993",
  "2021_deforestation_since_oldgrowth",
  "2021_degradation_since_1993",
  "2021_degradation_since_oldgrowth",
  "2021_disturbance_since_1993",
  "2021_disturbance_since_oldgrowth",
  "2024_deforestation_of_road_mat_daling_2023",
  "2024_deforestation_since_1996",
  "2024_deforestation_since_oldgrowth",
  "2024_degradation_since_1996",
  "2024_degradation_since_oldgrowth",
  "2024_disturbance_since_1996",
  "2024_disturbance_since_1997",
  "2024_disturbance_since_1998",
  "2024_disturbance_since_1999",
  "2024_disturbance_since_2000",
  "2024_disturbance_since_2001",
  "2024_disturbance_since_2002",
  "2024_disturbance_since_2003",
  "2024_disturbance_since_2004",
  "2024_disturbance_since_2005",
  "2024_disturbance_since_2006",
  "2024_disturbance_since_2007",
  "2024_disturbance_since_2008",
  "2024_disturbance_since_2009",
  "2024_disturbance_since_2010",
  "2024_disturbance_since_2011",
  "2024_disturbance_since_2012",
  "2024_disturbance_since_2013",
  "2024_disturbance_since_2014",
  "2024_disturbance_since_2015",
  "2024_disturbance_since_2016",
  "2024_disturbance_since_2017",
  "2024_disturbance_since_2018",
  "2024_disturbance_since_2019",
  "2024_disturbance_since_2020",
  "2024_disturbance_since_2021",
  "2024_disturbance_since_2022",
  "2024_disturbance_since_2023",
  "2024_disturbance_since_2024",
  "2024_disturbance_since_oldgrowth",
  "2024_effect_of_disturbance_in_1996",
  "2024_effect_of_disturbance_in_1997",
  "2024_effect_of_disturbance_in_1998",
  "2024_effect_of_disturbance_in_1999",
  "2024_effect_of_disturbance_in_2000",
  "2024_effect_of_disturbance_in_2001",
  "2024_effect_of_disturbance_in_2002",
  "2024_effect_of_disturbance_in_2003",
  "2024_effect_of_disturbance_in_2004",
  "2024_effect_of_disturbance_in_2005",
  "2024_effect_of_disturbance_in_2006",
  "2024_effect_of_disturbance_in_2007",
  "2024_effect_of_disturbance_in_2008",
  "2024_effect_of_disturbance_in_2009",
  "2024_effect_of_disturbance_in_2010",
  "2024_effect_of_disturbance_in_2011",
  "2024_effect_of_disturbance_in_2012",
  "2024_effect_of_disturbance_in_2013",
  "2024_effect_of_disturbance_in_2014",
  "2024_effect_of_disturbance_in_2015",
  "2024_effect_of_disturbance_in_2016",
  "2024_effect_of_disturbance_in_2017",
  "2024_effect_of_disturbance_in_2018",
  "2024_effect_of_disturbance_in_2019",
  "2024_effect_of_disturbance_in_2020",
  "2024_effect_of_disturbance_in_2021",
  "2024_effect_of_disturbance_in_2022",
  "2024_effect_of_disturbance_in_2023",
  "2024_effect_of_disturbance_in_2024",
]

# Resolve the AGB total raster of every selected disturbance, keep only the
# files that exist, and order the surviving paths alphabetically
candidate_raster_paths = (join(agb_total_dist_dir, f"agb_total_mg__{dist}.tif") for dist in selected_dists)
agb_total_dist_rasters = sorted(path for path in candidate_raster_paths if os.path.exists(path))

# Uncertainty statistics are only generated when the source is the uncertainty directory
generate_uncertainty_stats = source_dir == uncertainty_dir

# Statistics arrays are laid out as (disturbance raster, polygon)
polygon_names = list(selected_sample_polygons_gpkg["name"])
n_polygons, n_dists = len(polygon_names), len(agb_total_dist_rasters)
stats_shape = (n_dists, n_polygons)

agbd_mean_data, agbd_stdev_data, agb_total_data = (np.zeros(stats_shape) for _ in range(3))

if generate_uncertainty_stats:
    agbd_mean_ci95_data, agbd_mean_uncertainty_data, agb_total_ci95_data = (
        np.zeros(stats_shape) for _ in range(3)
    )

# Open every AGB total disturbance raster once, keyed by its path
agb_total_dist_datasets = {}
for raster_path in agb_total_dist_rasters:
    agb_total_dist_datasets[raster_path] = rasterio.open(raster_path)

# Open the matching CI95 rasters (keyed by the AGB total raster path) when
# uncertainty statistics are requested and the CI95 file exists
agb_total_ci95_dist_datasets = {}
if generate_uncertainty_stats:
    for raster_path in agb_total_dist_rasters:
        dist_label = os.path.basename(raster_path).split('agb_total_mg__')[1].split('.')[0]
        ci95_path = join(agb_total_dist_dir, f"agb_total_ci95_mg__{dist_label}.tif")
        if os.path.exists(ci95_path):
            agb_total_ci95_dist_datasets[raster_path] = rasterio.open(ci95_path)

# The cell area raster is shared by all calculations, so it is opened once
cell_area_dataset = rasterio.open(cell_area_path)

try:
    # One area record per polygon; the dataframe is assembled once after the loop
    # instead of being grown row-by-row with pd.concat
    polygon_area_records = []

    # Loop through each polygon to generate statistics
    for poly_idx, (_, row) in enumerate(selected_sample_polygons_gpkg.iterrows()):

        # Define the polygon
        sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
        # Multi-part geometries expose their parts through .geoms; a single-part
        # geometry has no such attribute and is passed to the mask as-is
        polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

        # Mask the cell area raster to the polygon once
        cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)

        # Calculate total area of all pixels within polygon in hectares
        pixel_area_sum_m2 = np.ma.sum(cell_area_masked, dtype='float64')
        pixel_area_sum_ha = pixel_area_sum_m2 / 10000

        # Convert cell areas from m2 to ha
        cell_area_masked_ha = cell_area_masked / 10000

        # Record polygon area (ha -> km^2)
        polygon_area_records.append({'Name': sample_polygon_name, 'Area (km^2)': pixel_area_sum_ha / 100})

        # Loop through AGB total disturbance rasters
        for raster_idx, agb_total_dist_raster in enumerate(agb_total_dist_rasters):

            # Mask AGB total disturbance raster to polygon
            agb_total_dist = agb_total_dist_datasets[agb_total_dist_raster]
            agb_total_array_masked, _ = msk.mask(agb_total_dist, polygons, crop=True, filled=False)

            # Forest pixels are the valid (unmasked) AGB total pixels
            forest_pixels_mask = ~np.ma.getmaskarray(agb_total_array_masked)

            # Calculate forest area by summing cell areas of forest pixels
            forest_cell_areas_ha = np.ma.array(cell_area_masked_ha.data, mask=~forest_pixels_mask)
            forest_cover_ha = np.ma.sum(forest_cell_areas_ha, dtype='float64')

            # Sum total AGB in Mg
            agb_total_mg = np.ma.sum(agb_total_array_masked, dtype='float64')

            # Statistics are only defined when there is at least one valid pixel
            # (otherwise the sum is the masked constant) and a positive forest area
            has_forest_stats = not (np.ma.is_masked(agb_total_mg) or forest_cover_ha <= 0)

            if not has_forest_stats:
                agbd_mean_mg_ha = 0.0
                agbd_mean_stdev_ha = 0.0
                agb_total_tg = 0.0
            else:
                # Calculate area-weighted mean AGBD
                agbd_mean_mg_ha = agb_total_mg / forest_cover_ha

                # Back-calculate individual AGBD values for standard deviation
                agbd_values = agb_total_array_masked / cell_area_masked_ha
                valid_agbd = agbd_values[forest_pixels_mask]
                valid_areas = cell_area_masked_ha[forest_pixels_mask]

                # Calculate area-weighted standard deviation
                variance_weighted = np.sum(valid_areas * (valid_agbd - agbd_mean_mg_ha)**2) / forest_cover_ha
                agbd_mean_stdev_ha = np.sqrt(variance_weighted)

                # Convert total AGB from Mg to Tg
                agb_total_tg = agb_total_mg / 1000000

            # Store results in pre-allocated arrays
            agbd_mean_data[raster_idx, poly_idx] = agbd_mean_mg_ha
            agbd_stdev_data[raster_idx, poly_idx] = agbd_mean_stdev_ha
            agb_total_data[raster_idx, poly_idx] = agb_total_tg

            if generate_uncertainty_stats and agb_total_dist_raster in agb_total_ci95_dist_datasets:
                # Get total AGB CI95 from pre-calculated raster
                agb_total_ci95_raster = agb_total_ci95_dist_datasets[agb_total_dist_raster]
                agb_total_ci95_array_masked, _ = msk.mask(agb_total_ci95_raster, polygons, crop=True, filled=False)

                # Sum total AGB CI95 in Mg
                agb_total_ci95_mg = abs(np.ma.sum(agb_total_ci95_array_masked, dtype='float64'))

                # An all-masked CI95 window sums to the masked constant, which would be
                # stored as NaN; use 0 so it matches the zeroed statistics above
                if np.ma.is_masked(agb_total_ci95_mg):
                    agb_total_ci95_mg = 0.0

                # Calculate uncertainty statistics (same validity guard as the mean,
                # so the CI95 density never divides by a missing or zero forest area)
                if has_forest_stats and abs(agb_total_mg) > 0:
                    agbd_mean_mg_ha_ci95 = agb_total_ci95_mg / forest_cover_ha
                    agbd_mean_mg_ha_uncertainty = agb_total_ci95_mg / abs(agb_total_mg) * 100
                else:
                    agbd_mean_mg_ha_ci95 = 0
                    agbd_mean_mg_ha_uncertainty = 0

                # Convert total AGB CI95 from Mg to Tg
                agb_total_tg_ci95 = agb_total_ci95_mg / 1000000

                # Store uncertainty results
                agbd_mean_ci95_data[raster_idx, poly_idx] = agbd_mean_mg_ha_ci95
                agbd_mean_uncertainty_data[raster_idx, poly_idx] = agbd_mean_mg_ha_uncertainty
                agb_total_ci95_data[raster_idx, poly_idx] = agb_total_tg_ci95

    # Build the polygon area dataframe in one step (object dtype keeps the values as stored)
    df_polygon_area_km2 = pd.DataFrame(polygon_area_records, columns=["Name", "Area (km^2)"], dtype=object)

finally:
    # Close all opened datasets
    cell_area_dataset.close()
    for dataset in agb_total_dist_datasets.values():
        dataset.close()
    for dataset in agb_total_ci95_dist_datasets.values():
        dataset.close()

# The statistics arrays hold one row per entry of agb_total_dist_rasters (sorted,
# and only for rasters that exist), so the row labels are parsed from those paths
# rather than taken from selected_dists, which may differ in order or length
processed_dists = [
    os.path.basename(raster_path).split('agb_total_mg__')[1].split('.')[0]
    for raster_path in agb_total_dist_rasters
]

# Report selected disturbances that produced no statistics row
skipped_dists = [dist for dist in selected_dists if dist not in processed_dists]
if skipped_dists:
    print(f"Skipped disturbances with no AGB total raster: {skipped_dists}")

# Create DataFrames from pre-allocated arrays (rows = disturbances, columns = polygons)
df_agbd_mean_mg_ha = pd.DataFrame(agbd_mean_data, index=processed_dists, columns=polygon_names).rename_axis('dist')
df_agbd_stdev_mg_ha = pd.DataFrame(agbd_stdev_data, index=processed_dists, columns=polygon_names).rename_axis('dist')
df_agb_total_tg = pd.DataFrame(agb_total_data, index=processed_dists, columns=polygon_names).rename_axis('dist')

if generate_uncertainty_stats:
    df_agbd_mean_mg_ha_ci95 = pd.DataFrame(agbd_mean_ci95_data, index=processed_dists, columns=polygon_names).rename_axis('dist')
    df_agbd_mean_mg_ha_uncertainty = pd.DataFrame(agbd_mean_uncertainty_data, index=processed_dists, columns=polygon_names).rename_axis('dist')
    df_agb_total_tg_ci95 = pd.DataFrame(agb_total_ci95_data, index=processed_dists, columns=polygon_names).rename_axis('dist')

# Create stats list (the order here is the column order of the detailed CSVs)
if generate_uncertainty_stats:
    df_stats_list = [df_agbd_mean_mg_ha, df_agbd_mean_mg_ha_ci95, df_agbd_mean_mg_ha_uncertainty, df_agbd_stdev_mg_ha, df_agb_total_tg, df_agb_total_tg_ci95]
else:
    df_stats_list = [df_agbd_mean_mg_ha, df_agbd_stdev_mg_ha, df_agb_total_tg]

# Destination of the per-polygon summary table
summary_stats_path = join(sample_polygons_statistics_dir, 'summary_dist_stats.csv')

# Index the polygon areas by polygon name
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Transpose the totals so polygons are rows and each disturbance becomes a column
df_agb_total_tg_t = df_agb_total_tg.transpose().rename_axis(columns="Name").add_suffix(" forest AGB (Tg)")

# Gather the column blocks, then join them side by side in a single concat
summary_components = [df_polygon_area_km2, df_agb_total_tg_t]
if generate_uncertainty_stats:
    df_agb_total_tg_ci95_t = (
        df_agb_total_tg_ci95.transpose().rename_axis(columns="Name").add_suffix(" forest AGB CI95 (Tg)")
    )
    summary_components += [df_agb_total_tg_ci95_t]

summary_stats = pd.concat(summary_components, axis=1).rename_axis(columns="Name")
summary_stats.to_csv(summary_stats_path)

# Column label of each statistics dataframe, keyed by object identity.
# Identity is used instead of DataFrame.equals because two different statistics
# can hold equal values (e.g. all zeros), which would give them the same label.
stat_col_by_frame_id = {
    id(df_agbd_mean_mg_ha): "Forest AGBD mean (Mg / ha)",
    id(df_agbd_stdev_mg_ha): "Forest AGBD stdev (Mg / ha)",
    id(df_agb_total_tg): "Forest AGB total (Tg)",
}
if generate_uncertainty_stats:
    stat_col_by_frame_id.update({
        id(df_agbd_mean_mg_ha_ci95): "Forest AGBD CI95 (Mg / ha)",
        id(df_agbd_mean_mg_ha_uncertainty): "Forest AGBD uncertainty (%)",
        id(df_agb_total_tg_ci95): "Forest AGB total CI95 (Tg)",
    })

# Empty template sharing the 'dist' index of the statistics dataframes
df_base = pd.DataFrame(index=df_agbd_mean_mg_ha.index)

# Generate detailed stats by polygon: one CSV per polygon with one row per
# disturbance and one column per statistic (in df_stats_list order)
for polygon_area in polygon_names:
    df_detailed_dist_stats = df_base.copy()
    df_detailed_dist_stats["Area (km^2)"] = df_polygon_area_km2.loc[polygon_area, "Area (km^2)"]
    for df_stats in df_stats_list:
        stat_col = stat_col_by_frame_id[id(df_stats)]
        df_detailed_dist_stats[stat_col] = df_stats[polygon_area]
    df_detailed_dist_stats.to_csv(join(detailed_dist_stats_by_area_dir, f'{polygon_area}.csv'))

# Generate detailed stats by disturbance type: re-read the per-polygon CSVs and
# regroup their rows so that each disturbance gets one table with a row per polygon
dist_frames = {}
for stats_csv in os.listdir(detailed_dist_stats_by_area_dir):
    # Only CSV files are per-polygon statistics; skip any other directory entry
    if not stats_csv.endswith('.csv'):
        continue
    polygon_name = stats_csv[:-4]
    stats_csv_df = pd.read_csv(join(detailed_dist_stats_by_area_dir, stats_csv))
    # Split this polygon's rows by disturbance, replacing the 'dist' column with the polygon name
    for dist in stats_csv_df['dist'].unique():
        dist_df = stats_csv_df[stats_csv_df['dist'] == dist].drop(columns='dist')
        dist_df.insert(0, 'Name', polygon_name)
        dist_frames.setdefault(dist, []).append(dist_df)

# Concatenate once per disturbance instead of re-concatenating inside the loop
dists = {dist: pd.concat(frames, ignore_index=True) for dist, frames in dist_frames.items()}

# Write all disturbance CSVs
for dist, dist_df in dists.items():
    output_file_path = join(detailed_dist_stats_by_scenario_dir, f'{dist}.csv')
    dist_df.to_csv(output_file_path, index=False)

# Intactness statistics

In [None]:
# Find every intactness raster below intactness_dir, stored as a path relative to it
percentage_change_rasters = []
intactness_rasters = [
    os.path.relpath(join(folder, filename), intactness_dir)
    for folder, _, filenames in os.walk(intactness_dir)
    for filename in filenames
    if "intactness__" in filename and filename.endswith('tif')
]

# Print a ready-to-paste 'intactness_rasters' list for the next cell
print(
    "# Select intactness raster to calculate statistics",
    "intactness_rasters = [",
    *[f"'{raster_path}'," for raster_path in intactness_rasters],
    "]",
    sep="\n",
)

In [None]:
# Select intactness rasters to calculate statistics.
# Paths are relative to intactness_dir. The first path component (the scenario
# directory) is also used to locate the matching percentage change raster, and the
# '__'-separated filename fields supply the polygon quantiles, baseline and disturbance.
intactness_rasters = [
'2024_oldgrowth_all_land__2024_disturbance_since_oldgrowth/intactness__forest_reserves_10_quantiles__2024_oldgrowth_all_land__2024_disturbance_since_oldgrowth__agbd_tekai_250625_003858.tif',
'2024_no_disturbance_since_1996__2024_disturbance_since_1996/intactness__forest_reserves_10_quantiles__2024_no_disturbance_since_1996__2024_disturbance_since_1996__agbd_tekai_250625_003858.tif',
'2021_oldgrowth_all_land__2021_disturbance_since_oldgrowth/intactness__forest_reserves_10_quantiles__2021_oldgrowth_all_land__2021_disturbance_since_oldgrowth__agbd_tekai_250625_003858.tif',
'2021_no_disturbance_since_1993__2021_disturbance_since_1993/intactness__forest_reserves_10_quantiles__2021_no_disturbance_since_1993__2021_disturbance_since_1993__agbd_tekai_250625_003858.tif',
]

In [None]:
# Per-pixel cell area raster used for area-weighted calculations
cell_area_path = join(areas_dir, "cell_area.tif")

# Non-forest definition toggle
#   True:  non-forest = land pixels with intactness == 0 (water excluded)
#   False: non-forest = every pixel with intactness == 0 (water included)
# True needs the land mask created in the 'Oldgrowth scenarios' section of 6_scenarios.ipynb
restrict_non_forest_to_land_only = True

# Pair every intactness raster with the percentage change raster stored in the
# same scenario directory (the first component of the relative raster path)
intactness_percentage_raster_paths = {}
for intactness_raster in intactness_rasters:
    scenario_dir_name = intactness_raster.split('/')[0]
    matched_percentage_path = join(
        intactness_dir,
        scenario_dir_name,
        f"percentage_change__{scenario_dir_name}__{selected_model}.tif",
    )
    intactness_percentage_raster_paths[join(intactness_dir, intactness_raster)] = matched_percentage_path

# Function to calculate area-weighted statistics
def weighted_stats(values, weights):
    """Return the weighted mean and weighted standard deviation of values.

    Args:
        values: array of observations.
        weights: array of weights, multiplied element-wise with values.

    Returns:
        (None, None) when values is empty, (0, 0) when the weights do not sum
        to a positive number, otherwise (weighted mean, weighted stdev) where
        the variance is the weight-normalised squared deviation from the mean.
    """
    if len(values) == 0:
        return None, None

    weighted_total = np.sum(values * weights)
    weight_total = np.sum(weights)

    # Without positive total weight neither statistic is defined
    if not weight_total > 0:
        return 0, 0

    mean = weighted_total / weight_total
    spread = np.sqrt(np.sum(weights * (values - mean) ** 2) / weight_total)
    return mean, spread

# Function to calculate area for each intactness score
def calculate_score_areas(intactness_masked, cell_area_masked_ha):
    """Sum the cell area covered by each distinct unmasked intactness score.

    Args:
        intactness_masked: masked array of intactness scores.
        cell_area_masked_ha: array of cell areas, indexed with the same
            boolean masks as intactness_masked.

    Returns:
        dict mapping int(score) to the summed cell area of the unmasked
        pixels holding that score; empty when every pixel is masked.
    """
    score_areas = {}
    if np.ma.count(intactness_masked) == 0:
        return score_areas

    # Pixels that carry a valid (unmasked) intactness value
    valid_pixels = ~intactness_masked.mask

    for score in np.unique(intactness_masked.compressed()):
        score_mask = (intactness_masked == score) & valid_pixels
        score_area_ha = np.sum(cell_area_masked_ha[score_mask], dtype='float64') if np.any(score_mask) else 0.0
        score_areas[int(score)] = score_area_ha
    return score_areas

# Pre-open cell area dataset
cell_area_dataset = rasterio.open(cell_area_path)

# Land mask dataset of the raster currently being processed; tracked outside the
# loop so the 'finally' clause can close it if an error interrupts an iteration
all_land_mask_dataset = None

try:
    # One statistics CSV is produced per intactness raster, with one row per polygon
    for intactness_raster, percentage_raster in intactness_percentage_raster_paths.items():
        # Filename fields: intactness__<polygon quantiles>__<baseline>__<disturbance>__...
        raster_name_fields = intactness_raster.split('/')[-1].split('__')
        polygon_quantiles = raster_name_fields[1]
        baseline = raster_name_fields[2]
        disturbance = raster_name_fields[3]
        current_year = disturbance.split('_')[0]

        # Dynamic land mask path based on intactness raster year
        if restrict_non_forest_to_land_only:
            all_land_mask_path = join(mask_dir, f"mask_forest_{current_year}_oldgrowth_all_land.tif")
            if not exists(all_land_mask_path):
                raise FileNotFoundError(f"Land mask not found: {all_land_mask_path}. "
                                      f"Please create this mask in the 'Oldgrowth scenarios' section of 6_scenarios.ipynb")
            all_land_mask_dataset = rasterio.open(all_land_mask_path)

        intactness_csv_name = f"{polygon_quantiles}__{baseline}__{disturbance}.csv"
        intactness_csv_path = join(intactness_stats_dir, intactness_csv_name)
        # The second-to-last '_' token of the quantiles field is the maximum score
        total_score = int(polygon_quantiles.split('_')[-2])
        total_stdev = int(total_score / 2)

        # Fixed leading columns; per-score area columns are appended as they appear
        base_columns = [
            "Name",
            "Percentage change (remaining forest) mean",
            "Percentage change (remaining forest) stdev",
            "Percentage change (non-forest = -100) mean",
            "Percentage change (non-forest = -100) stdev",
            f"Intactness (remaining forest) mean / {total_score}",
            f"Intactness (remaining forest) stdev / {total_stdev}",
            f"Intactness (non-forest = 0) mean / {total_score}",
            f"Intactness (non-forest = 0) stdev / {total_stdev}"
        ]

        # One record per polygon; the dataframe is built once after the polygon loop
        intactness_records = []

        for index, row in selected_sample_polygons_gpkg.iterrows():

            # Define the polygon
            sample_polygon_geometry = row["geometry"]
            sample_polygon_name = row["name"]
            # Multi-part geometries expose their parts through .geoms; a single-part
            # geometry has no such attribute and is passed to the mask as-is
            polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

            # Read & mask intactness to polygon
            with rasterio.open(intactness_raster) as src:
                intactness_masked, _ = msk.mask(src, polygons, crop=True, filled=False)

            # No valid intactness data: record empty statistics and move to the next polygon
            if np.ma.count(intactness_masked) == 0:
                empty_record = {column: None for column in base_columns}
                empty_record['Name'] = sample_polygon_name
                intactness_records.append(empty_record)
                continue

            # Mask the cell area raster to the polygon using pre-opened dataset
            cell_area_masked, _ = msk.mask(cell_area_dataset, polygons, crop=True, filled=False)

            # Convert cell areas from m2 to ha
            cell_area_masked_ha = cell_area_masked / 10000

            # Plain boolean masks: valid pixels, forest (intactness > 0) and
            # candidate non-forest (intactness == 0)
            valid_intactness = ~np.ma.getmaskarray(intactness_masked)
            forest_mask = valid_intactness & (intactness_masked.data > 0)
            potential_non_forest_mask = valid_intactness & (intactness_masked.data == 0)

            if restrict_non_forest_to_land_only:
                # Restrict non-forest to valid land mask pixels equal to 1 (excludes water)
                all_land_mask_data, _ = msk.mask(all_land_mask_dataset, polygons, crop=True, filled=False)
                all_land_mask = ~np.ma.getmaskarray(all_land_mask_data) & (all_land_mask_data.data == 1)
                non_forest_mask = potential_non_forest_mask & all_land_mask
            else:
                # Include all pixels with intactness == 0 (including water)
                non_forest_mask = potential_non_forest_mask

            # Forest and non-forest areas as plain floats. Summing the selected cells
            # directly yields 0.0 for an empty selection, whereas a masked sum yields
            # the masked constant, which would propagate into the all-land statistics.
            forest_area_ha_sum = float(np.sum(cell_area_masked_ha.data[forest_mask], dtype='float64'))
            non_forest_area_ha_sum = float(np.sum(cell_area_masked_ha.data[non_forest_mask], dtype='float64'))

            # Mask percentage change raster to the polygon
            with rasterio.open(percentage_raster) as percent_change:
                percent_change_masked, _ = msk.mask(percent_change, polygons, crop=True, filled=False)

            if forest_area_ha_sum > 0:
                # Values and area weights of the forest pixels (same pixel order in both)
                forest_percent_values = percent_change_masked.data[forest_mask]
                forest_percent_weights = cell_area_masked_ha.data[forest_mask]

                # Area-weighted percentage change statistics for remaining forest
                percent_change_forest_mean, percent_change_forest_std = weighted_stats(
                    forest_percent_values, forest_percent_weights
                )

                # For all land including non-forest land (treated as -100% change)
                if non_forest_area_ha_sum > 0:
                    all_mean_denominator = forest_area_ha_sum + non_forest_area_ha_sum
                    all_mean_numerator = np.sum(forest_percent_values * forest_percent_weights)
                    all_mean_numerator += non_forest_area_ha_sum * (-100.0)
                    percent_change_all_mean = all_mean_numerator / all_mean_denominator

                    # Weighted variance around the all-land mean
                    forest_variance_contribution = np.sum(
                        forest_percent_weights * np.square(forest_percent_values - percent_change_all_mean)
                    )
                    non_forest_variance_contribution = non_forest_area_ha_sum * np.square((-100.0) - percent_change_all_mean)
                    all_variance = (forest_variance_contribution + non_forest_variance_contribution) / all_mean_denominator
                    percent_change_all_std = np.sqrt(all_variance)
                else:
                    # If no non-forest land area, all-land stats are the same as forest stats
                    percent_change_all_mean = percent_change_forest_mean
                    percent_change_all_std = percent_change_forest_std

                # Remaining-forest intactness (only forest pixels)
                forest_intact_vals = intactness_masked.data[forest_mask]
                forest_intact_weights = cell_area_masked_ha.data[forest_mask]
                intactness_remaining_mean, intactness_remaining_std = weighted_stats(
                    forest_intact_vals, forest_intact_weights
                )

                # All-land intactness (non-forest land = 0)
                total_land = forest_area_ha_sum + non_forest_area_ha_sum
                intactness_all_mean = np.sum(forest_intact_vals * forest_intact_weights) / total_land

                # Variance: forest + non-forest land contributions
                var_forest = np.sum(forest_intact_weights * np.square(forest_intact_vals - intactness_all_mean))
                var_nonforest = non_forest_area_ha_sum * np.square(0 - intactness_all_mean)
                intactness_all_std = np.sqrt((var_forest + var_nonforest) / total_land)
            else:
                # No forest: forest stats are undefined, all-land stats are -100% change / zero intactness
                percent_change_forest_mean = percent_change_forest_std = None
                percent_change_all_mean = -100.0
                percent_change_all_std = 0.0
                intactness_remaining_mean = intactness_remaining_std = None
                intactness_all_mean = 0.0
                intactness_all_std = 0.0

            # Calculate area for each intactness score
            score_areas = calculate_score_areas(intactness_masked, cell_area_masked_ha)

            # Record all statistics including score areas
            new_row_dict = {
                'Name': sample_polygon_name,
                'Percentage change (remaining forest) mean': percent_change_forest_mean,
                'Percentage change (remaining forest) stdev': percent_change_forest_std,
                'Percentage change (non-forest = -100) mean': percent_change_all_mean,
                'Percentage change (non-forest = -100) stdev': percent_change_all_std,
                f'Intactness (remaining forest) mean / {total_score}': intactness_remaining_mean,
                f'Intactness (remaining forest) stdev / {total_stdev}': intactness_remaining_std,
                f'Intactness (non-forest = 0) mean / {total_score}': intactness_all_mean,
                f'Intactness (non-forest = 0) stdev / {total_stdev}': intactness_all_std,
            }
            for score, area in score_areas.items():
                new_row_dict[f'Intactness score {score} area (ha)'] = area

            intactness_records.append(new_row_dict)

        # Score area columns follow the fixed columns in order of first appearance
        score_columns = []
        for record in intactness_records:
            for column in record:
                if column not in base_columns and column not in score_columns:
                    score_columns.append(column)

        # Build the dataframe once, index by polygon name and save to CSV
        df_intactness_stats = pd.DataFrame(
            intactness_records, columns=base_columns + score_columns, dtype=object
        ).set_index('Name')
        df_intactness_stats.to_csv(intactness_csv_path)
        print(f"Saved statistics to {intactness_csv_path}")

        # Close this raster's land mask before moving on to the next raster
        if all_land_mask_dataset is not None:
            all_land_mask_dataset.close()
            all_land_mask_dataset = None

finally:
    # Close any land mask left open by an interrupted iteration, then the static dataset
    if all_land_mask_dataset is not None:
        all_land_mask_dataset.close()
    cell_area_dataset.close()

# Report statistics

In [None]:
# Reduces statistics to a more specific and intuitive format.

# Define scenarios for report
print("# Remember that order matters\n")
print("scenario_list = [")
for csv in os.listdir(detailed_stats_by_scenario_dir):
  print(f"'{csv[:-4]}',")
print("]")
print("")

disturbance_csv_files = [f[:-4] for f in os.listdir(detailed_dist_stats_by_scenario_dir) if f.endswith('.csv')]

def get_disturbance_type(filename):
    """Return the sort rank of a disturbance file name.

    Ranks: 1 = degradation (the fallback), 2 = deforestation,
    3 = combined degradation_deforestation.
    """
    # The combined marker must be tested first because it also contains 'deforestation'.
    ranked_markers = (
        ('degradation_deforestation', 3),
        ('deforestation', 2),
    )
    for marker, rank in ranked_markers:
        if marker in filename:
            return rank
    return 1

def _year_sort_key(token):
    """Sort key for a year token that tolerates non-numeric values.

    Numeric tokens are ordered by their integer value. Any other token (for
    example the suffix of a '..._since_oldgrowth' file name) ranks below every
    numeric year instead of raising ValueError as a plain int() key would.
    """
    try:
        return (1, int(token), '')
    except ValueError:
        return (0, 0, token)

# Sort by year, then disturbance type
files_by_category = {}
for file in disturbance_csv_files:
    year = file.split('_')[0]  # leading token of the file name
    dist_type = get_disturbance_type(file)
    files_by_category.setdefault((year, dist_type), []).append(file)
print("disturbance_list = [")

# Process each category in order
for key in sorted(files_by_category, key=lambda k: (_year_sort_key(k[0]), k[1])):
    files = files_by_category[key]
    # First add 'total' files
    total_files = [f for f in files if '_total' in f]
    for file in total_files:
        print(f"    '{file}',")
    # Group remaining files by reference year (trailing token of the file name)
    ref_year_files = {}
    for file in files:
        if '_total' in file:
            continue
        ref_year_files.setdefault(file.split('_')[-1], []).append(file)
    # Process each reference year (highest first), placing 'since' before 'before'.
    # Files containing neither '_since_' nor '_before_' are not printed.
    for ref_year in sorted(ref_year_files, key=_year_sort_key, reverse=True):
        year_files = ref_year_files[ref_year]
        since_files = [f for f in year_files if '_since_' in f]
        before_files = [f for f in year_files if '_before_' in f]
        # Add 'since' files normally
        for file in since_files:
            print(f"    '{file}',")
        # Add 'before' files commented out
        for file in before_files:
            print(f"    # '{file}',")

print("]")

In [None]:
# Remember that order matters

# Scenarios exported to the report CSVs, in output column order
scenario_list = [
'2018',
'2024',
# '2024_oldgrowth',
# '2024_oldgrowth_all_land',
# '2024_alternate_degradation_2018',
# '2024_no_degradation_since_1996',
# '2024_no_degradation_since_2019',
]

# Disturbance scenarios exported to the report CSVs, in output column order
disturbance_list = [
    # '2024_degradation_total',
    '2024_degradation_since_2019',
    # '2024_degradation_before_2019',
    # '2024_degradation_since_1996',
    # '2024_degradation_before_1996',
    # '2024_deforestation_total',
    '2024_deforestation_since_2019',
    # '2024_deforestation_before_2019',
    # '2024_degradation_deforestation_total',
    '2024_degradation_deforestation_since_2019',
    # '2024_degradation_deforestation_before_2019',
]

# Year used for the forest cover column and as the prefix of the output file names
report_year = '2024'

# Use the last scenario in scenario_list whose name contains 'all_land', if any
all_land_scenarios = [scenario for scenario in scenario_list if 'all_land' in scenario]
all_land_scenario = all_land_scenarios[-1] if all_land_scenarios else None
if all_land_scenario is None: print("No all land scenario exists in the detailed stats.")

# Read summary stats
summary_stats_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_stats.csv'))
summary_dist_stats_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_dist_stats.csv'))

# CI95 columns are only exported when source_dir is the uncertainty directory
include_ci95 = source_dir == uncertainty_dir

# Create attributes CSV
attributes = pd.DataFrame()
attributes['Name'] = summary_stats_df['Unnamed: 0']
attributes['Area (km^2)'] = summary_stats_df['Area (km^2)']
attributes[f'{report_year} forest cover (ha)'] = summary_stats_df[f'{report_year} forest cover (ha)']
if all_land_scenario:
  attributes[f'{all_land_scenario} forest cover (ha)'] = summary_stats_df[f'{all_land_scenario} forest cover (ha)']
attributes.to_csv(join(report_statistics_dir, f'{report_year}_attributes.csv'), index=False)

# Create scenarios total AGB CSV (all value columns first, then all CI95 columns)
scenarios_total_agb = pd.DataFrame()
scenarios_total_agb['Name'] = summary_stats_df['Unnamed: 0']
for scenario in scenario_list:
  scenarios_total_agb[f'{scenario} forest AGB (Tg)'] = summary_stats_df[f'{scenario} forest AGB (Tg)']
if include_ci95:
  for scenario in scenario_list:
    scenarios_total_agb[f'{scenario} forest AGB CI95 (Tg)'] = summary_stats_df[f'{scenario} forest AGB CI95 (Tg)']
scenarios_total_agb.to_csv(join(report_statistics_dir, f'{report_year}_scenarios_total_agb.csv'), index=False)

# Create scenarios AGBD CSV
# Each per-scenario CSV is read once and reused for the mean and CI95 columns.
# NOTE(review): values are joined to 'Name' by row position, which assumes the
# per-scenario CSVs list polygons in the same order as summary_stats.csv - confirm.
scenario_detailed_stats = {
  scenario: pd.read_csv(join(detailed_stats_by_scenario_dir, f'{scenario}.csv'))
  for scenario in scenario_list
}
scenarios_agbd = pd.DataFrame()
scenarios_agbd['Name'] = summary_stats_df['Unnamed: 0']
for scenario in scenario_list:
  scenarios_agbd[f'{scenario} forest AGBD (Mg / ha)'] = scenario_detailed_stats[scenario]['Forest AGBD mean (Mg / ha)']
if include_ci95:
  for scenario in scenario_list:
    scenarios_agbd[f'{scenario} forest AGBD CI95 (Mg / ha)'] = scenario_detailed_stats[scenario]['Forest AGBD CI95 (Mg / ha)']
scenarios_agbd.to_csv(join(report_statistics_dir, f'{report_year}_scenarios_agbd.csv'), index=False)

# Create disturbance total AGB CSV
disturbance_total_agb = pd.DataFrame()
disturbance_total_agb['Name'] = summary_dist_stats_df['Unnamed: 0']
for disturbance in disturbance_list:
  disturbance_total_agb[f'{disturbance} forest AGB (Tg)'] = summary_dist_stats_df[f'{disturbance} forest AGB (Tg)']
if include_ci95:
  for disturbance in disturbance_list:
    disturbance_total_agb[f'{disturbance} forest AGB CI95 (Tg)'] = summary_dist_stats_df[f'{disturbance} forest AGB CI95 (Tg)']
disturbance_total_agb.to_csv(join(report_statistics_dir, f'{report_year}_disturbance_total_agb.csv'), index=False)

# Create disturbance AGBD CSV
# Names come from the disturbance summary, consistent with the disturbance total AGB table.
disturbance_detailed_stats = {
  disturbance: pd.read_csv(join(detailed_dist_stats_by_scenario_dir, f'{disturbance}.csv'))
  for disturbance in disturbance_list
}
disturbance_agbd = pd.DataFrame()
disturbance_agbd['Name'] = summary_dist_stats_df['Unnamed: 0']
for disturbance in disturbance_list:
  disturbance_agbd[f'{disturbance} forest AGBD (Mg / ha)'] = disturbance_detailed_stats[disturbance]['Forest AGBD mean (Mg / ha)']
if include_ci95:
  for disturbance in disturbance_list:
    disturbance_agbd[f'{disturbance} forest AGBD CI95 (Mg / ha)'] = disturbance_detailed_stats[disturbance]['Forest AGBD CI95 (Mg / ha)']
disturbance_agbd.to_csv(join(report_statistics_dir, f'{report_year}_disturbance_agbd.csv'), index=False)

print("Report statistics completed.")

# Sankey plots

In [None]:
# Define and create directories
sankey_labelled = join(sample_polygons_statistics_dir, 'sankey_labelled')
sankey_unlabelled = join(sample_polygons_statistics_dir, 'sankey_unlabelled')
sankey_labelled_svg = join(sample_polygons_statistics_dir, 'sankey_labelled_svg')
sankey_unlabelled_svg = join(sample_polygons_statistics_dir, 'sankey_unlabelled_svg')

# The loop variable is not called `dir`: that would shadow the builtin for the rest of the session
for sankey_dir in [sankey_labelled, sankey_unlabelled, sankey_labelled_svg, sankey_unlabelled_svg]:
    makedirs(sankey_dir, exist_ok=True)

# Load the CSV files
summary_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_stats.csv'))
summary_dist_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_dist_stats.csv'))

# Check that all rows in both .csv files have the same strings (polygon areas) in column A
polygon_areas_stats = summary_stats.iloc[:, 0]
polygon_areas_dist_stats = summary_dist_stats.iloc[:, 0]

# Lengths are compared first: comparing Series with different row counts raises
# a ValueError before the assertion message below could be shown.
polygon_areas_match = (
    len(polygon_areas_stats) == len(polygon_areas_dist_stats)
    and bool((polygon_areas_stats.to_numpy() == polygon_areas_dist_stats.to_numpy()).all())
)
assert polygon_areas_match, "Polygon areas do not match between the two CSV files."

# Print columns relevant for sankey diagram configuration

# Filter for AGB columns only (exclude forest cover and CI95 for initial selection)
summary_agb_cols = [col for col in summary_stats.columns[1:] if 'forest AGB (Tg)' in col and 'CI95' not in col]
dist_agb_cols = [col for col in summary_dist_stats.columns[1:] if 'forest AGB (Tg)' in col and 'CI95' not in col]

print("=== summary_stats.csv AGB columns ===")
print("(for old_growth_agb_column and current_agb_column)\n")

# Group by category
# NOTE(review): only scenarios named exactly 2022, 2023 or 2024 are listed as
# "current year"; extend the tuple when working with other years.
current_year_cols = [col for col in summary_agb_cols if col.startswith(('2024 ', '2023 ', '2022 '))]
oldgrowth_cols = [col for col in summary_agb_cols if 'oldgrowth' in col and not col.endswith(('_1 forest AGB (Tg)', '_2 forest AGB (Tg)'))]
no_disturbance_cols = [col for col in summary_agb_cols if 'no_disturbance' in col]
no_degradation_cols = [col for col in summary_agb_cols if 'no_degradation' in col]

# Each group is printed as a heading followed by a numbered list of its columns
summary_groups = [
    ("Current year scenarios:", current_year_cols),
    ("\nOld-growth scenarios:", oldgrowth_cols),
    ("\nNo disturbance scenarios:", no_disturbance_cols),
    ("\nNo degradation scenarios:", no_degradation_cols),
]
for heading, group_cols in summary_groups:
    print(heading)
    for i, col in enumerate(group_cols, 1):
        print(f"  {i:2d}. {col}")

print("\n" + "="*50)
print("=== summary_dist_stats.csv AGB columns ===")
print("(for degradation/deforestation since/total columns)\n")

# Group disturbance columns (a column may appear in more than one group,
# e.g. names containing both 'degradation' and 'deforestation')
degradation_cols = [col for col in dist_agb_cols if 'degradation' in col]
deforestation_cols = [col for col in dist_agb_cols if 'deforestation' in col]
disturbance_cols = [col for col in dist_agb_cols if 'disturbance' in col and 'effect' not in col]

disturbance_groups = [
    ("Degradation columns:", degradation_cols),
    ("\nDeforestation columns:", deforestation_cols),
    ("\nDisturbance (combined) columns:", disturbance_cols),
]
for heading, group_cols in disturbance_groups:
    print(heading)
    for i, col in enumerate(group_cols, 1):
        print(f"  {i:2d}. {col}")

In [None]:
# Configuration for the sankey diagrams drawn by the loop later in this cell.

# Plot degradation and deforestation separately
separate_disturbance = True
# Plot degradation before and since a date separately
separate_degradation = True
# Plot deforestation before and since a date separately
separate_deforestation = True
# Plot total disturbance before and since a date separately (when separate_disturbance is False)
separate_disturbance_temporal = False

# DPI (default is 96, output image will scale accordingly)
dpi = 300
# Relative width modifier (ratio, e.g. 0.5 or 2)
width_modifier = 0.85

# Title (polygon area), density and label variables (weight of 800 ~ bold, 400 ~ normal)
show_title = True
show_density = True
show_labels = True
left_axis_label = True
svg_transparent_background = True
title_font_size = 20
title_font_weight = 600
density_font_size = 17
density_font_weight = 600
label_font_size = 17
label_font_weight = 600

# Base columns and year (summary_stats)
old_growth_agb_column = '2021_oldgrowth_all_land forest AGB (Tg)'
current_agb_column = '2021 forest AGB (Tg)'
# The year label is the text before the first space of the current AGB column name
current_year = current_agb_column.split(' ')[0]

# Disturbance columns (summary_dist_stats)
# Only 'since' and 'total' columns are configured; the plotting loop derives
# each 'before' value as total minus since.
degradation_since_column = '2021_degradation_since_1993 forest AGB (Tg)'
degradation_total_column = '2021_degradation_since_oldgrowth forest AGB (Tg)'
deforestation_since_column = '2021_deforestation_since_1993 forest AGB (Tg)'
deforestation_total_column = '2021_deforestation_since_oldgrowth forest AGB (Tg)'
disturbance_since_column = '2021_disturbance_since_1993 forest AGB (Tg)'
disturbance_total_column = '2021_disturbance_since_oldgrowth forest AGB (Tg)'

# Node labels and colours
# The years in these labels are plain text; keep them in step with the columns chosen above.
remaining_name = f'Remaining in {current_year}:'
remaining_colour = '#007fff'
degradation_before_name = 'Degradation loss before 1993'
degradation_before_colour = '#1a801a'
degradation_since_name = 'Degradation loss since 1993'
degradation_since_colour = '#8dc00d'
degradation_total_name = 'Degradation loss'
degradation_total_colour = '#8dc00d'
deforestation_before_name = 'Deforestation loss before 1993'
deforestation_before_colour = '#ffffff'
deforestation_since_name = 'Deforestation loss since 1993'
deforestation_since_colour = '#ffff00'
deforestation_total_name = 'Deforestation loss'
deforestation_total_colour = '#ffffff'
disturbance_before_name = 'Disturbance loss before 1993'
disturbance_before_colour = '#cccccc'
disturbance_since_name = 'Disturbance loss since 1993'
disturbance_since_colour = '#999999'
disturbance_total_name = 'Disturbance loss'
disturbance_total_colour = '#ffffff'

# Validate separation settings
# The per-type temporal splits require the degradation/deforestation split, and
# the combined temporal split is only allowed when that split is disabled.
assert not separate_degradation or separate_disturbance, "separate_disturbance must be True if separate_degradation is True."
assert not separate_deforestation or separate_disturbance, "separate_disturbance must be True if separate_deforestation is True."
assert not separate_disturbance_temporal or not separate_disturbance, "separate_disturbance must be False if separate_disturbance_temporal is True."

# Function to get values from statistics
def get_value(df, idx, column_name):
    """Return df.loc[idx, column_name] as a float, defaulting to 0.0.

    A null cell yields 0.0. A KeyError during the lookup prints a
    "column not found" message and also yields 0.0.
    """
    try:
        cell = df.loc[idx, column_name]
    except KeyError:
        print(f"Column '{column_name}' not found in the dataframe.")
        return 0.0
    if pd.isnull(cell):
        return 0.0
    return float(cell)

def _sankey_annotation(text, y, font_size, font_weight):
    """Return a left-aligned paper-coordinate annotation placed above the plot."""
    return dict(x=0, y=y, xref='paper', yref='paper', text=text, showarrow=False, xanchor='left', align='left',
                font=dict(family="arial, sans serif", size=font_size, color="black", weight=font_weight))


def _build_sankey_figure(node_labels, node_colors, sources, targets, values, link_colors, annotations=None):
    """Create a sankey figure with the shared node, link and layout styling.

    Layout settings (width_modifier, label font size and weight) are read from
    the configuration variables defined earlier in this cell.

    Args:
        node_labels: Label for each node (empty strings hide the labels).
        node_colors: Colour for each node.
        sources, targets, values: Link definitions passed to go.Sankey.
        link_colors: Colour for each link.
        annotations: Optional list of layout annotations; left out of the
            layout when None.

    Returns:
        The configured plotly Figure.
    """
    figure = go.Figure(data=[go.Sankey(
        arrangement="freeform",
        node=dict(label=node_labels, color=node_colors, pad=15, thickness=20, line=dict(color="black", width=1)),
        link=dict(source=sources, target=targets, value=values, color=link_colors, line=dict(color="black", width=1))
    )])
    layout = dict(
        width=700 * width_modifier, height=500,
        font=dict(family="arial, sans serif", size=label_font_size, color="black", weight=label_font_weight),
        margin=dict(l=25, r=25, t=115, b=25),
    )
    if annotations is not None:
        layout['annotations'] = annotations
    figure.update_layout(**layout)
    return figure


def _save_sankey_figure(figure, png_dir, svg_dir, polygon_name):
    """Write the figure as a PNG, then as an SVG.

    When svg_transparent_background is True the figure background is switched
    to transparent before the SVG is written and is left that way afterwards.
    """
    figure.write_image(join(png_dir, f'sankey_diagram_{polygon_name}.png'), scale=dpi / 96)
    if svg_transparent_background:
        figure.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
    figure.write_image(join(svg_dir, f'sankey_diagram_vector_{polygon_name}.svg'), scale=dpi / 96)


# Scenario names are the text before the first space of the configured column names
old_growth_scenario = old_growth_agb_column.split(' ')[0]
current_scenario = current_agb_column.split(' ')[0]

# Loop through each row (polygon area)
for idx in summary_stats.index:
    polygon_name = summary_stats.iloc[idx, 0]

    # Get old-growth and current AGB values
    old_growth_agb = get_value(summary_stats, idx, old_growth_agb_column)
    current_agb = get_value(summary_stats, idx, current_agb_column)

    # Get disturbance values and calculate before values
    degradation_since = get_value(summary_dist_stats, idx, degradation_since_column)
    degradation_total = get_value(summary_dist_stats, idx, degradation_total_column)
    degradation_before = degradation_total - degradation_since

    deforestation_since = get_value(summary_dist_stats, idx, deforestation_since_column)
    deforestation_total = get_value(summary_dist_stats, idx, deforestation_total_column)
    deforestation_before = deforestation_total - deforestation_since

    disturbance_since = get_value(summary_dist_stats, idx, disturbance_since_column)
    disturbance_total = get_value(summary_dist_stats, idx, disturbance_total_column)
    disturbance_before = disturbance_total - disturbance_since

    # Statistical assertions (reported as printed warnings; they never stop the loop)
    if separate_degradation:
        discrepancy = abs(degradation_before + degradation_since - degradation_total)
        if discrepancy >= 1e-6:
            print(f"{polygon_name}: degradation_before + degradation_since != degradation_total (discrepancy: {discrepancy:.6e})")
    if separate_deforestation:
        discrepancy = abs(deforestation_before + deforestation_since - deforestation_total)
        if discrepancy >= 1e-6:
            print(f"{polygon_name}: deforestation_before + deforestation_since != deforestation_total (discrepancy: {discrepancy:.6e})")
    if separate_disturbance:
        discrepancy = abs(degradation_total + deforestation_total - disturbance_total)
        if discrepancy >= 1e-6:
            print(f"{polygon_name}: degradation_total + deforestation_total != disturbance_total (discrepancy: {discrepancy:.6e})")
    discrepancy = abs(current_agb - disturbance_total - old_growth_agb)
    if discrepancy >= 1e-6:
        print(f"{polygon_name}: current_agb - disturbance_total != old_growth_agb (discrepancy: {discrepancy:.6e})")
        print("Note: Constraining degradation floor to disturbance or capping disturbances to 0 can break equality when amalgamating across areas")

    # Load detailed stats for AGBD and CI95 values
    # .item() raises if a scenario is missing from, or duplicated in, the per-area CSV
    detailed_stats_df = pd.read_csv(join(detailed_stats_by_area_dir, f"{polygon_name}.csv"))
    old_growth_index = detailed_stats_df.index[detailed_stats_df['scenario'] == old_growth_scenario].item()
    current_index = detailed_stats_df.index[detailed_stats_df['scenario'] == current_scenario].item()

    old_growth_mean_agbd = get_value(detailed_stats_df, old_growth_index, "Forest AGBD mean (Mg / ha)")
    current_mean_agbd = get_value(detailed_stats_df, current_index, "Forest AGBD mean (Mg / ha)")

    # CI95 values are only shown when the per-area CSV contains the CI95 column
    uncertainty = 'Forest AGB total CI95 (Tg)' in detailed_stats_df.columns
    if uncertainty:
        old_growth_agb_ci95 = get_value(detailed_stats_df, old_growth_index, "Forest AGB total CI95 (Tg)")
        old_growth_mean_agbd_ci95 = get_value(detailed_stats_df, old_growth_index, "Forest AGBD CI95 (Mg / ha)")
        current_agb_ci95 = get_value(detailed_stats_df, current_index, "Forest AGB total CI95 (Tg)")
        current_mean_agbd_ci95 = get_value(detailed_stats_df, current_index, "Forest AGBD CI95 (Mg / ha)")

    # Build title and subtitle text
    title_name = f"{polygon_name}"

    if uncertainty:
        subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} ± {old_growth_mean_agbd_ci95:.1f} Mg / ha"
        subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} ± {current_mean_agbd_ci95:.1f} Mg / ha"
        left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} ± {old_growth_agb_ci95:.2f} Tg" if left_axis_label else ''
        remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} ± {current_agb_ci95:.2f} Tg"
    else:
        subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} Mg / ha"
        subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} Mg / ha"
        left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} Tg" if left_axis_label else ''
        remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} Tg"

    # Choose the loss segments (label, AGB change, colour) based on separation settings
    if separate_disturbance:
        loss_segments = []
        if separate_degradation:
            loss_segments += [
                (degradation_before_name, degradation_before, degradation_before_colour),
                (degradation_since_name, degradation_since, degradation_since_colour),
            ]
        else:
            loss_segments.append((degradation_total_name, degradation_total, degradation_total_colour))
        if separate_deforestation:
            loss_segments += [
                (deforestation_before_name, deforestation_before, deforestation_before_colour),
                (deforestation_since_name, deforestation_since, deforestation_since_colour),
            ]
        else:
            loss_segments.append((deforestation_total_name, deforestation_total, deforestation_total_colour))
    elif separate_disturbance_temporal:
        loss_segments = [
            (disturbance_before_name, disturbance_before, disturbance_before_colour),
            (disturbance_since_name, disturbance_since, disturbance_since_colour),
        ]
    else:
        loss_segments = [(disturbance_total_name, disturbance_total, disturbance_total_colour)]

    # Node 0 is the old-growth source; every loss segment and the remaining AGB is a target of it
    nodes = [left_axis] + [name for name, _, _ in loss_segments] + [remaining_name_agb]
    # Loss statistics are negated to give the link values
    values = [-change for _, change, _ in loss_segments] + [current_agb]
    colors = [colour for _, _, colour in loss_segments] + [remaining_colour]
    sources = [0] * len(values)
    targets = list(range(1, len(values) + 1))
    node_colors = [remaining_colour] + colors

    # Add percentages of old-growth AGB to the target node labels. Skipped when
    # old-growth AGB is zero (get_value returns 0.0 for missing or null data),
    # which would otherwise raise ZeroDivisionError.
    if old_growth_agb != 0:
        for i, val in enumerate(values, start=1):
            nodes[i] += f" ({abs(val) / old_growth_agb * 100:.0f}%)"
    else:
        print(f"{polygon_name}: old-growth AGB is 0, percentages omitted from node labels.")

    # Configure title and density annotations
    title_and_density = []
    if show_title:
        title_and_density.append(_sankey_annotation(title_name, 1.28, title_font_size, title_font_weight))
    if show_density:
        title_and_density.append(_sankey_annotation(subtitle_1_name, 1.19, density_font_size, density_font_weight))
        title_and_density.append(_sankey_annotation(subtitle_2_name, 1.11, density_font_size, density_font_weight))

    # Remove labels if toggled off
    if not show_labels:
        nodes = [''] * len(nodes)

    # Create sankey diagram and save labelled versions
    fig = _build_sankey_figure(nodes, node_colors, sources, targets, values, colors, annotations=title_and_density)
    _save_sankey_figure(fig, sankey_labelled, sankey_labelled_svg, polygon_name)

    # Create and save unlabelled versions (no node labels, no title or density annotations)
    fig_unlabelled = _build_sankey_figure([''] * len(nodes), node_colors, sources, targets, values, colors)
    _save_sankey_figure(fig_unlabelled, sankey_unlabelled, sankey_unlabelled_svg, polygon_name)

    print(f"Statistical assertions and sankey diagram complete for {polygon_name}.")

    # Display figure with white background
    fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
    fig.show()

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# Calls unassign() on google.colab's runtime module; presumably this disconnects
# and releases the Colab runtime, so run it only after outputs are saved - TODO confirm.
runtime.unassign()