<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/dev/8_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Subdirectories

In [None]:
# Define base directory
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi_asartr"
# base_dir = '/content/drive/MyDrive/masfi_asartr'

# Mount Google Drive at the mount point that matches the base_dir prefix
from google.colab import drive
import os
import sys

if base_dir.startswith('/content/drive/MyDrive/'):
    # Personal drive: mount, then make sure the project folder exists
    drive.mount('/content/drive', force_remount=True)
    os.makedirs(base_dir, exist_ok=True)
elif base_dir.startswith('/gdrive/Shareddrives/'):
    # Shared drive: only mounted here, the folder itself is not created
    drive.mount('/gdrive', force_remount=True)
else:
    print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Add the resolved base directory to the module search path (once)
_path_to_add = os.path.realpath(base_dir)
if not (_path_to_add in sys.path):
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Imports and upgrades
!pip install geopandas
!pip install kaleido
!pip install rasterio

In [None]:
# Reload imports, replacing those in the cache
# %load_ext autoreload
# %autoreload 2
# Imports
import geopandas as gpd
from google.colab import runtime
import math
import numpy as np
from os.path import exists, join
from os import makedirs
from osgeo import gdal
import pandas as pd
import plotly.graph_objects as go
import rasterio
from rasterio import mask as msk

In [None]:
# Project subdirectories located directly under base_dir
areas_dir = join(base_dir, "1_areas")
scenarios_dir = join(base_dir, "6_scenarios")
predictions_dir = join(base_dir, "7_predictions")
statistics_dir = join(base_dir, "8_statistics")

# Nested subdirectories
mask_dir = join(scenarios_dir, "scenario_masks")
sample_polygons_dir = join(statistics_dir, "sample_polygons")

# Create the statistics directories if they do not exist yet
for _output_dir in (statistics_dir, sample_polygons_dir):
    makedirs(_output_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress):
  """Write input_array to output_tif as a single-band Float32 GeoTIFF.

  The raster size, geotransform and projection are copied from the first band
  of the template raster.

  Args:
    input_array: array written to band 1 of the output raster.
    output_tif: path of the GeoTIFF to create.
    template: path of the raster that supplies size, geotransform and projection.
    nodatavalue: value registered as nodata on the output band.
    compress: if truthy, create the file with DEFLATE compression
      (PREDICTOR=2, ZLEVEL=9); otherwise create it without creation options.

  Raises:
    RuntimeError: if the template raster cannot be opened.
  """
  template_dataset = gdal.Open(template)
  if template_dataset is None:
    raise RuntimeError(f"Could not open template raster: {template}")
  template_band = template_dataset.GetRasterBand(1)
  template_dimensions, template_projection = template_dataset.GetGeoTransform(), template_dataset.GetProjection()

  # A single truthiness test, so every value of compress selects exactly one branch
  # (the previous pair of independent tests created no dataset for e.g. compress=None).
  creation_options = ["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"] if compress else []
  output_dataset = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, bands=1,
                                                        eType=gdal.GDT_Float32, options=creation_options)
  output_band = output_dataset.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  output_dataset.SetGeoTransform(template_dimensions)
  output_dataset.SetProjection(template_projection)

  # Flush pending writes and drop the references so the file is closed before returning
  output_dataset.FlushCache()
  output_band = None
  output_dataset = None
  template_band = None
  template_dataset = None

# Select model, area and sample polygons

In [None]:
# Choose where predictions are read from: predictions_dir or scenarios_dir
source_dir = predictions_dir
# source_dir = scenarios_dir

# Short label for the source: the text after the last underscore of the path, plus '_dir'
source_dir_name = f"{source_dir.split('_')[-1]}_dir"

# Print every available model as a ready-to-copy assignment
for model_subdir in os.listdir(source_dir):
    if 'scenario_masks' in model_subdir:
        continue
    print(f"selected_model = '{model_subdir}'")

In [None]:
selected_model = 'agbd_historic_250429_223033'

selected_model_dir = join(source_dir, selected_model)
# Print the available prediction areas as ready-to-copy assignments
for area_entry in os.listdir(selected_model_dir):
    # Scenario source: every entry that is not a .csv or .json
    if source_dir == scenarios_dir and not area_entry.endswith(('.csv', '.json')):
        print(f"prediction_area = '{area_entry}'")
    # Prediction source: every entry except 'model_iterations', minus its first 10 characters
    if source_dir == predictions_dir and area_entry != 'model_iterations':
        print(f"prediction_area = '{area_entry[10:]}'")

In [None]:
prediction_area = 'asartr'

# Statistics directory for this model / source / area combination
model_area_statistics_dir = join(statistics_dir, f"{selected_model}_{source_dir_name}_{prediction_area}")
makedirs(model_area_statistics_dir, exist_ok=True)

# Select sample area polygons. This should be a single .gpkg with the field 'name' differentiating polygons.
sample_polygons = list(os.listdir(sample_polygons_dir))

# Print each available file as a ready-to-copy assignment, or a hint if there are none
if sample_polygons:
    for sample_polygon in sample_polygons:
        print(f"selected_sample_polygons = '{sample_polygon}'")
else:
    print(f"No sample areas found. Upload .gpkg polygons to {sample_polygons_dir}")

In [None]:
selected_sample_polygons = 'asartr_phase_1.gpkg'

# Set whether to adjust area calculations to match rasters (True) or to match polygon areas (False).
# The raster approach (True) uses the precise pixel area calculated in 1_areas.ipynb.
adjust_polygon_to_raster = True

# Path of the cell area raster used for pixel-by-pixel area calculations
cell_area_path = join(areas_dir, "cell_area.tif")

# Read the selected sample polygons
selected_sample_polygons_dir = join(sample_polygons_dir, selected_sample_polygons)
selected_sample_polygons_gpkg = gpd.read_file(selected_sample_polygons_dir)

# Output directories for this set of sample polygons.
# The parent is named after the file with its last five characters ('.gpkg') removed.
sample_polygons_statistics_dir = join(model_area_statistics_dir, selected_sample_polygons[:-5])
detailed_stats_by_area_dir = join(sample_polygons_statistics_dir, 'detailed_stats_by_area')
detailed_stats_by_scenario_dir = join(sample_polygons_statistics_dir, 'detailed_stats_by_scenario')
forecast_input_dir = join(sample_polygons_statistics_dir, 'forecast_input')
detailed_dist_stats_by_area_dir = join(sample_polygons_statistics_dir, 'detailed_dist_stats_by_area')
detailed_dist_stats_by_scenario_dir = join(sample_polygons_statistics_dir, 'detailed_dist_stats_by_scenario')
intactness_stats_dir = join(sample_polygons_statistics_dir, 'intactness')
report_statistics_dir = join(sample_polygons_statistics_dir, 'report_statistics')

# Create the parent first, then each subdirectory
for _stats_dir in (
    sample_polygons_statistics_dir,
    detailed_stats_by_area_dir,
    detailed_stats_by_scenario_dir,
    forecast_input_dir,
    detailed_dist_stats_by_area_dir,
    detailed_dist_stats_by_scenario_dir,
    intactness_stats_dir,
    report_statistics_dir,
):
    makedirs(_stats_dir, exist_ok=True)

# Scenario statistics

In [None]:
# Locate the prediction rasters for the chosen source. Rasters must already be masked (e.g. to forest).
if source_dir == scenarios_dir:
    prediction_raster_dir = join(selected_model_dir, prediction_area, 'scenario_predictions')
if source_dir == predictions_dir:
    prediction_raster_dir = join(selected_model_dir, f'scenarios_{prediction_area}', 'statistics_masked')

# Collect the full raster paths and the scenario name embedded in each filename
prediction_raster_dirs = []
scenarios = set()
for prediction_raster in os.listdir(prediction_raster_dir):
    prediction_raster_dirs.append(join(prediction_raster_dir, prediction_raster))
    name_parts = prediction_raster.split("__")
    # The scenario is the second '__' field for predictions_dir and the first for scenarios_dir
    if source_dir == predictions_dir:
        scenarios.add(name_parts[1])
    if source_dir == scenarios_dir:
        scenarios.add(name_parts[0])
scenarios = sorted(scenarios)

# Print the scenarios as a ready-to-copy list
print('selected_scenarios = [')
for scenario in scenarios:
    print(f'  "{scenario}",')
print(']\n')

In [None]:
# Scenarios to calculate statistics for. Uncomment an entry to include it.
# Each active entry is compared with the scenario field of the raster filenames
# when the rasters are filtered further down in this cell.
selected_scenarios = [
  # "1990_oldgrowth",
  # "2014",
  # "2014_no_degradation_since_1991",
  # "2014_oldgrowth",
  # "2015",
  # "2016",
  # "2017",
  # "2018",
  # "2019",
  # "2020",
  # "2021",
  # "2021_oldgrowth",
  # "2022",
  # "2022_alternate_degradation_2021",
  # "2022_no_degradation_since_2022",
  # "2022_oldgrowth",
  # "2023",
  # "2023_alternate_degradation_2022",
  # "2023_no_degradation_since_2023",
  # "2023_oldgrowth",
  "2024",
  # "2024_alternate_degradation_2014",
  # "2024_alternate_degradation_2021",
  # "2024_alternate_degradation_2022",
  # "2024_alternate_degradation_2023",
  # "2024_no_degradation_since_2000",
  # "2024_no_degradation_since_2015",
  # "2024_no_degradation_since_2022",
  # "2024_no_degradation_since_2023",
  # "2024_no_degradation_since_2024",
  # "2024_oldgrowth",
  # "2024_oldgrowth_1",
  "2024_road",
  # "all_oldgrowth",
]

# Filter to selected scenarios, and separate prediction and uncertainty rasters.
# Rasters are collected in the order of selected_scenarios because the statistics
# dataframes below are indexed by selected_scenarios and are filled positionally,
# one value per prediction raster. Sorting the paths instead can reorder them
# relative to that index (e.g. "2024_oldgrowth_1__" sorts before "2024_oldgrowth__"
# because '1' < '_'), which would assign values to the wrong scenario.
prediction_rasters = []
uncertainty_rasters = []
for scenario in selected_scenarios:
  scenario_prediction_rasters = []
  for prediction_raster in prediction_raster_dirs:
    raster_name_parts = prediction_raster.split('/')[-1].split('__')
    if source_dir == predictions_dir:
      if scenario != raster_name_parts[1]: continue
      if 'mean__' in prediction_raster: scenario_prediction_rasters.append(prediction_raster)
      if 'uncertainty__' in prediction_raster: uncertainty_rasters.append(prediction_raster)
    else: # If the source directory is scenarios_dir (without uncertainty values)
      if scenario == raster_name_parts[0]: scenario_prediction_rasters.append(prediction_raster)
  # Each index entry needs exactly one value per polygon; fail here with a clear
  # message rather than later with a length mismatch when the columns are built.
  if len(scenario_prediction_rasters) != 1:
    raise ValueError(f"Expected exactly one prediction raster for scenario '{scenario}' "
                     f"in {prediction_raster_dir}, found {len(scenario_prediction_rasters)}.")
  prediction_rasters.extend(scenario_prediction_rasters)

# Create lookup dictionary for faster uncertainty matching.
# Keys are the filename of the corresponding 'mean__' raster.
uncertainty_lookup = {}
for uncertainty_raster in uncertainty_rasters:
  base_name = os.path.basename(uncertainty_raster).replace('uncertainty__', 'mean__')
  uncertainty_lookup[base_name] = uncertainty_raster

# Toggle whether to predict uncertainty stats (only when uncertainty rasters were found)
generate_uncertainty_stats = bool(uncertainty_rasters)

# Generate empty dataframes for statistics, indexed by scenario
df_base = pd.DataFrame(index = selected_scenarios)
df_base.rename_axis('scenario', inplace=True)
df_forest_cover_ha, df_agbd_mean_mg_ha, df_agbd_stdev_mg_ha, df_agb_total_tg = df_base.copy(), df_base.copy(), df_base.copy(), df_base.copy()
# If uncertainty rasters are present, generate empty dataframes
if generate_uncertainty_stats:
  df_agbd_mean_mg_ha_ci95, df_agbd_mean_mg_ha_uncertainty, df_agb_total_tg_ci95 = df_base.copy(), df_base.copy(), df_base.copy()

# Initialise polygon area dataframe
df_polygon_area_km2 = pd.DataFrame(columns = ["Name", "Area (km^2)"])

# Loop through each polygon stored in GPKG to generate statistics.
# For every polygon, one value per prediction raster is collected (in the order of
# prediction_rasters) and added as a new column, named after the polygon, to each
# statistics dataframe.
for index, row in selected_sample_polygons_gpkg.iterrows():

  # Define the polygon
  sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
  # Multi-part geometries expose their parts through .geoms. Single-part geometries
  # (e.g. a plain Polygon) have no .geoms, so they are wrapped in a list instead of
  # raising an AttributeError.
  polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

  # Calculate sample_polygon_geometry area (ha) after reprojecting to an estimated UTM CRS
  sample_polygons_crs = selected_sample_polygons_gpkg.crs
  temp_gdf = gpd.GeoDataFrame({'name': [sample_polygon_name], 'geometry': [sample_polygon_geometry]}, crs=sample_polygons_crs)
  temp_gdf_utm = temp_gdf.estimate_utm_crs()
  polygon_area_ha = np.divide(temp_gdf.to_crs(temp_gdf_utm).area.iloc[0], 10000, dtype='float64')

  # Mask the cell area raster to the polygon once for efficiency
  with rasterio.open(cell_area_path) as cell_area:
    cell_area_masked, transform_1 = msk.mask(cell_area, polygons, crop=True, filled=False)

  # Calculate total area of all pixels within polygon in hectares
  pixel_area_sum_m2 = np.ma.sum(cell_area_masked, dtype='float64')
  pixel_area_sum_ha = np.divide(pixel_area_sum_m2, 10000, dtype='float64')

  # Calculate ratio between polygon and raster areas
  area_ratio = np.divide(polygon_area_ha, pixel_area_sum_ha, dtype='float64')

  # Apply area adjustment based on setting
  if adjust_polygon_to_raster:
    # Adjust polygon area to match raster area
    adjusted_polygon_area_ha = pixel_area_sum_ha
    # No adjustment needed for pixel values - keep original cell areas
    adjusted_cell_area_masked = cell_area_masked
  else:
    # Keep original polygon area and adjust individual pixel areas
    adjusted_polygon_area_ha = polygon_area_ha
    # Adjust each pixel area by the area ratio to maintain total equal to polygon area
    adjusted_cell_area_masked = np.multiply(cell_area_masked, area_ratio, dtype='float64')

  # Convert adjusted cell areas from m² to ha
  adjusted_cell_area_masked_ha = np.divide(adjusted_cell_area_masked, 10000, dtype='float64')

  # Add polygon area to dataframe (ha / 100 = km^2)
  new_row = pd.DataFrame([{'Name': sample_polygon_name, 'Area (km^2)': adjusted_polygon_area_ha / 100}], dtype=object)
  df_polygon_area_km2 = pd.concat([df_polygon_area_km2, new_row], ignore_index=True, sort=False)

  # Create empty lists for each prediction raster statistic
  values_forest_cover_ha, values_agbd_mean_mg_ha, values_agbd_stdev_mg_ha, values_agb_total_tg = [], [], [], []

  # If uncertainty rasters are present, create empty lists for uncertainty statistics
  if generate_uncertainty_stats:
    values_agbd_mean_mg_ha_ci95, values_agbd_mean_mg_ha_uncertainty, values_agb_total_tg_ci95 = [], [], []

  # Loop through prediction rasters
  for prediction_raster in prediction_rasters:

    # Mask feature to sample_polygon_geometry
    with rasterio.open(prediction_raster) as prediction:
      # NOTE(review): rebinds the module-level nodatavalue; it is not used again in this loop
      nodatavalue = int(prediction.nodatavals[0])
      prediction_array_masked, transform_2 = msk.mask(prediction, polygons, crop=True, filled=False)

    # Extract forest pixels (non-masked pixels in prediction array)
    forest_pixels_mask = ~np.ma.getmaskarray(prediction_array_masked)

    # Calculate forest area by summing adjusted cell areas of forest pixels
    forest_cell_areas_ha = np.ma.array(adjusted_cell_area_masked_ha.data, mask=~forest_pixels_mask)
    forest_cover_ha = np.ma.sum(forest_cell_areas_ha, dtype='float64')

    # Calculate total biomass by multiplying AGBD by pixel area for each pixel, then summing
    pixel_biomass_mg = np.multiply(prediction_array_masked, forest_cell_areas_ha, dtype='float64')

    # Sum to get total biomass in Mg
    agb_total_mg = np.ma.sum(pixel_biomass_mg, dtype='float64')

    # Calculate mean AGBD (Mg/ha) by dividing total biomass by forest area.
    # Guarded against a polygon without forest pixels, matching the disturbance statistics.
    agbd_mean_mg_ha = np.divide(agb_total_mg, forest_cover_ha, dtype='float64') if forest_cover_ha > 0 else 0

    # Calculate standard deviation - using unweighted for now
    agbd_mean_stdev_ha = np.ma.std(prediction_array_masked, dtype='float64')

    # Convert total AGB from Mg to Tg
    agb_total_tg = np.divide(agb_total_mg, 1000000, dtype='float64')

    # Append results to statistics lists
    values_forest_cover_ha.append(forest_cover_ha)
    values_agbd_mean_mg_ha.append(agbd_mean_mg_ha)
    values_agbd_stdev_mg_ha.append(agbd_mean_stdev_ha)
    values_agb_total_tg.append(agb_total_tg)

    if generate_uncertainty_stats:
      # Use lookup dictionary to quickly find matching uncertainty raster
      prediction_basename = os.path.basename(prediction_raster)
      uncertainty_raster_present = prediction_basename in uncertainty_lookup

      if not uncertainty_raster_present:
        print(f"There is no uncertainty raster for {prediction_basename}")
        # Keep the uncertainty lists the same length as the scenario index, so that
        # building the dataframe columns below does not fail on a length mismatch.
        values_agbd_mean_mg_ha_ci95.append(np.nan)
        values_agbd_mean_mg_ha_uncertainty.append(np.nan)
        values_agb_total_tg_ci95.append(np.nan)
      else:
        matching_uncertainty_raster = uncertainty_lookup[prediction_basename]

        # Open and mask uncertainty raster to polygon
        with rasterio.open(matching_uncertainty_raster) as uncertainty:
          nodatavalue = int(uncertainty.nodatavals[0])
          uncertainty_array_masked, transform_2 = msk.mask(uncertainty, polygons, crop=True, filled=False)

        # Calculate uncertainty with area-weighted approach (raster values are divided by 100)
        uncertainty_ratios = np.divide(uncertainty_array_masked, 100, dtype='float64')
        # Multiply uncertainty ratio by AGBD by pixel area
        pixel_ci95_mg = np.multiply(np.multiply(prediction_array_masked, uncertainty_ratios, dtype='float64'),
                                    forest_cell_areas_ha, dtype='float64')

        # Sum to get total CI95 for the area (in Mg)
        agb_total_ci95_mg = np.ma.sum(pixel_ci95_mg, dtype='float64')

        # Calculate the mean CI95 as percentage of mean AGBD
        if abs(agb_total_mg) > 0:  # Use absolute value for division check
          # Uncertainty percentage is relative to absolute value
          agb_total_ci95_percent = np.divide(abs(agb_total_ci95_mg), abs(agb_total_mg), dtype='float64') * 100
          # Preserve sign of original measurement
          sign = np.sign(agb_total_mg)
          agbd_mean_mg_ha_ci95 = sign * np.multiply(abs(agbd_mean_mg_ha), np.divide(agb_total_ci95_percent, 100, dtype='float64'), dtype='float64')
        else:
          agb_total_ci95_percent = 0
          agbd_mean_mg_ha_ci95 = 0

        # Store uncertainty as percentage
        agbd_mean_mg_ha_uncertainty = agb_total_ci95_percent

        # Calculate total AGB CI95 in Tg - preserve sign
        agb_total_tg_ci95 = np.divide(agb_total_ci95_mg, 1000000, dtype='float64')

        # Append results to statistics list
        values_agbd_mean_mg_ha_ci95.append(agbd_mean_mg_ha_ci95)
        values_agbd_mean_mg_ha_uncertainty.append(agbd_mean_mg_ha_uncertainty)
        values_agb_total_tg_ci95.append(agb_total_tg_ci95)

  # Concatenate new columns to the main DataFrames for each statistic
  df_forest_cover_ha = pd.concat([df_forest_cover_ha, pd.DataFrame({sample_polygon_name: values_forest_cover_ha}, index=df_forest_cover_ha.index)], axis=1)
  df_agbd_mean_mg_ha = pd.concat([df_agbd_mean_mg_ha, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha}, index=df_agbd_mean_mg_ha.index)], axis=1)
  df_agbd_stdev_mg_ha = pd.concat([df_agbd_stdev_mg_ha, pd.DataFrame({sample_polygon_name: values_agbd_stdev_mg_ha}, index=df_agbd_stdev_mg_ha.index)], axis=1)
  df_agb_total_tg = pd.concat([df_agb_total_tg, pd.DataFrame({sample_polygon_name: values_agb_total_tg}, index=df_agb_total_tg.index)], axis=1)

  if generate_uncertainty_stats:
    df_agbd_mean_mg_ha_ci95 = pd.concat([df_agbd_mean_mg_ha_ci95, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_ci95}, index=df_agbd_mean_mg_ha_ci95.index)], axis=1)
    df_agbd_mean_mg_ha_uncertainty = pd.concat([df_agbd_mean_mg_ha_uncertainty, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_uncertainty}, index=df_agbd_mean_mg_ha_uncertainty.index)], axis=1)
    df_agb_total_tg_ci95 = pd.concat([df_agb_total_tg_ci95, pd.DataFrame({sample_polygon_name: values_agb_total_tg_ci95}, index=df_agb_total_tg_ci95.index)], axis=1)

# Pair every statistics dataframe with the column label it gets in the detailed stats.
# The label is stored explicitly rather than recovered by comparing dataframe contents,
# because two statistics with identical values (e.g. all zeros) would otherwise be
# given the same label.
stat_frames = [(df_forest_cover_ha, "Forest cover (ha)"),
               (df_agbd_mean_mg_ha, "Forest AGBD mean (Mg / ha)")]
if generate_uncertainty_stats:
  stat_frames += [(df_agbd_mean_mg_ha_ci95, "Forest AGBD CI95 (Mg / ha)"),
                  (df_agbd_mean_mg_ha_uncertainty, "Forest AGBD uncertainty (%)")]
stat_frames += [(df_agbd_stdev_mg_ha, "Forest AGBD stdev (Mg / ha)"),
                (df_agb_total_tg, "Forest AGB total (Tg)")]
if generate_uncertainty_stats:
  stat_frames.append((df_agb_total_tg_ci95, "Forest AGB total CI95 (Tg)"))

# Create stats list (same order as the detailed stats columns)
df_stats_list = [stat_frame for stat_frame, _ in stat_frames]

# Set index of the polygon area km2 dataframe to 'Name' of the polygon
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Export statistics for forecast input.
# Only scenarios without an underscore in their name are kept; the output filename
# is paired with its dataframe explicitly.
df_forecast_list = [df_forest_cover_ha, df_agb_total_tg]
for df_filename, df_forecast in zip(["forest_cover_ha", "agb_total_tg"], df_forecast_list):
  df_noalts = df_forecast[~df_forecast.index.str.contains("_")]
  df_noalts.to_csv(join(forecast_input_dir, f'{df_filename}.csv'))

# Generate summary stats: one row per polygon, one column per scenario and statistic
df_forest_cover_ha_t = df_forest_cover_ha.T.rename_axis("Name", axis=1).add_suffix(" forest cover (ha)")
df_agb_total_tg_t = df_agb_total_tg.T.rename_axis("Name", axis=1).add_suffix(" forest AGB (Tg)")

# Use list for more efficient concatenation
summary_components = [df_polygon_area_km2, df_forest_cover_ha_t, df_agb_total_tg_t]
if generate_uncertainty_stats:
  df_agb_total_tg_ci95_t = df_agb_total_tg_ci95.T.rename_axis("Name", axis=1).add_suffix(" forest AGB CI95 (Tg)")
  summary_components.append(df_agb_total_tg_ci95_t)

summary_stats = pd.concat(summary_components, axis=1).rename_axis("Name", axis=1)
summary_stats.to_csv(join(sample_polygons_statistics_dir, 'summary_stats.csv'))

# Generate detailed stats by area: one CSV per polygon, one row per scenario
for polygon_area in df_stats_list[0]:
  polygon_area_km2 = df_polygon_area_km2.loc[polygon_area]["Area (km^2)"]
  df_detailed_stats = df_base.copy()
  df_detailed_stats["Area (km^2)"] = polygon_area_km2
  for df_stats, stat_col in stat_frames:
    # Take this polygon's column from the statistic and name it after the statistic
    df_detailed_stats = pd.concat([df_detailed_stats, df_stats[polygon_area].rename(stat_col)], axis=1)
  df_detailed_stats.to_csv(join(detailed_stats_by_area_dir, f'{polygon_area}.csv'))

# Generate detailed stats by scenario - build dictionary once then process.
# Every CSV found in detailed_stats_by_area_dir is read, so files that were
# already in that directory before this cell ran are included as well.
scenarios = {}
for stats_csv in os.listdir(detailed_stats_by_area_dir):
  polygon_name = f"{stats_csv[:-4]}"
  stats_csv_path = join(detailed_stats_by_area_dir, stats_csv)
  stats_csv_df = pd.read_csv(stats_csv_path)
  # Process all scenarios for this polygon in one pass
  for scenario in stats_csv_df['scenario'].unique():
    scenario_df = stats_csv_df[stats_csv_df['scenario'] == scenario].copy()
    scenario_df.drop('scenario', axis=1, inplace=True)
    scenario_df.insert(0, 'Name', polygon_name)
    if scenario in scenarios:
      scenarios[scenario] = pd.concat([scenarios[scenario], scenario_df], ignore_index=True)
    else:
      scenarios[scenario] = scenario_df

# Write all scenario CSVs at once
for scenario, scenario_df in scenarios.items():
  output_file_path = join(detailed_stats_by_scenario_dir,f'{scenario}.csv')
  scenario_df.to_csv(output_file_path, index=False)

# Disturbance statistics

In [None]:
# Locate the disturbance rasters for the chosen source
if source_dir == scenarios_dir:
    dist_raster_dir = join(selected_model_dir, prediction_area, 'scenario_disturbance')
if source_dir == predictions_dir:
    dist_raster_dir = join(selected_model_dir, f'scenarios_{prediction_area}', 'scenario_disturbance')

# Collect the full raster paths and the disturbance name embedded in each filename
dist_raster_dirs = []
dists = set()
for dist_raster in os.listdir(dist_raster_dir):
    dist_raster_dirs.append(join(dist_raster_dir, dist_raster))
    dist_name_parts = dist_raster.split("__")
    # The disturbance is the second '__' field for predictions_dir and the first for scenarios_dir
    if source_dir == predictions_dir:
        dists.add(dist_name_parts[1])
    if source_dir == scenarios_dir:
        dists.add(dist_name_parts[0])
dists = sorted(dists)

# Print the disturbances as a ready-to-copy list
print('selected_dists = [')
for dist in dists:
    print(f'  "{dist}",')
print(']')

In [None]:
# Disturbance rasters to calculate statistics for. Uncomment an entry to include it.
# Each active entry is compared with the disturbance field of the raster filenames
# when the rasters are filtered further down in this cell.
selected_dists = [
  # "2014_deforestation_total",
  # "2014_degradation_before_1991",
  # "2014_degradation_deforestation_total",
  # "2014_degradation_since_1991",
  # "2014_degradation_total",
  # "2015_change_2014",
  # "2016_change_2015",
  # "2017_change_2016",
  # "2018_change_2017",
  # "2019_change_2018",
  # "2020_change_2019",
  # "2021_change_2020",
  # "2021_deforestation_total",
  # "2021_degradation_deforestation_total",
  # "2021_degradation_total",
  # "2022_change_2021",
  # "2022_deforestation_before_2022",
  # "2022_deforestation_since_2022",
  # "2022_deforestation_total",
  # "2022_degradation_before_2022",
  # "2022_degradation_deforestation_before_2022",
  # "2022_degradation_deforestation_since_2022",
  # "2022_degradation_deforestation_total",
  # "2022_degradation_since_2022",
  # "2022_degradation_total",
  # "2023_change_2022",
  # "2023_deforestation_before_2023",
  # "2023_deforestation_since_2023",
  # "2023_deforestation_total",
  # "2023_degradation_before_2023",
  # "2023_degradation_deforestation_before_2023",
  # "2023_degradation_deforestation_since_2023",
  # "2023_degradation_deforestation_total",
  # "2023_degradation_since_2023",
  # "2023_degradation_total",
  # "2024_change_2014",
  # "2024_change_2023",
  # "2024_deforestation_before_2015",
  # "2024_deforestation_before_2022",
  # "2024_deforestation_before_2023",
  # "2024_deforestation_before_2024",
  # "2024_deforestation_since_2015",
  # "2024_deforestation_since_2022",
  # "2024_deforestation_since_2023",
  # "2024_deforestation_since_2024",
  # "2024_deforestation_total",
  # "2024_degradation_before_2000",
  # "2024_degradation_before_2015",
  # "2024_degradation_before_2022",
  # "2024_degradation_before_2023",
  # "2024_degradation_before_2024",
  # "2024_degradation_deforestation_before_2015",
  # "2024_degradation_deforestation_before_2022",
  # "2024_degradation_deforestation_before_2023",
  # "2024_degradation_deforestation_before_2024",
  # "2024_degradation_deforestation_since_2015",
  # "2024_degradation_deforestation_since_2022",
  # "2024_degradation_deforestation_since_2023",
  # "2024_degradation_deforestation_since_2024",
  # "2024_degradation_deforestation_total",
  # "2024_degradation_since_2000",
  # "2024_degradation_since_2015",
  # "2024_degradation_since_2022",
  # "2024_degradation_since_2023",
  # "2024_degradation_since_2024",
  # "2024_degradation_total",
  "2024_disturbance_road",
]

# Filter to selected disturbances, and separate prediction and uncertainty rasters.
# Rasters are collected in the order of selected_dists because the statistics
# dataframes below are indexed by selected_dists and are filled positionally, one
# value per disturbance raster. Sorting the paths instead can reorder them relative
# to that index (a name followed by '__' does not always sort where the bare name
# does), which would assign values to the wrong disturbance.
dist_rasters = []
dist_uncertainty_rasters = []
for dist in selected_dists:
  selected_dist_rasters = []
  for dist_raster in dist_raster_dirs:
    dist_name_parts = dist_raster.split('/')[-1].split('__')
    if source_dir == predictions_dir:
      if dist != dist_name_parts[1]: continue
      if 'mean__' in dist_raster: selected_dist_rasters.append(dist_raster)
      if 'uncertainty__' in dist_raster: dist_uncertainty_rasters.append(dist_raster)
    else: # If the source directory is scenarios_dir (without uncertainty values)
      if dist == dist_name_parts[0]: selected_dist_rasters.append(dist_raster)
  # Each index entry needs exactly one value per polygon; fail here with a clear
  # message rather than later with a length mismatch when the columns are built.
  if len(selected_dist_rasters) != 1:
    raise ValueError(f"Expected exactly one disturbance raster for '{dist}' "
                     f"in {dist_raster_dir}, found {len(selected_dist_rasters)}.")
  dist_rasters.extend(selected_dist_rasters)

# Create lookup dictionary for faster uncertainty matching.
# Keys are the filename of the corresponding 'mean__' raster.
uncertainty_lookup = {}
for uncertainty_raster in dist_uncertainty_rasters:
  base_name = os.path.basename(uncertainty_raster).replace('uncertainty__', 'mean__')
  uncertainty_lookup[base_name] = uncertainty_raster

# Toggle whether to predict uncertainty stats (only when uncertainty rasters were found)
generate_uncertainty_stats = bool(dist_uncertainty_rasters)

# Generate empty dataframes for statistics, indexed by disturbance
df_base = pd.DataFrame(index = selected_dists)
df_base.rename_axis('dist', inplace=True)
df_agbd_mean_mg_ha, df_agb_total_tg = df_base.copy(), df_base.copy()
# If uncertainty rasters are present, generate empty dataframes
if generate_uncertainty_stats:
  df_agbd_mean_mg_ha_ci95, df_agbd_mean_mg_ha_uncertainty, df_agb_total_tg_ci95 = df_base.copy(), df_base.copy(), df_base.copy()

# Initialise polygon area dataframe
df_polygon_area_km2 = pd.DataFrame(columns = ["Name", "Area (km^2)"])

# Path of the cell area raster used for pixel-by-pixel area calculations
cell_area_path = join(areas_dir, "cell_area.tif")

# Loop through each polygon stored in GPKG to generate statistics
for index, row in selected_sample_polygons_gpkg.iterrows():

  # Define the polygon
  sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
  polygons = [polygon for polygon in sample_polygon_geometry.geoms]

  # Calculate sample_polygon_geometry area (ellipsoidal as opposed to planimetric)
  sample_polygons_crs = selected_sample_polygons_gpkg.crs
  temp_gdf = gpd.GeoDataFrame({'name': [sample_polygon_name], 'geometry': sample_polygon_geometry}, crs=sample_polygons_crs)
  temp_gdf_utm = temp_gdf.estimate_utm_crs()
  polygon_area_ha = np.divide(temp_gdf.to_crs(temp_gdf_utm).area[0], 10000, dtype='float64')

  # Mask the cell area raster to the polygon once for efficiency
  with rasterio.open(cell_area_path) as cell_area:
    cell_area_masked, transform_1 = msk.mask(cell_area, polygons, crop=True, filled=False)

  # Calculate total area of all pixels within polygon in hectares
  pixel_area_sum_m2 = np.ma.sum(cell_area_masked, dtype='float64')
  pixel_area_sum_ha = np.divide(pixel_area_sum_m2, 10000, dtype='float64')

  # Calculate ratio between polygon and raster areas
  area_ratio = np.divide(polygon_area_ha, pixel_area_sum_ha, dtype='float64')

  # Apply area adjustment based on setting
  if adjust_polygon_to_raster:
    # Adjust polygon area to match raster area
    adjusted_polygon_area_ha = pixel_area_sum_ha
    # No adjustment needed for pixel values - keep original cell areas
    adjusted_cell_area_masked = cell_area_masked
  else:
    # Keep original polygon area and adjust individual pixel areas
    adjusted_polygon_area_ha = polygon_area_ha
    # Adjust each pixel area by the area ratio to maintain total equal to polygon area
    adjusted_cell_area_masked = np.multiply(cell_area_masked, area_ratio, dtype='float64')

  # Convert adjusted cell areas from m² to ha
  adjusted_cell_area_masked_ha = np.divide(adjusted_cell_area_masked, 10000, dtype='float64')

  # Add polygon area to dataframe
  new_row = pd.DataFrame([{'Name': sample_polygon_name, 'Area (km^2)': adjusted_polygon_area_ha / 100}], dtype=object)
  df_polygon_area_km2 = pd.concat([df_polygon_area_km2, new_row], ignore_index=True, sort=False)

  # Create empty lists for each disturbance raster statistic
  values_agbd_mean_mg_ha, values_agb_total_tg = [], []

  # If uncertainty rasters are present, create empty lists for uncertainty statistics
  if generate_uncertainty_stats:
    values_agbd_mean_mg_ha_ci95, values_agbd_mean_mg_ha_uncertainty, values_agb_total_tg_ci95 = [], [], []

  # Loop through disturbance rasters
  for dist_raster in dist_rasters:

    # Mask feature to sample_polygon_geometry
    with rasterio.open(dist_raster) as dist:
      nodatavalue = int(dist.nodatavals[0])
      dist_array_masked, transform_2 = msk.mask(dist, polygons, crop=True, filled=False)

    # Extract forest pixels (non-masked pixels in disturbance array)
    forest_pixels_mask = ~np.ma.getmaskarray(dist_array_masked)

    # Calculate forest area by summing adjusted cell areas of forest pixels
    forest_cell_areas_ha = np.ma.array(adjusted_cell_area_masked_ha.data, mask=~forest_pixels_mask)
    forest_cover_ha = np.ma.sum(forest_cell_areas_ha, dtype='float64')

    # Calculate total biomass by multiplying AGBD by pixel area for each pixel, then summing
    pixel_biomass_mg = np.multiply(dist_array_masked, forest_cell_areas_ha, dtype='float64')

    # Sum to get total biomass in Mg
    agb_total_mg = np.ma.sum(pixel_biomass_mg, dtype='float64')

    # Calculate mean AGBD (Mg/ha) by dividing total biomass by forest area
    agbd_mean_mg_ha = np.divide(agb_total_mg, forest_cover_ha, dtype='float64') if forest_cover_ha > 0 else 0

    # Convert total AGB from Mg to Tg
    agb_total_tg = np.divide(agb_total_mg, 1000000, dtype='float64')

    # Append results to statistics lists
    values_agbd_mean_mg_ha.append(agbd_mean_mg_ha)
    values_agb_total_tg.append(agb_total_tg)

    if generate_uncertainty_stats:
      # Use lookup dictionary to quickly find matching uncertainty raster
      dist_basename = os.path.basename(dist_raster)
      uncertainty_raster_present = dist_basename in uncertainty_lookup

      if not uncertainty_raster_present:
        print(f"There is no uncertainty raster for {dist_basename}")
      else:
        matching_uncertainty_raster = uncertainty_lookup[dist_basename]

        # Open and mask uncertainty raster to polygon
        with rasterio.open(matching_uncertainty_raster) as uncertainty:
          nodatavalue = int(uncertainty.nodatavals[0])
          uncertainty_array_masked, transform_2 = msk.mask(uncertainty, polygons, crop=True, filled=False)

        # Calculate uncertainty with area-weighted approach
        uncertainty_ratios = np.divide(uncertainty_array_masked, 100, dtype='float64')
        # Multiply uncertainty % by AGBD by pixel area
        pixel_ci95_mg = np.multiply(np.multiply(dist_array_masked, uncertainty_ratios, dtype='float64'),
                                  forest_cell_areas_ha, dtype='float64')

        # Sum to get total CI95 for the area (in Mg)
        agb_total_ci95_mg = np.ma.sum(pixel_ci95_mg, dtype='float64')

        # Calculate the mean CI95 as percentage of mean AGBD
        if abs(agb_total_mg) > 0:  # Use absolute value for division check
            # Uncertainty percentage is relative to absolute value
            agb_total_ci95_percent = np.divide(abs(agb_total_ci95_mg), abs(agb_total_mg), dtype='float64') * 100
            # Preserve sign of original measurement
            sign = np.sign(agb_total_mg)
            agbd_mean_mg_ha_ci95 = sign * np.multiply(abs(agbd_mean_mg_ha), np.divide(agb_total_ci95_percent, 100, dtype='float64'), dtype='float64')
        else:
            agb_total_ci95_percent = 0
            agbd_mean_mg_ha_ci95 = 0

        # Store uncertainty as percentage
        agbd_mean_mg_ha_uncertainty = agb_total_ci95_percent

        # Calculate total AGB CI95 in Tg - preserve sign
        agb_total_tg_ci95 = np.divide(agb_total_ci95_mg, 1000000, dtype='float64')

        # Append results to statistics list
        values_agbd_mean_mg_ha_ci95.append(agbd_mean_mg_ha_ci95)
        values_agbd_mean_mg_ha_uncertainty.append(agbd_mean_mg_ha_uncertainty)
        values_agb_total_tg_ci95.append(agb_total_tg_ci95)

  # Concatenate new columns to the main DataFrames for each statistic
  df_agbd_mean_mg_ha = pd.concat([df_agbd_mean_mg_ha, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha}, index=df_agbd_mean_mg_ha.index)], axis=1)
  df_agb_total_tg = pd.concat([df_agb_total_tg, pd.DataFrame({sample_polygon_name: values_agb_total_tg}, index=df_agb_total_tg.index)], axis=1)

  if generate_uncertainty_stats:
      df_agbd_mean_mg_ha_ci95 = pd.concat([df_agbd_mean_mg_ha_ci95, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_ci95}, index=df_agbd_mean_mg_ha_ci95.index)], axis=1)
      df_agbd_mean_mg_ha_uncertainty = pd.concat([df_agbd_mean_mg_ha_uncertainty, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_uncertainty}, index=df_agbd_mean_mg_ha_uncertainty.index)], axis=1)
      df_agb_total_tg_ci95 = pd.concat([df_agb_total_tg_ci95, pd.DataFrame({sample_polygon_name: values_agb_total_tg_ci95}, index=df_agb_total_tg_ci95.index)], axis=1)

# Collect the statistic DataFrames in the order their columns are written out
df_stats_list = [df_agbd_mean_mg_ha, df_agb_total_tg]
if generate_uncertainty_stats:
  df_stats_list = [
    df_agbd_mean_mg_ha,
    df_agbd_mean_mg_ha_ci95,
    df_agbd_mean_mg_ha_uncertainty,
    df_agb_total_tg,
    df_agb_total_tg_ci95,
  ]

# Index the polygon area (km2) DataFrame by polygon 'Name'
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Summary stats: transpose the total AGB table and suffix its columns with the statistic label
df_agb_total_tg_t = df_agb_total_tg.transpose().rename_axis("Name", axis=1).add_suffix(" forest AGB (Tg)")
summary_components = [df_polygon_area_km2, df_agb_total_tg_t]

# The CI95 table is only available (and only added) when uncertainty stats were generated
if generate_uncertainty_stats:
  df_agb_total_tg_ci95_t = df_agb_total_tg_ci95.transpose().rename_axis("Name", axis=1).add_suffix(" forest AGB CI95 (Tg)")
  summary_components.append(df_agb_total_tg_ci95_t)

# Join all components column-wise in a single concatenation and save
summary_stats = pd.concat(summary_components, axis=1)
summary_stats = summary_stats.rename_axis("Name", axis=1)
summary_stats.to_csv(join(sample_polygons_statistics_dir, 'summary_dist_stats.csv'))

# Generate detailed stats by polygon
# Each statistic DataFrame is paired explicitly with its output column label.
# Identifying a DataFrame by comparing its values (DataFrame.equals) mislabels
# columns whenever two statistic tables hold identical values, e.g. two all-zero
# CI95 tables, so the pairing is done by name instead.
if generate_uncertainty_stats:
  detailed_stat_columns = [
    ("Forest AGBD mean (Mg / ha)", df_agbd_mean_mg_ha),
    ("Forest AGBD CI95 (Mg / ha)", df_agbd_mean_mg_ha_ci95),
    ("Forest AGBD uncertainty (%)", df_agbd_mean_mg_ha_uncertainty),
    ("Forest AGB total (Tg)", df_agb_total_tg),
    ("Forest AGB total CI95 (Tg)", df_agb_total_tg_ci95),
  ]
else:
  detailed_stat_columns = [
    ("Forest AGBD mean (Mg / ha)", df_agbd_mean_mg_ha),
    ("Forest AGB total (Tg)", df_agb_total_tg),
  ]

# One CSV per polygon: the columns of the mean AGBD table are the polygon names
for polygon_area in df_agbd_mean_mg_ha:
  polygon_area_km2 = df_polygon_area_km2.loc[polygon_area]["Area (km^2)"]
  df_detailed_dist_stats = df_base.copy()
  df_detailed_dist_stats["Area (km^2)"] = polygon_area_km2
  for stat_col, df_stats in detailed_stat_columns:
    # Only add a statistic if the table has a column for this polygon
    if polygon_area in df_stats.columns:
      stat_series = df_stats[polygon_area].rename(stat_col)
      df_detailed_dist_stats = pd.concat([df_detailed_dist_stats, stat_series], axis=1)
  df_detailed_dist_stats.to_csv(join(detailed_dist_stats_by_area_dir, f'{polygon_area}.csv'))

# Generate detailed stats by disturbance type
# Rows are collected per disturbance across all per-polygon CSVs, then
# concatenated once per disturbance instead of re-concatenating on every file.
dist_frames = {}
for stats_csv in os.listdir(detailed_dist_stats_by_area_dir):
    # Only per-polygon CSVs are processed; other directory entries are skipped
    polygon_name, extension = os.path.splitext(stats_csv)
    if extension != '.csv':
        continue
    stats_csv_path = join(detailed_dist_stats_by_area_dir, stats_csv)
    stats_csv_df = pd.read_csv(stats_csv_path)
    # Split this polygon's rows by disturbance type, replacing 'dist' with the polygon 'Name'
    for dist in stats_csv_df['dist'].unique():
        dist_df = stats_csv_df[stats_csv_df['dist'] == dist].drop(columns='dist')
        dist_df.insert(0, 'Name', polygon_name)
        dist_frames.setdefault(dist, []).append(dist_df)

# One DataFrame per disturbance type, one row block per polygon
dists = {dist: pd.concat(frames, ignore_index=True) for dist, frames in dist_frames.items()}

# Write all disturbance CSVs at once
for dist, dist_df in dists.items():
    output_file_path = join(detailed_dist_stats_by_scenario_dir, f'{dist}.csv')
    dist_df.to_csv(output_file_path, index=False)

# Intactness statistics

In [None]:
# Create list of available intactness rasters
# The intactness directory depends on which source directory is in use. An
# unexpected source_dir raises instead of silently reusing an intactness_dir
# left over from an earlier run of this cell.
if source_dir == scenarios_dir:
    intactness_dir = join(selected_model_dir, prediction_area, 'intactness')
elif source_dir == predictions_dir:
    intactness_dir = join(selected_model_dir, f'scenarios_{prediction_area}', 'intactness')
else:
    raise ValueError(f"source_dir must be scenarios_dir or predictions_dir, got '{source_dir}'.")

# NOTE(review): this list is never filled in this cell; it is kept in case a later cell reads it
percentage_change_rasters = []

# Walk the intactness directory and record each intactness .tif relative to it
intactness_rasters = []
for root, dirs, files in os.walk(intactness_dir):
    for file in files:
        if "intactness__" in file and file.endswith('.tif'):
            relative_path = os.path.relpath(join(root, file), intactness_dir)
            intactness_rasters.append(relative_path)

# Sort so the printed selection list is stable between runs
intactness_rasters.sort()

# Select intactness rasters to calculate statistics
print("# Select intactness raster to calculate statistics")
print("intactness_rasters = [")
for raster in intactness_rasters:
    print(f"'{raster}',")
print("]")

In [None]:
# Select intactness raster to calculate statistics
# Each entry is a path relative to intactness_dir (it is joined to intactness_dir
# when the rasters are paired with their percentage change rasters).
# The leading directory is '<baseline>__<disturbance>' and the filename fields
# are separated by '__': intactness, polygon quantiles, baseline, disturbance, model.
intactness_rasters = [
'all_oldgrowth__2024_degradation_deforestation_total/intactness__asartr_phase_2_10_quantiles__all_oldgrowth__2024_degradation_deforestation_total__agbd_historic_250429_223033.tif',
'all_oldgrowth__2024_degradation_deforestation_total/intactness__project_area_10_quantiles__all_oldgrowth__2024_degradation_deforestation_total__agbd_historic_250429_223033.tif',
'all_oldgrowth__2024_degradation_deforestation_total/intactness__intactness_wo_tn_10_quantiles__all_oldgrowth__2024_degradation_deforestation_total__agbd_historic_250429_223033.tif',
'all_oldgrowth__2024_degradation_deforestation_total/intactness__10_quantiles__all_oldgrowth__2024_degradation_deforestation_total__agbd_historic_250429_223033.tif',
'2024_no_degradation_since_2000__2024_degradation_since_2000/intactness__asartr_phase_2_10_quantiles__2024_no_degradation_since_2000__2024_degradation_since_2000__agbd_historic_250429_223033.tif',
'2024_no_degradation_since_2000__2024_degradation_since_2000/intactness__project_area_10_quantiles__2024_no_degradation_since_2000__2024_degradation_since_2000__agbd_historic_250429_223033.tif',
'2024_no_degradation_since_2000__2024_degradation_since_2000/intactness__intactness_wo_tn_10_quantiles__2024_no_degradation_since_2000__2024_degradation_since_2000__agbd_historic_250429_223033.tif',
'2024_no_degradation_since_2000__2024_degradation_since_2000/intactness__10_quantiles__2024_no_degradation_since_2000__2024_degradation_since_2000__agbd_historic_250429_223033.tif',
]

In [None]:
# Paths to the cell area raster (for pixel-by-pixel area calculations) and the all-land mask
# The 'mask_forest_all_oldgrowth.tif' must be present in the scenario mask dir
all_land_mask_path = join(mask_dir, "mask_forest_all_oldgrowth.tif")
cell_area_path = join(areas_dir, "cell_area.tif")

# Map each selected intactness raster path to its percentage change raster path
intactness_percentage_raster_paths = {}
for intactness_raster in intactness_rasters:
    # The leading directory of the relative path names the baseline / disturbance pair
    intactness_baseline_dist_dir = intactness_raster.partition('/')[0]
    percentage_change_filename = "percentage_change__{}__{}.tif".format(intactness_baseline_dist_dir, selected_model)
    percentage_change_path = join(intactness_dir, intactness_baseline_dist_dir, percentage_change_filename)
    intactness_raster_path = join(intactness_dir, intactness_raster)
    intactness_percentage_raster_paths[intactness_raster_path] = percentage_change_path

# Function to calculate area-weighted statistics
def weighted_stats(values, weights):
    """Return the weighted mean and weighted (population) standard deviation.

    Args:
        values: Array-like of sample values (NumPy array, masked array or list).
        weights: Array-like of weights with the same shape as `values`.

    Returns:
        (weighted_mean, weighted_std). Both are None when `values` is empty,
        and both are 0 when the weights do not sum to a positive number.
    """
    # Cast to float64 up front so lists are accepted and float32 products
    # are not rounded before being summed. asanyarray keeps masked arrays.
    values = np.asanyarray(values, dtype='float64')
    weights = np.asanyarray(weights, dtype='float64')

    # Handle empty arrays
    if values.size == 0:
        return None, None

    # Without positive total weight neither statistic is defined; report zeros
    sum_of_weights = np.sum(weights, dtype='float64')
    if not sum_of_weights > 0:
        return 0, 0

    # Calculate weighted mean
    weighted_mean = np.sum(values * weights, dtype='float64') / sum_of_weights

    # Calculate weighted standard deviation (weights normalised by their sum)
    squared_deviation = np.square(values - weighted_mean)
    variance = np.sum(weights * squared_deviation, dtype='float64') / sum_of_weights
    weighted_std = np.sqrt(variance)

    return weighted_mean, weighted_std

# Loop through each intactness raster, then each polygon stored in GPKG, to generate statistics
for intactness_raster, percentage_raster in intactness_percentage_raster_paths.items():

    # The raster filename fields are separated by '__':
    # intactness, polygon quantiles, baseline, disturbance, ...
    raster_name_fields = intactness_raster.split('/')[-1].split('__')
    polygon_quantiles = raster_name_fields[1]
    baseline = raster_name_fields[2]
    disturbance = raster_name_fields[3]
    current_year = disturbance.split('_')[0]
    mask_path = join(mask_dir, f"mask_forest_{current_year}.tif")
    intactness_csv_name = f"{polygon_quantiles}__{baseline}__{disturbance}.csv"
    intactness_csv_path = join(intactness_stats_dir, intactness_csv_name)

    total_score = int(polygon_quantiles.split('_')[-2])  # Extracts the quantiles used for total score
    total_stdev = int(total_score / 2)

    # Statistic columns written for every polygon of this raster
    intactness_stat_columns = [
        "Percentage change (remaining forest) mean",
        "Percentage change (remaining forest) stdev",
        "Percentage change (non-forest = -100) mean",
        "Percentage change (non-forest = -100) stdev",
        f"Intactness (remaining forest) mean / {total_score}",
        f"Intactness (remaining forest) stdev / {total_stdev}",
        f"Intactness (non-forest = 0) mean / {total_score}",
        f"Intactness (non-forest = 0) stdev / {total_stdev}",
    ]

    # Rows are collected in a list and turned into a DataFrame once per raster
    intactness_rows = []

    for index, row in selected_sample_polygons_gpkg.iterrows():

        # Define the polygon
        sample_polygon_geometry = row["geometry"]
        sample_polygon_name = row["name"]
        polygons = [polygon for polygon in sample_polygon_geometry.geoms]

        # Row used when the polygon has nothing to report: every statistic is None
        empty_row = {'Name': sample_polygon_name, **dict.fromkeys(intactness_stat_columns)}

        # Calculate the polygon area in hectares after projecting to an estimated UTM CRS
        sample_polygons_crs = selected_sample_polygons_gpkg.crs
        temp_gdf = gpd.GeoDataFrame({'name': [sample_polygon_name], 'geometry': sample_polygon_geometry}, crs=sample_polygons_crs)
        temp_gdf_utm = temp_gdf.estimate_utm_crs()
        polygon_area_ha = np.divide(temp_gdf.to_crs(temp_gdf_utm).area[0], 10000, dtype='float64')

        # Read & mask intactness to the polygon once; the same masked array is used
        # for the "any valid data" check and for the statistics further down
        with rasterio.open(intactness_raster) as src:
            nodata_value = src.nodata
            raw_int_masked, _ = msk.mask(src, polygons, crop=True, filled=False)

        # No unmasked intactness pixel in this polygon - record an empty row and move on
        if np.ma.count(raw_int_masked) == 0:
            intactness_rows.append(empty_row)
            continue

        # Mask the cell area raster to the polygon
        with rasterio.open(cell_area_path) as cell_area:
            cell_area_masked, _ = msk.mask(cell_area, polygons, crop=True, filled=False)

        # Calculate total area of all pixels within polygon in hectares
        pixel_area_sum_m2 = np.ma.sum(cell_area_masked, dtype='float64')
        pixel_area_sum_ha = np.divide(pixel_area_sum_m2, 10000, dtype='float64')

        # Calculate ratio between polygon and raster areas
        area_ratio = np.divide(polygon_area_ha, pixel_area_sum_ha, dtype='float64')

        # Apply area adjustment based on setting
        if adjust_polygon_to_raster:
            # No adjustment needed for pixel values - keep original cell areas
            adjusted_cell_area_masked = cell_area_masked
        else:
            # Adjust each pixel area by the area ratio to maintain total equal to polygon area
            adjusted_cell_area_masked = np.multiply(cell_area_masked, area_ratio, dtype='float64')

        # Convert adjusted cell areas from m² to ha for easier calculations
        adjusted_cell_area_masked_ha = np.divide(adjusted_cell_area_masked, 10000, dtype='float64')

        # Get land mask from mask directory (1 = land, NoData = non-land)
        with rasterio.open(all_land_mask_path) as all_land_mask_raster:
            all_land_mask_data, _ = msk.mask(all_land_mask_raster, polygons, crop=True, filled=False)

        # Create land mask where True = land
        all_land_mask = ~np.ma.getmaskarray(all_land_mask_data) & (all_land_mask_data == 1)

        # Get the area of all land pixels
        all_land_area_ha = np.ma.array(adjusted_cell_area_masked_ha.data, mask=~all_land_mask)
        all_land_area_ha_sum = np.ma.sum(all_land_area_ha, dtype='float64')

        # No land in this polygon - record an empty row and move on
        if all_land_area_ha_sum <= 0:
            intactness_rows.append(empty_row)
            continue

        # Get forest mask for current year from mask raster
        with rasterio.open(mask_path) as forest_mask_raster:
            forest_mask_data, _ = msk.mask(forest_mask_raster, polygons, crop=True, filled=False)

        # Create forest mask where True = forest
        forest_mask = ~np.ma.getmaskarray(forest_mask_data) & (forest_mask_data == 1)

        # Extract forest areas using the forest mask
        forest_cell_areas_ha = np.ma.array(adjusted_cell_area_masked_ha.data, mask=~forest_mask)
        forest_area_ha_sum = np.ma.sum(forest_cell_areas_ha, dtype='float64')

        # Non-forest area (land that is not forest)
        non_forest_area_ha = all_land_area_ha_sum - forest_area_ha_sum

        # Mask percentage change raster to the polygon
        with rasterio.open(percentage_raster) as percent_change:
            percent_change_masked, _ = msk.mask(percent_change, polygons, crop=True, filled=False)

        # Forest-only percentage change: hide non-forest pixels AND pixels the raster
        # itself masks (outside polygon / nodata), so nodata values never enter the
        # statistics. The same mask is applied to the weights so both stay aligned.
        # NOTE(review): forest pixels without raster data still count towards the
        # total land area in the all-land statistics, as for intactness below.
        percent_invalid_mask = (~forest_mask) | np.ma.getmaskarray(percent_change_masked)
        percent_change_forest_only = np.ma.array(percent_change_masked.data, mask=percent_invalid_mask)
        percent_forest_cell_areas_ha = np.ma.array(adjusted_cell_area_masked_ha.data, mask=percent_invalid_mask)

        if forest_area_ha_sum > 0:
            # Extract percentage change values and area weights for valid forest pixels
            forest_percent_values = np.ma.compressed(percent_change_forest_only)
            forest_percent_weights = np.ma.compressed(percent_forest_cell_areas_ha)

            # Calculate area-weighted percentage change statistics for remaining forest
            percent_change_forest_mean, percent_change_forest_std = weighted_stats(
                forest_percent_values, forest_percent_weights
            )

            # For all land including non-forest (treated as -100% change)
            if non_forest_area_ha > 0:
                # Calculate the weighted mean directly
                all_mean_numerator = np.sum(forest_percent_values * forest_percent_weights)
                all_mean_denominator = forest_area_ha_sum + non_forest_area_ha
                all_mean_numerator += non_forest_area_ha * (-100.0)
                percent_change_all_mean = all_mean_numerator / all_mean_denominator

                # Calculate the weighted variance directly
                forest_variance_contribution = np.sum(
                    forest_percent_weights * np.square(forest_percent_values - percent_change_all_mean)
                )
                non_forest_variance_contribution = non_forest_area_ha * np.square((-100.0) - percent_change_all_mean)
                all_variance = (forest_variance_contribution + non_forest_variance_contribution) / all_mean_denominator
                percent_change_all_std = np.sqrt(all_variance)
            else:
                # If no non-forest area, all-land stats are the same as forest stats
                percent_change_all_mean = percent_change_forest_mean
                percent_change_all_std = percent_change_forest_std
        else:
            # If no forest, set forest stats to None and all-land stats to -100% change
            percent_change_forest_mean = percent_change_forest_std = None
            percent_change_all_mean = -100.0
            percent_change_all_std = 0.0

        # Mask any pixel == nodata_value on top of the mask returned for the polygon
        if nodata_value is not None:
            nodata_mask = (raw_int_masked.data == nodata_value)
            combined_mask = raw_int_masked.mask | nodata_mask
            intactness_masked = np.ma.array(raw_int_masked.data, mask=combined_mask)
        else:
            intactness_masked = raw_int_masked

        # Compute intactness stats
        if forest_area_ha_sum > 0:
            # Build a single boolean array for pixels that are valid forest & not nodata
            valid_forest_mask = (~intactness_masked.mask) & forest_mask

            # Remaining-forest intactness
            forest_intact_vals = intactness_masked.data[valid_forest_mask]
            forest_intact_weights = adjusted_cell_area_masked_ha.data[valid_forest_mask]
            intactness_remaining_mean, intactness_remaining_std = weighted_stats(
                forest_intact_vals, forest_intact_weights
            )

            # All-land intactness (non-forest = 0)
            total_land = forest_area_ha_sum + non_forest_area_ha
            num = np.sum(forest_intact_vals * forest_intact_weights)
            den = total_land
            intactness_all_mean = num / den

            # Variance: forest + non-forest contributions
            var_forest = np.sum(forest_intact_weights * (forest_intact_vals - intactness_all_mean) ** 2)
            var_nonforest = non_forest_area_ha * (0 - intactness_all_mean) ** 2
            intactness_all_std = np.sqrt((var_forest + var_nonforest) / den)
        else:
            # No forest present
            intactness_remaining_mean = intactness_remaining_std = None
            intactness_all_mean = 0.0
            intactness_all_std = 0.0

        # Record this polygon's statistics
        intactness_rows.append({
            'Name': sample_polygon_name,
            'Percentage change (remaining forest) mean': percent_change_forest_mean,
            'Percentage change (remaining forest) stdev': percent_change_forest_std,
            'Percentage change (non-forest = -100) mean': percent_change_all_mean,
            'Percentage change (non-forest = -100) stdev': percent_change_all_std,
            f'Intactness (remaining forest) mean / {total_score}': intactness_remaining_mean,
            f'Intactness (remaining forest) stdev / {total_stdev}': intactness_remaining_std,
            f'Intactness (non-forest = 0) mean / {total_score}': intactness_all_mean,
            f'Intactness (non-forest = 0) stdev / {total_stdev}': intactness_all_std,
        })

    # Build the DataFrame for this raster (object dtype keeps None and numbers as given),
    # set index to Name and save to CSV
    df_intactness_stats = pd.DataFrame(
        intactness_rows, columns=["Name"] + intactness_stat_columns, dtype=object
    )
    df_intactness_stats = df_intactness_stats.set_index('Name')
    df_intactness_stats.to_csv(intactness_csv_path)
    print(f"Saved statistics to {intactness_csv_path}")

# Report statistics

In [None]:
# Define disturbances for report
# Names (without extension) of the per-disturbance CSVs, leaving out any 'change' files
csv_files = [
    name[:-len('.csv')]
    for name in os.listdir(detailed_dist_stats_by_scenario_dir)
    if name.endswith('.csv') and 'change' not in name
]

def get_disturbance_type(filename):
    """Return the sort rank of the disturbance named in `filename`.

    1 = degradation (default), 2 = deforestation, 3 = degradation and deforestation.
    """
    # The combined name is tested first because it also contains 'deforestation'
    ranked_markers = (
        (3, 'degradation_deforestation'),  # comes third
        (2, 'deforestation'),  # comes second
    )
    for rank, marker in ranked_markers:
        if marker in filename:
            return rank
    return 1  # comes first (degradation)

# Group files by (year, disturbance type) so they can be ordered by both
files_by_category = {}
for file in csv_files:
    category = (file.split('_')[0], get_disturbance_type(file))
    files_by_category.setdefault(category, []).append(file)

# Walk the categories by numeric year, then disturbance type
sorted_files = []
for category in sorted(files_by_category, key=lambda k: (int(k[0]), k[1])):
    files = files_by_category[category]

    # 'total' files lead each category
    sorted_files += [f for f in files if '_total' in f]

    # The remaining files are grouped by reference year (the last '_' field)
    ref_year_files = {}
    for file in files:
        if '_total' not in file:
            ref_year_files.setdefault(file.split('_')[-1], []).append(file)

    # Most recent reference year first; within a year 'since' precedes 'before'
    for ref_year in sorted(ref_year_files, key=int, reverse=True):
        year_files = ref_year_files[ref_year]
        sorted_files += [f for f in year_files if '_since_' in f]
        sorted_files += [f for f in year_files if '_before_' in f]

# Print the ordered list as a ready-to-paste Python literal
print("disturbance_list = [")
for file in sorted_files:
    print(f"    '{file}',")
print("]")

In [None]:
# Define disturbances for report
# Keep the per-disturbance CSVs (not the 'change' files) and strip the extension
report_csv_names = (
    entry for entry in os.listdir(detailed_dist_stats_by_scenario_dir)
    if 'change' not in entry and entry.endswith('.csv')
)
csv_files = [entry[:-4] for entry in report_csv_names]

def get_disturbance_type(filename):
    """Rank a disturbance filename: 1 degradation, 2 deforestation, 3 both combined."""
    # Anything without 'deforestation' in its name is treated as degradation
    if 'deforestation' not in filename:
        return 1  # comes first (degradation)
    # 'degradation_deforestation' marks the combined disturbance, which sorts last
    return 3 if 'degradation_deforestation' in filename else 2

# Group files by (year, disturbance type) so they can be ordered by both
files_by_category = {}
for file in csv_files:
    category = (file.split('_')[0], get_disturbance_type(file))
    files_by_category.setdefault(category, []).append(file)
print("disturbance_list = [")
# NOTE(review): current_year is reset here but not read again in this cell
current_year = None

# Walk the categories by numeric year, then disturbance type
for category in sorted(files_by_category, key=lambda k: (int(k[0]), k[1])):
    files = files_by_category[category]
    # 'total' files lead each category
    for file in files:
        if '_total' in file:
            print(f"    '{file}',")
    # The remaining files are grouped by reference year (the last '_' field)
    ref_year_files = {}
    for file in files:
        if '_total' not in file:
            ref_year_files.setdefault(file.split('_')[-1], []).append(file)
    # Most recent reference year first: 'since' entries are printed as-is,
    # 'before' entries are printed commented out
    for ref_year in sorted(ref_year_files, key=int, reverse=True):
        year_files = ref_year_files[ref_year]
        for marker, line_prefix in (('_since_', "    "), ('_before_', "    # ")):
            for file in year_files:
                if marker in file:
                    print(f"{line_prefix}'{file}',")

print("]")

In [None]:
# Scenarios included in the report: uncomment an entry to include it.
# Each name selects the '<scenario> forest AGB (Tg)' column of the summary stats
# and the '<scenario>.csv' file of detailed stats. The scenario whose name
# contains 'all' is used as the all-land scenario.
scenario_list = [
# '1990_oldgrowth',
# '2014',
# '2014_no_degradation_since_1991',
# '2014_oldgrowth',
# '2015',
# '2016',
# '2017',
# '2018',
# '2019',
# '2020',
# '2021',
# '2021_oldgrowth',
'2022',
# '2022_alternate_degradation_2021',
# '2022_no_degradation_since_2022',
# '2022_oldgrowth',
# '2023',
# '2023_alternate_degradation_2022',
# '2023_no_degradation_since_2023',
# '2023_oldgrowth',
# '2024',
# '2024_alternate_degradation_2014',
# '2024_alternate_degradation_2021',
# '2024_alternate_degradation_2022',
# '2024_alternate_degradation_2023',
# '2024_no_degradation_since_2015',
# '2024_no_degradation_since_2022',
# '2024_no_degradation_since_2023',
# '2024_no_degradation_since_2024',
# '2024_oldgrowth',
'all_oldgrowth',
]

# Disturbances included in the report: uncomment an entry to include it.
# Each name selects the '<disturbance> forest AGB (Tg)' column of the summary
# disturbance stats and the '<disturbance>.csv' file of detailed disturbance stats.
disturbance_list = [
    # '2014_degradation_total',
    # '2014_degradation_since_1991',
    # # '2014_degradation_before_1991',
    # '2014_deforestation_total',
    # '2014_degradation_deforestation_total',
    # '2021_degradation_total',
    # '2021_deforestation_total',
    # '2021_degradation_deforestation_total',
    '2022_degradation_total',
    '2022_degradation_since_2022',
    # '2022_degradation_before_2022',
    '2022_deforestation_total',
    '2022_deforestation_since_2022',
    # '2022_deforestation_before_2022',
    '2022_degradation_deforestation_total',
    '2022_degradation_deforestation_since_2022',
    '2022_degradation_deforestation_before_2022',
    # '2023_degradation_total',
    # '2023_degradation_since_2023',
    # # '2023_degradation_before_2023',
    # '2023_deforestation_total',
    # '2023_deforestation_since_2023',
    # # '2023_deforestation_before_2023',
    # '2023_degradation_deforestation_total',
    # '2023_degradation_deforestation_since_2023',
    # '2023_degradation_deforestation_before_2023',
    # '2024_degradation_total',
    # '2024_degradation_since_2024',
    # # '2024_degradation_before_2024',
    # '2024_degradation_since_2023',
    # # '2024_degradation_before_2023',
    # '2024_degradation_since_2022',
    # # '2024_degradation_before_2022',
    # '2024_degradation_since_2015',
    # # '2024_degradation_before_2015',
    # '2024_deforestation_total',
    # '2024_deforestation_since_2024',
    # # '2024_deforestation_before_2024',
    # '2024_deforestation_since_2023',
    # # '2024_deforestation_before_2023',
    # '2024_deforestation_since_2022',
    # # '2024_deforestation_before_2022',
    # '2024_deforestation_since_2015',
    # # '2024_deforestation_before_2015',
    # '2024_degradation_deforestation_total',
    # '2024_degradation_deforestation_since_2024',
    # # '2024_degradation_deforestation_before_2024',
    # '2024_degradation_deforestation_since_2023',
    # # '2024_degradation_deforestation_before_2023',
    # '2024_degradation_deforestation_since_2022',
    # # '2024_degradation_deforestation_before_2022',
    # '2024_degradation_deforestation_since_2015',
    # # '2024_degradation_deforestation_before_2015',
]

# Year used to select the forest cover column and to prefix the report CSV filenames
report_year = '2022'

# Identify the all-land scenario: the last entry of scenario_list whose name contains 'all'
all_land_scenario = None
for scenario in scenario_list:
  if 'all' in scenario:
    all_land_scenario = scenario
if all_land_scenario is None: print("No all land scenario exists in the detailed stats.")

# CI95 columns are only reported when the source is the predictions directory
report_uncertainty = source_dir == predictions_dir

def _column_for_names(stats_df, column, names):
  """Return `column` of `stats_df` ordered to match the polygon `names`.

  Rows are matched on the 'Name' column when it is present, unique and holds
  exactly the same polygons as `names`; the detailed CSVs are not guaranteed to
  list polygons in the same order as the summary stats. Otherwise the column is
  returned unchanged and is matched by row position.
  """
  if 'Name' in stats_df.columns:
    stats_names = stats_df['Name']
    if not stats_names.duplicated().any() and set(stats_names) == set(names):
      return stats_df.set_index('Name')[column].reindex(list(names)).to_numpy()
  return stats_df[column]

# Read summary stats
summary_stats_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_stats.csv'))
summary_dist_stats_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_dist_stats.csv'))
polygon_names = summary_stats_df['Unnamed: 0']

# Create attributes CSV
attributes = pd.DataFrame()
attributes['Name'] = polygon_names
attributes['Area (km^2)'] = summary_stats_df['Area (km^2)']
attributes[f'{report_year} forest cover (ha)'] = summary_stats_df[f'{report_year} forest cover (ha)']
# Without an all-land scenario there is no column to copy, so it is left out
if all_land_scenario is not None:
  attributes[f'{all_land_scenario} forest cover (ha)'] = summary_stats_df[f'{all_land_scenario} forest cover (ha)']
attributes.to_csv(join(report_statistics_dir, f'{report_year}_attributes.csv'), index=False)

# Create scenarios total AGB CSV (all AGB columns first, then all CI95 columns)
scenarios_total_agb = pd.DataFrame()
scenarios_total_agb['Name'] = polygon_names
for scenario in scenario_list:
  scenarios_total_agb[f'{scenario} forest AGB (Tg)'] = summary_stats_df[f'{scenario} forest AGB (Tg)']
if report_uncertainty:
  for scenario in scenario_list:
    scenarios_total_agb[f'{scenario} forest AGB CI95 (Tg)'] = summary_stats_df[f'{scenario} forest AGB CI95 (Tg)']
scenarios_total_agb.to_csv(join(report_statistics_dir, f'{report_year}_scenarios_total_agb.csv'), index=False)

# Create scenarios AGBD CSV, reading each scenario's detailed stats once
scenario_detailed_stats = {
  scenario: pd.read_csv(join(detailed_stats_by_scenario_dir, f'{scenario}.csv'))
  for scenario in scenario_list
}
scenarios_agbd = pd.DataFrame()
scenarios_agbd['Name'] = polygon_names
for scenario in scenario_list:
  scenarios_agbd[f'{scenario} forest AGBD (Mg / ha)'] = _column_for_names(
    scenario_detailed_stats[scenario], 'Forest AGBD mean (Mg / ha)', polygon_names)
if report_uncertainty:
  for scenario in scenario_list:
    scenarios_agbd[f'{scenario} forest AGBD CI95 (Mg / ha)'] = _column_for_names(
      scenario_detailed_stats[scenario], 'Forest AGBD CI95 (Mg / ha)', polygon_names)
scenarios_agbd.to_csv(join(report_statistics_dir, f'{report_year}_scenarios_agbd.csv'), index=False)

# Create disturbance total AGB CSV (all AGB columns first, then all CI95 columns)
disturbance_total_agb = pd.DataFrame()
disturbance_total_agb['Name'] = summary_dist_stats_df['Unnamed: 0']
for disturbance in disturbance_list:
  disturbance_total_agb[f'{disturbance} forest AGB (Tg)'] = summary_dist_stats_df[f'{disturbance} forest AGB (Tg)']
if report_uncertainty:
  for disturbance in disturbance_list:
    disturbance_total_agb[f'{disturbance} forest AGB CI95 (Tg)'] = summary_dist_stats_df[f'{disturbance} forest AGB CI95 (Tg)']
disturbance_total_agb.to_csv(join(report_statistics_dir, f'{report_year}_disturbance_total_agb.csv'), index=False)

# Create disturbance AGBD CSV, reading each disturbance's detailed stats once
disturbance_detailed_stats = {
  disturbance: pd.read_csv(join(detailed_dist_stats_by_scenario_dir, f'{disturbance}.csv'))
  for disturbance in disturbance_list
}
disturbance_agbd = pd.DataFrame()
disturbance_agbd['Name'] = polygon_names
for disturbance in disturbance_list:
  disturbance_agbd[f'{disturbance} forest AGBD (Mg / ha)'] = _column_for_names(
    disturbance_detailed_stats[disturbance], 'Forest AGBD mean (Mg / ha)', polygon_names)
if report_uncertainty:
  for disturbance in disturbance_list:
    disturbance_agbd[f'{disturbance} forest AGBD CI95 (Mg / ha)'] = _column_for_names(
      disturbance_detailed_stats[disturbance], 'Forest AGBD CI95 (Mg / ha)', polygon_names)
disturbance_agbd.to_csv(join(report_statistics_dir, f'{report_year}_disturbance_agbd.csv'), index=False)

# Sankey plots

In [None]:
# Define and create directories
sankey_labelled = join(sample_polygons_statistics_dir, 'sankey_labelled')
sankey_unlabelled = join(sample_polygons_statistics_dir, 'sankey_unlabelled')
sankey_labelled_svg = join(sample_polygons_statistics_dir, 'sankey_labelled_svg')
sankey_unlabelled_svg = join(sample_polygons_statistics_dir, 'sankey_unlabelled_svg')

# Named sankey_dir (not dir) so the builtin dir() is not shadowed for the rest of the notebook
for sankey_dir in (sankey_labelled, sankey_unlabelled, sankey_labelled_svg, sankey_unlabelled_svg):
    makedirs(sankey_dir, exist_ok=True)

# Load the CSV files
summary_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_stats.csv'))
summary_dist_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_dist_stats.csv'))

# Check that all rows in both .csv files have the same strings (polygon areas) in column A
polygon_areas_stats = summary_stats.iloc[:, 0]
polygon_areas_dist_stats = summary_dist_stats.iloc[:, 0]

# Lists are compared rather than Series: an element-wise Series comparison raises a
# ValueError when the row counts differ, which would hide the message below.
assert polygon_areas_stats.tolist() == polygon_areas_dist_stats.tolist(), "Polygon areas do not match between the two CSV files."

# Print a heading followed by the column names, breaking the line after every fifth name
def _print_column_names(heading, column_names, per_line=5):
    print(heading)
    for position, column_name in enumerate(column_names, start=1):
        print(column_name, end=' ')
        if position % per_line == 0:
            print()

# Print columns of summary_stats (the first column holds the polygon names, so it is skipped)
_print_column_names("Columns in summary_stats:", summary_stats.columns[1:])

print("\n")

# Print columns of summary_dist_stats (the first column holds the polygon names, so it is skipped)
_print_column_names("Columns in summary_dist_stats:", summary_dist_stats.columns[1:])

In [None]:
# Ribbon granularity. The two finer splits require separate_disturbance (checked at the end of this block).
separate_disturbance = True    # Degradation and deforestation as separate ribbons
separate_degradation = True    # Degradation split into before / since a date
separate_deforestation = True  # Deforestation split into before / since a date

# Export scaling
dpi = 300              # Default is 96; the output image scales accordingly
width_modifier = 0.85  # Relative width ratio, e.g. 0.5 or 2

# Visibility toggles: title (polygon area), density subtitles, node labels, left axis label
show_title = True
show_density = True
show_labels = True
left_axis_label = True
svg_transparent_background = True

# Font sizes
title_font_size = 20
density_font_size = 17
label_font_size = 17

# Font weights (800 ~ bold, 400 ~ normal)
title_font_weight = 600
density_font_weight = 600
label_font_weight = 600

# Base columns and year (summary_stats)
_agb_suffix = 'forest AGB (Tg)'
old_growth_agb_column = f'all_oldgrowth {_agb_suffix}'
current_agb_column = f'2024 {_agb_suffix}'
current_year = current_agb_column.partition(' ')[0]  # Usually first word of current_agb_column

# Disturbance columns (summary_dist_stats), all sharing one prefix and suffix
_dist_column = '2024_{} forest AGB (Tg)'.format
degradation_before_column = _dist_column('degradation_before_2015')
degradation_since_column = _dist_column('degradation_since_2015')
degradation_total_column = _dist_column('degradation_total')
deforestation_before_column = _dist_column('deforestation_before_2015')
deforestation_since_column = _dist_column('deforestation_since_2015')
deforestation_total_column = _dist_column('deforestation_total')
disturbance_total_column = _dist_column('degradation_deforestation_total')

# Node labels
remaining_name = f'Remaining in {current_year}:'
degradation_before_name = 'Degradation loss before 2015'
degradation_since_name = 'Degradation loss since 2015'
degradation_total_name = 'Degradation loss'
deforestation_before_name = 'Deforestation loss before 2015'
deforestation_since_name = 'Deforestation loss since 2015'
deforestation_total_name = 'Deforestation loss'
disturbance_total_name = 'Disturbance loss'

# Node and ribbon colours
_blue, _dark_green, _light_green = '#007fff', '#1a801a', '#8dc00d'
_white, _yellow = '#ffffff', '#ffff00'
remaining_colour = _blue
degradation_before_colour = _dark_green
degradation_since_colour = _light_green
degradation_total_colour = _light_green
deforestation_before_colour = _white
deforestation_since_colour = _yellow
deforestation_total_colour = _white
disturbance_total_colour = _white

# A before / since split only makes sense when degradation and deforestation are plotted separately
assert separate_disturbance or not separate_degradation, "separate_disturbance must be True if separate_degradation is True."
assert separate_disturbance or not separate_deforestation, "separate_disturbance must be True if separate_deforestation is True."

# Function to get values from statistics
def get_value(df, idx, column_name):
    """Return the cell at row label ``idx`` and column ``column_name`` as a float.

    Best-effort lookup: a missing column, a missing row label or a null cell
    all yield 0.0 instead of raising. Missing columns and rows are reported
    with a printed message that names which of the two was absent.

    Args:
        df: DataFrame to read from.
        idx: Row label, looked up with ``df.loc``.
        column_name: Column label to read.

    Returns:
        The cell converted with ``float()``, or 0.0 when it is unavailable.
    """
    # Checked separately from the row lookup so the message names the right culprit
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in the dataframe.")
        return 0.0
    try:
        value = df.loc[idx, column_name]
    except KeyError:
        # The column exists, so the KeyError can only come from the row label
        print(f"Row '{idx}' not found in the dataframe.")
        return 0.0
    if pd.isnull(value):
        return 0.0
    return float(value)

# Scenario names are the first word of the AGB column names; they do not change per polygon
old_growth_scenario = old_growth_agb_column.split(' ')[0]
current_scenario = current_agb_column.split(' ')[0]

# Build one left-aligned text annotation placed above the plot area (paper coordinates)
def _sankey_annotation(text, y, size, weight):
    return dict(
        x=0,
        y=y,
        xref='paper',
        yref='paper',
        text=text,
        showarrow=False,
        xanchor='left',
        align='left',
        font=dict(family="arial, sans serif", size=size, color="black", weight=weight)
    )

# Build a Sankey figure with the shared node, ribbon and layout styling
def _build_sankey_figure(node_labels, node_colors, sources, targets, values, colors, annotations=None):
    figure = go.Figure(data=[go.Sankey(
        arrangement="freeform",
        node=dict(
            label=node_labels,
            color=node_colors,
            pad=15,
            thickness=20,
            line=dict(color="black", width=1)
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=colors,
            line=dict(color="black", width=1),  # Border around ribbons
        )
    )])
    layout = dict(
        width=700 * width_modifier,
        height=500,
        font=dict(family="arial, sans serif", size=label_font_size, color="black", weight=label_font_weight),
        margin=dict(l=25, r=25, t=115, b=25),  # Large top margin leaves room for the title lines
    )
    if annotations is not None:
        layout['annotations'] = annotations
    figure.update_layout(**layout)
    return figure

# Write a figure as a .png, then as a .svg (optionally with a transparent background)
def _export_sankey(figure, png_dir, svg_dir, polygon_name):
    figure.write_image(join(png_dir, f'sankey_diagram_{polygon_name}.png'), scale=dpi / 96)
    # Transparency is applied after the .png has been written, so it only affects the .svg
    if svg_transparent_background:
        figure.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
    figure.write_image(join(svg_dir, f'sankey_diagram_vector_{polygon_name}.svg'), scale=dpi / 96)

# Loop through each row (polygon area)
for idx in summary_stats.index:

    # Get the polygon name (first column)
    polygon_name = summary_stats.iloc[idx, 0]

    # Get old-growth and current AGB values
    old_growth_agb = get_value(summary_stats, idx, old_growth_agb_column)
    current_agb = get_value(summary_stats, idx, current_agb_column)

    # Get values from summary_dist_stats
    degradation_before = get_value(summary_dist_stats, idx, degradation_before_column)
    degradation_since = get_value(summary_dist_stats, idx, degradation_since_column)
    degradation_total = get_value(summary_dist_stats, idx, degradation_total_column)
    deforestation_before = get_value(summary_dist_stats, idx, deforestation_before_column)
    deforestation_since = get_value(summary_dist_stats, idx, deforestation_since_column)
    deforestation_total = get_value(summary_dist_stats, idx, deforestation_total_column)
    disturbance_total = get_value(summary_dist_stats, idx, disturbance_total_column)

    # Load detailed stats to get mean AGBD and CI95 values
    # .item() raises unless exactly one row matches the scenario name
    detailed_stats_df = pd.read_csv(join(detailed_stats_by_area_dir, f"{polygon_name}.csv"))
    old_growth_index = detailed_stats_df.index[detailed_stats_df['scenario'] == old_growth_scenario].item()
    old_growth_mean_agbd = get_value(detailed_stats_df, old_growth_index, "Forest AGBD mean (Mg / ha)")
    current_index = detailed_stats_df.index[detailed_stats_df['scenario'] == current_scenario].item()
    current_mean_agbd = get_value(detailed_stats_df, current_index, "Forest AGBD mean (Mg / ha)")

    # Uncertainty may not have been calculated
    # NOTE(review): an earlier comment here said CI95 "will be divided by 2 for margin of error",
    # but the values are displayed after ± exactly as read from the CSV. Confirm which is intended.
    uncertainty = 'Forest AGB total CI95 (Tg)' in detailed_stats_df.columns
    if uncertainty:
        old_growth_agb_ci95 = get_value(detailed_stats_df, old_growth_index, "Forest AGB total CI95 (Tg)")
        old_growth_mean_agbd_ci95 = get_value(detailed_stats_df, old_growth_index, "Forest AGBD CI95 (Mg / ha)")
        current_agb_ci95 = get_value(detailed_stats_df, current_index, "Forest AGB total CI95 (Tg)")
        current_mean_agbd_ci95 = get_value(detailed_stats_df, current_index, "Forest AGBD CI95 (Mg / ha)")

    # Title line 1 name
    title_name = f"{polygon_name}"

    # Subtitle line 1 name
    if uncertainty: subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} ± {old_growth_mean_agbd_ci95:.1f} Mg / ha"
    else: subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} Mg / ha"

    # Subtitle line 2 name
    if uncertainty: subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} ± {current_mean_agbd_ci95:.1f} Mg / ha"
    else: subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} Mg / ha"

    # Left axis name
    if left_axis_label:
        if uncertainty: left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} ± {old_growth_agb_ci95:.2f} Tg"
        else: left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} Tg"
    else: left_axis = ''

    # Update remaining_name with AGB
    if uncertainty: remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} ± {current_agb_ci95:.2f} Tg"
    else: remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} Tg"

    # Depending on the settings, collect the loss ribbons (label, value, colour) and the
    # consistency checks (left side, right side, message) that apply to them
    loss_components = []
    consistency_checks = []
    if separate_disturbance:
        if separate_degradation:
            consistency_checks.append((degradation_before + degradation_since, degradation_total,
                                       "degradation_before_column + degradation_since_column != degradation_total_column"))
            loss_components.append((degradation_before_name, degradation_before, degradation_before_colour))
            loss_components.append((degradation_since_name, degradation_since, degradation_since_colour))
        else:
            loss_components.append((degradation_total_name, degradation_total, degradation_total_colour))
        if separate_deforestation:
            consistency_checks.append((deforestation_before + deforestation_since, deforestation_total,
                                       "deforestation_before_column + deforestation_since_column != deforestation_total_column"))
            loss_components.append((deforestation_before_name, deforestation_before, deforestation_before_colour))
            loss_components.append((deforestation_since_name, deforestation_since, deforestation_since_colour))
        else:
            loss_components.append((deforestation_total_name, deforestation_total, deforestation_total_colour))
        consistency_checks.append((degradation_total + deforestation_total, disturbance_total,
                                   "degradation_total_column + deforestation_total_column != disturbance_total_column"))
    else:
        # separate_disturbance is False: all disturbance is drawn as a single ribbon
        loss_components.append((disturbance_total_name, disturbance_total, disturbance_total_colour))
    consistency_checks.append((current_agb - disturbance_total, old_growth_agb,
                               "current_agb_column - disturbance_total_column != old_growth_agb_column"))

    # Statistical assertions: both sides of each check must agree to within 1e-9
    for left_side, right_side, message in consistency_checks:
        assert abs(left_side - right_side) < 1e-9, f"{polygon_name}: {message}"

    # Calculate 'Remaining' value (should be equal to current_agb)
    remaining_value = current_agb

    # Define nodes: node 0 is the left axis, followed by one node per loss ribbon and the remaining node
    nodes = [left_axis] + [name for name, _, _ in loss_components] + [remaining_name_agb]
    # Define links: every ribbon flows from node 0; the loss values are negated before plotting
    values = [-value for _, value, _ in loss_components] + [remaining_value]
    colors = [colour for _, _, colour in loss_components] + [remaining_colour]
    sources = [0] * len(values)
    targets = list(range(1, len(values) + 1))
    # Define node colors
    node_colors = [remaining_colour] + colors

    # Calculate percentages and update right node labels
    if old_growth_agb:
        for node_position, value in enumerate(values, start=1):
            nodes[node_position] += f" ({abs(value) / old_growth_agb * 100:.0f}%)"
    else:
        # get_value returns 0.0 for missing or null cells; a share of zero is undefined
        print(f"{polygon_name}: old-growth AGB is zero, so percentage labels are omitted.")

    # Title and density annotations, depending on the toggles
    title_and_density = []
    if show_title:
        title_and_density.append(_sankey_annotation(title_name, 1.28, title_font_size, title_font_weight))
    if show_density:
        title_and_density.append(_sankey_annotation(subtitle_1_name, 1.19, density_font_size, density_font_weight))
        title_and_density.append(_sankey_annotation(subtitle_2_name, 1.11, density_font_size, density_font_weight))

    # If labels are toggled off, replace node labels with empty strings
    if not show_labels: nodes = [''] * len(nodes)

    # Create and save labelled version (with user settings)
    fig = _build_sankey_figure(nodes, node_colors, sources, targets, values, colors, annotations=title_and_density)
    _export_sankey(fig, sankey_labelled, sankey_labelled_svg, polygon_name)

    # Create and save unlabelled version (no node labels, no title annotations)
    fig_unlabelled = _build_sankey_figure([''] * len(nodes), node_colors, sources, targets, values, colors)
    _export_sankey(fig_unlabelled, sankey_unlabelled, sankey_unlabelled_svg, polygon_name)

    print(f"Statistical assertions and sankey diagram complete for {polygon_name}.")

    # Show the figure (with white background)
    fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
    fig.show()

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# Calls google.colab's runtime.unassign(); presumably this releases the Colab runtime
# once the cells above have finished, so anything not saved to Drive is lost - confirm before relying on it
runtime.unassign()