<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/8_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Subdirectories

In [None]:
# Helpers needed to reach the shared drive and to extend the import path
import os
import sys
from google.colab import drive

# Root folder of the project on the shared Google Drive
base_dir = "/gdrive/Shareddrives/masfi"

# (Re)mount Google Drive, then make the project root importable
drive.mount('/gdrive', force_remount=True)
_path_to_add = os.path.realpath(base_dir)
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Imports and upgrades
!pip install geopandas
!pip install kaleido
!pip install rasterio

In [None]:
# Reload imports, replacing those in the cache
# %load_ext autoreload
# %autoreload 2
# Imports
import geopandas as gpd
from google.colab import runtime
import math
import numpy as np
from os.path import exists, join
from os import makedirs
from osgeo import gdal
import pandas as pd
import plotly.graph_objects as go
import rasterio
from rasterio import mask as msk

In [None]:
# Numbered project subdirectories, each resolved relative to the base directory
areas_dir, scenarios_dir, uncertainty_dir, statistics_dir = (
  join(base_dir, numbered_subdir)
  for numbered_subdir in ("1_areas", "6_scenarios", "7_uncertainty", "8_statistics")
)
# User-supplied sample polygons live inside the statistics directory
sample_polygons_dir = join(statistics_dir, "sample_polygons")

# Create directories
for required_dir in (statistics_dir, sample_polygons_dir):
  makedirs(required_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress):
  """Write an array to a single-band Float32 GeoTIFF that copies a template's georeferencing.

  Args:
    input_array: Array written to band 1 of the output raster.
    output_tif: Path of the GeoTIFF to create.
    template: Path of the raster whose band size, geotransform and projection are copied.
    nodatavalue: Value registered as 'nodata' on the output band.
    compress: If truthy, write with DEFLATE compression (PREDICTOR=2, ZLEVEL=9);
      any falsy value (False, 0, None) writes an uncompressed file.

  Raises:
    FileNotFoundError: If GDAL cannot open the template raster.
    RuntimeError: If GDAL cannot create the output raster.
  """
  template_dataset = gdal.Open(template)
  # gdal.Open returns None instead of raising unless GDAL exceptions are enabled
  if template_dataset is None:
    raise FileNotFoundError(f"Could not open template raster: {template}")
  template_band = template_dataset.GetRasterBand(1)
  template_dimensions, template_projection = template_dataset.GetGeoTransform(), template_dataset.GetProjection()

  # A single if/else guarantees the output dataset is always defined
  # (the previous pair of 'if' statements left it unbound for e.g. compress=None)
  if compress: creation_options = ["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"]
  else: creation_options = []
  driver = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, bands=1, eType=gdal.GDT_Float32,
                                                options=creation_options)
  if driver is None:
    raise RuntimeError(f"Could not create output raster: {output_tif}")

  output_band = driver.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  driver.SetGeoTransform(template_dimensions)
  driver.SetProjection(template_projection)

  # Flush and release the handles explicitly so the file is complete on disk
  # before any later step reopens it, rather than relying on garbage collection
  driver.FlushCache()
  output_band = None
  driver = None
  template_band = None
  template_dataset = None

# Select model, area and sample polygons

In [None]:
# Choose where predictions are read from: scenarios_dir or uncertainty_dir
source_dir = uncertainty_dir

# Print every entry of the source directory as a ready-to-paste model selection
available_models = os.listdir(source_dir)
for subdir in available_models: print(f"selected_model = '{subdir}'")

In [None]:
selected_model = 'agbd_240926_030225'
selected_model_dir = join(source_dir, selected_model)

# Print the available prediction areas as ready-to-paste selections
sourcing_from_scenarios = source_dir == scenarios_dir
sourcing_from_uncertainty = source_dir == uncertainty_dir
for subdir in os.listdir(selected_model_dir):
  # Scenario source: every entry except the .csv / .json files is an area
  if sourcing_from_scenarios and not subdir.endswith(('.csv', '.json')):
    print(f"prediction_area = '{subdir}'")
  # Uncertainty source: drop the first 10 characters of the entry name to get the area
  if sourcing_from_uncertainty and subdir != 'model_iterations':
    print(f"prediction_area = '{subdir[10:]}'")

In [None]:
prediction_area = 'tekai'

# Model-area stats directory
model_area_statistics_dir = join(statistics_dir, f"{selected_model}_{prediction_area}")
makedirs(model_area_statistics_dir, exist_ok=True)

# Calculate precise pixel size for the centre of the area
# See https://gis.stackexchange.com/questions/142326/calculating-longitude-length-in-miles
# See https://thoughtco.com/degree-of-latitude-and-longitude-distance-4070616

# Use a predictor of the prediction area as a template
predictors_dir = join(scenarios_dir, selected_model, prediction_area, "predictors")
prediction_area_template_path = join(predictors_dir, os.listdir(predictors_dir)[0])
prediction_area_template = gdal.Open(prediction_area_template_path)
# Geotransform element 5 is the (negative) pixel height and element 1 the pixel width
# NOTE(review): treated as degrees below - assumes the template uses a geographic CRS; confirm
template_geotransform = prediction_area_template.GetGeoTransform()
pixel_height_deg = 0 - template_geotransform[5]
pixel_width_deg = template_geotransform[1]

# Create a raster without 'nodata' values.
template_no_nodata_dir = join(model_area_statistics_dir, 'template_no_nodata.tif')
if not exists(template_no_nodata_dir):
  prediction_area_template_array = prediction_area_template.ReadAsArray()
  template_nodatavalue = prediction_area_template.GetRasterBand(1).GetNoDataValue()
  if template_nodatavalue is None:
    # The template declares no nodata value, so start from one below the smallest value present
    absent_nodatavalue = float(np.nanmin(prediction_area_template_array)) - 1
  else:
    absent_nodatavalue = template_nodatavalue + 1 # Modify nodata value to one that's absent.
  # Explicit check rather than 'assert', which is skipped when Python runs with -O
  if np.any(prediction_area_template_array == absent_nodatavalue):
    raise ValueError("New nodata value is present in the template. Change to a value that is absent.")
  print("A template without nodata values will be created.")
  export_array_as_tif(prediction_area_template_array, template_no_nodata_dir, template = prediction_area_template_path, nodatavalue=absent_nodatavalue, compress=False)
else: print("Template without nodata values already exists.")
print("This is used for counting all pixels inside a polygon mask (which uses the 'nodata' values).\n")

# Proxy distance of one degree of latitude in km (actual change with latitude is non-linear)
lat_dist_equator_km = 110.567
lat_dist_poles_km = 111.699
lat_dist_diff = lat_dist_poles_km - lat_dist_equator_km
lat_dist_change_deg_km = lat_dist_diff / 90 # Linear change per degree between equator and pole
long_dist_equator_km = 111.321 # Equation calculates at different latitudes

# Approximate pixel size
approx_resolution = (np.average([pixel_height_deg, pixel_width_deg]) * np.average([lat_dist_equator_km, lat_dist_poles_km]) * 1000)
approx_pixel_size_ha = approx_resolution**2 / 10000

print(f"Without precise correction, the approximate resolution is {approx_resolution} m, while the approximate pixel area is {approx_pixel_size_ha} ha.\n")
print("The pixel size will be further corrected based on the position of each sample polygon.")

In [None]:
# Select sample area polygons. This should be a single .gpkg with the field 'name' differentiating polygons.
# Every entry of the sample polygons directory is offered as a candidate
sample_polygons = list(os.listdir(sample_polygons_dir))

if sample_polygons:
  for sample_polygon in sample_polygons:
    print(f"selected_sample_polygons = '{sample_polygon}'")
else:
  print(f"No sample areas found. Upload .gpkg polygons to {sample_polygons_dir}")

In [None]:
selected_sample_polygons = 'tekai_sample_polygons.gpkg'

# Read the chosen GeoPackage
selected_sample_polygons_dir = join(sample_polygons_dir, selected_sample_polygons)
selected_sample_polygons_gpkg = gpd.read_file(selected_sample_polygons_dir)

# Output tree for this model / area / polygon set, named after the file minus its 5-character '.gpkg' extension
sample_polygons_statistics_dir = join(model_area_statistics_dir, selected_sample_polygons[:-len('.gpkg')])
detailed_stats_dir = join(sample_polygons_statistics_dir, 'detailed_stats')
detailed_stats_scenario_dir = join(sample_polygons_statistics_dir, 'detailed_stats_scenario')
forecast_input_dir = join(sample_polygons_statistics_dir, 'forecast_input')
detailed_diff_stats_dir = join(sample_polygons_statistics_dir, 'detailed_diff_stats')
detailed_diff_stats_diff_dir = join(sample_polygons_statistics_dir, 'detailed_diff_stats_diff')

# Create every output directory (existing ones are left untouched)
for output_dir in (
  sample_polygons_statistics_dir,
  detailed_stats_dir,
  detailed_stats_scenario_dir,
  forecast_input_dir,
  detailed_diff_stats_dir,
  detailed_diff_stats_diff_dir,
):
  makedirs(output_dir, exist_ok=True)

# Scenario statistics

In [None]:
# Create list of available prediction rasters and scenarios. Rasters must already be masked (e.g. to forest).
# Where the rasters live depends on which source directory was selected
raster_location_by_source = {
  scenarios_dir: (prediction_area, 'scenario_predictions'),
  uncertainty_dir: (f'scenarios_{prediction_area}', 'statistics_masked'),
}
if source_dir in raster_location_by_source:
  prediction_raster_dir = join(selected_model_dir, *raster_location_by_source[source_dir])

# Position of the scenario name among the '__'-separated fields of a raster filename
scenario_field_by_source = {uncertainty_dir: 1, scenarios_dir: 0}

prediction_raster_dirs = []
scenarios = set()
for prediction_raster in os.listdir(prediction_raster_dir):
  prediction_raster_dirs.append(join(prediction_raster_dir, prediction_raster))
  if source_dir in scenario_field_by_source:
    scenarios.add(prediction_raster.split("__")[scenario_field_by_source[source_dir]])
scenarios = sorted(scenarios)

# Select scenario predictions to calculate statistics
print('selected_scenarios = [')
for scenario in scenarios: print(f'  "{scenario}",')
print(']\n')

In [None]:
selected_scenarios = [
  "2022",
  "2022_no_degradation_since_1990",
  "2022_oldgrowth",
  "2023",
  "2023_no_degradation_since_1990",
  "2023_oldgrowth",
  "all_oldgrowth",
]

# Keep only rasters of the selected scenarios, separating predictions from uncertainties (if the latter are present)
prediction_rasters, uncertainty_rasters = [], []
sourced_from_uncertainty = source_dir == uncertainty_dir
for prediction_raster in prediction_raster_dirs:
  for scenario in selected_scenarios:
    filename_fields = prediction_raster.split('/')[-1].split('__')
    if sourced_from_uncertainty:
      # Filenames carry the scenario in their second '__'-separated field
      if filename_fields[1] != scenario: continue
      if 'mean__' in prediction_raster: prediction_rasters.append(prediction_raster)
      if 'uncertainty__' in prediction_raster: uncertainty_rasters.append(prediction_raster)
    elif filename_fields[0] == scenario: # Source is scenarios_dir (without uncertainty values)
      prediction_rasters.append(prediction_raster)

# Uncertainty stats are only generated when at least one uncertainty raster was found
generate_uncertainty_stats = len(uncertainty_rasters) > 0

# Sort rasters chronologically (assuming year is first in the filename)
prediction_rasters.sort()
uncertainty_rasters.sort()

# Empty dataframes for statistics: one row per selected scenario, one column per polygon added later
df_base = pd.DataFrame(index = selected_scenarios).rename_axis('scenario')
df_forest_cover_ha = df_base.copy()
df_agbd_mean_mg_ha = df_base.copy()
df_agbd_stdev_mg_ha = df_base.copy()
df_agb_total_tg = df_base.copy()
# Uncertainty dataframes only exist when uncertainty rasters are present
if generate_uncertainty_stats:
  df_agbd_mean_mg_ha_ci95 = df_base.copy()
  df_agbd_mean_mg_ha_uncertainty = df_base.copy()
  df_agb_total_tg_ci95 = df_base.copy()

# Initialise polygon area dataframe
df_polygon_area_km2 = pd.DataFrame(columns = ["Name", "Area (km^2)"])
# Loop through each polygon stored in GPKG to generate statistics.
# For every polygon this computes forest cover, AGBD mean / stdev and total AGB for each
# prediction raster (plus CI95 / uncertainty when uncertainty rasters exist) and appends
# the results as one new column, named after the polygon, to each statistics dataframe.
# NOTE(review): values are matched to the scenario index by position, which assumes the
# sorted prediction rasters hold exactly one raster per selected scenario, in the same
# order as 'selected_scenarios' - confirm when changing the selection.
# NOTE(review): the difference-statistics loop later in this file repeats the pixel-size
# calculation below and should be kept consistent with it.
for index, row in selected_sample_polygons_gpkg.iterrows():

  # Define the polygon
  sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"] # Set name to start at 3rd character with [2:] (skipping number used for ordering)
  # Multi-part geometries expose their parts through '.geoms'; a single-part Polygon
  # has no such attribute, so it is wrapped in a one-element list instead of failing
  polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

  # Latitude of the centroid
  polygon_centroid_lat = sample_polygon_geometry.centroid.y

  # Calculate latitude distance per degree at the polygon's latitude.
  # The proxy grows with distance from the equator, so the absolute latitude is used
  # (a signed latitude would shrink the distance in the southern hemisphere).
  latitude_m_per_degree = 1000 * (lat_dist_equator_km + (lat_dist_change_deg_km * abs(polygon_centroid_lat)))

  # Pixel size in metres / hectares at the polygon's latitude
  precise_pixel_height_m = latitude_m_per_degree * pixel_height_deg
  precise_pixel_width_m = (math.cos((math.pi / 180) * polygon_centroid_lat) * latitude_m_per_degree * pixel_width_deg)
  precise_pixel_size_ha = precise_pixel_height_m * precise_pixel_width_m / 10000

  # Mask the 'no nodata' raster to the polygon with an absent value to count all pixels within the polygon
  with rasterio.open(template_no_nodata_dir) as template_no_nodata:
    no_nodata_template_array_masked, transform_1 = msk.mask(template_no_nodata, polygons, crop=True, filled=False)

  # Calculate sample_polygon_geometry area (ellipsoidal as opposed to planimetric)
  sample_polygons_crs = selected_sample_polygons_gpkg.crs
  temp_gdf = gpd.GeoDataFrame({'name': [sample_polygon_name], 'geometry': sample_polygon_geometry}, crs=sample_polygons_crs)
  temp_gdf_utm = temp_gdf.estimate_utm_crs()
  polygon_area_ha = np.divide(temp_gdf.to_crs(temp_gdf_utm).area[0], 10000, dtype='float64')
  # Ensure the new_row has the correct data types
  new_row = pd.DataFrame([{'Name': sample_polygon_name, 'Area (km^2)': polygon_area_ha / 100}], dtype=object)
  df_polygon_area_km2 = pd.concat([df_polygon_area_km2, new_row], ignore_index=True, sort=False)

  # Count all (incl. nodata) pixels within polygon, and estimate their total area.
  # These depend only on the polygon, so they are computed once instead of once per raster.
  pixel_count_polygon = np.ma.count(no_nodata_template_array_masked)
  pixels_area_polygon_ha = np.multiply(pixel_count_polygon, precise_pixel_size_ha, dtype='float64')

  # Correct pixel size to UTM ellipsoidal measure of the polygons position (which will decrease further North).
  # The ratio is polygon area / pixel-derived area, so that
  # pixel_count_polygon * pixel_size_ha_corrected equals the polygon's UTM area.
  # (It was previously computed the other way round, which amplified the error it was meant to remove.)
  if pixel_count_polygon > 0:
    polygon_to_pixel_area_ratio = np.divide(polygon_area_ha, pixels_area_polygon_ha, dtype='float64')
  else:
    # No pixel falls inside the polygon: nothing to calibrate against, keep the latitude-based size
    polygon_to_pixel_area_ratio = np.float64(1.0)
  pixel_size_ha_corrected = np.multiply(precise_pixel_size_ha, polygon_to_pixel_area_ratio, dtype='float64')

  # Create an empty list for each prediction raster statistic
  values_forest_cover_ha, values_agbd_mean_mg_ha, values_agbd_stdev_mg_ha, values_agb_total_tg = [], [], [], []

  # If uncertainty rasters are present, create an empty list for each uncertainty raster statistic
  if generate_uncertainty_stats:
    values_agbd_mean_mg_ha_ci95, values_agbd_mean_mg_ha_uncertainty, values_agb_total_tg_ci95  = [], [], []

  # Loop through prediction rasters
  for prediction_raster in prediction_rasters:

    # Mask predictor to sample_polygon_geometry
    with rasterio.open(prediction_raster) as prediction:
      nodatavalue = int(prediction.nodatavals[0])
      prediction_array_masked, transform_2 = msk.mask(prediction, polygons, crop=True, filled=False)

    # Count pixels within polygon, excluding those previously masked i.e. nodata
    pixel_count_polygon_masked = np.ma.count(prediction_array_masked)
    pixels_area_polygon_masked_ha = np.multiply(pixel_count_polygon_masked, pixel_size_ha_corrected, dtype='float64')

    # Calculate forest area
    forest_cover_ha = pixels_area_polygon_masked_ha # Already masked to forest in current workflow

    # Calculate total, mean and stdev of aboveground biomass
    agbd_mean_mg_ha = np.ma.mean(prediction_array_masked, dtype='float64') # Float64 minimises error for large number of values
    agbd_mean_stdev_ha = np.ma.std(prediction_array_masked, dtype='float64')
    agb_total_mg = np.multiply(agbd_mean_mg_ha, forest_cover_ha, dtype='float64')
    agb_total_tg = np.divide(agb_total_mg, 1000000, dtype='float64') # Convert Mg (megagram = ton) to Tg (teragram = megaton)

    # Append results to statistics list
    values_forest_cover_ha.append(forest_cover_ha)
    values_agbd_mean_mg_ha.append(agbd_mean_mg_ha)
    values_agbd_stdev_mg_ha.append(agbd_mean_stdev_ha)
    values_agb_total_tg.append(agb_total_tg)

    if generate_uncertainty_stats:
      # Find the uncertainty raster sharing the filename part that follows 'mean__' (last match wins)
      prediction_suffix = prediction_raster.split('mean__')[1]
      uncertainty_raster_present = False
      for uncertainty_raster in uncertainty_rasters:
        if prediction_suffix in uncertainty_raster:
          uncertainty_raster_present = True
          matching_uncertainty_raster = uncertainty_raster

      if not uncertainty_raster_present:
        print(f"There is no uncertainty raster for {prediction_raster.split('/')[-1]}")
        # Keep the uncertainty lists aligned with the scenario index: without a placeholder
        # they end up shorter than the index and building the dataframe column below fails
        values_agbd_mean_mg_ha_ci95.append(np.nan)
        values_agbd_mean_mg_ha_uncertainty.append(np.nan)
        values_agb_total_tg_ci95.append(np.nan)

      if uncertainty_raster_present:
        # Open and mask uncertainty raster to polygon
        with rasterio.open(matching_uncertainty_raster) as uncertainty:
          nodatavalue = int(uncertainty.nodatavals[0])
          uncertainty_array_masked, transform_2 = msk.mask(uncertainty, polygons, crop=True, filled=False)

        # See https://stats.stackexchange.com/questions/223924/how-to-add-up-partial-confidence-intervals-to-create-a-total-confidence-interval#comment426260_223924

        # Compress masked array data to 1D
        prediction_1d = np.ma.compressed(prediction_array_masked)
        uncertainty_1d = np.ma.compressed(uncertainty_array_masked)
        # Convert uncertainty percentages to ratios
        uncertainty_ratios = np.divide(uncertainty_1d, 100, dtype='float64')
        # Multiply the prediction values (mean AGBD Mg/ha) by uncertainty ratios for CI95 values
        prediction_ci95s = np.multiply(prediction_1d, uncertainty_ratios, dtype='float64')

        # Method 1 - Simple calculation of mean CI95 (higher estimate). Assumption: pixel values are completely correlated (all measure the same thing).
        agbd_mean_mg_ha_ci95_1 = np.mean(prediction_ci95s, dtype = 'float64')
        # Method 2 - Square prediction CI95s. Sum and then square root for total CI.
        # Then divide by observations to calculate mean CI95. Assumption: pixel values are completely independent.
        sum_squares = np.sum(np.square(prediction_ci95s, dtype='float64'), dtype='float64')
        total_ci95 = np.sqrt(sum_squares, dtype='float64')
        agbd_mean_mg_ha_ci95_2 = np.divide(total_ci95, np.ma.count(prediction_ci95s), dtype='float64')
        # Method 3 - Used in Liang et al 2023 to calculate change uncertainty. Identical results to method 2.
        predictions_x_uncertainties = np.multiply(prediction_1d, uncertainty_1d, dtype='float64')
        sum_squares_pxu = np.sum(np.square(predictions_x_uncertainties, dtype='float64'), dtype='float64')
        sqrt_divided_sum = np.sqrt(sum_squares_pxu, dtype='float64') / np.sum(prediction_1d, dtype='float64')
        agbd_mean_mg_ha_ci95_3 = np.multiply(np.divide(sqrt_divided_sum, 100, dtype='float64'), agbd_mean_mg_ha, dtype='float64')

        # Method 1 is the one reported; methods 2 and 3 are computed for comparison only
        agbd_mean_mg_ha_ci95 = agbd_mean_mg_ha_ci95_1

        # Calculate total AGB CI95
        agb_total_mg_ci95 = np.multiply(agbd_mean_mg_ha_ci95, forest_cover_ha, dtype='float64')
        agb_total_tg_ci95 = np.divide(agb_total_mg_ci95, 1000000, dtype='float64') # Convert total CI to Tg
        # Calculate percentage uncertainty
        agbd_mean_mg_ha_uncertainty = np.multiply(np.divide(agbd_mean_mg_ha_ci95, agbd_mean_mg_ha, dtype='float64'), 100, dtype='float64')
        # Append results to statistics list
        values_agbd_mean_mg_ha_ci95.append(agbd_mean_mg_ha_ci95)
        values_agbd_mean_mg_ha_uncertainty.append(agbd_mean_mg_ha_uncertainty)
        values_agb_total_tg_ci95.append(agb_total_tg_ci95)

  # Concatenate new columns to the main DataFrames for each statistic
  df_forest_cover_ha = pd.concat([df_forest_cover_ha, pd.DataFrame({sample_polygon_name: values_forest_cover_ha}, index=df_forest_cover_ha.index)], axis=1)
  df_agbd_mean_mg_ha = pd.concat([df_agbd_mean_mg_ha, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha}, index=df_agbd_mean_mg_ha.index)], axis=1)
  df_agbd_stdev_mg_ha = pd.concat([df_agbd_stdev_mg_ha, pd.DataFrame({sample_polygon_name: values_agbd_stdev_mg_ha}, index=df_agbd_stdev_mg_ha.index)], axis=1)
  df_agb_total_tg = pd.concat([df_agb_total_tg, pd.DataFrame({sample_polygon_name: values_agb_total_tg}, index=df_agb_total_tg.index)], axis=1)

  if generate_uncertainty_stats:
      df_agbd_mean_mg_ha_ci95 = pd.concat([df_agbd_mean_mg_ha_ci95, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_ci95}, index=df_agbd_mean_mg_ha_ci95.index)], axis=1)
      df_agbd_mean_mg_ha_uncertainty = pd.concat([df_agbd_mean_mg_ha_uncertainty, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_uncertainty}, index=df_agbd_mean_mg_ha_uncertainty.index)], axis=1)
      df_agb_total_tg_ci95 = pd.concat([df_agb_total_tg_ci95, pd.DataFrame({sample_polygon_name: values_agb_total_tg_ci95}, index=df_agb_total_tg_ci95.index)], axis=1)

# Create stats list (the uncertainty dataframes only exist when uncertainty rasters were found)
if generate_uncertainty_stats:
  df_stats_list = [df_forest_cover_ha, df_agbd_mean_mg_ha, df_agbd_mean_mg_ha_ci95,
                 df_agbd_mean_mg_ha_uncertainty, df_agbd_stdev_mg_ha, df_agb_total_tg, df_agb_total_tg_ci95]
else: df_stats_list = [df_forest_cover_ha, df_agbd_mean_mg_ha, df_agbd_stdev_mg_ha, df_agb_total_tg]

# Set index of the polygon area km2 dataframe to 'Name' of the polygon
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Export statistics for forecast input.
# Each dataframe is paired explicitly with its output filename. Identifying a dataframe by
# comparing its contents with DataFrame.equals picks the wrong name whenever two dataframes
# happen to hold equal values (e.g. both are empty).
# Further statistics (e.g. AGBD mean or stdev) can be exported by extending both lists.
df_forecast_list = [df_forest_cover_ha, df_agb_total_tg]
df_forecast_filenames = ["forest_cover_ha", "agb_total_tg"]
for df_forecast, df_filename in zip(df_forecast_list, df_forecast_filenames):
  # Keep only scenarios whose name has no underscore, i.e. drop the alternative scenarios
  df_noalts = df_forecast[~df_forecast.index.str.contains("_")]
  df_noalts.to_csv(join(forecast_input_dir, f'{df_filename}.csv'))

# Generate summary stats: one row per polygon, one column per scenario and statistic
df_forest_cover_ha_t = df_forest_cover_ha.T.rename_axis("Name", axis=1).add_suffix(" forest cover (ha)")
df_agb_total_tg_t = df_agb_total_tg.T.rename_axis("Name", axis=1).add_suffix(" forest AGB (Tg)")
if generate_uncertainty_stats:
  df_agb_total_tg_ci95_t = df_agb_total_tg_ci95.T.rename_axis("Name", axis=1).add_suffix(" forest AGB CI95 (Tg)")
  summary_stats = pd.concat([df_polygon_area_km2, df_forest_cover_ha_t, df_agb_total_tg_t, df_agb_total_tg_ci95_t], axis=1).rename_axis("Name", axis=1)
else: summary_stats = pd.concat([df_polygon_area_km2, df_forest_cover_ha_t, df_agb_total_tg_t], axis=1).rename_axis("Name", axis=1)
summary_stats.to_csv(join(sample_polygons_statistics_dir, 'summary_stats.csv'))

# Generate detailed stats by polygon (requires uncertainty stats)
# Writes one CSV per polygon: a row per scenario, the polygon area, and one column per statistic.

# Column label for each statistics dataframe, keyed by object identity.
# Matching by content with DataFrame.equals mislabels columns whenever two
# dataframes happen to hold equal values.
stat_col_by_df_id = {
  id(df_forest_cover_ha): "Forest cover (ha)",
  id(df_agbd_mean_mg_ha): "Forest AGBD mean (Mg / ha)",
  id(df_agbd_stdev_mg_ha): "Forest AGBD stdev (Mg / ha)",
  id(df_agb_total_tg): "Forest AGB total (Tg)",
}
if generate_uncertainty_stats:
  stat_col_by_df_id.update({
    id(df_agbd_mean_mg_ha_ci95): "Forest AGBD CI95 (Mg / ha)",
    id(df_agbd_mean_mg_ha_uncertainty): "Forest AGBD uncertainty (%)",
    id(df_agb_total_tg_ci95): "Forest AGB total CI95 (Tg)",
  })

for polygon_area in df_stats_list[0]:
  polygon_area_km2 = df_polygon_area_km2.loc[polygon_area]["Area (km^2)"]
  # Work on a copy: assigning a column to 'df_base' itself would silently modify the shared template
  df_detailed_stats = df_base.copy()
  df_detailed_stats["Area (km^2)"] = polygon_area_km2
  for df_stats in df_stats_list:
    stat_col = stat_col_by_df_id[id(df_stats)]
    # Take this polygon's column from the statistic and label it with the statistic's name
    if polygon_area in df_stats.columns:
      df_detailed_stats = pd.concat([df_detailed_stats, df_stats[polygon_area].rename(stat_col)], axis=1)
  # Write once per polygon, after all statistics have been added
  df_detailed_stats.to_csv(join(detailed_stats_dir, f'{polygon_area}.csv'))

# Generate detailed stats by scenario (requires uncertainty stats)
# Regroup the per-polygon CSVs so that each scenario gets one table with a row per polygon
scenarios = {}
for stats_csv in os.listdir(detailed_stats_dir):
    # The polygon name is the filename without its 4-character extension
    polygon_name = stats_csv[:-4]
    stats_csv_path = join(detailed_stats_dir, stats_csv)
    stats_csv_df = pd.read_csv(stats_csv_path)
    for scenario in stats_csv_df['scenario'].unique():
        # Rows of this scenario, with the 'scenario' column swapped for a leading 'Name' column
        rows_of_scenario = stats_csv_df['scenario'] == scenario
        scenario_df = stats_csv_df[rows_of_scenario].drop(columns='scenario')
        scenario_df.insert(0, 'Name', polygon_name)
        # Start a table for a new scenario, otherwise extend the existing one
        if scenario not in scenarios:
            scenarios[scenario] = scenario_df
        else:
            scenarios[scenario] = pd.concat([scenarios[scenario], scenario_df], ignore_index=True)
# Write one CSV per scenario
for scenario, scenario_df in scenarios.items():
    output_file_path = join(detailed_stats_scenario_dir,f'{scenario}.csv')
    scenario_df.to_csv(output_file_path, index=False)

# Difference statistics

In [None]:
# Create list of available difference rasters and scenarios
# The folder holding the 'scenario_difference' directory depends on the selected source
diff_parent_by_source = {
  scenarios_dir: prediction_area,
  uncertainty_dir: f'scenarios_{prediction_area}',
}
if source_dir in diff_parent_by_source:
  diff_raster_dir = join(selected_model_dir, diff_parent_by_source[source_dir], 'scenario_difference')

# Position of the difference name among the '__'-separated fields of a raster filename
diff_field_by_source = {uncertainty_dir: 1, scenarios_dir: 0}

diff_raster_dirs = []
diffs = set()
for diff_raster in os.listdir(diff_raster_dir):
  diff_raster_dirs.append(join(diff_raster_dir, diff_raster))
  if source_dir in diff_field_by_source:
    diffs.add(diff_raster.split("__")[diff_field_by_source[source_dir]])
diffs = sorted(diffs)

# Select difference rasters to calculate statistics
print('selected_diffs = [')
for diff in diffs: print(f'  "{diff}",')
print(']')

In [None]:
selected_diffs = [
  "2022_deforestation_total",
  "2022_degradation_before_1990",
  "2022_degradation_deforestation_total",
  "2022_degradation_since_1990",
  "2022_degradation_total",
  "2023_deforestation_total",
  "2023_degradation_before_1990",
  "2023_degradation_deforestation_total",
  "2023_degradation_since_1990",
  "2023_degradation_total",
]

# Keep only rasters of the selected differences, separating means from uncertainties (if the latter are present)
diff_rasters, diff_uncertainty_rasters = [], []
sourced_from_uncertainty = source_dir == uncertainty_dir
for diff_raster in diff_raster_dirs:
  for diff in selected_diffs:
    filename_fields = diff_raster.split('/')[-1].split('__')
    if sourced_from_uncertainty:
      # Filenames carry the difference name in their second '__'-separated field
      if filename_fields[1] != diff: continue
      if 'mean__' in diff_raster: diff_rasters.append(diff_raster)
      if 'uncertainty__' in diff_raster: diff_uncertainty_rasters.append(diff_raster)
    elif filename_fields[0] == diff: # Source is scenarios_dir (without uncertainty values)
      diff_rasters.append(diff_raster)

# Uncertainty stats are only generated when at least one uncertainty raster was found
generate_uncertainty_stats = len(diff_uncertainty_rasters) > 0

# Sort rasters chronologically (assuming year is first in the filename)
diff_rasters.sort()
diff_uncertainty_rasters.sort()

# Empty dataframes for statistics: one row per selected difference, one column per polygon added later
df_base = pd.DataFrame(index = selected_diffs).rename_axis('diff')
df_agbd_mean_mg_ha = df_base.copy()
df_agb_total_tg = df_base.copy()
# Uncertainty dataframes only exist when uncertainty rasters are present
if generate_uncertainty_stats:
  df_agbd_mean_mg_ha_ci95 = df_base.copy()
  df_agbd_mean_mg_ha_uncertainty = df_base.copy()
  df_agb_total_tg_ci95 = df_base.copy()

# Initialise polygon area dataframe
df_polygon_area_km2 = pd.DataFrame(columns = ["Name", "Area (km^2)"])

# Loop through each polygon stored in GPKG to generate statistics
for index, row in selected_sample_polygons_gpkg.iterrows():

  # Define the polygon
  sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"] # Set name to start at 3rd character with [2:] (skipping number used for ordering)
  polygons = [polygon for polygon in sample_polygon_geometry.geoms]

  # Latitude of the centroid
  polygon_centroid_lat = sample_polygon_geometry.centroid.y

  # Calculate latitude distance per degree at the polygon's latitude
  latitude_m_per_degree = 1000 * (lat_dist_equator_km + (lat_dist_change_deg_km * polygon_centroid_lat))

  # Rest of your calculations that depend on the latitude_m_per_degree follows here
  # For example, if you're calculating pixel size in meters for each polygon based on its latitude:
  precise_pixel_height_m = latitude_m_per_degree * pixel_height_deg
  precise_pixel_width_m = (math.cos((math.pi / 180) * polygon_centroid_lat) * latitude_m_per_degree * pixel_width_deg)
  precise_pixel_size_ha = precise_pixel_height_m * precise_pixel_width_m / 10000

  # Mask the 'no nodata' raster to the polygon with an absent value to count all pixels within the polygon
  with rasterio.open(template_no_nodata_dir) as template_no_nodata:
    no_nodata_template_array_masked, transform_1 = msk.mask(template_no_nodata, polygons, crop=True, filled=False)

  # Calculate sample_polygon_geometry area (ellipsoidal as opposed to planimetric)
  sample_polygons_crs = selected_sample_polygons_gpkg.crs
  temp_gdf = gpd.GeoDataFrame({'name': [sample_polygon_name], 'geometry': sample_polygon_geometry}, crs=sample_polygons_crs)
  temp_gdf_utm = temp_gdf.estimate_utm_crs()
  polygon_area_ha = np.divide(temp_gdf.to_crs(temp_gdf_utm).area[0], 10000, dtype='float64')
  # Ensure the new_row has the correct data types
  new_row = pd.DataFrame([{'Name': sample_polygon_name, 'Area (km^2)': polygon_area_ha / 100}], dtype=object)
  df_polygon_area_km2 = pd.concat([df_polygon_area_km2, new_row], ignore_index=True, sort=False)

  # Create an empty list for each prediction raster statistic
  values_agbd_mean_mg_ha, values_agb_total_tg = [], []

  # If uncertainty rasters are present, create an empty list for each uncertainty raster statistic
  if generate_uncertainty_stats:
    values_agbd_mean_mg_ha_ci95, values_agbd_mean_mg_ha_uncertainty, values_agb_total_tg_ci95  = [], [], []

  # Loop through diff rasters
  for diff_raster in diff_rasters:

    # Mask predictor to sample_polygon_geometry
    with rasterio.open(diff_raster) as diff:
      nodatavalue = int(diff.nodatavals[0])
      diff_array_masked, transform_2 = msk.mask(diff, polygons, crop=True, filled=False)

    # Count all (incl. nodata) pixels within polygon, and estimate their total area
    pixel_count_polygon = np.ma.count(no_nodata_template_array_masked)
    pixels_area_polygon_ha = np.multiply(pixel_count_polygon, precise_pixel_size_ha, dtype='float64')

    # Correct pixel size to UTM ellipsoidal measure of the polygons position (which will decrease further North)
    polygon_to_pixel_area_ratio = np.divide(pixels_area_polygon_ha, polygon_area_ha, dtype='float64')
    pixel_size_ha_corrected = np.multiply(precise_pixel_size_ha, polygon_to_pixel_area_ratio, dtype='float64')

    # Count pixels within polygon, excluding those previously masked i.e. nodata
    pixel_count_polygon_masked = np.ma.count(diff_array_masked)
    pixels_area_polygon_masked_ha = np.multiply(pixel_count_polygon_masked, pixel_size_ha_corrected, dtype='float64')

    # Calculate forest area
    forest_cover_ha = pixels_area_polygon_masked_ha # Already masked to forest in current workflow

    # Calculate total, mean and stdev of aboveground biomass
    agbd_mean_mg_ha = np.ma.mean(diff_array_masked, dtype='float64') # Float64 minimises error for large number of values
    agb_total_mg = np.multiply(agbd_mean_mg_ha, forest_cover_ha, dtype='float64')
    agb_total_tg = np.divide(agb_total_mg, 1000000, dtype='float64') # Convert Mg (megagram = ton) to Tg (teragram = megaton)

    # Append results to statistics list
    values_agbd_mean_mg_ha.append(agbd_mean_mg_ha)
    values_agb_total_tg.append(agb_total_tg)

    if generate_uncertainty_stats:
      diff_uncertainty_raster_present = False
      for diff_uncertainty_raster in diff_uncertainty_rasters:
        if diff_raster.split('mean__')[1] in diff_uncertainty_raster:
          diff_uncertainty_raster_present = True
          matching_diff_uncertainty_raster = diff_uncertainty_raster

      if not diff_uncertainty_raster_present: print(f"There is no uncertainty raster for {diff_raster.split('/')[-1]}")

      if diff_uncertainty_raster_present:
        # Open and mask uncertainty raster to polygon
          with rasterio.open(matching_diff_uncertainty_raster) as uncertainty:
            nodatavalue = int(uncertainty.nodatavals[0])
            uncertainty_array_masked, transform_2 = msk.mask(uncertainty, polygons, crop=True, filled=False)

          # See https://stats.stackexchange.com/questions/223924/how-to-add-up-partial-confidence-intervals-to-create-a-total-confidence-interval#comment426260_223924

          # Compress masked array data to 1D
          diff_1d = np.ma.compressed(diff_array_masked)
          uncertainty_1d = np.ma.compressed(uncertainty_array_masked)
          # Convert uncertainty percentages to ratios
          uncertainty_ratios = np.divide(uncertainty_1d, 100, dtype='float64')
          # Multiply the diff values (mean AGBD Mg/ha) by uncertainty ratios for CI95 values
          diff_ci95s = np.multiply(diff_1d, uncertainty_ratios, dtype='float64')

          # Method 1 - Simple calculation of mean CI95 (higher estimate). Assumption: pixel values are completely correlated (all measure the same thing).
          agbd_mean_mg_ha_ci95_1 = np.mean(diff_ci95s, dtype = 'float64')
          # Method 2 - Square diff CI95s. Sum and then square root for total CI.
          # Then divide by observations to calculate mean CI95. Assumption: pixel values are completely independent.
          sum_squares = np.sum(np.square(diff_ci95s, dtype='float64'), dtype='float64')
          total_ci95 = np.sqrt(sum_squares, dtype='float64')
          agbd_mean_mg_ha_ci95_2 = np.divide(total_ci95, np.ma.count(diff_ci95s), dtype='float64')
          # Method 3 - Used in Liang et al 2023 to calculate change uncertainty. Identical results to method 2.
          diffs_x_uncertainties = np.multiply(diff_1d, uncertainty_1d, dtype='float64')
          sum_squares_pxu = np.sum(np.square(diffs_x_uncertainties, dtype='float64'), dtype='float64')
          sqrt_divided_sum = np.sqrt(sum_squares_pxu, dtype='float64') / np.sum(diff_1d, dtype='float64')
          agbd_mean_mg_ha_ci95_3 = np.multiply(np.divide(sqrt_divided_sum, 100, dtype='float64'), agbd_mean_mg_ha, dtype='float64')

          agbd_mean_mg_ha_ci95 = agbd_mean_mg_ha_ci95_1

          # Calculate total AGB CI95
          agb_total_mg_ci95 = np.multiply(agbd_mean_mg_ha_ci95, forest_cover_ha, dtype='float64')
          agb_total_tg_ci95 = np.divide(agb_total_mg_ci95, 1000000, dtype='float64') # Convert total CI to Tg
          # Calculate percentage uncertainty
          agbd_mean_mg_ha_uncertainty = np.multiply(np.divide(agbd_mean_mg_ha_ci95, agbd_mean_mg_ha, dtype='float64'), 100, dtype='float64')
          # Append results to statistics list
          values_agbd_mean_mg_ha_ci95.append(agbd_mean_mg_ha_ci95)
          values_agbd_mean_mg_ha_uncertainty.append(agbd_mean_mg_ha_uncertainty)
          values_agb_total_tg_ci95.append(agb_total_tg_ci95)

  # Concatenate new columns to the main DataFrames for each statistic
  df_agbd_mean_mg_ha = pd.concat([df_agbd_mean_mg_ha, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha}, index=df_agbd_mean_mg_ha.index)], axis=1)
  df_agb_total_tg = pd.concat([df_agb_total_tg, pd.DataFrame({sample_polygon_name: values_agb_total_tg}, index=df_agb_total_tg.index)], axis=1)

  if generate_uncertainty_stats:
      df_agbd_mean_mg_ha_ci95 = pd.concat([df_agbd_mean_mg_ha_ci95, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_ci95}, index=df_agbd_mean_mg_ha_ci95.index)], axis=1)
      df_agbd_mean_mg_ha_uncertainty = pd.concat([df_agbd_mean_mg_ha_uncertainty, pd.DataFrame({sample_polygon_name: values_agbd_mean_mg_ha_uncertainty}, index=df_agbd_mean_mg_ha_uncertainty.index)], axis=1)
      df_agb_total_tg_ci95 = pd.concat([df_agb_total_tg_ci95, pd.DataFrame({sample_polygon_name: values_agb_total_tg_ci95}, index=df_agb_total_tg_ci95.index)], axis=1)


# Collect the statistic DataFrames to report; the uncertainty tables only
# exist when uncertainty statistics were generated
df_stats_list = [df_agbd_mean_mg_ha, df_agb_total_tg]
if generate_uncertainty_stats:
  df_stats_list = [
      df_agbd_mean_mg_ha,
      df_agbd_mean_mg_ha_ci95,
      df_agbd_mean_mg_ha_uncertainty,
      df_agb_total_tg,
      df_agb_total_tg_ci95,
  ]

# Index the polygon area table by polygon 'Name' so it can be joined and
# looked up by polygon
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Summary stats: transpose so each polygon is a row, then label every
# column with the statistic it holds
df_agb_total_tg_t = df_agb_total_tg.T.rename_axis("Name", axis=1).add_suffix(" forest AGB (Tg)")
summary_frames = [df_polygon_area_km2, df_agb_total_tg_t]
if generate_uncertainty_stats:
  df_agb_total_tg_ci95_t = df_agb_total_tg_ci95.T.rename_axis("Name", axis=1).add_suffix(" forest AGB CI95 (Tg)")
  summary_frames.append(df_agb_total_tg_ci95_t)

# Join everything side by side and write the summary to disk
summary_stats = pd.concat(summary_frames, axis=1).rename_axis("Name", axis=1)
summary_stats.to_csv(join(sample_polygons_statistics_dir, 'summary_diff_stats.csv'))

# Generate detailed stats by polygon (uncertainty columns are included only
# when uncertainty stats were generated)

# Output column label for each statistic DataFrame. Frames are matched by
# identity ('is'), not by content: DataFrame.equals() would confuse two
# statistics that happen to hold equal values (e.g. all NaN or all zero).
stat_col_by_frame = [
    (df_agbd_mean_mg_ha, "Forest AGBD mean (Mg / ha)"),
    (df_agb_total_tg, "Forest AGB total (Tg)"),
]
if generate_uncertainty_stats:
  stat_col_by_frame += [
      (df_agbd_mean_mg_ha_ci95, "Forest AGBD CI95 (Mg / ha)"),
      (df_agbd_mean_mg_ha_uncertainty, "Forest AGBD uncertainty (%)"),
      (df_agb_total_tg_ci95, "Forest AGB total CI95 (Tg)"),
  ]

# Each column of the first statistic DataFrame is one polygon area
for polygon_area in df_stats_list[0]:
  polygon_area_km2 = df_polygon_area_km2.loc[polygon_area]["Area (km^2)"]
  # Copy so that adding the area column does not modify df_base itself
  df_detailed_diff_stats = df_base.copy()
  df_detailed_diff_stats["Area (km^2)"] = polygon_area_km2
  for df_stats in df_stats_list:
    stat_col = next(label for frame, label in stat_col_by_frame if frame is df_stats)
    # Add this polygon's column, renamed to the statistic it represents
    if polygon_area in df_stats.columns:
      df_stats_renamed = df_stats.rename(columns={polygon_area: stat_col})
      df_detailed_diff_stats = pd.concat([df_detailed_diff_stats, df_stats_renamed[stat_col]], axis=1)
  # Write once per polygon, after every statistic column has been added
  df_detailed_diff_stats.to_csv(join(detailed_diff_stats_dir, f'{polygon_area}.csv'))

# Generate detailed stats by diff: regroup the per-polygon CSVs so that each
# diff gets one CSV with a row per polygon
diff_frames = {}
# Loop through the CSV files in 'detailed_diff_stats_dir' in sorted order so
# the row order of the outputs is reproducible; skip anything that is not a CSV
for stats_csv in sorted(os.listdir(detailed_diff_stats_dir)):
    polygon_name, extension = os.path.splitext(stats_csv)
    if extension.lower() != '.csv':
        continue
    stats_csv_path = join(detailed_diff_stats_dir, stats_csv)
    stats_csv_df = pd.read_csv(stats_csv_path)
    # Loop through each unique diff in the file
    for diff in stats_csv_df['diff'].unique():
        # Keep the rows of the current diff, replacing the 'diff' column
        # with a leading 'Name' column that identifies the polygon
        diff_df = stats_csv_df[stats_csv_df['diff'] == diff].drop(columns='diff')
        diff_df.insert(0, 'Name', polygon_name)
        diff_frames.setdefault(diff, []).append(diff_df)

# Concatenate once per diff rather than growing a DataFrame inside the loop
diffs = {diff: pd.concat(frames, ignore_index=True) for diff, frames in diff_frames.items()}
for diff, diff_df in diffs.items():
    output_file_path = join(detailed_diff_stats_diff_dir, f'{diff}.csv')
    diff_df.to_csv(output_file_path, index=False)

# Sankey plots

In [None]:
# Directory for the exported sankey diagrams
sankey_dir = join(sample_polygons_statistics_dir, 'sankey_diagrams')
makedirs(sankey_dir, exist_ok=True)

# Load the CSV files
summary_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_stats.csv'))
summary_diff_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_diff_stats.csv'))

# Check that all rows in both .csv files have the same strings (polygon areas) in column A.
# Compare as lists: comparing two Series of different lengths raises a pandas
# ValueError before the assertion message below could ever be shown.
polygon_areas_stats = summary_stats.iloc[:, 0]
polygon_areas_diff_stats = summary_diff_stats.iloc[:, 0]

assert polygon_areas_stats.tolist() == polygon_areas_diff_stats.tolist(), "Polygon areas do not match between the two CSV files."

# Print the statistic columns of both files (first column skipped), five per
# line, with a blank gap between the two listings
for position, (label, frame) in enumerate((("summary_stats", summary_stats), ("summary_diff_stats", summary_diff_stats))):
    if position:
        print("\n")
    print(f"Columns in {label}:")
    for i, col in enumerate(frame.columns[1:], start=1):
        print(col, end=' ')
        if i % 5 == 0:
            print()

In [None]:
# Ribbon layout: give degradation and deforestation their own ribbons
separate_disturbance = True

# Ribbon layout: additionally split degradation into before / since a date
separate_degradation = True

# Ratio applied to the size of non-text elements (e.g. 0.5 or 2)
scaling_modifier = 1

# Ratio applied to the figure width only (e.g. 0.5 or 2)
width_modifier = 0.7

# Toggles for the title (polygon area) and the node labels
show_title = True
show_labels = True

# Font (size, weight) for the title, the density lines and the labels
# (weight of 800 ~ bold, 400 ~ normal)
title_font_size, title_font_weight = 20, 600
density_font_size, density_font_weight = 17, 600
label_font_size, label_font_weight = 17, 600

# Columns read from summary_stats
old_growth_agb_column = 'all_oldgrowth forest AGB (Tg)'
current_agb_column = '2022 forest AGB (Tg)'

# Year shown in labels: the text before the first space of current_agb_column
current_year = current_agb_column.partition(' ')[0]

# Columns read from summary_diff_stats
degradation_before_column = '2022_degradation_before_1990 forest AGB (Tg)'
degradation_since_column = '2022_degradation_since_1990 forest AGB (Tg)'
degradation_total_column = '2022_degradation_total forest AGB (Tg)'
deforestation_total_column = '2022_deforestation_total forest AGB (Tg)'
disturbance_total_column = '2022_degradation_deforestation_total forest AGB (Tg)'

# Label and colour of each node
remaining_name, remaining_colour = f'Remaining in {current_year}:', '#1a801a'  # Dark green
degradation_before_name, degradation_before_colour = 'Degradation before 1990', '#8dc00d'  # Light green
degradation_since_name, degradation_since_colour = 'Degradation since 1990', '#ffff00'  # Yellow
degradation_total_name, degradation_total_colour = 'Degradation', '#ffff00'  # Yellow
deforestation_total_name, deforestation_total_colour = 'Deforestation', '#ffffff'  # White
disturbance_total_name, disturbance_total_colour = 'Disturbance', '#ffff00'  # Yellow

# Whether the left-hand (old-growth) node carries a label
left_axis_label = True

# Degradation can only be split when disturbance is split as well
assert separate_disturbance or not separate_degradation, "separate_disturbance must be True if separate_degradation is True."

# Function to get values from statistics
def get_value(df, idx, column_name):
    """Return the cell at row label `idx`, column `column_name` as a float.

    A missing column, a missing row label or a null cell all yield 0.0.
    A missing column or row is reported with a printed message; the two
    cases are reported separately so the message names the actual problem.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in the dataframe.")
        return 0.0
    try:
        value = df.loc[idx, column_name]
    except KeyError:
        # The column exists, so the lookup failed on the row label
        print(f"Row '{idx}' not found in the dataframe.")
        return 0.0
    return 0.0 if pd.isnull(value) else float(value)

# Build one left-aligned header line (title or subtitle) as a figure annotation.
# Annotations allow a more customisable title with three lines.
def _header_annotation(text, y, size, weight):
    return dict(
        x=0,
        y=y,
        xref='paper',
        yref='paper',
        text=text,
        showarrow=False,
        xanchor='left',
        align='left',
        font=dict(family="helvetica, serif", size=size, color="black", weight=weight),
    )

# Loop through each row (polygon area)
for idx in summary_stats.index:

    # Get the polygon name (first column of the summary)
    polygon_name = summary_stats.iloc[idx, 0]

    # Get old-growth and current AGB values
    old_growth_agb = get_value(summary_stats, idx, old_growth_agb_column)
    current_agb = get_value(summary_stats, idx, current_agb_column)

    # Get values from summary_diff_stats
    degradation_before = get_value(summary_diff_stats, idx, degradation_before_column)
    degradation_since = get_value(summary_diff_stats, idx, degradation_since_column)
    degradation_total = get_value(summary_diff_stats, idx, degradation_total_column)
    deforestation_total = get_value(summary_diff_stats, idx, deforestation_total_column)
    disturbance_total = get_value(summary_diff_stats, idx, disturbance_total_column)

    # Load detailed stats to get mean AGBD and CI95 values. The scenario name
    # is the first word of the corresponding summary column name.
    detailed_stats_df = pd.read_csv(join(detailed_stats_dir, f"{polygon_name}.csv"))
    old_growth_index = detailed_stats_df.index[detailed_stats_df['scenario'] == f"{old_growth_agb_column.split(' ')[0]}"].item()
    old_growth_mean_agbd = get_value(detailed_stats_df, old_growth_index, "Forest AGBD mean (Mg / ha)")
    current_index = detailed_stats_df.index[detailed_stats_df['scenario'] == f"{current_agb_column.split(' ')[0]}"].item()
    current_mean_agbd = get_value(detailed_stats_df, current_index, "Forest AGBD mean (Mg / ha)")
    uncertainty = False # Uncertainty may not have been calculated
    if 'Forest AGB total CI95 (Tg)' in detailed_stats_df.columns:
      uncertainty = True
      old_growth_agb_ci95 = get_value(detailed_stats_df, old_growth_index, "Forest AGB total CI95 (Tg)")
      old_growth_mean_agbd_ci95 = get_value(detailed_stats_df, old_growth_index, "Forest AGBD CI95 (Mg / ha)")
      current_agb_ci95 = get_value(detailed_stats_df, current_index, "Forest AGB total CI95 (Tg)")
      current_mean_agbd_ci95 = get_value(detailed_stats_df, current_index, "Forest AGBD CI95 (Mg / ha)")

    # Title line 1 name
    title_name = f"{polygon_name}"

    # Subtitle line 1 name
    if uncertainty: subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} ± {old_growth_mean_agbd_ci95:.1f} Mg / ha"
    else: subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} Mg / ha"

    # Subtitle line 2 name
    if uncertainty: subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} ± {current_mean_agbd_ci95:.1f} Mg / ha"
    else: subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} Mg / ha"

    # Left axis name
    if left_axis_label:
      if uncertainty: left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} ± {old_growth_agb_ci95:.2f} Tg"
      else: left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} Tg"
    else: left_axis = ''

    # Update remaining_name with AGB
    if uncertainty: remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} ± {current_agb_ci95:.2f} Tg"
    else: remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} Tg"

    # Statistical assertions. Only the components that will be plotted are
    # checked; the overall balance is always checked.
    if separate_disturbance and separate_degradation:
        assert abs(degradation_before + degradation_since - degradation_total) < 1e-9, f"{polygon_name}: degradation_before_column + degradation_since_column != degradation_total_column"
    if separate_disturbance:
        assert abs(degradation_total + deforestation_total - disturbance_total) < 1e-9, f"{polygon_name}: degradation_total_column + deforestation_total_column != disturbance_total_column"
    assert abs(current_agb - disturbance_total - old_growth_agb) < 1e-9, f"{polygon_name}: current_agb_column - disturbance_total_column != old_growth_agb_column"

    # Loss ribbons to draw, as (node label, difference value, colour)
    if separate_disturbance and separate_degradation:
        loss_ribbons = [
            (degradation_before_name, degradation_before, degradation_before_colour),
            (degradation_since_name, degradation_since, degradation_since_colour),
            (deforestation_total_name, deforestation_total, deforestation_total_colour),
        ]
    elif separate_disturbance:
        loss_ribbons = [
            (degradation_total_name, degradation_total, degradation_total_colour),
            (deforestation_total_name, deforestation_total, deforestation_total_colour),
        ]
    else:
        loss_ribbons = [(disturbance_total_name, disturbance_total, disturbance_total_colour)]

    # 'Remaining' value (should be equal to current_agb)
    remaining_value = current_agb

    # Define nodes: node 0 is the old-growth source, followed by one node per
    # loss ribbon and finally the remaining AGB
    nodes = [left_axis] + [name for name, _, _ in loss_ribbons] + [remaining_name_agb]
    # Define links: every ribbon starts at node 0. Difference values are
    # negated before plotting.
    values = [-difference for _, difference, _ in loss_ribbons] + [remaining_value]
    colors = [colour for _, _, colour in loss_ribbons] + [remaining_colour]
    sources = [0] * len(values)
    targets = list(range(1, len(values) + 1))
    # Define node colors (the source node uses the 'remaining' colour)
    node_colors = [remaining_colour] + colors

    # Calculate percentages of old-growth AGB and update right node labels.
    # A zero old-growth AGB (e.g. an empty polygon or a missing column) would
    # divide by zero, so report 0% for every ribbon in that case.
    if old_growth_agb:
        percentages = [(abs(val) / old_growth_agb * 100) for val in values]
    else:
        percentages = [0.0] * len(values)
    nodes[1:] = [f"{label} ({percentage:.0f}%)" for label, percentage in zip(nodes[1:], percentages)]

    # If labels are toggled off, replace node labels with empty strings
    if not show_labels: nodes = [''] * len(nodes)

    # Header lines: the two density subtitles, preceded by the polygon title
    # unless show_title is switched off
    annotations = [
        _header_annotation(subtitle_1_name, 1.19, density_font_size, density_font_weight),
        _header_annotation(subtitle_2_name, 1.11, density_font_size, density_font_weight),
    ]
    if show_title:
        annotations.insert(0, _header_annotation(title_name, 1.28, title_font_size, title_font_weight))

    # Create the Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        arrangement="freeform",
        node=dict(
            label=nodes,
            color=node_colors,  # Set node colors
            pad=15 * scaling_modifier,
            thickness=20 * scaling_modifier,
            line=dict(color="black", width=1 * scaling_modifier)
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=colors,
            line=dict(color="black", width=1 * scaling_modifier),  # Add border to ribbons
        )
    )])

    fig.update_layout(
        width=700 * scaling_modifier * width_modifier,
        height=500 * scaling_modifier,
        font=dict(
            family="helvetica, serif",
            size=label_font_size,
            color="black",
            weight=label_font_weight
        ),
        # Adjust the margins
        margin=dict(
            l=25 * scaling_modifier,
            r=25 * scaling_modifier,
            t=115 * scaling_modifier,  # Increased top margin to accommodate title
            b=25 * scaling_modifier
        ),
        annotations=annotations
    )

    # Save the figure
    output_path = os.path.join(sankey_dir, f'sankey_diagram_{polygon_name}.png')
    fig.write_image(output_path, scale=2)
    print(f"Statistical assertions and sankey diagram complete for {polygon_name}.")

    # Show the figure
    fig.show()

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# `runtime` is imported from google.colab at the top of the notebook.
# NOTE(review): unassign() presumably releases the Colab runtime once the
# cells above have finished - confirm against the google.colab documentation.
runtime.unassign()