<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/9_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Subdirectories

In [None]:
# Base directory for every input and output of this notebook.
#   '/content/drive/MyDrive/...' -> personal drive
#   '/gdrive/Shareddrives/...'   -> shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive at the mount point that matches base_dir
from google.colab import drive
import os
import sys

_uses_shared_drive = base_dir.startswith('/gdrive/Shareddrives/')
_uses_personal_drive = base_dir.startswith('/content/drive/MyDrive/')
if _uses_shared_drive:
    drive.mount('/gdrive', force_remount=True)
elif _uses_personal_drive:
    drive.mount('/content/drive', force_remount=True)
    # The base directory is only created automatically on a personal drive
    os.makedirs(base_dir, exist_ok=True)
else:
    print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Make modules stored in the base directory importable (added once only)
_path_to_add = os.path.realpath(base_dir)
if not any(entry == _path_to_add for entry in sys.path):
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Imports and upgrades
!pip install geopandas
!pip install kaleido==0.2.1

In [None]:
# Imports
from datetime import datetime
import geopandas as gpd
from google.colab import runtime
import json
import ipywidgets as widgets
import kaleido
import numpy as np
from os.path import exists, join
from os import makedirs
from osgeo import gdal, ogr
gdal.UseExceptions()
import pandas as pd
import plotly.graph_objects as go
import re
from shutil import copyfile

In [None]:
# Directories expected to exist already under base_dir
areas_dir = join(base_dir, "1_areas")
models_dir = join(base_dir, "5_models")
scenarios_dir = join(base_dir, "6_scenarios")
predictions_dir = join(base_dir, "7_predictions")
differences_dir = join(base_dir, "8_differences")

# Sub-directories of the areas directory
polygons_dir = join(areas_dir, "polygons")
masks_dir = join(areas_dir, "masks")

# Statistics directories written by this notebook (created when absent)
statistics_dir = join(base_dir, "9_statistics")
sample_polygons_dir = join(statistics_dir, "sample_polygons")
for _new_dir in (statistics_dir, sample_polygons_dir):
    makedirs(_new_dir, exist_ok=True)

# Cell area raster for accurate pixel-by-pixel area calculations
cell_area_path = join(areas_dir, "cell_area.tif")

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -11111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    """Write an array to a single-band GeoTIFF laid out like the template raster.

    Args:
        input_array: array written to band 1 of the output.
        output_tif: path of the GeoTIFF to create.
        template: raster whose band-1 size, geotransform and projection are
            copied to the output.
        nodatavalue: nodata value recorded on the output band.
        compress: when truthy, create the file with ZSTD level 1 compression
            (good speed / size ratio); otherwise no creation options are set.
        dtype: GDAL data type of the output. It is overridden with
            gdal.GDT_Int16 whenever input_array has dtype int16.
    """
    template_ds = gdal.Open(template)
    template_band = template_ds.GetRasterBand(1)
    geotransform = template_ds.GetGeoTransform()
    projection = template_ds.GetProjection()

    creation_options = ['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'] if compress else []

    # int16 input always produces an Int16 raster, whatever dtype was passed
    if input_array.dtype == 'int16':
        dtype = gdal.GDT_Int16

    output_ds = gdal.GetDriverByName("GTiff").Create(
        output_tif, template_band.XSize, template_band.YSize, 1, dtype, options=creation_options)
    output_ds.GetRasterBand(1).WriteArray(input_array)
    output_ds.GetRasterBand(1).SetNoDataValue(nodatavalue)
    output_ds.SetGeoTransform(geotransform)
    output_ds.SetProjection(projection)

    # Drop the dataset references (the usual GDAL way of closing them)
    template_ds = output_ds = None

# Select model

In [None]:
# Choose whether predictions are sourced from predictions_dir or scenarios_dir
source_dir = predictions_dir
# source_dir = scenarios_dir

# Text after the last underscore of the path, e.g. 'predictions_dir'
source_dir_name = f"{source_dir.rsplit('_', 1)[-1]}_dir"

# Print a ready-to-paste 'selected_model' line for each model sub-directory
_model_subdirs = [entry for entry in os.listdir(source_dir) if 'scenario_masks' not in entry]
for subdir in _model_subdirs:
    print(f"selected_model = '{subdir}'")

In [None]:
selected_model = 'agbd_v2_1_260206_133525'

# Define prediction, disturbance, intactness and restoration directories
selected_model_prediction_dir = join(source_dir, selected_model)
prediction_raster_dir = join(selected_model_prediction_dir, 'scenario_predictions')
model_differences_dir = join(differences_dir, f"{selected_model}_{source_dir_name}")
disturbance_dir = join(model_differences_dir, 'disturbance')
intactness_dir = join(model_differences_dir, 'intactness')
restoration_dir = join(model_differences_dir, 'restoration')

# Report, for each input directory, either that it is missing or how many
# entries it contains. Each directory is labelled with its own name, so a
# missing restoration directory is no longer reported as "Intactness".
for _dir_label, _dir_path in (
    ("Prediction", prediction_raster_dir),
    ("Disturbance", disturbance_dir),
    ("Intactness", intactness_dir),
    ("Restoration", restoration_dir),
):
    if exists(_dir_path):
        print(f"There are {len(os.listdir(_dir_path))} rasters in {_dir_path}")
    else:
        print(f"{_dir_label} directory doesn't exist yet: {_dir_path}")
        print("Try changing source directory, or re-run previous notebooks")

# Define model stats directory
model_statistics_dir = join(statistics_dir, f"{selected_model}_{source_dir_name}")
makedirs(model_statistics_dir, exist_ok=True)

# Select sample polygons

In [None]:
# Select sample area polygons. This should be a single .gpkg with the field 'name' differentiating polygons.
# Only .gpkg files are offered (other files in the directory are ignored), and
# they are sorted so the printed order is stable between runs.
sample_polygons = sorted(
    entry for entry in os.listdir(sample_polygons_dir)
    if entry.lower().endswith('.gpkg')
)
if not sample_polygons:
    print(f"No sample areas found. Upload .gpkg polygons to {sample_polygons_dir}")
else:
    # Print a ready-to-paste 'selected_sample_polygons' line per geopackage
    for sample_polygon in sample_polygons:
        print(f"selected_sample_polygons = '{sample_polygon}'")

In [None]:
selected_sample_polygons = 'tekai_sample_polygons.gpkg'

# Read the selected geopackage. Note that 'selected_sample_polygons_dir' is the
# path of the geopackage file itself, despite the '_dir' suffix.
selected_sample_polygons_dir = join(sample_polygons_dir, selected_sample_polygons)
selected_sample_polygons_gpkg = gpd.read_file(selected_sample_polygons_dir)

# Output directory named after the geopackage without its extension
# (splitext rather than a fixed-length slice, so any extension length works)
sample_polygons_statistics_dir = join(model_statistics_dir, os.path.splitext(selected_sample_polygons)[0])
makedirs(sample_polygons_statistics_dir, exist_ok=True)

# Define and create statistic .csv directories
polygon_indices_dir = join(sample_polygons_statistics_dir, 'polygon_indices')
land_and_forest_cover_by_area_dir = join(sample_polygons_statistics_dir, 'land_and_forest_cover_by_area')
land_and_forest_cover_by_scenario_dir = join(sample_polygons_statistics_dir, 'land_and_forest_cover_by_scenario')
scenario_stats_by_area_dir = join(sample_polygons_statistics_dir, 'scenario_stats_by_area')
scenario_stats_by_scenario_dir = join(sample_polygons_statistics_dir, 'scenario_stats_by_scenario')
disturbance_stats_by_area_dir = join(sample_polygons_statistics_dir, 'disturbance_stats_by_area')
disturbance_stats_by_disturbance_dir = join(sample_polygons_statistics_dir, 'disturbance_stats_by_disturbance')
restoration_stats_by_area_dir = join(sample_polygons_statistics_dir, 'restoration_stats_by_area')
restoration_stats_by_restoration_dir = join(sample_polygons_statistics_dir, 'restoration_stats_by_restoration')
intactness_stats_dir = join(sample_polygons_statistics_dir, 'intactness')
report_statistics_dir = join(sample_polygons_statistics_dir, 'report_statistics')
disturbance_trends_dir = join(sample_polygons_statistics_dir, 'plots_disturbance_trends')
sankey_dir = join(sample_polygons_statistics_dir, 'plots_sankey')

for _output_dir in (
    polygon_indices_dir,
    land_and_forest_cover_by_area_dir,
    land_and_forest_cover_by_scenario_dir,
    scenario_stats_by_area_dir,
    scenario_stats_by_scenario_dir,
    disturbance_stats_by_area_dir,
    disturbance_stats_by_disturbance_dir,
    restoration_stats_by_area_dir,
    restoration_stats_by_restoration_dir,
    intactness_stats_dir,
    report_statistics_dir,
    disturbance_trends_dir,
    sankey_dir,
):
    makedirs(_output_dir, exist_ok=True)

In [None]:
# Rasterise each sample polygon, save row/col indices and cell areas.
# Pixel centre intersection (all_touched=False) avoids double-counting.

# Structure of .npz files:
#   rows, cols: 1D int arrays of pixel indices within polygon
#   cell_area_ha: 1D float array of pixel areas in hectares
# Later statistics sections load these arrays, extract raster values at [rows, cols],
# filter nodata, then compute area-weighted statistics using cell_area_ha.

# Sort polygons alphabetically instead of geopackage order
sort_polygons_alphabetically = False

polygon_indices_dir = join(sample_polygons_statistics_dir, 'polygon_indices')
makedirs(polygon_indices_dir, exist_ok=True)

polygon_names = selected_sample_polygons_gpkg['name'].tolist()
if sort_polygons_alphabetically:
    polygon_names = sorted(polygon_names)

# Zero-padded prefix width based on polygon count
n_polygons = len(polygon_names)
prefix_width = len(str(n_polygons))

# Load cell area array; dividing by 10000 yields hectares
# (presumably the raster stores square metres - confirm against its source)
cell_area_ds = gdal.Open(cell_area_path)
cell_area_array = cell_area_ds.ReadAsArray().astype('float64') / 10000
cell_area_ds = None
zeros_array = np.zeros(cell_area_array.shape, dtype='int16')

# Temporary all-zero raster that the polygons are burned into
temp_raster_path = join(polygon_indices_dir, 'temp_burn.tif')
export_array_as_tif(zeros_array, temp_raster_path)

progress_total = n_polygons
progress_index = 0
progress_label = widgets.Label(f"Polygon progress: {progress_index}/{progress_total}")
display(progress_label)

temp_ds = gdal.Open(temp_raster_path, gdal.GA_Update)
try:
    band = temp_ds.GetRasterBand(1)

    for i, polygon_name in enumerate(polygon_names):
        prefix = str(i).zfill(prefix_width)
        npz_path = join(polygon_indices_dir, f'{prefix}_{polygon_name}.npz')

        # Polygons that already have an index file are not rasterised again
        if not exists(npz_path):
            # Burn polygon with unique value so earlier burns are never matched
            burn_value = i + 1
            vector = ogr.Open(selected_sample_polygons_dir)
            layer = vector.GetLayer()
            # Double any single quote so names such as "St John's" stay valid
            # inside the quoted SQL string literal of the attribute filter
            escaped_name = str(polygon_name).replace("'", "''")
            layer.SetAttributeFilter(f"name = '{escaped_name}'")
            gdal.RasterizeLayer(temp_ds, [1], layer, burn_values=[burn_value], options=[])
            temp_ds.FlushCache()
            # Release the layer before the dataset that owns it
            layer = None
            vector = None

            # Extract indices and cell areas
            rows, cols = np.where(band.ReadAsArray() == burn_value)
            cell_area_ha = cell_area_array[rows, cols]
            np.savez(npz_path, rows=rows, cols=cols, cell_area_ha=cell_area_ha)

        progress_index += 1
        progress_label.value = f"Polygon progress: {progress_index}/{progress_total}"
finally:
    # Always close and delete the temporary raster, even if a polygon failed
    band = None
    temp_ds = None
    if exists(temp_raster_path):
        os.remove(temp_raster_path)

print(f"Polygon index arrays saved to {polygon_indices_dir}")

# Land area and forest cover

In [None]:
# Collect the scenario name of every forest mask ('mask_forest_<scenario>.tif');
# these are matched with land masks in the statistics cell.
forest_masks = sorted(
    mask_file.replace('mask_forest_', '').replace('.tif', '')
    for mask_file in os.listdir(masks_dir)
    if mask_file.startswith('mask_forest_') and mask_file.endswith('.tif')
)

# Print a ready-to-paste 'selected_forest_scenarios' list
print('selected_forest_scenarios = [')
for scenario in forest_masks:
    print(f'  "{scenario}",')
print(']')

In [None]:
# Forest scenarios to include in the land and forest cover statistics.
# Each entry names a 'mask_forest_<scenario>.tif' in masks_dir; the first four
# characters are used as the year of the matching land mask. Paste the list
# printed by the previous cell and comment out any entries to exclude.
selected_forest_scenarios = [
  "1990",
  "1991",
  "1992",
  "1993",
  "1994",
  "1995",
  "1996",
  "1997",
  "1998",
  "1999",
  "2000",
  "2001",
  "2002",
  "2003",
  "2004",
  "2005",
  "2006",
  "2007",
  "2008",
  "2009",
  "2010",
  "2011",
  "2012",
  "2013",
  "2014",
  "2015",
  "2016",
  "2017",
  "2018",
  "2019",
  "2020",
  "2021",
  "2022",
  "2023",
  "2024",
  "2024_oldgrowth_recovery",
  "2024_road_mat_daling_deforestation",
  "2024_undisturbed_since_oldgrowth",
]

In [None]:
# Forest cover statistics

# Pair each forest scenario with the land mask of its year
# (the year is the first four characters of the scenario name)
forest_to_land_mask = {}
missing_land_masks = []

for scenario in selected_forest_scenarios:
    land_mask_name = f"mask_land_{scenario[:4]}.tif"
    land_mask_path = join(masks_dir, land_mask_name)
    if not exists(land_mask_path):
        missing_land_masks.append((scenario, land_mask_name))
        continue
    forest_to_land_mask[scenario] = land_mask_path

# Stop here when any required land mask is absent
if missing_land_masks:
    print("Missing land masks. Return to 3a_features_scenario.ipynb to create:")
    for scenario, mask_name in missing_land_masks:
        print(f"  {mask_name} (required for {scenario})")
    raise FileNotFoundError("Land masks missing")

# Load the pixel indices and cell areas saved for each polygon
polygon_data = {}
for npz_file in sorted(os.listdir(polygon_indices_dir)):
    if not npz_file.endswith('.npz'):
        continue
    # File names are '<prefix>_<polygon name>.npz': drop prefix and extension
    name = npz_file.split('_', 1)[1][:-4]
    data = np.load(join(polygon_indices_dir, npz_file))
    polygon_data[name] = {key: data[key] for key in ('rows', 'cols', 'cell_area_ha')}

# Polygons in sorted file order, i.e. ordered by their numeric prefix
polygon_names = list(polygon_data)
n_polygons = len(polygon_names)
n_scenarios = len(selected_forest_scenarios)

# Result arrays: one row per scenario, one column per polygon (areas in ha)
land_area_data = np.zeros((n_scenarios, n_polygons))
forest_cover_data = np.zeros((n_scenarios, n_polygons))

# Polygon area in km² (cell areas are in hectares; 100 ha = 1 km²)
polygon_area_data = np.zeros(n_polygons)
for poly_idx, polygon_name in enumerate(polygon_names):
    cell_area_ha = polygon_data[polygon_name]['cell_area_ha']
    polygon_area_data[poly_idx] = cell_area_ha.sum(dtype='float64') / 100

# Progress tracking
progress_total = n_scenarios
progress_index = 0
progress_label = widgets.Label(f"Scenario progress: {progress_index}/{progress_total}")
display(progress_label)

for scenario_idx, scenario in enumerate(selected_forest_scenarios):
    # Read the land mask matched to this scenario
    land_mask_path = forest_to_land_mask[scenario]
    land_ds = gdal.Open(land_mask_path)
    land_array = land_ds.ReadAsArray()
    land_ds = None

    # Read the forest mask of this scenario
    forest_mask_path = join(masks_dir, f"mask_forest_{scenario}.tif")
    forest_ds = gdal.Open(forest_mask_path)
    forest_array = forest_ds.ReadAsArray()
    forest_ds = None

    for poly_idx, polygon_name in enumerate(polygon_names):
        polygon = polygon_data[polygon_name]
        rows, cols = polygon['rows'], polygon['cols']
        cell_area_ha = polygon['cell_area_ha']

        # Cells of the polygon whose mask value is 1
        land_mask = land_array[rows, cols] == 1
        forest_mask = forest_array[rows, cols] == 1

        # Summed cell area of those cells
        land_area_data[scenario_idx, poly_idx] = cell_area_ha[land_mask].sum(dtype='float64')
        forest_cover_data[scenario_idx, poly_idx] = cell_area_ha[forest_mask].sum(dtype='float64')

    progress_index += 1
    progress_label.value = f"Scenario progress: {progress_index}/{progress_total}"

# Calculate percentages; entries whose denominator is not positive stay at 0
pct_area_land = np.zeros((n_scenarios, n_polygons))
pct_area_forest = np.zeros((n_scenarios, n_polygons))
pct_land_forest = np.zeros((n_scenarios, n_polygons))

# Polygon areas converted from km² back to hectares (one value per polygon,
# broadcast across the scenario rows)
_polygon_area_ha = polygon_area_data * 100
_has_area = _polygon_area_ha > 0
_has_land = land_area_data > 0

# Divide only where the denominator is positive, then scale to percent
np.divide(land_area_data, _polygon_area_ha, out=pct_area_land, where=_has_area)
np.divide(forest_cover_data, _polygon_area_ha, out=pct_area_forest, where=_has_area)
np.divide(forest_cover_data, land_area_data, out=pct_land_forest, where=_has_land)
pct_area_land *= 100
pct_area_forest *= 100
pct_land_forest *= 100

# Per-scenario, per-polygon statistics in output column order
_cover_columns = (
    ('Land area (ha)', land_area_data),
    ('Forest cover (ha)', forest_cover_data),
    ('% of area that is land', pct_area_land),
    ('% of area that is forest', pct_area_forest),
    ('% of land that is forest', pct_land_forest),
)

# Detailed stats by area: one CSV per polygon, one row per scenario
for poly_idx, polygon_name in enumerate(polygon_names):
    df = pd.DataFrame(index=selected_forest_scenarios)
    df.index.name = 'scenario'
    df['Area (km^2)'] = polygon_area_data[poly_idx]
    for _column, _values in _cover_columns:
        df[_column] = _values[:, poly_idx]
    df.to_csv(join(land_and_forest_cover_by_area_dir, f'{polygon_name}.csv'))

# Detailed stats by scenario: one CSV per scenario, one row per polygon
for scenario_idx, scenario in enumerate(selected_forest_scenarios):
    df = pd.DataFrame(index=polygon_names)
    df.index.name = 'Name'
    df['Area (km^2)'] = polygon_area_data
    for _column, _values in _cover_columns:
        df[_column] = _values[scenario_idx, :]
    df.to_csv(join(land_and_forest_cover_by_scenario_dir, f'{scenario}.csv'))

# Summary stats: one row per polygon, land and forest columns per scenario
summary_land_and_forest_cover = pd.DataFrame(index=polygon_names)
summary_land_and_forest_cover.index.name = 'Name'
summary_land_and_forest_cover['Area (km^2)'] = polygon_area_data

for scenario_idx, scenario in enumerate(selected_forest_scenarios):
    summary_land_and_forest_cover[f'{scenario} land area (ha)'] = land_area_data[scenario_idx, :]
    summary_land_and_forest_cover[f'{scenario} forest cover (ha)'] = forest_cover_data[scenario_idx, :]

summary_csv_path = join(sample_polygons_statistics_dir, 'summary_land_and_forest_cover_stats.csv')
summary_land_and_forest_cover.to_csv(summary_csv_path)

print("Forest cover statistics completed.")

# AGB statistics

In [None]:
def _list_raster_items(raster_dir):
    """Return the sorted, unique item names of the .tif files in raster_dir.

    When sourcing from predictions_dir only files containing 'mean__' count and
    the name is the second '__'-separated field; otherwise every .tif counts
    and the name is the first field. Text after the first '.' of the field is
    dropped. A missing directory gives an empty list.
    """
    if not exists(raster_dir):
        return []
    item_names = set()
    for f in os.listdir(raster_dir):
        if not f.endswith('.tif'):
            continue
        if source_dir == predictions_dir:
            if 'mean__' in f:
                item_names.add(f.split('__')[1].split('.')[0])
        else:
            item_names.add(f.split('__')[0].split('.')[0])
    return sorted(item_names)


# List available scenario, disturbance and restoration rasters
scenarios = _list_raster_items(prediction_raster_dir)
disturbances = _list_raster_items(disturbance_dir)
restorations = _list_raster_items(restoration_dir)

# Print a ready-to-paste selection list for each category
for _heading, _variable_name, _items, _closing in (
    ('# Scenarios', 'selected_scenarios', scenarios, ']\n'),
    ('# Disturbance', 'selected_disturbance', disturbances, ']\n'),
    ('# Restoration', 'selected_restoration', restorations, ']'),
):
    print(_heading)
    print(f'{_variable_name} = [')
    for _item in _items:
        print(f'  "{_item}",')
    print(_closing)

In [None]:
# Selections for the AGB statistics cell. Paste the lists printed by the
# previous cell and comment out any entries to exclude. Each name is combined
# with selected_model to build the raster file name that is read.

# Scenarios (rasters read from prediction_raster_dir)
selected_scenarios = [
  # "2023",
  "2024",
  "2024_oldgrowth_recovery",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
  "2024_undisturbed_since_1996",
  "2024_undisturbed_since_1997",
  "2024_undisturbed_since_1998",
  "2024_undisturbed_since_1999",
  "2024_undisturbed_since_2000",
  "2024_undisturbed_since_2001",
  "2024_undisturbed_since_2002",
  "2024_undisturbed_since_2003",
  "2024_undisturbed_since_2004",
  "2024_undisturbed_since_2005",
  "2024_undisturbed_since_2006",
  "2024_undisturbed_since_2007",
  "2024_undisturbed_since_2008",
  "2024_undisturbed_since_2009",
  "2024_undisturbed_since_2010",
  "2024_undisturbed_since_2011",
  "2024_undisturbed_since_2012",
  "2024_undisturbed_since_2013",
  "2024_undisturbed_since_2014",
  "2024_undisturbed_since_2015",
  "2024_undisturbed_since_2016",
  "2024_undisturbed_since_2017",
  "2024_undisturbed_since_2018",
  "2024_undisturbed_since_2019",
  "2024_undisturbed_since_2020",
  "2024_undisturbed_since_2021",
  "2024_undisturbed_since_2022",
  "2024_undisturbed_since_2023",
  "2024_undisturbed_since_2024",
  "2024_undisturbed_since_oldgrowth",
]

# Disturbance (rasters read from disturbance_dir)
selected_disturbance = [
  "2024_deforestation_of_road_mat_daling_2023_deforestation",
  "2024_deforestation_of_road_mat_daling_2023_degradation",
  "2024_deforestation_since_oldgrowth",
  "2024_degradation_since_1996",
  "2024_degradation_since_oldgrowth",
  "2024_effect_of_degradation_in_1996",
  "2024_effect_of_degradation_in_1997",
  "2024_effect_of_degradation_in_1998",
  "2024_effect_of_degradation_in_1999",
  "2024_effect_of_degradation_in_2000",
  "2024_effect_of_degradation_in_2001",
  "2024_effect_of_degradation_in_2002",
  "2024_effect_of_degradation_in_2003",
  "2024_effect_of_degradation_in_2004",
  "2024_effect_of_degradation_in_2005",
  "2024_effect_of_degradation_in_2006",
  "2024_effect_of_degradation_in_2007",
  "2024_effect_of_degradation_in_2008",
  "2024_effect_of_degradation_in_2009",
  "2024_effect_of_degradation_in_2010",
  "2024_effect_of_degradation_in_2011",
  "2024_effect_of_degradation_in_2012",
  "2024_effect_of_degradation_in_2013",
  "2024_effect_of_degradation_in_2014",
  "2024_effect_of_degradation_in_2015",
  "2024_effect_of_degradation_in_2016",
  "2024_effect_of_degradation_in_2017",
  "2024_effect_of_degradation_in_2018",
  "2024_effect_of_degradation_in_2019",
  "2024_effect_of_degradation_in_2020",
  "2024_effect_of_degradation_in_2021",
  "2024_effect_of_degradation_in_2022",
  "2024_effect_of_degradation_in_2023",
  "2024_effect_of_degradation_in_2024",
]

# Restoration (rasters read from restoration_dir)
selected_restoration = [
  "2024_recovery_potential_with_edge_effects",
  "2024_reforestation_potential",
  "2024_restoration_potential",
]

In [None]:
# AGB statistics

# CI95 aggregation assumes perfect positive spatial correlation between
# prediction errors: CI_polygon = sum(CI_pixel × area_pixel). This is the
# maximum possible aggregate uncertainty and almost certainly an overestimate.
# The opposite assumption, complete independence, uses root-sum-of-squares:
# CI_polygon = sqrt(sum((CI_pixel × area_pixel)^2)), giving narrower intervals.
# The true CI lies between these bounds. Spatial clustering visible in the
# percentage uncertainty rasters supports correlated errors, making sum-of-CI a
# defensible conservative upper bound.

# Uncertainty stats only available from predictions_dir
generate_uncertainty_stats = (source_dir == predictions_dir)

# Load the pixel indices and cell areas saved for each polygon
polygon_data = {}
for npz_file in sorted(os.listdir(polygon_indices_dir)):
    if not npz_file.endswith('.npz'):
        continue
    # File names are '<prefix>_<polygon name>.npz': drop prefix and extension
    name = npz_file.split('_', 1)[1][:-4]
    data = np.load(join(polygon_indices_dir, npz_file))
    polygon_data[name] = {key: data[key] for key in ('rows', 'cols', 'cell_area_ha')}

# Polygons in sorted file order, i.e. ordered by their numeric prefix
polygon_names = list(polygon_data)
n_polygons = len(polygon_names)

# Category configurations: the three categories share one layout, so each is
# described by its name, selection list and directories. The summary file name
# and the index name are both derived from the category name.
categories = [
    {
        'name': _category_name,
        'selected': _selected,
        'raster_dir': _raster_dir,
        'by_area_dir': _by_area_dir,
        'by_item_dir': _by_item_dir,
        'summary_filename': f'summary_{_category_name}_stats.csv',
        'index_name': _category_name,
    }
    for _category_name, _selected, _raster_dir, _by_area_dir, _by_item_dir in (
        ('scenario', selected_scenarios, prediction_raster_dir,
         scenario_stats_by_area_dir, scenario_stats_by_scenario_dir),
        ('disturbance', selected_disturbance, disturbance_dir,
         disturbance_stats_by_area_dir, disturbance_stats_by_disturbance_dir),
        ('restoration', selected_restoration, restoration_dir,
         restoration_stats_by_area_dir, restoration_stats_by_restoration_dir),
    )
]

# For each category (scenario / disturbance / restoration): read every selected
# raster, compute area-weighted AGBD statistics per polygon, and write a summary
# CSV plus one detailed CSV per polygon and per item.
for category in categories:
    selected_items = category['selected']
    # Categories with nothing selected are skipped entirely
    if not selected_items: continue

    n_items = len(selected_items)
    category_name = category['name']
    raster_dir = category['raster_dir']
    by_area_dir = category['by_area_dir']
    by_item_dir = category['by_item_dir']
    summary_filename = category['summary_filename']
    index_name = category['index_name']

    # Build raster paths
    agbd_paths = {}
    ci95_paths = {}

    if source_dir == predictions_dir:
        # Prediction outputs are 'mean__<item>__<model>.tif' with an optional
        # 'ci95__<item>__<model>.tif' alongside
        for item_name in selected_items:
            agbd_paths[item_name] = join(raster_dir, f'mean__{item_name}__{selected_model}.tif')
            ci95_path = join(raster_dir, f'ci95__{item_name}__{selected_model}.tif')
            if exists(ci95_path): ci95_paths[item_name] = ci95_path
    else:
        # Other sources are '<item>__<model>.tif' and no CI95 raster is looked up
        for item_name in selected_items:
            agbd_paths[item_name] = join(raster_dir, f'{item_name}__{selected_model}.tif')

    # Pre-allocate arrays (rows: items, columns: polygons)
    agbd_mean_data = np.zeros((n_items, n_polygons), dtype='float64')
    agbd_stdev_data = np.zeros((n_items, n_polygons), dtype='float64')
    agb_total_data = np.zeros((n_items, n_polygons), dtype='float64')

    if generate_uncertainty_stats:
        agbd_mean_ci95_data = np.zeros((n_items, n_polygons), dtype='float64')
        agbd_uncertainty_data = np.zeros((n_items, n_polygons), dtype='float64')
        agb_total_ci95_data = np.zeros((n_items, n_polygons), dtype='float64')

    # Progress tracking
    progress_total = n_items
    progress_index = 0
    progress_label = widgets.Label(f"{category_name.capitalize()} raster progress: {progress_index}/{progress_total}")
    display(progress_label)

    for item_idx, item_name in enumerate(selected_items):
        # Load AGBD raster
        agbd_ds = gdal.Open(agbd_paths[item_name])
        agbd_array = agbd_ds.ReadAsArray().astype('float64')
        nodata = agbd_ds.GetRasterBand(1).GetNoDataValue()
        agbd_ds = None

        # Load CI95 raster if available. When it is missing for an item, that
        # item's uncertainty values stay at the pre-allocated 0.
        ci95_array = None
        if generate_uncertainty_stats and item_name in ci95_paths:
            ci95_ds = gdal.Open(ci95_paths[item_name])
            ci95_array = ci95_ds.ReadAsArray().astype('float64')
            ci95_ds = None

        for poly_idx, polygon_name in enumerate(polygon_names):
            rows = polygon_data[polygon_name]['rows']
            cols = polygon_data[polygon_name]['cols']
            cell_area_ha = polygon_data[polygon_name]['cell_area_ha']

            # Extract values and filter nodata
            # NOTE(review): only values equal to the band's nodata are removed;
            # NaN cells (if the rasters can contain any) would pass through.
            agbd_values = agbd_array[rows, cols]
            valid_mask = (agbd_values != nodata)
            agbd_valid = agbd_values[valid_mask]
            areas_valid = cell_area_ha[valid_mask]

            # Total area and AGB (density × cell area, summed over valid cells)
            total_area_ha = areas_valid.sum(dtype='float64')
            agb_total_mg = (agbd_valid * areas_valid).sum(dtype='float64')

            if total_area_ha > 0:
                # Area-weighted mean: sum(AGBD * area) / sum(area)
                mean_agbd = agb_total_mg / total_area_ha

                # Area-weighted stdev: sqrt(sum(area * (AGBD - mean)^2) / sum(area))
                variance = (areas_valid * (agbd_valid - mean_agbd)**2).sum(dtype='float64') / total_area_ha
                stdev_agbd = np.sqrt(variance)

                # Mg -> Tg
                agb_total_tg = agb_total_mg / 1e6
            else:
                # No valid cells in this polygon: report zeros
                mean_agbd = 0.0
                stdev_agbd = 0.0
                agb_total_tg = 0.0

            agbd_mean_data[item_idx, poly_idx] = mean_agbd
            agbd_stdev_data[item_idx, poly_idx] = stdev_agbd
            agb_total_data[item_idx, poly_idx] = agb_total_tg

            if ci95_array is not None:
                # CI95 is aggregated as sum(CI_pixel × area_pixel) over the
                # cells that are valid in the AGBD raster
                ci95_values = ci95_array[rows, cols]
                ci95_valid = ci95_values[valid_mask]
                ci95_total_mg = (ci95_valid * areas_valid).sum(dtype='float64')

                # NOTE(review): a zero AGB total also zeroes the CI95 mean and
                # total, although only the percentage divides by it - confirm
                # this is intended.
                if total_area_ha > 0 and abs(agb_total_mg) > 0:
                    ci95_mean_agbd = ci95_total_mg / total_area_ha
                    uncertainty_pct = ci95_total_mg / abs(agb_total_mg) * 100
                    ci95_total_tg = ci95_total_mg / 1e6
                else:
                    ci95_mean_agbd = 0.0
                    uncertainty_pct = 0.0
                    ci95_total_tg = 0.0

                agbd_mean_ci95_data[item_idx, poly_idx] = ci95_mean_agbd
                agbd_uncertainty_data[item_idx, poly_idx] = uncertainty_pct
                agb_total_ci95_data[item_idx, poly_idx] = ci95_total_tg

        progress_index += 1
        progress_label.value = f"{category_name.capitalize()} raster progress: {progress_index}/{progress_total}"

    # Create DataFrames (index: items, columns: polygons)
    df_agbd_mean = pd.DataFrame(agbd_mean_data, index=selected_items, columns=polygon_names)
    df_agbd_mean.rename_axis(index_name, inplace=True)

    df_agbd_stdev = pd.DataFrame(agbd_stdev_data, index=selected_items, columns=polygon_names)
    df_agbd_stdev.rename_axis(index_name, inplace=True)

    df_agb_total = pd.DataFrame(agb_total_data, index=selected_items, columns=polygon_names)
    df_agb_total.rename_axis(index_name, inplace=True)

    if generate_uncertainty_stats:
        df_agbd_ci95 = pd.DataFrame(agbd_mean_ci95_data, index=selected_items, columns=polygon_names)
        df_agbd_ci95.rename_axis(index_name, inplace=True)

        df_agbd_uncertainty = pd.DataFrame(agbd_uncertainty_data, index=selected_items, columns=polygon_names)
        df_agbd_uncertainty.rename_axis(index_name, inplace=True)

        df_agb_ci95 = pd.DataFrame(agb_total_ci95_data, index=selected_items, columns=polygon_names)
        df_agb_ci95.rename_axis(index_name, inplace=True)

    # Stats list for CSV output: (DataFrame, column title) in output column order
    if generate_uncertainty_stats:
        df_stats_list = [
            (df_agbd_mean, "AGBD mean (Mg / ha)"),
            (df_agbd_ci95, "AGBD CI95 (Mg / ha)"),
            (df_agbd_uncertainty, "AGBD uncertainty (%)"),
            (df_agbd_stdev, "AGBD stdev (Mg / ha)"),
            (df_agb_total, "AGB total (Tg)"),
            (df_agb_ci95, "AGB total CI95 (Tg)")
        ]
    else:
        df_stats_list = [
            (df_agbd_mean, "AGBD mean (Mg / ha)"),
            (df_agbd_stdev, "AGBD stdev (Mg / ha)"),
            (df_agb_total, "AGB total (Tg)")
        ]

    # Summary stats: transposed so polygons are rows and items are columns
    # NOTE(review): rename_axis(..., axis=1) names the column axis, so the
    # polygon index is left unnamed in the CSV header; axis=0 may have been
    # intended - check what later cells expect before changing it.
    df_agb_total_t = df_agb_total.T.rename_axis("Name", axis=1).add_suffix(" AGB (Tg)")
    summary_components = [df_agb_total_t]

    if generate_uncertainty_stats:
        df_agb_ci95_t = df_agb_ci95.T.rename_axis("Name", axis=1).add_suffix(" AGB CI95 (Tg)")
        summary_components.append(df_agb_ci95_t)

    summary_stats = pd.concat(summary_components, axis=1).rename_axis("Name", axis=1)
    summary_stats.to_csv(join(sample_polygons_statistics_dir, summary_filename))

    # Detailed stats by area: one CSV per polygon, one row per item
    for polygon_name in polygon_names:
        df_by_area = pd.DataFrame(index=selected_items)
        df_by_area.rename_axis(index_name, inplace=True)
        for df_stats, col_name in df_stats_list:
            df_by_area[col_name] = df_stats[polygon_name]
        df_by_area.to_csv(join(by_area_dir, f'{polygon_name}.csv'))

    # Detailed stats by item: one CSV per item, one row per polygon
    for item_name in selected_items:
        df_by_item = pd.DataFrame(index=polygon_names)
        df_by_item.index.name = 'Name'
        for df_stats, col_name in df_stats_list:
            df_by_item[col_name] = df_stats.loc[item_name]
        df_by_item.to_csv(join(by_item_dir, f'{item_name}.csv'))

    print(f"{category_name.capitalize()} AGB statistics completed.")

# Intactness statistics

In [None]:
# Create list of available intactness rasters
# Paths are stored relative to intactness_dir. The list is sorted because
# os.walk yields directories and files in arbitrary order, which would
# otherwise make the printed selection list differ between runs.
intactness_rasters = sorted(
    os.path.relpath(join(root, file), intactness_dir)
    for root, _dirs, files in os.walk(intactness_dir)
    for file in files
    if "intactness__" in file and file.endswith('.tif')
)

# Select intactness rasters to calculate statistics
# The output is formatted as a Python list so it can be pasted into the next cell.
print("# Select intactness raster to calculate statistics")
print("intactness_rasters = [")
for raster in intactness_rasters:
    print(f"'{raster}',")
print("]")

In [None]:
# Select intactness raster to calculate statistics
# Each entry is a path relative to intactness_dir (the statistics cell joins the
# two). The first path component is the subdirectory that is also used to locate
# the matching percentage loss and CI95 halfwidth rasters.
intactness_rasters = [
'2024_undisturbed_since_1996/prediction_area/intactness__prediction_area_10_quantiles__2024_undisturbed_since_1996__agbd_v2_1_260206_133525.tif',
'2024_undisturbed_since_1996/forest_reserves/intactness__forest_reserves_10_quantiles__2024_undisturbed_since_1996__agbd_v2_1_260206_133525.tif',
'2024_undisturbed_since_oldgrowth/prediction_area/intactness__prediction_area_10_quantiles__2024_undisturbed_since_oldgrowth__agbd_v2_1_260206_133525.tif',
'2024_undisturbed_since_oldgrowth/forest_reserves/intactness__forest_reserves_10_quantiles__2024_undisturbed_since_oldgrowth__agbd_v2_1_260206_133525.tif',
]

In [None]:
# Intactness statistics
# Calculates area-weighted forest intactness and percentage change statistics per polygon.
# Forest intactness scores are aggregated using area-weighted means and standard deviations,
# but not CI95, as they represent ordinal categories derived from percentage loss quantiles.
# Two versions are calculated per polygon:
#   - 'extant': only forest pixel scores (1-10) contribute
#   - 'original': deforested pixels also contribute, assigned score 0 for intactness
#     or -100% for percentage change
# This differentiates degraded forest without deforestation (lower extant, higher original)
# from undegraded forest alongside deforestation (higher extant, lower original).
# Standard deviation of forest intactness indicates heterogeneity in degradation for
# extant scores and in disturbance for original scores.

# Regarding percentage loss CI95, used for polygon-level uncertainty aggregation:

# As mentioned in the AGB total raster section, the optimal approach would be to
# aggregate all Monte Carlo iterations per polygon. For each iteration, calculate the
# area-weighted mean percentage loss within the polygon, then derive CI95 from
# the distribution of polygon means. This preserves the spatial correlation structure
# of prediction errors. However, it requires loading all iterations for every intactness
# baseline for every sample polygon, which is extremely computationally expensive for
# large extents and many polygons. In other words, it scales poorly.

# Instead, the approach taken calculates the area-weighted mean of pixel-level CI95
# values. So ci95_polygon = sum(weight * ci95_pixel) / sum(weight). This assumes perfect
# spatial correlation between pixel errors, i.e. if one pixel overestimates, all
# pixels overestimate by the same relative magnitude. It yields the maximum possible
# aggregate CI, providing a conservative upper bound on uncertainty. In reality, true
# CI is between this value and the root-sum-of-squares approach:
# ci95_polygon = sqrt(sum(weight^2 * ci95_pixel^2)) / sum(weight), which assumes complete
# independence. Examination of the percentage uncertainty rasters does show spatial
# clustering, suggesting the conservative approach is better justified.

# Deforested pixels (percentage loss = -100%) have CI = 0 because complete forest
# removal is an observed state from Landsat-derived classification, not a model
# prediction. These pixels contribute to the mean percentage loss but not to aggregate
# uncertainty. Original CI is therefore smaller than extant CI when
# deforestation is present, as the same uncertainty is spread over a larger area.

# Match intactness rasters to percentage change and CI rasters
# Both lookups are keyed by the full path of the intactness raster.
intactness_percentage_paths = {}
intactness_percentage_ci95_paths = {}
for intactness_raster in intactness_rasters:
    intactness_raster_path = join(intactness_dir, intactness_raster)

    # Companion rasters are looked up in the first-level subdirectory of the
    # intactness raster, and that subdirectory name is part of their filenames
    subdir = intactness_raster.split('/')[0]
    companion_dir = join(intactness_dir, subdir)

    # Percentage change raster
    intactness_percentage_paths[intactness_raster_path] = join(
        companion_dir, f"percentage_loss__{subdir}__{selected_model}.tif")

    # CI halfwidth raster
    intactness_percentage_ci95_paths[intactness_raster_path] = join(
        companion_dir, f"ci95_halfwidth__percentage_loss__{subdir}__{selected_model}.tif")

# Load polygon data from .npz files
# polygon_data maps polygon name -> pixel row indices, pixel column indices and
# per-pixel cell areas (ha), as stored in each archive.
polygon_data = {}
for npz_file in sorted(os.listdir(polygon_indices_dir)):
    if not npz_file.endswith('.npz'):
        continue
    # Polygon name is the filename with the prefix before the first underscore
    # and the '.npz' extension removed
    name = npz_file.split('_', 1)[1][:-4]
    # np.load keeps the archive open until it is closed, so use a context
    # manager and read the arrays into memory before leaving it
    with np.load(join(polygon_indices_dir, npz_file)) as data:
        polygon_data[name] = {
            'rows': data['rows'],
            'cols': data['cols'],
            'cell_area_ha': data['cell_area_ha']
        }

polygon_names = list(polygon_data.keys())
n_polygons = len(polygon_names)

# Progress tracking
n_intactness_rasters = len(intactness_rasters)
progress_total = n_intactness_rasters
progress_index = 0
progress_label = widgets.Label(f"Intactness raster progress: {progress_index}/{progress_total}")
display(progress_label)


def _area_weighted_mean_std(values, weights, extra_area=0.0, extra_value=0.0):
    """Return the area-weighted mean and (population) stdev of values.

    values and weights are equal-length arrays of pixel values and pixel areas.
    extra_area / extra_value add one constant-valued contribution without
    materialising its pixels; this is used for deforested area, which has a
    fixed value (-100% change or intactness score 0).
    """
    total_area = weights.sum(dtype='float64') + extra_area
    weighted_sum = np.sum(values * weights, dtype='float64') + extra_area * extra_value
    mean = weighted_sum / total_area
    squared_dev = np.sum(weights * np.square(values - mean), dtype='float64')
    squared_dev = squared_dev + extra_area * np.square(extra_value - mean)
    return mean, np.sqrt(squared_dev / total_area)


# Outer loop: one CSV output per intactness raster
for intactness_raster in intactness_rasters:
    intactness_raster_path = join(intactness_dir, intactness_raster)
    percentage_raster_path = intactness_percentage_paths[intactness_raster_path]
    ci95_raster_path = intactness_percentage_ci95_paths[intactness_raster_path]
    ci95_available = exists(ci95_raster_path)

    # Extract metadata from filename by splitting on '__'
    # NOTE(review): the rasters selected above are named
    # intactness__{quantiles}__{baseline}__{model}.tif, so parts[3] holds the
    # model name including '.tif' and the CSV written below ends in '.tif.csv'.
    # The report cells list and read the CSVs under that same name, so it is
    # kept unchanged here - confirm before renaming.
    filename = intactness_raster.split('/')[-1]
    parts = filename.split('__')
    polygon_quantiles = parts[1]
    baseline_name = parts[2]
    disturbance = parts[3]

    # Total score derived from quantiles (e.g. 'forest_reserves_10_quantiles' -> 10)
    total_score = int(polygon_quantiles.split('_')[-2])
    total_stdev = int(total_score / 2)

    # Output CSV path
    intactness_csv_name = f"{polygon_quantiles}__{baseline_name}__{disturbance}.csv"
    intactness_csv_path = join(intactness_stats_dir, intactness_csv_name)

    # Load rasters
    intactness_ds = gdal.Open(intactness_raster_path)
    intactness_array = intactness_ds.ReadAsArray().astype('float64')
    intactness_nodata = intactness_ds.GetRasterBand(1).GetNoDataValue()
    intactness_ds = None
    # A nodata value of None or NaN cannot be matched with '!=', so the
    # comparison is only applied when the band declares a numeric nodata value
    compare_nodata = intactness_nodata is not None and not np.isnan(intactness_nodata)

    percent_ds = gdal.Open(percentage_raster_path)
    percent_array = percent_ds.ReadAsArray().astype('float64')
    percent_ds = None

    # Percentage loss CI95 has additional NaN values where intactness score = 0
    # (deforested pixels), as percentage loss CI95 was not calculated for these pixels.
    ci95_array = None
    if ci95_available:
        ci95_ds = gdal.Open(ci95_raster_path)
        ci95_array = ci95_ds.ReadAsArray().astype('float64')
        ci95_ds = None

    # Output columns (score area columns are appended dynamically below)
    columns = [
        "Name",
        "Percentage change (extant) mean",
        "Percentage change (extant) stdev",
        "Percentage change (extant) CI95",
        "Percentage change (original) mean",
        "Percentage change (original) stdev",
        "Percentage change (original) CI95",
        f"Intactness (extant) mean / {total_score}",
        f"Intactness (extant) stdev / {total_stdev}",
        f"Intactness (original) mean / {total_score}",
        f"Intactness (original) stdev / {total_stdev}"
    ]
    stat_columns = columns[1:]

    # Rows are collected in a list and converted to a dataframe once, which
    # avoids repeatedly copying the dataframe with pd.concat inside the loop
    stats_rows = []

    # Inner loop: one row per polygon
    for polygon_name in polygon_names:
        rows = polygon_data[polygon_name]['rows']
        cols = polygon_data[polygon_name]['cols']
        cell_area_ha = polygon_data[polygon_name]['cell_area_ha']

        # Extract intactness values; non-finite and nodata pixels are invalid
        intactness_values = intactness_array[rows, cols]
        valid_mask = np.isfinite(intactness_values)
        if compare_nodata:
            valid_mask &= (intactness_values != intactness_nodata)

        # Polygon without valid intactness data: write an empty row and move on
        if not np.any(valid_mask):
            stats_rows.append({'Name': polygon_name, **dict.fromkeys(stat_columns)})
            continue

        # Extant mask: valid pixels with intactness > 0 (extant forest)
        extant_mask = valid_mask & (intactness_values > 0)

        # Deforested mask: valid pixels with intactness == 0 (lost since baseline)
        deforested_mask = valid_mask & (intactness_values == 0)

        # Calculate areas in hectares
        extant_area_ha = cell_area_ha[extant_mask].sum(dtype='float64')
        deforested_area_ha = cell_area_ha[deforested_mask].sum(dtype='float64')
        original_area_ha = extant_area_ha + deforested_area_ha

        # Extract percentage change values
        percent_values = percent_array[rows, cols]

        # Extract CI95 values if available, filter NaN
        ci95_values = None
        ci95_valid_mask = None
        if ci95_available:
            ci95_values = ci95_array[rows, cols]
            ci95_valid_mask = extant_mask & ~np.isnan(ci95_values)

        if extant_area_ha > 0:
            # Extant: only pixels with intactness > 0
            extant_weights = cell_area_ha[extant_mask]
            extant_percent_values = percent_values[extant_mask]
            extant_intact_vals = intactness_values[extant_mask]

            percent_extant_mean, percent_extant_std = _area_weighted_mean_std(
                extant_percent_values, extant_weights)
            intactness_extant_mean, intactness_extant_std = _area_weighted_mean_std(
                extant_intact_vals, extant_weights)

            # Extant CI: area-weighted mean over extant pixels with a valid CI95
            if ci95_available and np.any(ci95_valid_mask):
                ci95_extant_weights = cell_area_ha[ci95_valid_mask]
                ci95_weighted_sum = np.sum(
                    ci95_extant_weights * ci95_values[ci95_valid_mask], dtype='float64')
                percent_extant_ci = ci95_weighted_sum / ci95_extant_weights.sum(dtype='float64')
            else:
                ci95_weighted_sum = None
                percent_extant_ci = None

            if deforested_area_ha > 0:
                # Original: deforested area contributes -100% change and score 0
                percent_original_mean, percent_original_std = _area_weighted_mean_std(
                    extant_percent_values, extant_weights, deforested_area_ha, -100.0)
                intactness_original_mean, intactness_original_std = _area_weighted_mean_std(
                    extant_intact_vals, extant_weights, deforested_area_ha, 0.0)

                # Original CI: deforested pixels have CI = 0 because their -100%
                # change is observed rather than predicted, so only extant pixels
                # contribute to the numerator while the whole original area divides it
                if ci95_weighted_sum is None:
                    percent_original_ci = None
                else:
                    percent_original_ci = ci95_weighted_sum / original_area_ha
            else:
                # No deforestation: original stats equal extant stats
                percent_original_mean = percent_extant_mean
                percent_original_std = percent_extant_std
                percent_original_ci = percent_extant_ci
                intactness_original_mean = intactness_extant_mean
                intactness_original_std = intactness_extant_std
        else:
            # No extant forest: all deforested
            percent_extant_mean = percent_extant_std = percent_extant_ci = None
            percent_original_mean = -100.0
            percent_original_std = 0.0
            percent_original_ci = 0.0
            intactness_extant_mean = intactness_extant_std = None
            intactness_original_mean = 0.0
            intactness_original_std = 0.0

        # Build output row (values listed in the same order as `columns`)
        new_row_dict = dict(zip(columns, [
            polygon_name,
            percent_extant_mean,
            percent_extant_std,
            percent_extant_ci,
            percent_original_mean,
            percent_original_std,
            percent_original_ci,
            intactness_extant_mean,
            intactness_extant_std,
            intactness_original_mean,
            intactness_original_std,
        ]))

        # Append area per intactness score (includes score 0) as dynamic columns
        valid_values = intactness_values[valid_mask]
        valid_areas = cell_area_ha[valid_mask]
        for score in np.unique(valid_values):
            score_area = valid_areas[valid_values == score].sum(dtype='float64')
            new_row_dict[f'Intactness score {int(score)} area (ha)'] = score_area

        stats_rows.append(new_row_dict)

    # Assemble the dataframe: fixed columns first, then score area columns in
    # order of first appearance. Reindexing also guarantees the fixed columns
    # exist when there are no polygons.
    df_intactness_stats = pd.DataFrame(stats_rows, dtype=object)
    score_columns = [col for col in df_intactness_stats.columns if col not in columns]
    df_intactness_stats = df_intactness_stats.reindex(columns=columns + score_columns)

    # Save CSV for this intactness raster
    df_intactness_stats = df_intactness_stats.set_index('Name')
    df_intactness_stats.to_csv(intactness_csv_path)

    progress_index += 1
    progress_label.value = f"Intactness raster progress: {progress_index}/{progress_total}"

print("Intactness statistics completed.")

# Report statistics

In [None]:
# Provides customisable summary of all prior statistics for reporting
# This cell loads the summary CSVs and prints every available option as Python
# list assignments that can be pasted into the next cell and edited.

# Load summary stats
summary_scenario_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_scenario_stats.csv'), index_col=0)
summary_disturbance_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_disturbance_stats.csv'), index_col=0)
summary_restoration_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_restoration_stats.csv'), index_col=0)
summary_land_forest_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_land_and_forest_cover_stats.csv'), index_col=0)


def _print_selection_list(list_name, items):
    """Print items as a copy-pasteable Python list assignment."""
    print(f"{list_name} = [")
    for item in items:
        print(f"  '{item}',")
    print("]")


def _csv_names(stats_dir):
    """Return the sorted names of the .csv files in stats_dir without '.csv'."""
    return [csv_file[:-4] for csv_file in sorted(os.listdir(stats_dir)) if csv_file.endswith('.csv')]


# Print available forest cover scenarios
forest_cover_cols = [col for col in summary_land_forest_df.columns if col.endswith('forest cover (ha)')]
forest_cover_scenarios = [col.replace(' forest cover (ha)', '') for col in forest_cover_cols]
_print_selection_list("forest_cover_list", forest_cover_scenarios)
print()

# Print available AGB scenarios
_print_selection_list("scenario_list", _csv_names(scenario_stats_by_scenario_dir))
print()

# Print available disturbances
_print_selection_list("disturbance_list", _csv_names(disturbance_stats_by_disturbance_dir))
print()

# Print available restorations
# A blank line now follows this list too, matching the other lists
_print_selection_list("restoration_list", _csv_names(restoration_stats_by_restoration_dir))
print()

# Print available intactness stats
_print_selection_list("intactness_list", _csv_names(intactness_stats_dir))

In [None]:
# Selections for the report statistics cell below.
# Entries are copied (and optionally reordered or removed) from the lists
# printed by the previous cell.

# Matched against '<entry> forest cover (ha)' columns of the land and forest cover summary
forest_cover_list = [
  '2024_undisturbed_since_1996',
  '2024_undisturbed_since_oldgrowth',
  '2024',
]

# Matched against '<entry> AGB (Tg)' summary columns and read from '<entry>.csv' per-scenario files
scenario_list = [
  '2024_undisturbed_since_1996',
  '2024_undisturbed_since_oldgrowth',
  '2024',
]

# Matched against '<entry> AGB (Tg)' summary columns and read from '<entry>.csv' per-disturbance files
disturbance_list = [
  '2024_deforestation_since_oldgrowth',
  '2024_degradation_since_oldgrowth',
  '2024_degradation_since_1996',
]

# Matched against '<entry> AGB (Tg)' summary columns and read from '<entry>.csv' per-restoration files
restoration_list = [
  '2024_recovery_potential_with_edge_effects',
  '2024_reforestation_potential',
  '2024_restoration_potential',
]

# Read from '<entry>.csv' in the intactness statistics directory. The '.tif'
# in each entry is part of the CSV file name itself, not a raster path.
intactness_list = [
  'forest_reserves_10_quantiles__2024_undisturbed_since_1996__agbd_v2_1_260206_133525.tif',
  'forest_reserves_10_quantiles__2024_undisturbed_since_oldgrowth__agbd_v2_1_260206_133525.tif',
]

In [None]:
# Generate report CSVs from selected statistics
# Every report has one row per polygon (index 'Name') and one column per
# selected item. Mean columns for all items come first, followed by CI95
# columns for all items when CI95 reporting is enabled.

polygon_names = summary_scenario_df.index.tolist()

# CI95 columns are only added when source_dir is the predictions directory
include_ci95 = source_dir == predictions_dir


def _new_report(index):
    """Return an empty report dataframe indexed by polygon name."""
    report = pd.DataFrame(index=index)
    report.index.name = 'Name'
    return report


def _summary_report(summary_df, items, suffixes, index):
    """Copy '<item> <suffix>' columns from a summary dataframe into a report.

    Columns are added suffix by suffix, and within a suffix in the order of
    items. Columns missing from summary_df are skipped.
    """
    report = _new_report(index)
    for suffix in suffixes:
        for item in items:
            col = f'{item} {suffix}'
            if col in summary_df.columns:
                report[col] = summary_df[col]
    return report


def _by_item_report(stats_dir, items, column_groups, index):
    """Build a report from per-item CSVs ('<item>.csv' in stats_dir).

    column_groups is a list of groups; each group is a list of
    (source column, report label) pairs. Groups are processed in order, and
    within a group every item contributes its columns as '<item> <label>'.
    Each CSV is read once. A missing source column raises KeyError.
    """
    item_stats = {item: pd.read_csv(join(stats_dir, f'{item}.csv'), index_col=0) for item in items}
    report = _new_report(index)
    for group in column_groups:
        for item in items:
            for source_col, label in group:
                report[f'{item} {label}'] = item_stats[item][source_col]
    return report


# Column selections shared by the scenario, disturbance and restoration reports
total_agb_suffixes = ['AGB (Tg)']
agbd_groups = [[('AGBD mean (Mg / ha)', 'AGBD (Mg/ha)')]]
intactness_groups = [[
    ('Percentage change (extant) mean', '% change (extant)'),
    ('Percentage change (original) mean', '% change (original)'),
]]
if include_ci95:
    total_agb_suffixes.append('AGB CI95 (Tg)')
    agbd_groups.append([('AGBD CI95 (Mg / ha)', 'AGBD CI95 (Mg/ha)')])
    intactness_groups.append([
        ('Percentage change (extant) CI95', '% change (extant) CI95'),
        ('Percentage change (original) CI95', '% change (original) CI95'),
    ])

# Forest cover
forest_cover = _new_report(polygon_names)
forest_cover['Area (km^2)'] = summary_land_forest_df['Area (km^2)']
for scenario in forest_cover_list:
    col = f'{scenario} forest cover (ha)'
    if col in summary_land_forest_df.columns:
        forest_cover[col] = summary_land_forest_df[col]
forest_cover.to_csv(join(report_statistics_dir, 'forest_cover.csv'))

# Scenarios - total AGB
scenarios_total_agb = _summary_report(summary_scenario_df, scenario_list, total_agb_suffixes, polygon_names)
scenarios_total_agb.to_csv(join(report_statistics_dir, 'scenarios_total_agb.csv'))

# Scenarios - AGBD
scenarios_agbd = _by_item_report(scenario_stats_by_scenario_dir, scenario_list, agbd_groups, polygon_names)
scenarios_agbd.to_csv(join(report_statistics_dir, 'scenarios_agbd.csv'))

# Disturbance - total AGB
disturbance_total_agb = _summary_report(summary_disturbance_df, disturbance_list, total_agb_suffixes, polygon_names)
disturbance_total_agb.to_csv(join(report_statistics_dir, 'disturbance_total_agb.csv'))

# Disturbance - AGBD
disturbance_agbd = _by_item_report(disturbance_stats_by_disturbance_dir, disturbance_list, agbd_groups, polygon_names)
disturbance_agbd.to_csv(join(report_statistics_dir, 'disturbance_agbd.csv'))

# Restoration - total AGB
restoration_total_agb = _summary_report(summary_restoration_df, restoration_list, total_agb_suffixes, polygon_names)
restoration_total_agb.to_csv(join(report_statistics_dir, 'restoration_total_agb.csv'))

# Restoration - AGBD
restoration_agbd = _by_item_report(restoration_stats_by_restoration_dir, restoration_list, agbd_groups, polygon_names)
restoration_agbd.to_csv(join(report_statistics_dir, 'restoration_agbd.csv'))

# Intactness - percentage change
intactness_percentage = _by_item_report(intactness_stats_dir, intactness_list, intactness_groups, polygon_names)
intactness_percentage.to_csv(join(report_statistics_dir, 'intactness_percentage.csv'))

print("Report statistics completed.")

# Sankey plots

In [None]:
# Define and create directories
sankey_labelled = join(sankey_dir, 'sankey_labelled')
sankey_unlabelled = join(sankey_dir, 'sankey_unlabelled')
sankey_labelled_svg = join(sankey_dir, 'sankey_labelled_svg')
sankey_unlabelled_svg = join(sankey_dir, 'sankey_unlabelled_svg')

# The loop variable is not called 'dir' so the builtin dir() is not
# overwritten for the rest of the notebook session
for sankey_output_dir in [sankey_labelled, sankey_unlabelled, sankey_labelled_svg, sankey_unlabelled_svg]:
    makedirs(sankey_output_dir, exist_ok=True)

# Load the CSV files
summary_scenario_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_scenario_stats.csv'))
summary_disturbance_stats = pd.read_csv(join(sample_polygons_statistics_dir,'summary_disturbance_stats.csv'))

# Check that all rows in both .csv files have the same strings (polygon areas) in column A
polygon_areas_stats = summary_scenario_stats.iloc[:, 0]
polygon_areas_disturbance_stats = summary_disturbance_stats.iloc[:, 0]

# Compared as lists so that CSVs with different row counts are reported as a
# mismatch instead of failing inside the pandas element-wise comparison
if polygon_areas_stats.tolist() != polygon_areas_disturbance_stats.tolist():
    raise ValueError("Polygon areas do not match between the two CSV files.")

# Print columns relevant for sankey diagram configuration

# Filter for AGB columns only (exclude forest cover and CI95 for initial selection)
summary_agb_cols = [col for col in summary_scenario_stats.columns[1:] if 'AGB (Tg)' in col and 'CI95' not in col]
disturbance_agb_cols = [col for col in summary_disturbance_stats.columns[1:] if 'AGB (Tg)' in col and 'CI95' not in col]

print("summary_scenario_stats.csv AGB columns")
print("(for old_growth_agb_column and current_agb_column)\n")

# Group by category
# Current year columns start with '20' and have a space as their fifth character
current_year_cols = [col for col in summary_agb_cols if col.startswith('20') and col[4:].startswith(' ')]
oldgrowth_cols = [col for col in summary_agb_cols if 'oldgrowth' in col and not col.endswith('_1 AGB (Tg)') and not col.endswith('_2 AGB (Tg)')]
undisturbed_cols = [col for col in summary_agb_cols if 'undisturbed' in col]
# Collected for completeness; not printed in this cell
no_degradation_cols = [col for col in summary_agb_cols if 'no_degradation' in col]

print("Current year scenarios:")
for col in current_year_cols:
    print(f"  {col}")

print("\nOld-growth scenarios:")
for col in oldgrowth_cols:
    print(f"  {col}")

print("\nUndisturbed scenarios:")
for col in undisturbed_cols:
    print(f"  {col}")

print("\n")
print("summary_disturbance_stats.csv AGB columns")
print("(for degradation/deforestation since/total columns)\n")

# Group disturbance columns
degradation_cols = [col for col in disturbance_agb_cols if 'degradation_since' in col and 'effect' not in col]
deforestation_cols = [col for col in disturbance_agb_cols if 'deforestation_since' in col]

print("Degradation columns:")
for col in degradation_cols:
    print(f"   {col}")

print("\nDeforestation columns:")
for col in deforestation_cols:
    print(f"   {col}")

In [None]:
# Sankey plot configuration: which flows are drawn separately, image size,
# fonts, the statistics columns that feed the diagram, and node labels/colours.

# Plot degradation and deforestation separately
separate_disturbance = True
# Plot degradation before and since a date separately
# (requires separate_disturbance, see the validation at the end of this cell)
separate_degradation = True

# DPI (default is 96, output image will scale accordingly)
dpi = 300
# Relative width modifier (ratio, e.g. 0.5 or 2)
width_modifier = 0.85

# Title (polygon area), density and label variables (weight of 800 ~ bold, 400 ~ normal)
show_title = True
show_density = True
show_labels = True
left_axis_label = True
svg_transparent_background = True
title_font_size = 20
title_font_weight = 600
density_font_size = 17
density_font_weight = 600
label_font_size = 17
label_font_weight = 600

# Base columns and year (summary_scenario_stats)
old_growth_agb_column = '2024_undisturbed_since_oldgrowth AGB (Tg)'
current_agb_column = '2024 AGB (Tg)'
# The year is the text before the first space of the current AGB column name
current_year = current_agb_column.split(' ')[0]

# Disturbance columns (summary_disturbance_stats)
degradation_since_column = '2024_degradation_since_1996 AGB (Tg)'
degradation_total_column = '2024_degradation_since_oldgrowth AGB (Tg)'
deforestation_total_column = '2024_deforestation_since_oldgrowth AGB (Tg)'

# Node labels and colours
# NOTE: the 'before 1996' / 'since 1996' labels are fixed text and are not
# derived from degradation_since_column; update them together.
remaining_name = f'Remaining in {current_year}:'
remaining_colour = '#007fff'
degradation_before_name = 'Degradation loss before 1996'
degradation_before_colour = '#8dc00d'
degradation_since_name = 'Degradation loss since 1996'
degradation_since_colour = '#ffff00'
degradation_total_name = 'Degradation'
degradation_total_colour = '#ffff00'
deforestation_total_name = 'Deforestation loss'
deforestation_total_colour = '#ffffff'
disturbance_total_name = 'Disturbance'
disturbance_total_colour = '#ffffff'

# Validate separation settings
# Fails only when separate_degradation is True while separate_disturbance is False
assert not separate_degradation or separate_disturbance, "separate_disturbance must be True if separate_degradation is True."

# Function to get values from statistics
def get_value(df, idx, column_name):
    """Return the cell at (idx, column_name) of df as a float.

    Null cells yield 0.0. If the lookup raises KeyError, a message naming the
    column is printed and 0.0 is returned.
    """
    try:
        cell = df.loc[idx, column_name]
    except KeyError:
        print(f"Column '{column_name}' not found in the dataframe.")
        return 0.0
    if pd.isnull(cell):
        return 0.0
    return float(cell)

# Helper: build one sankey figure with the shared node / link / layout styling.
# Pass annotations=None to leave the layout without annotations (unlabelled version).
def _build_sankey_figure(labels, node_colors, sources, targets, values, colors, annotations=None):
    figure = go.Figure(data=[go.Sankey(
        arrangement="freeform",
        node=dict(label=labels, color=node_colors, pad=15, thickness=20, line=dict(color="black", width=1)),
        link=dict(source=sources, target=targets, value=values, color=colors, line=dict(color="black", width=1))
    )])
    layout = dict(
        width=700 * width_modifier, height=500,
        font=dict(family="arial, sans serif", size=label_font_size, color="black", weight=label_font_weight),
        margin=dict(l=25, r=25, t=115, b=25)
    )
    if annotations is not None:
        layout['annotations'] = annotations
    figure.update_layout(**layout)
    return figure

# Helper: write the PNG first, then (optionally) switch to a transparent
# background before writing the SVG. The figure keeps the transparent
# background afterwards.
def _save_sankey_figure(figure, png_dir, svg_dir, polygon_name):
    figure.write_image(join(png_dir, f'sankey_diagram_{polygon_name}.png'), scale=dpi / 96)
    if svg_transparent_background:
        figure.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
    figure.write_image(join(svg_dir, f'sankey_diagram_vector_{polygon_name}.svg'), scale=dpi / 96)

# Loop through each row (polygon area).
# The row label is paired with the first column (polygon name) positionally, so
# the name is correct whatever the index labels are; the label itself is what
# get_value() passes to .loc.
for idx, polygon_name in zip(summary_scenario_stats.index, summary_scenario_stats.iloc[:, 0]):

    # Get old-growth and current AGB values
    old_growth_agb = get_value(summary_scenario_stats, idx, old_growth_agb_column)
    current_agb = get_value(summary_scenario_stats, idx, current_agb_column)

    # Get disturbance values and calculate before values
    degradation_total = get_value(summary_disturbance_stats, idx, degradation_total_column)
    if separate_degradation:
        degradation_since = get_value(summary_disturbance_stats, idx, degradation_since_column)
        degradation_before = degradation_total - degradation_since
    deforestation_total = get_value(summary_disturbance_stats, idx, deforestation_total_column)
    disturbance_total = degradation_total + deforestation_total

    # Check for statistical / precision discrepancies of greater than 10 tonnes (~ a single tree)
    discrepancy = abs(current_agb - disturbance_total - old_growth_agb)
    if discrepancy >= 1e-5:
        print(f"{polygon_name}: current_agb - disturbance_total != old_growth_agb (discrepancy: {discrepancy:.5e})")

    # Load detailed stats for AGBD and CI95 values.
    # The scenario name is the first space-separated token of the column name;
    # .item() raises unless exactly one row matches.
    stats_df = pd.read_csv(join(scenario_stats_by_area_dir, f"{polygon_name}.csv"))
    old_growth_index = stats_df.index[stats_df['scenario'] == f"{old_growth_agb_column.split(' ')[0]}"].item()
    current_index = stats_df.index[stats_df['scenario'] == f"{current_agb_column.split(' ')[0]}"].item()

    old_growth_mean_agbd = get_value(stats_df, old_growth_index, "AGBD mean (Mg / ha)")
    current_mean_agbd = get_value(stats_df, current_index, "AGBD mean (Mg / ha)")

    # CI95 values are only read (and shown) when the stats file provides them
    uncertainty = 'AGB total CI95 (Tg)' in stats_df.columns
    if uncertainty:
        old_growth_agb_ci95 = get_value(stats_df, old_growth_index, "AGB total CI95 (Tg)")
        old_growth_mean_agbd_ci95 = get_value(stats_df, old_growth_index, "AGBD CI95 (Mg / ha)")
        current_agb_ci95 = get_value(stats_df, current_index, "AGB total CI95 (Tg)")
        current_mean_agbd_ci95 = get_value(stats_df, current_index, "AGBD CI95 (Mg / ha)")

    # Build title and subtitle text
    title_name = f"{polygon_name}"

    if uncertainty:
        subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} ± {old_growth_mean_agbd_ci95:.1f} Mg / ha"
        subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} ± {current_mean_agbd_ci95:.1f} Mg / ha"
        left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} ± {old_growth_agb_ci95:.2f} Tg" if left_axis_label else ''
        remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} ± {current_agb_ci95:.2f} Tg"
    else:
        subtitle_1_name = f"Predicted old-growth AGBD: {old_growth_mean_agbd:.0f} Mg / ha"
        subtitle_2_name = f"{current_year} AGBD: {current_mean_agbd:.0f} Mg / ha"
        left_axis = f"Predicted<br>old-growth AGB:<br>{old_growth_agb:.1f} Tg" if left_axis_label else ''
        remaining_name_agb = f"{remaining_name}<br>{current_agb:.1f} Tg"

    # Nodes and links: node 0 is the left (old-growth) node and every link
    # starts from it. Disturbance values are negated before being used as
    # link values.
    if separate_disturbance and separate_degradation:
        nodes = [left_axis, degradation_before_name, degradation_since_name, deforestation_total_name, remaining_name_agb]
        sources, targets = [0, 0, 0, 0], [1, 2, 3, 4]
        values = [-degradation_before, -degradation_since, -deforestation_total, current_agb]
        colors = [degradation_before_colour, degradation_since_colour, deforestation_total_colour, remaining_colour]

    elif separate_disturbance and not separate_degradation:
        nodes = [left_axis, degradation_total_name, deforestation_total_name, remaining_name_agb]
        sources, targets = [0, 0, 0], [1, 2, 3]
        values = [-degradation_total, -deforestation_total, current_agb]
        colors = [degradation_total_colour, deforestation_total_colour, remaining_colour]

    else:
        nodes = [left_axis, disturbance_total_name, remaining_name_agb]
        sources, targets = [0, 0], [1, 2]
        values = [-(degradation_total + deforestation_total), current_agb]
        colors = [disturbance_total_colour, remaining_colour]

    # The left node takes the 'remaining' colour; the others match their link
    node_colors = [remaining_colour] + colors

    # Add percentages (share of old-growth AGB) to the target node labels.
    # get_value() returns 0.0 for null / missing data, so guard the division.
    if old_growth_agb:
        for i, val in enumerate(values, start=1):
            nodes[i] += f" ({abs(val) / old_growth_agb * 100:.0f}%)"
    else:
        print(f"{polygon_name}: old-growth AGB is zero or missing, percentages omitted from labels.")

    # Configure title and density annotations
    title_and_density = [
        dict(x=0, y=1.28, xref='paper', yref='paper', text=title_name, showarrow=False, xanchor='left', align='left',
             font=dict(family="arial, sans serif", size=title_font_size, color="black", weight=title_font_weight)),
        dict(x=0, y=1.19, xref='paper', yref='paper', text=subtitle_1_name, showarrow=False, xanchor='left', align='left',
             font=dict(family="arial, sans serif", size=density_font_size, color="black", weight=density_font_weight)),
        dict(x=0, y=1.11, xref='paper', yref='paper', text=subtitle_2_name, showarrow=False, xanchor='left', align='left',
             font=dict(family="arial, sans serif", size=density_font_size, color="black", weight=density_font_weight))
    ]

    if show_title and not show_density:
        title_and_density = title_and_density[0:1]
    elif not show_title and show_density:
        title_and_density = title_and_density[1:3]
    elif not show_title and not show_density:
        title_and_density = []

    # Remove labels if toggled off
    if not show_labels:
        nodes = [''] * len(nodes)

    # Create sankey diagram and save labelled versions
    fig = _build_sankey_figure(nodes, node_colors, sources, targets, values, colors, annotations=title_and_density)
    _save_sankey_figure(fig, sankey_labelled, sankey_labelled_svg, polygon_name)

    # Create and save unlabelled versions (blank node labels, no annotations)
    fig_unlabelled = _build_sankey_figure([''] * len(nodes), node_colors, sources, targets, values, colors)
    _save_sankey_figure(fig_unlabelled, sankey_unlabelled, sankey_unlabelled_svg, polygon_name)

    # Display figure with white background
    fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
    fig.show()

# Disturbance trend plots

In [None]:
# Yearly forest cover loss and degradation effect trends are analysed
# relative to a target year and a baseline.

target_year = '2024'
start_year = '1996'
use_oldgrowth_baseline = True  # if False, uses start_year as baseline

# Print available polygons as a list literal, one quoted name per line
polygon_lines = [f"  '{row['name']}'," for _, row in selected_sample_polygons_gpkg.iterrows()]
print("\n".join(["selected_trend_polygons = [", *polygon_lines, "]"]))

In [None]:
# Polygons to process in the disturbance trend cells below: uncomment a name to
# include it. Every active name is validated against the index of the land and
# forest cover summary before any plotting starts.
selected_trend_polygons = [
  # 'All protected areas',
  # 'Taman Negara National Park',
  # 'Tengku Hassanal Wildlife Reserve',
  # 'All forest reserves',
  # 'Ais forest reserves',
  'Berkelah Jerantut forest reserve',
  # 'Berkelah Kuantan forest reserves',
  # 'Berkelah Temerloh forest reserve',
  # 'Remen Chereh forest reserves',
  # 'Tekai Tembeling forest reserves',
  # 'Tekam forest reserve',
  # 'Yong forest reserves',
  # 'Yong Lipis forest reserves',
  # 'ASARTR phase 1',
  'ASARTR phase 2',
  'Tekai Tembeling forest reserves new',
  # 'All forest reserves new',
]

In [None]:
# Validate year range
try:
    start_int, target_int = int(start_year), int(target_year)
except ValueError as err:
    raise ValueError("start_year and target_year must be numeric") from err
# An inverted range would give an empty year list and fail later with an
# unrelated IndexError, so reject it here.
if start_int > target_int:
    raise ValueError("start_year must not be later than target_year")
year_range = [str(y) for y in range(start_int, target_int + 1)]

# Load summary data (first CSV column, the polygon name, becomes the index)
# NOTE(review): this rebinds summary_scenario_stats with a name index, while the
# Sankey cell reads the polygon name from its first column - presumably that cell
# needs its own load re-run before being executed again; confirm.
summary_land_forest_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_land_and_forest_cover_stats.csv'), index_col=0)
summary_disturbance_df = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_disturbance_stats.csv'), index_col=0)
summary_scenario_stats = pd.read_csv(join(sample_polygons_statistics_dir, 'summary_scenario_stats.csv'), index_col=0)

# Validate target year data exists
target_forest_col = f'{target_year} forest cover (ha)'
if target_forest_col not in summary_land_forest_df.columns:
    raise FileNotFoundError(f"Target year forest cover not found: {target_forest_col}")

# Determine baseline names
if use_oldgrowth_baseline:
    forest_baseline_scenario = f'{target_year}_undisturbed_since_oldgrowth'
    degradation_baseline_name = f'{target_year}_degradation_since_oldgrowth'
    baseline_label = 'old-growth'
else:
    forest_baseline_scenario = f'{target_year}_undisturbed_since_{start_year}'
    degradation_baseline_name = f'{target_year}_degradation_since_{start_year}'
    baseline_label = start_year

# Check baseline availability
forest_baseline_col = f'{forest_baseline_scenario} forest cover (ha)'
forest_baseline_available = forest_baseline_col in summary_land_forest_df.columns
if not forest_baseline_available:
    raise FileNotFoundError(f"Forest baseline required: {forest_baseline_col}")

degradation_baseline_path = join(disturbance_stats_by_disturbance_dir, f'{degradation_baseline_name}.csv')
if not exists(degradation_baseline_path):
    raise FileNotFoundError(f"Degradation baseline required: {degradation_baseline_path}")

# Validate forest cover years (need year before start for first loss calculation)
prev_year_col = f'{start_int - 1} forest cover (ha)'
if prev_year_col not in summary_land_forest_df.columns:
    raise FileNotFoundError(f"Year before start required: {prev_year_col}")
missing_forest = [y for y in year_range if f'{y} forest cover (ha)' not in summary_land_forest_df.columns]
if missing_forest:
    raise FileNotFoundError(f"Missing forest cover years: {missing_forest}")

# Validate degradation effect years (one CSV per year; paths are built once
# and reused for loading below)
effect_paths = {
    year: join(disturbance_stats_by_disturbance_dir, f'{target_year}_effect_of_degradation_in_{year}.csv')
    for year in year_range
}
missing_degradation = [year for year, path in effect_paths.items() if not exists(path)]
if missing_degradation:
    print(f"Error: Missing degradation years: {missing_degradation}")
    earliest_available = next((year for year in year_range if year not in missing_degradation), None)
    if earliest_available:
        print(f"Earliest available: {earliest_available}")
    raise FileNotFoundError("Degradation data incomplete for year range")

# Validate polygons
available_polygons = summary_land_forest_df.index.tolist()
invalid_polygons = [p for p in selected_trend_polygons if p not in available_polygons]
if invalid_polygons:
    raise ValueError(f"Polygons not found: {invalid_polygons}")

# Load reference data
degradation_baseline_df = pd.read_csv(degradation_baseline_path, index_col=0)

# Load all degradation effect data
degradation_effect_data = {year: pd.read_csv(path, index_col=0) for year, path in effect_paths.items()}

# Check uncertainty availability. The trends cell reads the CI95 column from
# every year's table, so every year must provide it, not just the first.
generate_uncertainty_stats = (source_dir == predictions_dir)
if generate_uncertainty_stats:
    if not all('AGB total CI95 (Tg)' in effect_df.columns for effect_df in degradation_effect_data.values()):
        print("Warning: CI95 not found, disabling uncertainty")
        generate_uncertainty_stats = False

print(f"Validated: {target_year} target, {start_year}-{target_year} range, {baseline_label} baseline, {len(selected_trend_polygons)} polygons")

In [None]:
# Process each polygon: write per-polygon trend CSVs and, for each plot
# configuration, labelled / unlabelled PNG and SVG figures.
progress_total = len(selected_trend_polygons)
progress_index = 0
progress_label = widgets.Label(f"Polygon progress: {progress_index}/{progress_total}")
display(progress_label)


# Plot settings
dpi = 300
width = 1000
height = 600
svg_transparent_background = True
show_title = True  # if False, the title annotation is left out (subtitles are kept)
forest_cover_colour = '#8dc00d'
degradation_colour = '#ffff00'
degradation_ci95_colour = 'rgba(255, 255, 0, 0.3)'
title_size = 21
subtitle_size = 18
axis_title_size = 17
axis_value_size = 15
legend_size = 17

# Line labels
forest_line_label = 'Forest cover loss'
degrad_line_label = 'Degradation effect with CI95' if generate_uncertainty_stats else 'Degradation effect'

# Column names that are the same for every polygon
baseline_agb_col = f'{degradation_baseline_name} AGB (Tg)'
target_agb_col = f'{target_year} AGB (Tg)'
baseline_scenario_agb_col = f'{forest_baseline_scenario} AGB (Tg)'

# Helper: black helvetica font dict of the given size (plus any extra font properties)
def _helvetica(size, **extra):
    return dict(family="helvetica", size=size, color="black", **extra)

for polygon_name in selected_trend_polygons:
    polygon_trends_dir = join(disturbance_trends_dir, polygon_name)
    makedirs(polygon_trends_dir, exist_ok=True)

    # Subdirectories for labelled and unlabelled outputs
    trends_labelled = join(polygon_trends_dir, 'trends_labelled')
    trends_labelled_svg = join(polygon_trends_dir, 'trends_labelled_svg')
    trends_unlabelled = join(polygon_trends_dir, 'trends_unlabelled')
    trends_unlabelled_svg = join(polygon_trends_dir, 'trends_unlabelled_svg')
    for subdir in [trends_labelled, trends_labelled_svg, trends_unlabelled, trends_unlabelled_svg]:
        makedirs(subdir, exist_ok=True)

    # Reference values for this polygon
    baseline_forest = summary_land_forest_df.loc[polygon_name, forest_baseline_col]
    target_forest = summary_land_forest_df.loc[polygon_name, target_forest_col]
    total_forest_loss = target_forest - baseline_forest
    pct_forest_of_baseline = (target_forest / baseline_forest * 100) if baseline_forest != 0 else 0

    # Get baseline total AGB from summary disturbance stats (magnitude only)
    baseline_degradation_agb = abs(summary_disturbance_df.loc[polygon_name, baseline_agb_col])

    # Forest cover trends (negative = loss): year-on-year change, also expressed
    # as a percentage of the magnitude of the total baseline-to-target change
    forest_trends = []
    for year in year_range:
        prev_col, curr_col = f'{int(year) - 1} forest cover (ha)', f'{year} forest cover (ha)'
        abs_loss = summary_land_forest_df.loc[polygon_name, curr_col] - summary_land_forest_df.loc[polygon_name, prev_col]
        pct_of_total = (abs_loss / abs(total_forest_loss) * 100) if total_forest_loss != 0 else 0
        forest_trends.append({'year': int(year), 'absolute_loss_ha': abs_loss, 'pct_of_total_loss': pct_of_total})
    forest_trends_df = pd.DataFrame(forest_trends)
    forest_trends_df.to_csv(join(polygon_trends_dir, 'forest_cover_trends.csv'), index=False)

    # Degradation trends using total AGB
    degradation_trends = []
    for year in year_range:
        effect_df = degradation_effect_data[year]
        abs_effect = effect_df.loc[polygon_name, 'AGB total (Tg)']
        pct_of_total = (abs_effect / baseline_degradation_agb * 100) if baseline_degradation_agb != 0 else 0
        row = {'year': int(year), 'absolute_effect_tg': abs_effect, 'pct_of_total_degradation': pct_of_total}
        if generate_uncertainty_stats:
            ci95 = effect_df.loc[polygon_name, 'AGB total CI95 (Tg)']
            row['ci95_tg'] = ci95
            row['pct_ci95'] = (ci95 / baseline_degradation_agb * 100) if baseline_degradation_agb != 0 else 0
        degradation_trends.append(row)
    degradation_trends_df = pd.DataFrame(degradation_trends)
    degradation_trends_df.to_csv(join(polygon_trends_dir, 'degradation_trends.csv'), index=False)

    # Get target and baseline scenario AGB for subtitle (None when the column is absent)
    target_agb = summary_scenario_stats.loc[polygon_name, target_agb_col] if target_agb_col in summary_scenario_stats.columns else None
    baseline_scenario_agb = summary_scenario_stats.loc[polygon_name, baseline_scenario_agb_col] if baseline_scenario_agb_col in summary_scenario_stats.columns else None
    pct_agb_of_baseline = (target_agb / baseline_scenario_agb * 100) if (target_agb is not None and baseline_scenario_agb is not None and baseline_scenario_agb != 0) else None

    # Build title and subtitles
    title_text = f"{polygon_name}, {baseline_label} baseline"

    if baseline_scenario_agb is not None and pct_agb_of_baseline is not None:
        subtitle1 = f"Baseline AGB: {baseline_scenario_agb:.2f} Tg | {target_year} is {pct_agb_of_baseline:.1f}% of baseline"
    else:
        subtitle1 = ""

    subtitle2 = f"Baseline forest cover: {baseline_forest:,.0f} ha | {target_year} is {pct_forest_of_baseline:.1f}% of baseline"

    # Title and subtitle annotations; the title is dropped when show_title is off
    header_annotations = [
        dict(x=0, y=1.28, xref='paper', yref='paper', text=title_text, showarrow=False, xanchor='left', align='left',
             font=_helvetica(title_size, weight=600)),
        dict(x=0, y=1.19, xref='paper', yref='paper', text=subtitle1, showarrow=False, xanchor='left', align='left',
             font=_helvetica(subtitle_size, weight=600)),
        dict(x=0, y=1.11, xref='paper', yref='paper', text=subtitle2, showarrow=False, xanchor='left', align='left',
             font=_helvetica(subtitle_size, weight=600)),
    ]
    if not show_title:
        header_annotations = header_annotations[1:]

    # Plot configurations:
    # (filename, forest column, forest axis label, degradation column, degradation axis label, CI95 column)
    years = forest_trends_df['year'].values

    plot_configs = [
        ('trends_absolute', 'absolute_loss_ha', 'Forest cover loss (ha)',
         'absolute_effect_tg', f'Degradation effect on {target_year} AGB (Tg)', 'ci95_tg'),
        ('trends_pct_of_total', 'pct_of_total_loss', 'Forest loss (% of total)',
         'pct_of_total_degradation', 'Degradation effect (% of total)', 'pct_ci95')]

    for filename, forest_col, forest_axis_label, degrad_col, degrad_axis_label, ci95_col in plot_configs:
        fig = go.Figure()
        # Get data
        forest_vals = forest_trends_df[forest_col].values.astype(float)
        degrad_vals = degradation_trends_df[degrad_col].values.astype(float)
        ci95_vals = None
        if generate_uncertainty_stats and ci95_col in degradation_trends_df.columns:
            ci95_vals = degradation_trends_df[ci95_col].values.astype(float)
            if np.any(pd.isna(ci95_vals)): ci95_vals = None
        # Calculate y-axis ranges (min to 0), with 10% headroom; -1 if any value is missing
        forest_min = np.min(forest_vals) * 1.1 if not np.any(pd.isna(forest_vals)) else -1
        degrad_min = np.min(degrad_vals) * 1.1 if not np.any(pd.isna(degrad_vals)) else -1
        if ci95_vals is not None: degrad_min = min(degrad_min, np.min(degrad_vals - ci95_vals) * 1.1)
        # 1. CI95 shading on y1 (closed polygon: upper bound forwards, lower bound backwards)
        if ci95_vals is not None and not np.any(pd.isna(degrad_vals)):
            upper, lower = degrad_vals + ci95_vals, degrad_vals - ci95_vals
            fig.add_trace(go.Scatter(
                x=np.concatenate([years, years[::-1]]), y=np.concatenate([upper, lower[::-1]]),
                fill='toself', fillcolor=degradation_ci95_colour, line=dict(color=degradation_colour, width=1),
                showlegend=False, yaxis='y1', hoverinfo='skip'))
        # 2. Degradation line on y1 - buffer then black line
        if not np.any(pd.isna(degrad_vals)):
            fig.add_trace(go.Scatter(
                x=years, y=degrad_vals, mode='lines',
                line=dict(color=degradation_colour, width=6),
                showlegend=False, yaxis='y1', hoverinfo='skip'))
            fig.add_trace(go.Scatter(
                x=years, y=degrad_vals, name=degrad_line_label, mode='lines+markers',
                line=dict(color='black', width=2),
                marker=dict(size=8, color=degradation_colour, line=dict(color='black', width=1)),
                yaxis='y1'))
        # 3. Forest cover on y2 - buffer then black line
        if not np.any(pd.isna(forest_vals)):
            fig.add_trace(go.Scatter(
                x=years, y=forest_vals, mode='lines',
                line=dict(color=forest_cover_colour, width=6),
                showlegend=False, yaxis='y2', hoverinfo='skip'))
            fig.add_trace(go.Scatter(
                x=years, y=forest_vals, name=forest_line_label, mode='lines+markers',
                line=dict(color='black', width=2),
                marker=dict(size=8, color=forest_cover_colour, line=dict(color='black', width=1)),
                yaxis='y2'))
        # Layout. Axis titles use title=dict(text=..., font=...): the older
        # 'titlefont' shorthand is deprecated and is rejected by Plotly 6.
        fig.update_layout(
            xaxis=dict(
                dtick=1, tickfont=_helvetica(axis_value_size),
                showgrid=True, gridcolor='lightgrey', gridwidth=0.5),
            yaxis=dict(
                title=dict(text=degrad_axis_label, font=_helvetica(axis_title_size)),
                tickfont=_helvetica(axis_value_size),
                side='left', range=[degrad_min, 0], showgrid=True, gridcolor='lightgrey', gridwidth=0.5),
            yaxis2=dict(
                title=dict(text=forest_axis_label, font=_helvetica(axis_title_size)),
                tickfont=_helvetica(axis_value_size),
                side='right', overlaying='y', range=[forest_min, 0], showgrid=False),
            legend=dict(x=0.01, y=0.01, bgcolor='rgba(255,255,255,0.8)',
                        font=_helvetica(legend_size)),
            width=width, height=height, hovermode='x unified',
            plot_bgcolor='white', paper_bgcolor='white',
            margin=dict(t=115),
            annotations=header_annotations)

        # Save labelled PNG
        fig.write_image(join(trends_labelled, f'{filename}.png'), scale=dpi / 96)

        # Save labelled SVG
        if svg_transparent_background:
            fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
        fig.write_image(join(trends_labelled_svg, f'{filename}.svg'), scale=dpi / 96)

        # Unlabelled version: remove title, subtitles, axes titles; make legend text invisible but preserve box width
        fig.layout.annotations = []
        fig.layout.yaxis.title.text = ''
        fig.layout.yaxis2.title.text = ''
        fig.update_layout(
            legend=dict(font=dict(color='rgba(0,0,0,0)')),
            plot_bgcolor='white', paper_bgcolor='white'
        )

        # Save unlabelled PNG
        fig.write_image(join(trends_unlabelled, f'{filename}.png'), scale=dpi / 96)

        # Save unlabelled SVG
        if svg_transparent_background:
            fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
        fig.write_image(join(trends_unlabelled_svg, f'{filename}.svg'), scale=dpi / 96)

    progress_index += 1
    progress_label.value = f"Polygon progress: {progress_index}/{progress_total}"

print("Disturbance trends completed.")

# Disconnect runtime

In [None]:
# Unassign the Colab runtime once every cell above has finished.
# Useful for stopping background execution
runtime.unassign()