<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/dev/7_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and directories

In [None]:
# Define base directory
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi_asartr"
# base_dir = '/content/drive/MyDrive/masfi_asartr'

# Mount Google Drive
from google.colab import drive
import os
import sys

# The prefix of base_dir decides which mount point is used
_shared_drive_root, _personal_drive_root = '/gdrive/Shareddrives/', '/content/drive/MyDrive/'
if base_dir.startswith(_shared_drive_root):
  drive.mount('/gdrive', force_remount=True)
elif base_dir.startswith(_personal_drive_root):
  drive.mount('/content/drive', force_remount=True)
  # Only the personal drive directory is created automatically
  os.makedirs(base_dir, exist_ok=True)
else:
  print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Append the resolved base directory to the import path, once only
_path_to_add = os.path.realpath(base_dir)
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs and upgrades
!pip install geopandas
!pip install rasterio
!pip install tensorflow
!pip install xgboost

In [None]:
# Reload imports, replacing those in the cache
%reload_ext autoreload
%autoreload 2
# Imports
import json
import geopandas as gpd
from google.colab import runtime
import ipywidgets as widgets
import itertools
import joblib
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
from numpy.random import normal
from os import makedirs
from os.path import exists, join
from osgeo import gdal
import pickle
import pandas as pd
import rasterio
from rasterio.features import rasterize
import scipy.stats as st
import shutil
from shutil import copyfile
import xgboost as xgb

# # Define GPU
# import tensorflow as tf
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   print('GPU device not found.')
# else:
#   print(f"Found GPU at: {device_name}")

In [None]:
# Define directories
# Every path below is built from base_dir; the numbered folder names
# ("1_areas", "3_features", ...) are fixed string literals.
areas_dir = join(base_dir, "1_areas")
polygons_dir = join(areas_dir, "polygons")
feature_dir = join(base_dir, "3_features")
feature_final_dir = join(feature_dir, "final")
models_dir = join(base_dir, "5_models")
scenarios_dir = join(base_dir, "6_scenarios")
masks_dir = join(scenarios_dir, "scenario_masks")
predictions_dir = join(base_dir, "7_predictions")

# Create directories
# Only the output directory of this notebook is created here.
makedirs(predictions_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress):
  """Write a 2D array to a single-band Float32 GeoTIFF georeferenced like a template raster.

  Args:
    input_array: Array written to band 1 of the output.
    output_tif: Path of the GeoTIFF to create.
    template: Path of a raster whose band 1 size, geotransform and projection
      are copied to the output. May be the same path as output_tif.
    nodatavalue: Value registered as nodata on the output band.
    compress: If truthy, the file is created with DEFLATE compression
      (PREDICTOR=2, ZLEVEL=9); otherwise no creation options are passed.

  Raises:
    RuntimeError: If the template cannot be opened or the output cannot be created.
  """
  # Read everything needed from the template, then release it BEFORE creating
  # the output, because callers may overwrite the template path itself.
  template_ds = gdal.Open(template)
  if template_ds is None:
    raise RuntimeError(f"Template raster could not be opened: {template}")
  template_band = template_ds.GetRasterBand(1)
  x_size, y_size = template_band.XSize, template_band.YSize
  template_dimensions, template_projection = template_ds.GetGeoTransform(), template_ds.GetProjection()
  template_band, template_ds = None, None

  # Any falsy value (False, None, 0) means "no compression"
  creation_options = ["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"] if compress else []
  output_ds = gdal.GetDriverByName("GTiff").Create(output_tif, x_size, y_size, bands=1, eType=gdal.GDT_Float32,
                                                   options=creation_options)
  if output_ds is None:
    raise RuntimeError(f"Output raster could not be created: {output_tif}")
  output_band = output_ds.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  output_ds.SetGeoTransform(template_dimensions)
  output_ds.SetProjection(template_projection)
  # Flush and drop the references explicitly so the file is complete on disk on return
  output_ds.FlushCache()
  output_band, output_ds = None, None

# Burn a polygon to raster
def burn_polygon_to_raster(raster, polygon, fixed=True, fixed_value=1, column_name=None, all_touched=True):
  """Burn the geometries of a vector file into band 1 of an existing raster, in place.

  Args:
    raster: Path of the raster, opened in 'r+' mode and overwritten.
    polygon: Path of the vector file read with geopandas.
    fixed: If True every geometry is burned with fixed_value; otherwise the
      value is taken per row from column_name.
    fixed_value: Value burned when fixed is True.
    column_name: Column holding the burn values when fixed is False.
      Defaults to the first column of the file.
    all_touched: Passed through to rasterio.features.rasterize.
  """
  gdf = gpd.read_file(polygon)
  if fixed:
    burn_values = [fixed_value] * len(gdf)
  else:
    if column_name is None:
      column_name = gdf.columns[0]
    # Take the value from the same row as the geometry. Looking the row up by
    # geometry equality was quadratic and gave duplicate geometries the first row's value.
    burn_values = gdf[column_name].values
  with rasterio.open(raster, 'r+') as src:
    array = src.read(1)
    transform = src.transform
    # Geometries are burned one at a time in file order, so later rows overwrite earlier ones
    for geom, burn_value in zip(gdf.geometry, burn_values):
      rasterize([(geom, burn_value)], out=array, transform=transform,
          all_touched=all_touched, dtype=src.meta['dtype'], out_shape=src.shape)
    src.write(array, 1)

# Select model

In [None]:
# Select a baseline model, tested and trained in advance.
# The target must have an uncertainty metric - otherwise
# skip to the next notebook '8_statistics' and use the outputs
# of the '6_scenarios' notebook.
model_exists = False
for subdir, dirs, files in os.walk(models_dir):
  # A directory counts as a model when it directly contains 'model.json'
  if 'model.json' not in files:
    continue
  # Print the path relative to models_dir as a ready-to-paste assignment
  print(f'selected_model = "{subdir.split(f"{models_dir}/",1)[1]}"')
  model_exists = True
if model_exists == False:
  print("No model exists.")

In [None]:
selected_model = "agbd_historic_250429_223033"
categorise_target = False # If the target was categorised in 5_models

# Define model directories
# NOTE: selected_model_json and selected_model_descr_dir are file paths, not directories.
selected_model_dir = join(models_dir,selected_model)
selected_model_json = join(selected_model_dir, "model.json")
selected_model_descr_dir = join(selected_model_dir, "model_description.json")
selected_model_dataset_path = join(selected_model_dir, f"{selected_model}.pkl")
# NOTE(review): read_pickle executes arbitrary code from the file - only load trusted datasets.
selected_model_dataset = pd.read_pickle(selected_model_dataset_path)

# Read description for model dataset attributes
with open(join(selected_model_dir,"model_dataset_description.json")) as model_dataset_description_json:
  model_dataset_description = json.load(model_dataset_description_json)
# Each key below must exist in the JSON, otherwise a KeyError is raised.
model_dataset_name = model_dataset_description["model_dataset_name"]
number_of_columns = model_dataset_description["number_of_columns"]
number_of_rows = model_dataset_description["number_of_rows"]
id_column = model_dataset_description["id_column"]
selected_target = model_dataset_description["selected_target"]
uncertainty = model_dataset_description["uncertainty"]
covariates_renamed = model_dataset_description["covariates_renamed"]
covariates_categorised = model_dataset_description["covariates_categorised"]
# The model features are the selected features followed by the renamed covariates.
selected_features = model_dataset_description["selected_features"] + model_dataset_description["covariates_renamed"]
categorical_columns = model_dataset_description["categorical_columns"]
descriptive_parameters = model_dataset_description["descriptive_parameters"]
filter_parameter = model_dataset_description["filter_parameter"]
filter_values_to_include = model_dataset_description["filter_values_to_include"]
sample_imported_dataset = model_dataset_description["sample_imported_dataset"]
sample_imported_dataset_by_percent = model_dataset_description["sample_imported_dataset_by_percent"]
sample_imported_dataset_value = model_dataset_description["sample_imported_dataset_value"]

# Reload hyperparameters
with open(selected_model_descr_dir) as model_description_json:
  model_description = json.load(model_description_json)
# NOTE(review): eval() runs whatever expression is stored in the JSON string.
# It is only safe while model_description.json is produced by this project;
# consider ast.literal_eval if the stored value is always a plain dict literal.
final_hyperparameters = eval(model_description["hyperparameters"])

# Remove early stopping and replace with mean n_estimators
if "early_stopping_rounds" in final_hyperparameters:
  final_hyperparameters = {k:v for k, v in final_hyperparameters.items() if k != "early_stopping_rounds"}
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Define directories
scenarios_model_dir = join(scenarios_dir,selected_model)
uncertainty_selected_model_dir = join(predictions_dir, selected_model)
model_iterations_dir = join(uncertainty_selected_model_dir, "model_iterations")

# Create directories
makedirs(uncertainty_selected_model_dir, exist_ok=True)
makedirs(model_iterations_dir, exist_ok=True)

# Model iterations

In [None]:
# Verify that the target is equal to the mean
print(f'mean = "{selected_target}"')

# Calculate se from columns flagged 'uncertainty'
if len(uncertainty) > 0:
  # Suggest every flagged uncertainty column that is not part of the target
  for col in selected_model_dataset.columns:
    if col not in uncertainty or col in selected_target:
      continue
    print(f'se = "{col}"')
else:
  # Nothing flagged: list all columns so a metric can be built by hand
  print("There are no flagged uncertainty columns to calculate SE from.")
  print("Manually create the metric from the available columns.")
  for col in selected_model_dataset.columns:
    print(f"{col}")

In [None]:
mean = "tar_agbd"
se = "tar_agbd_se"
# Liang et al. (2023) use SE as a proxy for STDEV

# Set model iterations
model_iterations = 100

# Features are fixed; the target is re-drawn for every iteration from the mean and se arrays
model_dataset_x = selected_model_dataset[selected_features]
mean_array = selected_model_dataset[mean].values
se_array = selected_model_dataset[se].values
predictor_class = xgb.XGBClassifier if categorise_target else xgb.XGBRegressor
XGBPredictor = predictor_class(**final_hyperparameters)
model_params = XGBPredictor.get_params()
model_params['eval_metric'] = model_description['metric_used_for_training']
# Default fix for new XGBoost version
for dropped_key in ('n_estimators', 'enable_categorical', 'missing'):
  model_params.pop(dropped_key, None)

# Progress label
model_progress_index = 0
model_progress_label = widgets.Label(f"Model iteration: {model_progress_index}/{model_iterations}")
display(model_progress_label)

for model_iteration in range(1,model_iterations+1):
  # Iterations already saved on disk are skipped, so the cell can be resumed
  model_iteration_filename = f"model_iteration_{model_iteration}.json"
  model_iteration_path = join(model_iterations_dir, model_iteration_filename)
  if exists(model_iteration_path) == False:
    # Seed with the iteration number for replicability
    np.random.seed(model_iteration)
    # Draw this iteration's y from a normal distribution around the mean
    model_dataset_y = normal(mean_array, se_array)
    model_dtrain = xgb.DMatrix(model_dataset_x, model_dataset_y, enable_categorical=True)
    # Train with the tested hyperparameters, then save
    model = xgb.train(
        model_params,
        model_dtrain,
        num_boost_round=final_hyperparameters['n_estimators'],
        verbose_eval=True,
    )
    model.save_model(model_iteration_path)
  # Progress advances for skipped iterations too
  model_progress_index += 1
  model_progress_label.value = f"Model iteration: {model_progress_index}/{model_iterations}"
print("All model iterations have been trained and saved.")

# Scenario iterations

In [None]:
# Scenarios must be designed and tested using 6_scenarios first.
# Features in the 'constant' and scenario subdirs should not be moved.

# Select a scenario area
scenario_area_exists = False
for subdir in os.listdir(scenarios_model_dir):
  # Entries ending in .json or .csv are not scenario areas
  if subdir.endswith(('.json', '.csv')):
    continue
  print(f'selected_scenario_area = "{subdir}"')
  scenario_area_exists = True
if scenario_area_exists == False:
  print(f"Create a scenario area directory in {scenarios_model_dir}")

In [None]:
selected_scenario_area = "asartr"

# Locate scenario area directories
selected_scenario_area_dir = join(scenarios_model_dir, selected_scenario_area)
features_dir = join(selected_scenario_area_dir, "features")
tile_templates_dir = join(selected_scenario_area_dir, 'tile_templates')
tile_feature_stacks_dir = join(selected_scenario_area_dir, 'tile_feature_stacks')

# Define and create the uncertainty scenario area directory and its tile cache
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_area}")
tile_prediction_cache_dir = join(uncertainty_scenario_area_dir, "tile_prediction_cache")
for new_dir in (uncertainty_scenario_area_dir, tile_prediction_cache_dir):
  makedirs(new_dir, exist_ok=True)

# Every entry of the feature stack tiles directory is an available scenario
scenario_stacks_list = list(os.listdir(tile_feature_stacks_dir))

# Print the scenarios as a ready-to-paste list
print("scenarios_to_predict = [")
for scenario in sorted(scenario_stacks_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# Scenarios to predict. The entries are looked up as subdirectory names of
# tile_feature_stacks_dir by the prediction cell.
scenarios_to_predict = [
  "1990_oldgrowth_1",
  "1990_oldgrowth_2",
  "2014",
  "2014_no_degradation_since_1991",
  "2014_oldgrowth_1",
  "2014_oldgrowth_2",
  "2015",
  "2016",
  "2017",
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_oldgrowth_1",
  "2021_oldgrowth_2",
  "2022",
  "2022_alternate_degradation_2021",
  "2022_no_degradation_since_2022",
  "2022_oldgrowth_1",
  "2022_oldgrowth_2",
  "2023",
  "2023_alternate_degradation_2022",
  "2023_no_degradation_since_2023",
  "2023_oldgrowth_1",
  "2023_oldgrowth_2",
  "2024",
  "2024_alternate_degradation_2014",
  "2024_alternate_degradation_2021",
  "2024_alternate_degradation_2022",
  "2024_alternate_degradation_2023",
  "2024_no_degradation_since_2000",
  "2024_no_degradation_since_2015",
  "2024_no_degradation_since_2022",
  "2024_no_degradation_since_2023",
  "2024_no_degradation_since_2024",
  "2024_oldgrowth_1",
  "2024_oldgrowth_2",
  "all_oldgrowth_1",
  "all_oldgrowth_2",
]

# Check the number of model iterations available
# Counts every entry in model_iterations_dir, whatever its name or extension.
model_iterations_available = len(os.listdir(model_iterations_dir))
print(f"\nThere are {len(os.listdir(model_iterations_dir))} model iterations available.")

In [None]:
# Set the number of scenario iterations. It must be <= the number of model iterations available.
scenario_iterations = 100

assert scenario_iterations <= model_iterations_available, f"Reduce the number of scenario iterations to <= {model_iterations_available}."

# Change this and the code within the block accordingly.
add_covariates = True # Adds a selected covariate value as the feature
sensitivity_value = 0.99
beam_value = 5

# # Check for GPU
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0': print('GPU device not found')
# else: print(f"Found GPU at: {device_name}")

def _load_feature_stack(stack_dir, scenario_name, stack_number):
  """Load one feature stack .npy and, if add_covariates is set, append the constant covariate columns."""
  stack = np.load(join(stack_dir, f"feature_stack_{scenario_name}_{stack_number}.npy"))
  if add_covariates:
    # Column order is unchanged: beam first, then sensitivity
    stack = np.hstack((stack,
                       np.full((stack.shape[0], 1), beam_value, dtype=int),
                       np.full((stack.shape[0], 1), sensitivity_value, dtype=float)
                       ))
  return stack

def _tile_size(tile_path):
  """Return (YSize, XSize) of band 1 of a raster, opening the file only once."""
  tile_ds = gdal.Open(tile_path)
  tile_band = tile_ds.GetRasterBand(1)
  tile_yx = (tile_band.YSize, tile_band.XSize)
  tile_band, tile_ds = None, None
  return tile_yx

# Check existing tile parameters
template_tile_list = []
for file in os.listdir(tile_templates_dir):
  if file.endswith('.tif') and file[:13] == 'template_tile':
    template_tile_list.append(file)
n_tiles = len(template_tile_list)

if n_tiles < 1: print("There are currently no template tiles.")
if n_tiles >= 1:
  template_tile_x = gdal.Open(join(tile_templates_dir,'template_tile_1.tif')).GetRasterBand(1).XSize
  print(f"There are {n_tiles} template tiles.")

# Tile progress. The label is always created so the updates in the loop can
# never hit an undefined name; it is only displayed when there are several tiles.
tile_progress_index, tile_progress_label = 0, widgets.Label(value=f"Tile progress: 0 / {n_tiles}")
if n_tiles > 1:
  display(tile_progress_label)

# Loop through each scenario
for scenario in scenarios_to_predict:
  scenario_feature_stack_dir = join(tile_feature_stacks_dir, scenario)
  # Create scenario iterations directory
  scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")
  makedirs(scenario_iterations_dir, exist_ok=True)
  # Iteration progress
  iteration_progress_index = 0
  iteration_progress_label = widgets.Label(f"{scenario} iteration progress: {iteration_progress_index} / {scenario_iterations}")
  display(iteration_progress_label)

  # Check if all scenario iterations already exist, if not then load feature stack
  scenario_iteration_list = [
    join(scenario_iterations_dir, f"{scenario}__{selected_scenario_area}_{selected_model}_unmasked_iteration_{model_iteration}.tif")
    for model_iteration in range(1,scenario_iterations+1)
  ]
  all_scenario_iterations_exist = all(exists(scenario_iteration) for scenario_iteration in scenario_iteration_list)
  if all_scenario_iterations_exist:
    # Report the real iteration count instead of a hard-coded 100
    iteration_progress_label.value = f"{scenario} iteration progress: {scenario_iterations} / {scenario_iterations}"
    continue

  # The number of feature stacks decides between single-array and tiled prediction
  n_stacks = len(os.listdir(scenario_feature_stack_dir))
  if n_stacks < 1:
    # Without this guard a prediction left over from a previous scenario could be exported
    print(f"No feature stacks found in {scenario_feature_stack_dir}; skipping {scenario}.")
    continue
  if n_stacks == 1:
    # Load template parameters and the single feature stack once per scenario
    template_tile_dir = join(tile_templates_dir, f"template_tile_1.tif")
    template_tile_y, template_tile_x = _tile_size(template_tile_dir)
    feature_stack = _load_feature_stack(scenario_feature_stack_dir, scenario, 1)

  # Predict scenario for each model iteration
  for model_iteration in range(1,scenario_iterations+1):
    # Define the model
    model_dir = join(model_iterations_dir,f"model_iteration_{model_iteration}.json")
    prediction_iteration_filename = f"{scenario}__{selected_scenario_area}_{selected_model}_unmasked_iteration_{model_iteration}.tif"
    prediction_iteration_path = join(scenario_iterations_dir, prediction_iteration_filename)
    # If scenario iteration does not exist:
    if not exists(prediction_iteration_path):
      # Load model
      booster = xgb.Booster()
      booster.load_model(model_dir)
      if categorise_target: XGBPredictor = xgb.XGBClassifier()
      else: XGBPredictor = xgb.XGBRegressor()
      XGBPredictor._Booster = booster
      # Avoids issues using dataframe from CPU
      xgb.set_config(verbosity=0, use_rmm=True)

      if n_stacks == 1:
        # Define prediction array and reshape
        prediction = XGBPredictor.predict(feature_stack)
        prediction_array = prediction.reshape((template_tile_y, template_tile_x))
        prediction = None # Flush prediction

      # Tiling for when feature stacks are separated into chunks
      if n_stacks > 1:
        # Create a tile cache directory for the prediction
        tile_cache_iteration_dir = join(tile_prediction_cache_dir, prediction_iteration_filename[:-4])
        makedirs(tile_cache_iteration_dir, exist_ok=True)
        # Create a tile count to match the feature stack chunk
        for stack in range(1, n_stacks+1):
          iteration_tile_filename = f"scenario_tile_{stack}.tif"
          iteration_tile_path = join(tile_cache_iteration_dir, iteration_tile_filename)
          # Tiles already in the cache are reused so an interrupted run can resume
          if not exists(iteration_tile_path):
            # Load template tile parameters
            template_tile_dir = join(tile_templates_dir, f"template_tile_{stack}.tif")
            template_tile_y, template_tile_x = _tile_size(template_tile_dir)
            # Load feature tile stack, with covariates (sensitivity and BEAM) if enabled
            feature_stack = _load_feature_stack(scenario_feature_stack_dir, scenario, stack)
            # Define prediction array and reshape
            prediction = XGBPredictor.predict(feature_stack)
            feature_stack = None # Flush feature stack
            prediction_tile = prediction.reshape((template_tile_y, template_tile_x))
            prediction = None # Flush prediction
            # Export prediction array as .tif
            export_array_as_tif(prediction_tile, iteration_tile_path, template = template_tile_dir, compress = False)
            prediction_tile = None # Flush prediction tile
          # Update progress
          tile_progress_index += 1
          tile_progress_label.value = f"Tile progress: {tile_progress_index} / {n_stacks}"
        # Merge the tiles in numeric stack order. os.listdir() order is arbitrary
        # (and 'tile_10' sorts before 'tile_2'), which would scramble the rows.
        tile_arrays = []
        for stack in range(1, n_stacks+1):
          tile_dir = join(tile_cache_iteration_dir, f"scenario_tile_{stack}.tif")
          tile_arrays.append(gdal.Open(tile_dir).ReadAsArray())
        prediction_array = np.vstack(tile_arrays)
        tile_arrays = None # Flush tile arrays
        # Delete scenario tile cache directory and reset index
        shutil.rmtree(tile_cache_iteration_dir)
        tile_progress_index = 0
        tile_progress_label.value = f"Tile progress: 0 / {n_stacks}"

      # Define scenario template
      # NOTE(review): this takes the first entry listed in features_dir - confirm it is always a raster.
      scenario_template = join(features_dir, os.listdir(features_dir)[0])
      export_array_as_tif(prediction_array, prediction_iteration_path, template = scenario_template, compress = True)

    iteration_progress_index += 1
    iteration_progress_label.value = f"{scenario} iteration progress: {iteration_progress_index} / {scenario_iterations}"
print("\nScenario iterations complete.")

# Scenario statistics


In [None]:
# Select a scenario iterations area
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  # Everything except the model iterations directory is offered as an area
  if subdir == "model_iterations":
    continue
  # The first 10 characters (the 'scenarios_' prefix) are dropped from the name
  print(f'selected_scenario_iterations_area = "{subdir[10:]}"')
  scenario_iterations_area_exists = True
if scenario_iterations_area_exists == False:
  print(f"Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "asartr"

# Statistics outputs live next to the iteration directories of the selected area
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_unmasked_dir = join(uncertainty_scenario_area_dir,"statistics_unmasked")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")
for statistics_dir in (statistics_unmasked_dir, statistics_masked_dir):
  makedirs(statistics_dir, exist_ok=True)

# Collect scenarios with iterations: directory names with the '_iterations' suffix removed
scenarios_iterations_list = [
  subdir[:-len("_iterations")]
  for subdir in os.listdir(uncertainty_scenario_area_dir)
  if subdir.endswith("_iterations")
]
# Select scenarios to calculate mean and standard deviation
print("scenarios_to_calculate = [")
for scenario in sorted(scenarios_iterations_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# Scenarios to calculate statistics for. Each entry must have a matching
# "<scenario>_iterations" directory inside uncertainty_scenario_area_dir.
scenarios_to_calculate = [
  "1990_oldgrowth_1",
  "1990_oldgrowth_2",
  "2014",
  "2014_no_degradation_since_1991",
  "2014_oldgrowth_1",
  "2014_oldgrowth_2",
  "2015",
  "2016",
  "2017",
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_oldgrowth_1",
  "2021_oldgrowth_2",
  "2022",
  "2022_alternate_degradation_2021",
  "2022_no_degradation_since_2022",
  "2022_oldgrowth_1",
  "2022_oldgrowth_2",
  "2023",
  "2023_alternate_degradation_2022",
  "2023_no_degradation_since_2023",
  "2023_oldgrowth_1",
  "2023_oldgrowth_2",
  "2024",
  "2024_no_degradation_since_2000",
  "2024_alternate_degradation_2014",
  "2024_alternate_degradation_2021",
  "2024_alternate_degradation_2022",
  "2024_alternate_degradation_2023",
  "2024_no_degradation_since_2015",
  "2024_no_degradation_since_2022",
  "2024_no_degradation_since_2023",
  "2024_no_degradation_since_2024",
  "2024_oldgrowth_1",
  "2024_oldgrowth_2",
  "all_oldgrowth_1",
  "all_oldgrowth_2",
]

In [None]:
# Check iteration number

# Exact '0' pixels without decimals can be an indicator that an iteration was incorrectly generated.
# These must be deleted and predicted again to avoid incorrect statistics.
check_problems = True

# Exact '0' pixels might have been genuinely predicted (though unlikely in a regression),
# in which case set fix_problems to True and add these iterations to the problem_rasters list
# (either the .tif filename or its full path).
# The code below will add 0.001 so they won't trigger the problem checker again.
fix_problems = False
problem_rasters = [

]

# Rasters are only rewritten when fix_problems is explicitly enabled
if fix_problems:
  for problem_raster in problem_rasters:
    # Iteration filenames start with "<scenario>__", so the scenario folder is taken
    # from the filename itself rather than from a variable left over from another cell.
    problem_scenario = problem_raster.split("/")[-1].split("__", 1)[0]
    # If problem_raster is an absolute path, join() returns it unchanged
    problem_raster_path = join(uncertainty_scenario_area_dir, f"{problem_scenario}_iterations", problem_raster)
    problem_raster_array = gdal.Open(problem_raster_path).ReadAsArray()
    problem_raster_array[problem_raster_array == 0] = 0.001
    export_array_as_tif(problem_raster_array, problem_raster_path, template = problem_raster_path, compress=True)

# Check the number of prediction iterations
for scenario in scenarios_to_calculate:
  scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")
  iterations = 0
  for subdir in os.listdir(scenario_iterations_dir):
    if subdir.endswith(".tif"):
      # Check whether the prediction iteration is valid
      if check_problems:
        iteration = join(scenario_iterations_dir,subdir)
        iteration_array = gdal.Open(iteration).ReadAsArray()
        assert np.count_nonzero(iteration_array==0) == 0, f"{subdir} contains 0 values, so the iteration may not have predicted correctly.\n Check the file, delete and repredict if necessary.\n If they are valid 0 values, run the cell below on:\n {iteration}."
      iterations += 1
  print(f"There are {iterations} prediction iterations for scenario {scenario} statistics.")

In [None]:
confidence_interval = 0.95

# Statistics progress
stats_progress_index = 0
stats_progress_label = widgets.Label(f"Scenario stats progress: {stats_progress_index}/{len(scenarios_to_calculate)}")
display(stats_progress_label)

for scenario in scenarios_to_calculate:
    stat_base_filename = f"{scenario}__{selected_scenario_iterations_area}_{selected_model}"
    scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")

    # Define statistics raster paths
    stat_mean_filename = f"mean__{stat_base_filename}_unmasked.tif"
    stat_mean_dir = join(statistics_unmasked_dir,stat_mean_filename)
    stat_uncertainty_filename = f"uncertainty__{stat_base_filename}_unmasked.tif"
    stat_predictions_dir = join(statistics_unmasked_dir,stat_uncertainty_filename)

    # Check whether statistics rasters already exist
    stat_mean_tif_exists = exists(stat_mean_dir)
    stat_uncertainty_tif_exists = exists(stat_predictions_dir)

    if stat_mean_tif_exists and stat_uncertainty_tif_exists:
        print(f"{stat_mean_filename} and {stat_uncertainty_filename} already exist.")
    else:
        # Accumulate a running sum and sum of squares so only one iteration
        # raster is held in memory at a time
        stat_sum = None
        stat_sum_sq = None
        iteration_n = 0
        for subdir in os.listdir(scenario_iterations_dir):
            if subdir.endswith(".tif"):
                # The last iteration read is also used as the export template below
                iteration = os.path.join(scenario_iterations_dir, subdir)
                iteration_array = gdal.Open(iteration).ReadAsArray()
                if stat_sum is None:
                    stat_sum = np.zeros_like(iteration_array, dtype='float64')
                    stat_sum_sq = np.zeros_like(iteration_array, dtype='float64')
                stat_sum += iteration_array  # Running sum for mean
                stat_sum_sq += np.square(iteration_array, dtype=np.float64)  # Running sum of squares for variance
                iteration_n += 1

        if iteration_n == 0:
            # Nothing to average; previously this failed on a None sum
            print(f"No prediction iterations found for {scenario}; statistics skipped.")
        else:
            # Calculate mean: sum / count
            stat_mean = np.divide(stat_sum, iteration_n, dtype='float64')
            if stat_mean_tif_exists == False:
                export_array_as_tif(stat_mean, stat_mean_dir, template = iteration)
                print(f"{stat_mean_filename} has been exported.")
            else: print(f"{stat_mean_filename} already exists.")

            if stat_uncertainty_tif_exists:
                print(f"{stat_uncertainty_filename} already exists.")
            elif iteration_n < 2:
                # The sample variance divides by (n - 1)
                print(f"{scenario} has only {iteration_n} prediction iteration; at least 2 are needed for uncertainty statistics.")
            else:
                # Sample variance: (sum(x^2) - sum(x)^2 / n) / (n - 1)
                stat_variance = (stat_sum_sq - (stat_sum ** 2) / iteration_n) / (iteration_n - 1)
                # Rounding error can push near-constant pixels slightly below zero,
                # which would turn the square root into NaN
                stat_variance[stat_variance < 0] = 0
                # Standard error: σ / sqrt(n)
                stat_se = np.sqrt(stat_variance) / np.sqrt(iteration_n)
                # Calculate confidence intervals using t-distribution
                stat_ci_lower, stat_ci_upper = st.t.interval(confidence_interval, iteration_n - 1, loc=stat_mean, scale=stat_se)
                # CI half-width: (upper - lower) / 2
                stat_ci = np.divide(np.subtract(stat_ci_upper, stat_ci_lower, dtype='float64'), 2, dtype='float64')
                # Uncertainty: (CI half-width / mean) * 100%
                stat_uncertainty = np.multiply(np.divide(stat_ci, stat_mean, dtype='float64'), 100, dtype='float64')
                # Export statistics arrays as rasters
                export_array_as_tif(stat_se, join(statistics_unmasked_dir,f"se__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"se__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_ci_lower, join(statistics_unmasked_dir,f"ci_lower__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"ci_lower__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_ci_upper, join(statistics_unmasked_dir,f"ci_upper__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"ci_upper__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_ci, join(statistics_unmasked_dir,f"ci__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"ci__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_uncertainty, stat_predictions_dir, template = iteration)
                print(f"{stat_uncertainty_filename} has been exported.")

    stats_progress_index += 1
    stats_progress_label.value = (f"Scenario stats progress: {stats_progress_index}/{len(scenarios_to_calculate)}")

print("Statistics calculations and .tif exports complete.")

# Mask scenario statistics

In [None]:
# Select a scenario iterations area
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  # Everything except the model iterations directory is offered as an area
  if subdir == "model_iterations":
    continue
  # The first 10 characters (the 'scenarios_' prefix) are dropped from the name
  print(f'selected_scenario_iterations_area = "{subdir[10:]}"')
  scenario_iterations_area_exists = True
if scenario_iterations_area_exists == False:
  print(f"Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "asartr"

# Statistics directories of the selected area
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_unmasked_dir = join(uncertainty_scenario_area_dir,"statistics_unmasked")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")

# Use polygons for masking, only areas inside the polygons will be included
polygons_to_exclude = ['template.gpkg', 'project_area_buffered_bbox.gpkg']
print("mask_polygons = [")
for polygon in os.listdir(polygons_dir):
  # Skip the excluded files and any previously generated inverse polygon
  if polygon in polygons_to_exclude or 'inverse' in polygon:
    continue
  # The last 5 characters (the '.gpkg' extension) are dropped from the name
  print(f"  '{polygon[:-5]}',")
print("]")

In [None]:
mask_polygons = [
  # 'project_area',
  'gedi_area',
  # 'peninsular_malaysia',
  # 'pa_taman_krau',
  # 'pa_ais',
]


# Create an inverse polygon (template minus mask polygon) for each mask polygon
template_polygon_path = join(polygons_dir, "template.gpkg")
for polygon in mask_polygons:
  inverse_polygon_path = join(polygons_dir, f"{polygon}_inverse.gpkg")
  # Inverse polygons that are already on disk are reused
  if exists(inverse_polygon_path):
    print(f"An inverse masking polygon for {polygon} already exists.")
    continue
  polygon_path = join(polygons_dir, f"{polygon}.gpkg")
  template_polygon = gpd.read_file(template_polygon_path)
  polygon_read = gpd.read_file(polygon_path)
  polygon_crs = polygon_read.crs.to_epsg()
  # Only the first geometry of the difference is kept
  template_minus_polygon = template_polygon['geometry'].difference(polygon_read['geometry'])
  inverse_polygon = template_minus_polygon.iloc[0]
  # The inverse polygon is written with the EPSG code of the mask polygon
  inverse_polygon_gdf = gpd.GeoDataFrame({'geometry': [inverse_polygon]}, crs=f"EPSG:{polygon_crs}")
  inverse_polygon_gdf.to_file(inverse_polygon_path, driver="GPKG")
  print(f"An inverse masking polygon for {polygon} has been created in {polygons_dir}.")


# If only [scenario]_oldgrowth_1 exists, it simply removes all disturbance from all
# disturbance features. It is copied to [scenario]_oldgrowth, which is masked later.

# If both oldgrowth_1 and oldgrowth_2 exist,
# oldgrowth_1 uses an area-based proxy for pre-Landsat undisturbed forest
# oldgrowth_2 simply removes all disturbance from all disturbance features
# The merged [scenario]_oldgrowth takes the maximum pixel value of the two, and the
# uncertainty of whichever scenario supplied that value.

unmasked_statistics_files = os.listdir(statistics_unmasked_dir)

# Identify all oldgrowth 1 files (both mean and uncertainty)
oldgrowth_oldgrowth_1_files = [
    filename for filename in unmasked_statistics_files
    if 'oldgrowth_1__' in filename
    and filename.startswith(('mean__', 'uncertainty__'))
    and filename.endswith('_unmasked.tif')
]

# The mean oldgrowth_1 files drive the merge; all other filenames are derived from them
mean_oldgrowth_1_files = [
    filename for filename in oldgrowth_oldgrowth_1_files
    if filename.startswith('mean__')
]


def read_unmasked_statistic(filename):
    """Read a raster from statistics_unmasked_dir and return it as an array."""
    return gdal.Open(join(statistics_unmasked_dir, filename)).ReadAsArray()


for mean_oldgrowth_1_file in mean_oldgrowth_1_files:
    # Derive the uncertainty, oldgrowth_2 and merged filenames from the mean filename
    uncertainty_oldgrowth_1_file = mean_oldgrowth_1_file.replace('mean__', 'uncertainty__')
    mean_oldgrowth_2_file = mean_oldgrowth_1_file.replace('oldgrowth_1__', 'oldgrowth_2__')
    uncertainty_oldgrowth_2_file = uncertainty_oldgrowth_1_file.replace('oldgrowth_1__', 'oldgrowth_2__')
    mean_merged_file = mean_oldgrowth_1_file.replace('oldgrowth_1__', 'oldgrowth__')
    uncertainty_merged_file = uncertainty_oldgrowth_1_file.replace('oldgrowth_1__', 'oldgrowth__')
    mean_merged_path = join(statistics_unmasked_dir, mean_merged_file)
    uncertainty_merged_path = join(statistics_unmasked_dir, uncertainty_merged_file)

    # Nothing to do when both merged outputs are already present
    if exists(mean_merged_path) and exists(uncertainty_merged_path):
        print(f"Merged files already exist for {mean_oldgrowth_1_file}")
        continue
    print(f"Processing {mean_oldgrowth_1_file}")

    mean_oldgrowth_1_path = join(statistics_unmasked_dir, mean_oldgrowth_1_file)
    uncertainty_oldgrowth_1_path = join(statistics_unmasked_dir, uncertainty_oldgrowth_1_file)

    # Without an oldgrowth_2 counterpart, oldgrowth_1 is copied across as the merged result
    if not exists(join(statistics_unmasked_dir, mean_oldgrowth_2_file)):
        if not exists(mean_merged_path):
            shutil.copy2(mean_oldgrowth_1_path, mean_merged_path)
            print(f"Copied oldgrowth_1 mean to {mean_merged_file}")
        if exists(uncertainty_oldgrowth_1_path) and not exists(uncertainty_merged_path):
            shutil.copy2(uncertainty_oldgrowth_1_path, uncertainty_merged_path)
            print(f"Copied oldgrowth_1 uncertainty to {uncertainty_merged_file}")
        continue

    # Both variants exist: keep the larger mean per pixel
    mean_oldgrowth_1_array = read_unmasked_statistic(mean_oldgrowth_1_file)
    mean_oldgrowth_2_array = read_unmasked_statistic(mean_oldgrowth_2_file)
    oldgrowth_1_is_greater = mean_oldgrowth_1_array > mean_oldgrowth_2_array
    merged_mean = np.maximum(mean_oldgrowth_1_array, mean_oldgrowth_2_array)
    export_array_as_tif(merged_mean, mean_merged_path, compress=True)
    print(f"Saved merged mean: {mean_merged_file}")

    # The uncertainty can only be merged when the oldgrowth_1 uncertainty exists
    if not exists(uncertainty_oldgrowth_1_path):
        continue
    uncertainty_oldgrowth_1_array = read_unmasked_statistic(uncertainty_oldgrowth_1_file)
    if exists(join(statistics_unmasked_dir, uncertainty_oldgrowth_2_file)):
        # Take the uncertainty belonging to whichever mean was selected for each pixel
        uncertainty_oldgrowth_2_array = read_unmasked_statistic(uncertainty_oldgrowth_2_file)
        merged_uncertainty = np.where(oldgrowth_1_is_greater, uncertainty_oldgrowth_1_array, uncertainty_oldgrowth_2_array)
    else:
        merged_uncertainty = uncertainty_oldgrowth_1_array
    export_array_as_tif(merged_uncertainty, uncertainty_merged_path, compress=True)
    print(f"Saved merged uncertainty: {uncertainty_merged_file}")


# Collect the predictions to mask. Only mean and uncertainty rasters are masked, for
# visualisation and calculating statistics. The oldgrowth_1 / oldgrowth_2 variants
# are left out; the '[scenario]_oldgrowth' files derived from them are masked instead.
# Requiring the '_unmasked.tif' suffix keeps sidecar files (e.g. '.aux.xml') out of
# the list, as the masked filename is built by stripping that suffix.
unmasked_predictions = [
  scenario_prediction for scenario_prediction in os.listdir(statistics_unmasked_dir)
  if scenario_prediction.startswith(('mean__', 'uncertainty__'))
  and scenario_prediction.endswith('_unmasked.tif')
  and 'oldgrowth_1' not in scenario_prediction
  and 'oldgrowth_2' not in scenario_prediction
]

# Determine last feature year for masking future scenarios
final_feature_years = []
for final_feature in os.listdir(feature_final_dir):
  # Year-suffixed features end in '_YYYY.tif'. Slicing rather than indexing keeps
  # filenames shorter than nine characters from raising an IndexError.
  if final_feature.endswith('.tif') and final_feature[-9:-8] == '_':
    try: final_feature_years.append(int(final_feature[-8:-4]))
    except ValueError: continue # The suffix is not a year
if not final_feature_years:
  raise ValueError(f"No features ending in '_YYYY.tif' were found in {feature_final_dir}.")
last_feature_year = max(final_feature_years)

# Masking progress
masking_progress_index = 0
masking_progress_label = widgets.Label(f"Masking progress: {masking_progress_index}/{len(unmasked_predictions)}")
display(masking_progress_label)

# Mask scenario statistics with the relevant mask
for scenario_prediction in unmasked_predictions: # Loop through each unmasked scenario
  # Strip the 13 character '_unmasked.tif' suffix to build the masked filename
  scenario_masked_filename = f"{scenario_prediction[:-13]}.tif"
  scenario_masked_dir = join(statistics_masked_dir, scenario_masked_filename)
  if not exists(scenario_masked_dir):
    mask_exists = False
    scenario_is_all_oldgrowth = 'all_oldgrowth' in scenario_prediction
    # Every mask is checked, so the last matching mask in the listing is the one used
    for mask in os.listdir(masks_dir):
      mask_is_all_oldgrowth = 'all_oldgrowth' in mask
      if mask_is_all_oldgrowth or scenario_is_all_oldgrowth:
        # all_oldgrowth scenarios are only ever paired with all_oldgrowth masks
        mask_matches = mask_is_all_oldgrowth and scenario_is_all_oldgrowth
      else:
        scenario_year = int(scenario_prediction.split('__')[1][:4])
        mask_year = int(mask[12:16])
        # Historic scenarios use the mask of the same year. Future scenarios
        # (later than the last feature year) use the most recent forest mask.
        mask_matches = (scenario_year == mask_year) or (
          scenario_year > last_feature_year and last_feature_year == mask_year)
      if mask_matches:
        selected_mask_filename = mask
        selected_mask_dir = join(masks_dir, selected_mask_filename)
        mask_exists = True
    if not mask_exists: print(f"A suitable mask for {scenario_prediction} does not exist.\n")
    else: # Mask the scenario prediction
      print(f"Masking {scenario_prediction} with {selected_mask_filename}...")
      mask_array = gdal.Open(selected_mask_dir).ReadAsArray()
      scenario_prediction_unmasked_dir = join(statistics_unmasked_dir, scenario_prediction)
      scenario_prediction_array = gdal.Open(scenario_prediction_unmasked_dir).ReadAsArray()
      # Mask where the mask array is not 1
      scenario_masked_array = np.where(mask_array != 1, nodatavalue, scenario_prediction_array)
      export_array_as_tif(scenario_masked_array, scenario_masked_dir, compress = True)
      if mask_polygons:
        for polygon_mask in mask_polygons:
          # Burning nodata into the inverse polygon keeps only the area inside the mask polygon
          inverse_polygon_mask_path = join(polygons_dir, f"{polygon_mask}_inverse.gpkg")
          print(f"Masking {scenario_prediction} with {polygon_mask}...")
          burn_polygon_to_raster(scenario_masked_dir, inverse_polygon_mask_path, fixed_value=nodatavalue, all_touched=False)
        # Recompress the prediction after burning the polygon masks
        scenario_masked_array_2 = gdal.Open(scenario_masked_dir).ReadAsArray()
        export_array_as_tif(scenario_masked_array_2, scenario_masked_dir, compress = True)
      print(f"{scenario_masked_filename} exported.")
  # Update masking progress
  masking_progress_index += 1
  masking_progress_label.value = f"Masking progress: {masking_progress_index}/{len(unmasked_predictions)}"

# Scenario disturbance / change with uncertainty

In [None]:
# Select a scenario statistics area
# Scenario outputs are stored in subdirectories named "scenarios_<area>". Only those
# are listed, so that stray files or unrelated folders are not offered as an area
# with the first characters of their name chopped off.
scenario_prefix = "scenarios_"
scenario_iterations_area_exists = False
for subdir in sorted(os.listdir(uncertainty_selected_model_dir)):
  if subdir.startswith(scenario_prefix):
    print(f'selected_scenario_iterations_area = "{subdir[len(scenario_prefix):]}"')
    scenario_iterations_area_exists = True
if not scenario_iterations_area_exists:
  print("Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "asartr"

# Masked statistics are the inputs; scenario disturbance rasters are the outputs
scenario_area_dirname = f"scenarios_{selected_scenario_iterations_area}"
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir, scenario_area_dirname)
statistics_masked_dir = join(uncertainty_scenario_area_dir, "statistics_masked")
dist_predictions_dir = join(uncertainty_scenario_area_dir, "scenario_disturbance")
makedirs(dist_predictions_dir, exist_ok=True)

# Build dictionaries of disturbance / change options based on available files

# Map each scenario to the statistics available for it. Masked files are named
# '<stat>__<scenario>__...', so the first two '__'-separated parts are used.
scenario_stats = {}
for masked_file in os.listdir(statistics_masked_dir):
    name_parts = masked_file.split("__")
    if len(name_parts) < 2:
        continue
    stat, scenario = name_parts[0], name_parts[1]
    scenario_stats.setdefault(scenario, set()).add(stat)

# Only keep scenarios that have both 'uncertainty' and 'mean' statistics
scenarios = {
    scenario_name for scenario_name, available_stats in scenario_stats.items()
    if {'uncertainty', 'mean'} <= available_stats
}

# Extract and categorize years from scenarios
years = set()
plain_years = set()  # Years as standalone scenarios (e.g. "2014")
oldgrowth_years = set()  # Years with oldgrowth variants

for scenario_name in scenarios:
    # Standalone year scenario
    if scenario_name.isdigit():
        years.add(int(scenario_name))
        plain_years.add(int(scenario_name))
        continue

    # Oldgrowth variant of a year
    if "_oldgrowth" in scenario_name:
        year_text = scenario_name.split("_oldgrowth")[0]
        if year_text.isdigit():
            years.add(int(year_text))
            oldgrowth_years.add(int(year_text))
        continue

    # Alternate scenarios: the leading year, plus the baseline year implied by "since"
    if "_no_degradation_since_" in scenario_name or "_alternate_degradation_" in scenario_name:
        year_text = scenario_name.split("_")[0]
        if year_text.isdigit():
            years.add(int(year_text))
        if "_since_" in scenario_name:
            since_text = scenario_name.split("_since_")[1]
            if since_text.isdigit():
                years.add(int(since_text) - 1)  # Add baseline year (year before "since")

years_sorted = sorted(years)

# Track scenario availability and dependencies
direct_degradation_pairs = set()
direct_deforestation_pairs = set()

# Output dictionaries
scenario_difference_dictionary = {}

# Header of the printed, ready-to-paste dictionary definitions
print("\n".join([
    "# Differences in scenario_difference_dictionary and in before_baseline_dictionary are ",
    "# calculated by subtracting the second scenario / difference from the first. The ",
    "# differences in degradation_deforestation_dictionary are summed.",
    "",
    "scenario_difference_dictionary = {",
    "",
]))

# Track deforestation scenarios for dependencies. A 'deforestation_since' difference is
# available for every later / earlier year pair where the earlier year is a standalone
# scenario and the matching alternate degradation scenario exists.
deforest_since_scenarios = {
    f"{later_year}_deforestation_since_{earlier_year + 1}"
    for later_year in years_sorted
    for earlier_year in years_sorted
    if earlier_year < later_year
    and earlier_year in plain_years
    and f"{later_year}_alternate_degradation_{earlier_year}" in scenarios
}

# 1. Process year-to-year baselines for direct differences
target_entries = {}

for year_a in sorted(years_sorted):  # Target years, ascending
    a_str = str(year_a)
    baseline_sections = []
    target_entries[a_str] = baseline_sections

    for year_b in sorted(years_sorted):  # Baseline years, ascending
        # Only baselines earlier than the target year are compared
        if not year_b < year_a:
            continue

        b_str = str(year_b)
        b_plus1 = str(year_b + 1)

        alternate_scenario = f"{a_str}_alternate_degradation_{b_str}"
        no_degradation_scenario = f"{a_str}_no_degradation_since_{b_plus1}"
        deforestation_name = f"{a_str}_deforestation_since_{b_plus1}"
        degradation_name = f"{a_str}_degradation_since_{b_plus1}"

        # Collect (key line, value line) pairs so they can be sorted alphabetically
        comparisons = []

        # Deforestation since [year_b+1]
        if deforestation_name in deforest_since_scenarios:
            comparisons.append((
                f"  ('{alternate_scenario}', '{b_str}'):",
                f"    '{deforestation_name}',",
            ))
            scenario_difference_dictionary[(alternate_scenario, b_str)] = deforestation_name
            direct_deforestation_pairs.add((a_str, b_plus1))

        # Degradation since [year_b+1]
        if year_a in plain_years and no_degradation_scenario in scenarios:
            comparisons.append((
                f"  ('{a_str}', '{no_degradation_scenario}'):",
                f"    '{degradation_name}',",
            ))
            scenario_difference_dictionary[(a_str, no_degradation_scenario)] = degradation_name
            direct_degradation_pairs.add((a_str, b_plus1))

        if not comparisons:
            continue

        section_text = [f"# Disturbance by {a_str}, using {b_str} as a baseline"]
        for key_line, value_line in sorted(comparisons, key=lambda pair: pair[0]):
            section_text += [key_line, value_line]
        section_text.append("")
        baseline_sections.append((b_str, section_text))

# Output entries sorted by target year first, then by baseline year
for a_str in sorted(target_entries):
    for b_str, section in sorted(target_entries[a_str], key=lambda entry: entry[0]):
        if section:  # Only print non-empty sections
            print("\n".join(section))

# 2. Process oldgrowth baseline sections
oldgrowth_entries = []
all_oldgrowth_available = "all_oldgrowth" in scenarios
for year in years_sorted:
    y_str = str(year)

    # A year needs its standalone scenario, its oldgrowth variant and 'all_oldgrowth'
    if not (year in plain_years and year in oldgrowth_years and all_oldgrowth_available):
        continue

    if not oldgrowth_entries:
        oldgrowth_entries.append("# Disturbance using oldgrowth as a baseline")

    year_oldgrowth = f"{y_str}_oldgrowth"
    degradation_total_name = f"{y_str}_degradation_total"
    deforestation_total_name = f"{y_str}_deforestation_total"

    oldgrowth_entries.extend([
        f"  ('{y_str}', '{year_oldgrowth}'):",
        f"    '{degradation_total_name}',",
        f"  ('{year_oldgrowth}', 'all_oldgrowth'):",
        f"    '{deforestation_total_name}',",
        "",
    ])
    scenario_difference_dictionary[(y_str, year_oldgrowth)] = degradation_total_name
    scenario_difference_dictionary[(year_oldgrowth, "all_oldgrowth")] = deforestation_total_name

# Output oldgrowth entries
if oldgrowth_entries:
    print("\n".join(oldgrowth_entries))

# 3. Process year-to-year change sections
years_available = sorted(year for year in years_sorted if year in plain_years)

if len(years_available) >= 2:
    # Single-year consecutive changes: pair each available year with the one before it
    print("# Change between single years")
    for earlier_year, later_year in zip(years_available, years_available[1:]):
        current, previous = str(later_year), str(earlier_year)
        change_name = f"{current}_change_{previous}"

        print(f"  ('{current}', '{previous}'):")
        print(f"    '{change_name}',")
        scenario_difference_dictionary[(current, previous)] = change_name
    print("")

    # Multi-year comparison (earliest to latest only)
    if len(years_available) > 2:
        earliest, latest = str(years_available[0]), str(years_available[-1])
        change_name = f"{latest}_change_{earliest}"

        print("# Change between multiple years")
        print("# Add any other desired year combinations manually using the pattern below")
        print(f"  ('{latest}', '{earliest}'):")
        print(f"    '{change_name}',")
        scenario_difference_dictionary[(latest, earliest)] = change_name
        print("")

print("}")

# Dictionary for calculated 'before' differences
print("")
print("before_baseline_dictionary = {")

# Degradation before metrics need the year's degradation total to be defined
degradation_before_entries = [
    f"  '{year_a}_degradation_before_{year_b_plus1}': "
    f"('{year_a}_degradation_total', '{year_a}_degradation_since_{year_b_plus1}'),"
    for year_a, year_b_plus1 in sorted(direct_degradation_pairs)
    if (year_a, f"{year_a}_oldgrowth") in scenario_difference_dictionary
]

# Deforestation before metrics need the year's deforestation total to be defined
deforestation_before_entries = [
    f"  '{year_a}_deforestation_before_{year_b_plus1}': "
    f"('{year_a}_deforestation_total', '{year_a}_deforestation_since_{year_b_plus1}'),"
    for year_a, year_b_plus1 in sorted(direct_deforestation_pairs)
    if (f"{year_a}_oldgrowth", "all_oldgrowth") in scenario_difference_dictionary
]

# Print degradation before metrics if any exist
if degradation_before_entries:
    print("\n".join(["\n# Degradation before metrics (Total - Since)", *degradation_before_entries]))

# Print deforestation before metrics if any exist, separated from the previous section
if deforestation_before_entries:
    if degradation_before_entries:
        print("")
    print("\n".join(["# Deforestation before metrics (Total - Since)", *deforestation_before_entries]))

print("}")

# Dictionary for combined degradation and deforestation calculations
print("")
print("degradation_deforestation_dictionary = {")

# Combined totals: both the degradation and deforestation totals must be defined
combined_total_entries = [
    f"  '{year}_degradation_deforestation_total': "
    f"('{year}_degradation_total', '{year}_deforestation_total'),"
    for year in map(str, years_sorted)
    if (year, f"{year}_oldgrowth") in scenario_difference_dictionary
    and (f"{year}_oldgrowth", "all_oldgrowth") in scenario_difference_dictionary
]

# Combined since metrics: pairs with both a degradation and a deforestation difference
combined_pairs = direct_degradation_pairs & direct_deforestation_pairs
combined_since_entries = [
    f"  '{year_a}_degradation_deforestation_since_{year_b_plus1}': "
    f"('{year_a}_degradation_since_{year_b_plus1}', '{year_a}_deforestation_since_{year_b_plus1}'),"
    for year_a, year_b_plus1 in sorted(combined_pairs)
]

# Combined before metrics: only where both individual 'before' metrics would exist
combined_before_entries = [
    f"  '{year_a}_degradation_deforestation_before_{year_b_plus1}': "
    f"('{year_a}_degradation_before_{year_b_plus1}', '{year_a}_deforestation_before_{year_b_plus1}'),"
    for year_a, year_b_plus1 in sorted(combined_pairs)
    if (year_a, f"{year_a}_oldgrowth") in scenario_difference_dictionary
    and (f"{year_a}_oldgrowth", "all_oldgrowth") in scenario_difference_dictionary
]

# Print combined total entries if any exist
if combined_total_entries:
    print("\n".join(["\n# Combined degradation and deforestation totals", *combined_total_entries]))

# Print combined since entries if any exist, separated from the previous section
if combined_since_entries:
    if combined_total_entries:
        print("")
    print("\n".join(["# Combined degradation and deforestation since", *combined_since_entries]))

# Print combined before entries if any exist, separated from the previous sections
if combined_before_entries:
    if combined_total_entries or combined_since_entries:
        print("")
    print("\n".join(["# Combined degradation and deforestation before", *combined_before_entries]))

print("}")

In [None]:
# Differences in scenario_difference_dictionary and in before_baseline_dictionary are
# calculated by subtracting the second scenario / difference from the first. The
# differences in degradation_deforestation_dictionary are summed.

# Keys are (first scenario, second scenario) tuples and values are the name given to
# the resulting difference. The layout matches the text printed by the previous cell,
# so entries can be pasted from its output and then edited by hand.
scenario_difference_dictionary = {

# Disturbance by 2014, using 1990 as a baseline
  ('2014', '2014_no_degradation_since_1991'):
    '2014_degradation_since_1991',

# Disturbance by 2022, using 2021 as a baseline
  ('2022', '2022_no_degradation_since_2022'):
    '2022_degradation_since_2022',
  ('2022_alternate_degradation_2021', '2021'):
    '2022_deforestation_since_2022',

# Disturbance by 2023, using 2022 as a baseline
  ('2023', '2023_no_degradation_since_2023'):
    '2023_degradation_since_2023',
  ('2023_alternate_degradation_2022', '2022'):
    '2023_deforestation_since_2023',

# Disturbance by 2024, using 1999 as a baseline
  ('2024', '2024_no_degradation_since_2000'):
    '2024_degradation_since_2000',

# Disturbance by 2024, using 2014 as a baseline
  ('2024', '2024_no_degradation_since_2015'):
    '2024_degradation_since_2015',
  ('2024_alternate_degradation_2014', '2014'):
    '2024_deforestation_since_2015',

# Disturbance by 2024, using 2021 as a baseline
  ('2024', '2024_no_degradation_since_2022'):
    '2024_degradation_since_2022',
  ('2024_alternate_degradation_2021', '2021'):
    '2024_deforestation_since_2022',

# Disturbance by 2024, using 2022 as a baseline
  ('2024', '2024_no_degradation_since_2023'):
    '2024_degradation_since_2023',
  ('2024_alternate_degradation_2022', '2022'):
    '2024_deforestation_since_2023',

# Disturbance by 2024, using 2023 as a baseline
  ('2024', '2024_no_degradation_since_2024'):
    '2024_degradation_since_2024',
  ('2024_alternate_degradation_2023', '2023'):
    '2024_deforestation_since_2024',

# Disturbance using oldgrowth as a baseline
  ('2014', '2014_oldgrowth'):
    '2014_degradation_total',
  ('2014_oldgrowth', 'all_oldgrowth'):
    '2014_deforestation_total',

  ('2021', '2021_oldgrowth'):
    '2021_degradation_total',
  ('2021_oldgrowth', 'all_oldgrowth'):
    '2021_deforestation_total',

  ('2022', '2022_oldgrowth'):
    '2022_degradation_total',
  ('2022_oldgrowth', 'all_oldgrowth'):
    '2022_deforestation_total',

  ('2023', '2023_oldgrowth'):
    '2023_degradation_total',
  ('2023_oldgrowth', 'all_oldgrowth'):
    '2023_deforestation_total',

  ('2024', '2024_oldgrowth'):
    '2024_degradation_total',
  ('2024_oldgrowth', 'all_oldgrowth'):
    '2024_deforestation_total',

# Change between single years
  ('2015', '2014'):
    '2015_change_2014',
  ('2016', '2015'):
    '2016_change_2015',
  ('2017', '2016'):
    '2017_change_2016',
  ('2018', '2017'):
    '2018_change_2017',
  ('2019', '2018'):
    '2019_change_2018',
  ('2020', '2019'):
    '2020_change_2019',
  ('2021', '2020'):
    '2021_change_2020',
  ('2022', '2021'):
    '2022_change_2021',
  ('2023', '2022'):
    '2023_change_2022',
  ('2024', '2023'):
    '2024_change_2023',

# Change between multiple years
# Add any other desired year combinations manually using the pattern below
  ('2024', '2014'):
    '2024_change_2014',

}

# Each key names a 'before' difference and each value is the (total, since) pair of
# differences it is derived from, i.e. total minus since. The names used in the
# values are the difference names defined in scenario_difference_dictionary.
before_baseline_dictionary = {

# Degradation before metrics (Total - Since)
  '2014_degradation_before_1991': ('2014_degradation_total', '2014_degradation_since_1991'),
  '2022_degradation_before_2022': ('2022_degradation_total', '2022_degradation_since_2022'),
  '2023_degradation_before_2023': ('2023_degradation_total', '2023_degradation_since_2023'),
  '2024_degradation_before_2000': ('2024_degradation_total', '2024_degradation_since_2000'),
  '2024_degradation_before_2015': ('2024_degradation_total', '2024_degradation_since_2015'),
  '2024_degradation_before_2022': ('2024_degradation_total', '2024_degradation_since_2022'),
  '2024_degradation_before_2023': ('2024_degradation_total', '2024_degradation_since_2023'),
  '2024_degradation_before_2024': ('2024_degradation_total', '2024_degradation_since_2024'),

# Deforestation before metrics (Total - Since)
  '2022_deforestation_before_2022': ('2022_deforestation_total', '2022_deforestation_since_2022'),
  '2023_deforestation_before_2023': ('2023_deforestation_total', '2023_deforestation_since_2023'),
  '2024_deforestation_before_2015': ('2024_deforestation_total', '2024_deforestation_since_2015'),
  '2024_deforestation_before_2022': ('2024_deforestation_total', '2024_deforestation_since_2022'),
  '2024_deforestation_before_2023': ('2024_deforestation_total', '2024_deforestation_since_2023'),
  '2024_deforestation_before_2024': ('2024_deforestation_total', '2024_deforestation_since_2024'),
}

# Each key names a combined disturbance layer and each value is the (degradation,
# deforestation) pair of differences that are summed to produce it. The 'total' and
# 'since' names come from scenario_difference_dictionary, and the 'before' names
# come from before_baseline_dictionary.
degradation_deforestation_dictionary = {

# Combined degradation and deforestation totals
  '2014_degradation_deforestation_total': ('2014_degradation_total', '2014_deforestation_total'),
  '2021_degradation_deforestation_total': ('2021_degradation_total', '2021_deforestation_total'),
  '2022_degradation_deforestation_total': ('2022_degradation_total', '2022_deforestation_total'),
  '2023_degradation_deforestation_total': ('2023_degradation_total', '2023_deforestation_total'),
  '2024_degradation_deforestation_total': ('2024_degradation_total', '2024_deforestation_total'),

# Combined degradation and deforestation since
  '2022_degradation_deforestation_since_2022': ('2022_degradation_since_2022', '2022_deforestation_since_2022'),
  '2023_degradation_deforestation_since_2023': ('2023_degradation_since_2023', '2023_deforestation_since_2023'),
  '2024_degradation_deforestation_since_2015': ('2024_degradation_since_2015', '2024_deforestation_since_2015'),
  '2024_degradation_deforestation_since_2022': ('2024_degradation_since_2022', '2024_deforestation_since_2022'),
  '2024_degradation_deforestation_since_2023': ('2024_degradation_since_2023', '2024_deforestation_since_2023'),
  '2024_degradation_deforestation_since_2024': ('2024_degradation_since_2024', '2024_deforestation_since_2024'),

# Combined degradation and deforestation before
  '2022_degradation_deforestation_before_2022': ('2022_degradation_before_2022', '2022_deforestation_before_2022'),
  '2023_degradation_deforestation_before_2023': ('2023_degradation_before_2023', '2023_deforestation_before_2023'),
  '2024_degradation_deforestation_before_2015': ('2024_degradation_before_2015', '2024_deforestation_before_2015'),
  '2024_degradation_deforestation_before_2022': ('2024_degradation_before_2022', '2024_deforestation_before_2022'),
  '2024_degradation_deforestation_before_2023': ('2024_degradation_before_2023', '2024_deforestation_before_2023'),
  '2024_degradation_deforestation_before_2024': ('2024_degradation_before_2024', '2024_deforestation_before_2024'),
}

In [None]:
# Functions for difference and sum calculations with uncertainty
def subtract_mean_arrays(array1, array2):
    """Return ``array1 - array2`` (element-wise when given NumPy arrays)."""
    difference = array1 - array2
    return difference

def sum_mean_arrays(array1, array2):
    """Return ``array1 + array2`` (element-wise when given NumPy arrays)."""
    total = array1 + array2
    return total

# Uncertainty propagation following Liang et al. (2023)
# Same function used for both subtraction and addition operations
# Based on IPCC and CEOS Land Product Validation Protocol methods
# Represents relative uncertainty propagation for biomass change estimates
def propagate_uncertainty(mean1, uncertainty1, mean2, uncertainty2):
    """Combine the relative uncertainties of two means into one relative uncertainty.

    Each relative uncertainty is multiplied by its mean to give an absolute
    uncertainty. The absolute uncertainties are combined in quadrature and the
    result is divided by ``|mean1 + mean2|``.

    Args:
        mean1: Mean values of the first input.
        uncertainty1: Uncertainty of the first input, relative to ``mean1``.
        mean2: Mean values of the second input.
        uncertainty2: Uncertainty of the second input, relative to ``mean2``.

    Returns:
        The combined relative uncertainty, calculated in float64.
    """
    # Squared absolute uncertainty of each input, forced to float64
    squared_absolute_uncertainties = [
        np.square(np.multiply(mean, uncertainty, dtype='float64'), dtype='float64')
        for mean, uncertainty in ((mean1, uncertainty1), (mean2, uncertainty2))
    ]
    combined_absolute_uncertainty = np.sqrt(
        squared_absolute_uncertainties[0] + squared_absolute_uncertainties[1],
        dtype='float64',
    )
    # Avoid division by zero by substituting machine epsilon for a zero denominator
    combined_magnitude = np.abs(mean1 + mean2)
    safe_magnitude = np.where(combined_magnitude == 0, np.finfo(np.float64).eps, combined_magnitude)
    return combined_absolute_uncertainty / safe_magnitude

# Set up progress tracking
total_operations = len(scenario_difference_dictionary) + len(before_baseline_dictionary) + len(degradation_deforestation_dictionary)
progress_index = 0
progress_label = widgets.Label(f"Calculation progress: {progress_index}/{total_operations}")
display(progress_label)


def _statistic_path(directory, statistic, name):
    """Return the path of a statistic raster inside ``directory``.

    Filenames follow ``<statistic>__<name>__<area>_<model>.tif`` using the
    currently selected scenario iterations area and model.
    """
    return join(directory, f"{statistic}__{name}__{selected_scenario_iterations_area}_{selected_model}.tif")


def _export_combined_statistics(output_name, input1_name, input2_name, input_dir, combine_means):
    """Combine two mean / uncertainty raster pairs and export the result.

    The output mean and uncertainty rasters are written to
    ``dist_predictions_dir``. Nothing is read or written when both outputs
    already exist; when only one exists, only the missing one is exported.

    Args:
        output_name: Name used in the output filenames.
        input1_name, input2_name: Names of the two inputs; each must have a
            'mean' and an 'uncertainty' raster in ``input_dir``.
        input_dir: Directory holding the four input rasters.
        combine_means: Callable applied to the two mean arrays
            (``subtract_mean_arrays`` or ``sum_mean_arrays``).

    Raises:
        FileNotFoundError: If any of the four input rasters is missing.
    """
    mean_path = _statistic_path(dist_predictions_dir, "mean", output_name)
    uncertainty_path = _statistic_path(dist_predictions_dir, "uncertainty", output_name)
    # Skip if both files already exist
    if exists(mean_path) and exists(uncertainty_path):
        return

    mean1_path = _statistic_path(input_dir, "mean", input1_name)
    uncertainty1_path = _statistic_path(input_dir, "uncertainty", input1_name)
    mean2_path = _statistic_path(input_dir, "mean", input2_name)
    uncertainty2_path = _statistic_path(input_dir, "uncertainty", input2_name)
    # Check every input before reading any of them
    # (an explicit raise is used because asserts are stripped under ``python -O``)
    for input_path in (mean1_path, uncertainty1_path, mean2_path, uncertainty2_path):
        if not exists(input_path):
            raise FileNotFoundError(f"{input_path} does not exist.")

    # Read arrays
    mean1 = gdal.Open(mean1_path).ReadAsArray()
    uncertainty1 = gdal.Open(uncertainty1_path).ReadAsArray()
    mean2 = gdal.Open(mean2_path).ReadAsArray()
    uncertainty2 = gdal.Open(uncertainty2_path).ReadAsArray()

    # Handle nodata values: where only one input of a pair is nodata it is
    # replaced by 0, so nodata only remains where both inputs are nodata.
    # The statement order matters, as later lines read the updated arrays.
    mean1 = np.where((mean1 == nodatavalue) & (mean2 != nodatavalue), 0, mean1)
    uncertainty1 = np.where((uncertainty1 == nodatavalue) & (uncertainty2 != nodatavalue), 0, uncertainty1)
    mean2 = np.where((mean2 == nodatavalue) & (mean1 != nodatavalue), 0, mean2)
    uncertainty2 = np.where((uncertainty2 == nodatavalue) & (uncertainty1 != nodatavalue), 0, uncertainty2)

    # Combine the means and propagate the uncertainty, keeping nodata pixels
    result_mean = np.where(mean1 == nodatavalue, nodatavalue, combine_means(mean1, mean2))
    result_uncertainty = np.where(mean1 == nodatavalue, nodatavalue,
                                  propagate_uncertainty(mean1, uncertainty1, mean2, uncertainty2))
    # Set uncertainty to 0 where mean is 0
    result_uncertainty = np.where(result_mean == 0, 0, result_uncertainty)

    # Export results, using the first input mean raster as the template
    if not exists(mean_path):
        export_array_as_tif(result_mean, mean_path, template=mean1_path)
    if not exists(uncertainty_path):
        export_array_as_tif(result_uncertainty, uncertainty_path, template=mean1_path)


# 1. Process direct scenario differences (inputs read from statistics_masked_dir)
for (scenario1, scenario2), difference_name in scenario_difference_dictionary.items():
    _export_combined_statistics(difference_name, scenario1, scenario2, statistics_masked_dir, subtract_mean_arrays)
    # Update progress (also counts operations skipped because outputs exist)
    progress_index += 1
    progress_label.value = f"Calculation progress: {progress_index}/{total_operations}"

# 2. Process before baseline differences (inputs read from dist_predictions_dir,
#    the directory the previous step writes to)
for difference_name, (diff1_name, diff2_name) in before_baseline_dictionary.items():
    _export_combined_statistics(difference_name, diff1_name, diff2_name, dist_predictions_dir, subtract_mean_arrays)
    progress_index += 1
    progress_label.value = f"Calculation progress: {progress_index}/{total_operations}"

# 3. Process combined degradation and deforestation (sums of rasters in dist_predictions_dir)
for difference_name, (diff1_name, diff2_name) in degradation_deforestation_dictionary.items():
    _export_combined_statistics(difference_name, diff1_name, diff2_name, dist_predictions_dir, sum_mean_arrays)
    progress_index += 1
    progress_label.value = f"Calculation progress: {progress_index}/{total_operations}"

print("All calculations completed.")

# Intactness

In [None]:
# Print a ready-to-paste assignment for every scenario area directory.
# Every entry except "model_iterations" is listed, with its first 10
# characters dropped from the printed name.
scenario_subdirs = [entry for entry in os.listdir(uncertainty_selected_model_dir)
                    if entry != "model_iterations"]
for subdir in scenario_subdirs:
  print(f'selected_scenario_iterations_area = "{subdir[10:]}"')
scenario_iterations_area_exists = bool(scenario_subdirs)
if not scenario_iterations_area_exists:
  print("Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "asartr"

# Directories of the selected scenario area; only the intactness one is created here
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir, f"scenarios_{selected_scenario_iterations_area}")
statistics_masked_dir = join(uncertainty_scenario_area_dir, "statistics_masked")
dist_predictions_dir = join(uncertainty_scenario_area_dir, "scenario_disturbance")
intactness_dir = join(uncertainty_scenario_area_dir, 'intactness')
makedirs(intactness_dir, exist_ok=True)

# Select which baseline and disturbance raster to use for calculating intactness
# percentage and relative intactness. Ideally this is the scenario with the least disturbance
# and the difference between that and the current reality.

# Print a ready-to-paste assignment for each file with 'mean' in its name:
# baselines from the masked statistics first, then disturbances.
candidate_sources = (
  ("selected_baseline", statistics_masked_dir),
  ("selected_dist", dist_predictions_dir),
)
for variable_name, raster_dir in candidate_sources:
  for raster_filename in os.listdir(raster_dir):
    if 'mean' not in raster_filename:
      continue
    print(f"{variable_name} = '{raster_filename}'")

In [None]:
# selected_baseline = 'mean__all_oldgrowth__asartr_agbd_historic_250429_223033.tif'
selected_baseline = 'mean__2024_no_degradation_since_2000__asartr_agbd_historic_250429_223033.tif'
# selected_dist = 'mean__2024_degradation_deforestation_total__asartr_agbd_historic_250429_223033.tif'
selected_dist = 'mean__2024_degradation_since_2000__asartr_agbd_historic_250429_223033.tif'

forest_mask_year = '2024'

# Outputs are grouped by the scenario part (second '__' field) of both filenames
base_dist_name = f"{selected_baseline.split('__')[1]}__{selected_dist.split('__')[1]}"
intactness_baseline_dist_dir = join(intactness_dir, base_dist_name)
makedirs(intactness_baseline_dist_dir, exist_ok=True)

percentage_filename = f"percentage_change__{base_dist_name}__{selected_model}.tif"
percentage_path = join(intactness_baseline_dist_dir, percentage_filename)

if not exists(percentage_path):
  # Define filenames and directories
  selected_baseline_path = join(statistics_masked_dir, selected_baseline)
  selected_dist_path = join(dist_predictions_dir, selected_dist)
  selected_mask_path = join(masks_dir, f"mask_forest_{forest_mask_year}.tif")

  # Fail early with a clear message instead of an attribute error on a failed open
  for required_path in (selected_baseline_path, selected_dist_path, selected_mask_path):
    if not exists(required_path):
      raise FileNotFoundError(f"{required_path} does not exist.")

  # Convert to arrays
  selected_baseline_array = gdal.Open(selected_baseline_path).ReadAsArray()
  selected_dist_array = gdal.Open(selected_dist_path).ReadAsArray()
  selected_mask_array = gdal.Open(selected_mask_path).ReadAsArray()

  # A pixel has no percentage when any input is 'nodatavalue', or when the
  # baseline is 0 (the ratio is undefined and would otherwise be inf or NaN)
  invalid_pixels = ((selected_mask_array == nodatavalue)
                    | (selected_baseline_array == nodatavalue)
                    | (selected_dist_array == nodatavalue)
                    | (selected_baseline_array == 0))
  # Divide by 1 at invalid pixels so the division itself never hits a zero;
  # those results are discarded by the np.where below
  safe_baseline_array = np.where(invalid_pixels, 1, selected_baseline_array)
  percentage_array = np.where(invalid_pixels, nodatavalue,
                              selected_dist_array/safe_baseline_array*100)
  export_array_as_tif(percentage_array, percentage_path, template = selected_baseline_path)
  print(f"{percentage_filename} has been exported.")

else: print(f"{percentage_filename} already exists.")

In [None]:
# Use additional polygons for masking relative intactness quantiles
polygons_to_exclude = ['template.gpkg', 'project_area_buffered_bbox.gpkg']

# Select baseline / disturbance pairs to measure relative intactness
# (the loop variable is not called 'dir', which would shadow the builtin
# for the rest of the notebook session)
print("baseline_disturbance_pairs = [")
for pair_dir in os.listdir(intactness_dir):
  print(f"'{pair_dir}',")
print("]\n")

# Select polygons to mask and calculate quantiles
# Excluded polygons and any file with 'inverse' in its name are not offered
print("mask_polygons = [")
for polygon in os.listdir(polygons_dir):
  if polygon not in polygons_to_exclude and 'inverse' not in polygon:
    print(f"'{polygon}',")
# Prints the literal None entry; the rating cell below treats None as 'no polygon mask'
print(None)
print("]")

In [None]:
# Baseline / disturbance pairs to rate (directory names inside intactness_dir)
baseline_disturbance_pairs = [
  'all_oldgrowth__2024_degradation_deforestation_total',
  '2024_no_degradation_since_2000__2024_degradation_since_2000',
]

# Polygons used to mask the percentage raster; None means no polygon mask
mask_polygons = [
  'project_area.gpkg',
  # 'gedi_area.gpkg',
  # 'peninsular_malaysia.gpkg',
  # 'lu_oldgrowth.gpkg',
  'asartr_phase_2.gpkg',
  'intactness_wo_tn.gpkg',
  None,
]

# Define number of quantiles for intactness rating (e.g. 10 for 1 - 10)
num_quantiles = 10


# Make sure every mask polygon has an "<name>_inverse.gpkg" companion:
# the template polygon with the mask polygon cut out of it.
for mask_polygon in mask_polygons:
  # Nothing to invert for the unmasked case
  if mask_polygon is None:
    continue

  template_polygon_path = join(polygons_dir, "template.gpkg")
  # [:-5] drops the last 5 characters of the filename (".gpkg" for the entries above)
  inverse_polygon_path = join(polygons_dir, f"{mask_polygon[:-5]}_inverse.gpkg")
  if exists(inverse_polygon_path):
    print(f"An inverse masking polygon for {mask_polygon} already exists.")
    continue

  template_gdf = gpd.read_file(template_polygon_path)
  mask_gdf = gpd.read_file(join(polygons_dir, mask_polygon))
  mask_epsg = mask_gdf.crs.to_epsg()
  # NOTE(review): the difference is taken row by row and only row 0 is kept -
  # presumably both files hold a single feature in the same CRS; confirm.
  inverse_geometry = template_gdf['geometry'].difference(mask_gdf['geometry']).iloc[0]
  inverse_gdf = gpd.GeoDataFrame({'geometry': [inverse_geometry]}, crs=f"EPSG:{mask_epsg}")
  inverse_gdf.to_file(inverse_polygon_path, driver="GPKG")
  print(f"An inverse masking polygon for {mask_polygon} has been created in {polygons_dir}.")

# For every baseline / disturbance pair and every mask polygon: mask the
# percentage raster (if a polygon is given), rate it into quantile classes and
# export the rated raster, a CSV of the class bounds and a histogram.
for base_dist_name in baseline_disturbance_pairs:
  intactness_baseline_dist_dir = join(intactness_dir, base_dist_name)
  percentage_filename = f"percentage_change__{base_dist_name}__{selected_model}"
  percentage_path = join(intactness_baseline_dist_dir, f"{percentage_filename}.tif")

  for mask_polygon in mask_polygons:

    if mask_polygon is not None:
      # Copy the percentage raster for potential masking
      percentage_masked_filename = f"{percentage_filename}__masked_{mask_polygon[:-5]}.tif"
      percentage_masked_path = join(intactness_baseline_dist_dir, percentage_masked_filename)
      if not exists(percentage_masked_path):
        print(f"Copying {percentage_filename} for masking...")
        copyfile(percentage_path, percentage_masked_path)
        print(f"Masking {percentage_filename} with {mask_polygon}...")
        # Burning the inverse polygon sets everything outside the mask polygon to nodata
        inverse_polygon_path = join(polygons_dir, f"{mask_polygon[:-5]}_inverse.gpkg")
        burn_polygon_to_raster(percentage_masked_path, inverse_polygon_path, fixed_value=nodatavalue, all_touched=False)
        # Recompress the prediction after burning the polygon masks
        percentage_masked_array = gdal.Open(percentage_masked_path).ReadAsArray()
        export_array_as_tif(percentage_masked_array, percentage_masked_path, compress = True)
        print(f"{percentage_filename} masked.")
      else: print(f"{percentage_masked_filename} already exists.")

    # Define paths and arrays
    if mask_polygon is None: relative_intactness_name = f'intactness__{num_quantiles}_quantiles__{base_dist_name}__{selected_model}'
    else: relative_intactness_name = f'intactness__{mask_polygon[:-5]}_{num_quantiles}_quantiles__{base_dist_name}__{selected_model}'
    relative_intactness_path = join(intactness_baseline_dist_dir, f'{relative_intactness_name}.tif')
    if exists(relative_intactness_path):
      print(f"{relative_intactness_name} already exists.")
      continue

    if mask_polygon is None: percentage_array = gdal.Open(percentage_path).ReadAsArray()
    else: percentage_array = gdal.Open(percentage_masked_path).ReadAsArray()
    relative_intactness_array = np.empty_like(percentage_array, dtype=object)

    # Set all values above 0 to 0, assuming negative values are not intact
    percentage_array[percentage_array > 0] = 0

    # Separate valid and invalid elements. Besides nodatavalue, NaN / inf
    # pixels are treated as invalid: they would otherwise turn the quantiles
    # into NaN and make the histogram fail.
    invalid_elements = (percentage_array == nodatavalue) | ~np.isfinite(percentage_array)
    valid_elements = percentage_array[~invalid_elements]

    # Calculate the inner quantile edges for valid elements
    if valid_elements.size > 0:
      quantiles = np.percentile(valid_elements, np.linspace(0, 100, num_quantiles + 1)[1:-1])
    else:
      quantiles = []

    # Lower / upper bound of each class, computed once and reused for the
    # raster and the CSV. The outermost classes are open-ended.
    quantile_bounds = []
    for i in range(1, num_quantiles + 1):
      lower_bound = quantiles[i-2] if i > 1 and len(quantiles) >= i-1 else float('-inf')
      upper_bound = quantiles[i-1] if len(quantiles) >= i else float('inf')
      quantile_bounds.append((lower_bound, upper_bound))

    # Rate each pixel by the class whose (lower, upper] interval contains it
    for score, (lower_bound, upper_bound) in enumerate(quantile_bounds, start=1):
      relative_intactness_array[(percentage_array > lower_bound) & (percentage_array <= upper_bound)] = score
    # Applied once after rating, so these always override the class scores
    relative_intactness_array[invalid_elements] = nodatavalue
    # Set all perfectly intact pixels (0 % change) to max score
    relative_intactness_array[percentage_array == 0] = num_quantiles
    export_array_as_tif(relative_intactness_array, relative_intactness_path)

    # Create DataFrame of the class bounds and save to CSV
    ranges_data = {
      'Lower_Bound': [lower_bound for lower_bound, _ in quantile_bounds],
      'Upper_Bound': [upper_bound for _, upper_bound in quantile_bounds],
    }
    relative_intactness_df = pd.DataFrame(ranges_data)
    relative_intactness_csv_path = os.path.join(intactness_baseline_dist_dir, f'{relative_intactness_name}.csv')
    relative_intactness_df.to_csv(relative_intactness_csv_path, index=False)

    # Generate and save histogram of the valid percentage values as .png
    histogram_path = join(intactness_baseline_dist_dir, f'{relative_intactness_name}.png')
    plt.figure()
    plt.hist(valid_elements, bins='auto')
    plt.title(f'{relative_intactness_name} Histogram')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.savefig(histogram_path)
    plt.close()

# Disconnect runtime

In [None]:
# Useful for stopping background execution upon completion
# (`runtime` is imported from google.colab at the top of the notebook;
# run this cell last, after every calculation cell has finished)
runtime.unassign()