<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/7_uncertainty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and directories

In [None]:
# Base directory for every input and output of the workflow.
# Personal drive: begin with '/content/drive/MyDrive/'.
# Shared drive (must be created first): begin with '/gdrive/Shareddrives/'.

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive at the mount point implied by the base_dir prefix
from google.colab import drive
import os
import sys
_on_shared_drive = base_dir.startswith('/gdrive/Shareddrives/')
_on_personal_drive = base_dir.startswith('/content/drive/MyDrive/')
if _on_shared_drive:
  drive.mount('/gdrive', force_remount=True)
elif _on_personal_drive:
  drive.mount('/content/drive', force_remount=True)
  # Only the personal drive folder is created automatically
  os.makedirs(base_dir, exist_ok=True)
else:
  print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Append the resolved base directory to the import path (once)
_path_to_add = os.path.realpath(base_dir)
if not (_path_to_add in sys.path):
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs and upgrades
!pip install geopandas
!pip install xgboost

In [None]:
# Imports
try: import cupy # Only works on GPU runtime
except: None
import ast
from concurrent.futures import ThreadPoolExecutor
import gc
import geopandas as gpd
from google.colab import runtime
import ipywidgets as widgets
import json
import numpy as np
from numpy import random
from numpy.random import normal
from os import makedirs
from os.path import join, exists, basename
from osgeo import gdal
gdal.UseExceptions()
import pickle
import pandas as pd
import scipy.stats as st
import shutil
from shutil import copyfile
import xgboost as xgb

In [None]:
# Top-level workflow directories under the base directory
areas_dir = join(base_dir, "1_areas")
feature_dir = join(base_dir, "3_features")
models_dir = join(base_dir, "5_models")
scenarios_dir = join(base_dir, "6_scenarios")
uncertainty_dir = join(base_dir, "7_uncertainty")

# Subdirectories of the above
polygons_dir = join(areas_dir, "polygons")
feature_final_dir = join(feature_dir, "final")
masks_dir = join(scenarios_dir, "scenario_masks")

# Only this notebook's own directory is created here
makedirs(uncertainty_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -11111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    """Write an array to a single-band GeoTIFF georeferenced like a template raster.

    input_array: array written to band 1; an 'int16' array forces Int16 output.
    output_tif: path of the GeoTIFF to create.
    template: raster supplying the output size, geotransform and projection.
    nodatavalue: value registered as nodata on the output band.
    compress: if True, ZSTD level 1 compression is used.
    dtype: GDAL data type used unless the array dtype is int16.
    """
    reference_ds = gdal.Open(template)
    reference_band = reference_ds.GetRasterBand(1)
    geotransform = reference_ds.GetGeoTransform()
    projection = reference_ds.GetProjection()
    # ZSTD level 1: good speed / size ratio
    creation_options = ['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'] if compress else []
    if input_array.dtype == 'int16':
        dtype = gdal.GDT_Int16
    output_ds = gdal.GetDriverByName("GTiff").Create(
        output_tif, reference_band.XSize, reference_band.YSize, 1, dtype, options=creation_options)
    output_band = output_ds.GetRasterBand(1)
    output_band.WriteArray(input_array)
    output_band.SetNoDataValue(nodatavalue)
    output_ds.SetGeoTransform(geotransform)
    output_ds.SetProjection(projection)
    # Drop the band and dataset references so GDAL can close the files
    output_band = reference_band = None
    reference_ds = output_ds = None

# Select model

In [None]:
# List the models available as the baseline. A model must be tested and
# trained in advance, and its target must have an uncertainty metric -
# otherwise skip to the next notebook '8_statistics' and use the outputs
# of the '6_scenarios' notebook.
model_subdirs = [subdir for subdir, _dirs, files in os.walk(models_dir) if 'model.json' in files]
for subdir in model_subdirs:
  # Print a ready-to-paste assignment using the path relative to models_dir
  print(f'selected_model = "{subdir.split(f"{models_dir}/",1)[1]}"')
model_exists = len(model_subdirs) > 0
if not model_exists:
  print("No model exists.")

In [None]:
selected_model = "agbd_251203_161707"

# This must be True when using AlphaEarth features.
alpha_earth = False

# Paths inside the selected model directory
selected_model_dir = join(models_dir,selected_model)
selected_model_json = join(selected_model_dir, "model.json")
selected_model_descr_dir = join(selected_model_dir, "model_description.json")
selected_model_dataset_path = join(selected_model_dir, f"{selected_model}.pkl")
# NOTE: unpickling executes arbitrary code; only load datasets from a trusted source
selected_model_dataset = pd.read_pickle(selected_model_dataset_path)

# Load the dataset description and unpack its attributes
model_dataset_description_path = join(selected_model_dir,"model_dataset_description.json")
with open(model_dataset_description_path) as model_dataset_description_json:
  model_dataset_description = json.load(model_dataset_description_json)
model_dataset_name = model_dataset_description["model_dataset_name"]
number_of_columns = model_dataset_description["number_of_columns"]
number_of_rows = model_dataset_description["number_of_rows"]
id_column = model_dataset_description["id_column"]
selected_target = model_dataset_description["selected_target"]
uncertainty = model_dataset_description["uncertainty"]
covariates_renamed = model_dataset_description["covariates_renamed"]
# Model features are the selected features followed by the renamed covariates
selected_features = model_dataset_description["selected_features"] + covariates_renamed
categorical_features_mappings = model_dataset_description["categorical_features_mappings"]
categorical_columns = model_dataset_description["categorical_columns"]
descriptive_parameters = model_dataset_description["descriptive_parameters"]
sample_imported_dataset = model_dataset_description["sample_imported_dataset"]
sample_imported_dataset_by_percent = model_dataset_description["sample_imported_dataset_by_percent"]
sample_imported_dataset_value = model_dataset_description["sample_imported_dataset_value"]

# Reload hyperparameters (stored as the string form of a dict)
with open(selected_model_descr_dir) as model_description_json:
  model_description = json.load(model_description_json)
final_hyperparameters = ast.literal_eval(model_description["hyperparameters"])

# Remove early stopping and replace with the rounded mean n_estimators
if "early_stopping_rounds" in final_hyperparameters:
  del final_hyperparameters["early_stopping_rounds"]
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Scenario inputs of the selected model
scenarios_model_dir = join(scenarios_dir,selected_model)
features_dir = join(scenarios_model_dir, "features")
tile_templates_dir = join(scenarios_model_dir, 'tile_templates')
tile_feature_stacks_dir = join(scenarios_model_dir, 'tile_feature_stacks')

# Uncertainty outputs of the selected model
uncertainty_selected_model_dir = join(uncertainty_dir, selected_model)
model_iterations_dir = join(uncertainty_selected_model_dir, "model_iterations")
scenario_iterations_dir = join(uncertainty_selected_model_dir, "scenario_iterations")
tile_prediction_cache_dir = join(uncertainty_selected_model_dir, "tile_prediction_cache")
predictions_dir = join(uncertainty_selected_model_dir, "uncertainty_predictions")

# Create the output directories
for _dir_to_create in (uncertainty_selected_model_dir, model_iterations_dir, scenario_iterations_dir,
                       tile_prediction_cache_dir, predictions_dir):
  makedirs(_dir_to_create, exist_ok=True)

# Model iterations

In [None]:
# Verify that the target is equal to the mean
print(f'mean = "{selected_target}"')

# The target may be stored as one column name or as a list of names.
# Normalise it to a list so the exclusion below is an exact name match
# rather than a substring test against a string.
_target_names = [selected_target] if isinstance(selected_target, str) else list(selected_target)

# Calculate se from columns flagged 'uncertainty'
if not uncertainty:
  print("There are no flagged uncertainty columns to calculate SE from.")
  print("Manually create the metric from the available columns.")
  for col in selected_model_dataset.columns:
    print(f"{col}")
else:
  # Print a ready-to-paste assignment for every flagged column that is not the target
  for col in selected_model_dataset.columns:
    if col in uncertainty and col not in _target_names:
      print(f'se = "{col}"')

In [None]:
mean = "tar_agbd"
se = "tar_agbd_se"
# GEDI L4A agbd_se represents the prediction standard error incorporating both model
# parameter uncertainty and residual variance (GEDI L4A ATBD Eq. 9, Kellner et al. 2021).
# For Monte Carlo uncertainty propagation, agbd_se directly parameterises the standard
# deviation of the prediction error distribution for each footprint.

# Set model iterations
model_iterations = 10

# The features stay fixed; the y axis is resampled for each iteration
# from the mean and se arrays
model_dataset_x = selected_model_dataset[selected_features].copy()
for col in categorical_columns:
    if col not in model_dataset_x.columns:
        continue
    model_dataset_x[col] = model_dataset_x[col].astype('category')
mean_array = selected_model_dataset[mean].values
se_array = selected_model_dataset[se].values

# Detect model type from existing model or determine from target
existing_model_path = join(model_iterations_dir, "model_iteration_1.json")
if exists(existing_model_path):
    # Read the objective and class count from the saved booster configuration
    temp_booster = xgb.Booster()
    temp_booster.load_model(existing_model_path)
    model_config = json.loads(temp_booster.save_config())
    learner_config = model_config['learner']

    objective_name = learner_config['objective']['name']
    num_class = int(learner_config['learner_model_param'].get('num_class', '0'))
    classification = any(obj_type in objective_name for obj_type in ['logistic', 'softprob', 'softmax'])
    multiclass = classification and num_class > 2
else:
    # No saved iteration yet: at most 10 distinct integer-valued targets
    # is treated as classification, anything else as regression
    unique_targets = np.unique(mean_array)
    unique_values = len(unique_targets)
    classification = unique_values <= 10 and all(val == int(val) for val in unique_targets)
    multiclass = classification and unique_values > 2
    num_class = unique_values if multiclass else 0

# Set model type
estimator_class = xgb.XGBClassifier if classification else xgb.XGBRegressor
XGBPredictor = estimator_class(**final_hyperparameters)
if not classification: print("Model type: Regression")
elif multiclass: print(f"Model type: Multiclass classification ({num_class} classes)")
else: print("Model type: Binary classification")

model_params = XGBPredictor.get_params()
model_params['eval_metric'] = model_description['metric_used_for_training']
# Default fix for new XGBoost version
for _unused_param in ('n_estimators', 'enable_categorical', 'missing'):
    model_params.pop(_unused_param, None)

# Progress label
model_progress_index = 0
model_progress_label = widgets.Label(f"Model iteration: {model_progress_index}/{model_iterations}")
display(model_progress_label)

for model_iteration in range(1,model_iterations+1):
  model_iteration_filename = f"model_iteration_{model_iteration}.json"
  model_iteration_path = join(model_iterations_dir, model_iteration_filename)
  # Saved iterations are kept; only missing ones are trained
  if not exists(model_iteration_path):
    # Seed with the iteration number for replicability
    np.random.seed(model_iteration)
    # This iteration's y is one normal sample per row (mean, se)
    model_dataset_y = normal(mean_array, se_array)
    model_dtrain = xgb.DMatrix(model_dataset_x, label=model_dataset_y, enable_categorical=True)
    # Train with the tested hyperparameters and save the iteration
    model = xgb.train(model_params,
                      model_dtrain,
                      num_boost_round=final_hyperparameters['n_estimators'],
                      verbose_eval=True)
    model.save_model(model_iteration_path)
  # Update progress (iterations are numbered from 1)
  model_progress_index = model_iteration
  model_progress_label.value = f"Model iteration: {model_progress_index}/{model_iterations}"
print("All model iterations have been trained and saved.")

# Scenario iterations

In [None]:
# Scenarios must be designed and tested using the '6_scenarios' notebook first.

# Check existing tile parameters
template_tile_list = [f for f in os.listdir(tile_templates_dir) if f.endswith('.tif') and f.startswith('template_tile')]
n_tiles = len(template_tile_list)
assert n_tiles > 0, "There are no template tiles. Run the template tiles section, even if only one is created."
if n_tiles == 1: print("# There is 1 template tile.\n")
if n_tiles > 1: print(f"# There are {n_tiles} template tiles.\n")

# Collect available scenarios from the feature stack tiles directory.
# A scenario is only listed when it has one feature stack per template tile.
scenario_stacks_list = []
for scenario in os.listdir(tile_feature_stacks_dir):
    scenario_stack_dir = join(tile_feature_stacks_dir, scenario)
    # Entries that are not directories cannot be scenarios, and listing them would raise
    if not os.path.isdir(scenario_stack_dir):
        continue
    stack_files = [f for f in os.listdir(scenario_stack_dir) if f.startswith('feature_stack_')]
    if len(stack_files) == n_tiles:
        scenario_stacks_list.append(scenario)

# Print a ready-to-paste list for the next cell
print("scenarios_to_predict = [")
for scenario in sorted(scenario_stacks_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# There is 1 template tile.

# Scenarios predicted with every model iteration by the prediction loop below.
# The entries match the list format printed by the previous cell; remove any
# scenario that should not be predicted.
scenarios_to_predict = [
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_no_disturbance_since_1993",
  "2021_no_disturbance_since_oldgrowth_1",
  "2021_no_disturbance_since_oldgrowth_2",
  "2021_oldgrowth_recovery_1",
  "2021_oldgrowth_recovery_2",
  "2022",
  "2023",
  "2024",
  "2024_no_disturbance_since_1996",
  "2024_no_disturbance_since_1997",
  "2024_no_disturbance_since_1998",
  "2024_no_disturbance_since_1999",
  "2024_no_disturbance_since_2000",
  "2024_no_disturbance_since_2001",
  "2024_no_disturbance_since_2002",
  "2024_no_disturbance_since_2003",
  "2024_no_disturbance_since_2004",
  "2024_no_disturbance_since_2005",
  "2024_no_disturbance_since_2006",
  "2024_no_disturbance_since_2007",
  "2024_no_disturbance_since_2008",
  "2024_no_disturbance_since_2009",
  "2024_no_disturbance_since_2010",
  "2024_no_disturbance_since_2011",
  "2024_no_disturbance_since_2012",
  "2024_no_disturbance_since_2013",
  "2024_no_disturbance_since_2014",
  "2024_no_disturbance_since_2015",
  "2024_no_disturbance_since_2016",
  "2024_no_disturbance_since_2017",
  "2024_no_disturbance_since_2018",
  "2024_no_disturbance_since_2019",
  "2024_no_disturbance_since_2020",
  "2024_no_disturbance_since_2021",
  "2024_no_disturbance_since_2022",
  "2024_no_disturbance_since_2023",
  "2024_no_disturbance_since_2024",
  "2024_no_disturbance_since_oldgrowth_1",
  "2024_no_disturbance_since_oldgrowth_2",
  "2024_oldgrowth_recovery_1",
  "2024_oldgrowth_recovery_2",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
]

In [None]:
# Set the number of scenario iterations. It must be <= the number of model iterations available.
scenario_iterations = 10

# Assert the number of scenario iterations is <= the number of model iterations available.
model_iterations_available = len([f for f in os.listdir(model_iterations_dir) if f.startswith('model_iteration_') and f.endswith('.json')])
assert scenario_iterations <= model_iterations_available, f"Reduce the number of scenario iterations to <= {model_iterations_available}."
# The prediction loop loads model_iteration_1 ... model_iteration_N by name,
# so a sufficient file count is not enough: each of those files must exist.
missing_model_iterations = [i for i in range(1, scenario_iterations + 1)
                            if not exists(join(model_iterations_dir, f"model_iteration_{i}.json"))]
assert not missing_model_iterations, f"Model iterations {missing_model_iterations} are missing. Train them in the model iterations section first."

# Prediction raster precision. GEDI AGBD can be set to 0, any higher is
# spurious due to the wide prediction intervals of the source data.
raster_precision = 0

# Probabilities instead of classes IF binary classification
predict_probabilities = False

# Classification threshold IF binary classification
classification_threshold = 0.5

# Detect GPU availability and set predictor type
try:
    test_array = cupy.array([1, 2, 3])
    del test_array
    predictor_type = 'gpu_predictor'
    gpu_id, use_gpu = 0, True
    print("GPU detected and accessible - using GPU to load the feature stack and to predict.")
except Exception:
    # Any failure here means CPU prediction. This includes a NameError when the
    # optional cupy import did not succeed. KeyboardInterrupt is not swallowed.
    predictor_type = 'cpu_predictor'
    gpu_id, use_gpu = -1, False
    print("GPU not accessible - using CPU prediction")
xgb.set_config(verbosity=0, use_rmm=use_gpu)

# Detect model type using first model iteration
first_model_path = join(model_iterations_dir, "model_iteration_1.json")
booster = xgb.Booster()
booster.load_model(first_model_path)
model_config = json.loads(booster.save_config())
objective_name = model_config['learner']['objective']['name']
num_class = int(model_config['learner']['learner_model_param'].get('num_class', '0'))
classification = any(obj_type in objective_name for obj_type in ['logistic', 'softprob', 'softmax'])
multiclass = classification and num_class > 2
if classification and multiclass: print(f"Model type: Multiclass classification ({num_class} classes)")
elif classification: print("Model type: Binary classification")
else: print("Model type: Regression")
booster = None

# Check existing tile parameters
template_tile_list = [f for f in os.listdir(tile_templates_dir) if f.endswith('.tif') and f.startswith('template_tile')]
n_tiles = len(template_tile_list)
if n_tiles < 1: print("There are currently no template tiles.")
else:
  # Width of the first template tile
  template_tile = gdal.Open(join(tile_templates_dir, 'template_tile_1.tif'))
  template_tile_x = template_tile.GetRasterBand(1).XSize
  template_tile = None
  print(f"There are {n_tiles} template tiles.")

# Template raster used to georeference the full-extent prediction rasters.
# NOTE(review): 'model_scenario_features_dirs' and 'covariates' are not defined in
# any cell above, so the alpha_earth branch raises NameError unless they are
# defined elsewhere - confirm before setting alpha_earth = True.
if alpha_earth: template_base_path = next(r for r in model_scenario_features_dirs if all(c not in r for c in covariates))
else: template_base_path = template_tif_path

# Progress labels
n_scenarios = len(scenarios_to_predict)
scenario_progress_label = widgets.Label(value=f"Scenario progress: 0 / {n_scenarios}")
tile_progress_label = widgets.Label(value="Tile progress: -")
iteration_progress_label = widgets.Label(value="Iteration progress: -")
display(scenario_progress_label, tile_progress_label, iteration_progress_label)

# Load one saved model iteration into the sklearn wrapper matching the model type
def _load_model_iteration(model_path, on_gpu):
  predictor = xgb.XGBClassifier() if classification else xgb.XGBRegressor()
  predictor.load_model(model_path)
  predictor.set_params(predictor='gpu_predictor' if on_gpu else 'cpu_predictor')
  if on_gpu: predictor.set_params(device='cuda:0')
  return predictor

# Predict the valid pixels of a feature stack with one model iteration.
# Binary classification returns the positive class probability or the
# thresholded class; multiclass returns the class index; regression the value.
def _predict_valid_pixels(predictor, features):
  try:
    if classification and not multiclass:
      positive_proba = predictor.predict_proba(features)[:, 1]
      if predict_probabilities: return positive_proba
      return (positive_proba > classification_threshold).astype(int)
    prediction = predictor.predict(features)
    if classification:
      if prediction.ndim > 1 and prediction.shape[1] > 1: prediction = np.argmax(prediction, axis=1)
      prediction = prediction.astype(int)
    return prediction
  except Exception as e:
    # Terminate the runtime if GPU prediction runs out of memory
    if "Memory allocation error" in str(e) or "Out of memory" in str(e):
      print("GPU memory insufficient for prediction. Terminating runtime to save compute units, restart with TPU.")
      runtime.unassign()
    # Always re-raise so the caller never continues without a prediction
    raise

# Build a (n_rows, n_cols) raster filled with nodata and place the rounded
# predictions at the valid flat indices (C-order). prediction=None gives an
# all-nodata raster for tiles without valid pixels.
def _build_prediction_raster(prediction, valid_indices, n_rows, n_cols):
  if raster_precision == 0:
    raster_flat = np.full(n_rows * n_cols, nodatavalue, dtype=np.int16)
    if prediction is not None: raster_flat[valid_indices] = np.round(prediction).astype(np.int16)
  else:
    raster_flat = np.full(n_rows * n_cols, nodatavalue, dtype=np.float32)
    if prediction is not None: raster_flat[valid_indices] = np.round(prediction, raster_precision)
  return raster_flat.reshape((n_rows, n_cols), order='C')

# Read the cached tile rasters in the given order and stack them vertically.
# Stacking once keeps the tile dtype (int16 when raster_precision is 0) and
# avoids re-copying the growing array for every tile.
def _merge_cached_tiles(tile_cache_dir, tile_files):
  tile_arrays = []
  for tile_file in tile_files:
    tile = gdal.Open(join(tile_cache_dir, tile_file))
    tile_arrays.append(tile.ReadAsArray())
    tile = None
  return np.vstack(tile_arrays)

# Loop through each scenario
for scenario_index, scenario in enumerate(scenarios_to_predict):
  scenario_feature_stack_dir = join(tile_feature_stacks_dir, scenario)
  # Create scenario iterations directory
  iterations_dir = join(scenario_iterations_dir, f"{scenario}_iterations")
  makedirs(iterations_dir, exist_ok=True)

  # Check if all scenario iterations already exist
  scenario_iteration_list = [join(iterations_dir, f"{scenario}__{selected_model}_iteration_{model_iteration}.tif")
                             for model_iteration in range(1, scenario_iterations + 1)]
  all_scenario_iterations_exist = all(exists(p) for p in scenario_iteration_list)

  if not all_scenario_iterations_exist:
    n_stacks = len([f for f in os.listdir(scenario_feature_stack_dir) if f.startswith('feature_stack_')])

    if n_stacks == 0:
      print(f"Warning: no feature stacks found for scenario {scenario}. Nothing predicted.")

    # Single-stack prediction
    elif n_stacks == 1:
      tile_progress_label.value = "Tile progress: 0 / 1"
      iteration_progress_label.value = f"Iteration progress: 0 / {scenario_iterations}"

      # Load template parameters
      template_tile_path = join(tile_templates_dir, "template_tile_1.tif")
      template_tile = gdal.Open(template_tile_path)
      template_tile_y = template_tile.GetRasterBand(1).YSize
      template_tile_x = template_tile.GetRasterBand(1).XSize
      template_tile = None
      # Load feature stack and valid indices
      stack_path = join(scenario_feature_stack_dir, f"feature_stack_{scenario}_1.npy")
      indices_path = join(scenario_feature_stack_dir, f"valid_indices_{scenario}_1.npy")
      feature_stack = np.load(stack_path)
      valid_indices = np.load(indices_path)
      n_valid = len(valid_indices)
      # Load to GPU if available and valid pixels exist
      if n_valid > 0 and use_gpu:
        try:
          feature_stack = cupy.asarray(feature_stack)
        except Exception as e:
          if "Memory allocation error" in str(e) or "Out of memory" in str(e):
            print("GPU memory insufficient, switching to CPU.")
            cupy.get_default_memory_pool().free_all_blocks()
            gc.collect()
            # The switch to CPU is kept for all following scenarios
            use_gpu = False
            predictor_type = 'cpu_predictor'
          else: raise

      # Predict all iterations using loaded stack
      for model_iteration in range(1, scenario_iterations + 1):
        model_path = join(model_iterations_dir, f"model_iteration_{model_iteration}.json")
        prediction_iteration_filename = f"{scenario}__{selected_model}_iteration_{model_iteration}.tif"
        prediction_iteration_path = join(iterations_dir, prediction_iteration_filename)

        if not exists(prediction_iteration_path):
          if n_valid == 0:
            # Empty stack: export an all-nodata raster, no model needed
            prediction = None
          else:
            XGBPredictor = _load_model_iteration(model_path, use_gpu)
            prediction = _predict_valid_pixels(XGBPredictor, feature_stack)
          prediction_array = _build_prediction_raster(prediction, valid_indices, template_tile_y, template_tile_x)
          prediction = None
          export_array_as_tif(prediction_array, prediction_iteration_path, template=template_base_path, compress=True)
          prediction_array = None

        iteration_progress_label.value = f"Iteration progress: {model_iteration} / {scenario_iterations}"
      # Clean up single-stack feature stack from memory
      feature_stack = valid_indices = None
      tile_progress_label.value = "Tile progress: 1 / 1"

    # Tiled prediction - load each stack once, predict all iterations per tile
    else:
      # Create tile cache directories for all iterations
      tile_cache_dirs = {}
      for model_iteration in range(1, scenario_iterations + 1):
        prediction_iteration_filename = f"{scenario}__{selected_model}_iteration_{model_iteration}.tif"
        tile_cache_iteration_dir = join(tile_prediction_cache_dir, prediction_iteration_filename[:-4])
        makedirs(tile_cache_iteration_dir, exist_ok=True)
        tile_cache_dirs[model_iteration] = tile_cache_iteration_dir

      # Process each tile: load stack once, predict all iterations
      for tile_count in range(1, n_stacks + 1):
        tile_filename = f"scenario_tile_{tile_count}.tif"
        # Determine which iterations still need this tile
        iterations_needing_tile = []
        for model_iteration in range(1, scenario_iterations + 1):
          iteration_tile_path = join(tile_cache_dirs[model_iteration], tile_filename)
          prediction_iteration_path = join(iterations_dir, f"{scenario}__{selected_model}_iteration_{model_iteration}.tif")
          if not exists(prediction_iteration_path) and not exists(iteration_tile_path):
            iterations_needing_tile.append(model_iteration)

        # Skip tile if no iterations need it
        n_iterations_for_tile = len(iterations_needing_tile)
        if n_iterations_for_tile == 0:
          iteration_progress_label.value = f"Iteration progress: {scenario_iterations} / {scenario_iterations}"
          tile_progress_label.value = f"Tile progress: {tile_count} / {n_stacks}"
          continue
        iteration_progress_label.value = f"Iteration progress: 0 / {n_iterations_for_tile}"

        # Load template tile parameters
        template_tile_path = join(tile_templates_dir, f"template_tile_{tile_count}.tif")
        template_tile = gdal.Open(template_tile_path)
        template_tile_y = template_tile.GetRasterBand(1).YSize
        template_tile_x = template_tile.GetRasterBand(1).XSize
        template_tile = None

        # Load feature stack and valid indices once per tile
        stack_path = join(scenario_feature_stack_dir, f"feature_stack_{scenario}_{tile_count}.npy")
        indices_path = join(scenario_feature_stack_dir, f"valid_indices_{scenario}_{tile_count}.npy")
        if not exists(stack_path):
          print(f"Warning: {basename(stack_path)} not found. Skipping tile {tile_count}.")
          continue
        feature_stack = np.load(stack_path)
        valid_indices = np.load(indices_path)
        n_valid = len(valid_indices)

        # Handle empty tiles for all needed iterations
        if n_valid == 0:
          prediction_tile = _build_prediction_raster(None, valid_indices, template_tile_y, template_tile_x)
          for tile_iteration_index, model_iteration in enumerate(iterations_needing_tile, start=1):
            iteration_tile_path = join(tile_cache_dirs[model_iteration], tile_filename)
            export_array_as_tif(prediction_tile, iteration_tile_path, template=template_tile_path, compress=False)
            iteration_progress_label.value = f"Iteration progress: {tile_iteration_index} / {n_iterations_for_tile}"
          prediction_tile = None
          feature_stack = valid_indices = None
          tile_progress_label.value = f"Tile progress: {tile_count} / {n_stacks}"
          continue

        # Load to GPU if available; a failure only moves this tile to the CPU
        tile_use_gpu = use_gpu
        if use_gpu:
          try: feature_stack = cupy.asarray(feature_stack)
          except Exception as e:
            if "Memory allocation error" in str(e) or "Out of memory" in str(e):
              print(f"GPU memory insufficient for tile {tile_count}, using CPU.")
              cupy.get_default_memory_pool().free_all_blocks()
              gc.collect()
              tile_use_gpu = False
            else: raise

        # Predict all needed iterations for this tile
        for tile_iteration_index, model_iteration in enumerate(iterations_needing_tile, start=1):
          iteration_tile_path = join(tile_cache_dirs[model_iteration], tile_filename)
          model_path = join(model_iterations_dir, f"model_iteration_{model_iteration}.json")
          XGBPredictor = _load_model_iteration(model_path, tile_use_gpu)
          prediction = _predict_valid_pixels(XGBPredictor, feature_stack)
          prediction_tile = _build_prediction_raster(prediction, valid_indices, template_tile_y, template_tile_x)
          prediction = None
          export_array_as_tif(prediction_tile, iteration_tile_path, template=template_tile_path, compress=False)
          prediction_tile = None

          iteration_progress_label.value = f"Iteration progress: {tile_iteration_index} / {n_iterations_for_tile}"

        # Release tile resources
        feature_stack = valid_indices = None
        tile_progress_label.value = f"Tile progress: {tile_count} / {n_stacks}"

      # Merge tiles for each iteration
      tile_progress_label.value = "Tile progress: merging"
      for model_iteration in range(1, scenario_iterations + 1):
        prediction_iteration_filename = f"{scenario}__{selected_model}_iteration_{model_iteration}.tif"
        prediction_iteration_path = join(iterations_dir, prediction_iteration_filename)
        tile_cache_iteration_dir = tile_cache_dirs[model_iteration]
        if not exists(prediction_iteration_path):
          tile_files = sorted([f for f in os.listdir(tile_cache_iteration_dir) if f.endswith('.tif')],
                              key=lambda x: int(x.split('_')[-1].split('.')[0]))
          # A merge of an incomplete tile set would export a misaligned raster
          # and then delete the cache, so keep the cache and skip instead
          if len(tile_files) != n_stacks:
            print(f"Warning: {len(tile_files)} of {n_stacks} tiles are cached for {prediction_iteration_filename}. Merge skipped, cache kept.")
            continue
          prediction_array = _merge_cached_tiles(tile_cache_iteration_dir, tile_files)
          export_array_as_tif(prediction_array, prediction_iteration_path, template=template_base_path, compress=True)
          prediction_array = None
        # Delete tile cache for this iteration
        if exists(tile_cache_iteration_dir):
          shutil.rmtree(tile_cache_iteration_dir)

        iteration_progress_label.value = f"Merge progress: {model_iteration} / {scenario_iterations}"

  else:
    tile_progress_label.value = "Tile progress: complete"
    iteration_progress_label.value = "Iteration progress: complete"

  scenario_progress_label.value = f"Scenario progress: {scenario_index + 1} / {n_scenarios}"

print("\nScenario iterations complete.")

# Predictions with uncertainty


In [None]:
# Collect scenarios with iterations. The prediction loop names each scenario
# directory '<scenario>_iterations'; entries without that suffix are ignored
# instead of having their last characters cut off.
iterations_suffix = "_iterations"
scenarios_iterations_list = []
for subdir in os.listdir(scenario_iterations_dir):
  if subdir.endswith(iterations_suffix):
    scenarios_iterations_list.append(subdir[:-len(iterations_suffix)])

# Select scenarios to calculate mean, confidence intervals and uncertainty
print("scenarios_to_calculate = [")
for scenario in sorted(scenarios_iterations_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# Scenarios for which statistics are calculated from the prediction iterations.
# The entries match the list format printed by the previous cell; commented-out
# scenarios are excluded from the calculation.
scenarios_to_calculate = [
  # "2018",
  # "2019",
  # "2020",
  "2021",
  "2021_no_disturbance_since_1993",
  "2021_no_disturbance_since_oldgrowth_1",
  "2021_no_disturbance_since_oldgrowth_2",
  "2021_oldgrowth_recovery_1",
  "2021_oldgrowth_recovery_2",
  # "2022",
  # "2023",
  "2024",
  "2024_no_disturbance_since_1996",
  "2024_no_disturbance_since_1997",
  "2024_no_disturbance_since_1998",
  "2024_no_disturbance_since_1999",
  "2024_no_disturbance_since_2000",
  "2024_no_disturbance_since_2001",
  "2024_no_disturbance_since_2002",
  "2024_no_disturbance_since_2003",
  "2024_no_disturbance_since_2004",
  "2024_no_disturbance_since_2005",
  "2024_no_disturbance_since_2006",
  "2024_no_disturbance_since_2007",
  "2024_no_disturbance_since_2008",
  "2024_no_disturbance_since_2009",
  "2024_no_disturbance_since_2010",
  "2024_no_disturbance_since_2011",
  "2024_no_disturbance_since_2012",
  "2024_no_disturbance_since_2013",
  "2024_no_disturbance_since_2014",
  "2024_no_disturbance_since_2015",
  "2024_no_disturbance_since_2016",
  "2024_no_disturbance_since_2017",
  "2024_no_disturbance_since_2018",
  "2024_no_disturbance_since_2019",
  "2024_no_disturbance_since_2020",
  "2024_no_disturbance_since_2021",
  "2024_no_disturbance_since_2022",
  "2024_no_disturbance_since_2023",
  "2024_no_disturbance_since_2024",
  "2024_no_disturbance_since_oldgrowth_1",
  "2024_no_disturbance_since_oldgrowth_2",
  "2024_oldgrowth_recovery_1",
  "2024_oldgrowth_recovery_2",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
]

In [None]:
# Optional quality check of the prediction iterations.
# A high proportion of values <= 0 may indicate corrupt iterations;
# delete and repredict affected files if necessary.
check_iterations = False
nonpositive_threshold_percent = 1

# Count the prediction iterations (.tif files) of each scenario.
# Note: scenario_iterations becomes a dict of scenario -> count here.
scenario_iterations = {}
for scenario in scenarios_to_calculate:
    iterations_dir = join(scenario_iterations_dir, f"{scenario}_iterations")
    iteration_filenames = [f for f in os.listdir(iterations_dir) if f.endswith(".tif")]
    if check_iterations:
        for iteration_filename in iteration_filenames:
            iteration = gdal.Open(join(iterations_dir, iteration_filename))
            iteration_array = iteration.ReadAsArray()
            iteration = None
            # Only pixels that are not nodata are assessed
            valid_values = iteration_array[iteration_array != nodatavalue]
            n_valid = valid_values.size
            if n_valid == 0:
                continue
            nonpositive_percent = (np.count_nonzero(valid_values <= 0) / n_valid) * 100
            if nonpositive_percent > nonpositive_threshold_percent:
                print(f"Warning: {iteration_filename} has {nonpositive_percent:.1f}% values <= 0 in valid pixels.")
    iterations = len(iteration_filenames)
    scenario_iterations[scenario] = iterations
    print(f"There are {iterations} prediction iterations for scenario {scenario} statistics.")

In [None]:
# Confidence level of the interval (hardcoded to 95%)
confidence_interval = 0.95

# Raster precision settings
mean_precision = ci_precision = uncertainty_precision = 2

# Statistics progress, starting at zero of the selected scenarios
stats_progress_index = 0
stats_progress_label = widgets.Label(
    value=f"Scenario stats progress: {stats_progress_index}/{len(scenarios_to_calculate)}")
display(stats_progress_label)

# Read single iteration raster as array
def read_iteration(iteration_path):
    """Open the raster at iteration_path with GDAL and return its contents as an array."""
    dataset = gdal.Open(iteration_path)
    pixel_values = dataset.ReadAsArray()
    # Drop the dataset reference before returning the array
    del dataset
    return pixel_values

# Advance the progress widget by one scenario
def _advance_stats_progress():
    global stats_progress_index
    stats_progress_index += 1
    stats_progress_label.value = f"Scenario stats progress: {stats_progress_index}/{len(scenarios_to_calculate)}"

# Scenario name endings that mark a versioned oldgrowth run.
# The merged statistics use the same name without the trailing '_1' / '_2'.
oldgrowth_version_suffixes = ('_oldgrowth_1', '_oldgrowth_2', '_oldgrowth_recovery_1', '_oldgrowth_recovery_2')

# Loop through the scenarios
# For each scenario this writes three rasters to predictions_dir:
#   mean__*        per-pixel mean of the prediction iterations
#   ci_<level>__*  half-width of the t-based confidence interval of that mean
#   uncertainty__* CI half-width as a percentage of the absolute mean
for scenario, iteration_n in scenario_iterations.items():
    stat_base_filename = f"{scenario}__{selected_model}"
    iterations_dir = join(scenario_iterations_dir, f"{scenario}_iterations")

    # Skip oldgrowth _1 and _2 scenarios if merged files already exist.
    # Only the trailing version suffix is stripped, so other '_1' / '_2'
    # fragments in the scenario name (e.g. '_2023') are left intact.
    if scenario.endswith(oldgrowth_version_suffixes):
        merged_scenario = scenario[:-len('_1')]
        merged_stat_base = f"{merged_scenario}__{selected_model}"
        merged_mean_path = join(predictions_dir, f"mean__{merged_stat_base}.tif")
        merged_uncertainty_path = join(predictions_dir, f"uncertainty__{merged_stat_base}.tif")
        if exists(merged_mean_path) and exists(merged_uncertainty_path):
            print(f"Merged files exist, skipping: {scenario}")
            _advance_stats_progress()
            continue

    # Define statistics raster paths
    stat_mean_filename = f"mean__{stat_base_filename}.tif"
    stat_mean_path = join(predictions_dir, stat_mean_filename)
    stat_uncertainty_filename = f"uncertainty__{stat_base_filename}.tif"
    stat_uncertainty_path = join(predictions_dir, stat_uncertainty_filename)
    stat_ci_filename = f"ci_{int(confidence_interval * 100)}__{stat_base_filename}.tif"
    stat_ci_path = join(predictions_dir, stat_ci_filename)

    # If either mean or uncertainty do not exist
    if not exists(stat_mean_path) or not exists(stat_uncertainty_path):

        # Collect iteration file paths
        iteration_paths = [join(iterations_dir, f) for f in os.listdir(iterations_dir) if f.endswith(".tif")]

        # The divisor must match the arrays that are actually summed, so trust
        # the files on disk over the count recorded earlier.
        if len(iteration_paths) != iteration_n:
            print(f"Iteration count for {scenario} changed from {iteration_n} to {len(iteration_paths)}; using the files on disk.")
            iteration_n = len(iteration_paths)

        # Sample variance and the t-distribution need at least one degree of freedom
        if iteration_n < 2:
            print(f"Skipping {scenario}: at least 2 prediction iterations are required, found {iteration_n}.")
            _advance_stats_progress()
            continue

        # Calculate t critical value for this scenario's iteration count
        t_crit = st.t.ppf((1 + confidence_interval) / 2, iteration_n - 1)

        # Read iterations in parallel using thread pool
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            iteration_arrays = list(executor.map(read_iteration, iteration_paths))

        # Determine valid pixels (same across all iterations)
        valid_mask = iteration_arrays[0] != nodatavalue

        # Accumulate sum and sum of squares in float64 for numerical stability.
        # Running totals avoid stacking every iteration into extra 3D copies.
        # Nodata is counted as 0; those positions are overwritten with nodata later.
        stat_sum = np.zeros(valid_mask.shape, dtype=np.float64)
        stat_sum_sq = np.zeros(valid_mask.shape, dtype=np.float64)
        for iteration_array in iteration_arrays:
            iteration_clean = np.where(iteration_array == nodatavalue, 0, iteration_array.astype(np.float64))
            stat_sum += iteration_clean
            stat_sum_sq += iteration_clean ** 2

        # Retain one path as export template
        iteration_path = iteration_paths[0]

        # Calculate mean: sum / count
        stat_mean = stat_sum / iteration_n
        if mean_precision == 0:
            stat_mean_rounded = np.round(stat_mean, mean_precision).astype(np.int16)
            stat_mean_rounded = np.where(valid_mask, stat_mean_rounded, nodatavalue).astype(np.int16)
        else:
            stat_mean_rounded = np.round(stat_mean, mean_precision)
            stat_mean_rounded = np.where(valid_mask, stat_mean_rounded, nodatavalue)
        if not exists(stat_mean_path):
            export_array_as_tif(stat_mean_rounded, stat_mean_path, template=iteration_path)
            print(f"{stat_mean_filename} has been exported.")
        else: print(f"{stat_mean_filename} already exists.")

        if not exists(stat_uncertainty_path):
            # Sample variance: (sum(x^2) - sum(x)^2 / n) / (n - 1), clamped at 0
            # because floating point cancellation can give tiny negative values
            stat_variance = np.maximum(0, (stat_sum_sq - stat_sum ** 2 / iteration_n) / (iteration_n - 1))
            # Standard error: σ / sqrt(n)
            stat_se = np.sqrt(stat_variance) / np.sqrt(iteration_n)
            # Mask for non-zero SE values (within valid pixels only)
            nonzero_se_mask = valid_mask & (stat_se != 0.0)
            # CI half-width using t-distribution: ((mean + t * SE) - (mean - t * SE)) / 2 = t * SE
            # Pixels with zero SE get a half-width of 0
            stat_ci = t_crit * stat_se
            if ci_precision == 0:
                stat_ci = np.round(stat_ci, ci_precision).astype(np.int16)
                stat_ci = np.where(valid_mask, stat_ci, nodatavalue).astype(np.int16)
            else:
                stat_ci = np.round(stat_ci, ci_precision)
                stat_ci = np.where(valid_mask, stat_ci, nodatavalue)
            # Uncertainty: (CI half-width / |mean|) * 100%, from the rounded rasters
            stat_uncertainty = np.zeros_like(stat_mean_rounded, dtype=np.float64)
            # Avoid division by zero in mean
            nonzero_mean_mask = nonzero_se_mask & (stat_mean_rounded != 0)
            stat_uncertainty[nonzero_mean_mask] = (stat_ci[nonzero_mean_mask] / np.abs(stat_mean_rounded[nonzero_mean_mask])) * 100
            if uncertainty_precision == 0:
                stat_uncertainty = np.round(stat_uncertainty, uncertainty_precision).astype(np.int16)
                stat_uncertainty = np.where(valid_mask, stat_uncertainty, nodatavalue).astype(np.int16)
            else:
                stat_uncertainty = np.round(stat_uncertainty, uncertainty_precision)
                stat_uncertainty = np.where(valid_mask, stat_uncertainty, nodatavalue)

            # Export statistics arrays as rasters
            export_array_as_tif(stat_ci, stat_ci_path, template=iteration_path)
            print(f"{stat_ci_filename} has been exported.")
            export_array_as_tif(stat_uncertainty, stat_uncertainty_path, template=iteration_path)
            print(f"{stat_uncertainty_filename} has been exported.")
        else: print(f"{stat_uncertainty_filename} already exists.")

    else: print(f"{stat_mean_filename} and {stat_uncertainty_filename} already exist.")
    _advance_stats_progress()

print("\nStatistics calculations complete.")

In [None]:
# Merge oldgrowth scenario statistics
# Version 1 uses land-use proxy for pre-Landsat undisturbed forest.
# Version 2 removes all disturbance without the proxy.
# Taking maximum mean avoids underestimation where proxy may not capture all oldgrowth characteristics.
# Mean, CI and uncertainty are all selected from whichever version has greater mean at each pixel,
# so the three merged rasters always describe the same version.

# Read a raster into an array and drop the GDAL dataset reference
def _read_raster_array(raster_path):
    raster = gdal.Open(raster_path)
    raster_array = raster.ReadAsArray()
    raster = None
    return raster_array

# Filename prefix of the confidence interval rasters
ci_prefix = f'ci_{int(confidence_interval * 100)}__'

oldgrowth_v1_files = [f for f in os.listdir(predictions_dir)
                      if f.startswith('mean__') and ('_oldgrowth_recovery_1__' in f or '_no_disturbance_since_oldgrowth_1__' in f)
                      and f.endswith('.tif')]

if not oldgrowth_v1_files:
    print("\nNo oldgrowth version 1 statistics found to merge.")
else:
    print("\nMerging oldgrowth statistics...")
    for mean_v1_file in oldgrowth_v1_files:
        scenario_label = mean_v1_file.split('__')[1]

        # Construct filenames for all statistics.
        # Replacements are limited to the first match so only the statistic prefix
        # and the scenario's version suffix change, never the model part of the name.
        mean_v2_file = mean_v1_file.replace('_1__', '_2__', 1)
        mean_merged_file = mean_v1_file.replace('_1__', '__', 1)
        ci_v1_file = mean_v1_file.replace('mean__', ci_prefix, 1)
        ci_v2_file = mean_v2_file.replace('mean__', ci_prefix, 1)
        ci_merged_file = mean_merged_file.replace('mean__', ci_prefix, 1)
        uncertainty_v1_file = mean_v1_file.replace('mean__', 'uncertainty__', 1)
        uncertainty_v2_file = mean_v2_file.replace('mean__', 'uncertainty__', 1)
        uncertainty_merged_file = mean_merged_file.replace('mean__', 'uncertainty__', 1)

        # Construct paths
        mean_v1_path = join(predictions_dir, mean_v1_file)
        mean_v2_path = join(predictions_dir, mean_v2_file)
        mean_merged_path = join(predictions_dir, mean_merged_file)
        ci_v1_path = join(predictions_dir, ci_v1_file)
        ci_v2_path = join(predictions_dir, ci_v2_file)
        ci_merged_path = join(predictions_dir, ci_merged_file)
        uncertainty_v1_path = join(predictions_dir, uncertainty_v1_file)
        uncertainty_v2_path = join(predictions_dir, uncertainty_v2_file)
        uncertainty_merged_path = join(predictions_dir, uncertainty_merged_file)
        v1_paths = [mean_v1_path, ci_v1_path, uncertainty_v1_path]
        v2_paths = [mean_v2_path, ci_v2_path, uncertainty_v2_path]

        # Skip if all merged files already exist
        if exists(mean_merged_path) and exists(ci_merged_path) and exists(uncertainty_merged_path):
            print(f"Merged files already exist for {scenario_label}")
            continue

        # An incomplete version 1 set cannot be merged; report it and keep going
        # so one broken scenario does not abort the remaining merges
        missing_v1 = [basename(p) for p in v1_paths if not exists(p)]
        if missing_v1:
            print(f"Skipping {scenario_label}: missing version 1 statistics {missing_v1}")
            continue

        # Check if version 2 exists
        if exists(mean_v2_path):
            # Version 2 must be complete too, otherwise leave every file in place
            missing_v2 = [basename(p) for p in v2_paths if not exists(p)]
            if missing_v2:
                print(f"Skipping {scenario_label}: missing version 2 statistics {missing_v2}")
                continue

            print(f"Merging oldgrowth versions for {scenario_label}...")
            mean_v1_array = _read_raster_array(mean_v1_path)
            ci_v1_array = _read_raster_array(ci_v1_path)
            uncertainty_v1_array = _read_raster_array(uncertainty_v1_path)
            mean_v2_array = _read_raster_array(mean_v2_path)
            ci_v2_array = _read_raster_array(ci_v2_path)
            uncertainty_v2_array = _read_raster_array(uncertainty_v2_path)

            # Determine where v1 has greater mean (excluding nodata)
            v1_greater = (mean_v1_array > mean_v2_array) & (mean_v1_array != nodatavalue)

            # Merge: the same mask picks mean, CI and uncertainty, so a pixel never
            # mixes the mean of one version with the CI/uncertainty of the other
            mean_merged_array = np.where(v1_greater, mean_v1_array, mean_v2_array)
            ci_merged_array = np.where(v1_greater, ci_v1_array, ci_v2_array)
            uncertainty_merged_array = np.where(v1_greater, uncertainty_v1_array, uncertainty_v2_array)

            # Export merged arrays
            if not exists(mean_merged_path):
                export_array_as_tif(mean_merged_array, mean_merged_path, template=mean_v1_path)
            if not exists(ci_merged_path):
                export_array_as_tif(ci_merged_array, ci_merged_path, template=mean_v1_path)
            if not exists(uncertainty_merged_path):
                export_array_as_tif(uncertainty_merged_array, uncertainty_merged_path, template=mean_v1_path)
            print(f"Merged statistics exported for {mean_merged_file.split('__')[1]}")

            # Delete originals (only reached once every merged raster is written)
            for original_path in (mean_v1_path, mean_v2_path, ci_v1_path,
                                  ci_v2_path, uncertainty_v1_path, uncertainty_v2_path):
                os.remove(original_path)
        else:
            # Copy version 1 as merged if version 2 doesn't exist.
            # All copies finish before any original is removed, so an interrupted
            # run can be repeated from the untouched version 1 files.
            print(f"Version 2 not found, using version 1 for {scenario_label}")
            shutil.copy2(mean_v1_path, mean_merged_path)
            shutil.copy2(ci_v1_path, ci_merged_path)
            shutil.copy2(uncertainty_v1_path, uncertainty_merged_path)
            for original_path in v1_paths:
                os.remove(original_path)

    print("\nOldgrowth statistics merging complete.")

# Disconnect runtime

In [None]:
# Unassign the Colab runtime when this cell is reached.
# Useful for stopping background execution upon completion.
runtime.unassign()