<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/6_scenarios.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports, directories and global functions

In [None]:
# Define base directory
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive
# The mount point is chosen from the base_dir prefix. Only the personal-drive
# branch creates base_dir; a shared drive path is expected to exist already.
from google.colab import drive
import os
import sys
if base_dir.startswith('/gdrive/Shareddrives/'):
  drive.mount('/gdrive', force_remount=True)
elif base_dir.startswith('/content/drive/MyDrive/'):
  drive.mount('/content/drive', force_remount=True)
  os.makedirs(base_dir, exist_ok=True)
# An unrecognised prefix only prints a hint; nothing is mounted and the cell continues.
else: print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Append the resolved base_dir to sys.path once (re-running the cell adds no duplicate).
# Presumably this lets modules stored under base_dir be imported - confirm against later cells.
_path_to_add = os.path.realpath(base_dir)
if _path_to_add not in sys.path:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs and upgrades
!pip install astropy
!pip install geopandas
!pip install rasterio
!pip install xgboost
!apt-get install -y gdal-bin

In [None]:
# Imports
from astropy.convolution import convolve, Gaussian2DKernel
from contextlib import contextmanager
try: import cupy # Only works on GPU runtime
except: None
import gc
import geopandas as gpd
from google.colab import runtime
import json
import math
from os import makedirs
from os.path import join, exists, basename
from osgeo import gdal, ogr
gdal.UseExceptions()
import ipywidgets as widgets
import numpy as np
import pandas as pd
from pathlib import Path
import psutil
import rasterio
from rasterio.features import rasterize
import re
from scipy.ndimage import distance_transform_edt
import shutil
from shutil import copyfile, rmtree
import subprocess
import time
import xgboost as xgb
import warnings

In [None]:
# Define directories
# All paths are derived from base_dir; only scenarios_dir is created by this cell.
areas_dir = join(base_dir, "1_areas")
polygons_dir = join(areas_dir, "polygons")
masks_dir = join(areas_dir, "masks")

features_dir = join(base_dir, "3_features")
feature_alpha_earth_dir = join(features_dir, "alpha_earth")
feature_continuous_final_dir = join(features_dir, 'continuous_final')
feature_binary_dir = join(features_dir, "binary")
feature_edge_effects_dir = join(features_dir, "binary_edge_effects")
feature_alternate_dir = join(features_dir, "alternate")
feature_topo_dsm_final_dir = join(features_dir, 'topo_dsm_final')
feature_topo_dtm_final_dir = join(features_dir, 'topo_dtm_final')
feature_geographic_final_dir = join(features_dir, 'geographic_final')

models_dir = join(base_dir, "5_models")
scenarios_dir = join(base_dir, "6_scenarios")

# Create directories
makedirs(scenarios_dir, exist_ok=True)

# Define final feature directories for scenario creation
# locate_feature searches these in list order and uses the first directory that
# contains '<feature>.tif'. feature_binary_dir is defined above but is not searched.
final_features_dir_list = [
    feature_alpha_earth_dir,
    feature_continuous_final_dir,
    feature_edge_effects_dir,
    feature_alternate_dir,
    feature_topo_dsm_final_dir,
    feature_topo_dtm_final_dir,
    feature_geographic_final_dir
]

In [None]:
# Global function: Search final feature directories
def locate_feature(feature_list, covariates_renamed, search_dirs=None):
    """Resolve feature names to '<directory name>/<feature>' strings.

    Args:
        feature_list: Feature names without the 'fea_' prefix or file extension.
        covariates_renamed: Covariate column names carrying the 'fea_' prefix.
            A feature whose 'fea_'-prefixed name appears here is skipped.
        search_dirs: Directories to search, in priority order. Defaults to the
            module-level final_features_dir_list.

    Returns:
        A list with one '<basename of directory>/<feature>' entry per located
        feature, in the order of feature_list (skipped covariates are omitted).

    Raises:
        FileNotFoundError: If '<feature>.tif' is in none of the directories.
    """
    if search_dirs is None:
        search_dirs = final_features_dir_list
    result = []
    for feature in feature_list:
        if f"fea_{feature}" in covariates_renamed:
            continue
        for feature_dir in search_dirs:
            if exists(join(feature_dir, f"{feature}.tif")):
                # First matching directory wins.
                result.append(f"{basename(feature_dir)}/{feature}")
                break
        else:
            raise FileNotFoundError(f"{feature}.tif not found")
    return result

# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -11111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    """Write input_array to a single-band GeoTIFF laid out like the template raster.

    The output takes its width and height from band 1 of the template, and its
    geotransform and projection from the template dataset.

    Args:
        input_array: Array written to band 1 of the output.
        output_tif: Path of the GeoTIFF to create.
        template: Path of the raster supplying size and georeferencing.
        nodatavalue: Value registered as nodata on the output band.
        compress: When truthy, the file is created with ZSTD level 1 compression.
        dtype: GDAL data type of the output; replaced by gdal.GDT_Int16 whenever
            input_array.dtype is int16.
    """
    source_ds = gdal.Open(template)
    source_band = source_ds.GetRasterBand(1)
    geotransform = source_ds.GetGeoTransform()
    projection = source_ds.GetProjection()
    # ZSTD level 1 gives a good speed / size ratio.
    creation_options = ['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'] if compress else []
    if input_array.dtype == 'int16':
        dtype = gdal.GDT_Int16
    output_ds = gdal.GetDriverByName("GTiff").Create(
        output_tif, source_band.XSize, source_band.YSize, 1, dtype, options=creation_options)
    output_band = output_ds.GetRasterBand(1)
    output_band.WriteArray(input_array)
    output_band.SetNoDataValue(nodatavalue)
    output_ds.SetGeoTransform(geotransform)
    output_ds.SetProjection(projection)
    # Drop the GDAL handles so the output is flushed and closed.
    output_band = source_band = None
    output_ds = source_ds = None

# Global function: burn a polygon to raster
def burn_polygon_to_raster(raster_path, polygon_path, fixed=True, fixed_value=1, column_name=None, all_touched=True):
    """Rasterise the first layer of a vector file into band 1 of an existing raster.

    The raster is opened in update mode and modified in place.

    Args:
        raster_path: Path of the raster to burn into.
        polygon_path: Path of the vector file whose default layer is burned.
        fixed: When truthy, every feature burns fixed_value; otherwise the value
            is read from an attribute of each feature.
        fixed_value: Value burned when fixed is truthy.
        column_name: Attribute to burn when fixed is falsy; falls back to the
            layer's first field when not given.
        all_touched: When truthy, passes ALL_TOUCHED=TRUE to the rasteriser.

    Raises:
        ValueError: If either input opens as a falsy handle.
    """
    target_ds = None
    source_vector = None
    try:
        target_ds = gdal.Open(raster_path, gdal.GA_Update)
        source_vector = ogr.Open(polygon_path)
        if not (target_ds and source_vector):
            raise ValueError("Cannot open input files")
        source_layer = source_vector.GetLayer()
        rasterize_options = []
        if all_touched:
            rasterize_options.append("ALL_TOUCHED=TRUE")
        if not fixed:
            burn_field = column_name or source_layer.GetLayerDefn().GetFieldDefn(0).GetName()
            rasterize_options.append(f"ATTRIBUTE={burn_field}")
            gdal.RasterizeLayer(target_ds, [1], source_layer, options=rasterize_options)
        else:
            gdal.RasterizeLayer(target_ds, [1], source_layer, burn_values=[fixed_value], options=rasterize_options)
    finally:
        # Flush pending writes (when the raster opened) before releasing the handles.
        if target_ds:
            target_ds.FlushCache()
        target_ds = source_vector = None

# Global function: edge effects
# Provides spatial awareness analogous to CNN receptive fields for tabular models
# Data_type: 'binary' or 'continuous'.
cell_size_y_path = join(areas_dir, 'cell_size_y.tif')
cell_size_x_path = join(areas_dir, 'cell_size_x.tif')
def edge_effects(array, data_type, cell_size_x_path, cell_size_y_path, threshold_metres):
    """Derive edge-effect features from a raster array with a truncated Gaussian kernel.

    Args:
        array: Input raster array (assumed 2D, matching the 2D kernel - TODO confirm).
            In binary mode it is compared against 0 and 1.
        data_type: 'continuous' or 'binary'.
        cell_size_x_path: Raster of cell widths; its mean contributes to the resolution.
        cell_size_y_path: Raster of cell heights; its mean contributes to the resolution.
        threshold_metres: Kernel radius, and the clipping limit for the signed distance.

    Returns:
        For 'continuous': the smoothed array, unrounded.
        Otherwise: a (signed_distance, smoothed) tuple, where signed_distance is an
        int16 array clipped to +/- threshold_metres.

    Note:
        Any data_type other than 'continuous' falls through to the binary branch,
        but smoothed is rounded to 2 decimals only when data_type == 'binary'.
    """
    # Determine pixel size from cell size rasters.
    cell_size_x = gdal.Open(cell_size_x_path)
    cell_size_x_array = cell_size_x.ReadAsArray()
    cell_size_x = None
    cell_size_y = gdal.Open(cell_size_y_path)
    cell_size_y_array = cell_size_y.ReadAsArray()
    cell_size_y = None
    # A single mean resolution is used for the whole raster (mean of the x and y means).
    mean_cell_resolution = np.mean([np.mean(cell_size_x_array), np.mean(cell_size_y_array)])
    # Maximum pixel distance for kernel extent.
    max_pixel_distance = threshold_metres / mean_cell_resolution
    # 2D Gaussian weight distribution follows chi-squared with df=2.
    # Cumulative probability within radius r: P = 1 - exp(-r² / 2σ²).
    # Solving for r at P=0.95: r = σ * sqrt(-2 * ln(0.05)) ≈ 2.45σ.
    # Setting r = max_pixel_distance ensures 95% of kernel weight falls within threshold.
    gaussian_stdev = max_pixel_distance / 2.45
    kernel_radius = int(np.ceil(max_pixel_distance))
    kernel_size = 2 * kernel_radius + 1
    # Gaussian kernel for spatial weighting.
    kernel = Gaussian2DKernel(x_stddev=gaussian_stdev, y_stddev=gaussian_stdev,
                              x_size=kernel_size, y_size=kernel_size)
    # Circular mask enforces ecological threshold as hard boundary.
    # Square kernels would include pixels beyond threshold at corners.
    y, x = np.ogrid[:kernel_size, :kernel_size]
    centre = kernel_radius
    distance_from_centre = np.sqrt((x - centre)**2 + (y - centre)**2)
    circular_mask = distance_from_centre <= max_pixel_distance
    # Apply mask and renormalise to sum to 1.
    # Renormalisation ensures consistent weighting after truncation.
    kernel_array = kernel.array.copy()
    kernel_array[~circular_mask] = 0
    kernel_array /= kernel_array.sum()
    # Gaussian smoothing captures local spatial context.
    # For binary: represents local class density within threshold.
    # For continuous: represents local weighted mean within threshold.
    # boundary='extend' extrapolates edge values beyond raster extent.
    smoothed = convolve(array.astype(float), kernel_array, boundary='extend')
    if data_type == 'continuous': return smoothed # Without rounding
    if data_type == 'binary': smoothed = np.round(smoothed, 2) # Round
    # Binary data: compute signed distance to class boundary.
    # Euclidean distance transform gives centre-to-centre pixel distance.
    dist_from_ones = distance_transform_edt(array == 0)
    dist_from_zeros = distance_transform_edt(array == 1)
    # Convert to distance from pixel centre to class boundary.
    # Class boundary lies between adjacent pixels of different classes.
    # Subtracting 0.5 pixels approximates centre-to-boundary distance.
    # Sign encodes class membership: positive = class 1, negative = class 0.
    # Magnitude encodes proximity to boundary (edge effects zone).
    signed_distance = np.where(
        array == 1,
        np.maximum(dist_from_zeros - 0.5, 0) * mean_cell_resolution,
        -np.maximum(dist_from_ones - 0.5, 0) * mean_cell_resolution
    )
    # Cap at threshold: pixels beyond are interior, not edge-influenced.
    # Round to integer metres for cleaner feature representation.
    signed_distance = np.round(np.clip(signed_distance, -threshold_metres, threshold_metres)).astype(np.int16)
    return signed_distance, smoothed

# Select model

In [None]:
# Select a model
# Prints a ready-to-paste `selected_model = "..."` line for every directory under
# models_dir that contains a model.json.
model_exists = False
for subdir, dirs, files in os.walk(models_dir):
  for file in files:
    if file == 'model.json':
      # relpath also handles a model.json placed directly in models_dir (shown as "."),
      # where splitting on f"{models_dir}/" raised an IndexError.
      print(f'selected_model = "{os.path.relpath(subdir, models_dir)}"')
      model_exists = True
if not model_exists:
  print("No model exists.")

In [None]:
# Name of the model to use: its directory path relative to models_dir.
selected_model = "agbd_251203_161707"

# This must be True when using AlphaEarth features.
# Alternate scenarios cannot be created with AlphaEarth's embeddings.
# Mixing AlphaEarth with features of lower resolution has not been tested.
alpha_earth = False

# Set this to True for anything (e.g. elevation) with only a single prediction
single_prediction = False

# Define model directories
# Note: selected_model_json and selected_model_descr_dir are file paths, not directories.
selected_model_dir = join(models_dir,selected_model)
selected_model_shap_dir = join(selected_model_dir, "shap")
selected_model_json = join(selected_model_dir, "model.json")
selected_model_descr_dir = join(selected_model_dir, "model_description.json")
selected_model_dataset_path = join(selected_model_dir, f"{selected_model}.pkl")
# NOTE(review): unpickling can execute arbitrary code; only load datasets from a trusted source.
selected_model_dataset = pd.read_pickle(selected_model_dataset_path)

# Read description for model dataset attributes
with open(join(selected_model_dir,"model_dataset_description.json")) as model_dataset_description_json:
  model_dataset_description = json.load(model_dataset_description_json)
model_dataset_name = model_dataset_description["model_dataset_name"]
number_of_columns = model_dataset_description["number_of_columns"]
number_of_rows = model_dataset_description["number_of_rows"]
id_column = model_dataset_description["id_column"]
selected_target = model_dataset_description["selected_target"]
uncertainty = model_dataset_description["uncertainty"]
covariates_renamed = model_dataset_description["covariates_renamed"]
# selected_features is extended with the renamed covariates, so it holds both.
selected_features = model_dataset_description["selected_features"] + model_dataset_description["covariates_renamed"]
categorical_features_mappings = model_dataset_description["categorical_features_mappings"]
categorical_columns = model_dataset_description["categorical_columns"]
descriptive_parameters = model_dataset_description["descriptive_parameters"]
filter_parameter = model_dataset_description["filter_parameter"]
filter_values_to_include = model_dataset_description["filter_values_to_include"]
sample_imported_dataset = model_dataset_description["sample_imported_dataset"]
sample_imported_dataset_by_percent = model_dataset_description["sample_imported_dataset_by_percent"]
sample_imported_dataset_value = model_dataset_description["sample_imported_dataset_value"]

# Covariate names with their first four characters removed (presumably the 'fea_' prefix).
covariates = [covariate[4:] for covariate in covariates_renamed]

# Create scenarios model directory
scenarios_model_dir = join(scenarios_dir, selected_model)
makedirs(scenarios_model_dir, exist_ok=True)

# Copy model_dataset_description.json
# The parsed description is re-serialised with json.dumps rather than copied byte-for-byte.
with open(join(scenarios_model_dir, "model_dataset_description.json"), "w") as file:
  file.write(json.dumps(model_dataset_description))

# Create subdirectories
scenario_masks_dir = join(scenarios_model_dir, "scenario_masks")
tile_templates_dir = join(scenarios_model_dir, 'tile_templates')
tile_features_dir = join(scenarios_model_dir, "tile_features")
tile_scenario_masks_dir = join(scenarios_model_dir, "tile_scenario_masks")
tile_feature_stacks_dir = join(scenarios_model_dir, "tile_feature_stacks")
tile_prediction_cache_dir = join(scenarios_model_dir,"tile_prediction_cache")
scenario_predictions_dir = join(scenarios_model_dir, "scenario_predictions")

makedirs(scenario_masks_dir, exist_ok=True)
makedirs(tile_templates_dir, exist_ok=True)
makedirs(tile_features_dir, exist_ok=True)
makedirs(tile_scenario_masks_dir, exist_ok=True)
makedirs(tile_feature_stacks_dir, exist_ok=True)
makedirs(tile_prediction_cache_dir, exist_ok=True)
makedirs(scenario_predictions_dir, exist_ok=True)

# Define yearly scenarios

In [None]:
# Yearly scenarios may include years after the model scenario if feature data is available later than GEDI data.
# Date of the prediction is ~December 31st, e.g. '2024' is 31/12/2024, requiring features up to 2024.

model_scenario_override = None # set if cannot be automatically determined from model features

# Substrings that mark a feature as yearly; any matching feature must end in a 4-digit year.
yearly_features = ["forest", "disturbance", "alpha_earth"]

# Remove the 'fea_' prefix from each feature
model_features = sorted([feature[4:] for feature in selected_features])

# Create a list of feature years from the model's features
model_feature_years = []
for feature in model_features:
  for yearly_feature in yearly_features:
    if yearly_feature in feature:
      model_feature_years.append(int(feature[-4:]))
# Locate features in final feature directories
model_features = locate_feature(model_features, covariates_renamed)

# Determine the model scenario from the maximum year
if model_scenario_override is not None: model_scenario = model_scenario_override
elif model_feature_years: model_scenario = max(model_feature_years)
else: raise ValueError("No yearly features were found in the model. Set model_scenario_override.")
model_scenario_filename = f"{model_scenario}.csv"
model_scenario_path = join(scenarios_model_dir, model_scenario_filename)
pd.DataFrame(model_features).to_csv(model_scenario_path, index=False)
print(f"The maximum year used in the model is {model_scenario}, which has been created as the first scenario.\n")
print(f"The {model_scenario} scenario feature list has been saved to:\n {model_scenario_path}\n")

# Available feature years
# The year is read from the four characters preceding a 4-character suffix such as '.tif'.
# Files that do not follow this pattern are ignored.
final_feature_years_set = set()
yearly_feature_source_dir = feature_alpha_earth_dir if alpha_earth else feature_edge_effects_dir
for final_feature in os.listdir(yearly_feature_source_dir):
  try: final_feature_years_set.add(int(final_feature[-8:-4]))
  except ValueError: continue
final_feature_years = list(final_feature_years_set)
if not final_feature_years:
  raise ValueError(f"No files ending in a 4-digit year were found in {yearly_feature_source_dir}.")

# Find the first and last feature years
first_feature_year = min(final_feature_years)
last_feature_year = max(final_feature_years)
additional_feature_years = last_feature_year - model_scenario
print(f"The first available feature year is {first_feature_year} and the last is {last_feature_year}.\n")

# Calculate the range of scenario years and minimum scenario year
# With no yearly features (override in use) the model spans a single year, so the range is 0.
if model_feature_years: model_scenario_year_range = max(model_feature_years) - min(model_feature_years)
else: model_scenario_year_range = 0
minimum_yearly_scenario = first_feature_year + model_scenario_year_range
print(f"The earliest scenario year that can be predicted is {minimum_yearly_scenario}.")
print(f"The latest scenario year that can be predicted is {last_feature_year}.")
print(f"This is based on the number of yearly features used to train the model and the total availability of features.")

In [None]:
# Select static features which are the same in every scenario, e.g. topography
# Prints every located model feature as a template to copy into `static_features`
# and trim. Entries in model_features are '<directory>/<feature>' strings, so the
# directory prefix is dropped: static_features must hold bare feature names.
print("static_features = [")
for feature in model_features:
  feature_name = feature.split('/')[-1]
  if f"fea_{feature_name}" not in covariates_renamed:
    print(f'  "{feature_name}",')
print("]")

In [None]:
# # GEDI elevation

# static_features = [
#   "coast_proximity_km",
#   "latitude",
#   "longitude",
#   "topo_dsm_smooth_aspect_cosine",
#   "topo_dsm_smooth_aspect_sine",
#   "topo_dsm_smooth_circular_variance_aspect_03",
#   "topo_dsm_smooth_circular_variance_aspect_07",
#   "topo_dsm_smooth_circular_variance_aspect_11",
#   "topo_dsm_smooth_deviation_mean_elevation_03",
#   "topo_dsm_smooth_deviation_mean_elevation_07",
#   "topo_dsm_smooth_deviation_mean_elevation_11",
#   "topo_dsm_smooth_eastness",
#   "topo_dsm_smooth_elevation",
#   "topo_dsm_smooth_northness",
#   "topo_dsm_smooth_profile_curvature",
#   "topo_dsm_smooth_roughness_03",
#   "topo_dsm_smooth_roughness_07",
#   "topo_dsm_smooth_roughness_11",
#   "topo_dsm_smooth_slope",
#   "topo_dsm_smooth_stream_power_index_log10",
#   "topo_dsm_smooth_surface_area_ratio",
#   "topo_dsm_smooth_tangential_curvature",
#   "topo_dsm_smooth_topographic_position_index_03",
#   "topo_dsm_smooth_topographic_position_index_07",
#   "topo_dsm_smooth_topographic_position_index_11",
#   "topo_dsm_smooth_topographic_ruggedness_index",
#   "topo_dsm_smooth_topographic_wetness_index",
#   "topo_dsm_unsmooth_aspect_cosine",
#   "topo_dsm_unsmooth_aspect_sine",
#   "topo_dsm_unsmooth_circular_variance_aspect_03",
#   "topo_dsm_unsmooth_circular_variance_aspect_07",
#   "topo_dsm_unsmooth_circular_variance_aspect_11",
#   "topo_dsm_unsmooth_deviation_mean_elevation_03",
#   "topo_dsm_unsmooth_deviation_mean_elevation_07",
#   "topo_dsm_unsmooth_deviation_mean_elevation_11",
#   "topo_dsm_unsmooth_eastness",
#   "topo_dsm_unsmooth_elevation",
#   "topo_dsm_unsmooth_northness",
#   "topo_dsm_unsmooth_profile_curvature",
#   "topo_dsm_unsmooth_roughness_03",
#   "topo_dsm_unsmooth_roughness_07",
#   "topo_dsm_unsmooth_roughness_11",
#   "topo_dsm_unsmooth_slope",
#   "topo_dsm_unsmooth_stream_power_index_log10",
#   "topo_dsm_unsmooth_surface_area_ratio",
#   "topo_dsm_unsmooth_tangential_curvature",
#   "topo_dsm_unsmooth_topographic_position_index_03",
#   "topo_dsm_unsmooth_topographic_position_index_07",
#   "topo_dsm_unsmooth_topographic_position_index_11",
#   "topo_dsm_unsmooth_topographic_ruggedness_index",
#   "topo_dsm_unsmooth_topographic_wetness_index",
# ]

In [None]:
# Features that stay the same in every scenario (e.g. topography).
# Entries are bare feature names without a directory prefix: they are subtracted
# from the model's feature names to leave the scenario features, then added back
# to every yearly scenario list.
static_features = [
  "coast_proximity_km",
  "latitude",
  "longitude",
  "topo_dtm_smooth_aspect_cosine",
  "topo_dtm_smooth_aspect_sine",
  "topo_dtm_smooth_circular_variance_aspect_03",
  "topo_dtm_smooth_circular_variance_aspect_07",
  "topo_dtm_smooth_circular_variance_aspect_11",
  "topo_dtm_smooth_deviation_mean_elevation_03",
  "topo_dtm_smooth_deviation_mean_elevation_07",
  "topo_dtm_smooth_deviation_mean_elevation_11",
  "topo_dtm_smooth_eastness",
  "topo_dtm_smooth_elevation",
  "topo_dtm_smooth_northness",
  "topo_dtm_smooth_profile_curvature",
  "topo_dtm_smooth_roughness_03",
  "topo_dtm_smooth_roughness_07",
  "topo_dtm_smooth_roughness_11",
  "topo_dtm_smooth_slope",
  "topo_dtm_smooth_stream_power_index_log10",
  "topo_dtm_smooth_surface_area_ratio",
  "topo_dtm_smooth_tangential_curvature",
  "topo_dtm_smooth_topographic_position_index_03",
  "topo_dtm_smooth_topographic_position_index_07",
  "topo_dtm_smooth_topographic_position_index_11",
  "topo_dtm_smooth_topographic_ruggedness_index",
  "topo_dtm_smooth_topographic_wetness_index",
  "topo_dtm_unsmooth_aspect_cosine",
  "topo_dtm_unsmooth_aspect_sine",
  "topo_dtm_unsmooth_circular_variance_aspect_03",
  "topo_dtm_unsmooth_circular_variance_aspect_07",
  "topo_dtm_unsmooth_circular_variance_aspect_11",
  "topo_dtm_unsmooth_deviation_mean_elevation_03",
  "topo_dtm_unsmooth_deviation_mean_elevation_07",
  "topo_dtm_unsmooth_deviation_mean_elevation_11",
  "topo_dtm_unsmooth_eastness",
  "topo_dtm_unsmooth_elevation",
  "topo_dtm_unsmooth_northness",
  "topo_dtm_unsmooth_profile_curvature",
  "topo_dtm_unsmooth_roughness_03",
  "topo_dtm_unsmooth_roughness_07",
  "topo_dtm_unsmooth_roughness_11",
  "topo_dtm_unsmooth_slope",
  "topo_dtm_unsmooth_stream_power_index_log10",
  "topo_dtm_unsmooth_surface_area_ratio",
  "topo_dtm_unsmooth_tangential_curvature",
  "topo_dtm_unsmooth_topographic_position_index_03",
  "topo_dtm_unsmooth_topographic_position_index_07",
  "topo_dtm_unsmooth_topographic_position_index_11",
  "topo_dtm_unsmooth_topographic_ruggedness_index",
  "topo_dtm_unsmooth_topographic_wetness_index",
]

In [None]:
# Note that areas that were forested in a historic yearly scenario but were reservoirs
# or flooded at the time topographic feature data was collected (~2014 for GLO-30 DEM)
# will be predicted based on a flat topography at the elevation of the water's surface.

# Set scenario features as all non-static features
# model_features entries are '<directory>/<feature>'; only the feature name is compared.
scenario_features = sorted(set(f.split('/')[-1] for f in model_features) - set(static_features))

# Create feature lists for all possible yearly scenarios
for yearly_scenario in range(minimum_yearly_scenario, last_feature_year +1):
  year_difference = model_scenario - yearly_scenario
  yearly_scenario_features = []
  for scenario_feature in scenario_features:
    # Features ending in a 4-digit number are shifted by the offset from the model
    # scenario; all other features are kept unchanged.
    try: year_change = int(scenario_feature[-4:]) - year_difference
    except ValueError:
      yearly_scenario_features.append(scenario_feature)
      continue
    yearly_scenario_feature = scenario_feature[:-4] + str(year_change)
    yearly_scenario_features.append(yearly_scenario_feature)
  # Compile yearly features
  yearly_scenario_features = sorted(yearly_scenario_features + static_features)
  # Locate features in final feature directories
  yearly_scenario_features = locate_feature(yearly_scenario_features, covariates_renamed)
  yearly_scenario_filename = f"{yearly_scenario}.csv"
  # Despite its name, yearly_scenario_dir is the path of the scenario's .csv file.
  yearly_scenario_dir = join(scenarios_model_dir,yearly_scenario_filename)
  pd.DataFrame(yearly_scenario_features).to_csv(yearly_scenario_dir, index=False)

# Open the most recent yearly scenario feature list
most_recent_scenario_csv = join(scenarios_model_dir,f"{last_feature_year}.csv")
most_recent_scenario_features = pd.read_csv(most_recent_scenario_csv).iloc[:,0].tolist()

print(f"Lists of features for all possible yearly scenarios have been exported to {scenarios_model_dir}/.")
print(f"Ensure all features in these lists have been copied to:\n{features_dir}\n")

# Define alternate scenarios (optional)

## Disturbance requirements

In [None]:
# Tool to identify required scenarios for forest AGBD disturbance mapping
# Outputs: (deforestation, degradation) for oldgrowth/area-based, degradation only otherwise
# Decomposition uses nodata masking

use_tool = False

def _parse_year(text):
    """Return text converted to an int, or None when it is not a whole number."""
    try:
        return int(text)
    except ValueError:
        return None


def select_disturbance_scenarios():
    """Interactively work out which scenarios a disturbance calculation needs.

    Prompts for a year of interest and a calculation type, then prints the output
    raster names and the scenarios required. Surrounding whitespace is stripped
    from every answer so it cannot leak into scenario names or break comparisons.

    Returns:
        A tuple (output_names, scenario_pair, scenarios_required), or None when a
        selection is invalid or a year that must be compared is not a whole number.
        output_names is a tuple (deforestation, degradation) for the oldgrowth and
        area-based options and a single string otherwise.
    """
    invalid_year_message = "\nError: Years must be whole numbers"

    # Header
    print("\n" + "="*50)
    print("\nDISTURBANCE SCENARIO SELECTOR\n")
    print("="*50 + "\n")

    # Methodology explainer
    print("AGBD loss from forest disturbance calculated by comparing scenarios.")
    print("Degradation: AGBD loss in pixels that remain forest.")
    print("Deforestation: AGBD loss from forest-to-nonforest transitions.")
    print("Total disturbance: deforestation + degradation.\n")
    print("Deforestation and total disturbance only available for:")
    print("  - Oldgrowth baseline (cumulative loss since intact forest)")
    print("  - Area-based scenarios (forecast loss from planned land use change)")
    print("Year-based counterfactuals output the effects of degradation only.\n")

    # Year of interest
    year = input("Enter year of interest: ").strip()
    print("\n")

    # Calculation type selection
    calc_prompt = (
        "Select the effect of disturbance to calculate:\n"
        "1. Since an oldgrowth state\n"
        "2. Since a baseline year\n"
        "3. In a multi-year interval\n"
        "4. In a single year\n"
        "5. In an area (polygon)\n\n"
        "Enter your choice (1-5): ")
    calc_type = input(calc_prompt).strip()

    # Initialise outputs
    scenario_pair = None
    scenarios_required = []
    output_names = None
    output_description = None

    # Option 1: Cumulative since oldgrowth
    # Uses disturbance_since_dictionary
    # Compares actual AGBD against oldgrowth counterfactual
    if calc_type == "1":
        actual = year
        counterfactual = f"{year}_no_disturbance_since_oldgrowth"
        scenario_pair = (actual, counterfactual)
        scenarios_required = [actual, counterfactual]
        output_names = (
            f"{year}_deforestation_since_oldgrowth",
            f"{year}_degradation_since_oldgrowth")
        output_description = (
            f"AGBD loss in {year} from cumulative disturbance since intact forest.\n"
            f"Comparison: actual {year} vs modelled {year} if no disturbance ever occurred.")

    # Option 2: Cumulative since baseline year
    # Uses disturbance_since_dictionary
    # Compares actual AGBD against no_disturbance counterfactual
    # Degradation only
    elif calc_type == "2":
        print("\n")
        baseline_year = input("Enter baseline year (disturbance occurring after this year will be measured): ").strip()
        year_number, baseline_number = _parse_year(year), _parse_year(baseline_year)
        if year_number is None or baseline_number is None:
            print(invalid_year_message)
            return None
        if baseline_number >= year_number:
            print("\nError: Baseline year must be before year of interest")
            return None
        since_year = str(baseline_number + 1)
        actual = year
        counterfactual = f"{year}_no_disturbance_since_{since_year}"
        scenario_pair = (actual, counterfactual)
        scenarios_required = [actual, counterfactual]
        output_names = f"{year}_degradation_since_{since_year}"
        output_description = (
            f"AGBD loss in {year} from degradation events since {baseline_year}.\n"
            f"Comparison: actual {year} vs modelled {year} if no disturbance since {since_year}.\n"
            f"Degradation only: deforestation requires oldgrowth baseline.")

    # Option 3: Multi-year interval
    # Uses degradation_interval_dictionary
    # Compares two counterfactuals to isolate interval effect
    # Degradation only
    elif calc_type == "3":
        print("\n")
        interval_prompt = (
            "Select interval baseline:\n"
            "1. From oldgrowth\n"
            "2. From a specific year\n\n"
            "Enter your choice (1-2): ")
        interval_type = input(interval_prompt).strip()
        print("\n")
        end_year = input("Enter interval end year: ").strip()
        year_number, end_number = _parse_year(year), _parse_year(end_year)
        if year_number is None or end_number is None:
            print(invalid_year_message)
            return None
        if end_number >= year_number:
            print("\nError: Interval end year must be before year of interest")
            return None
        end_since = str(end_number + 1)
        cf_recent = f"{year}_no_disturbance_since_{end_since}"
        if interval_type == "1":
            cf_baseline = f"{year}_no_disturbance_since_oldgrowth"
            scenario_pair = (cf_recent, cf_baseline)
            scenarios_required = [cf_recent, cf_baseline]
            output_names = f"{year}_degradation_from_oldgrowth_to_{end_year}"
            output_description = (
                f"AGBD loss in {year} from degradation events between oldgrowth and {end_year}.\n"
                f"Comparison: modelled {year} (no disturbance since {end_since}) vs modelled {year} (oldgrowth).\n"
                f"Isolates historical degradation; excludes events after {end_year}.")
        else:
            start_year = input("Enter interval start year: ").strip()
            start_number = _parse_year(start_year)
            if start_number is None:
                print(invalid_year_message)
                return None
            if start_number >= end_number:
                print("\nError: Start year must be before end year")
                return None
            cf_baseline = f"{year}_no_disturbance_since_{start_year}"
            scenario_pair = (cf_recent, cf_baseline)
            scenarios_required = [cf_recent, cf_baseline]
            output_names = f"{year}_degradation_from_{start_year}_to_{end_year}"
            output_description = (
                f"AGBD loss in {year} from degradation events between {start_year} and {end_year}.\n"
                f"Comparison: modelled {year} (no disturbance since {end_since}) vs modelled {year} (no disturbance since {start_year}).\n"
                f"Isolates interval effect; excludes events before {start_year} and after {end_year}.")

    # Option 4: Single year effect
    # Uses degradation_single_year_dictionary
    # Consecutive counterfactuals isolate effect of one year
    # Same-year case compares actual against no_disturbance_since_year
    # Degradation only
    elif calc_type == "4":
        print("\n")
        effect_year = input("Enter year whose effect to measure: ").strip()
        year_number, effect_number = _parse_year(year), _parse_year(effect_year)
        if year_number is None or effect_number is None:
            print(invalid_year_message)
            return None
        if effect_number > year_number:
            print("\nError: Effect year cannot be after year of interest")
            return None
        # Compare as integers so the same-year case cannot be missed through
        # formatting differences between the two answers.
        if effect_number == year_number:
            actual = year
            counterfactual = f"{year}_no_disturbance_since_{year}"
            scenario_pair = (actual, counterfactual)
            scenarios_required = [actual, counterfactual]
            output_description = (
                f"AGBD loss in {year} from degradation events in {effect_year}.\n"
                f"Comparison: actual {year} vs modelled {year} (no disturbance since {year}).\n"
                f"Isolates same-year degradation effect.")
        else:
            effect_year_plus1 = str(effect_number + 1)
            cf_after = f"{year}_no_disturbance_since_{effect_year_plus1}"
            cf_before = f"{year}_no_disturbance_since_{effect_year}"
            scenario_pair = (cf_after, cf_before)
            scenarios_required = [cf_after, cf_before]
            output_description = (
                f"AGBD loss in {year} from degradation events in {effect_year}.\n"
                f"Comparison: modelled {year} (no disturbance since {effect_year_plus1}) vs modelled {year} (no disturbance since {effect_year}).\n"
                f"Isolates single-year effect using consecutive counterfactuals.")
        output_names = f"{year}_effect_of_degradation_in_{effect_year}"

    # Option 5: Area-based (polygon)
    # Uses disturbance_area_dictionary
    # Compares polygon-based alternate scenario against actual
    elif calc_type == "5":
        print("\n")
        polygon_name = input("Enter polygon name (from .gpkg filename, without extension): ").strip()
        print("\n")
        area_type_prompt = (
            "Select polygon disturbance type:\n"
            "1. Deforestation (with degradation buffer)\n"
            "2. Degradation\n\n"
            "Enter your choice (1-2): ")
        area_type = input(area_type_prompt).strip()
        print("\n")
        year_affix = input("Enter year affix for polygon scenario: ").strip()
        if area_type == "1":
            print("\n")
            buffer_size = input("Enter buffer size in metres: ").strip()
            scenario_name = f"{year}_{polygon_name}_deforestation_{year_affix}_{buffer_size}m_degradation_buffer"
            output_base = f"{year}_deforestation_of_{polygon_name}_{year_affix}"
            output_description = (
                f"Forecast AGBD loss in {year} from deforestation of {polygon_name} ({year_affix}).\n"
                f"Comparison: modelled {year} (with deforestation + {buffer_size}m degradation buffer) vs actual {year}.\n"
                f"Deforestation: loss in cleared area. Degradation: loss in buffer zone.")
        else:
            scenario_name = f"{year}_{polygon_name}_degradation_{year_affix}"
            output_base = f"{year}_degradation_of_{polygon_name}_{year_affix}"
            output_description = (
                f"Forecast AGBD loss in {year} from degradation of {polygon_name} ({year_affix}).\n"
                f"Comparison: modelled {year} (with degradation scenario) vs actual {year}.")
        scenario_pair = (scenario_name, year)
        scenarios_required = [scenario_name, year]
        output_names = (
            f"{output_base}_deforestation",
            f"{output_base}_degradation")
    else:
        print("\nError: Invalid selection")
        return None

    # Display results
    result_text = []
    result_text.append("\n" + "="*50)
    result_text.append("\n" + output_description)
    result_text.append("\nOutput rasters:")
    if isinstance(output_names, tuple):
        for name in output_names: result_text.append(f"  {name}")
    else:
        result_text.append(f"  {output_names}")
    result_text.append(f"\nScenarios required:")
    for s in scenarios_required: result_text.append(f"  {s}")
    result_text.append("\n" + "="*50)
    print("\n\n")
    print("\n".join(result_text))
    return output_names, scenario_pair, scenarios_required


# Launch the interactive scenario selector when the tool is switched on.
# With Alpha Earth features only a notice is printed, as alternate scenarios cannot be created.
if use_tool and alpha_earth:
    print("Alternate scenarios cannot be created with Alpha Earth features.")
elif use_tool:
    select_disturbance_scenarios()

## No disturbance since...

In [None]:
# These alternate scenarios remove both degradation and deforestation for specific time ranges.
# They are used to calculate the effect of degradation and deforestation on forest AGBD.
# It does not attempt to model loss of forest to water (e.g. reservoirs), as these often pre-date
# available satellite data (especially topographic), and there is no management application,
# i.e. it's unlikely the reservoir will ever be restored to forest.

# Define ranges for 'no disturbance' scenarios
define_no_disturbance_scenarios = True

# No disturbance ranges as tuples of (start_year, end_year)
no_disturbance_ranges = [
    (1993, 2021),
    (1996, 2024),
    (1997, 2024),
    (1998, 2024),
    (1999, 2024),
    (2000, 2024),
    (2001, 2024),
    (2002, 2024),
    (2003, 2024),
    (2004, 2024),
    (2005, 2024),
    (2006, 2024),
    (2007, 2024),
    (2008, 2024),
    (2009, 2024),
    (2010, 2024),
    (2011, 2024),
    (2012, 2024),
    (2013, 2024),
    (2014, 2024),
    (2015, 2024),
    (2016, 2024),
    (2017, 2024),
    (2018, 2024),
    (2019, 2024),
    (2020, 2024),
    (2021, 2024),
    (2022, 2024),
    (2023, 2024),
    (2024, 2024),
]

# Create a feature list for 'no disturbance' scenarios.
# One CSV is written per (start_year, end_year) range, named
# '{end_year}_no_disturbance_since_{start_year}.csv' in the scenarios model directory.
if define_no_disturbance_scenarios:
  if alpha_earth: print("Alternate scenarios cannot be created with Alpha Earth features.")
  else:
    for start_year, end_year in no_disturbance_ranges:
      assert end_year <= last_feature_year, "End years must be at or before the last feature year."
      assert end_year >= minimum_yearly_scenario, "End years must be at or after the minimum yearly scenario."
      assert start_year >= first_feature_year, "Start years must be at or after the first feature year."
      assert start_year >= end_year - model_scenario_year_range, "Start years must be within the model scenario range of the end year."
      assert start_year <= end_year, "The start year must be less than or equal to the end year."

      # Determine base features based on the end year of the range
      scenario_features_csv = join(scenarios_model_dir, f"{end_year}.csv")
      base_features = [f.split('/')[-1] for f in pd.read_csv(scenario_features_csv).iloc[:,0]]

      # Yearly features dated from the start year onwards are swapped for a stand-in:
      # - disturbance features become the 'minimum' constant features (i.e. no disturbance),
      # - forest features are frozen at the last year before the range starts.
      # Dict order matters: the first matching key wins, mirroring an if / elif chain.
      # NOTE(review): when start_year equals first_feature_year, the forest stand-in refers to
      # the year before the first feature year - confirm that feature exists for locate_feature.
      forest_year = start_year - 1
      feature_replacements = {
          "disturbance_edge_distance": "minimum_edge_distance",
          "disturbance_local_density": "minimum_local_density",
          "forest_edge_distance": f"forest_edge_distance_{forest_year}",
          "forest_local_density": f"forest_local_density_{forest_year}",
      }

      no_disturbance_features = []
      for scenario_feature in base_features:
        replacement = next((new for key, new in feature_replacements.items() if key in scenario_feature), None)
        # The year is parsed from the last four characters of a matching feature name
        if replacement is not None and int(scenario_feature[-4:]) >= start_year:
          no_disturbance_features.append(replacement)
        else: no_disturbance_features.append(scenario_feature)
      # Locate features in final feature directories
      no_disturbance_features = locate_feature(no_disturbance_features, covariates_renamed)

      no_disturbance_scenario_filename = f"{end_year}_no_disturbance_since_{start_year}.csv"
      no_disturbance_scenario_path = join(scenarios_model_dir, no_disturbance_scenario_filename)
      pd.DataFrame(no_disturbance_features).to_csv(no_disturbance_scenario_path, index=False)
      print(f"Feature list for a scenario without disturbance between {start_year} and {end_year} exported to {no_disturbance_scenario_filename}.")
else:
  print("The 'no disturbance' scenarios are not enabled.")

In [None]:
# Old-growth alternate scenarios: a user-specified proxy area stands in for old-growth forest.
# Forest extent (i.e. 'no deforestation') can come from a yearly feature, or from all historic / potential forest area.
# A second version of the scenario without the old-growth proxy is created for comparison.
# In rare cases an area with unexpectedly high AGBD gets a lower estimate with the proxy;
# the highest estimate is then used for that pixel.

define_no_disturbance_since_oldgrowth = True

# Print a ready-to-paste 'oldgrowth_redundant_features' list of land-use base names.
# All but one entry are 'redundant'; one should be chosen as 'oldgrowth_feature'.
# Both the _edge_distance and _local_density variants of each base name will be modified.
print("oldgrowth_redundant_features = [")
seen_bases = set()
for feature in model_features:
    # Only land-use features are of interest
    if "lu_" not in feature:
        continue
    # Strip the edge-effect suffix to recover the base name; skip features with neither suffix
    if "_edge_distance" in feature:
        base_name = feature.replace("_edge_distance", "")
    elif "_local_density" in feature:
        base_name = feature.replace("_local_density", "")
    else:
        continue
    # Each base name is printed once, in order of first appearance
    if base_name in seen_bases:
        continue
    seen_bases.add(base_name)
    print(f'  "{base_name}",')
print("]")

In [None]:
# Show the current contents of the most recent scenario's feature list for reference
# (a bare name as the last line of a notebook cell displays its value).
most_recent_scenario_features

In [None]:
# Base name of the feature that best indicates oldgrowth to the model.
# Both _edge_distance and _local_density variants will be modified.
oldgrowth_feature = 'lu_old-growth_protected_areas'

# Base names of features that may confound the old-growth proxy.
# Both _edge_distance and _local_density variants will be removed for the old-growth scenarios.
oldgrowth_redundant_features = [
    "lu_ais",
    "lu_berkelah_jerantut",
    "lu_berkelah_kuantan",
    "lu_berkelah_temerloh",
    "lu_old-growth_protected_areas",
    "lu_remen_chereh",
    "lu_tekai_tembeling",
    "lu_tekam",
    "lu_yong_lipis",
    "lu_yong",
]

# Should be set to the year(s) of interest
# Given fluctuations in rivers or lake extents etc.
oldgrowth_land_years = [
    2021,
    last_feature_year
]

for year in oldgrowth_land_years:
  assert year in final_feature_years, "Years in 'oldgrowth_land_years' must be available in the final yearly features."

# This is the prefix in the binary edge effects directory for the maximum forest extent
# By default it is 'land'(according to TMF data), assuming all land in the study area
# was once forest. Upload a different extent to binary rasters if this was not the case,
# then run the edge effects section and change this prefix.
oldgrowth_forest_extent = "land"


# Reduce feature paths to bare feature names (re-running this is harmless)
most_recent_scenario_features = [f.split('/')[-1] for f in most_recent_scenario_features]
# str.startswith accepts a tuple, so the redundant prefixes are tested in one call
redundant_prefixes = tuple(oldgrowth_redundant_features)

# For each year two feature lists are built from the most recent scenario:
#   list 1: no disturbance, maximum forest extent, and the old-growth proxy applied
#           (old-growth feature set to the 'maximum' constants, other listed land uses to the 'minimum' constants)
#   list 2: no disturbance and maximum forest extent only (land-use features untouched), for comparison
if define_no_disturbance_since_oldgrowth:
  if alpha_earth: print("Alternate scenarios cannot be created with Alpha Earth features.")
  else:
    for year in oldgrowth_land_years:
      no_disturbance_since_oldgrowth_name = f"{year}_no_disturbance_since_oldgrowth"
      oldgrowth_all_features_1 = []
      oldgrowth_all_features_2 = []
      for scenario_feature in most_recent_scenario_features:
        # Default: the feature is kept unchanged in both lists. Setting this first guarantees
        # that a land-use feature with neither an edge_distance nor a local_density variant
        # cannot inherit the replacement chosen for the previous feature.
        feature_1 = feature_2 = scenario_feature
        if "disturbance_edge_distance" in scenario_feature:
          feature_1 = feature_2 = "minimum_edge_distance"
        elif "disturbance_local_density" in scenario_feature:
          feature_1 = feature_2 = "minimum_local_density"
        elif "forest_edge_distance" in scenario_feature:
          feature_1 = feature_2 = f"{oldgrowth_forest_extent}_edge_distance_{year}"
        elif "forest_local_density" in scenario_feature:
          feature_1 = feature_2 = f"{oldgrowth_forest_extent}_local_density_{year}"
        # The old-growth feature is tested before the redundant list, which also contains it
        elif scenario_feature.startswith(oldgrowth_feature):
          if 'edge_distance' in scenario_feature: feature_1 = "maximum_edge_distance" # LU interior max
          if 'local_density' in scenario_feature: feature_1 = "maximum_local_density"
        elif scenario_feature.startswith(redundant_prefixes):
          if 'edge_distance' in scenario_feature: feature_1 = "minimum_edge_distance" # LU exterior min (neg)
          if 'local_density' in scenario_feature: feature_1 = "minimum_local_density"
        oldgrowth_all_features_1.append(feature_1)
        oldgrowth_all_features_2.append(feature_2)
      # Locate features in final feature directories
      oldgrowth_all_features_1 = locate_feature(oldgrowth_all_features_1, covariates_renamed)
      oldgrowth_all_features_2 = locate_feature(oldgrowth_all_features_2, covariates_renamed)

      # Compare feature lists and save appropriate CSVs.
      if oldgrowth_all_features_1 == oldgrowth_all_features_2:
        filename = f"{no_disturbance_since_oldgrowth_name}_1.csv"
        pd.DataFrame(oldgrowth_all_features_1).to_csv(join(scenarios_model_dir, filename), index=False)
        print(f"Feature lists were identical, only saved {filename}")
      else:
        for suffix, features in [("1", oldgrowth_all_features_1), ("2", oldgrowth_all_features_2)]:
          filename = f"{no_disturbance_since_oldgrowth_name}_{suffix}.csv"
          pd.DataFrame(features).to_csv(join(scenarios_model_dir, filename), index=False)
        print(f"Feature lists for {no_disturbance_since_oldgrowth_name} have been exported to {no_disturbance_since_oldgrowth_name}_1.csv and {no_disturbance_since_oldgrowth_name}_2.csv.\n")

else: print("Old-growth scenarios are not enabled.")

## Area-based disturbance

In [None]:
# Use polygons to select areas for alternate scenarios of area-based disturbance
define_area_based_disturbance = True

# Set the edge effect distance for the alternate scenario features
# This should match the distance used in '3_features_lcluc.ipynb'.
edge_effect_distance = 120

# Print a ready-to-paste 'disturbance_polygons' dictionary with one entry per candidate
# polygon found in the polygons directory. Each entry defaults to deforestation over the
# full available year range, with the last feature year as the alternate scenario year.
if define_area_based_disturbance:
  if alpha_earth: print("Alternate scenarios cannot be created with Alpha Earth features.")
  else:
    print("# Modify this dictionary by:")
    print("# 1) Commenting out any polygons not being used for disturbance.")
    print("# 2) Changing the type from 'deforestation' to 'degradation' if necessary.")
    print("# 3) Changing the tuple years from 'range' to 'discrete' to specify individual years.")
    print("# 4) If years are discrete, add one or more. If a range, add the start and end year.")
    print("# 5) Changing the alternate scenario year for each area if needed.")
    print("# 6) Copy and paste lines for multiple scenarios with the same area (different disturbance types, different years).")
    print("# Remember to ensure all keys are unique - if you copy and paste, manually change the keys.\n")

    # Exclude existing polygons from search
    # (any filename containing 'buffered' or 'inverse' is skipped separately below)
    polygons_to_exclude = ['project_area.gpkg', 'gedi_area.gpkg', 'template.gpkg']
    exclude_lu_polygons = False
    polygon_extension = '.gpkg'

    print("disturbance_polygons = {")
    index = 1
    first_disturbance_year = last_feature_year - model_scenario_year_range
    for polygon in sorted(os.listdir(polygons_dir)):
      # Only GeoPackages are valid areas; other files would otherwise be listed with a mangled name
      if not polygon.endswith(polygon_extension): continue
      if polygon in polygons_to_exclude or 'inverse' in polygon or 'buffered' in polygon: continue
      if exclude_lu_polygons and 'lu_' in polygon: continue
      polygon_name = polygon[:-len(polygon_extension)]
      print(f"    {index}: ['{polygon_name}', 'deforestation','range', ({first_disturbance_year}, {last_feature_year}), {last_feature_year}],")
      index += 1
    print("}\n")

else: print("Area-based disturbance scenarios are not enabled.")

In [None]:
disturbance_polygons = {
    3: ['road_mat_daling', 'deforestation','range', (2023, 2024), 2024],
}

# The alternate year is set to 2024.
# The years for alternate area-based disturbance can be between 1996 and 2024

# Modify this dictionary by:
# 1) Commenting out any polygons not being used for disturbance.
# 2) Changing the type from 'deforestation' to 'degradation' if necessary.
# 3) Changing the tuple years from 'range' to 'discrete' to specify individual years.
# 4) If years are discrete, add one or more. If a range, add the start and end year.
# 5) Changing the alternate scenario year for each area if needed.
# 6) Copy and paste lines for multiple scenarios with the same area (different disturbance types, different years).
# Remember to ensure all keys are unique - if you copy and paste, manually change the keys.

# Each entry is [polygon_name, disturbance_type, year_type, years, alternate_scenario_year].
# After validation every entry is simplified to
# [polygon_name, disturbance_type, list_of_years, alternate_scenario_year].
if define_area_based_disturbance:

    if alpha_earth: print("Alternate scenarios cannot be created with Alpha Earth features.")
    else:
      # Collect simplified entries separately so the dictionary is only replaced
      # once every entry has passed validation.
      validated_polygons = {}

      # Validate disturbance types, year types and available years.
      for area_index, value in disturbance_polygons.items():
          polygon_name, disturbance_type, year_type, years_data, alternate_scenario_year = value[:5]

          # Calculate first available disturbance year for this area's alternate scenario year
          first_disturbance_year = alternate_scenario_year - model_scenario_year_range

          # Validate alternate scenario year
          assert alternate_scenario_year >= minimum_yearly_scenario, f"Alternate scenario year for {polygon_name} must be between {minimum_yearly_scenario} and {last_feature_year}."
          assert alternate_scenario_year <= last_feature_year, f"Alternate scenario year for {polygon_name} must be between {minimum_yearly_scenario} and {last_feature_year}."
          # Validate disturbance types and year types
          assert disturbance_type in ['deforestation', 'degradation'], f"Disturbance type for {polygon_name} must be 'deforestation' or 'degradation'."
          if year_type == 'range':
            start_year, end_year = years_data
            assert start_year <= end_year, f"The start year for {polygon_name} {disturbance_type} must be at or before the end year."

          # Validate deforestation constraints
          if disturbance_type == 'deforestation':
            # The year type is checked first, so start_year / end_year are always set below
            assert year_type == 'range', f"Year type for {polygon_name} deforestation must be 'range'."
            assert end_year == alternate_scenario_year, f"Deforestation in {polygon_name} must end in the alternate scenario year {alternate_scenario_year}. Deforestation is considered permanent land-cover change."
            assert start_year >= first_disturbance_year, f"The start year for deforestation in {polygon_name} must be >= the first available disturbance year {first_disturbance_year}."
            all_years = list(range(start_year, end_year + 1))

          # Validate degradation constraints
          if disturbance_type == 'degradation':
            assert year_type in ['range', 'discrete'], f"Year type for {polygon_name} degradation must be 'range' or 'discrete'."
            if year_type == 'range':
                all_years = list(range(start_year, end_year + 1))
            # A single discrete year written as (2023) is a plain int rather than a tuple
            elif isinstance(years_data, int): all_years = [years_data]
            else: all_years = list(years_data)
            # An empty list would break the year affix logic when the scenario is built
            assert all_years, f"At least one year must be given for {polygon_name} degradation."
            for year in all_years:
                assert year <= alternate_scenario_year, f"Years for {polygon_name} degradation (check {year}) must be <= the alternate scenario year {alternate_scenario_year}."
                assert year >= first_disturbance_year, f"Years for {polygon_name} degradation (check {year}) must be >= the first available disturbance year {first_disturbance_year}."

          # Simplify dictionary entry
          validated_polygons[area_index] = [polygon_name, disturbance_type, all_years, alternate_scenario_year]

      disturbance_polygons = validated_polygons
      print("The 'disturbance_polygons' dictionary is valid.")
else: print("Area-based disturbance scenarios are not enabled.")

In [None]:
# Build the alternate feature rasters and scenario feature lists for every validated
# entry in 'disturbance_polygons' ([polygon_name, disturbance_type, years, alternate_scenario_year]).
# - degradation: the polygon is burned as '1' into the yearly disturbance binary rasters.
# - deforestation: the (optionally buffered) polygon is burned as '1' into the disturbance binary rasters,
#   the unbuffered polygon is burned as '0' into the forest binary rasters, and a forest mask is created.
# Edge effect features are recalculated from the burned rasters and cached in the alternate feature directory.
if define_area_based_disturbance:
    if alpha_earth: print("Alternate scenarios cannot be created with Alpha Earth features.")
    else:
      # Define the projects CRS to check the area polygon matches
      crs_epsg = 4326
      # This setting buffers any deforestation area to add degradation around it.
      # At least 1 pixel distance (e.g. 30 m) might be realistic. Otherwise set to None.
      # NOTE(review): with None, the cached deforestation disturbance rasters are named
      # '..._Nonem_buffer' while the scenario is named '..._0m_degradation_buffer'.
      buffer_distance_metres = 30

      # Define a temporary directory for copying binary rasters and burning the area's polygon
      binary_temp_dir = join(scenarios_model_dir, 'binary_temp')
      makedirs(binary_temp_dir, exist_ok=True)

      # Calculate progress totals before processing
      total_areas = len(disturbance_polygons)
      total_rasters = 0

      for area_index, parameters in disturbance_polygons.items():
          polygon_name = parameters[0]
          disturbance_type = parameters[1]
          disturbance_years = parameters[2]

          # Raster counting: degradation = years x 2, deforestation = years x 4 + 1
          # (edge_distance + local_density for disturbance, plus the same for forest, plus one mask)
          if disturbance_type == 'degradation': area_raster_count = len(disturbance_years) * 2
          elif disturbance_type == 'deforestation': area_raster_count = len(disturbance_years) * 4 + 1
          total_rasters += area_raster_count

      # Progress indicators
      area_progress_index, area_progress_label = 0, widgets.Label(value=f"Area progress: 0 / {total_areas}")
      display(area_progress_label)
      raster_progress_index, raster_progress_label = 0, widgets.Label(value=f"Raster progress: 0 / {total_rasters}")
      display(raster_progress_label)

      for area_index, parameters in disturbance_polygons.items():

          # Extract alternate area-based disturbance parameters
          area_disturbance_features = []
          polygon_name = parameters[0]
          disturbance_type = parameters[1]
          disturbance_years = parameters[2]
          alternate_scenario_year = parameters[3]

          # Determine base features by the alternate scenario's year for this area
          alternate_year_scenario_csv = join(scenarios_model_dir, f"{alternate_scenario_year}.csv")
          base_features = [f.split('/')[-1] for f in pd.read_csv(alternate_year_scenario_csv).iloc[:,0]]

          # Define area polygon
          area_polygon_path = join(polygons_dir, f"{polygon_name}.gpkg")
          if disturbance_type == 'deforestation' and buffer_distance_metres:
            area_buffered_path = join(polygons_dir, f"{polygon_name}_buffered_{buffer_distance_metres}.gpkg")
            if not exists(area_buffered_path):
              area_polygon = gpd.read_file(area_polygon_path)
              # Stop here rather than continue with a buffered polygon that was never created
              if area_polygon.crs.to_epsg() != crs_epsg:
                raise ValueError(f"Reproject {polygon_name}.gpkg to EPSG:{crs_epsg}.")
              # Suppress warning about not being a geographic CRS, as we account for this.
              # However larger buffers or project areas near the poles might still need to be converted.
              # The suppression is scoped so other UserWarnings in the session stay visible.
              with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=UserWarning)
                # Get the centroid of the project polygon
                area_polygon_centroid = area_polygon.centroid.values[0]
                # Convert the buffer distance from meters to decimal degrees based on the location at the centroid
                buffer_distance_degrees = buffer_distance_metres / (111320 * abs(math.cos(math.radians(area_polygon_centroid.y))))
                # Buffer the polygon
                area_polygon_buffered = area_polygon.buffer(buffer_distance_degrees)
              # Save the buffered polygon
              gdf = gpd.GeoDataFrame(geometry=area_polygon_buffered, crs=f"EPSG:{crs_epsg}")
              gdf.to_file(area_buffered_path, driver='GPKG')
              print(f"Buffered the project area to {buffer_distance_metres} and exported to the polygons directory.")
          else: area_buffered_path = None

          # Name affix shared by this area's alternate disturbance features
          if disturbance_type == 'deforestation': disturbance_affix = f"{polygon_name}_deforestation_{buffer_distance_metres}m_buffer"
          else: disturbance_affix = f"{polygon_name}_degradation"

          for scenario_feature in base_features:
              # Handle disturbance edge_distance features
              if "disturbance_edge_distance" in scenario_feature:
                  scenario_feature_year = int(scenario_feature[-4:])
                  if scenario_feature_year in disturbance_years:
                    # Define feature names for both edge_distance and local_density
                    distance_name = f"disturbance_edge_distance_{scenario_feature_year}_{disturbance_affix}"
                    density_name = f"disturbance_local_density_{scenario_feature_year}_{disturbance_affix}"
                    distance_path = join(feature_alternate_dir, f"{distance_name}.tif")
                    density_path = join(feature_alternate_dir, f"{density_name}.tif")

                    if not exists(distance_path) or not exists(density_path):
                      # Copy the disturbance binary raster for burning '1' to the polygon area
                      binary_raster_name = f"disturbance_binary_{scenario_feature_year}.tif"
                      binary_raster_path = join(feature_binary_dir, binary_raster_name)
                      binary_raster_temp_path = join(binary_temp_dir, binary_raster_name)
                      copyfile(binary_raster_path, binary_raster_temp_path)
                      if area_buffered_path: burn_polygon_to_raster(binary_raster_temp_path, area_buffered_path, fixed_value=1, all_touched=True)
                      else: burn_polygon_to_raster(binary_raster_temp_path, area_polygon_path, fixed_value=1, all_touched=True)
                      # Apply edge effects and export both arrays
                      binary_burned = gdal.Open(binary_raster_temp_path)
                      binary_burned_array = binary_burned.ReadAsArray()
                      binary_burned = None
                      distance_array, density_array = edge_effects(binary_burned_array, 'binary', cell_size_x_path, cell_size_y_path, edge_effect_distance)
                      if not exists(distance_path): export_array_as_tif(distance_array, distance_path)
                      if not exists(density_path): export_array_as_tif(density_array, density_path)

                    area_disturbance_features.append(distance_name)
                    raster_progress_index += 2
                    raster_progress_label.value = f"Raster progress: {raster_progress_index} / {total_rasters}"
                  else: area_disturbance_features.append(scenario_feature)

              # Handle disturbance local_density features
              elif "disturbance_local_density" in scenario_feature:
                  scenario_feature_year = int(scenario_feature[-4:])
                  if scenario_feature_year in disturbance_years:
                    # Feature was created when processing edge_distance
                    density_name = f"disturbance_local_density_{scenario_feature_year}_{disturbance_affix}"
                    area_disturbance_features.append(density_name)
                  else: area_disturbance_features.append(scenario_feature)

              # Handle forest edge_distance features (deforestation only)
              elif "forest_edge_distance" in scenario_feature:
                  if disturbance_type == 'deforestation':
                      scenario_feature_year = int(scenario_feature[-4:])
                      if scenario_feature_year in disturbance_years:
                        # Define feature names for both edge_distance and local_density
                        distance_name = f"forest_edge_distance_{scenario_feature_year}_{polygon_name}_deforestation"
                        density_name = f"forest_local_density_{scenario_feature_year}_{polygon_name}_deforestation"
                        distance_path = join(feature_alternate_dir, f"{distance_name}.tif")
                        density_path = join(feature_alternate_dir, f"{density_name}.tif")

                        if not exists(distance_path) or not exists(density_path):
                          # Copy the forest binary raster for burning '0' to the polygon area
                          binary_raster_name = f"forest_binary_{scenario_feature_year}.tif"
                          binary_raster_path = join(feature_binary_dir, binary_raster_name)
                          binary_raster_temp_path = join(binary_temp_dir, binary_raster_name)
                          copyfile(binary_raster_path, binary_raster_temp_path)
                          burn_polygon_to_raster(binary_raster_temp_path, area_polygon_path, fixed_value=0, all_touched=True)
                          # Apply edge effects and export both arrays
                          binary_burned = gdal.Open(binary_raster_temp_path)
                          binary_burned_array = binary_burned.ReadAsArray()
                          binary_burned = None
                          distance_array, density_array = edge_effects(binary_burned_array, 'binary', cell_size_x_path, cell_size_y_path, edge_effect_distance)
                          if not exists(distance_path): export_array_as_tif(distance_array, distance_path)
                          if not exists(density_path): export_array_as_tif(density_array, density_path)

                        area_disturbance_features.append(distance_name)
                        raster_progress_index += 2
                        raster_progress_label.value = f"Raster progress: {raster_progress_index} / {total_rasters}"
                      else: area_disturbance_features.append(scenario_feature)
                  else: area_disturbance_features.append(scenario_feature)

              # Handle forest local_density features (deforestation only)
              elif "forest_local_density" in scenario_feature:
                  if disturbance_type == 'deforestation':
                      scenario_feature_year = int(scenario_feature[-4:])
                      if scenario_feature_year in disturbance_years:
                        # Feature was created when processing edge_distance
                        density_name = f"forest_local_density_{scenario_feature_year}_{polygon_name}_deforestation"
                        area_disturbance_features.append(density_name)
                      else: area_disturbance_features.append(scenario_feature)
                  else: area_disturbance_features.append(scenario_feature)
              else: area_disturbance_features.append(scenario_feature)

          # Locate features in final feature directories
          area_disturbance_features = locate_feature(area_disturbance_features, covariates_renamed)

          # Add name affix based on whether years are a range or discrete
          if disturbance_type == "deforestation": year_affix = f'_{min(disturbance_years)}'
          else:
            if len(disturbance_years) != (max(disturbance_years) - min(disturbance_years) + 1):
                # Non-contiguous years: collapse consecutive runs, e.g. 2019-2021_2023
                sorted_years = sorted(disturbance_years)
                parts, start = [], sorted_years[0]
                for i, year in enumerate(sorted_years[1:] + [None], 1):
                    if year != sorted_years[i-1] + 1:
                        end = sorted_years[i-1]
                        parts.append(f"{start}-{end}" if start != end else str(start))
                        start = year
                year_affix = "_" + "_".join(parts)
            else: year_affix = f'_{min(disturbance_years)}-{max(disturbance_years)}'

          if disturbance_type == 'deforestation':
            if buffer_distance_metres:
              area_disturbance_scenario_name = f"{alternate_scenario_year}_{polygon_name}_deforestation{year_affix}_{buffer_distance_metres}m_degradation_buffer"
            else: area_disturbance_scenario_name = f"{alternate_scenario_year}_{polygon_name}_deforestation{year_affix}_0m_degradation_buffer"
            # Create a new forest mask for the area-based disturbance scenario
            mask_raster_path = join(masks_dir, f"mask_forest_{alternate_scenario_year}_{polygon_name}_deforestation.tif")
            if not exists(mask_raster_path):
              # Ensure original forest binary is copied to temp and burned with polygon
              scenario_year_forest_binary_path = join(binary_temp_dir, f"forest_binary_{alternate_scenario_year}.tif")
              forest_binary_source = join(feature_binary_dir, f"forest_binary_{alternate_scenario_year}.tif")
              copyfile(forest_binary_source, scenario_year_forest_binary_path)
              burn_polygon_to_raster(scenario_year_forest_binary_path, area_polygon_path, fixed_value=0, all_touched=True)
              # Create mask from burned forest data
              scenario_year_forest_binary = gdal.Open(scenario_year_forest_binary_path)
              scenario_year_forest_binary_array = scenario_year_forest_binary.ReadAsArray()
              scenario_year_forest_binary = None
              mask_array = np.where(scenario_year_forest_binary_array == 0, nodatavalue, 1)
              export_array_as_tif(mask_array, mask_raster_path)
              print(f"A mask raster has been created for {area_disturbance_scenario_name}.")
            raster_progress_index += 1
            raster_progress_label.value = f"Raster progress: {raster_progress_index} / {total_rasters}"

          else: area_disturbance_scenario_name = f"{alternate_scenario_year}_{polygon_name}_degradation{year_affix}"

          # Clear temporary binary raster folder
          for temp_file in os.listdir(binary_temp_dir): os.remove(join(binary_temp_dir, temp_file))

          # Export the alternate area-based disturbance scenario
          no_degradation_scenario_path = join(scenarios_model_dir, f"{area_disturbance_scenario_name}.csv")
          pd.DataFrame(area_disturbance_features).to_csv(no_degradation_scenario_path, index=False)
          print(f"Feature list for {area_disturbance_scenario_name} has been exported.")

          # Update area progress
          area_progress_index += 1
          area_progress_label.value = f"Area progress: {area_progress_index} / {total_areas}"

      print("\nAlternate area-based disturbance scenarios complete.")
      # Delete the temporary directory, including anything left over from an interrupted run
      rmtree(binary_temp_dir)

else: print("Area-based disturbance scenarios are not enabled.")

## Recovered to oldgrowth

In [None]:
# Old-growth recovery: simulates edge effects on oldgrowth forest within an actual forest extent.
# Only recovery is assumed; there is no forest regrowth from non-forest land-cover.
define_oldgrowth_recovery = True

# Print a ready-to-paste 'oldgrowth_redundant_features' list of land-use base names.
# All but one entry are 'redundant'; one should be chosen as 'oldgrowth_feature'.
# Both the _edge_distance and _local_density variants of each base name will be modified.
print("oldgrowth_redundant_features = [")
seen_bases = set()
for feature in model_features:
    # Only land-use features that carry one of the two edge-effect suffixes are listed
    if "lu_" in feature and ("_edge_distance" in feature or "_local_density" in feature):
        # The edge distance suffix takes precedence when choosing what to strip
        if "_edge_distance" in feature:
            base_name = feature.replace("_edge_distance", "")
        else:
            base_name = feature.replace("_local_density", "")
        # Each base name is printed once, in order of first appearance
        if base_name not in seen_bases:
            seen_bases.add(base_name)
            print(f'  "{base_name}",')
print("]")

In [None]:
# Base name of the feature that best indicates oldgrowth to the model.
# Both _edge_distance and _local_density variants will be modified.
oldgrowth_feature = 'lu_old-growth_protected_areas'

# Base names of features that may confound the old-growth proxy.
# In the '_1' scenario their _edge_distance and _local_density variants are
# replaced by the minimum constants; the '_2' scenario keeps them unchanged.
# 'oldgrowth_feature' may also be listed here: it is matched first in the
# loop below, so it is never treated as redundant.
oldgrowth_redundant_features = [
    "lu_ais",
    "lu_berkelah_jerantut",
    "lu_berkelah_kuantan",
    "lu_berkelah_temerloh",
    "lu_old-growth_protected_areas",
    "lu_remen_chereh",
    "lu_tekai_tembeling",
    "lu_tekam",
    "lu_yong_lipis",
    "lu_yong",
]

# Set to the year(s) of interest to use that forest extent
oldgrowth_recovery_years = [
    2021,
    last_feature_year
]

for year in oldgrowth_recovery_years:
  assert year in final_feature_years, "Years in 'oldgrowth_recovery_years' must be available in the final yearly features."

# Generate 'oldgrowth' scenarios for each specified year (i.e. forest extent in that year).
# Two feature lists are built per year:
#   _1: land-use features are overridden with constants (see below)
#   _2: land-use features are kept as they are
if define_oldgrowth_recovery:
    for year in oldgrowth_recovery_years:
      oldgrowth_recovery_features_1 = []
      oldgrowth_recovery_features_2 = []
      # Number of years the yearly forest features are shifted back by
      old_growth_scenario_year_diff = last_feature_year - year
      for scenario_feature in most_recent_scenario_features:
        # Default for both lists: keep the feature unchanged. Setting this first
        # guarantees a land-use feature that is neither an edge distance nor a
        # local density variant cannot inherit 'feature_1' from the previous feature.
        feature_1 = feature_2 = scenario_feature
        if "disturbance_edge_distance" in scenario_feature:
          feature_1 = feature_2 = "minimum_edge_distance"
        elif "disturbance_local_density" in scenario_feature:
          feature_1 = feature_2 = "minimum_local_density"
        elif "forest_edge_distance" in scenario_feature:
          # Shift the year suffix back, but never before the first feature year
          new_year = max(int(scenario_feature[-4:]) - old_growth_scenario_year_diff, first_feature_year)
          feature_1 = feature_2 = f"forest_edge_distance_{new_year}"
        elif "forest_local_density" in scenario_feature:
          new_year = max(int(scenario_feature[-4:]) - old_growth_scenario_year_diff, first_feature_year)
          feature_1 = feature_2 = f"forest_local_density_{new_year}"
        elif scenario_feature.startswith(oldgrowth_feature):
          # local_density is tested first so it takes precedence if both substrings are present
          if 'local_density' in scenario_feature: feature_1 = "maximum_local_density"
          elif 'edge_distance' in scenario_feature: feature_1 = "maximum_edge_distance" # LU interior max
        elif any(scenario_feature.startswith(rf) for rf in oldgrowth_redundant_features):
          if 'local_density' in scenario_feature: feature_1 = "minimum_local_density"
          elif 'edge_distance' in scenario_feature: feature_1 = "minimum_edge_distance" # LU exterior min (neg)
        oldgrowth_recovery_features_1.append(feature_1)
        oldgrowth_recovery_features_2.append(feature_2)
      # Locate features in final feature directories
      oldgrowth_recovery_features_1 = locate_feature(oldgrowth_recovery_features_1, covariates_renamed)
      oldgrowth_recovery_features_2 = locate_feature(oldgrowth_recovery_features_2, covariates_renamed)

      # Compare feature lists and save appropriate CSVs.
      if oldgrowth_recovery_features_1 == oldgrowth_recovery_features_2:
        filename = f"{year}_oldgrowth_recovery_1.csv"
        pd.DataFrame(oldgrowth_recovery_features_1).to_csv(join(scenarios_model_dir, filename), index=False)
        print(f"Feature lists were identical, only saved {filename}")
      else:
        for suffix, features in [("1", oldgrowth_recovery_features_1), ("2", oldgrowth_recovery_features_2)]:
          filename = f"{year}_oldgrowth_recovery_{suffix}.csv"
          pd.DataFrame(features).to_csv(join(scenarios_model_dir, filename), index=False)
        print(f"Feature lists for scenarios where all forest in {year} becomes old-growth")
        print(f"have been exported to {year}_oldgrowth_recovery_1.csv and {year}_oldgrowth_recovery_2.csv.\n")

# Feature verification

In [None]:
# Check all features in scenario .csvs exist
scenario_csv_list = []
all_features_exist = True # Changes to false if feature missing
for csv_name in os.listdir(scenarios_model_dir):
  if not csv_name.endswith('.csv'): continue
  csv_dir = join(scenarios_model_dir, csv_name)
  csv_feature_list = pd.Series.tolist(pd.read_csv(csv_dir).iloc[:,0])
  # Covariates are not checked here (see the note printed below)
  csv_feature_dir_list = [f"{features_dir}/{csv_feature}.tif" for csv_feature in csv_feature_list
                          if csv_feature not in covariates]
  for feature in csv_feature_dir_list:
    if not exists(feature):
      all_features_exist = False
      print(f"The following feature is missing:\n{feature}\n and is required for the scenario '{csv_name[:-4]}'")

if all_features_exist: print("All required features are present.")
print("Covariate features e.g. 'beam' and 'sensitivity' will be added at the prediction stage.")

# Check all features against template dimensions
# Find all features in all scenarios for tiling
feature_paths = set()
for csv_name in os.listdir(scenarios_model_dir):
    if csv_name.endswith('.csv'):
        features = pd.read_csv(join(scenarios_model_dir, csv_name)).iloc[:,0]
        for feature in features:
            feature_path = join(features_dir, f"{feature}.tif")
            feature_paths.add(feature_path)
feature_paths = list(feature_paths)

if not alpha_earth: # Higher resolution than template
  scenario_template = gdal.Open(template_tif_path)
  scenario_template_dimensions, scenario_template_projection = scenario_template.GetGeoTransform(), scenario_template.GetProjection()
  scenario_template = None
  feature_issue = False
  for feature_path in feature_paths:
    # Files that do not exist cannot be compared; gdal.Open would raise because
    # gdal.UseExceptions() is enabled. Missing features are reported above.
    if not exists(feature_path): continue
    feature = gdal.Open(feature_path)
    feature_dimensions, feature_projection = feature.GetGeoTransform(), feature.GetProjection()
    feature = None # Close the dataset
    # Report the path: the dataset handle has already been released
    if feature_dimensions != scenario_template_dimensions:
      print(f"{feature_path} dimensions:\n{feature_dimensions}\ndo not match the scenario template dimensions:\n{scenario_template_dimensions}\n")
      feature_issue = True
    if feature_projection != scenario_template_projection:
      print(f"{feature_path} projection:\n{feature_projection}\ndoes not match the scenario template projection:\n{scenario_template_projection}\n\n")
      feature_issue = True
  if not feature_issue: print(f"All features listed in the scenario .csv files have the correct dimensions and projection.")
  else: print("Correct and / or resample the feature(s).")
else: print("AlphaEarth features cannot be mixed with other features unless resampled.")

# Scenario masks

In [None]:
# Scenario masks

# Polygons are used for masking: only areas inside the polygons are included.

# Polygon files that are never offered as masks
polygons_to_exclude = ['template.gpkg', 'project_area_buffered_bbox.gpkg']

# Print a copy-and-paste template of the available mask polygons
print("scenario_mask_polygons = [")
for polygon in os.listdir(polygons_dir):
  # Skip excluded files and previously generated inverse polygons
  if polygon in polygons_to_exclude or 'inverse' in polygon: continue
  print(f"  '{polygon[:-5]}',")
print("]")

In [None]:
scenario_mask_polygons = [
  # 'project_area',
  'gedi_area',
]

# Optional forest mask override for 'no_disturbance_since_oldgrowth' scenarios.
# If None, uses the yearly land mask (mask_land_{scenario_year}).
# Set to a mask filename (without .tif) to use a different forest extent,
# e.g. 'mask_land_2020' or a custom mask representing maximum potential forest cover.
oldgrowth_scenario_mask_override = None

def _merge_geometries(gdf):
  """Return the union of every geometry in 'gdf' as a single geometry."""
  geometries = gdf.geometry
  # union_all() replaced the unary_union property in newer geopandas releases
  if hasattr(geometries, "union_all"): return geometries.union_all()
  return geometries.unary_union

# Create inverse polygons for masking (template extent minus the mask polygon)
template_polygon_path = join(polygons_dir, "template.gpkg")
for polygon in scenario_mask_polygons:
  inverse_polygon_path = join(polygons_dir, f"{polygon}_inverse.gpkg")
  if not exists(inverse_polygon_path):
    polygon_path = join(polygons_dir, f"{polygon}.gpkg")
    template_polygon = gpd.read_file(template_polygon_path)
    polygon_read = gpd.read_file(polygon_path)
    polygon_crs = polygon_read.crs.to_epsg()
    # Merge all features first. A row-aligned GeoSeries.difference() would only
    # subtract the first feature of a multi-feature polygon file.
    inverse_polygon = _merge_geometries(template_polygon).difference(_merge_geometries(polygon_read))
    # Fall back to the full CRS definition when it has no EPSG code
    inverse_polygon_crs = f"EPSG:{polygon_crs}" if polygon_crs is not None else polygon_read.crs
    inverse_polygon_gdf = gpd.GeoDataFrame({'geometry': [inverse_polygon]}, crs=inverse_polygon_crs)
    inverse_polygon_gdf.to_file(inverse_polygon_path, driver="GPKG")
    print(f"An inverse masking polygon for {polygon} has been created in {polygons_dir}.")
  else: print(f"An inverse masking polygon for {polygon} already exists.")

In [None]:
# Collect all scenarios with .csv feature lists
all_scenario_csvs = [csv[:-4] for csv in os.listdir(scenarios_model_dir) if csv.endswith('.csv')]

# Determine last feature year for future scenario handling
final_feature_years_list = []
for final_feature in os.listdir(feature_edge_effects_dir):
  # The year is read from the four characters before the 4-character extension;
  # files where that is not a number are ignored.
  try: final_feature_years_list.append(int(final_feature[-8:-4]))
  except ValueError: continue
if not final_feature_years_list:
  raise ValueError(f"No features with a year suffix were found in {feature_edge_effects_dir}.")
last_feature_year_masks = max(final_feature_years_list)

# Progress
mask_progress_index = 0
mask_progress_label = widgets.Label(f"Scenario mask progress: 0 / {len(all_scenario_csvs)}")
display(mask_progress_label)

for scenario in sorted(all_scenario_csvs):
  scenario_mask_path = join(scenario_masks_dir, f"{scenario}.tif")
  # Existing masks are never regenerated
  if exists(scenario_mask_path):
    mask_progress_index += 1
    mask_progress_label.value = f"Scenario mask progress: {mask_progress_index} / {len(all_scenario_csvs)}"
    continue
  scenario_year = int(scenario[:4])
  forest_mask_path = None

  # The order of these checks matters: 'no_disturbance_since_oldgrowth' must be
  # tested before the more general 'no_disturbance_since'.
  # Match 'no_disturbance_since_oldgrowth' scenarios
  if 'no_disturbance_since_oldgrowth' in scenario:
    if oldgrowth_scenario_mask_override: forest_mask_path = join(masks_dir, f"{oldgrowth_scenario_mask_override}.tif")
    else: forest_mask_path = join(masks_dir, f"mask_land_{scenario_year}.tif")
  # Match area-based deforestation scenarios
  elif 'deforestation' in scenario:
    for mask in sorted(os.listdir(masks_dir)):
      if 'deforestation' in mask:
        mask_middle = mask[12:-4]  # Remove 'mask_forest_' prefix and '.tif' suffix
        if scenario.startswith(mask_middle):
          forest_mask_path = join(masks_dir, mask)
          break
  # Match 'no_disturbance_since' scenarios
  # Use the mask from the year before disturbance removal begins
  elif 'no_disturbance_since' in scenario:
    disturbance_since_year = int(scenario.split('_since_')[1][:4])
    forest_mask_path = join(masks_dir, f"mask_forest_{disturbance_since_year - 1}.tif")
  # Match future scenarios with most recent forest mask
  elif scenario_year > last_feature_year_masks:
    forest_mask_path = join(masks_dir, f"mask_forest_{last_feature_year_masks}.tif")
  # Match all other historic and degradation scenarios
  # Degradation uses same mask as historic, no additional deforestation
  else: forest_mask_path = join(masks_dir, f"mask_forest_{scenario_year}.tif")

  if forest_mask_path is None or not exists(forest_mask_path):
    print(f"Warning: No forest mask found for {scenario} at {forest_mask_path}, skipping.")
    mask_progress_index += 1
    mask_progress_label.value = f"Scenario mask progress: {mask_progress_index} / {len(all_scenario_csvs)}"
    continue

  # Load forest mask
  forest_mask = gdal.Open(forest_mask_path)
  combined_mask = forest_mask.ReadAsArray()
  forest_mask = None
  # Intersect with land mask for scenario year
  land_mask_path = join(masks_dir, f"mask_land_{scenario_year}.tif")
  if exists(land_mask_path):
    land_mask = gdal.Open(land_mask_path)
    land_mask_array = land_mask.ReadAsArray()
    land_mask = None
    combined_mask = np.where((combined_mask == 1) & (land_mask_array == 1), 1, 0)
  else:
    print(f"Warning: No land mask for year {scenario_year}, using forest mask only.")
    combined_mask = np.where(combined_mask == 1, 1, 0)
  # Apply polygon masks via burn
  if scenario_mask_polygons:
    temp_mask_path = join(scenario_masks_dir, f"temp_{scenario}.tif")
    try:
      export_array_as_tif(combined_mask.astype(np.float32), temp_mask_path, compress=False)
      for polygon_name in scenario_mask_polygons:
        inverse_polygon_path = join(polygons_dir, f"{polygon_name}_inverse.gpkg")
        if exists(inverse_polygon_path):
          # Everything outside the polygon (i.e. inside its inverse) is set to 0
          burn_polygon_to_raster(temp_mask_path, inverse_polygon_path, fixed_value=0, all_touched=False)
        else: print(f"Warning: No inverse polygon for {polygon_name}, it has not been applied to {scenario}.")
      temp_mask = gdal.Open(temp_mask_path)
      combined_mask = temp_mask.ReadAsArray()
      temp_mask = None
    finally:
      # Always remove the temporary raster, so a failure cannot leave a
      # 'temp_*.tif' in the scenario masks directory.
      if exists(temp_mask_path): os.remove(temp_mask_path)
  # Convert to 1/nodata Int16
  final_mask = np.where(combined_mask == 1, 1, nodatavalue).astype(np.int16)
  export_array_as_tif(final_mask, scenario_mask_path, dtype=gdal.GDT_Int16)
  masked_fraction = np.sum(final_mask != 1) / final_mask.size
  print(f"{scenario}: {masked_fraction:.1%} masked")
  print(f"  Forest mask: {basename(forest_mask_path)}")
  mask_progress_index += 1
  mask_progress_label.value = f"Scenario mask progress: {mask_progress_index} / {len(all_scenario_csvs)}"

print("\nScenario masks complete.")

# Tiling

In [None]:
# Load the model scenario features for template tile creation
model_scenario_features = pd.Series.tolist(pd.read_csv(model_scenario_path).iloc[:,0])
model_scenario_features_dirs = [features_dir + '/' + feature + '.tif' for feature in model_scenario_features]
# Create a template feature array from the first feature that isn't a covariate (these are created later)
template_base_path = next(r for r in model_scenario_features_dirs if all(c not in r for c in covariates))
template_base = gdal.Open(template_base_path)
template_base_array = template_base.ReadAsArray()
template_base_xsize, template_base_ysize = template_base.GetRasterBand(1).XSize, template_base.GetRasterBand(1).YSize
template_base = None
print(f"The template feature is {template_base_xsize} x {template_base_ysize} pixels.")

# Check existing tile parameters
template_tile_list = []
for file in os.listdir(tile_templates_dir):
  if file.endswith('.tif') and file[:13] == 'template_tile':
    template_tile_list.append(file)
n_tiles_exist = len(template_tile_list)

if n_tiles_exist < 1:
  # Defined so that values from an earlier run cannot be picked up
  tile_size_y_rounded_exist = tile_size_y_remainder_exist = None
  print("There are currently no template tiles. Run the next section.")
if n_tiles_exist >= 1:
  # Height of the first tile
  tile_size_y_rounded_exist_ds = gdal.Open(join(tile_templates_dir,'template_tile_1.tif'))
  tile_size_y_rounded_exist = tile_size_y_rounded_exist_ds.GetRasterBand(1).YSize
  tile_size_y_rounded_exist_ds = None
  # Height of the last tile (assumes tiles are numbered 1..n without gaps)
  tile_size_y_last_exist_ds = gdal.Open(join(tile_templates_dir,f'template_tile_{n_tiles_exist}.tif'))
  tile_size_y_last_exist = tile_size_y_last_exist_ds.GetRasterBand(1).YSize
  tile_size_y_last_exist_ds = None
  # The tiling section computes the remainder as a modulo, which is 0 when the
  # last tile is full height. Use the same convention here, otherwise evenly
  # divided tiles would always look changed and be rebuilt.
  if tile_size_y_last_exist == tile_size_y_rounded_exist: tile_size_y_remainder_exist = 0
  else: tile_size_y_remainder_exist = tile_size_y_last_exist
  if n_tiles_exist == 1:
    print(f"There is a single 'tile' with a height of {tile_size_y_rounded_exist} pixels.")
  else:  print(f"There are {n_tiles_exist} template tiles, the first {n_tiles_exist-1} having a height of {tile_size_y_rounded_exist} pixels, the last {tile_size_y_last_exist} pixels.")

In [None]:
# Large template areas and / or numbers of features may be too much for the available memory.
# This section defines how to split predictions into tiles that can then be merged.
override_n_tiles = False  # Useful if the tile number has already been tested.
n_tiles_override = 1

memory_utilisation = 0.8 # Set to 0.8 to ensure crashes are avoided

assert memory_utilisation > 0 and memory_utilisation <= 1, "Set memory_utilisation to a value between 0 and 1"

# Calculate total size of feature stack
feature_stack_size = template_base_array.size * len(model_scenario_features_dirs)

# Calculate memory and the number of tiles required (assuming highest precision raster is Float32)
total_memory_needed = 32 / 8 * feature_stack_size *2 # 8 bits per byte, *2 for transposing feature stack
print(f'RAM required for each prediction: ~{total_memory_needed/(1024**3):.3f} GB')
print(f'RAM currently available: {psutil.virtual_memory().free / (1024**3):.3f} GB')
# At least one tile, so the tile height below never divides by zero
n_tiles_temp = max(1, int(np.ceil(total_memory_needed / (psutil.virtual_memory().free * memory_utilisation))))

# Calculate template tile size (split on the y axis only)
tile_size_y_rounded = int(np.ceil(template_base_ysize/n_tiles_temp)) # Round the number of y pixels in each tile
tile_size_y_remainder = template_base_ysize%tile_size_y_rounded # Calculate the remainder for the last tile
n_tiles = max(1, len(range(0, template_base_ysize, tile_size_y_rounded))) # Update the number of tiles to include the remainder

if override_n_tiles:
  assert n_tiles_override >= 1, "Set n_tiles_override to 1 or more"
  tile_size_y_rounded = int(np.ceil(template_base_ysize / n_tiles_override))
  tile_size_y_remainder = template_base_ysize % tile_size_y_rounded
  # Count the tiles that will actually be generated: rounding the tile height
  # up can produce fewer tiles than requested.
  n_tiles = max(1, len(range(0, template_base_ysize, tile_size_y_rounded)))
  if n_tiles != n_tiles_override:
    print(f"{n_tiles_override} tiles were requested, but rounding the tile height gives {n_tiles} tiles.")
  print("n_tiles has been overridden.")

print(f'The prediction template should be divided into {n_tiles} tiles to avoid crashing.')

# Check if tiles need to be changed
change_tiles = True
if override_n_tiles:
  if n_tiles == n_tiles_exist: change_tiles = False
if n_tiles == n_tiles_exist and tile_size_y_rounded == tile_size_y_rounded_exist and tile_size_y_remainder == tile_size_y_remainder_exist:
  change_tiles = False

if change_tiles:
  # Clear all tile directories
  for tile in Path(tile_templates_dir).glob("**/*"):
    if tile.is_file(): tile.unlink()
  for tile in Path(tile_features_dir).glob("**/*"):
    if tile.is_file(): tile.unlink()
  # Mask tiles follow the tile layout too, and the mask tiling section skips
  # tiles that already exist, so old ones must be removed here.
  for tile in Path(tile_scenario_masks_dir).glob("**/*"):
    if tile.is_file(): tile.unlink()
  # Only top-level entries are removed, and they are listed before deleting:
  # rmtree already removes everything below each scenario directory.
  for scenario_stack_entry in list(Path(tile_feature_stacks_dir).glob("*")):
    if scenario_stack_entry.is_dir(): shutil.rmtree(scenario_stack_entry)
    else: scenario_stack_entry.unlink()
  for tile in Path(tile_prediction_cache_dir).glob("**/*"):
    if tile.is_file(): tile.unlink()

  # Generate new tile templates based on available memory
  tile_number = 1
  for y_start in range(0, template_base_ysize, tile_size_y_rounded):
    if tile_size_y_remainder != 0 and tile_number == n_tiles: tile_size_y = tile_size_y_remainder
    else: tile_size_y = tile_size_y_rounded
    template_tile_path = join(tile_templates_dir, f"template_tile_{tile_number}.tif")
    # The GDAL Python API is used instead of a shell command, so paths with
    # spaces work and a failure raises instead of being ignored.
    template_tile_ds = gdal.Translate(template_tile_path, template_base_path, format="GTiff",
                                      srcWin=[0, y_start, template_base_xsize, tile_size_y])
    template_tile_ds = None # Close the dataset to flush it to disk
    tile_number += 1
  print("Template tile creation complete.")

else: print("No changes to existing tiles are required.")

In [None]:
# Create feature tiles.

# Check existing tile parameters
template_tile_list = []
for file in os.listdir(tile_templates_dir):
  if file.endswith('.tif') and file[:13] == 'template_tile':
    template_tile_list.append(file)
n_tiles = len(template_tile_list)

assert n_tiles > 0, "There are no template tiles. Run the template tiles section, even if only one is created."
template_tile_1 = gdal.Open(join(tile_templates_dir, 'template_tile_1.tif'))
tile_size_y_rounded = template_tile_1.GetRasterBand(1).YSize
template_tile_1 = None
print(f"There are {n_tiles} template tiles.")

if n_tiles == 1: print("Feature tile creation skipped. Feature stack creation will use the original features.")
else:
  # Find all features in all scenarios for tiling
  feature_paths = set()
  for csv in os.listdir(scenarios_model_dir):
      if csv.endswith('.csv'):
          features = pd.read_csv(join(scenarios_model_dir, csv)).iloc[:,0]
          for feature in features:
              feature_path = join(features_dir, f"{feature}.tif")
              feature_paths.add(feature_path)
  feature_paths = list(feature_paths)

  # Read the height of every template tile once, rather than per feature
  template_tile_paths = [join(tile_templates_dir, f"template_tile_{tile_count}.tif") for tile_count in range(1, n_tiles + 1)]
  tile_ysizes = []
  for template_tile_path in template_tile_paths:
      template_tile = gdal.Open(template_tile_path)
      tile_ysizes.append(template_tile.GetRasterBand(1).YSize)
      template_tile = None

  # Progress
  n_features = len(feature_paths)
  feature_progress_index, feature_progress_label = 0, widgets.Label(value=f"Feature progress: 0 / {n_features}")
  display(feature_progress_label)
  tile_progress_index, tile_progress_label = 0, widgets.Label(value=f"Tile progress: 0 / {n_tiles}")
  display(tile_progress_label)

  # Loop through each feature in the scenarios features directory
  for feature_path in feature_paths:
      feature_tile_paths = [join(tile_features_dir, f"{basename(feature_path)[:-4]}_{tile_count}.tif")
                            for tile_count in range(1, n_tiles + 1)]
      tile_is_missing = [not exists(feature_tile_path) for feature_tile_path in feature_tile_paths]
      # Only read the full raster if at least one of its tiles must be written
      if any(tile_is_missing):
          feature = gdal.Open(feature_path)
          feature_array = feature.ReadAsArray()
          feature = None
          # Split feature array into tiles matching template dimensions
          y_start = 0
          for feature_tile_path, template_tile_path, tile_ysize, is_missing in zip(
                  feature_tile_paths, template_tile_paths, tile_ysizes, tile_is_missing):
              # Export feature chunk if tile does not exist
              if is_missing:
                  feature_chunk = feature_array[y_start:y_start + tile_ysize, :]
                  export_array_as_tif(feature_chunk, feature_tile_path, template_tile_path)
              # Update y offset and tile progress
              y_start += tile_ysize
              tile_progress_index += 1
              tile_progress_label.value = f"Tile progress: {tile_progress_index} / {n_tiles}"
          feature_array = None

      # Reset tile progress and update feature progress
      tile_progress_index = 0
      feature_progress_index += 1
      feature_progress_label.value = f"Feature progress: {feature_progress_index} / {n_features}"

In [None]:
# Create scenario mask tiles.
if n_tiles == 1: print("Scenario mask tile creation skipped. Feature stack creation will use the original masks.")
else:
  # Find all scenario masks. 'temp_*.tif' files are intermediates written by
  # the scenario mask section and are not scenario masks.
  mask_paths = [join(scenario_masks_dir, f) for f in os.listdir(scenario_masks_dir)
                if f.endswith('.tif') and not f.startswith('temp_')]

  # Read the height of every template tile once, rather than per mask
  template_tile_paths = [join(tile_templates_dir, f"template_tile_{tile_count}.tif") for tile_count in range(1, n_tiles + 1)]
  tile_ysizes = []
  for template_tile_path in template_tile_paths:
      template_tile = gdal.Open(template_tile_path)
      tile_ysizes.append(template_tile.GetRasterBand(1).YSize)
      template_tile = None

  # Progress
  n_masks = len(mask_paths)
  mask_progress_index, mask_progress_label = 0, widgets.Label(value=f"Mask progress: 0 / {n_masks}")
  display(mask_progress_label)
  mask_tile_progress_index, mask_tile_progress_label = 0, widgets.Label(value=f"Tile progress: 0 / {n_tiles}")
  display(mask_tile_progress_label)

  # Loop through each scenario mask
  for mask_path in mask_paths:
      mask_tile_paths = [join(tile_scenario_masks_dir, f"{basename(mask_path)[:-4]}_{tile_count}.tif")
                         for tile_count in range(1, n_tiles + 1)]
      tile_is_missing = [not exists(mask_tile_path) for mask_tile_path in mask_tile_paths]
      # Only read the full mask if at least one of its tiles must be written
      if any(tile_is_missing):
          mask = gdal.Open(mask_path)
          mask_array = mask.ReadAsArray()
          mask = None
          # Split mask array into tiles matching template dimensions
          y_start = 0
          for mask_tile_path, template_tile_path, tile_ysize, is_missing in zip(
                  mask_tile_paths, template_tile_paths, tile_ysizes, tile_is_missing):
              # Export mask chunk if tile does not exist
              if is_missing:
                  mask_chunk = mask_array[y_start:y_start + tile_ysize, :]
                  export_array_as_tif(mask_chunk, mask_tile_path, template_tile_path, dtype=gdal.GDT_Int16)
              # Update y offset and tile progress
              y_start += tile_ysize
              mask_tile_progress_index += 1
              mask_tile_progress_label.value = f"Tile progress: {mask_tile_progress_index} / {n_tiles}"
          mask_array = None
      # Reset tile progress and update mask progress
      mask_tile_progress_index = 0
      mask_progress_index += 1
      mask_progress_label.value = f"Mask progress: {mask_progress_index} / {n_masks}"

  print("Scenario mask tile creation complete.")

# Feature stacks

In [None]:
# Covariates are held at a constant value when making spatial predictions.
# By default this is the neutral effect value from the SHAP analysis, which
# minimises covariate bias. Otherwise set a 'manual override' at a suitable value.

# Manual overrides (applied after SHAP values loaded)
covariate_overrides = {
    # 'fea_beam': 5,
    # 'fea_sensitivity': 0.99,
}

covariate_values = {}
feature_analysis_path = join(selected_model_shap_dir, 'shap_feature_analysis.csv')
if not covariates_renamed:
    print("No covariates defined in model.")
else:
    print(f"Covariates defined in model: {covariates_renamed}\n")
    if not exists(feature_analysis_path):
        print("SHAP feature analysis not found. Set values in covariate_overrides.")
    else:
        feature_analysis = pd.read_csv(feature_analysis_path)
        print("Neutral effect values from SHAP analysis:")
        for cov in covariates_renamed:
            # Rows of the analysis table are matched on the 'Dataset name' column
            row = feature_analysis[feature_analysis['Dataset name'] == cov]
            if row.empty:
                print(f"  {cov}: not found in SHAP analysis")
                continue
            neutral_val = row['Neutral_Effect_Value'].values[0]
            shap_at_neutral = row['SHAP_at_Neutral'].values[0]
            covariate_values[cov] = neutral_val
            print(f"  {cov}: {neutral_val:.4f} (SHAP at neutral: {shap_at_neutral:.4f})")

    # Overrides take precedence over SHAP values; every covariate needs a value
    covariate_values.update(covariate_overrides)
    if covariate_overrides: print(f"\nManual overrides applied: {covariate_overrides}")
    missing = [c for c in covariates_renamed if c not in covariate_values]
    assert not missing, f"Missing covariate values: {missing}. Set in covariate_overrides."

    # Cast to float32 for consistency with feature stack dtype
    covariate_values = {name: np.float32(value) for name, value in covariate_values.items()}
    print(f"Final covariate values: {covariate_values}")

In [None]:
# Create feature stack arrays for each scenario
# Every .csv feature list in the scenarios directory is an available scenario
scenarios_list = [csv[:-4] for csv in os.listdir(scenarios_model_dir) if csv.endswith('.csv')]
# Print a copy-and-paste template for selecting scenarios to stack
print("scenarios_to_stack = [")
for scenario in sorted(scenarios_list): print(f'  "{scenario}",')
print("]")

In [None]:
# Scenarios to build tiled feature stacks for.
# Each entry is the name of a scenario .csv feature list (without the '.csv'),
# as printed by the previous cell. Remove or comment out entries to skip them.
scenarios_to_stack = [
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_no_disturbance_since_1993",
  "2021_no_disturbance_since_oldgrowth_1",
  "2021_no_disturbance_since_oldgrowth_2",
  "2021_oldgrowth_recovery_1",
  "2021_oldgrowth_recovery_2",
  "2022",
  "2023",
  "2024",
  "2024_no_disturbance_since_1996",
  "2024_no_disturbance_since_1997",
  "2024_no_disturbance_since_1998",
  "2024_no_disturbance_since_1999",
  "2024_no_disturbance_since_2000",
  "2024_no_disturbance_since_2001",
  "2024_no_disturbance_since_2002",
  "2024_no_disturbance_since_2003",
  "2024_no_disturbance_since_2004",
  "2024_no_disturbance_since_2005",
  "2024_no_disturbance_since_2006",
  "2024_no_disturbance_since_2007",
  "2024_no_disturbance_since_2008",
  "2024_no_disturbance_since_2009",
  "2024_no_disturbance_since_2010",
  "2024_no_disturbance_since_2011",
  "2024_no_disturbance_since_2012",
  "2024_no_disturbance_since_2013",
  "2024_no_disturbance_since_2014",
  "2024_no_disturbance_since_2015",
  "2024_no_disturbance_since_2016",
  "2024_no_disturbance_since_2017",
  "2024_no_disturbance_since_2018",
  "2024_no_disturbance_since_2019",
  "2024_no_disturbance_since_2020",
  "2024_no_disturbance_since_2021",
  "2024_no_disturbance_since_2022",
  "2024_no_disturbance_since_2023",
  "2024_no_disturbance_since_2024",
  "2024_no_disturbance_since_oldgrowth_1",
  "2024_no_disturbance_since_oldgrowth_2",
  "2024_oldgrowth_recovery_1",
  "2024_oldgrowth_recovery_2",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
]

In [None]:
# Check all scenarios exist
assert set(scenarios_to_stack).issubset(scenarios_list), "Not all selected scenarios exist."

# Check existing tile parameters
template_tile_list = [f for f in os.listdir(tile_templates_dir) if f.endswith('.tif') and f.startswith('template_tile')]
n_tiles = len(template_tile_list)
assert n_tiles > 0, "There are no template tiles. Run the template tiles section, even if only one is created."
print(f"There are {n_tiles} template tiles.")
covariate_set = set(covariates)

# Progress
scenario_progress_index, scenario_progress_label = 0, widgets.Label(value=f"Scenario progress: 0 / {len(scenarios_to_stack)}")
display(scenario_progress_label)
stack_progress_index, stack_progress_label = 0, widgets.Label(value=f"Tiled feature stack progress: 0 / {n_tiles}")
display(stack_progress_label)

# For every scenario and tile, two arrays are saved:
#   feature_stack_*.npy: (n_valid_pixels, n_features) float32 values
#   valid_indices_*.npy: flat (C-order) indices of the unmasked pixels
for scenario in scenarios_to_stack:
    scenario_feature_stacks_dir = join(tile_feature_stacks_dir, scenario)
    makedirs(scenario_feature_stacks_dir, exist_ok=True)
    scenario_features_csv = join(scenarios_model_dir, f"{scenario}.csv")
    scenario_features = pd.Series.tolist(pd.read_csv(scenario_features_csv).iloc[:,0])

    for tile_count in range(1, n_tiles + 1):
      stack_path = join(scenario_feature_stacks_dir, f"feature_stack_{scenario}_{tile_count}.npy")
      indices_path = join(scenario_feature_stacks_dir, f"valid_indices_{scenario}_{tile_count}.npy")
      # Existing stacks are never regenerated
      if exists(stack_path) and exists(indices_path):
        stack_progress_index += 1
        stack_progress_label.value = f"Tiled feature stack progress: {stack_progress_index} / {n_tiles}"
        continue

      # Load scenario mask tile and extract valid indices (C-order for consistent reconstruction)
      if n_tiles == 1: mask_path = join(scenario_masks_dir, f"{scenario}.tif")
      else: mask_path = join(tile_scenario_masks_dir, f"{scenario}_{tile_count}.tif")
      if not exists(mask_path):
        raise FileNotFoundError(f"Missing mask for scenario {scenario}, tile {tile_count}: {mask_path}")
      mask = gdal.Open(mask_path)
      mask_array = mask.ReadAsArray()
      mask = None
      valid_indices = np.where(mask_array.ravel(order='C') == 1)[0].astype(np.int32)
      n_valid = len(valid_indices)
      mask_array = None

      # Build feature tile paths, filtering out covariates
      if n_tiles == 1:
        feature_tile_paths = [join(features_dir, f"{f}.tif") for f in scenario_features]
      else:
        feature_tile_paths = [join(tile_features_dir, f"{f.split('/')[-1]}_{tile_count}.tif") for f in scenario_features]
      covariate_tile_set = {f"{cov}_{tile_count}" for cov in covariate_set}
      valid_feature_paths = [p for p in feature_tile_paths
                             if basename(p).replace('.tif', '') not in covariate_set
                             and basename(p).replace('.tif', '') not in covariate_tile_set]

      # Handle tiles with no valid pixels
      if n_valid == 0:
        n_features_total = len(valid_feature_paths)
        if covariates_renamed: n_features_total += len(covariates_renamed)
        np.save(stack_path, np.empty((0, n_features_total), dtype=np.float32))
        np.save(indices_path, valid_indices)
        stack_progress_index += 1
        stack_progress_label.value = f"Tiled feature stack progress: {stack_progress_index} / {n_tiles}"
        continue

      # Verify all feature tiles exist
      missing_features = [f for f in valid_feature_paths if not exists(f)]
      if missing_features:
        raise FileNotFoundError(f"Missing feature tiles for scenario {scenario}, tile {tile_count}:\n" + "\n".join(missing_features))
      if not valid_feature_paths:
        raise ValueError(f"No non-covariate features found for scenario {scenario}, tile {tile_count}")

      # Read all features via VRT
      vrt_options = gdal.BuildVRTOptions(separate=True)
      vrt = gdal.BuildVRT('', valid_feature_paths, options=vrt_options)
      if vrt is None:
        raise RuntimeError(f"Failed to build VRT for scenario {scenario}, tile {tile_count}")
      feature_cube = vrt.ReadAsArray()  # (n_features, height, width)
      vrt = None
      # A single-band dataset is read as a 2D (height, width) array:
      # restore the feature axis so the reshape below always works.
      if feature_cube.ndim == 2: feature_cube = feature_cube[np.newaxis, :, :]

      # Reshape to (n_pixels, n_features) C-order, extract valid pixels only
      n_features, height, width = feature_cube.shape
      feature_flat = feature_cube.transpose(1, 2, 0).reshape(-1, n_features, order='C')
      feature_cube = None
      feature_stack_valid = feature_flat[valid_indices, :].astype(np.float32)
      feature_flat = None

      # Append covariate columns (one constant column per covariate, after the features)
      if covariates_renamed:
        covariate_block = np.column_stack([np.full(n_valid, covariate_values[cov], dtype=np.float32)
                                           for cov in covariates_renamed])
        feature_stack_valid = np.hstack([feature_stack_valid, covariate_block])
        covariate_block = None

      # Save feature stack and valid indices
      np.save(stack_path, feature_stack_valid)
      np.save(indices_path, valid_indices)
      feature_stack_valid = valid_indices = None

      # Force Drive sync and verify
      # Both files must keep the same non-zero size between two checks and
      # load successfully; this is attempted up to 10 times, 5 seconds apart.
      subprocess.run(['sync'], check=True)
      last_size_stack, last_size_indices = -1, -1
      for attempt in range(10):
        time.sleep(5)
        if exists(stack_path) and exists(indices_path):
          size_stack, size_indices = os.path.getsize(stack_path), os.path.getsize(indices_path)
          if size_stack == last_size_stack and size_indices == last_size_indices and size_stack > 0:
            try:
              np.load(stack_path)
              np.load(indices_path)
              break
            # A file that cannot be read yet is retried. 'Exception' is used
            # so that interrupting the notebook is not swallowed here.
            except Exception: pass
          last_size_stack, last_size_indices = size_stack, size_indices
      else: raise RuntimeError(f"Drive sync failed: {stack_path}")

      stack_progress_index += 1
      stack_progress_label.value = f"Tiled feature stack progress: {stack_progress_index} / {n_tiles}"

    stack_progress_index = 0
    scenario_progress_index += 1
    scenario_progress_label.value = f"Scenario progress: {scenario_progress_index} / {len(scenarios_to_stack)}"

print("\nFeature stacks complete.")

# Predict scenarios

In [None]:
# Use this section to test models and scenarios, or to make predictions where
# no uncertainty metric for the variate (e.g. standard error or stdev) is available.
# If these are available, proceed to 7_uncertainty.ipynb.

# Check existing tile parameters
template_tile_list = [f for f in os.listdir(tile_templates_dir) if f.endswith('.tif') and f.startswith('template_tile')]
n_tiles = len(template_tile_list)
assert n_tiles > 0, "There are no template tiles. Run the template tiles section, even if only one is created."
if n_tiles == 1: print(f"# There is 1 template tile.\n")
else: print(f"# There are {n_tiles} template tiles.\n")

# A scenario is available once it has one feature stack file per template tile
scenario_stacks_list = []
for scenario in os.listdir(tile_feature_stacks_dir):
    stack_files = [f for f in os.listdir(join(tile_feature_stacks_dir, scenario)) if f.startswith('feature_stack_')]
    if len(stack_files) != n_tiles: continue
    scenario_stacks_list.append(scenario)

for note_line in (
    "# Note: If you end a runtime after the creation of many large feature stacks,",
    "# it will take time for the notebook to recognise their existence again due to",
    "# Google Drive latency issues. If the stacks do not appear here after some time,",
    "# run the feature stack section again until they do.\n",
): print(note_line)

# Print a copy-and-paste template for selecting scenarios to predict
print("scenarios_to_predict = [")
for scenario in sorted(scenario_stacks_list): print(f'  "{scenario}",')
print("]")

In [None]:
# There is 1 template tile.

# Note: If you end a runtime after the creation of many large feature stacks,
# it will take time for the notebook to recognise their existence again due to
# Google Drive latency issues. If the stacks do not appear here after some time,
# run the feature stack section again until they do.

# Scenarios to predict, in the list format printed by the previous cell.
# Each entry is used below as a subdirectory name of the tiled feature stacks
# directory and as the prefix of the exported prediction filename.
scenarios_to_predict = [
  "2018",
  "2019",
  "2020",
  "2021",
  "2021_no_disturbance_since_1993",
  "2021_no_disturbance_since_oldgrowth_1",
  "2021_no_disturbance_since_oldgrowth_2",
  "2021_oldgrowth_recovery_1",
  "2021_oldgrowth_recovery_2",
  "2022",
  "2023",
  "2024",
  "2024_no_disturbance_since_1996",
  "2024_no_disturbance_since_1997",
  "2024_no_disturbance_since_1998",
  "2024_no_disturbance_since_1999",
  "2024_no_disturbance_since_2000",
  "2024_no_disturbance_since_2001",
  "2024_no_disturbance_since_2002",
  "2024_no_disturbance_since_2003",
  "2024_no_disturbance_since_2004",
  "2024_no_disturbance_since_2005",
  "2024_no_disturbance_since_2006",
  "2024_no_disturbance_since_2007",
  "2024_no_disturbance_since_2008",
  "2024_no_disturbance_since_2009",
  "2024_no_disturbance_since_2010",
  "2024_no_disturbance_since_2011",
  "2024_no_disturbance_since_2012",
  "2024_no_disturbance_since_2013",
  "2024_no_disturbance_since_2014",
  "2024_no_disturbance_since_2015",
  "2024_no_disturbance_since_2016",
  "2024_no_disturbance_since_2017",
  "2024_no_disturbance_since_2018",
  "2024_no_disturbance_since_2019",
  "2024_no_disturbance_since_2020",
  "2024_no_disturbance_since_2021",
  "2024_no_disturbance_since_2022",
  "2024_no_disturbance_since_2023",
  "2024_no_disturbance_since_2024",
  "2024_no_disturbance_since_oldgrowth_1",
  "2024_no_disturbance_since_oldgrowth_2",
  "2024_oldgrowth_recovery_1",
  "2024_oldgrowth_recovery_2",
  "2024_road_mat_daling_deforestation_2023_30m_degradation_buffer",
]

In [None]:
# Prediction raster precision. GEDI AGBD can be set to 0, any higher is
# spurious due to the wide prediction intervals of the source data.
raster_precision = 0

# Probabilities instead of classes IF binary classification
predict_probabilities = False

# Classification threshold IF binary classification
classification_threshold = 0.5

# Detect GPU availability and set predictor type. Note that XGBoost inference is
# not much faster on GPU than CPU, and transferring large feature stacks from
# CPU to GPU memory can actually make it much slower. TPU is not used, but the
# TPU Colab runtime can provide more memory and more CPU workers.
try:
  # Allocating a small array fails if cupy was not imported (NameError) or if
  # no GPU can be reached, which routes execution to the CPU branch.
  test_array = cupy.array([1, 2, 3])
  del test_array
  predictor_type = 'gpu_predictor'
  gpu_id, use_gpu = 0, True
  print("GPU detected and accessible - using GPU to load the feature stack and to predict.")
except Exception:
  # 'Exception' rather than a bare 'except' so that a manual interrupt or
  # interpreter shutdown is not mistaken for a missing GPU.
  predictor_type = 'cpu_predictor'
  gpu_id, use_gpu = -1, False
  print("GPU not accessible - using CPU prediction")
xgb.set_config(verbosity=0, use_rmm=use_gpu)

# Load the model as a raw booster so that its configuration can be inspected
booster = xgb.Booster()
booster.load_model(selected_model_json)
model_config = json.loads(booster.save_config())

# Detect the model type from the objective name and the number of classes
objective_name = model_config['learner']['objective']['name']
num_class = int(model_config['learner']['learner_model_param'].get('num_class', '0'))
classification_objectives = ['logistic', 'softprob', 'softmax']
classification = any(obj_type in objective_name for obj_type in classification_objectives)
multiclass = classification and num_class > 2
if multiclass:
  print(f"Model type: Multiclass classification ({num_class} classes)")
elif classification:
  print("Model type: Binary classification")
else:
  print("Model type: Regression")

# Feature types in selected_features order: 'c' for categorical, 'q' otherwise
feature_types = ['c' if feat in categorical_columns else 'q' for feat in selected_features]

# Classifier or regressor wrapper, configured identically in both cases
predictor_class = xgb.XGBClassifier if classification else xgb.XGBRegressor
XGBPredictor = predictor_class()
XGBPredictor.load_model(selected_model_json)
XGBPredictor.set_params(predictor=predictor_type, feature_types=feature_types)
if use_gpu:
  XGBPredictor.set_params(device='cuda:0')

# Count the template tiles and read the tile width from the first template tile
template_tile_list = [tile_name for tile_name in os.listdir(tile_templates_dir)
                      if tile_name.startswith('template_tile') and tile_name.endswith('.tif')]
n_tiles = len(template_tile_list)
assert n_tiles > 0, "There are no template tiles. Run the template tiles section, even if only one is created."
first_template_tile_path = join(tile_templates_dir, 'template_tile_1.tif')
template_tile = gdal.Open(first_template_tile_path)
template_tile_x = template_tile.GetRasterBand(1).XSize
template_tile = None
print(f"There are {n_tiles} template tiles.")

# Template for the merged prediction raster. With alpha_earth set, this is the
# first model scenario features entry that contains none of the covariate names.
if alpha_earth:
  template_base_path = next(r for r in model_scenario_features_dirs if not any(c in r for c in covariates))
else:
  template_base_path = template_tif_path

# Progress tracking: one label for scenarios, one for tiles within a scenario
scenario_progress_index = 0
scenario_progress_label = widgets.Label(f"Scenario progress: {scenario_progress_index}/{len(scenarios_to_predict)}")
display(scenario_progress_label)
tile_progress_index = 0
tile_progress_label = widgets.Label(value=f"Tile progress: 0 / {n_tiles}")
display(tile_progress_label)

# Loop through each scenario: predict any tiles missing from the tile cache,
# merge the cached tiles into one prediction raster, then delete the cache.
oldgrowth_suffixes = ('_oldgrowth_1', '_oldgrowth_2', '_oldgrowth_recovery_1', '_oldgrowth_recovery_2')
gpu_memory_errors = ("Memory allocation error", "Out of memory")
for scenario in scenarios_to_predict:
  scenario_prediction_filename = f"{scenario}__{selected_model}.tif"
  scenario_prediction_path = join(scenario_predictions_dir, scenario_prediction_filename)
  # Skip oldgrowth _1 and _2 scenarios if merged file already exists.
  # Only the trailing version suffix is removed: replacing every '_1' / '_2'
  # would also mangle names that contain e.g. '_2023' or '_1996'.
  if scenario.endswith(oldgrowth_suffixes):
    merged_scenario = scenario[:-2]
    merged_prediction_path = join(scenario_predictions_dir, f"{merged_scenario}__{selected_model}.tif")
    if exists(merged_prediction_path):
      print(f"Merged file exists, skipping: {scenario}")
      scenario_progress_index += 1
      scenario_progress_label.value = f"Scenario progress: {scenario_progress_index}/{len(scenarios_to_predict)}"
      continue
  if not exists(scenario_prediction_path):
    scenario_feature_stack_dir = join(tile_feature_stacks_dir, scenario)
    n_stacks = len([f for f in os.listdir(scenario_feature_stack_dir) if f.startswith('feature_stack_')])
    tile_cache_scenario_dir = join(tile_prediction_cache_dir, f"{scenario}__{selected_model}")
    makedirs(tile_cache_scenario_dir, exist_ok=True)
    for tile_count in range(1, n_stacks + 1):
      scenario_tile_filename = f"scenario_tile_{tile_count}.tif"
      scenario_tile_path = join(tile_cache_scenario_dir, scenario_tile_filename)
      scenario_tile_exists = scenario_tile_filename in os.listdir(tile_cache_scenario_dir)
      if not scenario_tile_exists:
        # Load template tile parameters
        template_tile_path = join(tile_templates_dir, f"template_tile_{tile_count}.tif")
        template_tile = gdal.Open(template_tile_path)
        template_tile_y = template_tile.GetRasterBand(1).YSize
        template_tile_x = template_tile.GetRasterBand(1).XSize
        template_tile = None
        n_pixels = template_tile_y * template_tile_x
        # Load feature stack and valid indices
        stack_path = join(scenario_feature_stack_dir, f"feature_stack_{scenario}_{tile_count}.npy")
        indices_path = join(scenario_feature_stack_dir, f"valid_indices_{scenario}_{tile_count}.npy")
        feature_stack = np.load(stack_path)
        valid_indices = np.load(indices_path)
        n_valid = len(valid_indices)
        # Handle empty tiles (no valid pixels) by exporting a nodata-only tile
        if n_valid == 0:
          feature_stack = valid_indices = None
          if raster_precision == 0:
            prediction_tile = np.full((template_tile_y, template_tile_x), nodatavalue, dtype=np.int16)
          else:
            prediction_tile = np.full((template_tile_y, template_tile_x), nodatavalue, dtype=np.float32)
          export_array_as_tif(prediction_tile, scenario_tile_path,
                              template=template_tile_path, compress=False)
          prediction_tile = None
          tile_progress_index += 1
          tile_progress_label.value = f"Tile progress: {tile_progress_index} / {n_stacks}"
          continue
        # Load stack to GPU if available
        if use_gpu:
          try: feature_stack = cupy.asarray(feature_stack)
          except Exception as e:
            if any(message in str(e) for message in gpu_memory_errors):
              print("GPU memory insufficient, switching to CPU for this tile.")
              cupy.get_default_memory_pool().free_all_blocks()
              gc.collect()
              XGBPredictor.set_params(device='cpu', predictor='cpu_predictor')
            else: raise
        # Predict - terminate runtime if GPU prediction fails
        try:
          if classification and not multiclass:
            # Binary classification: probability of class 1, either exported
            # directly or converted to a class with the classification threshold
            prediction_proba = XGBPredictor.predict_proba(feature_stack)
            if predict_probabilities:
              prediction = prediction_proba[:, 1]
            else:
              prediction = (prediction_proba[:, 1] > classification_threshold).astype(int)
          else:
            prediction = XGBPredictor.predict(feature_stack)
            if classification:
              # Check if prediction is 2D (probabilities) and convert to class labels
              if prediction.ndim > 1 and prediction.shape[1] > 1: prediction = np.argmax(prediction, axis=1)
              # Ensure prediction is integer type for classification
              prediction = prediction.astype(int)
        except Exception as e:
          if any(message in str(e) for message in gpu_memory_errors):
            print("GPU memory insufficient for prediction. Terminating runtime to save compute units, restart with TPU.")
            runtime.unassign()
          # Always re-raise: there is no prediction to write for this tile, so
          # continuing would only fail later with a less informative error.
          raise
        feature_stack = None
        # Reconstruct full tile from valid indices (C-order to match stacking)
        if raster_precision == 0:
          prediction_flat = np.full(n_pixels, nodatavalue, dtype=np.int16)
          prediction_flat[valid_indices] = np.round(prediction).astype(np.int16)
        else:
          prediction_flat = np.full(n_pixels, nodatavalue, dtype=np.float32)
          prediction_flat[valid_indices] = np.round(prediction, raster_precision)
        prediction = valid_indices = None
        prediction_tile = prediction_flat.reshape((template_tile_y, template_tile_x), order='C')
        prediction_flat = None
        # Export prediction array as .tif
        export_array_as_tif(prediction_tile, scenario_tile_path,
                            template=template_tile_path, compress=False)
        prediction_tile = None
      tile_progress_index += 1
      tile_progress_label.value = f"Tile progress: {tile_progress_index} / {n_stacks}"
    # Ensure tiles are in the correct order (numeric, not lexicographic)
    tile_files = sorted([f for f in os.listdir(tile_cache_scenario_dir) if f.endswith('.tif')],
                        key=lambda x: int(x.split('_')[-1].split('.')[0]))
    # Read each tile .tif as an array and stack them all in a single call, which
    # avoids re-copying the growing array once per tile. The empty float64 seed
    # keeps the merged dtype and shape the same as stacking tile by tile.
    tile_arrays = [np.empty((0, template_tile_x))]
    for fname in tile_files:
      tile = gdal.Open(join(tile_cache_scenario_dir, fname))
      tile_arrays.append(tile.ReadAsArray())
      tile = None
    prediction_array = np.vstack(tile_arrays)
    tile_arrays = None
    if raster_precision == 0: prediction_array = np.round(prediction_array, raster_precision).astype(np.int16)
    else: prediction_array = np.round(prediction_array, raster_precision)
    export_array_as_tif(prediction_array, scenario_prediction_path, template=template_base_path)
    prediction_array = None
    # Delete scenario tile cache directory
    shutil.rmtree(tile_cache_scenario_dir)
  # Reset tile progress
  tile_progress_index = 0
  # Update scenario progress
  scenario_progress_index += 1
  scenario_progress_label.value = f"Scenario progress: {scenario_progress_index}/{len(scenarios_to_predict)}"
print("\nScenario predictions complete.")

In [None]:
# Merge oldgrowth versions by taking maximum values.
# Version 1 uses land-use proxy for pre-Landsat undisturbed forest.
# Version 2 removes all disturbance without the proxy.
# Taking maximum avoids underestimation where proxy may not capture all oldgrowth characteristics.

# Prediction filenames are '<scenario>__<model>.tif'. Only the scenario part
# (before the first '__') is checked for the version 1 suffix, so the model
# part of the name cannot affect which files are selected.
oldgrowth_v1_suffixes = ('_oldgrowth_recovery_1', '_no_disturbance_since_oldgrowth_1')
oldgrowth_v1_files = [f for f in os.listdir(scenario_predictions_dir)
                      if f.endswith('.tif') and f.partition('__')[0].endswith(oldgrowth_v1_suffixes)]

if not oldgrowth_v1_files: print("No oldgrowth version 1 predictions found.")
else:
  for v1_file in oldgrowth_v1_files:
    # Construct version 2 and merged filenames. Splitting at the first '__' only
    # keeps the whole model part intact, even if it contains '__' itself.
    scenario_name, separator, model_part = v1_file.partition('__')
    v2_file = f"{scenario_name[:-1]}2{separator}{model_part}"  # Replace '1' with '2'
    merged_file = f"{scenario_name[:-2]}{separator}{model_part}"  # Drop the '_1' suffix
    merged_path = join(scenario_predictions_dir, merged_file)
    # Skip if merged file already exists
    if exists(merged_path):
      print(f"Merged file already exists: {merged_file}")
      continue
    v1_path = join(scenario_predictions_dir, v1_file)
    v2_path = join(scenario_predictions_dir, v2_file)
    # Merge or copy depending on version 2 existence
    if exists(v2_path):
      print(f"Merging oldgrowth versions for {scenario_name}...")
      oldgrowth_1 = gdal.Open(v1_path)
      oldgrowth_1_array = oldgrowth_1.ReadAsArray()
      oldgrowth_1 = None
      oldgrowth_2 = gdal.Open(v2_path)
      oldgrowth_2_array = oldgrowth_2.ReadAsArray()
      oldgrowth_2 = None
      # Element-wise maximum of the two versions
      # NOTE(review): assumes nodata pixels coincide in both versions or that
      # the nodata value is lower than any valid prediction - confirm.
      merged_array = np.maximum(oldgrowth_1_array, oldgrowth_2_array)
      export_array_as_tif(merged_array, merged_path, template=template_base_path)
      merged_array = oldgrowth_1_array = oldgrowth_2_array = None
      print(f"Merged version exported: {merged_file}")
      # Delete originals
      os.remove(v1_path)
      os.remove(v2_path)
    else:
      shutil.copy2(v1_path, merged_path)
      print(f"Version 2 not found, copied version 1: {merged_file}")
      os.remove(v1_path)

  print("\nOldgrowth merging complete.")

# Disconnect runtime

In [None]:
# Useful for stopping background execution.
# Terminates the current Colab runtime once the preceding cells have finished,
# so that no further compute units are consumed.
runtime.unassign()