<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/7_uncertainty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and directories

In [None]:
# Root of the shared project folder once Google Drive is mounted
base_dir = "/gdrive/Shareddrives/masfi"

# Mount Google Drive (forcing a fresh mount), then make base_dir importable
from google.colab import drive
import os
import sys
drive.mount('/gdrive', force_remount=True)
_path_to_add = os.path.realpath(base_dir)
# Append only once so re-running the cell does not duplicate the entry
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs and upgrades
!pip install geopandas
!pip install rasterio
!pip install xgboost --upgrade

In [None]:
# Reload imports, replacing those in the cache
%reload_ext autoreload
%autoreload 2
# Imports
import json
import geopandas as gpd
from google.colab import runtime
import ipywidgets as widgets
import itertools
import joblib
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
from numpy.random import normal
from os import makedirs
from os.path import exists, join
from osgeo import gdal
import pandas as pd
import rasterio
from rasterio.features import rasterize
import scipy.stats as st
import shutil
from shutil import copyfile
import xgboost as xgb

# Define GPU
import tensorflow as tf
device_name = tf.test.gpu_device_name()
# This cell only accepts '/device:GPU:0' as evidence that a GPU is attached
print(f"Found GPU at: {device_name}" if device_name == '/device:GPU:0' else 'GPU device not found.')

In [None]:
# Define directories
# All paths are built relative to base_dir (the mounted shared drive).
areas_dir = join(base_dir, "1_areas")
polygons_dir = join(areas_dir, "polygons")  # vector polygons (.gpkg) used for masking
predictor_dir = join(base_dir, "3_predictors")
predictor_final_dir = join(predictor_dir, "final")  # scanned later for year-suffixed .tif predictors
models_dir = join(base_dir, "5_models")  # searched later for 'model.json' files
scenarios_dir = join(base_dir, "6_scenarios")
masks_dir = join(scenarios_dir, "scenario_masks")
uncertainty_dir = join(base_dir, "7_uncertainty")  # output root for this notebook

# Create directories
# Only the uncertainty output root is created here; the other directories are
# read from later and are not created by this cell.
makedirs(uncertainty_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress):
  """Write an array to a single-band Float32 GeoTIFF georeferenced like a template.

  Args:
    input_array: Array written to band 1 of the output raster.
    output_tif: Path of the GeoTIFF to create.
    template: Path of an existing raster that supplies the raster size,
      geotransform and projection of the output.
    nodatavalue: Value registered as NoData on the output band.
    compress: If truthy, the file is written with DEFLATE compression
      (PREDICTOR=2, ZLEVEL=9); otherwise it is written uncompressed.

  Raises:
    FileNotFoundError: If the template raster cannot be opened.
    RuntimeError: If the output raster cannot be created.
  """
  template_dataset = gdal.Open(template)
  if template_dataset is None:
    raise FileNotFoundError(f"Template raster could not be opened: {template}")
  template_band = template_dataset.GetRasterBand(1)
  x_size, y_size = template_band.XSize, template_band.YSize
  template_dimensions, template_projection = template_dataset.GetGeoTransform(), template_dataset.GetProjection()
  # Release the template before creating the output: callers may pass the same
  # path as both template and output, and the band must not outlive its dataset.
  template_band = None
  template_dataset = None

  # A single if/else (rather than two independent ifs) guarantees the output
  # dataset is always defined, whatever falsy value `compress` holds.
  creation_options = ["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"] if compress else []
  driver = gdal.GetDriverByName("GTiff").Create(output_tif, x_size, y_size, bands=1, eType=gdal.GDT_Float32,
                                                options=creation_options)
  if driver is None:
    raise RuntimeError(f"Output raster could not be created: {output_tif}")

  # Georeferencing and NoData are set before the pixel data is written
  driver.SetGeoTransform(template_dimensions)
  driver.SetProjection(template_projection)
  output_band = driver.GetRasterBand(1)
  output_band.SetNoDataValue(nodatavalue)
  output_band.WriteArray(input_array)

  # Flush and dereference explicitly so the file is complete on disk when the
  # function returns, instead of relying on garbage collection.
  output_band.FlushCache()
  output_band = None
  driver.FlushCache()
  driver = None

# Burn a polygon to raster
def burn_polygon_to_raster(raster, polygon, fixed=True, fixed_value=1, column_name=None, all_touched=True):
  """Burn the features of a vector file into band 1 of an existing raster, in place.

  Args:
    raster: Path of the raster to update; it is opened in 'r+' mode and band 1
      is overwritten with the burned result.
    polygon: Path of a vector file readable by geopandas.
    fixed: If True, `fixed_value` is burned for every feature; otherwise each
      feature's own value from `column_name` is burned.
    fixed_value: Value burned when `fixed` is True.
    column_name: Attribute column holding per-feature burn values. When not
      `fixed` and left as None, the first column of the file is used.
    all_touched: Passed through to rasterio's `rasterize`.
  """
  with rasterio.open(raster, 'r+') as src:
      array = src.read(1)
      transform = src.transform
      gdf = gpd.read_file(polygon)
      if fixed:
          burn_values = [fixed_value] * len(gdf)
      else:
          if column_name is None:
              column_name = gdf.columns[0]
          burn_values = gdf[column_name].values
      # Pair each geometry with the value from its own row. Looking the value
      # up by geometry equality costs a full scan per feature and hands
      # duplicate geometries the value of the first matching row.
      for geom, burn_value in zip(gdf.geometry, burn_values):
          rasterize([(geom, burn_value)], out=array, transform=transform,
              all_touched=all_touched, dtype=src.meta['dtype'], out_shape=src.shape)
      src.write(array, 1)

# Select model

In [None]:
# List every directory under models_dir that holds a trained 'model.json',
# printed as a ready-to-paste `selected_model = "..."` assignment.
model_exists = False
for subdir, dirs, files in os.walk(models_dir):
  if 'model.json' not in files:
    continue
  # Report the directory relative to models_dir
  print(f'selected_model = "{subdir.split(f"{models_dir}/",1)[1]}"')
  model_exists = True
if model_exists is False:
  print("No model exists.")

In [None]:
# User selection: paste one of the `selected_model = "..."` lines printed above.
selected_model = "agbd_240926_030225"
categorise_variate = False # If the variate was categorised in 5_models

# Define model directories
selected_model_dir = join(models_dir,selected_model)
selected_model_json = join(selected_model_dir, "model.json")
# NOTE: despite the '_dir' suffix this is the path of a JSON file.
selected_model_descr_dir = join(selected_model_dir, "model_description.json")
selected_model_dataset_path = join(selected_model_dir, f"{selected_model}.pkl")
# NOTE(review): read_pickle executes pickled code; only load datasets you produced yourself.
selected_model_dataset = pd.read_pickle(selected_model_dataset_path)

# Read description for model dataset attributes
with open(join(selected_model_dir,"model_dataset_description.json")) as model_dataset_description_json:
  model_dataset_description = json.load(model_dataset_description_json)
# Each key below must be present in the JSON; a missing key raises KeyError.
model_dataset_name = model_dataset_description["model_dataset_name"]
number_of_columns = model_dataset_description["number_of_columns"]
number_of_rows = model_dataset_description["number_of_rows"]
id_column = model_dataset_description["id_column"]
selected_variate = model_dataset_description["selected_variate"]
uncertainty = model_dataset_description["uncertainty"]
covariates_renamed = model_dataset_description["covariates_renamed"]
covariates_categorised = model_dataset_description["covariates_categorised"]
# Predictor columns used for training: the selected predictors followed by the renamed covariates.
selected_predictors = model_dataset_description["selected_predictors"] + model_dataset_description["covariates_renamed"]
categorical_columns = model_dataset_description["categorical_columns"]
descriptive_parameters = model_dataset_description["descriptive_parameters"]
filter_parameter = model_dataset_description["filter_parameter"]
filter_values_to_include = model_dataset_description["filter_values_to_include"]
sample_imported_dataset = model_dataset_description["sample_imported_dataset"]
sample_imported_dataset_by_percent = model_dataset_description["sample_imported_dataset_by_percent"]
sample_imported_dataset_value = model_dataset_description["sample_imported_dataset_value"]

# Reload hyperparameters
with open(selected_model_descr_dir) as model_description_json:
  model_description = json.load(model_description_json)
# NOTE(review): eval() executes whatever expression is stored in the JSON file.
# Acceptable only while model_description.json is trusted (written by 5_models);
# ast.literal_eval would be safer if the stored value is a plain dict literal — confirm.
final_hyperparameters = eval(model_description["hyperparameters"])

# Remove early stopping and replace with mean n_estimators
# No evaluation set is used when retraining below, so early stopping is dropped
# and the (rounded) mean number of estimators recorded in the description is used.
if "early_stopping_rounds" in final_hyperparameters:
  final_hyperparameters = {k:v for k, v in final_hyperparameters.items() if k != "early_stopping_rounds"}
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Define directories
scenarios_model_dir = join(scenarios_dir,selected_model)
uncertainty_selected_model_dir = join(uncertainty_dir, selected_model)
model_iterations_dir = join(uncertainty_selected_model_dir, "model_iterations")

# Create directories
makedirs(uncertainty_selected_model_dir, exist_ok=True)
makedirs(model_iterations_dir, exist_ok=True)

# Model iterations

In [None]:
# Verify that the variate is equal to the mean
print(f'mean = "{selected_variate}"')

# Calculate se from columns flagged 'uncertainty'
if not uncertainty:
  print("There are no flagged uncertainty columns to calculate SE from.")
  print("Manually create the metric from the available columns.")
  for col in selected_model_dataset.columns:
    print(f"{col}")
else:
  # Compare whole column names. With a string variate, `col not in selected_variate`
  # is a substring test, so any column whose name happens to be contained in the
  # variate name would be silently dropped from the suggestions.
  if isinstance(selected_variate, str):
    variate_columns = [selected_variate]
  else:
    variate_columns = list(selected_variate)
  for col in selected_model_dataset.columns:
    if col in uncertainty and col not in variate_columns:
      print(f'se = "{col}"')

In [None]:
mean = "var_agbd"
se = "var_agbd_se"
# Liang et al. (2023) use SE as a proxy for STDEV

# Set model iterations
model_iterations = 100

# Predictors (x) are fixed; the response (y) is redrawn for every iteration
# from the per-row mean and se arrays.
model_dataset_x = selected_model_dataset[selected_predictors]
mean_array = selected_model_dataset[mean].values
se_array = selected_model_dataset[se].values
XGBPredictor = (xgb.XGBClassifier if categorise_variate else xgb.XGBRegressor)(**final_hyperparameters)
model_params = XGBPredictor.get_params()
model_params['eval_metric'] = model_description['metric_used_for_training']
# Default fix for new XGBoost version
for _dropped_param in ('n_estimators', 'enable_categorical', 'missing'):
  model_params.pop(_dropped_param, None)

# Progress label
model_progress_index = 0
model_progress_label = widgets.Label(f"Model iteration: {model_progress_index}/{model_iterations}")
display(model_progress_label)

for model_iteration in range(1,model_iterations+1):
  # Iterations already saved on disk are skipped, so the cell can be resumed
  model_iteration_filename = f"model_iteration_{model_iteration}.json"
  model_iteration_path = join(model_iterations_dir, model_iteration_filename)
  if not exists(model_iteration_path):
    # Seed with the iteration number so each iteration is reproducible
    np.random.seed(model_iteration)
    # Draw this iteration's response from a normal distribution per row
    model_dataset_y = normal(loc=mean_array, scale=se_array)
    # Create DMatrix objects
    model_dtrain = xgb.DMatrix(model_dataset_x, label=model_dataset_y, enable_categorical=True)
    # Train with the tested hyperparameters and the fixed number of rounds
    model = xgb.train(
        params=model_params,
        dtrain=model_dtrain,
        num_boost_round=final_hyperparameters['n_estimators'],
        verbose_eval=True,
    )
    # Save the model iteration
    model.save_model(model_iteration_path)
  # Update progress (also counts iterations that already existed)
  model_progress_index = model_progress_index + 1
  model_progress_label.value = f"Model iteration: {model_progress_index}/{model_iterations}"
print("All model iterations have been trained and saved.")

# Scenario iterations

In [None]:
# Scenarios must be designed and tested using 06_scenarios first.
# Predictors in the 'constant' and scenario subdirs should not be moved.

# List candidate scenario areas: every entry of the model's scenarios directory
# that is not a .json or .csv file, printed as a ready-to-paste assignment.
scenario_area_exists = False
for subdir in os.listdir(scenarios_model_dir):
  if subdir.endswith(('.json', '.csv')):
    continue
  print(f'selected_scenario_area = "{subdir}"')
  scenario_area_exists = True
if scenario_area_exists is False:
  print(f"Create a scenario area directory in {scenarios_model_dir}")

In [None]:
selected_scenario_area = "tekai"

# Inputs: directories inside the selected scenario area
selected_scenario_area_dir = join(scenarios_model_dir, selected_scenario_area)
predictors_dir = join(selected_scenario_area_dir, "predictors")
tile_templates_dir = join(selected_scenario_area_dir, 'tile_templates')
tile_predictor_stacks_dir = join(selected_scenario_area_dir, 'tile_predictor_stacks')

# Outputs: uncertainty directory for this scenario area, plus its tile cache
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_area}")
tile_prediction_cache_dir = join(uncertainty_scenario_area_dir, "tile_prediction_cache")
for _output_dir in (uncertainty_scenario_area_dir, tile_prediction_cache_dir):
  makedirs(_output_dir, exist_ok=True)

# Every entry of the predictor stack tiles directory is an available scenario
scenario_stacks_list = list(os.listdir(tile_predictor_stacks_dir))

# Print the scenarios as a ready-to-paste `scenarios_to_predict` list
print("scenarios_to_predict = [")
for scenario in sorted(scenario_stacks_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# User selection: comment out any scenario that should not be predicted.
scenarios_to_predict = [
  # "2018",
  # "2019",
  # "2020",
  # "2021",
  "2022",
  "2022_no_degradation_since_1990",
  "2022_oldgrowth",
  "2023",
  "2023_no_degradation_since_1990",
  "2023_oldgrowth",
  "all_oldgrowth",
]

# Count the saved model iterations (every entry of the directory is counted)
model_iterations_available = len(os.listdir(model_iterations_dir))
print(f"\nThere are {model_iterations_available} model iterations available.")

In [None]:
# Predict every selected scenario once per model iteration and export each
# prediction as a .tif. Existing outputs are skipped so the cell can be resumed.

# Set the number of scenario iterations. It must be <= the number of model iterations available.
scenario_iterations = 100

assert scenario_iterations <= model_iterations_available, f"Reduce the number of scenario iterations to <= {model_iterations_available}."

# Change this and the code within the block accordingly.
add_covariates = True # Adds a selected covariate value as the predictor
sensitivity_value = 0.99
beam_value = 5

# Check for GPU
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0': print('GPU device not found')
else: print(f"Found GPU at: {device_name}")

# Check existing tile parameters
template_tile_list = []
for file in os.listdir(tile_templates_dir):
  if file.endswith('.tif') and file[:13] == 'template_tile':
    template_tile_list.append(file)
n_tiles = len(template_tile_list)

if n_tiles < 1: print("There are currently no template tiles.")
if n_tiles >= 1:
  template_tile_x = gdal.Open(join(tile_templates_dir,'template_tile_1.tif')).GetRasterBand(1).XSize
  print(f"There are {n_tiles} template tiles.")

# Tile progress
if n_tiles > 1:
  tile_progress_index, tile_progress_label = 0, widgets.Label(value=f"Tile progress: 0 / {n_tiles}")
  display(tile_progress_label)

# Loop through each scenario
for scenario in scenarios_to_predict:
  scenario_predictor_stack_dir = join(tile_predictor_stacks_dir, scenario)
  # Create scenario iterations directory
  scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")
  makedirs(scenario_iterations_dir, exist_ok=True)
  # Iteration progress
  iteration_progress_index = 0
  iteration_progress_label = widgets.Label(f"{scenario} iteration progress: {iteration_progress_index} / {scenario_iterations}")
  display(iteration_progress_label)

  # Check if all scenario iterations already exist, if not then load predictor stack
  scenario_iteration_list = []
  for model_iteration in range(1,scenario_iterations+1):
    prediction_iteration_filename = f"{scenario}__{selected_scenario_area}_{selected_model}_unmasked_iteration_{model_iteration}.tif"
    prediction_iteration_path = join(scenario_iterations_dir, prediction_iteration_filename)
    scenario_iteration_list.append(prediction_iteration_path)
  all_scenario_iterations_exist = all(exists(scenario_iteration) for scenario_iteration in scenario_iteration_list)
  if not all_scenario_iterations_exist:
    if n_tiles == 1:
      # Load template parameters
      template_tile_dir = join(tile_templates_dir, f"template_tile_1.tif")
      template_tile_y = gdal.Open(template_tile_dir).GetRasterBand(1).YSize
      template_tile_x = gdal.Open(template_tile_dir).GetRasterBand(1).XSize
      # Load predictor stack (once per scenario, reused by every model iteration)
      stack_filename = f"predictor_stack_{scenario}_1.npy"
      predictor_stack = np.load(join(scenario_predictor_stack_dir, stack_filename))
      # Append the constant covariate columns (BEAM, then sensitivity)
      if add_covariates: predictor_stack = np.hstack((predictor_stack,
                    np.full((predictor_stack.shape[0], 1), beam_value, dtype=int),
                    np.full((predictor_stack.shape[0], 1), sensitivity_value, dtype=float)
                    ))
    # Predict scenario for each model iteration
    for model_iteration in range(1,scenario_iterations+1):
      # Define the model
      model_dir = join(model_iterations_dir,f"model_iteration_{model_iteration}.json")
      prediction_iteration_filename = f"{scenario}__{selected_scenario_area}_{selected_model}_unmasked_iteration_{model_iteration}.tif"
      prediction_iteration_path = join(scenario_iterations_dir, prediction_iteration_filename)
      # If scenario iteration does not exist:
      if not exists(prediction_iteration_path):
        # Load model
        if categorise_variate: XGBPredictor = xgb.XGBClassifier(**final_hyperparameters)
        else: XGBPredictor = xgb.XGBRegressor(**final_hyperparameters)
        XGBPredictor.load_model(fname=model_dir)
        # Avoids issues using dataframe from CPU
        xgb.set_config(verbosity=0, use_rmm=True)
        # Get number of stacks
        n_stacks = len(os.listdir(scenario_predictor_stack_dir))
        if n_stacks == 1:
          # Define prediction array and reshape
          prediction = XGBPredictor.predict(predictor_stack)
          prediction_array = prediction.reshape((template_tile_y, template_tile_x))
          prediction = None # Flush prediction

        # Tiling for when predictor stacks are separated into chunks
        if n_stacks > 1:
          # Create a tile cache directory for the prediction
          tile_cache_iteration_dir = join(tile_prediction_cache_dir, prediction_iteration_filename[:-4])
          makedirs(tile_cache_iteration_dir, exist_ok=True)
          # Create a tile count to match the predictor stack chunk
          for stack in range(1, n_stacks+1):
            iteration_tile_filename = f"scenario_tile_{stack}.tif"
            # Check if tile already exists (left over from an interrupted run)
            scenario_tile_exists = exists(join(tile_cache_iteration_dir, iteration_tile_filename))
            # If scenario prediction tile does not exist:
            if scenario_tile_exists == False:
              # Load template tile parameters
              template_tile_dir = join(tile_templates_dir, f"template_tile_{stack}.tif")
              template_tile_y = gdal.Open(template_tile_dir).GetRasterBand(1).YSize
              template_tile_x = gdal.Open(template_tile_dir).GetRasterBand(1).XSize
              # Load predictor tile stack
              stack_filename = f"predictor_stack_{scenario}_{stack}.npy"
              predictor_stack = np.load(join(scenario_predictor_stack_dir, stack_filename))
              # Add covariates (sensitivity and BEAM)
              if add_covariates: predictor_stack = np.hstack((predictor_stack,
                                np.full((predictor_stack.shape[0], 1), beam_value, dtype=int),
                                np.full((predictor_stack.shape[0], 1), sensitivity_value, dtype=float)
                                ))
              # Define prediction array and reshape
              prediction = XGBPredictor.predict(predictor_stack)
              predictor_stack = None # Flush predictor stack
              prediction_tile = prediction.reshape((template_tile_y, template_tile_x))
              prediction = None # Flush prediction
              # Export prediction array as .tif
              export_array_as_tif(prediction_tile, join(tile_cache_iteration_dir, iteration_tile_filename), template = template_tile_dir, compress = False)
              prediction_tile = None # Flush prediction tile
            # Update progress (existing tiles count as done)
            tile_progress_index += 1
            tile_progress_label.value = f"Tile progress: {tile_progress_index} / {n_stacks}"
          # Merge the cached tiles top-to-bottom in tile-number order.
          # os.listdir() returns names in arbitrary order (and a plain sort would
          # put tile 10 before tile 2), so the tiles are addressed by number to
          # keep the mosaic rows in the order the tiles were cut.
          tile_arrays = []
          for stack in range(1, n_stacks+1):
            tile_dir = join(tile_cache_iteration_dir, f"scenario_tile_{stack}.tif")
            tile_arrays.append(gdal.Open(tile_dir).ReadAsArray())
          prediction_array = np.vstack(tile_arrays)
          tile_arrays = None # Flush tile arrays
          # Delete scenario tile cache directory and reset index
          shutil.rmtree(tile_cache_iteration_dir)
          tile_progress_index = 0
          tile_progress_label.value = f"Tile progress: 0 / {n_tiles}"

        # Define scenario template (the first listed file of the scenario area's predictors directory)
        scenario_template = join(predictors_dir, os.listdir(predictors_dir)[0])
        export_array_as_tif(prediction_array, prediction_iteration_path, template = scenario_template, compress = True)

      iteration_progress_index += 1
      iteration_progress_label.value = f"{scenario} iteration progress: {iteration_progress_index} / {scenario_iterations}"
  else:
    # Everything already exists: show the configured total rather than a hard-coded 100
    iteration_progress_label.value = f"{scenario} iteration progress: {scenario_iterations} / {scenario_iterations}"
print("\nScenario iterations complete.")

# Scenario statistics


In [None]:
# List the scenario iteration areas available for the selected model.
# Every entry except "model_iterations" is printed with its first ten
# characters (the "scenarios_" prefix used when the directory was created) removed.
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  if subdir == "model_iterations":
    continue
  print(f'selected_scenario_iterations_area = "{subdir[10:]}"')
  scenario_iterations_area_exists = True
if scenario_iterations_area_exists is False:
  print(f"Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "tekai"

# Statistics are written next to the scenario iterations, in separate
# unmasked and masked directories.
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_unmasked_dir = join(uncertainty_scenario_area_dir,"statistics_unmasked")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")
for _statistics_dir in (statistics_unmasked_dir, statistics_masked_dir):
  makedirs(_statistics_dir, exist_ok=True)

# Collect scenarios with iterations: directory names ending in "_iterations",
# with that 11-character suffix removed.
scenarios_iterations_list = [
  entry[:-11]
  for entry in os.listdir(uncertainty_scenario_area_dir)
  if entry.endswith("_iterations")
]
# Print the scenarios as a ready-to-paste `scenarios_to_calculate` list
print("scenarios_to_calculate = [")
for scenario in sorted(scenarios_iterations_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# User selection: scenarios whose iterations are checked and summarised below.
# Paste (and trim) the list printed by the previous cell.
scenarios_to_calculate = [
  "2022",
  "2022_no_degradation_since_1990",
  "2022_oldgrowth",
  "2023",
  "2023_no_degradation_since_1990",
  "2023_oldgrowth",
  "all_oldgrowth",
]

In [None]:
# Check iteration number

# Exact '0' pixels without decimals can be an indicator that an iteration was incorrectly generated.
# These must be deleted and predicted again to avoid incorrect statistics.
check_problems = True

# Exact '0' pixels might have been genuinely predicted (though unlikely in a regression),
# In which case set fix_problems to True and add these iterations to the problem_rasters list.
# The code below will add 0.001 so they won't trigger the problem checker again.
fix_problems = False
problem_rasters = [

]

# The fix only runs when explicitly enabled, as described above.
if fix_problems and len(problem_rasters) > 0:
  for problem_raster in problem_rasters:
    # Iteration filenames start with "<scenario>__", so the scenario directory is
    # derived from the filename itself. No `scenario` variable is defined yet at
    # this point in the cell; relying on it would pick up a stale value left
    # behind by an earlier cell and look in the wrong directory.
    problem_scenario = problem_raster.split("__", 1)[0]
    problem_raster_path = join(uncertainty_scenario_area_dir, f"{problem_scenario}_iterations", problem_raster)
    problem_raster_array = gdal.Open(problem_raster_path).ReadAsArray()
    problem_raster_array[problem_raster_array == 0] = 0.001
    export_array_as_tif(problem_raster_array, problem_raster_path, template = problem_raster_path, compress=True)

# Check the number of prediction iterations
for scenario in scenarios_to_calculate:
  scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")
  iterations = 0
  for subdir in os.listdir(scenario_iterations_dir):
    if subdir.endswith(".tif"):
      # Check whether the prediction iteration is valid
      if check_problems:
        iteration = join(scenario_iterations_dir,subdir)
        iteration_array = gdal.Open(iteration).ReadAsArray()
        assert np.count_nonzero(iteration_array==0) == 0, f"{subdir} contains 0 values, so the iteration may not have predicted correctly.\n Check the file, delete and repredict if necessary.\n If they are valid 0 values, run the cell below on:\n {iteration}."
      iterations += 1
  print(f"There are {iterations} prediction iterations for scenario {scenario} statistics.")

In [None]:
# Per-pixel statistics across the prediction iterations of each scenario:
# mean, standard error, t-based confidence interval and relative uncertainty (%).
confidence_interval = 0.95

# Statistics progress
stats_progress_index = 0
stats_progress_label = widgets.Label(f"Scenario stats progress: {stats_progress_index}/{len(scenarios_to_calculate)}")
display(stats_progress_label)

for scenario in scenarios_to_calculate:
    stat_base_filename = f"{scenario}__{selected_scenario_iterations_area}_{selected_model}"
    scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")

    # Define statistics raster paths
    stat_mean_filename = f"mean__{stat_base_filename}_unmasked.tif"
    stat_mean_dir = join(statistics_unmasked_dir,stat_mean_filename)
    stat_uncertainty_filename = f"uncertainty__{stat_base_filename}_unmasked.tif"
    stat_uncertainty_dir = join(statistics_unmasked_dir,stat_uncertainty_filename)

    # Check whether statistics rasters already exist
    stat_mean_tif_exists = exists(stat_mean_dir)
    stat_uncertainty_tif_exists = exists(stat_uncertainty_dir)

    if stat_mean_tif_exists and stat_uncertainty_tif_exists:
        print(f"{stat_mean_filename} and {stat_uncertainty_filename} already exist.")
    else:
        # Accumulate running sums so only one iteration raster is in memory at a time
        stat_sum = None
        stat_sum_sq = None
        iteration_n = 0
        for subdir in os.listdir(scenario_iterations_dir):
            if subdir.endswith(".tif"):
                iteration = os.path.join(scenario_iterations_dir, subdir)
                iteration_array = gdal.Open(iteration).ReadAsArray()
                if stat_sum is None:
                    stat_sum = np.zeros_like(iteration_array, dtype='float64')
                    stat_sum_sq = np.zeros_like(iteration_array, dtype='float64')
                # Sum and sum of squares
                stat_sum += iteration_array  # Running sum for mean
                stat_sum_sq += np.square(iteration_array, dtype=np.float64)  # Running sum of squares for variance
                iteration_n += 1

        if iteration_n == 0:
            # Nothing to summarise: report it instead of failing on the empty sums
            print(f"No prediction iterations were found for scenario {scenario}, so no statistics were calculated.")
        else:
            # Calculate mean: sum / count
            stat_mean = np.divide(stat_sum, iteration_n, dtype='float64')
            if stat_mean_tif_exists == False:
                # Any iteration raster works as template; the last one read is used
                export_array_as_tif(stat_mean, stat_mean_dir, template = iteration)
                print(f"{stat_mean_filename} has been exported.")
            else: print(f"{stat_mean_filename} already exists.")

            if stat_uncertainty_tif_exists:
                print(f"{stat_uncertainty_filename} already exists.")
            elif iteration_n < 2:
                # The sample variance divides by (n - 1), so one iteration is not enough
                print(f"Only one prediction iteration was found for scenario {scenario}; at least two are needed to calculate uncertainty.")
            else:
                # Sample variance: (sum(x^2) - sum(x)^2 / n) / (n - 1)
                stat_variance = (stat_sum_sq - (stat_sum ** 2) / iteration_n) / (iteration_n - 1)
                # Floating-point cancellation can leave tiny negative values where the
                # iterations are (nearly) identical; clamp them so sqrt() does not return NaN.
                np.maximum(stat_variance, 0, out=stat_variance)
                # Standard error: σ / sqrt(n)
                stat_se = np.sqrt(stat_variance) / np.sqrt(iteration_n)
                # Calculate confidence intervals using t-distribution
                stat_ci_lower, stat_ci_upper = st.t.interval(confidence_interval, iteration_n - 1, loc=stat_mean, scale=stat_se)
                # CI half-width: (upper - lower) / 2
                stat_ci = np.divide(np.subtract(stat_ci_upper, stat_ci_lower, dtype='float64'), 2, dtype='float64')
                # Uncertainty: (CI / mean) * 100%
                stat_uncertainty = np.multiply(np.divide(stat_ci, stat_mean, dtype='float64'), 100, dtype='float64')
                # Export statistics arrays as rasters
                export_array_as_tif(stat_se, join(statistics_unmasked_dir,f"se__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"se__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_ci_lower, join(statistics_unmasked_dir,f"ci_lower__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"ci_lower__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_ci_upper, join(statistics_unmasked_dir,f"ci_upper__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"ci_upper__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_ci, join(statistics_unmasked_dir,f"ci__{stat_base_filename}_unmasked.tif"), template = iteration)
                print(f"ci__{stat_base_filename}_unmasked.tif has been exported.")
                export_array_as_tif(stat_uncertainty, stat_uncertainty_dir, template = iteration)
                print(f"{stat_uncertainty_filename} has been exported.")

    stats_progress_index += 1
    stats_progress_label.value = (f"Scenario stats progress: {stats_progress_index}/{len(scenarios_to_calculate)}")

print("Statistics calculations and .tif exports complete.")

# Mask scenario statistics

In [None]:
# List the scenario iteration areas available for masking.
# Every entry except "model_iterations" is printed with its first ten
# characters (the "scenarios_" prefix used when the directory was created) removed.
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  if subdir == "model_iterations":
    continue
  print(f'selected_scenario_iterations_area = "{subdir[10:]}"')
  scenario_iterations_area_exists = True
if scenario_iterations_area_exists is False:
  print(f"Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "tekai"

# Locate the statistics directories of the selected area
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_unmasked_dir = join(uncertainty_scenario_area_dir,"statistics_unmasked")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")

# Use polygons for masking, only areas inside the polygons will be included
polygons_to_exclude = ['template.gpkg', 'project_area_buffered_bbox.gpkg']
# Print the remaining polygons as a ready-to-paste `mask_polygons` list,
# dropping the 5-character ".gpkg" extension and any inverse polygons.
print("mask_polygons = [")
for polygon in os.listdir(polygons_dir):
  if polygon in polygons_to_exclude or 'inverse' in polygon:
    continue
  print(f"  '{polygon[:-5]}',")
print("]")

In [None]:
# User selection: polygons (file names without the .gpkg extension) used for masking.
# Uncomment the entries to use; an inverse polygon is created for each one below.
mask_polygons = [
  # 'project_area',
  'gedi_area',
  # 'peninsular_malaysia',
  # 'pa_taman_krau',
  # 'pa_ais',
]

# Create an inverse project area path for masking
# For each mask polygon, write "<polygon>_inverse.gpkg": the template extent
# minus the polygon. Existing inverse files are left untouched.
template_polygon_path = join(polygons_dir, "template.gpkg")
for polygon in mask_polygons:
  inverse_polygon_path = join(polygons_dir, f"{polygon}_inverse.gpkg")
  if not exists(inverse_polygon_path):
    polygon_path = join(polygons_dir, f"{polygon}.gpkg")
    template_polygon = gpd.read_file(template_polygon_path)
    polygon_read = gpd.read_file(polygon_path)
    polygon_crs = polygon_read.crs.to_epsg()
    # Dissolve all features of the mask polygon into one geometry first.
    # An index-aligned GeoSeries difference followed by .iloc[0] would only
    # subtract the first feature of a multi-feature polygon file.
    polygon_geometries = polygon_read['geometry']
    if hasattr(polygon_geometries, "union_all"):
      polygon_union = polygon_geometries.union_all()
    else:
      polygon_union = polygon_geometries.unary_union  # older geopandas releases
    inverse_polygon = template_polygon['geometry'].difference(polygon_union).iloc[0]
    # NOTE(review): the result is labelled with the mask polygon's CRS; this
    # assumes the template and the polygon share the same CRS — confirm.
    inverse_polygon_gdf = gpd.GeoDataFrame({'geometry': [inverse_polygon]}, crs=f"EPSG:{polygon_crs}")
    inverse_polygon_gdf.to_file(inverse_polygon_path, driver="GPKG")
    print(f"An inverse masking polygon for {polygon} has been created in {polygons_dir}.")
  else: print(f"An inverse masking polygon for {polygon} already exists.")

# Collect the unmasked statistics rasters to mask. Only the mean and
# uncertainty rasters are selected; other files (e.g. ci__*) are left unmasked.
unmasked_predictions = []
for scenario_prediction in os.listdir(statistics_unmasked_dir):
  if ('mean__' in scenario_prediction) or ('uncertainty__' in scenario_prediction):
    unmasked_predictions.append(scenario_prediction)

# Determine last predictor year for masking future scenarios
final_predictor_years = []
for final_predictor in os.listdir(predictor_final_dir):
  # Expect names ending in "_YYYY.tif". Slicing [-9:-8] instead of indexing [-9]
  # means names shorter than 9 characters are skipped rather than raising IndexError.
  if final_predictor.endswith('.tif') and final_predictor[-9:-8] == '_':
    # Ignore predictors whose last four characters are not a year
    try: final_predictor_years.append(int(final_predictor[-8:-4]))
    except ValueError: continue
last_predictor_year = max(final_predictor_years)

# Masking progress
masking_progress_index = 0
masking_progress_label = widgets.Label(f"Masking progress: {masking_progress_index}/{len(unmasked_predictions)}")
display(masking_progress_label)

# Mask scenario statistics with the relevant mask
for scenario_prediction in unmasked_predictions: # Loop through each unmasked scenario
  # Strip the 13-character "_unmasked.tif" suffix to build the masked filename
  scenario_masked_filename = f"{scenario_prediction[:-13]}.tif"
  scenario_masked_dir = join(statistics_masked_dir, scenario_masked_filename)
  if not exists(scenario_masked_dir):
    mask_exists = False
    scenario_is_oldgrowth = 'all_oldgrowth' in scenario_prediction
    # No early exit: if several masks match, the last one listed is used
    for mask in os.listdir(masks_dir):
      # Ignore anything in the masks directory that is not a .tif raster
      if not mask.endswith('.tif'):
        continue
      mask_is_oldgrowth = 'all_oldgrowth' in mask
      if mask_is_oldgrowth or scenario_is_oldgrowth:
        # all_oldgrowth scenarios are only ever paired with an all_oldgrowth mask
        mask_matches = mask_is_oldgrowth and scenario_is_oldgrowth
      else:
        # Year-based matching: the scenario year is the first four characters of
        # the scenario name, the mask year is characters 12-15 of the mask
        # filename (e.g. "mask_forest_2022.tif"). Names without a parsable year
        # are skipped instead of aborting the whole run.
        try:
          scenario_year = int(scenario_prediction.split('__')[1][:4])
          mask_year = int(mask[12:16])
        except (IndexError, ValueError):
          continue
        # Historic scenarios use the mask of the same year; future scenarios
        # fall back to the most recent forest mask
        mask_matches = (scenario_year == mask_year) or (scenario_year > last_predictor_year and last_predictor_year == mask_year)
      if mask_matches:
        selected_mask_filename = mask
        selected_mask_dir = join(masks_dir, selected_mask_filename)
        mask_exists = True
    if not mask_exists: print(f"A suitable mask for {scenario_prediction} does not exist.\n")
    else: # Mask the scenario prediction
      print(f"Masking {scenario_prediction} with {selected_mask_filename}...")
      mask_array = gdal.Open(selected_mask_dir).ReadAsArray()
      scenario_prediction_unmasked_dir = join(statistics_unmasked_dir, scenario_prediction)
      scenario_prediction_array = gdal.Open(scenario_prediction_unmasked_dir).ReadAsArray()
      # Mask where the mask array is not 1
      scenario_masked_array = np.where(mask_array != 1, nodatavalue, scenario_prediction_array)
      export_array_as_tif(scenario_masked_array, scenario_masked_dir, compress = True)
      if mask_polygons:
        for polygon_mask in mask_polygons:
          # Burning the inverse polygon sets everything outside the polygon to nodata
          inverse_gedi_area_path = join(polygons_dir, f"{polygon_mask}_inverse.gpkg")
          print(f"Masking {scenario_prediction} with {polygon_mask}...")
          burn_polygon_to_raster(scenario_masked_dir, inverse_gedi_area_path, fixed_value=nodatavalue, all_touched=False)
        # Recompress the prediction after burning the polygon masks
        scenario_masked_array_2 = gdal.Open(scenario_masked_dir).ReadAsArray()
        export_array_as_tif(scenario_masked_array_2, scenario_masked_dir, compress = True)
      print(f"{scenario_masked_filename} exported.")
  # Update masking progress
  masking_progress_index += 1
  masking_progress_label.value = f"Masking progress: {masking_progress_index}/{len(unmasked_predictions)}"

# Scenario difference with uncertainty

In [None]:
# List the scenario areas available for the difference calculation. Each
# printed line is a ready-to-paste assignment for the next cell.
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  # Every entry other than "model_iterations" is treated as a scenario area
  if subdir == "model_iterations":
    continue
  # Drop the first 10 characters (the "scenarios_" prefix of the directory name)
  area_name = subdir[10:]
  print(f'selected_scenario_iterations_area = "{area_name}"')
  scenario_iterations_area_exists = True
if not scenario_iterations_area_exists:
  print("Run the scenario iterations section first.")

In [None]:
# Scenario area whose masked statistics will be differenced
selected_scenario_iterations_area = "tekai"

# Input (masked statistics) and output (scenario difference) directories
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")
diff_uncertainty_dir = join(uncertainty_scenario_area_dir, "scenario_difference")
makedirs(diff_uncertainty_dir, exist_ok=True)

# Unique scenario names: the second "__"-separated field of each masked statistic filename
scenarios_diff_set = set()
for masked_statistic in os.listdir(statistics_masked_dir):
    scenario_name = masked_statistic.split("__")[1]
    scenarios_diff_set.add(scenario_name)

# Generate all possible pairs of scenarios, including all orders
scenario_pairs = sorted(itertools.permutations(scenarios_diff_set, 2))

# Print every ordered pair as a commented-out entry that can be pasted into
# the next cell and uncommented as needed
print("# Select scenarios to calculate mean difference with uncertainty")
print("scenario_pairs = [")
for s1, s2 in scenario_pairs:
    pair_line = f"# ('{s1}','{s2}'),"
    print(pair_line)
print("]")

In [None]:
# Select scenarios to calculate mean difference with uncertainty
# Each tuple is (scenario1, scenario2); the difference cell computes
# scenario1 minus scenario2 for every pair listed here. This overrides the
# full list of permutations generated by the previous cell.
scenario_pairs = [
  ('2022', 'all_oldgrowth'),
  ('2022', '2022_no_degradation_since_1990'),
  ('2022', '2022_oldgrowth'),
  ('2022_no_degradation_since_1990', '2022_oldgrowth'),
  ('2022_oldgrowth', 'all_oldgrowth'),
  ('2023', 'all_oldgrowth'),
  ('2023', '2023_no_degradation_since_1990'),
  ('2023', '2023_oldgrowth'),
  ('2023_no_degradation_since_1990', '2023_oldgrowth'),
  ('2023_oldgrowth', 'all_oldgrowth'),
]

In [None]:
# Rename scenario differences for semantic meaning (optional)
# Keys are (scenario1, scenario2) tuples as listed in scenario_pairs; values
# are the names used in the output filenames. A pair without an entry here
# falls back to the default "<scenario1>_-_<scenario2>" name.
difference_names = {
  ('2022', 'all_oldgrowth'):
    '2022_degradation_deforestation_total',
  ('2022', '2022_no_degradation_since_1990'):
    '2022_degradation_since_1990',
  ('2022', '2022_oldgrowth'):
    '2022_degradation_total',
  ('2022_no_degradation_since_1990', '2022_oldgrowth'):
    '2022_degradation_before_1990',
  ('2022_oldgrowth', 'all_oldgrowth'):
    '2022_deforestation_total',
  ('2023', 'all_oldgrowth'):
    '2023_degradation_deforestation_total',
  ('2023', '2023_no_degradation_since_1990'):
    '2023_degradation_since_1990',
  ('2023', '2023_oldgrowth'):
    '2023_degradation_total',
  ('2023_no_degradation_since_1990', '2023_oldgrowth'):
    '2023_degradation_before_1990',
  ('2023_oldgrowth', 'all_oldgrowth'):
    '2023_deforestation_total',
}

In [None]:
# Functions for difference in mean and uncertainty
def diff_mean(scenario1_mean, scenario2_mean):
  """Return the scenario 1 mean minus the scenario 2 mean.

  Works on scalars or, element-wise, on arrays of matching shape.
  """
  return scenario1_mean - scenario2_mean
def diff_uncertainty(scenario1_mean, scenario1_uncertainty, scenario2_mean, scenario2_uncertainty):
  """Combine the uncertainties of two scenarios into one relative uncertainty.

  Each uncertainty is weighted by its scenario mean, the weighted terms are
  added in quadrature (in float64), and the result is divided by the sum of
  the two means.

  NOTE(review): the denominator is the sum of the means, not their
  difference -- confirm this is the intended propagation rule for a
  difference raster. Pixels where the means sum to 0 divide by zero.
  """
  weighted_1 = np.multiply(scenario1_mean, scenario1_uncertainty, dtype='float64')
  weighted_2 = np.multiply(scenario2_mean, scenario2_uncertainty, dtype='float64')
  quadrature_sum = np.square(weighted_1, dtype='float64') + np.square(weighted_2, dtype='float64')
  combined = np.sqrt(quadrature_sum, dtype='float64')
  return combined / (scenario1_mean + scenario2_mean)

# Loop through the scenario pairs, exporting a mean difference raster and a
# difference uncertainty raster (scenario1 minus scenario2) for each pair
for scenario1, scenario2 in scenario_pairs:

  # Lookup the description from the dictionary
  difference_name = difference_names.get((scenario1, scenario2), f"{scenario1}_-_{scenario2}")

  # Define filenames and directories of mean and uncertainty difference .tifs
  diff_mean_filename = f"mean__{difference_name}__{selected_scenario_iterations_area}_{selected_model}.tif"
  diff_mean_dir = join(diff_uncertainty_dir, diff_mean_filename)
  diff_uncertainty_filename = f"uncertainty__{difference_name}__{selected_scenario_iterations_area}_{selected_model}.tif"
  diff_uncertainty_raster_dir = join(diff_uncertainty_dir, diff_uncertainty_filename)

  # Skip the pair only when BOTH outputs exist; if just one is missing it is
  # regenerated below while the existing one is left untouched
  if not (exists(diff_mean_dir) and exists(diff_uncertainty_raster_dir)):
    print(f"Calculating mean difference with uncertainty between {scenario1} and {scenario2}")
    scenario1_base_filename = f"{scenario1}__{selected_scenario_iterations_area}_{selected_model}"
    scenario2_base_filename = f"{scenario2}__{selected_scenario_iterations_area}_{selected_model}"

    # Define mean and uncertainty directories, assert that both exist for both scenarios
    scenario1_mean_dir = join(statistics_masked_dir,f"mean__{scenario1_base_filename}.tif")
    assert exists(scenario1_mean_dir), f"mean__{scenario1_base_filename}.tif does not exist."
    scenario1_uncertainty_dir = join(statistics_masked_dir,f"uncertainty__{scenario1_base_filename}.tif")
    assert exists(scenario1_uncertainty_dir), f"uncertainty__{scenario1_base_filename}.tif does not exist."
    scenario2_mean_dir = join(statistics_masked_dir,f"mean__{scenario2_base_filename}.tif")
    assert exists(scenario2_mean_dir), f"mean__{scenario2_base_filename}.tif does not exist."
    scenario2_uncertainty_dir = join(statistics_masked_dir,f"uncertainty__{scenario2_base_filename}.tif")
    assert exists(scenario2_uncertainty_dir), f"uncertainty__{scenario2_base_filename}.tif does not exist."

    # Convert scenario mean and uncertainty .tifs to temporary arrays
    scenario1_mean_array_temp = gdal.Open(scenario1_mean_dir).ReadAsArray()
    scenario1_uncertainty_array_temp = gdal.Open(scenario1_uncertainty_dir).ReadAsArray()
    scenario2_mean_array_temp = gdal.Open(scenario2_mean_dir).ReadAsArray()
    scenario2_uncertainty_array_temp = gdal.Open(scenario2_uncertainty_dir).ReadAsArray()

    # Fill scenario nodata values with 0 if they are not nodatavalues in the other scenario,
    # so that a pixel stays nodata only where both scenarios are nodata
    scenario1_mean_array = np.where((scenario1_mean_array_temp == nodatavalue) & (scenario2_mean_array_temp != nodatavalue), 0, scenario1_mean_array_temp)
    scenario1_uncertainty_array = np.where((scenario1_uncertainty_array_temp == nodatavalue) & (scenario2_uncertainty_array_temp != nodatavalue), 0, scenario1_uncertainty_array_temp)
    scenario2_mean_array = np.where((scenario2_mean_array_temp == nodatavalue) & (scenario1_mean_array != nodatavalue), 0, scenario2_mean_array_temp)
    scenario2_uncertainty_array = np.where((scenario2_uncertainty_array_temp == nodatavalue) & (scenario1_uncertainty_array != nodatavalue), 0, scenario2_uncertainty_array_temp)

    # Create difference mean and uncertainty arrays where the value is not 'nodatavalue'
    diff_mean_array = np.where(scenario1_mean_array==nodatavalue, nodatavalue, diff_mean(scenario1_mean_array, scenario2_mean_array))
    diff_uncertainty_array = np.where(scenario1_uncertainty_array==nodatavalue, nodatavalue, diff_uncertainty(scenario1_mean_array, scenario1_uncertainty_array, scenario2_mean_array, scenario2_uncertainty_array))

    # Export the mean and uncertainty difference .tifs if they do not exist
    if exists(diff_mean_dir): print(f"{diff_mean_filename} already exists.")
    else:
      export_array_as_tif(diff_mean_array, diff_mean_dir, template = scenario1_mean_dir)
      print(f"{diff_mean_filename} has been exported.")
    if exists(diff_uncertainty_raster_dir): print(f"{diff_uncertainty_filename} already exists.")
    else:
      export_array_as_tif(diff_uncertainty_array, diff_uncertainty_raster_dir, template = scenario1_mean_dir)
      print(f"{diff_uncertainty_filename} has been exported.")

  else: print(f"Both {diff_mean_filename} and {diff_uncertainty_filename} already exist.")

# Intactness

In [None]:
# List the scenario areas available for the intactness calculation. Each
# printed line is a ready-to-paste assignment for the next cell.
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  # Every entry other than "model_iterations" is treated as a scenario area
  if subdir == "model_iterations":
    continue
  # Drop the first 10 characters (the "scenarios_" prefix of the directory name)
  area_name = subdir[10:]
  print(f'selected_scenario_iterations_area = "{area_name}"')
  scenario_iterations_area_exists = True
if not scenario_iterations_area_exists:
  print("Run the scenario iterations section first.")

In [None]:
# Scenario area used for the intactness calculation
selected_scenario_iterations_area = "tekai"

# Input directories (masked statistics, scenario differences) and the intactness output directory
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir, f"scenarios_{selected_scenario_iterations_area}")
statistics_masked_dir = join(uncertainty_scenario_area_dir, "statistics_masked")
diff_uncertainty_dir = join(uncertainty_scenario_area_dir, "scenario_difference")
intactness_dir = join(uncertainty_scenario_area_dir, 'intactness')
makedirs(intactness_dir, exist_ok=True)

# Select which baseline and difference raster to use for calculating intactness
# percentage and relative intactness. Ideally this is the scenario with the least disturbance
# and the difference between that and the current reality.

# Print every mean raster as a ready-to-paste assignment for the next cell
for baseline in os.listdir(statistics_masked_dir):
  if 'mean' not in baseline:
    continue
  print(f"selected_baseline = '{baseline}'")
for diff in os.listdir(diff_uncertainty_dir):
  if 'mean' not in diff:
    continue
  print(f"selected_diff = '{diff}'")

In [None]:
# Baseline raster, difference raster and forest mask year used for the percentage change
selected_baseline = 'mean__all_oldgrowth__tekai_agbd_240926_030225.tif'
selected_diff = 'mean__2022_degradation_deforestation_total__tekai_agbd_240926_030225.tif'
forest_mask_year = '2022'

# The output name is built from the first "__" field of the baseline and the
# first two "__" fields of the difference raster. It has no extension: the
# polygon-masking cell appends its own suffix and ".tif" to it.
percentage_filename = f"percentage_change__{selected_baseline.split('__')[0]}__{selected_diff.split('__')[0]}__{selected_diff.split('__')[1]}"
percentage_path = join(intactness_dir, percentage_filename)

if not exists(percentage_path):
  # Define filenames and directories
  selected_baseline_path = join(statistics_masked_dir, selected_baseline)
  selected_diff_path = join(diff_uncertainty_dir, selected_diff)
  selected_mask_path = join(masks_dir, f"mask_forest_{forest_mask_year}.tif")

  # Convert to arrays
  selected_baseline_array = gdal.Open(selected_baseline_path).ReadAsArray()
  selected_diff_array = gdal.Open(selected_diff_path).ReadAsArray()
  selected_mask_array = gdal.Open(selected_mask_path).ReadAsArray()

  # Pixels that cannot yield a meaningful percentage: outside the forest mask,
  # nodata in either input, or a zero baseline (division by zero)
  invalid_pixels = (
    (selected_mask_array == nodatavalue)
    | (selected_diff_array == nodatavalue)
    | (selected_baseline_array == nodatavalue)
    | (selected_baseline_array == 0)
  )
  # Substitute a harmless denominator on invalid pixels; their result is discarded below
  safe_baseline_array = np.where(invalid_pixels, 1, selected_baseline_array)

  # Percentage change of the difference relative to the baseline, nodata elsewhere
  percentage_array = np.where(invalid_pixels, nodatavalue, selected_diff_array / safe_baseline_array * 100)
  export_array_as_tif(percentage_array, percentage_path, template = selected_baseline_path)
  print(f"{percentage_filename} has been exported.")

else: print(f"{percentage_filename} already exists.")

In [None]:
# Use additional polygons for masking relative intactness quantiles
polygons_to_exclude = ['template.gpkg', 'project_area_buffered_bbox.gpkg']

# Print each candidate as a ready-to-paste assignment for the next cell
for polygon in os.listdir(polygons_dir):
  # Only GeoPackages are valid candidates: the next cell strips the
  # 5-character '.gpkg' suffix from the selected filename
  if not polygon.endswith('.gpkg'):
    continue
  # Skip helper polygons and previously generated inverse polygons
  if polygon in polygons_to_exclude or 'inverse' in polygon:
    continue
  print(f"mask_polygon = '{polygon}'")

In [None]:
# GeoPackage filename used to restrict the relative intactness calculation,
# or None to use the unmasked percentage raster
mask_polygon = 'forest_reserves.gpkg'
# mask_polygon = None

if mask_polygon is not None:
  # Create an inverse polygon (template minus polygon) for masking
  template_polygon_path = join(polygons_dir, "template.gpkg")
  inverse_polygon_path = join(polygons_dir, f"{mask_polygon[:-5]}_inverse.gpkg")
  if not exists(inverse_polygon_path):
    polygon_path = join(polygons_dir, mask_polygon)
    template_polygon = gpd.read_file(template_polygon_path)
    polygon_read = gpd.read_file(polygon_path)
    polygon_crs = polygon_read.crs.to_epsg()
    # NOTE(review): difference() is applied between the two geometry columns
    # and only the first result is kept -- confirm the polygon layer holds a
    # single feature.
    inverse_polygon = template_polygon['geometry'].difference(polygon_read['geometry']).iloc[0]
    inverse_polygon_gdf = gpd.GeoDataFrame({'geometry': [inverse_polygon]}, crs=f"EPSG:{polygon_crs}")
    inverse_polygon_gdf.to_file(inverse_polygon_path, driver="GPKG")
    # Report the selected mask polygon, not a loop variable left over from another cell
    print(f"An inverse masking polygon for {mask_polygon} has been created in {polygons_dir}.")
  else: print(f"An inverse masking polygon for {mask_polygon} already exists.")

  # Copy the percentage raster for potential masking
  percentage_masked_filename = f"{percentage_filename}_masked_{mask_polygon[:-5]}.tif"
  percentage_masked_path = join(intactness_dir, percentage_masked_filename)
  if not exists(percentage_masked_path):
    print(f"Copying {percentage_filename} for masking...")
    copyfile(percentage_path, percentage_masked_path)
    print(f"Masking {percentage_filename} with {mask_polygon}...")
    # Burning the inverse polygon sets everything outside the polygon to nodata
    burn_polygon_to_raster(percentage_masked_path, inverse_polygon_path, fixed_value=nodatavalue, all_touched=False)
    # Recompress the prediction after burning the polygon masks
    percentage_masked_array = gdal.Open(percentage_masked_path).ReadAsArray()
    export_array_as_tif(percentage_masked_array, percentage_masked_path, compress = True)
    print(f"{percentage_filename} masked.")
  else: print(f"{percentage_masked_path} already exists.")

else: print("No additional mask will be used to calculate relative intactness.")

In [None]:
# Define number of quantiles for intactness rating (e.g. 10 for 1 - 10)
num_quantiles = 10

# Use the polygon-masked percentage raster when a mask polygon was selected,
# otherwise fall back to the unmasked percentage raster
if mask_polygon is not None:
  relative_intactness_name = f'intactness_{mask_polygon[:-5]}_{num_quantiles}_quantiles'
  quantile_source_path = percentage_masked_path
else:
  relative_intactness_name = f'intactness_{num_quantiles}_quantiles'
  quantile_source_path = percentage_path

# Define paths and arrays
relative_intactness_path = join(intactness_dir, f'{relative_intactness_name}.tif')
percentage_masked_array = gdal.Open(quantile_source_path).ReadAsArray()
relative_intactness_array = np.empty_like(percentage_masked_array, dtype=object)

# Separate valid and invalid (nodatavalue) elements before any value is changed
invalid_elements = percentage_masked_array == nodatavalue

# Clip positive percentage changes to 0 so that only negative change lowers
# the rating; nodata pixels are never touched by the clipping
percentage_masked_array[(percentage_masked_array > 0) & ~invalid_elements] = 0
valid_elements = percentage_masked_array[~invalid_elements]

# Calculate the interior quantile edges for valid elements
quantiles = np.percentile(valid_elements, np.linspace(0, 100, num_quantiles + 1)[1:-1]) if len(valid_elements) > 0 else []

# Rate every pixel from 1 to num_quantiles; the outermost classes are
# open-ended (-inf / inf). The bounds are kept for the CSV written below.
quantile_bounds = []
for i in range(1, num_quantiles + 1):
  lower_bound = quantiles[i-2] if i > 1 and len(quantiles) >= i-1 else float('-inf')
  upper_bound = quantiles[i-1] if len(quantiles) >= i else float('inf')
  quantile_bounds.append((lower_bound, upper_bound))
  relative_intactness_array[(percentage_masked_array > lower_bound) & (percentage_masked_array <= upper_bound)] = i
# Restore nodata once, after all classes have been assigned
relative_intactness_array[invalid_elements] = nodatavalue
export_array_as_tif(relative_intactness_array, relative_intactness_path)

# Prepare data for CSV: Collect lower and upper bounds for each category
ranges_data = {
  'Lower_Bound': [bounds[0] for bounds in quantile_bounds],
  'Upper_Bound': [bounds[1] for bounds in quantile_bounds],
}

# Create DataFrame and save to CSV
relative_intactness_df = pd.DataFrame(ranges_data)
relative_intactness_csv_path = join(intactness_dir, f'{relative_intactness_name}.csv')
relative_intactness_df.to_csv(relative_intactness_csv_path, index=False)

# Generate and save histogram as .png
histogram_path = join(intactness_dir, f'{relative_intactness_name}.png')
plt.figure()
plt.hist(valid_elements.flatten(), bins='auto')
plt.title(f'{relative_intactness_name} Histogram')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.savefig(histogram_path)
plt.show()
plt.close()

# Disconnect runtime

In [None]:
# Useful for stopping background execution upon completion
# NOTE(review): presumably disconnects and releases the Colab runtime this
# notebook is running on, so nothing after this call will execute -- confirm
# against the google.colab documentation.
runtime.unassign()