<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/7_uncertainty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and directories

In [None]:
# Define base directory
# Root of the project on the mounted Google Drive; every other directory in this notebook is joined onto it.
base_dir = "/gdrive/Shareddrives/masfi"

# Mount Google Drive and set base directory
from google.colab import drive
import os
import sys
# The mount point '/gdrive' must match the prefix used in base_dir above.
drive.mount('/gdrive', force_remount=True)
# Resolve the base directory and append it to sys.path only if it is not already listed,
# so re-running this cell does not add duplicate entries.
_path_to_add = os.path.realpath(base_dir)
if _path_to_add not in sys.path:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs and upgrades
!pip install xgboost --upgrade

In [None]:
# Reload imports, replacing those in the cache
%reload_ext autoreload
%autoreload 2
# Imports
import json
from google.colab import runtime
import ipywidgets as widgets
import itertools
import joblib
import numpy as np
from numpy import random
from numpy.random import normal
from os import makedirs
from os.path import exists, join
from osgeo import gdal
import pandas as pd
import scipy.stats as st
import shutil
import xgboost as xgb

# Report whether TensorFlow can see the first GPU device
import tensorflow as tf
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
if gpu_found:
  print(f"Found GPU at: {device_name}")
else:
  print('GPU device not found.')

In [None]:
# Define directories
# All paths are sub-directories of base_dir, following the numbered project layout.
areas_dir = join(base_dir, "1_areas")  # holds template.tif, used as the default export template
masks_dir = join(areas_dir, "masks")  # masks applied to the statistics rasters
models_dir = join(base_dir, "5_models")  # searched for trained models (model.json)
scenarios_dir = join(base_dir, "6_scenarios")  # scenario areas and predictor stacks
uncertainty_dir = join(base_dir, "7_uncertainty")  # outputs of this notebook

# Create directories
# Only the uncertainty directory is created here; the others are expected to exist already.
makedirs(uncertainty_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress):
  """Write a 2D array to a single-band Float32 GeoTIFF that copies a template's georeferencing.

  Args:
    input_array: 2D array written to band 1 of the output raster.
    output_tif: Path of the GeoTIFF to create (overwritten if it exists).
    template: Path of a raster whose band-1 size, geotransform and projection are copied.
    nodatavalue: Value registered as the nodata value of band 1.
    compress: If truthy, write with DEFLATE compression (PREDICTOR=2, ZLEVEL=9);
      any falsy value writes an uncompressed file.

  Raises:
    FileNotFoundError: If the template raster cannot be opened.
    RuntimeError: If the output raster cannot be created.
  """
  template_dataset = gdal.Open(template)
  if template_dataset is None:
    raise FileNotFoundError(f"Could not open template raster: {template}")
  template_band = template_dataset.GetRasterBand(1)
  x_size, y_size = template_band.XSize, template_band.YSize
  template_dimensions, template_projection = template_dataset.GetGeoTransform(), template_dataset.GetProjection()
  # Release the template before creating the output: callers may pass the same
  # path as both template and output, and the file should not be open while it is overwritten.
  template_band, template_dataset = None, None

  # A single if/else guarantees the output dataset is always defined
  # (the previous `if compress` / `if compress == False` pair skipped both for e.g. None).
  creation_options = ["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"] if compress else []
  output_dataset = gdal.GetDriverByName("GTiff").Create(output_tif, x_size, y_size, bands=1, eType=gdal.GDT_Float32,
                                                        options=creation_options)
  if output_dataset is None:
    raise RuntimeError(f"Could not create output raster: {output_tif}")
  output_band = output_dataset.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  output_dataset.SetGeoTransform(template_dimensions)
  output_dataset.SetProjection(template_projection)
  # Flush and release explicitly so the file is complete on disk before it is read back.
  output_dataset.FlushCache()
  output_band, output_dataset = None, None

# Select model

In [None]:
# List every trained model (a directory containing 'model.json') so one can be
# copied into the selection cell. Models must be tested and trained in advance.
model_exists = False
for subdir, dirs, files in os.walk(models_dir):
  if 'model.json' not in files:
    continue
  # Path of the model directory relative to models_dir
  relative_model_dir = subdir.split(f"{models_dir}/",1)[1]
  print(f'selected_model = "{relative_model_dir}"')
  model_exists = True
if model_exists == False:
  print("No model exists.")

In [None]:
# Name of the model directory (relative to models_dir) to calculate uncertainty for.
selected_model = "agbd_240819_091905"
categorise_variate = False # If the variate was categorised in 5_models

# Define model directories
selected_model_dir = join(models_dir,selected_model)
selected_model_json = join(selected_model_dir, "model.json")
# NOTE: despite the '_dir' suffix this is the path of the model description .json file.
selected_model_descr_dir = join(selected_model_dir, "model_description.json")
selected_model_dataset_path = join(selected_model_dir, f"{selected_model}.pkl")
# NOTE(review): read_pickle can execute arbitrary code; only load datasets from a trusted source.
selected_model_dataset = pd.read_pickle(selected_model_dataset_path)

# Read description for model dataset attributes
with open(join(selected_model_dir,"model_dataset_description.json")) as model_dataset_description_json:
  model_dataset_description = json.load(model_dataset_description_json)
model_dataset_name = model_dataset_description["model_dataset_name"]
number_of_columns = model_dataset_description["number_of_columns"]
number_of_rows = model_dataset_description["number_of_rows"]
id_column = model_dataset_description["id_column"]
selected_variate = model_dataset_description["selected_variate"]
uncertainty = model_dataset_description["uncertainty"]
covariates_renamed = model_dataset_description["covariates_renamed"]
covariates_categorised = model_dataset_description["covariates_categorised"]
# Predictor columns used for training: the selected predictors followed by the renamed covariates.
selected_predictors = model_dataset_description["selected_predictors"] + model_dataset_description["covariates_renamed"]
categorical_columns = model_dataset_description["categorical_columns"]
descriptive_parameters = model_dataset_description["descriptive_parameters"]
filter_parameter = model_dataset_description["filter_parameter"]
filter_values_to_include = model_dataset_description["filter_values_to_include"]
sample_imported_dataset = model_dataset_description["sample_imported_dataset"]
sample_imported_dataset_by_percent = model_dataset_description["sample_imported_dataset_by_percent"]
sample_imported_dataset_value = model_dataset_description["sample_imported_dataset_value"]

# Reload hyperparameters
with open(selected_model_descr_dir) as model_description_json:
  model_description = json.load(model_description_json)
# NOTE(review): eval() executes whatever string is stored under "hyperparameters";
# only use model descriptions from a trusted source. Presumably the string is a dict
# literal (it is used as a dict below) - confirm before switching to ast.literal_eval.
final_hyperparameters = eval(model_description["hyperparameters"])

# Remove early stopping and replace with mean n_estimators
# (the iterations below are trained without a validation set, so a fixed number of rounds is used)
if "early_stopping_rounds" in final_hyperparameters:
  final_hyperparameters = {k:v for k, v in final_hyperparameters.items() if k != "early_stopping_rounds"}
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Define directories
scenarios_model_dir = join(scenarios_dir,selected_model)
uncertainty_selected_model_dir = join(uncertainty_dir, selected_model)
model_iterations_dir = join(uncertainty_selected_model_dir, "model_iterations")

# Create directories
makedirs(uncertainty_selected_model_dir, exist_ok=True)
makedirs(model_iterations_dir, exist_ok=True)

# Model iterations

In [None]:
# Print the selected variate so it can be confirmed as the 'mean' column
print(f'mean = "{selected_variate}"')

# Suggest the 'se' column from the columns flagged 'uncertainty'
if len(uncertainty) > 0:
  for col in selected_model_dataset.columns:
    # Skip columns that are not flagged, or that are part of the variate itself
    if col not in uncertainty or col in selected_variate:
      continue
    print(f'se = "{col}"')
else:
  # Nothing flagged: list every column so a metric can be created manually
  print("There are no flagged uncertainty columns to calculate SE from.")
  print("Manually create the metric from the available columns.")
  for col in selected_model_dataset.columns:
    print(f"{col}")

In [None]:
# Columns holding the per-sample mean and standard error of the variate
mean = "var_agbd"
se = "var_agbd_se"
# Liang et al. (2023) use SE as a proxy for STDEV

# Number of model iterations to train
model_iterations = 100

# The predictors are fixed; the target (y) is redrawn for each iteration from the mean and se arrays
model_dataset_x = selected_model_dataset[selected_predictors]
mean_array = selected_model_dataset[mean].values
se_array = selected_model_dataset[se].values
XGBPredictor = xgb.XGBClassifier(**final_hyperparameters) if categorise_variate else xgb.XGBRegressor(**final_hyperparameters)
model_params = XGBPredictor.get_params()
model_params['eval_metric'] = model_description['metric_used_for_training']
# Default fix for new XGBoost version: remove these keys from the training parameters
for removed_key in ('n_estimators', 'enable_categorical', 'missing'):
  model_params.pop(removed_key, None)

# Progress label
model_progress_index = 0
model_progress_label = widgets.Label(f"Model iteration: {model_progress_index}/{model_iterations}")
display(model_progress_label)

for model_iteration in range(1, model_iterations + 1):
  # Path of this iteration's saved model
  model_iteration_filename = f"model_iteration_{model_iteration}.json"
  model_iteration_path = join(model_iterations_dir, model_iteration_filename)
  # Only train iterations that have not been saved yet, so the cell can be resumed
  iteration_already_saved = exists(model_iteration_path)
  if not iteration_already_saved:
    # Seed with the iteration number so each iteration's sample is replicable
    np.random.seed(model_iteration)
    # Draw this iteration's y from a normal distribution around the mean
    model_dataset_y = normal(mean_array, se_array)
    # Build the training matrix
    model_dtrain = xgb.DMatrix(model_dataset_x, model_dataset_y, enable_categorical=True)
    # Train with the tested hyperparameters for a fixed number of rounds
    model = xgb.train(
        model_params,
        model_dtrain,
        num_boost_round=final_hyperparameters['n_estimators'],
        verbose_eval=True,
    )
    # Save the model iteration
    model.save_model(model_iteration_path)
  # Update progress whether the iteration was trained or already present
  model_progress_index = model_progress_index + 1
  model_progress_label.value = f"Model iteration: {model_progress_index}/{model_iterations}"
print("All model iterations have been trained and saved.")

# Scenario iterations

In [None]:
# Scenarios must be designed and tested using 6_scenarios first.
# Predictors in the 'constant' and scenario subdirs should not be moved.

# List candidate scenario areas: every entry of the model's scenario
# directory that is not a .json or .csv file
scenario_area_exists = False
for subdir in os.listdir(scenarios_model_dir):
  if subdir.endswith('.json') or subdir.endswith('.csv'):
    continue
  print(f'selected_scenario_area = "{subdir}"')
  scenario_area_exists = True
if scenario_area_exists == False:
  print(f"Create a scenario area directory in {scenarios_model_dir}")

In [None]:
selected_scenario_area = "taman"

# Input directories of the selected scenario area
selected_scenario_area_dir = join(scenarios_model_dir, selected_scenario_area)
predictors_dir = join(selected_scenario_area_dir, "predictors")
tile_templates_dir = join(selected_scenario_area_dir, 'tile_templates')
tile_predictor_stacks_dir = join(selected_scenario_area_dir, 'tile_predictor_stacks')

# Output directories for this scenario area's uncertainty results
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_area}")
tile_prediction_cache_dir = join(uncertainty_scenario_area_dir, "tile_prediction_cache")
for output_dir in (uncertainty_scenario_area_dir, tile_prediction_cache_dir):
  makedirs(output_dir, exist_ok=True)

# Every entry of the predictor stack tiles directory is an available scenario
scenario_stacks_list = list(os.listdir(tile_predictor_stacks_dir))

# Print the scenarios as a list that can be pasted into the next cell
print("scenarios_to_predict = [")
for scenario in sorted(scenario_stacks_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# Scenarios to predict with every model iteration.
# Commented-out entries are available scenarios that are currently excluded.
scenarios_to_predict = [
  # "2019",
  # "2020",
  # "2021",
  # "2022",
  # "2023",
  "2024",
  "2024_nodef_historic",
  "2024_nodeg_historic",
  "9999_comrec",
  "9999_comrest",
  "9999_comrest_first_historic",
]

# Check the number of model iterations available
# NOTE: this counts every entry in the model iterations directory, not only model_iteration_*.json files.
model_iterations_available = len(os.listdir(model_iterations_dir))
print(f"\nThere are {len(os.listdir(model_iterations_dir))} model iterations available.")

In [None]:
# Set the number of scenario iterations. It must be <= the number of model iterations available.
scenario_iterations = 100

assert scenario_iterations <= model_iterations_available, f"Reduce the number of scenario iterations to <= {model_iterations_available}."

# Change this and the code within the block accordingly.
add_covariates = True # Adds a selected covariate value as the predictor
sensitivity_value = 0.99
beam_value = 5

def _append_covariate_columns(predictor_stack):
  """Return predictor_stack with constant beam and sensitivity columns appended, in that order."""
  n_rows = predictor_stack.shape[0]
  return np.hstack((predictor_stack,
                    np.full((n_rows, 1), beam_value, dtype=int),
                    np.full((n_rows, 1), sensitivity_value, dtype=float)
                    ))

def _raster_size(tif_path):
  """Return (rows, columns) of a raster, opening the file only once."""
  raster = gdal.Open(tif_path)
  return raster.RasterYSize, raster.RasterXSize

# Check for GPU
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0': print('GPU device not found')
else: print(f"Found GPU at: {device_name}")

# Check existing tile parameters
template_tile_list = []
for file in os.listdir(tile_templates_dir):
  if file.endswith('.tif') and file[:13] == 'template_tile':
    template_tile_list.append(file)
n_tiles = len(template_tile_list)

if n_tiles < 1: print("There are currently no template tiles.")
if n_tiles >= 1:
  template_tile_x = gdal.Open(join(tile_templates_dir,'template_tile_1.tif')).GetRasterBand(1).XSize
  print(f"There are {n_tiles} template tiles.")

# Tile progress
if n_tiles > 1:
  tile_progress_index, tile_progress_label = 0, widgets.Label(value=f"Tile progress: 0 / {n_tiles}")
  display(tile_progress_label)

# Loop through each scenario
for scenario in scenarios_to_predict:
  scenario_predictor_stack_dir = join(tile_predictor_stacks_dir, scenario)
  # Create scenario iterations directory
  scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")
  makedirs(scenario_iterations_dir, exist_ok=True)
  # Iteration progress
  iteration_progress_index = 0
  iteration_progress_label = widgets.Label(f"{scenario} iteration progress: {iteration_progress_index} / {scenario_iterations}")
  display(iteration_progress_label)

  if n_tiles == 1:
    # Load template parameters
    template_tile_dir = join(tile_templates_dir, f"template_tile_1.tif")
    template_tile_y, template_tile_x = _raster_size(template_tile_dir)
    # Load the single predictor stack once; it is reused for every model iteration
    stack_filename = f"predictor_stack_{scenario}_1.npy"
    predictor_stack = np.load(join(scenario_predictor_stack_dir, stack_filename))
    if add_covariates: predictor_stack = _append_covariate_columns(predictor_stack)

  # Predict scenario for each model iteration
  for model_iteration in range(1,scenario_iterations+1):
    # Define the model
    model_dir = join(model_iterations_dir,f"model_iteration_{model_iteration}.json")
    # Define scenario iteration filename and check if exists
    prediction_iteration_filename = f"{scenario}__{selected_scenario_area}_{selected_model}_unmasked_iteration_{model_iteration}.tif"
    prediction_iteration_dir = join(scenario_iterations_dir, prediction_iteration_filename)
    # If scenario iteration does not exist (direct check instead of scanning the directory):
    if not exists(prediction_iteration_dir):
      # Load model
      if categorise_variate: XGBPredictor = xgb.XGBClassifier(**final_hyperparameters)
      else: XGBPredictor = xgb.XGBRegressor(**final_hyperparameters)
      XGBPredictor.load_model(fname=model_dir)
      # Avoids issues using dataframe from CPU
      xgb.set_config(verbosity=0, use_rmm=True)
      # Get number of stacks
      n_stacks = len(os.listdir(scenario_predictor_stack_dir))
      if n_stacks == 1:
        # Define prediction array and reshape
        prediction = XGBPredictor.predict(predictor_stack)
        prediction_array = prediction.reshape((template_tile_y, template_tile_x))
        prediction = None # Flush prediction

      # Tiling for when predictor stacks are separated into chunks
      if n_stacks > 1:
        # Create a tile cache directory for the prediction
        tile_cache_iteration_dir = join(tile_prediction_cache_dir, prediction_iteration_filename[:-4])
        makedirs(tile_cache_iteration_dir, exist_ok=True)
        # Create a tile count to match the predictor stack chunk
        for stack in range(1, n_stacks+1):
          iteration_tile_filename = f"scenario_tile_{stack}.tif"
          iteration_tile_path = join(tile_cache_iteration_dir, iteration_tile_filename)
          # If scenario prediction tile does not exist (allows resuming an interrupted iteration):
          if not exists(iteration_tile_path):
            # Load template tile parameters
            template_tile_dir = join(tile_templates_dir, f"template_tile_{stack}.tif")
            template_tile_y, template_tile_x = _raster_size(template_tile_dir)
            # Load predictor tile stack
            stack_filename = f"predictor_stack_{scenario}_{stack}.npy"
            predictor_stack = np.load(join(scenario_predictor_stack_dir, stack_filename))
            # Add covariates (sensitivity and BEAM)
            if add_covariates: predictor_stack = _append_covariate_columns(predictor_stack)
            # Define prediction array and reshape
            prediction = XGBPredictor.predict(predictor_stack)
            predictor_stack = None # Flush predictor stack
            prediction_tile = prediction.reshape((template_tile_y, template_tile_x))
            prediction = None # Flush prediction
            # Export prediction array as .tif
            export_array_as_tif(prediction_tile, iteration_tile_path, template = template_tile_dir, compress = False)
            prediction_tile = None # Flush prediction tile
          # Update progress
          tile_progress_index += 1
          tile_progress_label.value = f"Tile progress: {tile_progress_index} / {n_stacks}"
        # Merge the tiles in numeric order (1..n_stacks). os.listdir order is arbitrary and a
        # name sort would place tile 10 before tile 2, stacking rows in the wrong order.
        tile_arrays = [gdal.Open(join(tile_cache_iteration_dir, f"scenario_tile_{stack}.tif")).ReadAsArray()
                       for stack in range(1, n_stacks+1)]
        prediction_array = np.vstack(tile_arrays)
        tile_arrays = None # Flush tile arrays
        # Delete scenario tile cache directory and reset index
        shutil.rmtree(tile_cache_iteration_dir)
        tile_progress_index = 0
        tile_progress_label.value = f"Tile progress: 0 / {n_tiles}"

      # Define scenario template
      # NOTE(review): this takes whichever predictor os.listdir returns first; it assumes
      # every file in predictors_dir is a raster with the scenario's extent - confirm.
      scenario_template = join(predictors_dir, os.listdir(predictors_dir)[0])
      export_array_as_tif(prediction_array, prediction_iteration_dir, template = scenario_template, compress = True)

    iteration_progress_index += 1
    iteration_progress_label.value = f"{scenario} iteration progress: {iteration_progress_index} / {scenario_iterations}"
print("\nScenario iterations complete.")

# Scenario statistics


In [None]:
# List the scenario areas that already have iterations: every entry of the
# model's uncertainty directory other than 'model_iterations'
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  if subdir == "model_iterations":
    continue
  # Drop the first 10 characters (the 'scenarios_' prefix) to recover the area name
  area_name = subdir[10:]
  print(f'selected_scenario_iterations_area = "{area_name}"')
  scenario_iterations_area_exists = True
if scenario_iterations_area_exists == False:
  print(f"Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "taman"

# Output directories for the unmasked and masked statistics rasters
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_unmasked_dir = join(uncertainty_scenario_area_dir,"statistics_unmasked")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")
for statistics_dir in (statistics_unmasked_dir, statistics_masked_dir):
  makedirs(statistics_dir, exist_ok=True)

# Scenarios with iterations are the '<scenario>_iterations' directories;
# strip the 11-character '_iterations' suffix to recover the scenario name
scenarios_iterations_list = [
  entry[:-11]
  for entry in os.listdir(uncertainty_scenario_area_dir)
  if entry.endswith("_iterations")
]
# Print the scenarios as a list that can be pasted into the next cell
print("scenarios_to_calculate = [")
for scenario in sorted(scenarios_iterations_list):
  print(f'  "{scenario}",')
print("]")

In [None]:
# Scenarios to calculate statistics for; each must have a '<scenario>_iterations'
# directory in the selected scenario area's uncertainty directory.
scenarios_to_calculate = [
  "2024",
  "2024_nodef_historic",
  "2024_nodeg_historic",
  "9999_comrec",
  "9999_comrest",
  "9999_comrest_first_historic",
]

In [None]:
# Check the number of prediction iterations
# Every iteration raster is read in full, so this cell can be slow for large rasters.
# NOTE: when the assertion fails the loop variable `scenario` stays set to the failing
# scenario; the repair cell that follows builds its path from that leftover value.
for scenario in scenarios_to_calculate:
  scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")
  iterations = 0
  for subdir in os.listdir(scenario_iterations_dir):
    if subdir.endswith(".tif"):
      # Check whether the prediction iteration is valid
      iteration = join(scenario_iterations_dir,subdir)
      iteration_array = gdal.Open(iteration).ReadAsArray()
      # Any exact 0 is treated as a sign of a failed prediction and stops the check.
      assert np.count_nonzero(iteration_array==0) == 0, f"{subdir} contains 0 values, so the iteration may not have predicted correctly.\n Check the file, delete and repredict if necessary.\n If they are valid 0 values, run the cell below on:\n {iteration}."
      iterations += 1
  print(f"There are {iterations} prediction iterations for scenario {scenario} statistics.")

In [None]:
# If the predicted 0 is legitimate, convert to 0.001 and repeat the above cell.
# Set problem_array to the filename (not the full path) of the iteration .tif to repair;
# leave it empty to skip this cell.
problem_array = ''

if len(problem_array) > 0:
  # NOTE: `scenario` is the loop variable left over from a previously run cell (hidden state).
  # Run this directly after the failed check so it still refers to the failing scenario.
  problem_array_dir = join(uncertainty_scenario_area_dir, f"{scenario}_iterations", problem_array)
  # From here on problem_array holds the raster values rather than the filename.
  problem_array = gdal.Open(problem_array_dir).ReadAsArray()
  problem_array[problem_array == 0] = 0.001
  # The file is overwritten in place, using itself as the georeferencing template.
  export_array_as_tif(problem_array, problem_array_dir, template = problem_array_dir, compress=True)

In [None]:
confidence_interval = 0.95

# For each scenario, calculate per-pixel mean, standard error, confidence interval and
# relative uncertainty (%) across all prediction iterations, and export them as rasters.
for scenario in scenarios_to_calculate:
    stat_base_filename = f"{scenario}__{selected_scenario_iterations_area}_{selected_model}"
    scenario_iterations_dir = join(uncertainty_scenario_area_dir,f"{scenario}_iterations")

    # Define statistics raster directories
    stat_mean_filename = f"{stat_base_filename}__mean_unmasked.tif"
    stat_mean_dir = join(statistics_unmasked_dir,stat_mean_filename)
    stat_uncertainty_filename = f"{stat_base_filename}__uncertainty_unmasked.tif"
    stat_uncertainty_dir = join(statistics_unmasked_dir,stat_uncertainty_filename)

    # Check whether statistics rasters already exist
    stat_mean_tif_exists = exists(stat_mean_dir)
    stat_uncertainty_tif_exists = exists(stat_uncertainty_dir)

    # Nothing to do if both mean and uncertainty exist
    if stat_mean_tif_exists and stat_uncertainty_tif_exists:
        print(f"{stat_mean_filename} and {stat_uncertainty_filename} already exist.")
        continue

    # Accumulate the running sum and sum of squares so only one iteration is in memory at a time
    stat_sum = None
    stat_sum_sq = None
    iteration_n = 0
    # Sorted so the accumulation order (and the raster used as template) is deterministic
    for subdir in sorted(os.listdir(scenario_iterations_dir)):
        if not subdir.endswith(".tif"):
            continue
        iteration = join(scenario_iterations_dir, subdir)
        iteration_array = gdal.Open(iteration).ReadAsArray()
        if stat_sum is None:
            stat_sum = np.zeros_like(iteration_array, dtype='float64')
            stat_sum_sq = np.zeros_like(iteration_array, dtype='float64')
        stat_sum += iteration_array  # Running sum for mean
        stat_sum_sq += np.square(iteration_array, dtype=np.float64)  # Running sum of squares for variance
        iteration_n += 1
    assert iteration_n > 0, f"There are no prediction iterations (.tif) in {scenario_iterations_dir}."

    # Calculate mean: sum / count
    stat_mean = np.divide(stat_sum, iteration_n, dtype='float64')
    if stat_mean_tif_exists == False:
        export_array_as_tif(stat_mean, stat_mean_dir, template = iteration)
        print(f"{stat_mean_filename} has been exported.")
    else: print(f"{stat_mean_filename} already exists.")

    if stat_uncertainty_tif_exists == False:
        assert iteration_n > 1, f"At least 2 prediction iterations are required to calculate uncertainty for {scenario}."
        # Sample variance: (sum(x^2) - sum(x)^2 / n) / (n - 1)
        stat_variance = (stat_sum_sq - (stat_sum ** 2) / iteration_n) / (iteration_n - 1)
        # Floating-point cancellation can give tiny negative values where iterations are
        # (nearly) identical; clamp to 0 so the square root does not produce NaN
        stat_variance = np.maximum(stat_variance, 0)
        # Standard error: σ / sqrt(n)
        stat_se = np.sqrt(stat_variance) / np.sqrt(iteration_n)
        # Confidence interval from the t-distribution. The critical value depends only on
        # the confidence level and degrees of freedom, so it is calculated once as a scalar
        # instead of per pixel; this also gives a width of 0 (not NaN) where the se is 0
        t_critical = st.t.ppf((1 + confidence_interval) / 2, iteration_n - 1)
        # CI half-width: (upper - lower) / 2
        stat_ci = np.multiply(stat_se, t_critical, dtype='float64')
        stat_ci_lower = np.subtract(stat_mean, stat_ci, dtype='float64')
        stat_ci_upper = np.add(stat_mean, stat_ci, dtype='float64')
        # Uncertainty: (CI / mean) * 100%
        stat_uncertainty = np.multiply(np.divide(stat_ci, stat_mean, dtype='float64'), 100, dtype='float64')
        # Export statistics arrays as rasters. The uncertainty raster is exported last because
        # its existence is what marks this scenario's uncertainty statistics as complete
        for stat_array, stat_name in ((stat_se, "se"), (stat_ci_lower, "ci_lower"), (stat_ci_upper, "ci_upper"), (stat_ci, "ci")):
            stat_filename = f"{stat_base_filename}__{stat_name}_unmasked.tif"
            export_array_as_tif(stat_array, join(statistics_unmasked_dir, stat_filename), template = iteration)
            print(f"{stat_filename} has been exported.")
        export_array_as_tif(stat_uncertainty, stat_uncertainty_dir, template = iteration)
        print(f"{stat_base_filename}__uncertainty_unmasked.tif has been exported.")
    else: print(f"{stat_uncertainty_filename} already exists.")

print("Statistics calculations and .tif exports complete.")

In [None]:
last_historic_predictor_year = '2024'
first_historic_predictor_year = '1990'

# Mask statistics .tif files with the relevant mask in 1_areas/masks.
# Year contained in the filename of the first historic mask (e.g. forest 1991)
first_historic_mask_year = str(int(first_historic_predictor_year)+1)

for statistic_unmasked in sorted(os.listdir(statistics_unmasked_dir)): # Loop through each unmasked prediction
  # Skip anything that is not an unmasked statistics raster (e.g. sidecar files)
  if not statistic_unmasked.endswith("_unmasked.tif"):
    continue
  # Strip the 13-character '_unmasked.tif' suffix to name the masked raster
  statistic_masked_filename = f"{statistic_unmasked[:-13]}.tif"
  statistic_masked_dir = join(statistics_masked_dir, statistic_masked_filename)
  if exists(statistic_masked_dir):
    continue

  # Classify the statistic from its filename
  statistic_year = statistic_unmasked[:4] # First four characters are the scenario year
  is_comrest = 'comrest' in statistic_unmasked
  is_comrest_first_historic = is_comrest and 'first_historic' in statistic_unmasked
  is_nodef_historic = 'nodef_historic' in statistic_unmasked
  # Alternate scenarios later than the last historic year (9999 scenarios are handled separately)
  is_later_alternate = statistic_year.isdigit() and int(last_historic_predictor_year) < int(statistic_year) < 9999

  # Match the statistic to a mask which includes the relevant year in the filename.
  # Masks are checked in sorted order so the result is deterministic; the last match is used.
  mask_exists = False
  for mask in sorted(os.listdir(masks_dir)):
    mask_matches = (
      # Historic statistics and 9999_comrec (complete recovery): mask of the statistic's own year
      (statistic_year in mask and not is_comrest and not is_nodef_historic)
      # 9999_comrest: last historic predictor year mask (e.g. forest 2024). first_historic is
      # excluded here, otherwise both this and the next rule match and the selected mask
      # would depend on the order in which the masks are listed
      or (is_comrest and not is_comrest_first_historic and last_historic_predictor_year in mask)
      # 9999_comrest_first_historic: first historic predictor year mask (e.g. forest 1991)
      or (is_comrest_first_historic and first_historic_mask_year in mask)
      # nodef_historic: first historic predictor year mask (e.g. forest 1991)
      or (is_nodef_historic and first_historic_mask_year in mask)
      # Other alternate scenarios: last historic predictor year mask (e.g. forest 2024)
      or (is_later_alternate and last_historic_predictor_year in mask)
    )
    if mask_matches:
      selected_mask_filename = mask
      selected_mask_dir = join(masks_dir, selected_mask_filename)
      mask_exists = True

  if mask_exists == False: print(f"A suitable mask for {statistic_unmasked} does not exist.\n")
  else: # Mask the scenario prediction
    print(f"Masking {statistic_unmasked} with {selected_mask_filename}...")
    mask_array = gdal.Open(selected_mask_dir).ReadAsArray()
    statistic_unmasked_dir = join(statistics_unmasked_dir, statistic_unmasked)
    statistic_prediction_array = gdal.Open(statistic_unmasked_dir).ReadAsArray()
    # Mask where the mask array is not 1
    statistic_masked_array = np.where(mask_array != 1, nodatavalue, statistic_prediction_array)
    export_array_as_tif(statistic_masked_array, statistic_masked_dir, template = selected_mask_dir, compress = True)
    print(f"{statistic_masked_filename} exported.")

# Scenario difference with uncertainty

In [None]:
# List the scenario areas available for difference calculations: every entry of
# the model's uncertainty directory other than 'model_iterations'
scenario_iterations_area_exists = False
for subdir in os.listdir(uncertainty_selected_model_dir):
  if subdir == "model_iterations":
    continue
  # Drop the first 10 characters (the 'scenarios_' prefix) to recover the area name
  area_name = subdir[10:]
  print(f'selected_scenario_iterations_area = "{area_name}"')
  scenario_iterations_area_exists = True
if scenario_iterations_area_exists == False:
  print(f"Run the scenario iterations section first.")

In [None]:
selected_scenario_iterations_area = "taman"

# Input (masked statistics) and output (scenario difference) directories
uncertainty_scenario_area_dir = join(uncertainty_selected_model_dir,f"scenarios_{selected_scenario_iterations_area}")
statistics_masked_dir = join(uncertainty_scenario_area_dir,"statistics_masked")
diff_uncertainty_dir = join(uncertainty_scenario_area_dir, "scenario_difference")
makedirs(diff_uncertainty_dir, exist_ok=True)

# The scenario name is the part of each masked statistic filename before the first '__'
scenarios_diff_set = {
    masked_statistic.split("__")[0]
    for masked_statistic in os.listdir(statistics_masked_dir)
}

# Every ordered pair of distinct scenarios, i.e. both (a, b) and (b, a)
scenario_pairs = sorted(itertools.permutations(scenarios_diff_set, 2))

# Print the pairs as a list that can be pasted into the next cell
print("# Select scenarios to calculate mean difference with uncertainty")
print("scenario_pairs = [")
for first_scenario, second_scenario in scenario_pairs:
    print(f" ('{first_scenario}','{second_scenario}'),")
print("]")

In [None]:
# Select scenarios to calculate mean difference with uncertainty
# Each pair is (scenario1, scenario2); the mean difference is calculated as scenario2 - scenario1.
# Commented-out pairs are available combinations that are currently excluded.
scenario_pairs = [
#  ('2024','2024_nodef_historic'),
#  ('2024','2024_nodeg_historic'),
#  ('2024','9999_comrec'),
#  ('2024','9999_comrest'),
#  ('2024','9999_comrest_first_historic'),
 ('2024_nodef_historic','2024'),
 ('2024_nodef_historic','2024_nodeg_historic'),
#  ('2024_nodef_historic','9999_comrec'),
#  ('2024_nodef_historic','9999_comrest'),
#  ('2024_nodef_historic','9999_comrest_first_historic'),
 ('2024_nodeg_historic','2024'),
#  ('2024_nodeg_historic','2024_nodef_historic'),
#  ('2024_nodeg_historic','9999_comrec'),
#  ('2024_nodeg_historic','9999_comrest'),
#  ('2024_nodeg_historic','9999_comrest_first_historic'),
 ('9999_comrec','2024'),
 ('9999_comrec','2024_nodef_historic'),
#  ('9999_comrec','2024_nodeg_historic'),
 ('9999_comrec','9999_comrest'),
 ('9999_comrec','9999_comrest_first_historic'),
 ('9999_comrest','2024'),
#  ('9999_comrest','2024_nodef_historic'),
 ('9999_comrest','2024_nodeg_historic'),
#  ('9999_comrest','9999_comrec'),
#  ('9999_comrest','9999_comrest_first_historic'),
#  ('9999_comrest_first_historic','2024'),
#  ('9999_comrest_first_historic','2024_nodef_historic'),
#  ('9999_comrest_first_historic','2024_nodeg_historic'),
#  ('9999_comrest_first_historic','9999_comrec'),
#  ('9999_comrest_first_historic','9999_comrest'),
]

In [None]:
# Functions for difference in mean and uncertainty
def diff_mean(scenario1_mean, scenario2_mean):
  """Return the difference in mean, calculated as scenario 2 minus scenario 1."""
  return scenario2_mean - scenario1_mean

def diff_uncertainty(scenario1_mean, scenario1_uncertainty, scenario2_mean, scenario2_uncertainty):
  """Return the combined uncertainty of two scenarios.

  Each mean is multiplied by its uncertainty; the square root of the sum of the
  squared products is then divided by the sum of the two means.
  Intermediate products, squares and the square root are calculated as float64.
  """
  weighted_scenario1 = np.multiply(scenario1_mean, scenario1_uncertainty, dtype='float64')
  weighted_scenario2 = np.multiply(scenario2_mean, scenario2_uncertainty, dtype='float64')
  sums_of_squares = np.square(weighted_scenario1, dtype='float64') + np.square(weighted_scenario2, dtype='float64')
  combined_mean = scenario1_mean + scenario2_mean
  return np.sqrt(sums_of_squares, dtype='float64') / combined_mean

# Loop through the scenario pairs and export the mean difference (scenario2 - scenario1)
# and its combined uncertainty for each pair
for scenario1, scenario2 in scenario_pairs:

  # Define filenames and directories of mean and uncertainty difference .tifs
  diff_mean_filename = f"diff_mean_{scenario1}_{scenario2}_{selected_scenario_iterations_area}_{selected_model}.tif"
  diff_mean_dir = join(diff_uncertainty_dir, diff_mean_filename)
  diff_uncertainty_filename = f"diff_uncertainty_{scenario1}_{scenario2}_{selected_scenario_iterations_area}_{selected_model}.tif"
  diff_uncertainty_raster_dir = join(diff_uncertainty_dir, diff_uncertainty_filename)

  # Skip only when BOTH outputs exist. The previous `not A and not B` test also skipped
  # (and wrongly reported both as existing) when just one of them was present, so the
  # missing raster was never created.
  diff_mean_exists = exists(diff_mean_dir)
  diff_uncertainty_exists = exists(diff_uncertainty_raster_dir)
  if diff_mean_exists and diff_uncertainty_exists:
    print(f"Both {diff_mean_filename} and {diff_uncertainty_filename} already exist.")
    continue

  print(f"Calculating mean difference with uncertainty between {scenario1} and {scenario2}")
  scenario1_base_filename = f"{scenario1}__{selected_scenario_iterations_area}_{selected_model}"
  scenario2_base_filename = f"{scenario2}__{selected_scenario_iterations_area}_{selected_model}"

  # Define mean and uncertainty directories, assert that both exist for both scenarios
  scenario1_mean_dir = join(statistics_masked_dir,f"{scenario1_base_filename}__mean.tif")
  assert exists(scenario1_mean_dir), f"{scenario1_base_filename}__mean.tif does not exist."
  scenario1_uncertainty_dir = join(statistics_masked_dir,f"{scenario1_base_filename}__uncertainty.tif")
  assert exists(scenario1_uncertainty_dir), f"{scenario1_base_filename}__uncertainty.tif does not exist."
  scenario2_mean_dir = join(statistics_masked_dir,f"{scenario2_base_filename}__mean.tif")
  assert exists(scenario2_mean_dir), f"{scenario2_base_filename}__mean.tif does not exist."
  scenario2_uncertainty_dir = join(statistics_masked_dir,f"{scenario2_base_filename}__uncertainty.tif")
  assert exists(scenario2_uncertainty_dir), f"{scenario2_base_filename}__uncertainty.tif does not exist."

  # Convert scenario mean and uncertainty .tifs to temporary arrays
  scenario1_mean_array_temp = gdal.Open(scenario1_mean_dir).ReadAsArray()
  scenario1_uncertainty_array_temp = gdal.Open(scenario1_uncertainty_dir).ReadAsArray()
  scenario2_mean_array_temp = gdal.Open(scenario2_mean_dir).ReadAsArray()
  scenario2_uncertainty_array_temp = gdal.Open(scenario2_uncertainty_dir).ReadAsArray()

  # Fill scenario nodata values with 0 if they are not nodatavalues in the other scenario,
  # so pixels present in only one scenario still produce a difference
  scenario1_mean_array = np.where((scenario1_mean_array_temp == nodatavalue) & (scenario2_mean_array_temp != nodatavalue), 0, scenario1_mean_array_temp)
  scenario1_uncertainty_array = np.where((scenario1_uncertainty_array_temp == nodatavalue) & (scenario2_uncertainty_array_temp != nodatavalue), 0, scenario1_uncertainty_array_temp)
  scenario2_mean_array = np.where((scenario2_mean_array_temp == nodatavalue) & (scenario1_mean_array != nodatavalue), 0, scenario2_mean_array_temp)
  scenario2_uncertainty_array = np.where((scenario2_uncertainty_array_temp == nodatavalue) & (scenario1_uncertainty_array != nodatavalue), 0, scenario2_uncertainty_array_temp)

  # Create difference mean and uncertainty arrays where the value is not 'nodatavalue'.
  # After the fill above, scenario 1 is only nodata where both scenarios are nodata.
  # NOTE(review): the uncertainty is divided by (scenario1 mean + scenario2 mean), so pixels
  # where that sum is 0 give inf/NaN - confirm how these should be handled downstream.
  diff_mean_array = np.where(scenario1_mean_array==nodatavalue, nodatavalue, diff_mean(scenario1_mean_array, scenario2_mean_array))
  diff_uncertainty_array = np.where(scenario1_uncertainty_array==nodatavalue, nodatavalue, diff_uncertainty(scenario1_mean_array, scenario1_uncertainty_array, scenario2_mean_array, scenario2_uncertainty_array))

  # Export the mean and uncertainty difference .tifs if they do not exist
  if diff_mean_exists: print(f"{diff_mean_filename} already exists.")
  else:
    export_array_as_tif(diff_mean_array, diff_mean_dir, template = scenario1_mean_dir)
    print(f"{diff_mean_filename} has been exported.")
  if diff_uncertainty_exists: print(f"{diff_uncertainty_filename} already exists.")
  else:
    export_array_as_tif(diff_uncertainty_array, diff_uncertainty_raster_dir, template = scenario1_mean_dir)
    print(f"{diff_uncertainty_filename} has been exported.")

# Disconnect runtime

In [None]:
# Useful for stopping background execution upon completion
# `runtime` is imported from google.colab in the imports cell, so this only works in Colab.
runtime.unassign()