<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/dev/3_features_lcluc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports, directories and global functions

In [None]:
# Base directory of the project on Google Drive
#   Personal drive: begin with '/content/drive/MyDrive/'
#   Shared drive:   begin with '/gdrive/Shareddrives/' (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive at the mount point that matches base_dir
from google.colab import drive
import os
import sys

if base_dir.startswith('/content/drive/MyDrive/'):
    drive.mount('/content/drive', force_remount=True)
    # base_dir is only created automatically in the personal drive case
    os.makedirs(base_dir, exist_ok=True)
elif base_dir.startswith('/gdrive/Shareddrives/'):
    drive.mount('/gdrive', force_remount=True)
else:
    print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Add the resolved base directory to the module search path (once)
_path_to_add = os.path.realpath(base_dir)
if _path_to_add not in sys.path:
    sys.path.append(_path_to_add)

In [None]:
# Installs
%%capture
!pip install astropy
!pip install earthengine-api
!pip install geopandas

In [None]:
!# Reload imports, replacing those in the cache
%load_ext autoreload
%autoreload 2
# Imports
from astropy.convolution import convolve, Gaussian2DKernel
import csv
import ee
import geopandas as gpd
import glob
from google.colab import runtime, userdata
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs, remove
from os.path import exists, join
from osgeo import gdal, gdalconst, ogr
from scipy.ndimage import distance_transform_edt
from scipy.ndimage import label, sum as ndi_sum
from shutil import copyfile, move
from time import sleep

In [None]:
# 1_areas inputs
areas_dir = join(base_dir, "1_areas")
polygons_dir = join(areas_dir, "polygons")
template_dir = join(areas_dir, "template.tif")  # path of the template raster file

# 3_features directories
features_dir = join(base_dir, "3_features")
ee_dir = join(features_dir, "earth_engine")
user_upload_dir = join(features_dir, "user_upload")
glad_lcluc_dir = join(features_dir, 'glad_lcluc')
resampled_dir = join(features_dir, "resampled")
continuous_final_dir = join(features_dir, "continuous_final")
binary_dir = join(features_dir, 'binary')
edge_effects_dir = join(features_dir, 'binary_edge_effects')

# 6_scenarios directories
scenario_dir = join(base_dir, "6_scenarios")
scenario_mask_dir = join(scenario_dir, "scenario_masks")

# Create the output directories (1_areas is not created here)
for _directory in (
    ee_dir,
    user_upload_dir,
    glad_lcluc_dir,
    resampled_dir,
    continuous_final_dir,
    binary_dir,
    edge_effects_dir,
    scenario_dir,
    scenario_mask_dir,
):
    makedirs(_directory, exist_ok=True)

In [None]:
# export_array_as_tif function
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_dir, nodatavalue=nodatavalue, compress=compress):
  """Write an array to a single-band Float32 GeoTIFF georeferenced like a template raster.

  The output takes its width, height, geotransform and projection from the
  template; the array is written to band 1 as-is.

  Args:
    input_array: Array passed to WriteArray for band 1
      (presumably 2D with the template's dimensions - confirm with callers).
    output_tif: Path of the GeoTIFF to create.
    template: Path of the raster supplying size and georeferencing.
    nodatavalue: Value registered as nodata on the output band.
    compress: Truthy to write with DEFLATE compression (predictor 2, level 9),
      falsy to write uncompressed.

  Raises:
    RuntimeError: If the template cannot be opened or the output cannot be created.
  """
  template_ds = gdal.Open(template)
  if template_ds is None:
    raise RuntimeError(f"Could not open template raster: {template}")
  template_band = template_ds.GetRasterBand(1)
  template_dimensions, template_projection = template_ds.GetGeoTransform(), template_ds.GetProjection()
  # A single if/else guarantees the output dataset is defined for every value
  # of `compress` (the previous `compress == False` test missed e.g. None).
  creation_options = ["COMPRESS=DEFLATE", "PREDICTOR=2", "ZLEVEL=9"] if compress else []
  output_ds = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, bands=1,
                                                   eType=gdal.GDT_Float32, options=creation_options)
  if output_ds is None:
    raise RuntimeError(f"Could not create output raster: {output_tif}")
  output_band = output_ds.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  output_ds.SetGeoTransform(template_dimensions)
  output_ds.SetProjection(template_projection)
  # Flush and drop the references explicitly so the file is written and closed before returning
  output_ds.FlushCache()
  output_band = output_ds = template_band = template_ds = None

# Burn a polygon to raster
def burn_polygon_to_raster(raster_path, polygon_path, fixed=True, fixed_value=1, column_name=None, all_touched=True):
    """Rasterize a vector layer into band 1 of an existing raster, in place.

    Args:
        raster_path: Raster opened in update mode and modified.
        polygon_path: Vector file whose default layer is burned.
        fixed: If True burn `fixed_value`; otherwise burn a per-feature attribute.
        fixed_value: Value burned when `fixed` is True.
        column_name: Attribute to burn when `fixed` is False; when empty or None
            the layer's first field is used.
        all_touched: If True the ALL_TOUCHED=TRUE rasterize option is passed.
    """
    target_ds = gdal.Open(raster_path, gdal.GA_Update)
    source_ds = ogr.Open(polygon_path)
    source_layer = source_ds.GetLayer()

    rasterize_options = ["ALL_TOUCHED=TRUE"] if all_touched else []
    if fixed:
        burn_values = [fixed_value]
    else:
        burn_values = None
        attribute_name = column_name or source_layer.GetLayerDefn().GetFieldDefn(0).GetName()
        rasterize_options.append(f"ATTRIBUTE={attribute_name}")

    gdal.RasterizeLayer(target_ds, [1], source_layer,
                        burn_values=burn_values,
                        options=rasterize_options)

    # Write pending changes, then release both datasets
    target_ds.FlushCache()
    target_ds = source_ds = None

# Download Earth Engine rasters

In [None]:
# Enable Google Earth Engine API at Google Cloud https://console.cloud.google.com/apis/dashboard
# See here for walkthrough: https://github.com/googlecolab/colabtools/issues/4228#issuecomment-1859068706
# Set project ID under 'secrets' tab on the left with the name 'google_cloud_project'
# The project ID is read from the Colab secret named 'google_cloud_project'.
ee_project = userdata.get('google_cloud_project')

# Authenticate Earth Engine, then initialise the API against that project
ee.Authenticate()
ee.Initialize(project=ee_project)

In [None]:
# Edit this section to change which Earth Engine datasets are downloaded.

# Warning: Earth Engine uses 'nearest neighbour' to resample rasters to the desired extent and resolution before exporting.
# This creates artifacts if the data is continuous, such as DEMs (elevation) or other topographic metrics.
# These should be downloaded from the original source, uploaded to '/user_upload' and resampled in the next section, checking the option for 'bilinear'.

# Check datasets in https://code.earthengine.google.com/ with:
# var assetList = ee.data.listAssets("projects/JRC/TMF/v1_2022/");
# print(assetList);

# Each entry describes one group of Earth Engine assets:
#   "ee_dataset_name": prefix of the exported .tif filenames
#   "ee_dataset_type": "ImageCollection" or "Image" (selects how bands are listed and exported)
#   "ee_paths":        asset paths; every band of every path becomes one raster
# Entries that are commented out are not downloaded.
ee_datasets = [

    {
        "ee_dataset_name": "tmf",
        "ee_dataset_type": "ImageCollection",
        "ee_paths": [
            "projects/JRC/TMF/v1_2024/AnnualChanges",
            "projects/JRC/TMF/v1_2024/TransitionMap_MainClasses",
            "projects/JRC/TMF/v1_2024/TransitionMap_Subtypes",
            "projects/JRC/TMF/v1_2024/AnnualDisruptionObs2024",
            "projects/JRC/TMF/v1_2023/AnnualDisruptionObs2023",
            "projects/JRC/TMF/v1_2023/Ndisturb_C2_1982_2022",

        ],
    }
    # {
    #     "ee_dataset_name": "glad",
    #     "ee_dataset_type": "Image",
    #     "ee_paths": [
    #                 # 'projects/glad/GLCLU2020/Forest_gain',
    #                 'projects/glad/GLCLU2020/Forest_height_2000',
    #                 'projects/glad/GLCLU2020/Forest_height_2005',
    #                 'projects/glad/GLCLU2020/Forest_height_2010',
    #                 'projects/glad/GLCLU2020/Forest_height_2015',
    #                 'projects/glad/GLCLU2020/Forest_height_2020',
    #                 'projects/glad/GLCLU2020/Forest_height_disturbance',
    #                 'projects/glad/GLCLU2020/Forest_height_netgain',
    #                 'projects/glad/GLCLU2020/Forest_height_netloss',
    #                 # 'projects/glad/GLCLU2020/Forest_loss',
    #                 # 'projects/glad/GLCLU2020/Forest_type',
    #                 'projects/glad/GLCLU2020/LCLUC',
    #                 'projects/glad/GLCLU2020/LCLUC_2000',
    #                 'projects/glad/GLCLU2020/LCLUC_2020',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_2000',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_2005',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_2010',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_2015',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_2020',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_gain',
    #                 'projects/glad/GLCLU2020/Vegetation_cover_loss',
    #                 'projects/glad/GLCLU2020/Water_2000',
    #                 'projects/glad/GLCLU2020/Water_2005',
    #                 'projects/glad/GLCLU2020/Water_2010',
    #                 'projects/glad/GLCLU2020/Water_2015',
    #                 'projects/glad/GLCLU2020/Water_2020',
    #                 # 'projects/glad/GLCLU2020/Water_dynamics',
    #                 # 'projects/glad/GLCLU2020/Water_dynamics_classes',
    #     ]
    # }
]

In [None]:
# List the filename of every raster that will be downloaded (one per band)
ee_raster_list = []
for ee_dataset in ee_datasets:
    ee_dataset_name = ee_dataset['ee_dataset_name']
    ee_dataset_type = ee_dataset['ee_dataset_type']
    ee_paths = ee_dataset['ee_paths']
    for ee_path in ee_paths:
        # Anything that is not an ImageCollection is treated as a single Image
        if ee_dataset_type != 'ImageCollection':
            ee_image = ee.Image(ee_path)
            ee_bands = ee_image.bandNames().getInfo()
        else:
            ee_image = ee.ImageCollection(ee_path)
            # Band ids are taken from the first image in the collection
            ee_bands = [b['id'] for b in ee_image.getInfo()['features'][0]['bands']]
        for ee_band in ee_bands:
            ee_tif_filename = f"{ee_dataset_name}_{ee_path.split('/')[-1]}_{ee_band}.tif"
            ee_raster_list.append(ee_tif_filename)

# Show the list in reverse order
ee_raster_list.reverse()
ee_raster_list

## Original queue method

In [None]:
# Earth Engine download progress
ee_progress_index = 0
ee_progress_label = widgets.Label(f"Earth Engine download progress: {ee_progress_index}/{len(ee_raster_list)}")
display(ee_progress_label)

# Load template and set Earth Engine geometry (the GeoPackage is read once and reused)
template_polygon_dir = join(polygons_dir, 'template.gpkg')
template_area = gpd.read_file(template_polygon_dir)["geometry"].iloc[0]
template_coords = list(template_area.exterior.coords)
ee_geometry = ee.Geometry.Polygon(template_coords)

# Download Earth Engine datasets, one export task at a time
for ee_dataset in ee_datasets:
    ee_dataset_name = ee_dataset['ee_dataset_name']
    ee_dataset_type = ee_dataset['ee_dataset_type']
    ee_paths = ee_dataset['ee_paths']
    # Loop through Earth Engine paths
    for ee_path in ee_paths:
        # Identify bands
        if ee_dataset_type == 'ImageCollection':
            ee_image = ee.ImageCollection(ee_path)
            ee_bands = [b['id'] for b in ee_image.getInfo()['features'][0]['bands']]
        elif ee_dataset_type == 'Image':
            ee_image = ee.Image(ee_path)
            ee_bands = ee_image.bandNames().getInfo()
        else:
            # Fail fast: otherwise ee_image / ee_bands would be undefined or left over from a previous path
            raise ValueError(f"Unsupported ee_dataset_type '{ee_dataset_type}' for {ee_path}: "
                             "expected 'ImageCollection' or 'Image'.")
        # Loop through bands
        for ee_band in reversed(ee_bands):
            # Set filename and directory of downloaded raster
            ee_tif_filename = f"{ee_dataset_name}_{ee_path.split('/')[-1]}_{ee_band}.tif"
            ee_tif_dir = join(ee_dir, ee_tif_filename)
            # The exported file is looked for in the root of the mounted personal drive,
            # whose path depends on which mount point base_dir uses
            if base_dir.startswith('/content/drive/MyDrive/'):
                ee_temp_dir = join("/content/drive/MyDrive/", ee_tif_filename)
            else:
                ee_temp_dir = join("/gdrive/MyDrive", ee_tif_filename)
            # Check if temporary raster exists and needs copying
            if exists(ee_temp_dir):
                copyfile(ee_temp_dir, ee_tif_dir)
                remove(ee_temp_dir)
            # Check if copied raster exists, and if not download from Earth Engine.
            if not exists(ee_tif_dir):
                if ee_dataset_type == 'ImageCollection':
                    image_selected = ee_image.qualityMosaic(ee_band).select([ee_band])
                    resolution = ee_image.first().projection().nominalScale().getInfo()
                else:
                    image_selected = ee_image.select([ee_band])
                    resolution = ee_image.select(0).projection().nominalScale().getInfo()
                ee_task = ee.batch.Export.image.toDrive(image=image_selected.toFloat(),
                                                        description=ee_tif_filename[:-4],
                                                        scale=resolution,
                                                        region=ee_geometry,
                                                        maxPixels=10000000000,
                                                        fileNamePrefix=ee_tif_filename[:-4],
                                                        crs='EPSG:4326',
                                                        fileFormat='GeoTIFF')
                ee_task.start()
                # Poll the task until the file shows up or the task reaches a final state
                ee_task_failed = False
                while not exists(ee_temp_dir):
                    ee_task_status = ee_task.status()
                    ee_task_state = ee_task_status['state']
                    # If the task is completed, continue
                    if ee_task_state == 'COMPLETED':
                        break
                    # If it has failed or been cancelled, show an error
                    if ee_task_state in ('FAILED', 'CANCELLED'):
                        # 'error_message' may be absent, so fall back to the state
                        print(f"{ee_tif_filename}:{ee_task_status.get('error_message', ee_task_state)}")
                        ee_task_failed = True
                        break
                    sleep(1)
                # A failed task never produces a file: skip it instead of waiting forever.
                # It is not counted in the progress label.
                if ee_task_failed:
                    continue
                # The task is complete but the file may not be visible in the mounted drive yet
                while not exists(ee_temp_dir):
                    sleep(1)
                # Copy the raster to intended directory and remove the temporary raster
                copyfile(ee_temp_dir, ee_tif_dir)
                remove(ee_temp_dir)
            # Update Earth Engine download progress
            ee_progress_index += 1
            ee_progress_label.value = f"Earth Engine download progress: {ee_progress_index}/{len(ee_raster_list)}"

# Check Earth Engine tasks here: https://code.earthengine.google.com/tasks

## Enhanced queue method

In [None]:
# Maximum concurrent tasks in Earth Engine
ee_max_concurrent_tasks = 30

# Load template and set Earth Engine geometry (the GeoPackage is read once and reused)
template_polygon_dir = join(polygons_dir, 'template.gpkg')
template_area = gpd.read_file(template_polygon_dir)["geometry"].iloc[0]
template_coords = list(template_area.exterior.coords)
ee_geometry = ee.Geometry.Polygon(template_coords)

# Create a dictionary of all rasters to download, keyed by task description
raster_dictionary = {}

# Populate the dictionary with information about each raster (one entry per band)
for ee_dataset in ee_datasets:
    ee_dataset_name = ee_dataset['ee_dataset_name']
    ee_dataset_type = ee_dataset['ee_dataset_type']
    ee_paths = ee_dataset['ee_paths']

    for ee_path in ee_paths:
        # Identify bands
        if ee_dataset_type == 'ImageCollection':
            ee_image = ee.ImageCollection(ee_path)
            ee_bands = [b['id'] for b in ee_image.getInfo()['features'][0]['bands']]
        elif ee_dataset_type == 'Image':
            ee_image = ee.Image(ee_path)
            ee_bands = ee_image.bandNames().getInfo()
        else:
            # Fail fast: otherwise ee_bands would be undefined or left over from a previous path
            raise ValueError(f"Unsupported ee_dataset_type '{ee_dataset_type}' for {ee_path}: "
                             "expected 'ImageCollection' or 'Image'.")

        # Loop through bands and create entries in dictionary
        for ee_band in ee_bands:
            ee_tif_filename = f"{ee_dataset_name}_{ee_path.split('/')[-1]}_{ee_band}.tif"
            ee_tif_dir = join(ee_dir, ee_tif_filename)

            # Different temporary directory required if MyDrive is mounted
            if base_dir.startswith('/content/drive/MyDrive/'):
                ee_temp_dir = join("/content/drive/MyDrive/", ee_tif_filename)
            else:
                ee_temp_dir = join("/gdrive/MyDrive", ee_tif_filename)

            # Filename without the '.tif' extension
            description = ee_tif_filename[:-4]

            raster_dictionary[description] = {
                'ee_dataset_type': ee_dataset_type,
                'ee_path': ee_path,
                'ee_band': ee_band,
                'image_path': ee_tif_dir,
                'image_path_temp': ee_temp_dir,
                'image_description': description,
                'image_status': '',
                'ee_task_id': '',
                'ee_task': None,
                'task_current_execution': False
            }

# Count total number of rasters
raster_number = len(raster_dictionary)

# Progress widgets
# Descriptions already counted in the task progress label, so that none is counted twice
ee_counted_tasks = set()
ee_task_progress_index = 0
ee_task_progress_label = widgets.Label(
    f"Earth Engine task progress: {ee_task_progress_index}/{raster_number}"
)
display(ee_task_progress_label)

raster_progress_index = 0
raster_progress_label = widgets.Label(
    f"Raster download progress: {raster_progress_index}/{raster_number}"
)
display(raster_progress_label)

# Initialize per-raster monitoring fields
for raster_info in raster_dictionary.values():
    # 1) move any stray temp file to its final path
    if exists(raster_info['image_path_temp']):
        move(raster_info['image_path_temp'], raster_info['image_path'])

    # 2) if the final TIFF now exists, mark it processed
    if exists(raster_info['image_path']):
        raster_info.update({
            'image_status': 'processed',
            'ee_task_id': '',
            'ee_task': None,
            'task_current_execution': False
        })
        # count it once for the EE-task progress bar
        ee_counted_tasks.add(raster_info['image_description'])
        ee_task_progress_index += 1
        ee_task_progress_label.value = (
            f"Earth Engine task progress: {ee_task_progress_index}/{raster_number}"
        )
        raster_progress_index += 1
        raster_progress_label.value = (
            f"Raster download progress: {raster_progress_index}/{raster_number}"
        )
    else:
        # 3) otherwise clear any state so it'll be re-queued
        raster_info.update({
            'image_status': '',
            'ee_task_id': '',
            'ee_task': None,
            'task_current_execution': False
        })
        ee_counted_tasks.discard(raster_info['image_description'])

# Detect tasks that were already running before this session
ee_current_task_count = 0
for task in ee.batch.Task.list():
    task_state = task.status()['state']
    task_id = task.id
    task_description = task.config['description']

    # Only tasks that are still active count towards the concurrency total
    if task_state in ['READY', 'RUNNING', 'QUEUED']:
        ee_current_task_count += 1
        # Attach the active task to the raster with the same description, if there is one
        for v in raster_dictionary.values():
            if v['image_description'] == task_description:
                v.update({
                    'image_status': 'task',
                    'ee_task_id': task_id,
                    'ee_task': task,
                    'task_current_execution': True
                })
                if task_description not in ee_counted_tasks:
                    ee_counted_tasks.add(task_description)
                    ee_task_progress_index += 1
                    ee_task_progress_label.value = (
                        f"Earth Engine task progress: {ee_task_progress_index}/{raster_number}"
                    )
                break

# Main processing loop
# Each pass moves finished files into place, polls running tasks and queues new
# tasks while slots are free; it repeats every 5 seconds until every raster is
# 'processed' or 'failed'.
while True:
    # Re-count active EE tasks each pass
    # NOTE(review): status() is called on every listed task on every pass -
    # presumably one request per task; confirm the cost for long task histories.
    active_states = ['READY', 'RUNNING', 'QUEUED']
    ee_current_task_count = len([t for t in ee.batch.Task.list()
                              if t.status()['state'] in active_states])

    # Break when every raster is either processed or failed
    if all(v['image_status'] in ['processed', 'failed']
           for v in raster_dictionary.values()):
        break

    # Iterate over rasters
    for raster_info in raster_dictionary.values():
        # Skip finished / failed rasters
        if raster_info['image_status'] in ['processed', 'failed']:
            continue

        # Final file already exists
        if exists(raster_info['image_path']):
            raster_info.update({
                'image_status': 'processed',
                'ee_task_id': '',
                'ee_task': None,
                'task_current_execution': False
            })

            if raster_info['image_description'] not in ee_counted_tasks:
                ee_counted_tasks.add(raster_info['image_description'])
                ee_task_progress_index += 1
                ee_task_progress_label.value = (
                    f"Earth Engine task progress: {ee_task_progress_index}/{raster_number}"
                )

            raster_progress_index += 1
            raster_progress_label.value = (
                f"Raster download progress: {raster_progress_index}/{raster_number}"
            )
            continue

        # Temporary file exists – move to downloads directory
        if exists(raster_info['image_path_temp']):
            move(raster_info['image_path_temp'], raster_info['image_path'])

            raster_info.update({
                'image_status': 'processed',
                'ee_task_id': '',
                'ee_task': None,
                'task_current_execution': False
            })

            if raster_info['image_description'] not in ee_counted_tasks:
                ee_counted_tasks.add(raster_info['image_description'])
                ee_task_progress_index += 1
                ee_task_progress_label.value = (
                    f"Earth Engine task progress: {ee_task_progress_index}/{raster_number}"
                )

            raster_progress_index += 1
            raster_progress_label.value = (
                f"Raster download progress: {raster_progress_index}/{raster_number}"
            )
            continue

        # Task is running – poll its status
        if raster_info['image_status'] == 'task':
            ee_task = raster_info['ee_task']
            if ee_task is None:
                # Fallback: find it again by ID
                matches = [t for t in ee.batch.Task.list() if t.id == raster_info['ee_task_id']]
                ee_task = matches[0] if matches else None
                raster_info['ee_task'] = ee_task

            if ee_task is not None:
                task_state = ee_task.status()['state']

                if task_state in ('FAILED', 'CANCELLED'):
                    raster_info.update({
                        'image_status': 'failed',
                        'ee_task_id': '',
                        'ee_task': None,
                        'task_current_execution': False
                    })
                    print(f"{raster_info['image_description']} failed. Skipping.")

                elif task_state == 'COMPLETED':
                    # The raster stays in 'task' status here; it only becomes
                    # 'processed' once its temporary file is found on a later pass
                    raster_info.update({
                        'task_current_execution': False
                    })

            continue  # READY / RUNNING / QUEUED, keep polling

        # Need to queue a new task
        if raster_info['image_status'] == '':
            # Wait for a free slot
            if ee_current_task_count >= ee_max_concurrent_tasks:
                continue  # try again next outer loop pass

            ee_path = raster_info['ee_path']
            ee_band = raster_info['ee_band']
            ee_dataset_type = raster_info['ee_dataset_type']

            # Select the appropriate image and band
            if ee_dataset_type == 'ImageCollection':
                ee_image = ee.ImageCollection(ee_path)
                image_selected = ee_image.qualityMosaic(ee_band).select([ee_band])
                resolution = ee_image.first().projection().nominalScale().getInfo()
            elif ee_dataset_type == 'Image':
                ee_image = ee.Image(ee_path)
                image_selected = ee_image.select([ee_band])
                resolution = ee_image.select(0).projection().nominalScale().getInfo()

            # Export the single band as Float32 at the asset's nominal scale
            task = ee.batch.Export.image.toDrive(
                image=image_selected.toFloat(),
                description=raster_info['image_description'],
                fileNamePrefix=raster_info['image_description'],
                fileFormat='GeoTIFF',
                region=ee_geometry,
                scale=resolution,
                maxPixels=10000000000,
                crs='EPSG:4326'
            )
            task.start()

            # Count the new task at once so the concurrency limit holds within this pass
            ee_current_task_count += 1
            raster_info.update({
                'image_status': 'task',
                'ee_task_id': task.id,
                'ee_task': task,
                'task_current_execution': True
            })

    # Pause between passes
    sleep(5)

# Summarise the final state of every raster
processed_count = sum(1 for v in raster_dictionary.values()
                     if v['image_status'] == 'processed')
failed_count = sum(1 for v in raster_dictionary.values()
                  if v['image_status'] == 'failed')

print(f"Final Status Check:\nProcessed Rasters: {processed_count}\nFailed Rasters: {failed_count}")
print("Check Earth Engine tasks here: https://code.earthengine.google.com/tasks")

## Direct download method

In [None]:
import concurrent.futures
from IPython.display import display, HTML, clear_output
import requests
import threading
from shutil import move, rmtree
from urllib.request import urlretrieve

# Set to False to suppress detailed messages about image splitting
verbose = False

compression = [
    'COMPRESS=LZW',  # Good speed / size ratio
    # 'ZSTD_LEVEL=1',
]

clip_geometry = False  # If True, clips the download geometry to the dataset footprint
# If False, the empty geometry will be filled with nodata values.

# Create a temporary directory for tiles
temp_tiles_dir = join(ee_dir, 'temp_tiles')
makedirs(temp_tiles_dir, exist_ok=True)

# Load template and set Earth Engine geometry (the GeoPackage is read once and reused)
template_polygon_dir = join(polygons_dir, 'template.gpkg')
template_area = gpd.read_file(template_polygon_dir)["geometry"].iloc[0]
template_coords = list(template_area.exterior.coords)
ee_geometry = ee.Geometry.Polygon(template_coords)

# Create a dictionary of all rasters to download, keyed by raster description
raster_dictionary = {}

# Populate the dictionary with information about each raster (one entry per band)
for ee_dataset in ee_datasets:
    ee_dataset_name = ee_dataset['ee_dataset_name']
    ee_dataset_type = ee_dataset['ee_dataset_type']
    ee_paths = ee_dataset['ee_paths']

    for ee_path in ee_paths:
        # Identify bands
        if ee_dataset_type == 'ImageCollection':
            ee_image = ee.ImageCollection(ee_path)
            ee_bands = [b['id'] for b in ee_image.getInfo()['features'][0]['bands']]
        elif ee_dataset_type == 'Image':
            ee_image = ee.Image(ee_path)
            ee_bands = ee_image.bandNames().getInfo()
        else:
            # Fail fast: otherwise ee_bands would be undefined or left over from a previous path
            raise ValueError(f"Unsupported ee_dataset_type '{ee_dataset_type}' for {ee_path}: "
                             "expected 'ImageCollection' or 'Image'.")

        # Loop through bands and create entries in dictionary
        for ee_band in ee_bands:
            ee_tif_filename = f"{ee_dataset_name}_{ee_path.split('/')[-1]}_{ee_band}.tif"
            ee_tif_dir = join(ee_dir, ee_tif_filename)

            # Different temporary directory required if MyDrive is mounted
            if base_dir.startswith('/content/drive/MyDrive/'):
                ee_temp_dir = join("/content/drive/MyDrive/", ee_tif_filename)
            else:
                ee_temp_dir = join("/gdrive/MyDrive", ee_tif_filename)

            # Filename without the '.tif' extension
            description = ee_tif_filename[:-4]

            raster_dictionary[description] = {
                'ee_dataset_type': ee_dataset_type,
                'ee_path': ee_path,
                'ee_band': ee_band,
                'image_path': ee_tif_dir,
                'image_path_temp': ee_temp_dir,
                'image_description': description,
                'image_status': '',
                'ee_object_id': ee_path
            }

# Control parallel processing
MAX_CONCURRENT_IMAGES = 10

# Earth Engine size limit in bytes (48 MiB = 48 * 1024 * 1024, roughly 50 MB)
EE_SIZE_LIMIT = 50331648

# Lock for updating progress
progress_lock = threading.Lock()

# Total raster count
raster_number = len(raster_dictionary)

# Global variable for tracking progress
global_progress_index = 0

# Function to display custom progress bar
def display_progress():
    """Redraw the text progress bar for the raster downloads.

    Reads the module-level `global_progress_index` and `raster_number`, clears
    the current cell output and displays an 80-character bar as HTML.
    With no rasters to download it shows 0% and an empty bar instead of
    raising ZeroDivisionError.
    """
    bar_width = 80
    # Guard both computations against an empty raster list
    if raster_number > 0:
        percent = int((global_progress_index / raster_number) * 100)
        filled_length = int(bar_width * global_progress_index // raster_number)
    else:
        percent = 0
        filled_length = 0
    bar = '=' * filled_length + ' ' * (bar_width - filled_length)

    progress_html = f"""
    <div style="width:100%; margin-top:10px; margin-bottom:10px;">
        <div style="color:#CCCCCC; font-family:monospace;">
            Raster download progress: {percent}% [{bar}] {global_progress_index}/{raster_number}
        </div>
    </div>
    """

    # wait=True delays clearing until the new output is available
    clear_output(wait=True)
    display(HTML(progress_html))

# Display initial progress
display_progress()

def download_tile(raster_band, geometry, scale, output_path, max_retries=3):
    """Download one tile of `raster_band` as a GeoTIFF, retrying on errors.

    Args:
        raster_band: Object providing getDownloadURL (an Earth Engine image).
        geometry: Region passed to the download request.
        scale: Scale passed to the download request.
        output_path: Local path the file is saved to.
        max_retries: Maximum number of attempts.

    Returns:
        (True, None) on success; (False, "SIZE_LIMIT") as soon as Earth Engine
        reports that the request is too large; (False, "OTHER_ERROR") when every
        attempt failed for another reason.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            # The request is rebuilt on every attempt
            download_url = raster_band.getDownloadURL({
                'scale': scale,
                'region': geometry,
                'format': 'GEO_TIFF',
                'crs': 'EPSG:4326'
            })
            urlretrieve(download_url, output_path)
            return True, None
        except Exception as exc:
            # A size error cannot be fixed by retrying, so report it at once
            if isinstance(exc, ee.EEException):
                message = str(exc)
                if "Total request size" in message and "must be less than or equal to" in message:
                    return False, "SIZE_LIMIT"
            # Any other error: wait before the next attempt, if there is one
            if attempt < final_attempt:
                sleep(5)

    return False, "OTHER_ERROR"

def split_tile_vertically(geometry, n_parts=2):
    """Cut the bounding box of `geometry` into `n_parts` equal-height strips.

    Every strip spans the full x-range of the bounding box; the strips are
    stacked along the y-axis, starting at the minimum y.

    Returns:
        A list of ee.Geometry.Rectangle objects, ordered by increasing y.
    """
    ring = geometry.bounds().getInfo()['coordinates'][0]
    x_values = [vertex[0] for vertex in ring]
    y_values = [vertex[1] for vertex in ring]
    min_x, max_x = min(x_values), max(x_values)
    min_y, max_y = min(y_values), max(y_values)

    part_height = (max_y - min_y) / n_parts

    return [
        ee.Geometry.Rectangle([
            min_x,
            min_y + (i * part_height),
            max_x,
            min_y + ((i + 1) * part_height),
        ])
        for i in range(n_parts)
    ]

def process_image(image_description, raster_info):
    """Download one Earth Engine raster band and save it as a compressed GeoTIFF.

    Meant to be run in parallel, one call per raster. The whole image is
    requested first; if that fails the download geometry is split into
    horizontal bands which are downloaded separately (and split again on
    failure) before being mosaicked with GDAL.

    Args:
        image_description: Name used for the tile folder, tile filenames and
            log messages.
        raster_info: Dict read for 'image_path', 'image_path_temp', 'ee_path',
            'ee_band', 'ee_dataset_type' and (optionally) 'image_status'.
            'image_status' is set to 'failed' when processing fails.

    Returns:
        True if the raster was skipped (already on disk or already flagged as
        failed) or was downloaded successfully, False otherwise.
    """
    global global_progress_index

    # Skip if already processed or failed
    if exists(raster_info['image_path']) or raster_info.get('image_status') == 'failed':
        with progress_lock:
            global_progress_index += 1
            # No direct widget update here - handled by update_progress_display thread
        return True

    ee_object = None
    raster_band = None
    tile_paths = []

    try:
        ee_path = raster_info['ee_path']
        ee_band = raster_info['ee_band']
        ee_dataset_type = raster_info['ee_dataset_type']

        # Select the appropriate image and band
        if ee_dataset_type == 'ImageCollection':
            ee_image_collection = ee.ImageCollection(ee_path)
            ee_object = ee_image_collection.qualityMosaic(ee_band)
            raster_band = ee_object.select([ee_band]).toFloat()
            projection = ee_image_collection.first().projection()
        elif ee_dataset_type == 'Image':
            ee_object = ee.Image(ee_path)
            raster_band = ee_object.select([ee_band]).toFloat()
            projection = ee_object.select(0).projection()
        else:
            # Fail with a clear message instead of an unbound-variable error below
            raise ValueError(f"Unsupported ee_dataset_type: {ee_dataset_type!r}")

        if clip_geometry:
            # Restrict the request to where the area overlaps the image footprint
            download_geometry = ee_geometry.intersection(ee_object.geometry())
        else:
            download_geometry = ee_geometry

        # Native pixel size of the source data
        scale = projection.nominalScale().getInfo()

        # Create a folder for this image's tiles
        image_tiles_dir = join(temp_tiles_dir, image_description)
        makedirs(image_tiles_dir, exist_ok=True)

        # First try to download the whole image at once
        whole_image_path = join(image_tiles_dir, f"{image_description}_whole.tif")
        success, error_type = download_tile(raster_band, download_geometry, scale, whole_image_path)

        if success:
            tile_paths = [whole_image_path]
        else:
            if verbose:
                if error_type == "SIZE_LIMIT":
                    print(f"[{image_description}] Full image download failed due to size limit, starting adaptive tiling...")
                else:
                    print(f"[{image_description}] Full image download failed, starting adaptive tiling...")

            # Start by halving the geometry
            parts = split_tile_vertically(download_geometry, 2)
            tiles_to_process = [(part, number, f"{image_description}_part_{number}.tif")
                                for number, part in enumerate(parts, start=1)]

            # Part numbers come from one ever-increasing counter so a new tile can
            # never reuse the filename of a tile that was already downloaded.
            # (Numbering used to restart from the highest number still queued,
            # which could overwrite a finished tile.)
            next_part_num = len(parts) + 1

            successful_tile_height = None

            # NOTE(review): there is no lower bound on tile size, so a failure
            # that splitting cannot cure keeps splitting indefinitely.
            while tiles_to_process:
                current_geometry, part_num, tile_filename = tiles_to_process.pop(0)
                tile_path = join(image_tiles_dir, tile_filename)

                # Try to download with current dimensions
                success, error_type = download_tile(raster_band, current_geometry, scale, tile_path)

                if success:
                    tile_paths.append(tile_path)

                    # Remember the height of the first tile that worked
                    if successful_tile_height is None:
                        bounds = current_geometry.bounds().getInfo()['coordinates'][0]
                        min_y = min(coord[1] for coord in bounds)
                        max_y = max(coord[1] for coord in bounds)
                        successful_tile_height = max_y - min_y
                    continue

                if verbose:
                    print(f"[{image_description}] Part {part_num} download failed: {error_type}")

                # If size limit error, split more aggressively
                split_factor = 3 if error_type == "SIZE_LIMIT" else 2

                if successful_tile_height is not None:
                    # Aim for pieces no taller than a height known to succeed
                    bounds = current_geometry.bounds().getInfo()['coordinates'][0]
                    min_y = min(coord[1] for coord in bounds)
                    max_y = max(coord[1] for coord in bounds)
                    current_height = max_y - min_y
                    split_factor = max(split_factor, math.ceil(current_height / successful_tile_height))

                split_parts = split_tile_vertically(current_geometry, split_factor)

                # Queue the new parts ahead of the remaining tiles
                for i, geom in enumerate(split_parts):
                    new_part_num = next_part_num + i
                    tiles_to_process.insert(0, (geom, new_part_num, f"{image_description}_part_{new_part_num}.tif"))
                next_part_num += len(split_parts)

                # Remove the failed attempt file if it exists
                if os.path.exists(tile_path):
                    os.remove(tile_path)

        if len(tile_paths) == 0:
            if verbose:
                print(f"[{image_description}] Failed: No tiles were successfully downloaded")
            raster_info['image_status'] = 'failed'
            return False

        # Merge tiles using GDAL
        merged_temp_path = raster_info['image_path_temp']
        if len(tile_paths) == 1:
            # Just one tile, compress and copy directly
            translate_source = tile_paths[0]
        else:
            # Multiple tiles are mosaicked through a VRT first
            translate_source = join(image_tiles_dir, f"{image_description}_mosaic.vrt")
            gdal.BuildVRT(translate_source, tile_paths)

        gdal_translate_options = gdal.TranslateOptions(
            format="GTiff",
            creationOptions=compression
        )
        gdal.Translate(merged_temp_path, translate_source, options=gdal_translate_options)

        # Move to final location
        move(merged_temp_path, raster_info['image_path'])

        # Update progress tracking with thread safety
        with progress_lock:
            global_progress_index += 1
            # No direct widget update here - handled by update_progress_display thread

        # Clean up tile folder after successful merge
        rmtree(image_tiles_dir)
        return True

    except Exception as e:
        if verbose:
            print(f"Error processing {image_description}: {str(e)}")
        raster_info['image_status'] = 'failed'
        return False
    finally:
        # Explicitly clear any large objects
        ee_object = None
        raster_band = None

# Background poller that redraws the progress display when the count changes
def update_progress_display():
    """Redraw the progress display whenever the processed count changes.

    Polls every half second until ``global_progress_index`` reaches
    ``raster_number``, then draws once more so the final state is shown.
    """
    previous_count = 0
    while True:
        if not global_progress_index < raster_number:
            break
        sleep(0.5)  # Update every half second

        # Take a snapshot of the shared counter under the lock
        with progress_lock:
            snapshot = global_progress_index

        if snapshot == previous_count:
            continue
        display_progress()
        previous_count = snapshot

    # Final update to ensure 100% is shown
    display_progress()

# Start the counter at the number of rasters already flagged as processed or on disk
global_progress_index = sum(
    1 for v in raster_dictionary.values()
    if v.get('image_status') == 'processed' or exists(v['image_path'])
)

# Display initial progress
display_progress()

# Rasters that still need downloading: not on disk and not flagged as failed
pending_images = []
for desc, img in raster_dictionary.items():
    if exists(img['image_path']) or img.get('image_status') == 'failed':
        continue
    pending_images.append((desc, img))

if verbose:
    print(f"Starting processing of {len(pending_images)} rasters with {MAX_CONCURRENT_IMAGES} parallel workers")

# Start the progress monitoring thread (daemon, so it never blocks shutdown)
progress_thread = threading.Thread(target=update_progress_display, daemon=True)
progress_thread.start()

try:
    # One worker task per pending raster
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_CONCURRENT_IMAGES) as executor:
        futures = {}
        for desc, img in pending_images:
            futures[executor.submit(process_image, desc, img)] = (desc, img)

        # Handle each result as soon as its task finishes
        for future in concurrent.futures.as_completed(futures):
            desc = futures[future][0]
            try:
                success = future.result()
            except Exception as e:
                if verbose:
                    print(f"Raster {desc} processing generated an exception: {e}")
                # Mark as failed
                raster_dictionary[desc]['image_status'] = 'failed'
            else:
                # Only log failures if verbose
                if verbose and not success:
                    print(f"Raster {desc} processing failed")
except Exception as e:
    print(f"Error in thread pool execution: {e}")
finally:
    # Give the progress thread time for one last refresh
    if progress_thread.is_alive():
        sleep(0.6)

print("Processing complete")

# Summarise the outcome
processed_count = len([v for v in raster_dictionary.values() if exists(v['image_path'])])
failed_count = len([v for v in raster_dictionary.values() if v.get('image_status') == 'failed'])

print(f"Final Status: {processed_count} rasters processed, {failed_count} rasters failed")
print("Check Earth Engine tasks here: https://code.earthengine.google.com/tasks")

# GLAD LCLUC

In [None]:
# GLAD data can stand in for TMF data when testing non-TMF areas.
# A single LCLUC raster encodes several land cover / land use types, each with
# its own continuous metric, as value ranges described in the legend:
# https://glad.umd.edu/sites/default/files/legend_0.xlsx
# Each range is split into its own raster for better modelling.
# Should be done before resampling.

lcluc_dict = {
    'terra_vegetation_cover_percent': (0, 24),
    'terra_stable_tree_m': (25, 48),
    'wetland_vegetation_cover_percent': (100, 124),
    'wetland_stable_tree_m': (125, 148),
    'open_surface_water_percent_of_year': (200, 207),
    'snow_ice': (241, 241),
    'cropland': (244, 244),
    'built_up': (250, 250),
    'ocean': (254, 254),
}

lcluc_rasters = [raster_name for raster_name in os.listdir(ee_dir) if 'LCLUC' in raster_name]
lcluc_exists = len(lcluc_rasters) > 0

for lcluc_raster in lcluc_rasters:
    lcluc_path = join(ee_dir, lcluc_raster)
    lcluc_array = gdal.Open(lcluc_path).ReadAsArray()
    raster_stem = lcluc_raster[:-4]
    for key, (lower, upper) in lcluc_dict.items():
        continuous_output_path = join(glad_lcluc_dir, f"{raster_stem}_{key}.tif")
        binary_output_path = join(glad_lcluc_dir, f"{raster_stem}_{key}_binary.tif")
        # Skip classes that were already exported in either form
        if exists(continuous_output_path) or exists(binary_output_path):
            continue
        in_range = (lcluc_array >= lower) & (lcluc_array <= upper)
        class_array = np.where(in_range, lcluc_array, 0)  # outside the range set to 0
        non_zero_percentage = np.count_nonzero(class_array) / class_array.size * 100
        # Skip classes covering less than 0.1% of the raster
        if non_zero_percentage < 0.1:
            continue
        # A class with a single non-zero value carries no gradient: export it as 1-0 binary
        distinct_values = np.unique(class_array[class_array > 0])
        if len(distinct_values) == 1:
            class_array = np.where(class_array > 0, 1, 0)
            output_path = binary_output_path
        else:
            output_path = continuous_output_path
        export_array_as_tif(class_array, output_path, template=lcluc_path)
        print(f"{lcluc_raster} raster has been processed")

if not lcluc_exists:
    print("There are no GLAD LCLUC rasters.")

# Resample EE rasters

In [None]:
# Build a {filename: data type} dictionary from the Earth Engine, user upload
# and GLAD LCLUC directories. Entries from the first two default to
# 'categorical', GLAD LCLUC entries to 'continuous'.
resample_dict = {}
for source_dir, default_type in (
    (ee_dir, "'categorical'"),
    (user_upload_dir, "'categorical'"),
    (glad_lcluc_dir, "'continuous'"),
):
    for resample_raster in os.listdir(source_dir):
        resample_dict[f'{resample_raster}'] = default_type
resample_dict = dict(sorted(resample_dict.items()))

# Print a ready-to-paste dictionary so the rasters can be selected and their
# data type (categorical or continuous) verified
print("selected_original_rasters = {")
for key in resample_dict:
    print(f'"{key}": {resample_dict[key]},')
print("}")

In [None]:
# Rasters selected for resampling, mapped to their data type.
# Keys are raster filenames; values are 'categorical' or 'continuous', which
# decides the resampling algorithm used in the resample cell.
# Comment out an entry to leave that raster out.
selected_original_rasters = {
# "temp_tiles": 'categorical',
"tmf_AnnualChanges_Dec1990.tif": 'categorical',
"tmf_AnnualChanges_Dec1991.tif": 'categorical',
"tmf_AnnualChanges_Dec1992.tif": 'categorical',
"tmf_AnnualChanges_Dec1993.tif": 'categorical',
"tmf_AnnualChanges_Dec1994.tif": 'categorical',
"tmf_AnnualChanges_Dec1995.tif": 'categorical',
"tmf_AnnualChanges_Dec1996.tif": 'categorical',
"tmf_AnnualChanges_Dec1997.tif": 'categorical',
"tmf_AnnualChanges_Dec1998.tif": 'categorical',
"tmf_AnnualChanges_Dec1999.tif": 'categorical',
"tmf_AnnualChanges_Dec2000.tif": 'categorical',
"tmf_AnnualChanges_Dec2001.tif": 'categorical',
"tmf_AnnualChanges_Dec2002.tif": 'categorical',
"tmf_AnnualChanges_Dec2003.tif": 'categorical',
"tmf_AnnualChanges_Dec2004.tif": 'categorical',
"tmf_AnnualChanges_Dec2005.tif": 'categorical',
"tmf_AnnualChanges_Dec2006.tif": 'categorical',
"tmf_AnnualChanges_Dec2007.tif": 'categorical',
"tmf_AnnualChanges_Dec2008.tif": 'categorical',
"tmf_AnnualChanges_Dec2009.tif": 'categorical',
"tmf_AnnualChanges_Dec2010.tif": 'categorical',
"tmf_AnnualChanges_Dec2011.tif": 'categorical',
"tmf_AnnualChanges_Dec2012.tif": 'categorical',
"tmf_AnnualChanges_Dec2013.tif": 'categorical',
"tmf_AnnualChanges_Dec2014.tif": 'categorical',
"tmf_AnnualChanges_Dec2015.tif": 'categorical',
"tmf_AnnualChanges_Dec2016.tif": 'categorical',
"tmf_AnnualChanges_Dec2017.tif": 'categorical',
"tmf_AnnualChanges_Dec2018.tif": 'categorical',
"tmf_AnnualChanges_Dec2019.tif": 'categorical',
"tmf_AnnualChanges_Dec2020.tif": 'categorical',
"tmf_AnnualChanges_Dec2021.tif": 'categorical',
"tmf_AnnualChanges_Dec2022.tif": 'categorical',
"tmf_AnnualChanges_Dec2023.tif": 'categorical',
"tmf_AnnualChanges_Dec2024.tif": 'categorical',
"tmf_AnnualDisruptionObs2023_y2023.tif": 'categorical',
"tmf_AnnualDisruptionObs2024_SumNonForest.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1982.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1983.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1984.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1985.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1986.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1987.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1988.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1989.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1990.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1991.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1992.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1993.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1994.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1995.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1996.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1997.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1998.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y1999.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2000.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2001.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2002.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2003.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2004.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2005.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2006.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2007.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2008.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2009.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2010.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2011.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2012.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2013.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2014.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2015.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2016.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2017.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2018.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2019.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2020.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2021.tif": 'categorical',
"tmf_Ndisturb_C2_1982_2022_y2022.tif": 'categorical',
"tmf_TransitionMap_MainClasses_TransitionMap_MainClasses.tif": 'categorical',
"tmf_TransitionMap_Subtypes_TransitionMap_Subtypes.tif": 'categorical',
}

In [None]:
# Set resample algorithms for different raster types
# See https://gdal.org/programs/gdalwarp.html
categorical_alg = 'near'
continuous_alg = 'bilinear'
resample_algs = {'categorical': categorical_alg, 'continuous': continuous_alg}

# Target grid: resolution and bounds are taken from the template raster
template = gdal.Open(template_dir)
template_dimensions = template.GetGeoTransform()
xres, yres = template_dimensions[1], -template_dimensions[5]
xmin = template_dimensions[0]
ymin = template_dimensions[3] - template.RasterYSize * yres
xmax = xmin + template.RasterXSize * xres
ymax = template_dimensions[3]

# Resample progress
resample_progress_index = 0
resample_progress_label = widgets.Label(f"Resample progress: {resample_progress_index}/{len(selected_original_rasters)}")
display(resample_progress_label)

# Iterate over selected rasters
for original_raster_name, data_type in selected_original_rasters.items():
    resampled_raster_dir = join(resampled_dir, original_raster_name)
    if not exists(resampled_raster_dir):
        # Reject unknown data types up front; previously a typo silently reused
        # the algorithm chosen for the previous raster.
        if data_type not in resample_algs:
            raise ValueError(
                f"Unknown data type {data_type!r} for {original_raster_name}; "
                f"expected 'categorical' or 'continuous'.")
        resample_alg = resample_algs[data_type]

        # Look for the source raster in the EE, user upload and GLAD LCLUC directories (in that order)
        original_raster_dir = join(ee_dir, original_raster_name)
        if not exists(original_raster_dir): original_raster_dir = join(user_upload_dir, original_raster_name)
        if not exists(original_raster_dir): original_raster_dir = join(glad_lcluc_dir, original_raster_name)
        if not exists(original_raster_dir):
            raise FileNotFoundError(f"{original_raster_name} was not found in any source directory.")

        # Warp to an in-memory dataset first, so the compressed copy below is
        # not written over a file that its own source dataset still has open.
        src = gdal.Warp(
            '',
            original_raster_dir,
            format='MEM',
            xRes=xres, yRes=yres,
            outputBounds=(xmin, ymin, xmax, ymax),
            resampleAlg=resample_alg,
            outputType=gdalconst.GDT_Float32)
        if src is None:
            raise RuntimeError(f"GDAL could not warp {original_raster_dir}.")
        # Compress and close
        driver = gdal.GetDriverByName("GTiff")
        compressed = driver.CreateCopy(resampled_raster_dir, src, 0, options=["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"])
        compressed = None  # Dereference to flush the output to disk
        src = None
    # Update resample progress
    resample_progress_index += 1
    resample_progress_label.value = f"Resample progress: {resample_progress_index}/{len(selected_original_rasters)}"

In [None]:
# Determine continuous feature precision
# For every 'continuous' raster, find the largest number of decimals at which
# the rounded raster has no more than max_unique_values unique values.

override_max_unique_values = False
max_unique_values = 5000 # Should be >=10

if not override_max_unique_values:
    dem_base_path = join(areas_dir, "base_dem.tif")
    dem_base_array = gdal.Open(dem_base_path).ReadAsArray()
    max_unique_values = int(np.ptp(dem_base_array)) # Precision based on elevation variance
resampled_precision_dict = {}

histogram_sample_size = 100_000

for resampled_feature, resample_type in selected_original_rasters.items():
    if resample_type != 'continuous':
        continue
    resampled_feature_path = join(resampled_dir, resampled_feature)
    print(f"Reading {resampled_feature}...")
    # Read raster as array
    resampled_feature_array = gdal.Open(resampled_feature_path).ReadAsArray()
    # Convert 'nodata' values to nan
    resampled_feature_array[resampled_feature_array == nodatavalue] = np.nan
    resampled_feature_array_masked = np.ma.array(resampled_feature_array, mask=np.isnan(resampled_feature_array))
    # Count unique values in raster
    unique_values = len(np.unique(resampled_feature_array_masked))
    print(f"There are {unique_values} unique values in {resampled_feature}")
    # Generate histogram from up to 100,000 random valid points. Sampling only
    # valid values keeps NaN out of the histogram, and capping the sample size
    # avoids an error on rasters with fewer than 100,000 valid pixels.
    valid_values = resampled_feature_array_masked.compressed()
    if valid_values.size > 0:
        sample_size = min(histogram_sample_size, valid_values.size)
        random_selection = np.random.choice(valid_values, size=sample_size, replace=False)
        _ = plt.hist(random_selection, bins='auto')  # arguments are passed to np.histogram
        plt.title(f"{resampled_feature}")
        plt.show()
    # Remove 0 values for log10. This is done on the array itself, so the
    # rounding below sees zeros and nodata alike as NaN.
    resampled_feature_array[resampled_feature_array == 0] = np.nan
    resampled_feature_array_masked = np.ma.array(resampled_feature_array, mask=np.isnan(resampled_feature_array))
    # Reset per feature so a value from a previous feature is never reported
    optimal_precision = None
    if resampled_feature_array_masked.count() > 0:
        # Create log10 array for determining positions for rounding
        array_log10 = np.log10(abs(resampled_feature_array_masked))
        place_value_decimal = int(abs(np.min(array_log10)))
        place_value_integer = int(0 - np.max(array_log10))
        # Iterate down precision levels to determine optimal number of unique values
        min_starting_precision = len(str(max_unique_values))
        for precision in reversed(range(place_value_integer, max(min_starting_precision, place_value_decimal + 1))):
            rounded_array = np.round(resampled_feature_array, decimals=precision)
            round_unique_values = len(np.unique(rounded_array))
            if round_unique_values <= max_unique_values:
                optimal_precision = precision
                print(f"The optimal precison for {resampled_feature} is {optimal_precision}, with {round_unique_values} unique values.")
                resampled_precision_dict.update({f'{resampled_feature}':f'{optimal_precision}'})
                break
    if optimal_precision is None: print("There's a problem with setting precision.")
    print("___________________\n")

print("Dictionary for optimal rounding values:")
# A bare expression is only displayed when it is the last line of a cell, so print explicitly
print(resampled_precision_dict)

precision_dict_csv_path = join(resampled_dir, 'rounding_dictionary.csv')
# Save rounding dictionary to CSV: one row of raster names, one row of precisions
with open(precision_dict_csv_path, 'w', newline='') as precision_dict_csv:
    writer = csv.writer(precision_dict_csv)
    writer.writerow(resampled_precision_dict.keys())
    writer.writerow(resampled_precision_dict.values())

In [None]:
# Read the rounding dictionary back from its CSV (first row: names, second row: precisions)
with open(precision_dict_csv_path, 'r') as file:
    csv_rows = list(csv.reader(file))
    keys, values = csv_rows
    topo_precision_dict = {name: decimals for name, decimals in zip(keys, values)}

# Print a ready-to-paste dictionary so the precision can be verified and corrected if necessary
print("topo_precision_dict = {")
for key in topo_precision_dict:
    value = topo_precision_dict[key]
    print(f'"{key}": {value},')
print("}")

In [None]:
# Raster filename -> number of decimals to round to.
# Paste the dictionary printed by the previous cell here, correcting values where needed.
# Left empty, the smoothing cell below has nothing to iterate over.
topo_precision_dict = {

}

In [None]:
# Set smoothing kernel
kernel = Gaussian2DKernel(x_stddev=1, y_stddev=1)

# Continuous progress
continuous_progress_index = 0
continuous_progress_label = widgets.Label(f"Continuous progress: {continuous_progress_index}/{len(topo_precision_dict)}")
display(continuous_progress_label)

# Iterate over selected continuous rasters, exporting a rounded unsmoothed and
# a rounded Gaussian-smoothed version of each
for continuous, precision in topo_precision_dict.items():
    cont_raster_unsmoothed_filename = f"{continuous[:-4]}_unsmooth.tif"
    cont_raster_unsmoothed_path = join(continuous_final_dir, cont_raster_unsmoothed_filename)
    cont_raster_smoothed_filename = f"{continuous[:-4]}_smooth.tif"
    cont_raster_smoothed_path = join(continuous_final_dir, cont_raster_smoothed_filename)

    needs_unsmoothed = not exists(cont_raster_unsmoothed_path)
    needs_smoothed = not exists(cont_raster_smoothed_path)

    # Only read the raster when at least one output is missing, so re-running
    # the cell does not reload every raster from disk
    if needs_unsmoothed or needs_smoothed:
        cont_raster_resampled_path = join(resampled_dir, continuous)
        cont_raster_resampled_array = gdal.Open(cont_raster_resampled_path).ReadAsArray()
        # Convert nodata values to 0
        cont_raster_resampled_array[cont_raster_resampled_array == nodatavalue] = 0

        if needs_unsmoothed:
            # Round and export unsmoothed continuous raster
            cont_raster_unsmoothed_rounded = np.round(cont_raster_resampled_array, decimals=int(precision))
            export_array_as_tif(cont_raster_unsmoothed_rounded, cont_raster_unsmoothed_path)

        if needs_smoothed:
            # Smooth using 2D spatial convolution
            cont_raster_smoothed = convolve(cont_raster_resampled_array, kernel, boundary='extend')
            # Round and export smoothed continuous raster
            cont_raster_smoothed_rounded = np.round(cont_raster_smoothed, decimals=int(precision))
            export_array_as_tif(cont_raster_smoothed_rounded, cont_raster_smoothed_path)

    # Update continuous progress
    continuous_progress_index += 1
    continuous_progress_label.value = f"Continuous progress: {continuous_progress_index}/{len(topo_precision_dict)}"

# TMF binary features

In [None]:
# Check TMF data users guide for classification. https://forobs.jrc.ec.europa.eu/static/tmf/TMF_DataUsersGuide.pdf

cell_size_x = gdal.Open(join(areas_dir, 'cell_size_x.tif')).ReadAsArray()
cell_size_y = gdal.Open(join(areas_dir, 'cell_size_y.tif')).ReadAsArray()
cell_size_ha = np.mean(cell_size_x) * np.mean(cell_size_y) / 10_000
sieve_size = int(np.ceil(0.5/cell_size_ha)) # Removes all forest patches smaller than 0.5 ha
print(f"Forest binary sieve size (>0.5 ha) is {sieve_size} pixels.")


def _tmf_raster_year(raster_name):
    """Return the year of a TMF raster: 2024 if '2024' is in the name, else the four characters before the extension."""
    if '2024' in raster_name:
        return 2024  # This one has a funny name
    return raster_name[-8:-4]


# Generate list of valid TMF rasters to convert to binary
tmf_keywords = ('DisruptionObs', 'AnnualChanges', 'Ndisturb')
binary_list = [
    resampled_raster for resampled_raster in os.listdir(resampled_dir)
    if any(keyword in resampled_raster for keyword in tmf_keywords)
    and int(_tmf_raster_year(resampled_raster)) >= 1990  # Data prior to 1990 is poor
]

# Binary progress
binary_progress_index = 0
binary_progress_label = widgets.Label(f"Binary progress: {binary_progress_index}/{len(binary_list)}")
display(binary_progress_label)

for resampled_raster in binary_list:
    year = _tmf_raster_year(resampled_raster)

    # Forest binary
    if 'AnnualChanges' in resampled_raster:
        forest_binary_path = join(binary_dir, f"forest_binary_{year}.tif")
        if not exists(forest_binary_path):
            ac_raster_path = join(resampled_dir, resampled_raster)
            ac_array = gdal.Open(ac_raster_path).ReadAsArray()
            # Classes 1 & 2 become 1, everything else 0
            forest_binary_array = np.where(np.isin(ac_array, (1, 2)), 1, 0)

            # Sieve to 0.5 ha: label patches using 8-connectedness (3, 3)
            fb_array_labelled, fb_array_features = label(forest_binary_array, structure=np.ones((3, 3)))
            # Pixel count of every labelled patch (index 0 is the background)
            fb_array_sizes = ndi_sum(forest_binary_array, fb_array_labelled, range(fb_array_features + 1))
            # Keep only patches that reach the threshold; never keep the background
            keep_patch = fb_array_sizes >= sieve_size
            keep_patch[0] = 0
            fb_array_mask = keep_patch[fb_array_labelled]
            # Apply the mask to the forest binary array and export
            fb_array_sieved = forest_binary_array * fb_array_mask
            export_array_as_tif(fb_array_sieved, forest_binary_path)

    # Disturbance binary
    if 'DisruptionObs' in resampled_raster or 'Ndisturb' in resampled_raster:
        disturbance_binary_path = join(binary_dir, f"disturbance_binary_{year}.tif")
        if not exists(disturbance_binary_path):
            # AnnualChanges raster of the same year
            ac_raster_path = glob.glob(f"{resampled_dir}/*AnnualChanges*{year}*")
            ac_array = gdal.Open(ac_raster_path[0]).ReadAsArray()
            do_raster_path = join(resampled_dir, resampled_raster)
            do_array = gdal.Open(do_raster_path).ReadAsArray()
            # A disruption counts as '1' unless AnnualChanges classes the pixel as undisturbed forest (1) or water (5)
            excluded_classes = (ac_array == 1) | (ac_array == 5)
            disturbance_binary_array = np.where((do_array >= 1) & ~excluded_classes, 1, 0)
            export_array_as_tif(disturbance_binary_array, disturbance_binary_path)

    # Update binary progress
    binary_progress_index += 1
    binary_progress_label.value = f"Binary progress: {binary_progress_index}/{len(binary_list)}"

In [None]:
# Extract mangrove binary (optional)
extract_mangrove_binary = True

if extract_mangrove_binary:
    mangrove_binary_path = join(binary_dir, "mangrove_binary.tif")
    if not exists(mangrove_binary_path):
        # Resampled TMF subtypes raster(s), and the oldest available forest
        # binary raster for full mangrove extent (1990)
        subtypes_rasters = [raster for raster in os.listdir(resampled_dir) if 'Subtypes' in raster]
        forest_binary_1990_path = join(binary_dir, "forest_binary_1990.tif")

        # Check inputs explicitly. A missing Subtypes raster previously raised a
        # NameError, or silently reused an array left over from an earlier run.
        if not subtypes_rasters:
            print("No resampled 'Subtypes' raster was found, so the mangrove binary cannot be generated.")
        elif not exists(forest_binary_1990_path):
            print("forest_binary_1990.tif was not found, so the mangrove binary cannot be generated.")
        else:
            # If several rasters match, the last one listed is used (as before)
            subtypes_raster_path = join(resampled_dir, subtypes_rasters[-1])
            subtypes_raster_array = gdal.Open(subtypes_raster_path).ReadAsArray()
            forest_binary_1990_array = gdal.Open(forest_binary_1990_path).ReadAsArray()

            # Mangrove = 1990 forest whose subtype is 12 or in the range 61-69
            mangrove_binary_array = np.logical_and(forest_binary_1990_array == 1,
                np.logical_or(subtypes_raster_array == 12,(subtypes_raster_array >= 61) & (subtypes_raster_array <= 69))
            )

            # Calculate the percentage of forest pixels that are mangrove
            forest_1990_pixels = np.sum(forest_binary_1990_array)
            mangrove_pixels = np.sum(mangrove_binary_array)
            if mangrove_pixels > 0:
                mangrove_percent = (mangrove_pixels / forest_1990_pixels) * 100
                export_array_as_tif(mangrove_binary_array, mangrove_binary_path)
                print(f"Number of mangrove pixels: {mangrove_pixels}")
                print(f"Percentage of 1990 forest pixels that are mangrove: {mangrove_percent:.2f}%")
            else: print("There are no mangrove pixels in the template area.")
    else: print("A mangrove binary raster already exists. Delete it to generate a new one.")

# LU polygon binary features

In [None]:
# Select 'land use' polygons.
# A 'complete recovery' or 'complete restoration' scenario requires ONE of these as a proxy.
# It can combine several PAs / polygons that have no or minimal history of human disturbance.

polygons_to_exclude = ['template.gpkg', 'project_area.gpkg', 'project_area_buffered_bbox.gpkg', 'gedi_area.gpkg', 'project_area_inverse.gpkg', 'gedi_area_inverse.gpkg']

# Print every remaining polygon file as a ready-to-paste list
candidate_polygons = [polygon for polygon in os.listdir(polygons_dir)
                      if polygon not in polygons_to_exclude]
print("lu_polygons = [")
for polygon in candidate_polygons:
    print(f"'{polygon}',")
print("]")

In [None]:
lu_polygons = [
# 'peninsular_malaysia.gpkg',
'lu_oldgrowth.gpkg',
]

# Prepare a template-shaped array with every value set to 0
template_tif = gdal.Open(template_dir)
template_mask_array = gdal.Open(template_dir).ReadAsArray()
template_mask_array[...] = 0

for lu_polygon in lu_polygons:
    lu_binary_name = f"{lu_polygon[:-5]}_binary.tif"
    lu_binary_path = join(binary_dir, lu_binary_name)
    if exists(lu_binary_path):
        print(f"{lu_binary_name} already exists.")
        continue
    lu_polygon_path = join(polygons_dir, lu_polygon)
    # Write the all-zero raster, then burn the value '1' where the polygon overlaps it
    export_array_as_tif(template_mask_array, lu_binary_path)
    burn_polygon_to_raster(lu_binary_path, lu_polygon_path, fixed=True, fixed_value=1, all_touched=False)
    print(f"{lu_binary_name} has been created.")

# Binary masks

In [None]:
# List the mask types available for later scenario predictions, e.g. so outputs only show forest.
# A mask type is the part of a binary raster filename before its first underscore.
# dict.fromkeys drops duplicates while keeping the order in which types are first seen.
mask_type_list = list(dict.fromkeys(
    filename.split('_')[0] for filename in os.listdir(binary_dir)
))

# Print a ready-to-paste 'mask_types' list
print("mask_types = [")
for mask_type in mask_type_list:
    print(f"'{mask_type}',")
print("]")

In [None]:
mask_types = [
'forest',
# 'lu',
# 'disturbance',
]

# Pair each selected mask type with the binary rasters whose filename contains it.
# Keeping the pairing means a binary is only ever exported under the type that selected it,
# rather than under every selected type.
binary_mask_pairs = []
for mask_type in mask_types:
  for binary in os.listdir(binary_dir):
    if mask_type in binary:
      binary_mask_pairs.append((mask_type, binary))

# Flat list of the selected binary filenames (one entry per mask to create)
binary_mask_list = [binary for _, binary in binary_mask_pairs]

# Binary progress
mask_progress_index = 0
mask_progress_label = widgets.Label(f"Binary progress: {mask_progress_index}/{len(binary_mask_list)}")
display(mask_progress_label)

# Create one mask per (mask type, binary raster) pair
for mask_type, binary in binary_mask_pairs:
  binary_path = join(binary_dir, binary)
  # Use the four characters before the extension as the year if they form an integer
  try: year = str(int(binary[-8:-4]))
  except ValueError: year = None
  # NOTE: binaries without a year all map to 'mask_{mask_type}_None.tif',
  # so only the first of them is exported and the rest are reported as existing.
  mask_raster_path = join(scenario_mask_dir, f"mask_{mask_type}_{year}.tif")
  if not exists(mask_raster_path):
    binary_array = gdal.Open(binary_path).ReadAsArray()
    # Pixels equal to 0 become nodata, every other pixel becomes 1
    mask_array = np.where(binary_array == 0, nodatavalue, 1)
    export_array_as_tif(mask_array, mask_raster_path)
    print(f"A mask raster has been created: {mask_raster_path}")
  else: print(f"A mask raster already exists at: {mask_raster_path}")
  # Update mask progress
  mask_progress_index += 1
  mask_progress_label.value = f"Binary progress: {mask_progress_index}/{len(binary_mask_list)}"

# Binary feature edge effects

In [None]:
# Gaussian kernel used to smooth each binary raster
kernel = Gaussian2DKernel(x_stddev=3, y_stddev=3)
# Number of decimal places kept in the exported edge effect values
precision = 2

# Every filename containing 'binary' in the binary and resampled directories
binary_list = [
  filename
  for filename in os.listdir(binary_dir) + os.listdir(resampled_dir)
  if "binary" in filename
]

# Edge effect progress
edge_effect_progress_index = 0
edge_effect_progress_label = widgets.Label(f"Edge effect progress: {edge_effect_progress_index}/{len(binary_list)}")
display(edge_effect_progress_label)

# (min, max, new value) ranges applied in order to the summed proximity array
pixel_prox_reclass_table = [(0, 0, -4), (1, 1, -1), (1.4, 1.5, -2), (2, 2, -3), (10, 10, 3), (11, 11, 0), (11.4, 11.5, 1), (12, 12, 2)]

def _capped_distance(target_pixels):
  # Euclidean distance transform of the target pixels, with distances above 2 reset to 0
  distances = distance_transform_edt(target_pixels)
  return np.where(distances > 2, 0, distances)

for binary_raster in binary_list:
  edge_effects_filename = binary_raster.replace('binary', 'with_edge_effects')
  edge_effects_path = join(edge_effects_dir, edge_effects_filename)
  if not exists(edge_effects_path):
    # Look in the binary directory first, then fall back to the resampled directory
    binary_dir_path = join(binary_dir, binary_raster)
    binary_raster_path = binary_dir_path if exists(binary_dir_path) else join(resampled_dir, binary_raster)
    binary_array = gdal.Open(binary_raster_path).ReadAsArray()
    # Recode 1 as 10 so both binary classes stay distinguishable once distances are added
    differentiator_array = binary_array.copy()
    differentiator_array[differentiator_array == 1] = 10
    # Proximity measured on the 0 pixels, then on the 1 pixels
    positive_proximity_array = _capped_distance(binary_array == 0)
    negative_proximity_array = _capped_distance(binary_array == 1)
    # Sum proximities and differentiator
    pixel_prox_summed = differentiator_array + positive_proximity_array + negative_proximity_array
    # Reclassify the summed values in place, one range at a time
    pixel_prox_reclassed = pixel_prox_summed.copy()
    for lower, upper, replacement in pixel_prox_reclass_table:
      in_range = (pixel_prox_reclassed >= lower) & (pixel_prox_reclassed <= upper)
      pixel_prox_reclassed[in_range] = replacement
    # Smooth the binary array with a 2D convolution
    binary_smoothed = convolve(binary_array, kernel, boundary='extend')
    # Add the smoothed binary to the reclassified proximity, round, and export
    edge_effects_array = np.round(pixel_prox_reclassed + binary_smoothed, precision)
    export_array_as_tif(edge_effects_array, edge_effects_path)

  # Update progress for every listed raster, whether it was created or already existed
  edge_effect_progress_index += 1
  edge_effect_progress_label.value = f"Edge effect progress: {edge_effect_progress_index}/{len(binary_list)}"

# Disconnect runtime

In [None]:
# Unassign the Colab runtime from this notebook ('runtime' is imported from google.colab).
# Placed last so it only runs after the cells above have finished.
# Useful for stopping background execution
runtime.unassign()