<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/4_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports, directories and global functions

In [None]:
# Standard-library and Colab imports needed before the Drive can be mounted
import os
import sys

from google.colab import drive

# Base directory of the project on the shared Google Drive
base_dir = "/gdrive/Shareddrives/masfi"

# Mount Google Drive (forcing a fresh mount)
drive.mount('/gdrive', force_remount=True)

# Make modules stored in the base directory importable, without adding duplicates
_path_to_add = os.path.realpath(base_dir)
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs
!pip install geopandas

In [None]:
# Reload imports, replacing those in the cache
%load_ext autoreload
%autoreload 2
# Imports
from datetime import datetime
import geopandas as gpd
from google.colab import runtime
import ipywidgets as widgets
from IPython.display import clear_output
import json
import matplotlib.pyplot as plt
import numpy as np
import operator
from os import makedirs
from os.path import exists, join
from osgeo import gdal
import pandas as pd
import requests
from shapely.geometry import Point
from shapely.ops import unary_union
from shutil import copyfile
from time import sleep

In [None]:
# --- 1_areas: inputs describing the study area ---
areas_dir = join(base_dir, '1_areas')
polygons_dir = join(areas_dir, 'polygons')
# NB: despite the name this is the path of the template raster file, not a directory
template_dir = join(areas_dir, "template.tif")

# --- 2_variates: finalised variate pickles ---
variates_final_dir = join(base_dir, "2_variates/pkl_final")

# --- 3_predictors: predictor rasters grouped by type ---
predictors_dir = join(base_dir, "3_predictors")
edge_effects_dir = join(predictors_dir, "binary_edge_effects")
continuous_final_dir = join(predictors_dir, "continuous_final")
topography_final_dir = join(predictors_dir, "topo_final")
topography_corrected_final_dir = join(predictors_dir, "topo_corrected_final")
coast_dir = join(predictors_dir, 'coast')
predictor_final_dir = join(predictors_dir, "final")

# --- 4_datasets: outputs of this notebook ---
datasets_dir = join(base_dir, "4_datasets")
datasets_var_dir = join(datasets_dir, "variates")
datasets_add_pre_dir = join(datasets_dir, "add_predictors")
datasets_filtered_dir = join(datasets_dir, "filtered")
datasets_final_dir = join(datasets_dir, "final")
datasets_gpkg_dir = join(datasets_dir, "gpkg")

# Create every output directory this notebook writes to (no error if it already exists)
for _output_dir in (
    predictor_final_dir,
    datasets_dir,
    datasets_var_dir,
    datasets_add_pre_dir,
    datasets_filtered_dir,
    datasets_final_dir,
    datasets_gpkg_dir,
):
    makedirs(_output_dir, exist_ok=True)

In [None]:
# export_array_as_tif function
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_dir, nodatavalue=nodatavalue, compress=compress):
  """Write a 2D array to a single-band Float32 GeoTIFF georeferenced like a template raster.

  Parameters:
    input_array: array written to band 1 of the output.
    output_tif: path of the GeoTIFF to create.
    template: path of the raster whose band-1 size, geotransform and projection are copied.
    nodatavalue: value registered as the no-data value of band 1.
    compress: any truthy value writes with DEFLATE compression (PREDICTOR=2, ZLEVEL=9);
      any falsy value (False, None, 0) writes uncompressed.

  Raises:
    RuntimeError: if the template cannot be opened or the output cannot be created
      (only reachable when GDAL returns None instead of raising itself).
  """
  template_dataset = gdal.Open(template)
  if template_dataset is None:
    raise RuntimeError(f"Template raster could not be opened: {template}")
  template_band = template_dataset.GetRasterBand(1)
  template_dimensions, template_projection = template_dataset.GetGeoTransform(), template_dataset.GetProjection()
  # A single create call with optional creation options, so the output is always defined
  # (previously a falsy-but-not-False `compress`, e.g. None, left it unbound)
  creation_options = ["COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"] if compress else []
  output_dataset = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, bands=1, eType=gdal.GDT_Float32,
                                                options=creation_options)
  if output_dataset is None:
    raise RuntimeError(f"Output raster could not be created: {output_tif}")
  output_band = output_dataset.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  output_dataset.SetGeoTransform(template_dimensions)
  output_dataset.SetProjection(template_projection)
  # Flush and release the handles explicitly so the file is complete on disk when the function returns
  output_dataset.FlushCache()
  output_band = None
  output_dataset = None
  template_dataset = None


def sample_raster_values(pd_dataframe, raster_path, predictor=False, geometry_column='geometry'):
    """Sample band 1 of a raster at every point of a DataFrame and store the values in a new column.

    The new column is named after the raster file without its extension, prefixed with
    'pre_' when `predictor` is True. `pd_dataframe` is modified in place; nothing is returned.

    Parameters:
        pd_dataframe: DataFrame whose `geometry_column` holds point geometries exposing `.x` and `.y`.
        raster_path: path of the raster to sample.
        predictor: if True, prefix the new column name with 'pre_'.
        geometry_column: name of the column holding the point geometries.

    Notes:
        Only the origin and pixel-size terms of the geotransform are used (no rotation) and
        no reprojection is done, so points must already be in the raster's coordinate system.
        Points outside the raster (or with non-finite coordinates) receive NaN and are
        reported, instead of wrapping around to the opposite edge through negative indices.

    Raises:
        RuntimeError: if the raster cannot be opened (when GDAL returns None instead of raising).
    """
    raster_name = os.path.splitext(os.path.basename(raster_path))[0]
    if predictor: raster_name = 'pre_' + raster_name
    # Read raster array and dimensions
    raster = gdal.Open(raster_path)
    if raster is None:
        raise RuntimeError(f"Raster could not be opened: {raster_path}")
    geotransform = raster.GetGeoTransform()
    raster_array = raster.GetRasterBand(1).ReadAsArray()
    n_rows, n_cols = raster_array.shape
    # Compute fractional pixel positions for all points at once
    geometries = pd_dataframe[geometry_column]
    x_coords = np.array([geom.x for geom in geometries], dtype=float)
    y_coords = np.array([geom.y for geom in geometries], dtype=float)
    with np.errstate(invalid='ignore'):
        # floor (not truncation) so positions just left of / above the raster become -1, not 0
        col_positions = np.floor((x_coords - geotransform[0]) / geotransform[1])
        row_positions = np.floor((y_coords - geotransform[3]) / geotransform[5])
        # NaN positions compare False and are therefore treated as outside
        inside = (col_positions >= 0) & (col_positions < n_cols) & (row_positions >= 0) & (row_positions < n_rows)
    cols = col_positions[inside].astype(int)
    rows = row_positions[inside].astype(int)
    # Sample values from the raster array
    if inside.all():
        # Common case: keep the raster's own dtype
        sampled_values = raster_array[rows, cols]
    else:
        sampled_values = np.full(len(x_coords), np.nan)
        sampled_values[inside] = raster_array[rows, cols]
        print(f"{int((~inside).sum())} points fall outside {raster_name} and were assigned NaN.")
    # Add the sampled values as a new column to the DataFrame
    pd_dataframe[raster_name] = sampled_values

# Finalise predictors

In [None]:
# Round the latitude and longitude rasters and write them to the final predictor directory.
# Coordinates stand in for ecological spatial patterns that no predictor raster describes.
# The precision trades accuracy against overfitting and model training time.
precision = 3 # 3 is equivalent to a precision of ~111 m at the equator, i.e. every three to four 30 m pixels
coordinates = ['latitude.tif', 'longitude.tif']
for coordinate in coordinates:
  coordinate_path = join(areas_dir, coordinate)
  coordinate_rounded_path = join(predictor_final_dir, coordinate)
  # Skip coordinates that were already rounded in a previous run
  if exists(coordinate_rounded_path):
    print(f"{coordinate} already exists in {predictor_final_dir}")
    continue
  coordinate_array = gdal.Open(coordinate_path).ReadAsArray()
  coordinate_array_round = np.round(coordinate_array, precision)
  export_array_as_tif(coordinate_array_round, coordinate_rounded_path)
  print(f"{coordinate} has been rounded and exported to {predictor_final_dir}")

In [None]:
# Compile and verify final predictor list
# Directories whose entire contents are candidate predictors; the corrected
# topography directory is optional and only used when it exists.
predictor_source_dirs = [continuous_final_dir, edge_effects_dir, topography_final_dir]
if exists(topography_corrected_final_dir):
  predictor_source_dirs.append(topography_corrected_final_dir)

# The coast proximity raster is always included, followed by every file in the source directories
predictor_list = sorted(
  [join(coast_dir,'coast_proximity_km.tif')]
  + [join(source_dir, file_name) for source_dir in predictor_source_dirs for file_name in os.listdir(source_dir)]
)

# Print the list as a ready-to-paste Python literal
printed_lines = ["predictor_list = ["]
printed_lines.extend(f"'{predictor_path}'," for predictor_path in predictor_list)
printed_lines.append(']')
print("\n".join(printed_lines))

In [None]:
# Final predictor list. The entries use the same format that the previous cell prints,
# presumably pasted from that output so predictors can be reviewed or removed by hand — confirm.
# Every path listed here is copied into predictor_final_dir by the following cell.
predictor_list = [
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1990.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1991.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1992.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1993.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1994.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1995.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1996.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1997.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1998.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_1999.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2000.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2001.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2002.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2003.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2004.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2005.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2006.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2007.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2008.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2009.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2010.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2011.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2012.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2013.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2014.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2015.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2016.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2017.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2018.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2019.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2020.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2021.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2022.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/disturbance_with_edge_effects_2023.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1990.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1991.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1992.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1993.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1994.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1995.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1996.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1997.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1998.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_1999.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2000.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2001.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2002.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2003.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2004.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2005.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2006.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2007.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2008.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2009.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2010.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2011.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2012.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2013.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2014.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2015.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2016.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2017.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2018.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2019.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2020.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2021.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2022.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/forest_with_edge_effects_2023.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_ais_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_berkelah_jerantut_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_berkelah_kuantan_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_berkelah_temerloh_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_pa_taman_negara_krau_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_remen_chereh_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_tekai_tembeling_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_tekam_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_yong_lipis_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/binary_edge_effects/lu_yong_with_edge_effects.tif',
'/gdrive/Shareddrives/masfi/3_predictors/coast/coast_proximity_km.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_aspect_cosine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_aspect_sine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_circular_variance_aspect_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_circular_variance_aspect_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_circular_variance_aspect_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_deviation_mean_elevation_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_deviation_mean_elevation_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_deviation_mean_elevation_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_eastness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_elevation.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_northness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_profile_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_roughness_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_roughness_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_roughness_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_slope.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_stream_power_index_log10.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_surface_area_ratio.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_tangential_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_topographic_position_index_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_topographic_position_index_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_topographic_position_index_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_topographic_ruggedness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_smooth_topographic_wetness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_aspect_cosine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_aspect_sine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_circular_variance_aspect_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_circular_variance_aspect_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_circular_variance_aspect_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_deviation_mean_elevation_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_deviation_mean_elevation_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_deviation_mean_elevation_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_eastness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_elevation.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_northness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_profile_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_roughness_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_roughness_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_roughness_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_slope.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_stream_power_index_log10.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_surface_area_ratio.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_tangential_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_topographic_position_index_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_topographic_position_index_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_topographic_position_index_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_topographic_ruggedness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_corrected_final/topo_cor_unsmooth_topographic_wetness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_aspect_cosine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_aspect_sine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_circular_variance_aspect_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_circular_variance_aspect_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_circular_variance_aspect_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_deviation_mean_elevation_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_deviation_mean_elevation_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_deviation_mean_elevation_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_eastness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_elevation.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_northness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_profile_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_roughness_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_roughness_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_roughness_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_slope.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_stream_power_index_log10.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_surface_area_ratio.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_tangential_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_topographic_position_index_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_topographic_position_index_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_topographic_position_index_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_topographic_ruggedness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_smooth_topographic_wetness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_aspect_cosine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_aspect_sine.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_circular_variance_aspect_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_circular_variance_aspect_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_circular_variance_aspect_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_deviation_mean_elevation_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_deviation_mean_elevation_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_deviation_mean_elevation_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_eastness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_elevation.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_northness.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_profile_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_roughness_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_roughness_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_roughness_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_slope.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_stream_power_index_log10.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_surface_area_ratio.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_tangential_curvature.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_topographic_position_index_03.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_topographic_position_index_07.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_topographic_position_index_11.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_topographic_ruggedness_index.tif',
'/gdrive/Shareddrives/masfi/3_predictors/topo_final/topo_uncor_unsmooth_topographic_wetness_index.tif',
]

In [None]:
# Copy every listed predictor into the final predictor directory, leaving existing copies untouched
for predictor in predictor_list:
  predictor_name = os.path.basename(predictor)
  predictor_destination = join(predictor_final_dir, predictor_name)
  if exists(predictor_destination):
    continue
  copyfile(predictor, predictor_destination)
print("All predictors finalised.")

# Compile GEDI elevation

In [None]:
# Correct GEDI elevation data

# Download Earth Gravitation Model (EGM) from https://www.agisoft.com/downloads/geoids/
earth_gravitational_model_url = 'https://download.agisoft.com/gtg/us_nga_egm2008_1.tif'
earth_gravitational_model_path = join(datasets_var_dir, 'earth_gravitational_model.tif')
if not exists(earth_gravitational_model_path):
  request = requests.get(earth_gravitational_model_url, allow_redirects=True)
  # Fail on HTTP errors: otherwise an error page would be saved as the raster and,
  # because the file then exists, never be downloaded again
  request.raise_for_status()
  # The context manager closes the file so the raster is fully written before it is read
  with open(earth_gravitational_model_path, 'wb') as earth_gravitational_model_file:
    earth_gravitational_model_file.write(request.content)
  print(f'EGM raster downloaded to: {earth_gravitational_model_path}')
else: print(f'EGM raster already exists at: {earth_gravitational_model_path}')

# Select the GEDI .pkl with elevation ('elev_lowestmode') data
for pkl in os.listdir(variates_final_dir):
    print(f"variates_pkl = '{pkl}'")

In [None]:
variates_pkl = 'GEDI04_A.pkl'

use_gedi_area_polygon = True

variates_read_pkl = pd.read_pickle(join(variates_final_dir, variates_pkl))

# Ensure all points are in the training area
# (the polygon is assigned in both cases; previously the project-area branch read the file but discarded it)
if use_gedi_area_polygon: project_area_polygon = gpd.read_file(join(polygons_dir, 'gedi_area.gpkg'))
else: project_area_polygon = gpd.read_file(join(polygons_dir, 'project_area.gpkg'))
variates_geodataframe = gpd.GeoDataFrame(variates_read_pkl, geometry='geometry')
clipped_variates_gdf = gpd.clip(variates_geodataframe, project_area_polygon)
clipped_variates_df = pd.DataFrame(clipped_variates_gdf)
print(f"{len(variates_read_pkl) - len(clipped_variates_df)} out of {len(variates_read_pkl)} data points were outside the training area and removed.")

# Drop NA values, if any
dataset_na_values = clipped_variates_df.isna().any(axis=1).sum()
dataset_variates = clipped_variates_df.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

# Keep only the columns needed for the elevation correction.
# This continues from the clipped, NA-free data; previously the raw pickle was copied here,
# which silently undid the clipping and NA removal reported above.
columns_to_keep = ['shot_number','beam','geometry','elev_lowestmode','sensitivity']
dataset_variates = dataset_variates.drop(columns=[col for col in dataset_variates.columns.values if col not in columns_to_keep])

# Sample EGM values (adds the 'earth_gravitational_model' column)
sample_raster_values(dataset_variates, earth_gravitational_model_path)

# Correct elevation
dataset_variates['elevation_corrected'] = dataset_variates['elev_lowestmode'] - dataset_variates['earth_gravitational_model']

# Sample base DEM values (adds the 'base_dem' column)
base_dem_path = join(areas_dir, 'base_dem.tif')
sample_raster_values(dataset_variates, base_dem_path)

# Calculate elevation difference (GEDI corrected elevation minus base DEM)
dataset_variates['elevation_corrected_diff'] = dataset_variates['elevation_corrected'] - dataset_variates['base_dem']

# Evaluate difference for filtering. Outliers are usually GEDI measurement errors
elev_corr_diff_array = np.array(dataset_variates['elevation_corrected_diff'])
first_percentile = np.percentile(elev_corr_diff_array, 1)
ninety_ninth_percentile = np.percentile(elev_corr_diff_array, 99)
# Plot at most 100,000 randomly chosen values to keep the histogram fast
random_selection = np.random.choice(elev_corr_diff_array, size = min(100_000, len(dataset_variates)), replace = False)
_ = plt.hist(random_selection, bins='auto')  # arguments are passed to np.histogram
# NOTE(review): the plotted values are GEDI corrected elevation minus base DEM; confirm the title wording
plt.title("Base DEM - GEDI corrected elevation")
plt.show()
print(f'The ninety-ninth percentile is {ninety_ninth_percentile} while the first percentile is {first_percentile}.')

In [None]:
# Bounds on the elevation difference: the manual values below are used only when
# override_bounds is True, otherwise the 1st / 99th percentiles computed earlier apply
override_bounds = False
upper_bound, lower_bound = 20, -70
if override_bounds is False or not override_bounds:
  upper_bound, lower_bound = ninety_ninth_percentile, first_percentile

# Keep points whose difference lies within the bounds (both ends inclusive)
within_bounds = dataset_variates['elevation_corrected_diff'].between(lower_bound, upper_bound)
dataset_variates_filtered = dataset_variates[within_bounds]
points_before = len(dataset_variates)
points_removed = points_before - len(dataset_variates_filtered)
print(f'{points_removed} data points were filtered from {points_before} original points.')

# Remove the columns that were only needed for the correction and the filtering
correction_columns = ['elev_lowestmode','earth_gravitational_model','base_dem','elevation_corrected_diff']
dataset_variates_filtered = dataset_variates_filtered.drop(columns=correction_columns)

# Write the result to .pkl and read it back from disk
dataset_variates_path = join(datasets_var_dir, 'elevation_corrected.pkl')
dataset_variates_filtered.to_pickle(dataset_variates_path)
dataset_variates_filtered = pd.read_pickle(dataset_variates_path)
print(f"The GEDI corrected elevation dataset has been processed and exported to: {dataset_variates_path}.")

# Compile GEDI vegetation indices

In [None]:
# List the available variate .pkl files as ready-to-paste assignments;
# pick the GEDI .pkl holding aboveground biomass density / ha ('agbd') data
available_variates_pkls = os.listdir(variates_final_dir)
for pkl in available_variates_pkls:
    print("variates_pkl = '{}'".format(pkl))

In [None]:
variates_pkl = 'GEDI04_A.pkl'

dataset_name = 'agbd'
use_gedi_area_polygon = True

variates_read_pkl = pd.read_pickle(join(variates_final_dir, variates_pkl))

# Ensure all points are in the training area
# (the polygon is assigned in both cases; previously the project-area branch read the file but discarded it)
if use_gedi_area_polygon: project_area_polygon = gpd.read_file(join(polygons_dir, 'gedi_area.gpkg'))
else: project_area_polygon = gpd.read_file(join(polygons_dir, 'project_area.gpkg'))
variates_geodataframe = gpd.GeoDataFrame(variates_read_pkl, geometry='geometry')
clipped_variates_gdf = gpd.clip(variates_geodataframe, project_area_polygon)
clipped_variates_df = pd.DataFrame(clipped_variates_gdf)
print(f"{len(variates_read_pkl) - len(clipped_variates_df)} out of {len(variates_read_pkl)} data points were outside the training area and removed.")

# Drop NA values, if any
dataset_na_values = clipped_variates_df.isna().any(axis=1).sum()
dataset_variates = clipped_variates_df.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

# Create 'year' column from the first four characters of the timestamp
dataset_variates['year'] = dataset_variates['timestamp'].astype(str).str[:4].astype(int)

# Drop unneeded columns
dataset_variates = dataset_variates.drop(columns=['elev_lowestmode','timestamp'])

In [None]:
# Filter with TMF data
filter_with_tmf = True

dataset_variates_filtered = dataset_variates.copy()
if filter_with_tmf:
  # Create list of GEDI collection years
  gedi_year_list = dataset_variates['year'].unique().tolist()
  # Add year before first collection date to detect disturbance change
  gedi_year_list.append(min(gedi_year_list) - 1)

  # Filter non-forest and 'new changes' (with single pixel 8 connectedness edge) in the collection year
  # The timing of the disturbance within the year (whether before or after collection) is not known from TMF data.
  # These will also be excluded as predictors for training, as there's only a ~50 % chance the information is valid.
  # The edge / disturbance data will be added to the previous year at the prediction stage to simulate its effect.
  print(f"There are {len(dataset_variates)} data points in the unfiltered dataset.")

  # Sample relevant forest and disturbance rasters (one new column per raster, named after the file)
  for year in gedi_year_list:
    for predictor_type in ['forest_with_edge_effects', 'disturbance_with_edge_effects']:
      predictor_path = join(edge_effects_dir, f"{predictor_type}_{year}.tif")
      if not exists(predictor_path):
        print(f"{predictor_type}_{year}.tif does not exist, so GEDI data from this year have been removed.")
        # .copy() gives an independent frame, so the columns added to it later are not
        # written to a slice of the previous frame (SettingWithCopyWarning)
        dataset_variates_filtered = dataset_variates_filtered[dataset_variates_filtered['year'] != year].copy()
      else:
        sample_raster_values(dataset_variates_filtered, predictor_path)

  print(f"{len(dataset_variates) - len(dataset_variates_filtered)} data points were dropped due to missing TMF years.")

  # Filter non-forest (negative values)
  indices_to_filter_non_forest = [
    index for index, row in dataset_variates_filtered.iterrows()
    if row[f"forest_with_edge_effects_{row['year']}"] < 0
  ]
  dataset_variates_filtered = dataset_variates_filtered.drop(indices_to_filter_non_forest)
  print(f"{len(indices_to_filter_non_forest)} non-forest data points were dropped.")

  # Filter new disturbance with a 1 within a 1 pixel edge effect (between > -2 with 8-connectedness)
  # i.e. within the disturbance edge in the collection year but not in the year before
  indices_to_filter_new_disturbance = [
    index for index, row in dataset_variates_filtered.iterrows()
    if row[f"disturbance_with_edge_effects_{row['year']}"] > -2
    and row[f"disturbance_with_edge_effects_{row['year'] -1}"] <= -2
  ]
  dataset_variates_filtered = dataset_variates_filtered.drop(indices_to_filter_new_disturbance)
  print(f"{len(indices_to_filter_new_disturbance)} 'new disturbance' data points were dropped.")

  # Filter new forest edge effects within 1 pixel (between 0 and 2 with 8-connectedness)
  # i.e. within the forest edge in the collection year but not in the year before.
  # The previous year is read from the *forest* raster: the earlier comparison against the previous
  # year's disturbance raster did not test for a change in the forest edge.
  indices_to_filter_new_forest_edge = [
    index for index, row in dataset_variates_filtered.iterrows()
    if row[f"forest_with_edge_effects_{row['year']}"] < 2
    and row[f"forest_with_edge_effects_{row['year'] -1}"] >= 2
  ]
  dataset_variates_filtered = dataset_variates_filtered.drop(indices_to_filter_new_forest_edge)
  print(f"{len(indices_to_filter_new_forest_edge)} 'new forest edge' data points were dropped.")

  print(f"There are {len(dataset_variates_filtered)} data points remaining in the filtered dataset.")

  # Drop filtering columns
  dataset_variates_filtered = dataset_variates_filtered.loc[:,~dataset_variates_filtered.columns.str.contains('forest|disturbance')].reset_index(drop=True)

In [None]:
# Persist the filtered GEDI variates as '<dataset_name>.pkl' in the dataset
# variates directory, then reload the frame from disk so the in-memory copy
# is exactly what was written.
dataset_variates_path = join(datasets_var_dir, f'{dataset_name}.pkl')
pd.to_pickle(dataset_variates_filtered, dataset_variates_path)
dataset_variates_filtered = pd.read_pickle(dataset_variates_path)
print(f"The GEDI AGBD dataset has been processed and exported to: {dataset_variates_path}.")

# Compile uploaded CSV variates

In [None]:
# List every file in the final variates directory as a copy-paste ready
# `variates_pkl = '...'` assignment for the next cell.
for variates_file in os.listdir(variates_final_dir):
  print(f"variates_pkl = '{variates_file}'")

In [None]:
variates_pkl = 'GEDI02_A.pkl'

# Load the selected variates and the project area polygon
variates_read_pkl = pd.read_pickle(join(variates_final_dir, variates_pkl))
project_area_polygon = gpd.read_file(join(polygons_dir, 'project_area.gpkg'))

# Keep only the points that fall inside the training (project) area
variates_geodataframe = gpd.GeoDataFrame(variates_read_pkl, geometry='geometry')
clipped_variates_gdf = gpd.clip(variates_geodataframe, project_area_polygon)
clipped_variates_df = pd.DataFrame(clipped_variates_gdf)
print(f"{len(variates_read_pkl) - len(clipped_variates_df)} out of {len(variates_read_pkl)} data points were outside the training area and removed.")

# Flag the rows holding at least one NA value, count them, then keep the rest
rows_with_na = clipped_variates_df.isna().any(axis=1)
dataset_na_values = rows_with_na.sum()
dataset_variates = clipped_variates_df.loc[~rows_with_na].reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

In [None]:
# File name (including the '.pkl' extension) of the exported variates dataset
dataset_name = 'rh_intact.pkl'

# Export to .pkl, then reload from disk so the in-memory frame matches the file
dataset_variates_path = join(datasets_var_dir, dataset_name)
dataset_variates.to_pickle(dataset_variates_path)
dataset_variates = pd.read_pickle(dataset_variates_path)
# The message is deliberately generic: this cell exports whichever variates were
# compiled above (e.g. 'rh_intact'), not specifically a GEDI AGBD dataset
print(f"The variates dataset has been processed and exported to: {dataset_variates_path}.")

# Add predictors

**WARNING:** This will add all selected predictors from `3_predictors/final` to the user training and validation datasets. Remove these files first if you only wish to move the datasets to the final folder for use in a model.

In [None]:
# Print each .pkl in the dataset variates directory as a copy-paste ready
# `dataset_variates_pkl = '...'` assignment for the next cell.
for variates_file in os.listdir(datasets_var_dir):
  if not variates_file.endswith('.pkl'):
    continue
  print(f"dataset_variates_pkl = '{variates_file}'")

In [None]:
dataset_variates_pkl = 'agbd.pkl'

# Collect every file name found in the 'final' predictors directory
predictor_list = list(os.listdir(predictor_final_dir))

# Print the predictors as a copy-paste ready list for the next cell, where the
# ones to add to the dataset are selected.
# NOTE FOR CORRECTED ELEVATION:
# Land-cover more recent than the base DEM shouldn't be used, as it's intended for discrepancies between the base surface data and GEDI terrain data.
# In the case of Copernicus DEM, this is > 2014.
# NOTE FOR AGBD:
# Land-cover more recent than or the same year as the most recent GEDI data (e.g. 2022) will be removed at the finalisation stage.
print('predictor_list = [')
for predictor in sorted(predictor_list):
  print(f"'{predictor}',")
print(']')

In [None]:
# Predictors to add to the selected dataset. This replaces the list built from the
# directory listing in the previous cell. Each entry is a raster file name that the
# 'add predictors' cell joins onto `predictor_final_dir`; entries that are commented
# out are not part of the list and are therefore not sampled.
predictor_list = [
'coast_proximity_km.tif',
'disturbance_with_edge_effects_1990.tif',
'disturbance_with_edge_effects_1991.tif',
'disturbance_with_edge_effects_1992.tif',
'disturbance_with_edge_effects_1993.tif',
'disturbance_with_edge_effects_1994.tif',
'disturbance_with_edge_effects_1995.tif',
'disturbance_with_edge_effects_1996.tif',
'disturbance_with_edge_effects_1997.tif',
'disturbance_with_edge_effects_1998.tif',
'disturbance_with_edge_effects_1999.tif',
'disturbance_with_edge_effects_2000.tif',
'disturbance_with_edge_effects_2001.tif',
'disturbance_with_edge_effects_2002.tif',
'disturbance_with_edge_effects_2003.tif',
'disturbance_with_edge_effects_2004.tif',
'disturbance_with_edge_effects_2005.tif',
'disturbance_with_edge_effects_2006.tif',
'disturbance_with_edge_effects_2007.tif',
'disturbance_with_edge_effects_2008.tif',
'disturbance_with_edge_effects_2009.tif',
'disturbance_with_edge_effects_2010.tif',
'disturbance_with_edge_effects_2011.tif',
'disturbance_with_edge_effects_2012.tif',
'disturbance_with_edge_effects_2013.tif',
'disturbance_with_edge_effects_2014.tif',
'disturbance_with_edge_effects_2015.tif',
'disturbance_with_edge_effects_2016.tif',
'disturbance_with_edge_effects_2017.tif',
'disturbance_with_edge_effects_2018.tif',
'disturbance_with_edge_effects_2019.tif',
'disturbance_with_edge_effects_2020.tif',
'disturbance_with_edge_effects_2021.tif',
'disturbance_with_edge_effects_2022.tif',
'disturbance_with_edge_effects_2023.tif',
'forest_with_edge_effects_1990.tif',
'forest_with_edge_effects_1991.tif',
'forest_with_edge_effects_1992.tif',
'forest_with_edge_effects_1993.tif',
'forest_with_edge_effects_1994.tif',
'forest_with_edge_effects_1995.tif',
'forest_with_edge_effects_1996.tif',
'forest_with_edge_effects_1997.tif',
'forest_with_edge_effects_1998.tif',
'forest_with_edge_effects_1999.tif',
'forest_with_edge_effects_2000.tif',
'forest_with_edge_effects_2001.tif',
'forest_with_edge_effects_2002.tif',
'forest_with_edge_effects_2003.tif',
'forest_with_edge_effects_2004.tif',
'forest_with_edge_effects_2005.tif',
'forest_with_edge_effects_2006.tif',
'forest_with_edge_effects_2007.tif',
'forest_with_edge_effects_2008.tif',
'forest_with_edge_effects_2009.tif',
'forest_with_edge_effects_2010.tif',
'forest_with_edge_effects_2011.tif',
'forest_with_edge_effects_2012.tif',
'forest_with_edge_effects_2013.tif',
'forest_with_edge_effects_2014.tif',
'forest_with_edge_effects_2015.tif',
'forest_with_edge_effects_2016.tif',
'forest_with_edge_effects_2017.tif',
'forest_with_edge_effects_2018.tif',
'forest_with_edge_effects_2019.tif',
'forest_with_edge_effects_2020.tif',
'forest_with_edge_effects_2021.tif',
'forest_with_edge_effects_2022.tif',
'forest_with_edge_effects_2023.tif',
'latitude.tif',
'longitude.tif',
'lu_ais_with_edge_effects.tif',
'lu_berkelah_jerantut_with_edge_effects.tif',
'lu_berkelah_kuantan_with_edge_effects.tif',
'lu_berkelah_temerloh_with_edge_effects.tif',
'lu_pa_taman_negara_krau_with_edge_effects.tif',
'lu_remen_chereh_with_edge_effects.tif',
'lu_tekai_tembeling_with_edge_effects.tif',
'lu_tekam_with_edge_effects.tif',
'lu_yong_lipis_with_edge_effects.tif',
'lu_yong_with_edge_effects.tif',
'topo_cor_smooth_aspect_cosine.tif',
'topo_cor_smooth_aspect_sine.tif',
'topo_cor_smooth_circular_variance_aspect_03.tif',
'topo_cor_smooth_circular_variance_aspect_07.tif',
'topo_cor_smooth_circular_variance_aspect_11.tif',
'topo_cor_smooth_deviation_mean_elevation_03.tif',
'topo_cor_smooth_deviation_mean_elevation_07.tif',
'topo_cor_smooth_deviation_mean_elevation_11.tif',
'topo_cor_smooth_eastness.tif',
'topo_cor_smooth_elevation.tif',
'topo_cor_smooth_northness.tif',
'topo_cor_smooth_profile_curvature.tif',
'topo_cor_smooth_roughness_03.tif',
'topo_cor_smooth_roughness_07.tif',
'topo_cor_smooth_roughness_11.tif',
'topo_cor_smooth_slope.tif',
'topo_cor_smooth_stream_power_index_log10.tif',
'topo_cor_smooth_surface_area_ratio.tif',
'topo_cor_smooth_tangential_curvature.tif',
'topo_cor_smooth_topographic_position_index_03.tif',
'topo_cor_smooth_topographic_position_index_07.tif',
'topo_cor_smooth_topographic_position_index_11.tif',
'topo_cor_smooth_topographic_ruggedness_index.tif',
'topo_cor_smooth_topographic_wetness_index.tif',
'topo_cor_unsmooth_aspect_cosine.tif',
'topo_cor_unsmooth_aspect_sine.tif',
'topo_cor_unsmooth_circular_variance_aspect_03.tif',
'topo_cor_unsmooth_circular_variance_aspect_07.tif',
'topo_cor_unsmooth_circular_variance_aspect_11.tif',
'topo_cor_unsmooth_deviation_mean_elevation_03.tif',
'topo_cor_unsmooth_deviation_mean_elevation_07.tif',
'topo_cor_unsmooth_deviation_mean_elevation_11.tif',
'topo_cor_unsmooth_eastness.tif',
'topo_cor_unsmooth_elevation.tif',
'topo_cor_unsmooth_northness.tif',
'topo_cor_unsmooth_profile_curvature.tif',
'topo_cor_unsmooth_roughness_03.tif',
'topo_cor_unsmooth_roughness_07.tif',
'topo_cor_unsmooth_roughness_11.tif',
'topo_cor_unsmooth_slope.tif',
'topo_cor_unsmooth_stream_power_index_log10.tif',
'topo_cor_unsmooth_surface_area_ratio.tif',
'topo_cor_unsmooth_tangential_curvature.tif',
'topo_cor_unsmooth_topographic_position_index_03.tif',
'topo_cor_unsmooth_topographic_position_index_07.tif',
'topo_cor_unsmooth_topographic_position_index_11.tif',
'topo_cor_unsmooth_topographic_ruggedness_index.tif',
'topo_cor_unsmooth_topographic_wetness_index.tif',
# The 'topo_uncor_*' predictors below are currently deselected (commented out)
# 'topo_uncor_smooth_aspect_cosine.tif',
# 'topo_uncor_smooth_aspect_sine.tif',
# 'topo_uncor_smooth_circular_variance_aspect_03.tif',
# 'topo_uncor_smooth_circular_variance_aspect_07.tif',
# 'topo_uncor_smooth_circular_variance_aspect_11.tif',
# 'topo_uncor_smooth_deviation_mean_elevation_03.tif',
# 'topo_uncor_smooth_deviation_mean_elevation_07.tif',
# 'topo_uncor_smooth_deviation_mean_elevation_11.tif',
# 'topo_uncor_smooth_eastness.tif',
# 'topo_uncor_smooth_elevation.tif',
# 'topo_uncor_smooth_northness.tif',
# 'topo_uncor_smooth_profile_curvature.tif',
# 'topo_uncor_smooth_roughness_03.tif',
# 'topo_uncor_smooth_roughness_07.tif',
# 'topo_uncor_smooth_roughness_11.tif',
# 'topo_uncor_smooth_slope.tif',
# 'topo_uncor_smooth_stream_power_index_log10.tif',
# 'topo_uncor_smooth_surface_area_ratio.tif',
# 'topo_uncor_smooth_tangential_curvature.tif',
# 'topo_uncor_smooth_topographic_position_index_03.tif',
# 'topo_uncor_smooth_topographic_position_index_07.tif',
# 'topo_uncor_smooth_topographic_position_index_11.tif',
# 'topo_uncor_smooth_topographic_ruggedness_index.tif',
# 'topo_uncor_smooth_topographic_wetness_index.tif',
# 'topo_uncor_unsmooth_aspect_cosine.tif',
# 'topo_uncor_unsmooth_aspect_sine.tif',
# 'topo_uncor_unsmooth_circular_variance_aspect_03.tif',
# 'topo_uncor_unsmooth_circular_variance_aspect_07.tif',
# 'topo_uncor_unsmooth_circular_variance_aspect_11.tif',
# 'topo_uncor_unsmooth_deviation_mean_elevation_03.tif',
# 'topo_uncor_unsmooth_deviation_mean_elevation_07.tif',
# 'topo_uncor_unsmooth_deviation_mean_elevation_11.tif',
# 'topo_uncor_unsmooth_eastness.tif',
# 'topo_uncor_unsmooth_elevation.tif',
# 'topo_uncor_unsmooth_northness.tif',
# 'topo_uncor_unsmooth_profile_curvature.tif',
# 'topo_uncor_unsmooth_roughness_03.tif',
# 'topo_uncor_unsmooth_roughness_07.tif',
# 'topo_uncor_unsmooth_roughness_11.tif',
# 'topo_uncor_unsmooth_slope.tif',
# 'topo_uncor_unsmooth_stream_power_index_log10.tif',
# 'topo_uncor_unsmooth_surface_area_ratio.tif',
# 'topo_uncor_unsmooth_tangential_curvature.tif',
# 'topo_uncor_unsmooth_topographic_position_index_03.tif',
# 'topo_uncor_unsmooth_topographic_position_index_07.tif',
# 'topo_uncor_unsmooth_topographic_position_index_11.tif',
# 'topo_uncor_unsmooth_topographic_ruggedness_index.tif',
# 'topo_uncor_unsmooth_topographic_wetness_index.tif',
]

In [None]:
# Sample every raster in `predictor_list` at the dataset points and store the
# result in the 'add predictors' directory under the same file name as the
# selected variates dataset.
# With `cache` enabled the dataset is saved after each predictor, so an
# interrupted run can be resumed by re-running this cell; predictors whose
# column already exists are skipped.
cache = True

dataset_variates_path = join(datasets_var_dir, dataset_variates_pkl)
dataset_variates = pd.read_pickle(dataset_variates_path)
dataset_add_pre_path = join(datasets_add_pre_dir, dataset_variates_pkl)
# Scratch file for cached saves: each save is written here first and then moved
# over the real file, so an interrupted save cannot truncate the cached dataset
dataset_add_pre_tmp_path = f"{dataset_add_pre_path}.tmp"

# Handle caching
if cache: print("Cache enabled.")
dataset_add_pre_exists = exists(dataset_add_pre_path)
if dataset_add_pre_exists:
  print(f"An 'add predictors' dataset already exists: {dataset_add_pre_path}")
  if cache:
    print("Continuing to add predictors to existing dataset. Delete it to start again.")
  else:
    print("Cache disabled. The 'add predictors' dataset will be overwritten in 10 seconds (interrupt if unintended).")
    sleep(10)
# Start from the plain variates unless an existing dataset is being continued
if not (dataset_add_pre_exists and cache):
  dataset_variates.to_pickle(dataset_add_pre_path)
dataset_add_pre = pd.read_pickle(dataset_add_pre_path)

# Predictor progress
predictor_progress_index = 0
predictor_progress_label = widgets.Label(f"Predictor progress: {predictor_progress_index}/{len(predictor_list)}")
display(predictor_progress_label)

for predictor in predictor_list:
  # A predictor counts as already added when a column named 'pre_' + the file
  # name without its 4-character extension (e.g. '.tif') exists in the dataset
  if f"pre_{predictor}"[:-4] not in dataset_add_pre.columns:
    # Reload the last completed save, so sampling always starts from what is on disk
    if cache: dataset_add_pre = pd.read_pickle(dataset_add_pre_path)
    # Sample raster values
    # NOTE(review): the return value is unused, so sample_raster_values presumably
    # adds the sampled column to the dataframe in place - confirm against its definition
    predictor_path = join(predictor_final_dir, predictor)
    sample_raster_values(dataset_add_pre, predictor_path, predictor=True)
    # Save cached 'add_predictors' dataset if enabled
    if cache:
      dataset_add_pre_save = dataset_add_pre.copy() # Defragments the dataframe
      # Write to the scratch file, then swap it in with a single rename so the
      # previous save stays intact if the runtime stops part-way through writing
      dataset_add_pre_save.to_pickle(dataset_add_pre_tmp_path)
      os.replace(dataset_add_pre_tmp_path, dataset_add_pre_path)
  # Update predictor progress
  predictor_progress_index += 1
  predictor_progress_label.value = f"Predictor progress: {predictor_progress_index}/{len(predictor_list)}"

# Export completed 'add predictors' dataset if no cache
if not cache: dataset_add_pre.to_pickle(dataset_add_pre_path)

print(f"All predictors have been added to {dataset_add_pre_path}.")

# Drop columns (optional)

In [None]:
# Select the 'add predictors' dataset to drop columns.
# Each file is printed as a copy-paste ready assignment to `add_pre_dataset_name`,
# which is the variable the next cell reads.
for pkl in os.listdir(datasets_add_pre_dir):
  print(f"add_pre_dataset_name = '{pkl}'")

In [None]:
# File name of the 'add predictors' dataset to inspect
add_pre_dataset_name = 'elevation_corrected.pkl'

# Load the dataset from the 'add predictors' directory
add_pre_dataset_path = join(datasets_add_pre_dir, add_pre_dataset_name)
add_pre_dataset = pd.read_pickle(add_pre_dataset_path)

# Inspect existing columns (the sorted list is the cell's displayed output)
sorted(add_pre_dataset.columns)

In [None]:
drop_dataset_columns = True

if drop_dataset_columns:
  # Pattern matched against the column names; every column containing it is removed
  dropped_columns = 'longitude'
  # Keep only the columns that do not match, and overwrite the dataset on disk
  columns_to_keep = ~add_pre_dataset.columns.str.contains(dropped_columns)
  dropped_columns_dataset = add_pre_dataset.loc[:, columns_to_keep]
  dropped_columns_dataset.to_pickle(add_pre_dataset_path)
  # Reload from disk so the inspection below reflects the saved file
  add_pre_dataset = pd.read_pickle(add_pre_dataset_path)
sorted(add_pre_dataset.columns)

# Finalise GEDI corrected topography

In [None]:
# List the 'add predictors' datasets as copy-paste ready `final_dataset_name`
# assignments; pick the corrected elevation dataset to finalise.
for add_pre_file in os.listdir(datasets_add_pre_dir):
  print(f'final_dataset_name = "{add_pre_file}"')

In [None]:
final_dataset_name = "elevation_corrected.pkl"

# Load the 'add predictors' dataset that is being finalised
final_dataset_path = join(datasets_add_pre_dir, final_dataset_name)
final_dataset = pd.read_pickle(final_dataset_path)

# Remove the geometry column
final_dataset = final_dataset.drop('geometry', axis=1)

# Give every non-predictor column a 'var_' prefix; 'pre_' columns keep their name
final_dataset = final_dataset.rename(columns=lambda col: col if col.startswith('pre_') else 'var_' + col)

# Order the columns: variates first, then predictors, each group alphabetical
variate_columns = [col for col in final_dataset.columns if col.startswith('var_')]
predictor_columns = [col for col in final_dataset.columns if col.startswith('pre_')]
sorted_columns = [*sorted(variate_columns), *sorted(predictor_columns)]
final_dataset = final_dataset.reindex(columns=sorted_columns)

# Write to the final datasets directory and display what was saved
final_dataset_path = join(datasets_final_dir, final_dataset_name)
final_dataset.to_pickle(final_dataset_path)
pd.read_pickle(final_dataset_path)

# Finalise GEDI vegetation indices

In [None]:
# List the 'add predictors' datasets as copy-paste ready `final_dataset_name`
# assignments; pick the one to finalise in the next cell.
for add_pre_file in os.listdir(datasets_add_pre_dir):
  print(f'final_dataset_name = "{add_pre_file}"')

In [None]:
# Finalise a GEDI dataset. With `filter_with_tmf` enabled, the yearly predictor
# columns of each GEDI collection year are shifted so that every row lines up with
# the most recent GEDI year, and yearly predictors from that year onwards are
# removed. With it disabled the 'add predictors' dataset is used unchanged.
final_dataset_name = "agbd.pkl"

filter_with_tmf = True

# Load 'add predictors' dataset
dataset_add_pre_path = join(datasets_add_pre_dir, final_dataset_name)
dataset_add_pre = pd.read_pickle(dataset_add_pre_path)

if filter_with_tmf:
  # Define yearly predictors, add others if necessary (usually land-cover)
  yearly_predictors = ['forest_with_edge_effects', 'disturbance_with_edge_effects']

  # Column names of the 'add predictors' dataset (taken before the index identifier is added)
  dataset_add_pre_column_list = sorted(dataset_add_pre.columns, reverse=True)

  # Get list of GEDI years
  gedi_year_list = [int(x) for x in dataset_add_pre['year'].unique().tolist()]
  gedi_year_max = np.max(gedi_year_list)

  # Create an index identifier, used to restore the original row order after concatenation
  dataset_add_pre['index_record'] = dataset_add_pre.index

  # Create an empty list for storing GEDI yearly sub-datasets
  dataset_gedi_year_list = []

  # Iterate through each GEDI year and shift the sample to appropriate predictor year
  # Column names are kept the same to avoid issues when ordering features in model training/prediction
  # However, e.g., a '2021' predictor for a 2022 GEDI point will actually be a 2019 predictor for a 2020 GEDI point
  for gedi_year in gedi_year_list:
    dataset_gedi_year = dataset_add_pre[dataset_add_pre['year'] == gedi_year].copy()
    sample_year_modifier = gedi_year_max - gedi_year
    # Collect every old name -> shifted name pair first and rename once, so a
    # column that has already been shifted can never be shifted a second time
    shifted_column_names = {}
    for col in dataset_add_pre_column_list:
      for yearly_predictor in yearly_predictors:
        if yearly_predictor in col:
          # The last four characters of a yearly predictor column are its year
          predictor_year = int(col[-4:])
          original_sample = f"pre_{yearly_predictor}_{str(predictor_year)}"
          corrected_sample = f"pre_{yearly_predictor}_{str(predictor_year + sample_year_modifier)}"
          shifted_column_names[original_sample] = corrected_sample
    dataset_gedi_year.rename(columns=shifted_column_names, inplace=True)
    dataset_gedi_year_list.append(dataset_gedi_year)

  # Concatenate dataframes and sort
  final_dataset = pd.concat(dataset_gedi_year_list, ignore_index=True)
  final_dataset.sort_values('index_record', inplace=True)
  final_dataset.reset_index(drop=True, inplace=True)

  # Drop predictor years out of the data range for one of the GEDI years
  # NOTE(review): this removes every column holding any NA value, not only the
  # shifted yearly predictors - confirm that is intended for the other columns
  final_dataset.dropna(axis=1, how='any', inplace=True)

  # Drop yearly predictors from the most recent GEDI collection year (as timing
  # cannot be certain) and from any later year (they post-date the GEDI data)
  yearly_columns_to_drop = []
  for yearly_predictor in yearly_predictors:
    yearly_prefix = f"pre_{yearly_predictor}_"
    for column in final_dataset.columns:
      column_year = column[len(yearly_prefix):]
      if column.startswith(yearly_prefix) and column_year.isdigit() and int(column_year) >= gedi_year_max:
        yearly_columns_to_drop.append(column)
  final_dataset.drop(columns=yearly_columns_to_drop, inplace=True)

  # Drop the index identifier
  final_dataset.drop(columns=['index_record'], inplace=True)

else: final_dataset = dataset_add_pre.copy()

In [None]:
# Give every non-predictor column a 'var_' prefix; 'pre_' columns keep their name
final_dataset = final_dataset.rename(columns=lambda col: col if col.startswith('pre_') else 'var_' + col)

# Order the columns: variates first, then predictors, each group alphabetical
variate_columns = [col for col in final_dataset.columns if col.startswith('var_')]
predictor_columns = [col for col in final_dataset.columns if col.startswith('pre_')]
sorted_columns = [*sorted(variate_columns), *sorted(predictor_columns)]
final_dataset = final_dataset.reindex(columns=sorted_columns)

# Write to the final datasets directory and display what was saved
final_dataset_path = join(datasets_final_dir, final_dataset_name)
final_dataset.to_pickle(final_dataset_path)
pd.read_pickle(final_dataset_path)

# Finalise uploaded CSV dataset

In [None]:
# List the 'add predictors' datasets as copy-paste ready `final_dataset_name`
# assignments; pick the one to finalise in the next cell.
for add_pre_file in os.listdir(datasets_add_pre_dir):
  print(f'final_dataset_name = "{add_pre_file}"')

In [None]:
final_dataset_name = "rh_intact.pkl"

# Load the 'add predictors' dataset that is being finalised
dataset_add_pre_path = join(datasets_add_pre_dir, final_dataset_name)
final_dataset = pd.read_pickle(dataset_add_pre_path)

# Give every non-predictor column a 'var_' prefix; 'pre_' columns keep their name
final_dataset = final_dataset.rename(columns=lambda col: col if col.startswith('pre_') else 'var_' + col)

# Order the columns: variates first, then predictors, each group alphabetical
variate_columns = [col for col in final_dataset.columns if col.startswith('var_')]
predictor_columns = [col for col in final_dataset.columns if col.startswith('pre_')]
sorted_columns = [*sorted(variate_columns), *sorted(predictor_columns)]
final_dataset = final_dataset.reindex(columns=sorted_columns)

# Write to the final datasets directory and display what was saved
final_dataset_path = join(datasets_final_dir, final_dataset_name)
final_dataset.to_pickle(final_dataset_path)
pd.read_pickle(final_dataset_path)

# Export to .gpkg (optional)

In [None]:
# Select the dataset .pkl to export as a .gpkg.
# The 'add predictors' directory is listed because that is where the next cell
# loads the dataset from (its columns are unprefixed and still include 'geometry').
for pkl in os.listdir(datasets_add_pre_dir):
  print(f'dataset_gpkg_name = "{pkl}"')

In [None]:
dataset_gpkg_name = "agbd.pkl"

# Load the chosen dataset from the 'add predictors' directory
dataset_gpkg_pkl_path = join(datasets_add_pre_dir, dataset_gpkg_name)
dataset_gpkg_pkl = pd.read_pickle(dataset_gpkg_pkl_path)

# Print every column except 'geometry' as a copy-paste ready list for the next cell
exportable_columns = sorted(col for col in dataset_gpkg_pkl.columns if col != "geometry")
print("selected_gpkg_columns = [")
for col in exportable_columns:
  print(f'  "{col}",')
print("]")

In [None]:
# Attribute columns to include in the GeoPackage export
selected_gpkg_columns = [
  "agbd",
  "beam",
  "pre_topo_cor_unsmooth_elevation",
  "shot_number",
]

# The geometry column is always exported alongside the selected attributes
selected_gpkg_columns = [*selected_gpkg_columns, 'geometry']

In [None]:
# Build a GeoDataFrame from the selected columns, using 'geometry' as the geometry column
# NOTE(review): no CRS is assigned here - confirm whether the exported file needs one set explicitly
dataset_gpkg_geodataframe = gpd.GeoDataFrame(dataset_gpkg_pkl[selected_gpkg_columns], geometry='geometry')

# Write a GeoPackage named after the dataset; the last four characters of the
# file name (the '.pkl' extension) are replaced with '.gpkg'
dataset_gpkg_export = join(datasets_gpkg_dir, f"{dataset_gpkg_name[:-4]}.gpkg")
dataset_gpkg_geodataframe.to_file(dataset_gpkg_export, driver="GPKG")

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# `runtime` is also imported in the imports cell near the top of the notebook;
# repeating the import here is harmless
from google.colab import runtime
# NOTE(review): presumably releases the Colab runtime once all previous cells have finished (per the section title) - confirm
runtime.unassign()