<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/4_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports, directories and global functions

In [None]:
# Project root on Google Drive
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive at the mount point that matches the base_dir prefix
from google.colab import drive
import os
import sys

on_shared_drive = base_dir.startswith('/gdrive/Shareddrives/')
on_personal_drive = base_dir.startswith('/content/drive/MyDrive/')
if on_shared_drive:
    drive.mount('/gdrive', force_remount=True)
elif on_personal_drive:
    drive.mount('/content/drive', force_remount=True)
    # Only the personal-drive project folder is created automatically
    os.makedirs(base_dir, exist_ok=True)
else:
    print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Make modules stored in the project folder importable (added once only)
_path_to_add = os.path.realpath(base_dir)
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs
!pip install geopandas

In [None]:
# Imports
import geopandas as gpd
from google.colab import runtime
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs
from os.path import exists, join
from osgeo import gdal
gdal.UseExceptions()
import pandas as pd
import requests
from scipy.stats import norm
from shutil import copyfile, move
from sklearn.mixture import GaussianMixture
from time import sleep

In [None]:
# 1_areas directories
areas_dir = join(base_dir, '1_areas')
polygons_dir = join(areas_dir, 'polygons')
# Despite the '_dir' suffix, this is the path of the template raster file
template_dir = join(areas_dir, "template.tif")

# 2_targets directories
targets_final_dir = join(base_dir, "2_targets", "pkl_final")

# 3_features directories
features_dir = join(base_dir, "3_features")
alpha_earth_dir = join(features_dir, "alpha_earth")
edge_effects_dir = join(features_dir, "binary_edge_effects")
continuous_final_dir = join(features_dir, "continuous_final")
topo_dsm_final_dir = join(features_dir, "topo_dsm_final")
topo_dtm_final_dir = join(features_dir, "topo_dtm_final")
coast_dir = join(features_dir, 'coast')
feature_final_dir = join(features_dir, "final")

# 4_datasets directories
datasets_dir = join(base_dir, "4_datasets")
datasets_tar_dir = join(datasets_dir, "targets")
datasets_add_fea_dir = join(datasets_dir, "add_features")
datasets_final_dir = join(datasets_dir, "final")
datasets_gpkg_dir = join(datasets_dir, "gpkg")

# Create the output directories of this notebook (parents are created as needed)
for directory in (
    feature_final_dir,
    datasets_dir,
    datasets_tar_dir,
    datasets_add_fea_dir,
    datasets_final_dir,
    datasets_gpkg_dir,
):
    makedirs(directory, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -11111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    """Write an array to a single-band GeoTIFF laid out like a template raster.

    The output takes its width, height, geotransform and projection from
    band 1 / the dataset of `template`.

    Args:
        input_array: array written to band 1 of the output.
        output_tif: path of the GeoTIFF to create.
        template: path of the raster providing size and georeferencing.
        nodatavalue: value registered as nodata on the output band.
        compress: when truthy, the file is created with ZSTD compression (level 1).
        dtype: GDAL data type of the output; replaced by GDT_Int16 whenever
            `input_array` has dtype int16.
    """
    reference_ds = gdal.Open(template)
    reference_band = reference_ds.GetRasterBand(1)
    geotransform = reference_ds.GetGeoTransform()
    projection = reference_ds.GetProjection()
    # ZSTD level 1: good speed / size ratio
    creation_options = ['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'] if compress else []
    # int16 input always overrides the requested output type
    if input_array.dtype == 'int16':
        dtype = gdal.GDT_Int16
    output_ds = gdal.GetDriverByName("GTiff").Create(
        output_tif, reference_band.XSize, reference_band.YSize, 1, dtype, options=creation_options)
    output_ds.GetRasterBand(1).WriteArray(input_array)
    output_ds.GetRasterBand(1).SetNoDataValue(nodatavalue)
    output_ds.SetGeoTransform(geotransform)
    output_ds.SetProjection(projection)
    # Dropping the dataset references closes the files and flushes the output
    reference_ds = output_ds = None

# Global function: sample raster values
def sample_raster_values(pd_dataframe, raster_path, feature=False, geometry_column='geometry'):
    """Sample band 1 of a raster at every point geometry and store the result as a new column.

    The dataframe is modified in place and nothing is returned. The new column
    is named after the raster file (last path component without its final four
    characters, i.e. the '.tif' extension), prefixed with 'fea_' when `feature`
    is True.

    Args:
        pd_dataframe: dataframe holding point geometries exposing `.x` and `.y`.
        raster_path: '/'-separated path of the raster to sample.
        feature: when True, prefix the new column name with 'fea_'.
        geometry_column: name of the column holding the point geometries.

    Points that fall outside the raster receive the band's nodata value, or
    NaN when the band has no nodata value.
    """
    raster_name = raster_path.split('/')[-1][:-4]
    if feature: raster_name = 'fea_' + raster_name
    raster = gdal.Open(raster_path)
    band = raster.GetRasterBand(1)
    # Only origin and pixel size are used; the rotation terms are ignored
    # (assumes a north-up raster, as the original implementation did)
    origin_x, pixel_width, _, origin_y, _, pixel_height = raster.GetGeoTransform()
    raster_array = band.ReadAsArray()
    nodata = band.GetNoDataValue()
    outside_value = nodata if nodata is not None else np.nan
    rows, cols = raster_array.shape
    sampled_values = []
    for geom in pd_dataframe[geometry_column]:
        # floor() rather than int(): int() truncates toward zero, so a point up to
        # one pixel left of / above the raster would get index 0, pass the bounds
        # check and be assigned the value of an edge pixel it does not lie in
        x_idx = int(np.floor((geom.x - origin_x) / pixel_width))
        y_idx = int(np.floor((geom.y - origin_y) / pixel_height))
        # bounds check
        if 0 <= x_idx < cols and 0 <= y_idx < rows: sampled_values.append(raster_array[y_idx, x_idx])
        else: sampled_values.append(outside_value)
    pd_dataframe[raster_name] = sampled_values
    raster = band = None

# Global function: histogram-based density outlier bounds
# Filters sparse regions at distribution extremes based on bin counts
# Dense regions kept regardless of distance from median, only isolated tail values flagged
def histogram_outlier_bounds(data, title, sparse_threshold_percent=0.01):
    """Return (lower_bound, upper_bound) enclosing the dense part of a distribution.

    A histogram with automatically chosen bins is scanned inward from both
    tails. The lower bound is the left edge of the first bin holding at least
    `min_bin_count` values and the upper bound is the right edge of the last
    such bin, where `min_bin_count` is the larger of 10 and
    `sparse_threshold_percent` percent of the number of values.

    If no bin reaches `min_bin_count` (e.g. small datasets), the full data
    range is returned so that nothing is filtered, instead of failing.

    A histogram of (at most 100,000 randomly selected) values with both bounds
    is plotted and a summary of the would-be filtered counts is printed.

    Args:
        data: 1-D array-like of numeric values.
        title: title of the plot.
        sparse_threshold_percent: bins with fewer than this percentage of all
            values (but never fewer than 10 values) count as sparse.

    Returns:
        Tuple (lower_bound, upper_bound).

    Raises:
        ValueError: if `data` is empty.
    """
    data = np.asarray(data)
    if data.size == 0:
        raise ValueError("histogram_outlier_bounds requires at least one data point.")
    min_bin_count = max(10, int(len(data) * sparse_threshold_percent / 100))
    counts, bin_edges = np.histogram(data, bins='auto')
    # Indices of all dense bins; the first and last delimit the kept range
    dense_bins = np.flatnonzero(counts >= min_bin_count)
    if dense_bins.size > 0:
        # Bounds from dense bin edges
        lower_bound = bin_edges[dense_bins[0]]
        upper_bound = bin_edges[dense_bins[-1] + 1]
    else:
        # No dense bin exists: keep the whole range rather than raising StopIteration
        print(f"No histogram bin contains at least {min_bin_count} points, so no tail values are filtered.")
        lower_bound, upper_bound = bin_edges[0], bin_edges[-1]
    # Count filtered points
    n_below_lower = np.sum(data < lower_bound)
    n_above_upper = np.sum(data > upper_bound)
    n_remaining = len(data) - n_below_lower - n_above_upper
    # Plot distribution with bounds (subsampled to keep plotting fast)
    random_selection = np.random.choice(data, size=min(100_000, len(data)), replace=False)
    plt.hist(random_selection, bins='auto')
    plt.axvline(lower_bound, color='red', linestyle='--', label=f'Lower: {lower_bound:.1f}')
    plt.axvline(upper_bound, color='red', linestyle='--', label=f'Upper: {upper_bound:.1f}')
    plt.title(title)
    plt.legend()
    plt.show()
    print(f"Histogram density bounds (tails only): [{lower_bound:.2f}, {upper_bound:.2f}]")
    print(f"Filtered below lower bound: {n_below_lower:,}")
    print(f"Filtered above upper bound: {n_above_upper:,}")
    print(f"Remaining points: {n_remaining:,} out of {len(data):,} ({100 * (1 - n_remaining / len(data)):.2f}% removed)")
    return lower_bound, upper_bound

# Finalise targets

## GEDI elevation

In [None]:
# GEDI DTM (Digital Terrain Model) dataset preparation

# GEDI elev_lowestmode is WGS84 ellipsoidal height
# Copernicus DEM is EGM2008 orthometric height
# Geoid correction applied: H = h - N (ellipsoidal to orthometric)

# Outlier filtering compares Geoid corrected GEDI elevation to Copernicus DSM
# GEDI typically lower than DSM in forest (ground vs canopy surface)
# Extreme differences indicate measurement error or cloud contamination

# Download EGM2008 geoid model
earth_gravitational_model_url = 'https://download.agisoft.com/gtg/us_nga_egm2008_1.tif'
earth_gravitational_model_path = join(datasets_tar_dir, 'earth_gravitational_model.tif')
if not exists(earth_gravitational_model_path):
  # Stream into a temporary '.part' file and rename only after success, so a
  # failed or interrupted download never leaves a corrupt raster behind that
  # later runs would treat as already downloaded
  earth_gravitational_model_part_path = earth_gravitational_model_path + '.part'
  with requests.get(earth_gravitational_model_url, allow_redirects=True, stream=True, timeout=60) as request:
    # Fail loudly on HTTP errors instead of saving an error page as a .tif
    request.raise_for_status()
    with open(earth_gravitational_model_part_path, 'wb') as earth_gravitational_model_file:
      for chunk in request.iter_content(chunk_size=1024 * 1024):
        earth_gravitational_model_file.write(chunk)
  os.replace(earth_gravitational_model_part_path, earth_gravitational_model_path)
  print(f'EGM raster downloaded to: {earth_gravitational_model_path}')
else: print(f'EGM raster already exists at: {earth_gravitational_model_path}')

# Select GEDI .pkl containing elev_lowestmode
# Prints one ready-to-paste `targets_pkl = '...'` line per entry in targets_final_dir
for pkl in os.listdir(targets_final_dir):
    print(f"targets_pkl = '{pkl}'")

In [None]:
# Target .pkl (in targets_final_dir) that contains the 'elev_lowestmode' column
targets_pkl = 'GEDI04_A.pkl'

# Use the rectangular GEDI area extent, instead of the potentially complex project area
use_gedi_area_polygon = True

# Histogram-based outlier filtering
histogram_based_outlier_filtering = True
# Bins with fewer than this percentage of total data are considered sparse
sparse_threshold_percent = 0.01

targets_read_pkl = pd.read_pickle(join(targets_final_dir, targets_pkl))

# Ensure all points are in the training area
if use_gedi_area_polygon: project_area_polygon = gpd.read_file(join(polygons_dir, 'gedi_area.gpkg'))
else: project_area_polygon = gpd.read_file(join(polygons_dir, 'project_area.gpkg'))
targets_geodataframe = gpd.GeoDataFrame(targets_read_pkl, geometry='geometry')
clipped_targets_gdf = gpd.clip(targets_geodataframe, project_area_polygon)
clipped_targets_df = pd.DataFrame(clipped_targets_gdf)
print(f"{len(targets_read_pkl) - len(clipped_targets_df)} out of {len(targets_read_pkl)} data points were outside the training area and removed.")

# Drop NA values, if any
# Note: rows are dropped if ANY column is NA, before the column selection below
dataset_na_values = clipped_targets_df.isna().any(axis=1).sum()
dataset_targets = clipped_targets_df.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")
# Keep only the listed columns that are actually present
columns_to_keep = ['shot_number','beam','geometry','elev_lowestmode','sensitivity']
dataset_targets = dataset_targets[[col for col in columns_to_keep if col in dataset_targets.columns]]

# Sample EGM values (adds the 'earth_gravitational_model' column) and apply
# the geoid correction H = h - N
sample_raster_values(dataset_targets, earth_gravitational_model_path)
dataset_targets['gedi_elevation'] = dataset_targets['elev_lowestmode'] - dataset_targets['earth_gravitational_model']

# Sample base DEM values (adds the 'base_dem_dsm' column)
base_dem_path = join(areas_dir, 'base_dem_dsm.tif')
sample_raster_values(dataset_targets, base_dem_path)

# Calculate elevation difference
dataset_targets['gedi_elevation_diff'] = dataset_targets['gedi_elevation'] - dataset_targets['base_dem_dsm']

# Histogram-based outlier filtering
# Only computes and plots the bounds; they are applied in the next cell
if histogram_based_outlier_filtering:
  elevation_diff_lower_bound, elevation_diff_upper_bound = histogram_outlier_bounds(
      np.array(dataset_targets['gedi_elevation_diff']),
      title="GEDI − Base DEM elevation",
      sparse_threshold_percent=sparse_threshold_percent)

In [None]:
# Manual override if histogram bounds are unsuitable
override_bounds = False
if override_bounds: elevation_diff_lower_bound, elevation_diff_upper_bound = -50, 5

# Apply bounds (inclusive on both sides)
# Skipped when histogram filtering is disabled and no override is set: in that
# case no bounds have been defined by this run, and using them would either
# raise a NameError or silently reuse stale bounds from an earlier run
dataset_targets_filtered = dataset_targets
if histogram_based_outlier_filtering or override_bounds:
    within_bounds = (
        (dataset_targets['gedi_elevation_diff'] >= elevation_diff_lower_bound) &
        (dataset_targets['gedi_elevation_diff'] <= elevation_diff_upper_bound)
    )
    dataset_targets_filtered = dataset_targets[within_bounds]
# Drop correction and filtering columns (returns a new frame, dataset_targets is untouched)
dataset_targets_filtered = dataset_targets_filtered.drop(columns=['elev_lowestmode','earth_gravitational_model','base_dem_dsm','gedi_elevation_diff'])

# Export to .pkl, then read it back so the variable reflects the stored file
dataset_targets_path = join(datasets_tar_dir, 'gedi_elevation.pkl')
dataset_targets_filtered.to_pickle(dataset_targets_path)
dataset_targets_filtered = pd.read_pickle(dataset_targets_path)
print(f"The GEDI elevation dataset has been processed and exported to: {dataset_targets_path}.")

## GEDI vegetation indices

In [None]:
# Select the GEDI .pkl with the desired vegetation index (e.g. 'agbd')
# Prints one ready-to-paste `targets_pkl = '...'` line per entry in targets_final_dir
for pkl in os.listdir(targets_final_dir):
    print(f"targets_pkl = '{pkl}'")

In [None]:
# Target .pkl (in targets_final_dir) and the name of its vegetation index column
targets_pkl = 'GEDI04_A.pkl'
dataset_name = 'agbd'

# Use the rectangular GEDI area extent, instead of the potentially complex project area
use_gedi_area_polygon = True

targets_read_pkl = pd.read_pickle(join(targets_final_dir, targets_pkl))

# Ensure all points are in the training area
if use_gedi_area_polygon: project_area_polygon = gpd.read_file(join(polygons_dir, 'gedi_area.gpkg'))
else: project_area_polygon = gpd.read_file(join(polygons_dir, 'project_area.gpkg'))
targets_geodataframe = gpd.GeoDataFrame(targets_read_pkl, geometry='geometry')
clipped_targets_gdf = gpd.clip(targets_geodataframe, project_area_polygon)
clipped_targets_df = pd.DataFrame(clipped_targets_gdf)
print(f"{len(targets_read_pkl) - len(clipped_targets_df)} out of {len(targets_read_pkl)} data points were outside the training area and removed.")

# Drop NA values, if any (a row is dropped if ANY of its columns is NA)
dataset_na_values = clipped_targets_df.isna().any(axis=1).sum()
dataset_targets = clipped_targets_df.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

# Create 'year' column from timestamp (first four characters of its string form)
dataset_targets['year'] = dataset_targets['timestamp'].astype(str).str[:4].astype(int)

# Drop unneeded columns
# errors='ignore' so that an input without 'elev_lowestmode' does not raise a
# KeyError; 'timestamp' is guaranteed to exist here because it was used above
dataset_targets = dataset_targets.drop(columns=['elev_lowestmode','timestamp'], errors='ignore')

In [None]:
# Filter with TMF data
# Filter non-forest and 'new changes' in the collection year
# TMF provides annual disturbance; sub-annual timing relative to GEDI collection is unknown
# Points with same-year land-cover changes excluded due to temporal ambiguity
# Predictions use the previous year's disturbance state (Dec 31st of year prior to prediction)
filter_with_tmf = True

# Distance threshold in metres for edge effect filtering
# Ecological edge effects take several years to appear
# Threshold mainly accounts for GEDI footprint geolocation inaccuracy
edge_distance_threshold = 30

# Histogram-based outlier filtering
histogram_based_outlier_filtering = True
# Bins with fewer than this percentage of total data are considered sparse
sparse_threshold_percent = 0.01

dataset_targets_filtered = dataset_targets.copy()

# Helper: per-row lookup of the sampled edge-distance column matching each row's year
def _edge_distance_for_year(dataframe, feature_type, year_offset=0):
    """Return a float Series with, per row, the value of `{feature_type}_{year + year_offset}`."""
    values = pd.Series(np.nan, index=dataframe.index, dtype='float64')
    for year in dataframe['year'].unique():
        rows_in_year = dataframe['year'] == year
        year_column = f"{feature_type}_{year + year_offset}"
        values.loc[rows_in_year] = dataframe.loc[rows_in_year, year_column].astype('float64')
    return values

if filter_with_tmf:
    edge_feature_types = ['forest_edge_distance', 'disturbance_edge_distance']
    print(f"There are {len(dataset_targets)} data points in the unfiltered dataset.")

    # Every point needs the rasters of its collection year AND of the year before
    # (the 'new change' filters below compare both), so both are checked per year
    gedi_year_list = sorted(dataset_targets['year'].unique().tolist())
    required_year_list = sorted(set(gedi_year_list) | {year - 1 for year in gedi_year_list})
    missing_year_list = []
    for year in required_year_list:
        for feature_type in edge_feature_types:
            feature_path = join(edge_effects_dir, f"{feature_type}_{year}.tif")
            if not exists(feature_path):
                print(f"{feature_type}_{year}.tif does not exist, so GEDI data that depend on it (years {year} and {year + 1}) have been removed.")
                if year not in missing_year_list: missing_year_list.append(year)

    # Drop points whose own-year or previous-year rasters are missing
    gedi_years = dataset_targets_filtered['year']
    has_required_rasters = ~(gedi_years.isin(missing_year_list) | (gedi_years - 1).isin(missing_year_list))
    dataset_targets_filtered = dataset_targets_filtered[has_required_rasters].copy()
    print(f"{len(dataset_targets) - len(dataset_targets_filtered)} data points were dropped due to missing TMF years.")

    # Sample relevant forest and disturbance rasters (only the years still needed)
    remaining_years = set(dataset_targets_filtered['year'].unique().tolist())
    for year in sorted(remaining_years | {year - 1 for year in remaining_years}):
        for feature_type in edge_feature_types:
            feature_path = join(edge_effects_dir, f"{feature_type}_{year}.tif")
            sample_raster_values(dataset_targets_filtered, feature_path)

    # Filter non-forest (negative edge_distance = outside forest class)
    is_non_forest = _edge_distance_for_year(dataset_targets_filtered, 'forest_edge_distance') < 0
    dataset_targets_filtered = dataset_targets_filtered[~is_non_forest]
    print(f"{int(is_non_forest.sum())} non-forest data points were dropped.")

    # Filter new disturbance within threshold distance of disturbance edge
    # Positive or small negative edge_distance = close to or inside disturbance
    # 'New' = within the threshold this year but beyond it the year before
    disturbance_current = _edge_distance_for_year(dataset_targets_filtered, 'disturbance_edge_distance')
    disturbance_previous = _edge_distance_for_year(dataset_targets_filtered, 'disturbance_edge_distance', year_offset=-1)
    is_new_disturbance = (disturbance_current >= -edge_distance_threshold) & (disturbance_previous < -edge_distance_threshold)
    dataset_targets_filtered = dataset_targets_filtered[~is_new_disturbance]
    print(f"{int(is_new_disturbance.sum())} 'new disturbance' data points were dropped.")

    # Filter new forest edge effects within threshold distance of forest edge
    # Small positive edge_distance = near forest edge (interior side)
    # 'New' = within the threshold this year but beyond it the year before
    forest_current = _edge_distance_for_year(dataset_targets_filtered, 'forest_edge_distance')
    forest_previous = _edge_distance_for_year(dataset_targets_filtered, 'forest_edge_distance', year_offset=-1)
    is_new_forest_edge = (forest_current <= edge_distance_threshold) & (forest_previous > edge_distance_threshold)
    dataset_targets_filtered = dataset_targets_filtered[~is_new_forest_edge]
    print(f"{int(is_new_forest_edge.sum())} 'new forest edge' data points were dropped.")
    print(f"There are {len(dataset_targets_filtered)} data points remaining in the filtered dataset.")

    # Drop filtering columns
    dataset_targets_filtered = dataset_targets_filtered.loc[:,~dataset_targets_filtered.columns.str.contains(
        'forest_edge_distance|disturbance_edge_distance')].reset_index(drop=True)

# Histogram-based outlier filtering
# Only computes and plots the bounds; they are applied in the next cell
if histogram_based_outlier_filtering:
  vegetation_lower_bound, vegetation_upper_bound = histogram_outlier_bounds(
      np.array(dataset_targets_filtered[dataset_name]),
      title=f"GEDI {dataset_name.upper()} distribution",
      sparse_threshold_percent=sparse_threshold_percent)

In [None]:
# Manual override if histogram bounds are unsuitable
override_bounds = False
if override_bounds: vegetation_lower_bound, vegetation_upper_bound = 0, 800

# Apply bounds (inclusive on both sides)
# Skipped when histogram filtering is disabled and no override is set: in that
# case no bounds have been defined by this run, and using them would either
# raise a NameError or silently reuse stale bounds from an earlier run
if histogram_based_outlier_filtering or override_bounds:
    within_bounds = (
        (dataset_targets_filtered[dataset_name] >= vegetation_lower_bound) &
        (dataset_targets_filtered[dataset_name] <= vegetation_upper_bound)
    )
    dataset_targets_filtered = dataset_targets_filtered[within_bounds]
dataset_targets_filtered = dataset_targets_filtered.reset_index(drop=True)

# Export to .pkl, then read it back so the variable reflects the stored file
dataset_targets_path = join(datasets_tar_dir, f'{dataset_name}.pkl')
dataset_targets_filtered.to_pickle(dataset_targets_path)
dataset_targets_filtered = pd.read_pickle(dataset_targets_path)
print(f"The GEDI {dataset_name.upper()} dataset has been processed and exported to: {dataset_targets_path}.")

## Uploaded data

In [None]:
# Select the uploaded target dataset to compile
# Prints one ready-to-paste `targets_pkl = '...'` line per entry in targets_final_dir
for pkl in os.listdir(targets_final_dir):
    print(f"targets_pkl = '{pkl}'")

In [None]:
# Uploaded target .pkl (in targets_final_dir)
targets_pkl = 'user_upload.pkl'

# Name of target column for outlier filtering
target_column = 'target'

# Use the rectangular GEDI area extent, instead of the potentially complex project area
use_gedi_area_polygon = True

# Histogram-based outlier filtering
histogram_based_outlier_filtering = True
# Bins with fewer than this percentage of total data are considered sparse
sparse_threshold_percent = 0.01

targets_read_pkl = pd.read_pickle(join(targets_final_dir, targets_pkl))

# Ensure all points are in the training area
area_polygon_filename = 'gedi_area.gpkg' if use_gedi_area_polygon else 'project_area.gpkg'
project_area_polygon = gpd.read_file(join(polygons_dir, area_polygon_filename))
targets_geodataframe = gpd.GeoDataFrame(targets_read_pkl, geometry='geometry')
clipped_targets_gdf = gpd.clip(targets_geodataframe, project_area_polygon)
clipped_targets_df = pd.DataFrame(clipped_targets_gdf)
print(f"{len(targets_read_pkl) - len(clipped_targets_df)} out of {len(targets_read_pkl)} data points were outside the training area and removed.")

# Drop NA values, if any (a row is removed when any of its columns is NA)
rows_with_na = clipped_targets_df.isna().any(axis=1)
dataset_na_values = rows_with_na.sum()
dataset_targets = clipped_targets_df[~rows_with_na].reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

# Histogram-based outlier filtering
# Only computes and plots the bounds; they are applied in the next cell
if histogram_based_outlier_filtering:
    target_values = np.array(dataset_targets[target_column])
    target_lower_bound, target_upper_bound = histogram_outlier_bounds(
        target_values,
        title=f"{target_column} distribution",
        sparse_threshold_percent=sparse_threshold_percent)

In [None]:
# Name of the exported dataset file (without extension)
dataset_name = 'user_upload'

# Manual override if histogram bounds are unsuitable
override_bounds = False
if override_bounds: target_lower_bound, target_upper_bound = 0, 100

# Apply bounds (inclusive on both sides)
# Also applied when only the manual override is set, so that override bounds
# are not silently ignored when histogram filtering is disabled
if histogram_based_outlier_filtering or override_bounds:
    dataset_targets = dataset_targets[
        (dataset_targets[target_column] >= target_lower_bound) &
        (dataset_targets[target_column] <= target_upper_bound)
    ].reset_index(drop=True)
    print(f"{len(dataset_targets)} data points remaining after outlier filtering.")

# Export to .pkl, then read it back so the variable reflects the stored file
dataset_targets_path = join(datasets_tar_dir, f'{dataset_name}.pkl')
dataset_targets.to_pickle(dataset_targets_path)
dataset_targets = pd.read_pickle(dataset_targets_path)
print(f"The {dataset_name} dataset has been processed and exported to: {dataset_targets_path}.")

# Finalise features

In [None]:
# Round latitude and longitude rasters and finalise
# These help account for ecological spatial patterns we do not have feature rasters for.
# Precision should strike a balance between accuracy and overfitting/model training time
precision = 3 # 3 is equivalent to a precision of ~111 m at the equator, i.e. every three to four 30 m pixels
coordinates = ['latitude.tif', 'longitude.tif']
for coordinate in coordinates:
  coordinate_path = join(areas_dir, coordinate)
  coordinate_rounded_path = join(feature_final_dir, coordinate)
  # Existing outputs are never overwritten
  if exists(coordinate_rounded_path):
    print(f"{coordinate} already exists in {feature_final_dir}")
    continue
  coordinate_array = gdal.Open(coordinate_path).ReadAsArray()
  coordinate_array_round = coordinate_array.round(precision)
  export_array_as_tif(coordinate_array_round, coordinate_rounded_path)
  print(f"{coordinate} has been rounded and exported to {feature_final_dir}")

In [None]:
# When True, only the Alpha Earth rasters are listed; otherwise the
# topographic, coast and LCLUC rasters are listed
use_alpha_earth_features = False

# Compile and verify final feature list
feature_list = []

if use_alpha_earth_features:
  # Add Alpha Earth features
  feature_list.extend(join(alpha_earth_dir, feature) for feature in os.listdir(alpha_earth_dir))
else:
  # Add DSM topographic features
  feature_list.extend(join(topo_dsm_final_dir, feature) for feature in os.listdir(topo_dsm_final_dir))
  # Add DTM topographic features, if they exist
  if exists(topo_dtm_final_dir):
    feature_list.extend(join(topo_dtm_final_dir, feature) for feature in os.listdir(topo_dtm_final_dir))
  # Add coast proximity if it exists
  cost_proximity_path = join(coast_dir,'coast_proximity_km.tif')
  if exists(cost_proximity_path):
    feature_list.append(cost_proximity_path)
  # Add LCLUC continuous features
  feature_list.extend(join(continuous_final_dir, feature) for feature in os.listdir(continuous_final_dir))
  # Add LCLUC binary edge effect features
  feature_list.extend(join(edge_effects_dir, feature) for feature in os.listdir(edge_effects_dir))

feature_list.sort()

# Print the list as '<parent directory>/<file name without its last four characters>'
# entries, formatted so it can be pasted into the next cell and edited there
print("feature_list = [")
for feature in feature_list:
  print(f"'{feature.split('/')[-2]}/{feature.split('/')[-1][:-4]}',")
print(']')

In [None]:
# Final feature selection, pasted from the output of the previous cell and edited by hand.
# Each entry is '<sub-directory of features_dir>/<raster name without .tif>';
# the transfer cell below resolves it to join(features_dir, f"{feature}.tif").
# Commented-out entries are excluded from the final feature set.
feature_list = [
'binary_edge_effects/disturbance_edge_distance_1990',
'binary_edge_effects/disturbance_edge_distance_1991',
'binary_edge_effects/disturbance_edge_distance_1992',
'binary_edge_effects/disturbance_edge_distance_1993',
'binary_edge_effects/disturbance_edge_distance_1994',
'binary_edge_effects/disturbance_edge_distance_1995',
'binary_edge_effects/disturbance_edge_distance_1996',
'binary_edge_effects/disturbance_edge_distance_1997',
'binary_edge_effects/disturbance_edge_distance_1998',
'binary_edge_effects/disturbance_edge_distance_1999',
'binary_edge_effects/disturbance_edge_distance_2000',
'binary_edge_effects/disturbance_edge_distance_2001',
'binary_edge_effects/disturbance_edge_distance_2002',
'binary_edge_effects/disturbance_edge_distance_2003',
'binary_edge_effects/disturbance_edge_distance_2004',
'binary_edge_effects/disturbance_edge_distance_2005',
'binary_edge_effects/disturbance_edge_distance_2006',
'binary_edge_effects/disturbance_edge_distance_2007',
'binary_edge_effects/disturbance_edge_distance_2008',
'binary_edge_effects/disturbance_edge_distance_2009',
'binary_edge_effects/disturbance_edge_distance_2010',
'binary_edge_effects/disturbance_edge_distance_2011',
'binary_edge_effects/disturbance_edge_distance_2012',
'binary_edge_effects/disturbance_edge_distance_2013',
'binary_edge_effects/disturbance_edge_distance_2014',
'binary_edge_effects/disturbance_edge_distance_2015',
'binary_edge_effects/disturbance_edge_distance_2016',
'binary_edge_effects/disturbance_edge_distance_2017',
'binary_edge_effects/disturbance_edge_distance_2018',
'binary_edge_effects/disturbance_edge_distance_2019',
'binary_edge_effects/disturbance_edge_distance_2020',
'binary_edge_effects/disturbance_edge_distance_2021',
'binary_edge_effects/disturbance_edge_distance_2022',
'binary_edge_effects/disturbance_edge_distance_2023',
'binary_edge_effects/disturbance_edge_distance_2024',
'binary_edge_effects/disturbance_local_density_1990',
'binary_edge_effects/disturbance_local_density_1991',
'binary_edge_effects/disturbance_local_density_1992',
'binary_edge_effects/disturbance_local_density_1993',
'binary_edge_effects/disturbance_local_density_1994',
'binary_edge_effects/disturbance_local_density_1995',
'binary_edge_effects/disturbance_local_density_1996',
'binary_edge_effects/disturbance_local_density_1997',
'binary_edge_effects/disturbance_local_density_1998',
'binary_edge_effects/disturbance_local_density_1999',
'binary_edge_effects/disturbance_local_density_2000',
'binary_edge_effects/disturbance_local_density_2001',
'binary_edge_effects/disturbance_local_density_2002',
'binary_edge_effects/disturbance_local_density_2003',
'binary_edge_effects/disturbance_local_density_2004',
'binary_edge_effects/disturbance_local_density_2005',
'binary_edge_effects/disturbance_local_density_2006',
'binary_edge_effects/disturbance_local_density_2007',
'binary_edge_effects/disturbance_local_density_2008',
'binary_edge_effects/disturbance_local_density_2009',
'binary_edge_effects/disturbance_local_density_2010',
'binary_edge_effects/disturbance_local_density_2011',
'binary_edge_effects/disturbance_local_density_2012',
'binary_edge_effects/disturbance_local_density_2013',
'binary_edge_effects/disturbance_local_density_2014',
'binary_edge_effects/disturbance_local_density_2015',
'binary_edge_effects/disturbance_local_density_2016',
'binary_edge_effects/disturbance_local_density_2017',
'binary_edge_effects/disturbance_local_density_2018',
'binary_edge_effects/disturbance_local_density_2019',
'binary_edge_effects/disturbance_local_density_2020',
'binary_edge_effects/disturbance_local_density_2021',
'binary_edge_effects/disturbance_local_density_2022',
'binary_edge_effects/disturbance_local_density_2023',
'binary_edge_effects/disturbance_local_density_2024',
'binary_edge_effects/forest_edge_distance_1990',
'binary_edge_effects/forest_edge_distance_1991',
'binary_edge_effects/forest_edge_distance_1992',
'binary_edge_effects/forest_edge_distance_1993',
'binary_edge_effects/forest_edge_distance_1994',
'binary_edge_effects/forest_edge_distance_1995',
'binary_edge_effects/forest_edge_distance_1996',
'binary_edge_effects/forest_edge_distance_1997',
'binary_edge_effects/forest_edge_distance_1998',
'binary_edge_effects/forest_edge_distance_1999',
'binary_edge_effects/forest_edge_distance_2000',
'binary_edge_effects/forest_edge_distance_2001',
'binary_edge_effects/forest_edge_distance_2002',
'binary_edge_effects/forest_edge_distance_2003',
'binary_edge_effects/forest_edge_distance_2004',
'binary_edge_effects/forest_edge_distance_2005',
'binary_edge_effects/forest_edge_distance_2006',
'binary_edge_effects/forest_edge_distance_2007',
'binary_edge_effects/forest_edge_distance_2008',
'binary_edge_effects/forest_edge_distance_2009',
'binary_edge_effects/forest_edge_distance_2010',
'binary_edge_effects/forest_edge_distance_2011',
'binary_edge_effects/forest_edge_distance_2012',
'binary_edge_effects/forest_edge_distance_2013',
'binary_edge_effects/forest_edge_distance_2014',
'binary_edge_effects/forest_edge_distance_2015',
'binary_edge_effects/forest_edge_distance_2016',
'binary_edge_effects/forest_edge_distance_2017',
'binary_edge_effects/forest_edge_distance_2018',
'binary_edge_effects/forest_edge_distance_2019',
'binary_edge_effects/forest_edge_distance_2020',
'binary_edge_effects/forest_edge_distance_2021',
'binary_edge_effects/forest_edge_distance_2022',
'binary_edge_effects/forest_edge_distance_2023',
'binary_edge_effects/forest_edge_distance_2024',
'binary_edge_effects/forest_local_density_1990',
'binary_edge_effects/forest_local_density_1991',
'binary_edge_effects/forest_local_density_1992',
'binary_edge_effects/forest_local_density_1993',
'binary_edge_effects/forest_local_density_1994',
'binary_edge_effects/forest_local_density_1995',
'binary_edge_effects/forest_local_density_1996',
'binary_edge_effects/forest_local_density_1997',
'binary_edge_effects/forest_local_density_1998',
'binary_edge_effects/forest_local_density_1999',
'binary_edge_effects/forest_local_density_2000',
'binary_edge_effects/forest_local_density_2001',
'binary_edge_effects/forest_local_density_2002',
'binary_edge_effects/forest_local_density_2003',
'binary_edge_effects/forest_local_density_2004',
'binary_edge_effects/forest_local_density_2005',
'binary_edge_effects/forest_local_density_2006',
'binary_edge_effects/forest_local_density_2007',
'binary_edge_effects/forest_local_density_2008',
'binary_edge_effects/forest_local_density_2009',
'binary_edge_effects/forest_local_density_2010',
'binary_edge_effects/forest_local_density_2011',
'binary_edge_effects/forest_local_density_2012',
'binary_edge_effects/forest_local_density_2013',
'binary_edge_effects/forest_local_density_2014',
'binary_edge_effects/forest_local_density_2015',
'binary_edge_effects/forest_local_density_2016',
'binary_edge_effects/forest_local_density_2017',
'binary_edge_effects/forest_local_density_2018',
'binary_edge_effects/forest_local_density_2019',
'binary_edge_effects/forest_local_density_2020',
'binary_edge_effects/forest_local_density_2021',
'binary_edge_effects/forest_local_density_2022',
'binary_edge_effects/forest_local_density_2023',
'binary_edge_effects/forest_local_density_2024',
'binary_edge_effects/lu_ais_edge_distance',
'binary_edge_effects/lu_ais_local_density',
'binary_edge_effects/lu_berkelah_jerantut_edge_distance',
# 'binary_edge_effects/lu_berkelah_jerantut_local_density',
'binary_edge_effects/lu_berkelah_kuantan_edge_distance',
# 'binary_edge_effects/lu_berkelah_kuantan_local_density',
'binary_edge_effects/lu_berkelah_temerloh_edge_distance',
# 'binary_edge_effects/lu_berkelah_temerloh_local_density',
'binary_edge_effects/lu_old-growth_protected_areas_edge_distance',
# 'binary_edge_effects/lu_old-growth_protected_areas_local_density',
'binary_edge_effects/lu_remen_chereh_edge_distance',
# 'binary_edge_effects/lu_remen_chereh_local_density',
'binary_edge_effects/lu_tekai_tembeling_edge_distance',
# 'binary_edge_effects/lu_tekai_tembeling_local_density',
'binary_edge_effects/lu_tekam_edge_distance',
# 'binary_edge_effects/lu_tekam_local_density',
'binary_edge_effects/lu_yong_edge_distance',
'binary_edge_effects/lu_yong_lipis_edge_distance',
# 'binary_edge_effects/lu_yong_lipis_local_density',
# 'binary_edge_effects/lu_yong_local_density',
'coast/coast_proximity_km',
'topo_dsm_final/topo_dsm_smooth_aspect_cosine',
'topo_dsm_final/topo_dsm_smooth_aspect_sine',
'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_03',
'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_07',
'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_11',
'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_03',
'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_07',
'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_11',
'topo_dsm_final/topo_dsm_smooth_eastness',
'topo_dsm_final/topo_dsm_smooth_elevation',
'topo_dsm_final/topo_dsm_smooth_northness',
'topo_dsm_final/topo_dsm_smooth_profile_curvature',
'topo_dsm_final/topo_dsm_smooth_roughness_03',
'topo_dsm_final/topo_dsm_smooth_roughness_07',
'topo_dsm_final/topo_dsm_smooth_roughness_11',
'topo_dsm_final/topo_dsm_smooth_slope',
'topo_dsm_final/topo_dsm_smooth_stream_power_index_log10',
'topo_dsm_final/topo_dsm_smooth_surface_area_ratio',
'topo_dsm_final/topo_dsm_smooth_tangential_curvature',
'topo_dsm_final/topo_dsm_smooth_topographic_position_index_03',
'topo_dsm_final/topo_dsm_smooth_topographic_position_index_07',
'topo_dsm_final/topo_dsm_smooth_topographic_position_index_11',
'topo_dsm_final/topo_dsm_smooth_topographic_ruggedness_index',
'topo_dsm_final/topo_dsm_smooth_topographic_wetness_index',
'topo_dsm_final/topo_dsm_unsmooth_aspect_cosine',
'topo_dsm_final/topo_dsm_unsmooth_aspect_sine',
'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_03',
'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_07',
'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_11',
'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_03',
'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_07',
'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_11',
'topo_dsm_final/topo_dsm_unsmooth_eastness',
'topo_dsm_final/topo_dsm_unsmooth_elevation',
'topo_dsm_final/topo_dsm_unsmooth_northness',
'topo_dsm_final/topo_dsm_unsmooth_profile_curvature',
'topo_dsm_final/topo_dsm_unsmooth_roughness_03',
'topo_dsm_final/topo_dsm_unsmooth_roughness_07',
'topo_dsm_final/topo_dsm_unsmooth_roughness_11',
'topo_dsm_final/topo_dsm_unsmooth_slope',
'topo_dsm_final/topo_dsm_unsmooth_stream_power_index_log10',
'topo_dsm_final/topo_dsm_unsmooth_surface_area_ratio',
'topo_dsm_final/topo_dsm_unsmooth_tangential_curvature',
'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_03',
'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_07',
'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_11',
'topo_dsm_final/topo_dsm_unsmooth_topographic_ruggedness_index',
'topo_dsm_final/topo_dsm_unsmooth_topographic_wetness_index',
]

In [None]:
# Transfer mode. Features are copied unless one of the switches below applies.
move_files = False  # True: move every feature instead of copying it
# True: move (not copy) any feature whose entry contains 'alpha_earth' (can be large files)
move_if_alpha_earth = True

# Progress label for the feature transfer
feature_progress_index = 0
feature_progress_label = widgets.Label(f"Feature transfer progress: {feature_progress_index}/{len(feature_list)}")
display(feature_progress_label)

for feature_progress_index, feature in enumerate(feature_list, start=1):
    # The source keeps its sub-directory; the destination uses the base name only
    feature_path = join(features_dir, f"{feature}.tif")
    feature_destination = join(feature_final_dir, f"{feature.split('/')[-1]}.tif")
    # Features already present at the destination are left untouched
    if not exists(feature_destination):
        should_move = move_files or (move_if_alpha_earth and 'alpha_earth' in feature)
        transfer = move if should_move else copyfile
        transfer(feature_path, feature_destination)

    # Refresh the progress label after every feature, transferred or skipped
    feature_progress_label.value = f"Feature transfer progress: {feature_progress_index}/{len(feature_list)}"

print("All features finalised.")

# Spatial joining

In [None]:
# Print a ready-to-paste assignment for every targets .pkl that can receive features
for pkl in os.listdir(datasets_tar_dir):
  if not pkl.endswith('.pkl'):
    continue
  print(f"dataset_targets_pkl = '{pkl}'")

In [None]:
dataset_targets_pkl = 'gedi_elevation.pkl'

# Name of the 'add features' dataset.
# Change it when experimenting with incompatible feature sets (e.g. TMF or Alpha Earth).
dataset_add_features_pkl_name = 'gedi_elevation.pkl'

# Collect every entry found in the 'final' feature directory
feature_list = list(os.listdir(feature_final_dir))

# The printed list is a template for selecting the features to add to the dataset.
# NOTE FOR GEDI DTM:
# Land-cover more recent than the base DSM shouldn't be used, as it's intended to
# measure discrepancies between the base surface data and GEDI terrain data.
# In the case of Copernicus DEM, this is > 2015.
# NOTE FOR AGBD:
# Land-cover more recent than or the same year as the most recent GEDI data (e.g. 2024)
# will be removed at the finalisation stage.
print('feature_list = [')
for feature in sorted(feature_list):
  print(f"'{feature}',")
print(']')

In [None]:
# Feature rasters to sample into the dataset, given as file names that the
# sampling cell joins onto feature_final_dir. Commented-out entries are skipped.
# NOTE(review): the yearly entries after 2015 are commented out, presumably
# following the GEDI DTM note in the previous cell (land-cover more recent than
# the base DSM should not be used) - confirm before changing the target.
feature_list = [
'coast_proximity_km.tif',
'disturbance_edge_distance_1990.tif',
'disturbance_edge_distance_1991.tif',
'disturbance_edge_distance_1992.tif',
'disturbance_edge_distance_1993.tif',
'disturbance_edge_distance_1994.tif',
'disturbance_edge_distance_1995.tif',
'disturbance_edge_distance_1996.tif',
'disturbance_edge_distance_1997.tif',
'disturbance_edge_distance_1998.tif',
'disturbance_edge_distance_1999.tif',
'disturbance_edge_distance_2000.tif',
'disturbance_edge_distance_2001.tif',
'disturbance_edge_distance_2002.tif',
'disturbance_edge_distance_2003.tif',
'disturbance_edge_distance_2004.tif',
'disturbance_edge_distance_2005.tif',
'disturbance_edge_distance_2006.tif',
'disturbance_edge_distance_2007.tif',
'disturbance_edge_distance_2008.tif',
'disturbance_edge_distance_2009.tif',
'disturbance_edge_distance_2010.tif',
'disturbance_edge_distance_2011.tif',
'disturbance_edge_distance_2012.tif',
'disturbance_edge_distance_2013.tif',
'disturbance_edge_distance_2014.tif',
'disturbance_edge_distance_2015.tif',
# 'disturbance_edge_distance_2016.tif',
# 'disturbance_edge_distance_2017.tif',
# 'disturbance_edge_distance_2018.tif',
# 'disturbance_edge_distance_2019.tif',
# 'disturbance_edge_distance_2020.tif',
# 'disturbance_edge_distance_2021.tif',
# 'disturbance_edge_distance_2022.tif',
# 'disturbance_edge_distance_2023.tif',
# 'disturbance_edge_distance_2024.tif',
'disturbance_local_density_1990.tif',
'disturbance_local_density_1991.tif',
'disturbance_local_density_1992.tif',
'disturbance_local_density_1993.tif',
'disturbance_local_density_1994.tif',
'disturbance_local_density_1995.tif',
'disturbance_local_density_1996.tif',
'disturbance_local_density_1997.tif',
'disturbance_local_density_1998.tif',
'disturbance_local_density_1999.tif',
'disturbance_local_density_2000.tif',
'disturbance_local_density_2001.tif',
'disturbance_local_density_2002.tif',
'disturbance_local_density_2003.tif',
'disturbance_local_density_2004.tif',
'disturbance_local_density_2005.tif',
'disturbance_local_density_2006.tif',
'disturbance_local_density_2007.tif',
'disturbance_local_density_2008.tif',
'disturbance_local_density_2009.tif',
'disturbance_local_density_2010.tif',
'disturbance_local_density_2011.tif',
'disturbance_local_density_2012.tif',
'disturbance_local_density_2013.tif',
'disturbance_local_density_2014.tif',
'disturbance_local_density_2015.tif',
# 'disturbance_local_density_2016.tif',
# 'disturbance_local_density_2017.tif',
# 'disturbance_local_density_2018.tif',
# 'disturbance_local_density_2019.tif',
# 'disturbance_local_density_2020.tif',
# 'disturbance_local_density_2021.tif',
# 'disturbance_local_density_2022.tif',
# 'disturbance_local_density_2023.tif',
# 'disturbance_local_density_2024.tif',
'forest_edge_distance_1990.tif',
# 'forest_edge_distance_1991.tif',
# 'forest_edge_distance_1992.tif',
# 'forest_edge_distance_1993.tif',
# 'forest_edge_distance_1994.tif',
# 'forest_edge_distance_1995.tif',
# 'forest_edge_distance_1996.tif',
# 'forest_edge_distance_1997.tif',
# 'forest_edge_distance_1998.tif',
# 'forest_edge_distance_1999.tif',
'forest_edge_distance_2000.tif',
# 'forest_edge_distance_2001.tif',
# 'forest_edge_distance_2002.tif',
# 'forest_edge_distance_2003.tif',
# 'forest_edge_distance_2004.tif',
# 'forest_edge_distance_2005.tif',
# 'forest_edge_distance_2006.tif',
# 'forest_edge_distance_2007.tif',
# 'forest_edge_distance_2008.tif',
# 'forest_edge_distance_2009.tif',
'forest_edge_distance_2010.tif',
'forest_edge_distance_2011.tif',
'forest_edge_distance_2012.tif',
'forest_edge_distance_2013.tif',
'forest_edge_distance_2014.tif',
'forest_edge_distance_2015.tif',
# 'forest_edge_distance_2016.tif',
# 'forest_edge_distance_2017.tif',
# 'forest_edge_distance_2018.tif',
# 'forest_edge_distance_2019.tif',
# 'forest_edge_distance_2020.tif',
# 'forest_edge_distance_2021.tif',
# 'forest_edge_distance_2022.tif',
# 'forest_edge_distance_2023.tif',
# 'forest_edge_distance_2024.tif',
'forest_local_density_1990.tif',
# 'forest_local_density_1991.tif',
# 'forest_local_density_1992.tif',
# 'forest_local_density_1993.tif',
# 'forest_local_density_1994.tif',
# 'forest_local_density_1995.tif',
# 'forest_local_density_1996.tif',
# 'forest_local_density_1997.tif',
# 'forest_local_density_1998.tif',
# 'forest_local_density_1999.tif',
'forest_local_density_2000.tif',
# 'forest_local_density_2001.tif',
# 'forest_local_density_2002.tif',
# 'forest_local_density_2003.tif',
# 'forest_local_density_2004.tif',
# 'forest_local_density_2005.tif',
# 'forest_local_density_2006.tif',
# 'forest_local_density_2007.tif',
# 'forest_local_density_2008.tif',
# 'forest_local_density_2009.tif',
'forest_local_density_2010.tif',
'forest_local_density_2011.tif',
'forest_local_density_2012.tif',
'forest_local_density_2013.tif',
'forest_local_density_2014.tif',
'forest_local_density_2015.tif',
# 'forest_local_density_2016.tif',
# 'forest_local_density_2017.tif',
# 'forest_local_density_2018.tif',
# 'forest_local_density_2019.tif',
# 'forest_local_density_2020.tif',
# 'forest_local_density_2021.tif',
# 'forest_local_density_2022.tif',
# 'forest_local_density_2023.tif',
# 'forest_local_density_2024.tif',
'latitude.tif',
'longitude.tif',
'lu_ais_edge_distance.tif',
'lu_berkelah_jerantut_edge_distance.tif',
'lu_berkelah_kuantan_edge_distance.tif',
'lu_berkelah_temerloh_edge_distance.tif',
'lu_old-growth_protected_areas_edge_distance.tif',
'lu_remen_chereh_edge_distance.tif',
'lu_tekai_tembeling_edge_distance.tif',
'lu_tekam_edge_distance.tif',
'lu_yong_edge_distance.tif',
'lu_yong_lipis_edge_distance.tif',
'topo_dsm_smooth_aspect_cosine.tif',
'topo_dsm_smooth_aspect_sine.tif',
'topo_dsm_smooth_circular_variance_aspect_03.tif',
'topo_dsm_smooth_circular_variance_aspect_07.tif',
'topo_dsm_smooth_circular_variance_aspect_11.tif',
'topo_dsm_smooth_deviation_mean_elevation_03.tif',
'topo_dsm_smooth_deviation_mean_elevation_07.tif',
'topo_dsm_smooth_deviation_mean_elevation_11.tif',
'topo_dsm_smooth_eastness.tif',
'topo_dsm_smooth_elevation.tif',
'topo_dsm_smooth_northness.tif',
'topo_dsm_smooth_profile_curvature.tif',
'topo_dsm_smooth_roughness_03.tif',
'topo_dsm_smooth_roughness_07.tif',
'topo_dsm_smooth_roughness_11.tif',
'topo_dsm_smooth_slope.tif',
'topo_dsm_smooth_stream_power_index_log10.tif',
'topo_dsm_smooth_surface_area_ratio.tif',
'topo_dsm_smooth_tangential_curvature.tif',
'topo_dsm_smooth_topographic_position_index_03.tif',
'topo_dsm_smooth_topographic_position_index_07.tif',
'topo_dsm_smooth_topographic_position_index_11.tif',
'topo_dsm_smooth_topographic_ruggedness_index.tif',
'topo_dsm_smooth_topographic_wetness_index.tif',
'topo_dsm_unsmooth_aspect_cosine.tif',
'topo_dsm_unsmooth_aspect_sine.tif',
'topo_dsm_unsmooth_circular_variance_aspect_03.tif',
'topo_dsm_unsmooth_circular_variance_aspect_07.tif',
'topo_dsm_unsmooth_circular_variance_aspect_11.tif',
'topo_dsm_unsmooth_deviation_mean_elevation_03.tif',
'topo_dsm_unsmooth_deviation_mean_elevation_07.tif',
'topo_dsm_unsmooth_deviation_mean_elevation_11.tif',
'topo_dsm_unsmooth_eastness.tif',
'topo_dsm_unsmooth_elevation.tif',
'topo_dsm_unsmooth_northness.tif',
'topo_dsm_unsmooth_profile_curvature.tif',
'topo_dsm_unsmooth_roughness_03.tif',
'topo_dsm_unsmooth_roughness_07.tif',
'topo_dsm_unsmooth_roughness_11.tif',
'topo_dsm_unsmooth_slope.tif',
'topo_dsm_unsmooth_stream_power_index_log10.tif',
'topo_dsm_unsmooth_surface_area_ratio.tif',
'topo_dsm_unsmooth_tangential_curvature.tif',
'topo_dsm_unsmooth_topographic_position_index_03.tif',
'topo_dsm_unsmooth_topographic_position_index_07.tif',
'topo_dsm_unsmooth_topographic_position_index_11.tif',
'topo_dsm_unsmooth_topographic_ruggedness_index.tif',
'topo_dsm_unsmooth_topographic_wetness_index.tif',
]

In [None]:
cache = True

# Resolve the targets dataset (input) and the 'add features' dataset (output)
dataset_targets_path = join(datasets_tar_dir, dataset_targets_pkl)
dataset_targets = pd.read_pickle(dataset_targets_path)
dataset_add_fea_path = join(datasets_add_fea_dir, dataset_add_features_pkl_name)

# Handle caching
if cache: print("Cache enabled.")
add_fea_exists = exists(dataset_add_fea_path)
if add_fea_exists:
    print(f"An 'add features' dataset already exists: {dataset_add_fea_path}")
    if cache:
        print("Continuing to add features to existing dataset. Delete it to start again.")
    else:
        print("Cache disabled. The 'add features' dataset will be overwritten in 10 seconds (interrupt if unintended).")
        sleep(10)
# Start again from the targets unless an existing dataset is being continued
if not (add_fea_exists and cache):
    dataset_targets.to_pickle(dataset_add_fea_path)
dataset_add_fea = pd.read_pickle(dataset_add_fea_path)

# Feature progress
feature_progress_index = 0
feature_progress_label = widgets.Label(f"feature progress: {feature_progress_index}/{len(feature_list)}")
display(feature_progress_label)

for feature_progress_index, feature in enumerate(feature_list, start=1):
    feature_name = feature.replace('.tif', '')
    # Skip features whose 'fea_' column is already present in the dataset
    if f"fea_{feature_name}" not in dataset_add_fea.columns:
        feature_path = join(feature_final_dir, feature)
        sample_raster_values(dataset_add_fea, feature_path, feature=True)
        # With caching on, progress is written to disk after every sampled feature
        if cache: dataset_add_fea.to_pickle(dataset_add_fea_path)
    feature_progress_label.value = f"feature progress: {feature_progress_index}/{len(feature_list)}"
    # Defragment every 50 features
    if feature_progress_index % 50 == 0:
        dataset_add_fea = dataset_add_fea.copy()

# Defragment once more, then export
dataset_add_fea = dataset_add_fea.copy()
dataset_add_fea.to_pickle(dataset_add_fea_path)

print(f"All features have been added to {dataset_add_fea_path}.")

# Drop columns (optional)

In [None]:
# Targets and features can be removed from the dataset in case of any issues.
# Select the 'add features' dataset. Each printed line is a ready-to-paste
# assignment for the next cell, which reads the name from 'add_fea_dataset_name'
# (the previously printed 'dataset_drop_columns' is not read by that cell).
for pkl in os.listdir(datasets_add_fea_dir):
  print(f"add_fea_dataset_name = '{pkl}'")

In [None]:
add_fea_dataset_name = 'gedi_elevation.pkl'

# Load the selected 'add features' dataset
add_fea_dataset_path = join(datasets_add_fea_dir, add_fea_dataset_name)
add_fea_dataset = pd.read_pickle(add_fea_dataset_path)

# Show the existing column names in sorted order
sorted(add_fea_dataset.columns.tolist())

In [None]:
drop_dataset_columns = True

if drop_dataset_columns:
  # Any column whose name contains the dropped_columns string will be removed.
  # The string is matched literally (regex=False), so characters such as '.',
  # '(' or '+' are not interpreted as a regular expression.
  dropped_columns = 'fea_topo_dsm_'
  # Keep only the columns that do not contain the string
  keep_mask = ~add_fea_dataset.columns.str.contains(dropped_columns, regex=False)
  dropped_columns_dataset = add_fea_dataset.loc[:, keep_mask]
  # Overwrite the 'add features' dataset on disk with the reduced column set
  dropped_columns_dataset.to_pickle(add_fea_dataset_path)
  # Reload so the inspection below reflects what was written
  add_fea_dataset = pd.read_pickle(add_fea_dataset_path)
# Inspect columns again (unchanged when drop_dataset_columns is False)
sorted(add_fea_dataset.columns)

# Finalise dataset

In [None]:
# Print a ready-to-paste assignment for every 'add features' dataset that can be finalised.
add_fea_entries = os.listdir(datasets_add_fea_dir)
for pkl in add_fea_entries:
    print(f'final_dataset_name = "{pkl}"')

In [None]:
final_dataset_name = "gedi_elevation.pkl"

dataset_path = join(datasets_add_fea_dir, final_dataset_name)
dataset = pd.read_pickle(dataset_path)

# Precision reduction for histogram-based XGBoost
# Fewer unique values = smaller max_bin = faster histogram construction
# XGBoost bins continuous features, excess precision wastes computation

# Print a 'precision_change_dict' template with one entry per column. Every
# value is None; the trailing comment summarises the column so a rounding
# precision can be chosen by hand.
print("precision_change_dict = {")
for col in sorted(dataset.columns):
    series = dataset[col]
    is_float = series.dtype in ['float32', 'float64']
    is_geometry = 'geometry' in str(series.dtype).lower()
    if is_geometry:
        print(f'    "{col}": None,  # geometry')
        continue
    col_unique = series.nunique()
    if not is_float: print(f'    "{col}": None,  # {series.dtype}, unique={col_unique:,}')
    else:
        col_min = series.min()
        col_max = series.max()
        # Infer current decimal places from the text form of up to 10,000 non-NA values
        sample = series.dropna().head(10000).astype(str)
        if sample.empty:
            # All-NA column: there is nothing to infer from, and int() of the
            # NaN maximum of an empty series would raise ValueError
            current_precision = 0
        else:
            decimals = sample.apply(lambda x: len(x.split('.')[-1]) if '.' in x else 0)
            current_precision = int(decimals.max())
        print(f'    "{col}": None,  # float, precision={current_precision}, min={col_min:.4g}, max={col_max:.4g}, unique={col_unique:,}')
print("}")

In [None]:
# Rounding precision per column, keyed by column name. The value is passed to
# Series.round() in the preview and finalisation cells; None leaves the column
# unchanged. The trailing comments have the format printed by the cell above
# (dtype, inferred precision, min, max, unique count).
precision_change_dict = {
    "beam": None,  # object, unique=8
    "fea_coast_proximity_km": None,  # float, precision=1, min=18.6, max=148.9, unique=1,304
    "fea_disturbance_edge_distance_1990": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1991": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1992": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1993": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1994": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1995": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1996": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1997": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1998": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1999": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2000": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2001": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2002": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2003": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2004": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2005": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2006": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2007": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2008": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2009": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2010": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2011": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2012": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2013": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2014": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2015": None,  # int16, unique=24
    "fea_disturbance_local_density_1990": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1991": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1992": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1993": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1994": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1995": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1996": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1997": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1998": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1999": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2000": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2001": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2002": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2003": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2004": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2005": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2006": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2007": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2008": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2009": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2010": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2011": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2012": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2013": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2014": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2015": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_edge_distance_1990": None,  # int16, unique=24
    "fea_forest_edge_distance_2000": None,  # int16, unique=24
    "fea_forest_edge_distance_2010": None,  # int16, unique=24
    "fea_forest_edge_distance_2011": None,  # int16, unique=24
    "fea_forest_edge_distance_2012": None,  # int16, unique=24
    "fea_forest_edge_distance_2013": None,  # int16, unique=24
    "fea_forest_edge_distance_2014": None,  # int16, unique=24
    "fea_forest_edge_distance_2015": None,  # int16, unique=24
    "fea_forest_local_density_1990": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2000": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2010": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2011": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2012": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2013": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2014": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2015": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_latitude": None,  # float, precision=3, min=3.586, max=4.948, unique=1,363
    "fea_longitude": None,  # float, precision=3, min=102, max=103.2, unique=1,155
    "fea_lu_ais_edge_distance": None,  # int16, unique=24
    "fea_lu_berkelah_jerantut_edge_distance": None,  # int16, unique=24
    "fea_lu_berkelah_kuantan_edge_distance": None,  # int16, unique=24
    "fea_lu_berkelah_temerloh_edge_distance": None,  # int16, unique=24
    "fea_lu_old-growth_protected_areas_edge_distance": None,  # int16, unique=24
    "fea_lu_remen_chereh_edge_distance": None,  # int16, unique=24
    "fea_lu_tekai_tembeling_edge_distance": None,  # int16, unique=24
    "fea_lu_tekam_edge_distance": None,  # int16, unique=24
    "fea_lu_yong_edge_distance": None,  # int16, unique=24
    "fea_lu_yong_lipis_edge_distance": None,  # int16, unique=24
    "fea_topo_dsm_smooth_aspect_cosine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dsm_smooth_aspect_sine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dsm_smooth_circular_variance_aspect_03": None,  # float, precision=2, min=0, max=0.72, unique=72
    "fea_topo_dsm_smooth_circular_variance_aspect_07": None,  # float, precision=2, min=0, max=0.92, unique=93
    "fea_topo_dsm_smooth_circular_variance_aspect_11": None,  # float, precision=2, min=0, max=0.96, unique=97
    "fea_topo_dsm_smooth_deviation_mean_elevation_03": None,  # int16, unique=2
    "fea_topo_dsm_smooth_deviation_mean_elevation_07": None,  # float, precision=1, min=-0.9, max=1, unique=20
    "fea_topo_dsm_smooth_deviation_mean_elevation_11": None,  # float, precision=1, min=-1.4, max=1.6, unique=31
    "fea_topo_dsm_smooth_eastness": None,  # float, precision=2, min=-0.76, max=0.81, unique=149
    "fea_topo_dsm_smooth_elevation": None,  # int16, unique=1,942
    "fea_topo_dsm_smooth_northness": None,  # float, precision=2, min=-0.74, max=0.79, unique=150
    "fea_topo_dsm_smooth_profile_curvature": None,  # float, precision=3, min=-0.01, max=0.01, unique=21
    "fea_topo_dsm_smooth_roughness_03": None,  # int16, unique=123
    "fea_topo_dsm_smooth_roughness_07": None,  # int16, unique=284
    "fea_topo_dsm_smooth_roughness_11": None,  # int16, unique=396
    "fea_topo_dsm_smooth_slope": None,  # int16, unique=57
    "fea_topo_dsm_smooth_stream_power_index_log10": None,  # int16, unique=20
    "fea_topo_dsm_smooth_surface_area_ratio": None,  # float, precision=2, min=1, max=1.94, unique=82
    "fea_topo_dsm_smooth_tangential_curvature": None,  # float, precision=3, min=-0.011, max=0.011, unique=23
    "fea_topo_dsm_smooth_topographic_position_index_03": None,  # float, precision=1, min=-7.5, max=8.6, unique=125
    "fea_topo_dsm_smooth_topographic_position_index_07": None,  # int16, unique=66
    "fea_topo_dsm_smooth_topographic_position_index_11": None,  # int16, unique=116
    "fea_topo_dsm_smooth_topographic_ruggedness_index": None,  # int16, unique=45
    "fea_topo_dsm_smooth_topographic_wetness_index": None,  # int16, unique=41
    "fea_topo_dsm_unsmooth_aspect_cosine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dsm_unsmooth_aspect_sine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dsm_unsmooth_circular_variance_aspect_03": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_topo_dsm_unsmooth_circular_variance_aspect_07": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_topo_dsm_unsmooth_circular_variance_aspect_11": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_topo_dsm_unsmooth_deviation_mean_elevation_03": None,  # int16, unique=7
    "fea_topo_dsm_unsmooth_deviation_mean_elevation_07": None,  # float, precision=1, min=-3.6, max=3.9, unique=72
    "fea_topo_dsm_unsmooth_deviation_mean_elevation_11": None,  # float, precision=1, min=-4.6, max=3.9, unique=74
    "fea_topo_dsm_unsmooth_eastness": None,  # float, precision=2, min=-0.87, max=0.89, unique=169
    "fea_topo_dsm_unsmooth_elevation": None,  # int16, unique=1,953
    "fea_topo_dsm_unsmooth_northness": None,  # float, precision=2, min=-0.84, max=0.87, unique=168
    "fea_topo_dsm_unsmooth_profile_curvature": None,  # float, precision=3, min=-0.052, max=0.032, unique=71
    "fea_topo_dsm_unsmooth_roughness_03": None,  # int16, unique=142
    "fea_topo_dsm_unsmooth_roughness_07": None,  # int16, unique=298
    "fea_topo_dsm_unsmooth_roughness_11": None,  # int16, unique=403
    "fea_topo_dsm_unsmooth_slope": None,  # int16, unique=67
    "fea_topo_dsm_unsmooth_stream_power_index_log10": None,  # int16, unique=24
    "fea_topo_dsm_unsmooth_surface_area_ratio": None,  # float, precision=2, min=1, max=2.52, unique=107
    "fea_topo_dsm_unsmooth_tangential_curvature": None,  # float, precision=3, min=-0.039, max=0.035, unique=74
    "fea_topo_dsm_unsmooth_topographic_position_index_03": None,  # int16, unique=40
    "fea_topo_dsm_unsmooth_topographic_position_index_07": None,  # int16, unique=101
    "fea_topo_dsm_unsmooth_topographic_position_index_11": None,  # int16, unique=157
    "fea_topo_dsm_unsmooth_topographic_ruggedness_index": None,  # int16, unique=56
    "fea_topo_dsm_unsmooth_topographic_wetness_index": None,  # int16, unique=52
    "gedi_elevation": 0,  # float, precision=7, min=-20.26, max=2049, unique=863,822
    "geometry": None,  # geometry
    "sensitivity": 4,  # float, precision=8, min=0.95, max=0.9981, unique=484,671
    "shot_number": None,  # object, unique=873,355
}

In [None]:
# Preview precision changes before applying
# Only columns with integer values (not None) will be rounded
# Review unique value reductions

dataset_path = join(datasets_add_fea_dir, final_dataset_name)
dataset = pd.read_pickle(dataset_path)
# Rounding is applied to a copy so the loaded dataset stays untouched
dataset_preview = dataset.copy()

print(f"{'Column':<60} {'Old unique':>12} {'New unique':>12} {'Reduction':>10}")
print("-" * 96)

changed_columns = []
for col, precision in precision_change_dict.items():
    if precision is None:
        continue
    # The dictionary is edited by hand and may name a column that is no longer
    # in the dataset. Report and skip it instead of raising KeyError, matching
    # the 'col in columns' check made when the changes are applied.
    if col not in dataset_preview.columns:
        print(f"{col:<60} {'SKIPPED (not in dataset)':<36}")
        continue
    if dataset_preview[col].dtype not in ['float32', 'float64']:
        print(f"{col:<60} {'SKIPPED (not float)':<36}")
        continue

    old_unique = dataset_preview[col].nunique()
    dataset_preview[col] = dataset_preview[col].round(precision)
    new_unique = dataset_preview[col].nunique()

    # Guard against division by zero for columns without any non-NA value
    reduction_pct = 100 * (1 - new_unique / old_unique) if old_unique > 0 else 0
    print(f"{col:<60} {old_unique:>12,} {new_unique:>12,} {reduction_pct:>9.1f}%")
    changed_columns.append(col)

print("-" * 96)
print(f"{len(changed_columns)} columns will be modified")

# max_bin estimate from preview
feature_cols = [c for c in dataset_preview.columns if c.startswith('fea_')]
max_unique = dataset_preview[feature_cols].nunique().max()
print(f"Max unique values across features: {max_unique:,} (use for max_bin)")

# Preview not saved; run the next block to apply the changes.

In [None]:
# True when targets carry a collection year and features need temporal alignment
yearly_targets = False
# Name of the column holding each observation's year (read only if yearly_targets = True)
year_column = 'year'

# Yearly features requiring temporal alignment
# Multi-year features: several years are retained for temporal context (e.g., TMF land-cover)
yearly_features_multi = [
    'forest_edge_distance',
    'forest_local_density',
    'disturbance_edge_distance',
    'disturbance_local_density',
]
# Single-year features: only t-1 is retained
# Alpha Earth embeddings already encode landscape state; multi-year would cause feature bloat
yearly_features_single = [
    'alpha_earth',
]

# Load 'add features' dataset
dataset_add_fea_path = join(datasets_add_fea_dir, final_dataset_name)
dataset_add_fea = pd.read_pickle(dataset_add_fea_path)
print(f"Loaded dataset: {final_dataset_name} ({len(dataset_add_fea):,} rows, {len(dataset_add_fea.columns)} columns)")

# Apply precision reduction: round every float column that has a non-None
# precision and exists in the dataset, counting the columns changed
float_dtypes = ['float32', 'float64']
precision_changes = 0
for col, precision in precision_change_dict.items():
    if precision is not None and col in dataset_add_fea.columns:
        if dataset_add_fea[col].dtype in float_dtypes:
            dataset_add_fea[col] = dataset_add_fea[col].round(precision)
            precision_changes += 1
if precision_changes > 0:
    print(f"Applied precision reduction to {precision_changes} columns")

# Temporal alignment of yearly features. When yearly_targets is True, the
# year-suffixed feature columns of each target-year subset are renamed so the
# same column name holds the same offset from the target year in every subset;
# otherwise the dataset is copied unchanged. Either branch defines final_dataset.
if yearly_targets:
    print(f"\nYearly targets enabled. Year column: '{year_column}'")

    # Combine yearly feature lists for renaming
    yearly_features = yearly_features_multi + yearly_features_single

    # Check which yearly features are present
    # Columns are sorted in descending order; the same list drives the renaming below.
    # NOTE(review): presumably descending so that, when years are shifted upwards,
    # a later-year column is renamed before an earlier one takes over its name - confirm.
    dataset_add_fea_column_list = sorted(dataset_add_fea.columns, reverse=True)

    # A yearly column is one that contains the feature name and ends in four digits
    for yearly_feature in yearly_features_multi:
        found = any(yearly_feature in col and col[-4:].isdigit() for col in dataset_add_fea_column_list)
        if found: print(f"  Multi-year feature found: {yearly_feature}")
        else: print(f"  Multi-year feature NOT found: {yearly_feature}")

    for yearly_feature in yearly_features_single:
        found = any(yearly_feature in col and col[-4:].isdigit() for col in dataset_add_fea_column_list)
        if found: print(f"  Single-year feature found: {yearly_feature}")
        else: print(f"  Single-year feature NOT found: {yearly_feature}")

    # Get list of target years
    # NOTE(review): int(x) raises if the year column contains NaN - confirm it cannot.
    target_year_list = dataset_add_fea[year_column].unique().tolist()
    target_year_list = [int(x) for x in target_year_list]
    target_year_max = np.max(target_year_list)
    print(f"\nTarget years: {sorted(target_year_list)} (max: {target_year_max})")

    # Create an index identifier
    # (used to restore the original row order after the yearly subsets are concatenated)
    dataset_add_fea['index_record'] = dataset_add_fea.index

    # Create an empty list for storing yearly sub-datasets
    dataset_year_list = []

    # Iterate through each target year and shift the sample to appropriate feature year
    # Column names are kept the same to avoid issues when ordering features in model training/prediction
    # E.g. a '2021' feature for a 2022 target will actually be a 2019 feature for a 2020 target
    for target_year in target_year_list:
        dataset_year = dataset_add_fea[dataset_add_fea[year_column] == target_year].copy()
        # Number of years between this subset's target year and the most recent target year
        sample_year_modifier = target_year_max - target_year
        for col in dataset_add_fea_column_list:
            for yearly_feature in yearly_features:
                # Check column contains feature name and ends with 4-digit year
                if yearly_feature in col and col[-4:].isdigit():
                    feature_year = int(col[-4:])
                    col_prefix = col[:-4]
                    corrected_sample = f"{col_prefix}{feature_year + sample_year_modifier}"
                    dataset_year.rename(columns={col: corrected_sample}, inplace=True)
                    # A column is renamed once, by the first matching yearly feature
                    break
        dataset_year_list.append(dataset_year)

    # Concatenate dataframes and sort
    final_dataset = pd.concat(dataset_year_list, ignore_index=True)
    final_dataset.sort_values('index_record', inplace=True)
    final_dataset.reset_index(drop=True, inplace=True)

    # Drop feature years out of the data range for one of the target years
    # NOTE(review): dropna(axis=1, how='any') removes every column holding at least
    # one NA, not only shifted yearly columns - confirm no other column contains NA.
    cols_before_na = len(final_dataset.columns)
    final_dataset.dropna(axis=1, how='any', inplace=True)
    cols_dropped_na = cols_before_na - len(final_dataset.columns)
    if cols_dropped_na > 0: print(f"\nDropped {cols_dropped_na} columns with NA values (out of range years)")

    # Drop multi-year features from the most recent target year (timing cannot be certain)
    cols_to_drop = [col for col in final_dataset.columns
                    for yearly_feature in yearly_features_multi
                    if yearly_feature in col and col.endswith(str(target_year_max))]
    if cols_to_drop:
        final_dataset.drop(columns=cols_to_drop, inplace=True)
        print(f"Dropped {len(cols_to_drop)} multi-year feature columns from year {target_year_max}")

    # Drop single-year features except t-1
    previous_year = target_year_max - 1
    single_cols_to_drop = [col for col in final_dataset.columns
                           for yearly_feature in yearly_features_single
                           if yearly_feature in col and not col.endswith(str(previous_year))]
    if single_cols_to_drop:
        final_dataset.drop(columns=single_cols_to_drop, inplace=True)
        print(f"Dropped {len(single_cols_to_drop)} single-year feature columns (keeping only year {previous_year})")

    # Drop the index identifier
    final_dataset.drop(columns=['index_record'], inplace=True)

else:
    print("\nYearly targets disabled. No temporal feature alignment applied.")
    final_dataset = dataset_add_fea.copy()

# Prefix every non-feature column with 'tar_'; 'fea_' columns keep their name
final_dataset.columns = [col if col.startswith('fea_') else 'tar_' + col for col in final_dataset.columns]

# Order columns: targets first, then features, each group sorted alphabetically
target_columns = [col for col in final_dataset.columns if col.startswith('tar_')]
feature_columns = [col for col in final_dataset.columns if col.startswith('fea_')]
sorted_columns = [*sorted(target_columns), *sorted(feature_columns)]

# Reindex the DataFrame with the sorted column order
final_dataset = final_dataset.reindex(columns=sorted_columns)

# Export the final dataset
final_dataset_path = join(datasets_final_dir, final_dataset_name)
final_dataset.to_pickle(final_dataset_path)

print(f"\nFinal dataset: {len(final_dataset):,} rows, {len(target_columns)} target columns, {len(feature_columns)} feature columns")
print(f"Exported to: {final_dataset_path}")

# Read the exported file back as a check (displayed as the cell output)
pd.read_pickle(final_dataset_path)

# Export to .gpkg

In [None]:
# For verification and visualisation in GIS software.
# Print a ready-to-paste assignment for every final dataset .pkl that can be exported as a .gpkg
final_entries = os.listdir(datasets_final_dir)
for pkl in final_entries:
  print(f'dataset_gpkg_name = "{pkl}"')

In [None]:
dataset_gpkg_name = "gedi_elevation.pkl"

# Load the final dataset selected for export
dataset_gpkg_pkl_path = join(datasets_final_dir, dataset_gpkg_name)
dataset_gpkg_pkl = pd.read_pickle(dataset_gpkg_pkl_path)

# Print the columns that can be included as a 'selected_gpkg_columns' template.
# 'tar_geometry' is left out of the template.
print("selected_gpkg_columns = [")
for col in sorted(dataset_gpkg_pkl.columns):
  if col == "tar_geometry":
    continue
  print(f'  "{col}",')
print("]")

In [None]:
selected_gpkg_columns = [
  "fea_topo_dsm_unsmooth_elevation",
  "tar_gedi_elevation",
]

# The geometry column is always exported alongside the selected columns
selected_gpkg_columns = [*selected_gpkg_columns, 'tar_geometry']

# Build a GeoDataFrame from the selected columns, with 'tar_geometry' as its geometry
dataset_gpkg_subset = dataset_gpkg_pkl[selected_gpkg_columns]
dataset_gpkg_geodataframe = gpd.GeoDataFrame(dataset_gpkg_subset, geometry='tar_geometry')

# The output takes the .pkl name with its last four characters replaced by '.gpkg'
dataset_gpkg_export = join(datasets_gpkg_dir, f"{dataset_gpkg_name[:-4]}.gpkg")
dataset_gpkg_geodataframe.to_file(dataset_gpkg_export, driver="GPKG")

# Disconnect runtime

In [None]:
# Unassigns the Colab runtime (see the 'Disconnect runtime' heading).
# Useful for stopping background execution
# NOTE(review): 'runtime' is also imported in the imports cell; the repeat here
# presumably lets this cell run on its own - confirm before removing.
from google.colab import runtime
runtime.unassign()