<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/4_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports, directories and global functions

In [None]:
# Define base directory
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)
# Every other directory in this notebook is built relative to base_dir.

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/content/drive/MyDrive/masfi'

# Mount Google Drive
# The mount point is chosen from the base_dir prefix. Only the personal-drive
# branch creates base_dir; the shared-drive branch only mounts.
from google.colab import drive
import os
import sys
if base_dir.startswith('/gdrive/Shareddrives/'):
  drive.mount('/gdrive', force_remount=True)
elif base_dir.startswith('/content/drive/MyDrive/'):
  drive.mount('/content/drive', force_remount=True)
  os.makedirs(base_dir, exist_ok=True)
# Any other prefix only prints a hint; nothing is mounted and execution continues.
else: print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")

# Append the resolved base directory to sys.path (once), so Python modules
# stored in it can be imported.
_path_to_add = os.path.realpath(base_dir)
if _path_to_add not in sys.path:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs
!pip install geopandas

In [None]:
# Imports
from concurrent.futures import ThreadPoolExecutor
import gc
import geopandas as gpd
from google.colab import runtime
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs
from os.path import exists, join
from osgeo import gdal
gdal.UseExceptions()
import pandas as pd
import requests
from scipy.stats import norm
from shutil import copyfile, move
from sklearn.mixture import GaussianMixture
from time import sleep

In [None]:
# Input locations produced by the earlier notebooks, all relative to base_dir.

# 1_areas directories
areas_dir = join(base_dir, '1_areas')
polygons_dir = join(areas_dir, 'polygons')
template_dir = join(areas_dir, "template.tif")  # path to the template raster file

# 2_targets directories
targets_final_dir = join(base_dir, "2_targets/pkl_final")

# 3_features directories
features_dir = join(base_dir, "3_features")
alpha_earth_dir = join(features_dir, "alpha_earth")
continuous_final_dir = join(features_dir, "continuous_final")
edge_effects_dir = join(features_dir, "binary_edge_effects")
topo_dsm_final_dir = join(features_dir, "topo_dsm_final")
topo_dtm_final_dir = join(features_dir, "topo_dtm_final")
geographic_final_dir = join(features_dir, 'geographic_final')

# 4_datasets directories (outputs of this notebook)
datasets_dir = join(base_dir, "4_datasets")
datasets_tar_dir = join(datasets_dir, "targets")
datasets_add_fea_dir = join(datasets_dir, "add_features")
datasets_final_dir = join(datasets_dir, "final")
datasets_gpkg_dir = join(datasets_dir, "gpkg")

# Create the output directories, parent first; existing ones are left untouched
for output_dir in (
    datasets_dir,
    datasets_tar_dir,
    datasets_add_fea_dir,
    datasets_final_dir,
    datasets_gpkg_dir,
):
    makedirs(output_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -11111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress, dtype=gdal.GDT_Float32):
    template_ds = gdal.Open(template)
    template_band = template_ds.GetRasterBand(1)
    template_dimensions, template_projection = template_ds.GetGeoTransform(), template_ds.GetProjection()
    if compress: options = ['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'] # Good speed / size ratio
    else: options = []
    if input_array.dtype == 'int16': dtype = gdal.GDT_Int16
    driver = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, 1, dtype, options=options)
    driver.GetRasterBand(1).WriteArray(input_array)
    driver.GetRasterBand(1).SetNoDataValue(nodatavalue)
    driver.SetGeoTransform(template_dimensions)
    driver.SetProjection(template_projection)
    template_ds = driver = None

# Global function: Sample raster values
def sample_raster_values(pd_dataframe, raster_path, geom_x, geom_y, feature=False, n_threads=1):
    """Sample band 1 of a raster at point coordinates and store the result as a column.

    The column name is the raster filename minus its last four characters
    (i.e. a '.tif' extension), prefixed with 'fea_' when `feature` is true.
    Pixel indices are computed from the geotransform origin and pixel size
    only, so the raster is assumed to be north-up (no rotation terms) and in
    the same coordinate system as the points.

    Args:
        pd_dataframe: DataFrame that receives the new column (modified in place).
        raster_path: Path to the raster; only band 1 is read.
        geom_x: Point x coordinates, one per dataframe row.
        geom_y: Point y coordinates, one per dataframe row.
        feature: Prefix the column name with 'fea_' when true.
        n_threads: Number of worker threads the points are split across.

    Returns:
        None. Points outside the raster receive the band's nodata value, or
        NaN when the band declares none.
    """
    # Derive column name from filename
    raster_name = raster_path.split('/')[-1][:-4]
    if feature: raster_name = 'fea_' + raster_name
    # Load raster and extract metadata
    raster = gdal.Open(raster_path)
    band = raster.GetRasterBand(1)
    geotransform = raster.GetGeoTransform()
    raster_array = band.ReadAsArray()
    nodata = band.GetNoDataValue()
    # The pixel data is now in memory, so the GDAL handles can be released
    band = None
    raster = None
    rows, cols = raster_array.shape
    geom_x = np.asarray(geom_x, dtype=float)
    geom_y = np.asarray(geom_y, dtype=float)
    n_points = len(geom_x)
    # Choose the fill value for points that fall outside the raster
    output_dtype = raster_array.dtype
    if nodata is not None:
        fill_value = nodata
    else:
        fill_value = np.nan
        # NaN cannot be stored in an integer array, so promote the output to float
        if not np.issubdtype(output_dtype, np.inexact): output_dtype = np.float64
    # Initialise output array with the fill value
    sampled_values = np.full(n_points, fill_value, dtype=output_dtype)
    # Worker function for threaded sampling
    def sample_chunk(start, end):
        # Floor rather than truncate: truncation maps offsets in (-1, 0) to index 0,
        # which would sample the edge pixel for points just outside the raster
        x_idx = np.floor((geom_x[start:end] - geotransform[0]) / geotransform[1]).astype(int)
        y_idx = np.floor((geom_y[start:end] - geotransform[3]) / geotransform[5]).astype(int)
        valid = (x_idx >= 0) & (x_idx < cols) & (y_idx >= 0) & (y_idx < rows)
        local_values = np.full(end - start, fill_value, dtype=output_dtype)
        local_values[valid] = raster_array[y_idx[valid], x_idx[valid]]
        sampled_values[start:end] = local_values
    # Split points into chunks and process in parallel (nothing to do for empty input)
    if n_points > 0:
        n_threads = max(1, int(n_threads))
        chunk_size = (n_points + n_threads - 1) // n_threads
        chunk_ranges = [(i, min(i + chunk_size, n_points)) for i in range(0, n_points, chunk_size)]
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            # executor.map only re-raises worker exceptions when its results are
            # retrieved, so consume the iterator to avoid silently losing failures
            list(executor.map(lambda r: sample_chunk(*r), chunk_ranges))
    # Assign to dataframe
    pd_dataframe[raster_name] = sampled_values

# Global function: histogram-based density outlier bounds
# Filters sparse regions at distribution extremes based on bin counts
# Dense regions kept regardless of distance from median, only isolated tail values flagged
def histogram_outlier_bounds(data, title, sparse_threshold_percent=0.01):
    """Return (lower, upper) bounds that trim sparse histogram tails, and plot them.

    A bin is "dense" when it holds at least
    max(10, len(data) * sparse_threshold_percent / 100) values. The lower bound
    is the left edge of the first dense bin and the upper bound is the right
    edge of the last dense bin. If no bin is dense (e.g. a small dataset), the
    full data range is returned so that nothing is filtered.

    Args:
        data: Array-like of numeric values.
        title: Title of the histogram plot.
        sparse_threshold_percent: Percentage of the data a bin must hold to count as dense.

    Returns:
        Tuple (lower_bound, upper_bound).

    Raises:
        ValueError: If `data` is empty.
    """
    data = np.asarray(data).ravel()
    if data.size == 0:
        raise ValueError("histogram_outlier_bounds requires at least one data point.")
    min_bin_count = max(10, int(len(data) * sparse_threshold_percent / 100))
    counts, bin_edges = np.histogram(data, bins='auto')
    # Locate the outermost dense bins at each tail
    dense_bins = np.flatnonzero(counts >= min_bin_count)
    if dense_bins.size > 0:
        # Bounds from dense bin edges
        lower_bound = bin_edges[dense_bins[0]]
        upper_bound = bin_edges[dense_bins[-1] + 1]
    else:
        # No bin is dense enough to anchor a bound, so keep the full range
        print(f"No histogram bin contains at least {min_bin_count} data points, so the full data range is used as bounds.")
        lower_bound, upper_bound = bin_edges[0], bin_edges[-1]
    # Count filtered points
    n_below_lower = np.sum(data < lower_bound)
    n_above_upper = np.sum(data > upper_bound)
    n_remaining = len(data) - n_below_lower - n_above_upper
    # Plot distribution with bounds (random subsample keeps plotting fast)
    random_selection = np.random.choice(data, size=min(100_000, len(data)), replace=False)
    plt.hist(random_selection, bins='auto')
    plt.axvline(lower_bound, color='red', linestyle='--', label=f'Lower: {lower_bound:.1f}')
    plt.axvline(upper_bound, color='red', linestyle='--', label=f'Upper: {upper_bound:.1f}')
    plt.title(title)
    plt.legend()
    plt.show()
    print(f"Histogram density bounds (tails only): [{lower_bound:.2f}, {upper_bound:.2f}]")
    print(f"Filtered below lower bound: {n_below_lower:,}")
    print(f"Filtered above upper bound: {n_above_upper:,}")
    print(f"Remaining points: {n_remaining:,} out of {len(data):,} ({100 * (1 - n_remaining / len(data)):.2f}% removed)")
    return lower_bound, upper_bound

# Filter targets with features

## GEDI elevation

In [None]:
# GEDI DTM (Digital Terrain Model) dataset preparation

# GEDI elev_lowestmode is WGS84 ellipsoidal height
# Copernicus DEM is EGM2008 orthometric height
# Geoid correction applied: H = h - N (ellipsoidal to orthometric)

# Outlier filtering compares Geoid corrected GEDI elevation to Copernicus DSM
# GEDI typically lower than DSM in forest (ground vs canopy surface)
# Extreme differences indicate measurement error or cloud contamination

# Download EGM2008 geoid model
earth_gravitational_model_url = 'https://download.agisoft.com/gtg/us_nga_egm2008_1.tif'
earth_gravitational_model_path = join(datasets_tar_dir, 'earth_gravitational_model.tif')
if not exists(earth_gravitational_model_path):
  # (connect, read) timeouts stop a stalled connection from hanging the cell
  request = requests.get(earth_gravitational_model_url, allow_redirects=True, timeout=(30, 300))
  # Fail before writing: otherwise an HTTP error page would be saved as the
  # .tif and, because the file then exists, never be downloaded again
  request.raise_for_status()
  with open(earth_gravitational_model_path, 'wb') as egm_file:
    egm_file.write(request.content)
  print(f'EGM raster downloaded to: {earth_gravitational_model_path}')
else: print(f'EGM raster already exists at: {earth_gravitational_model_path}')

# Select GEDI .pkl containing elev_lowestmode
for pkl in os.listdir(targets_final_dir):
    print(f"targets_pkl = '{pkl}'")

In [None]:
targets_pkl = 'GEDI04_A.pkl'

# Use the rectangular prediction area extent, instead of the potentially complex project area
use_prediction_area_polygon = True

# Sensitivity threshold. 0.98 is recommended for evergreen broadleaf tropical (EBT)
# strata tropical forests (following the aboveground biomass density algorithm theoretical
# basis document, Kellner et al., 2023).
sensitivity_threshold = 0.98

# Histogram-based outlier filtering
histogram_based_outlier_filtering = True
# Bins with fewer than this percentage of total data are considered sparse
sparse_threshold_percent = 0.01

# Load the targets and clip them to the selected area polygon
dataset_targets = pd.read_pickle(join(targets_final_dir, targets_pkl))
project_area_polygon = gpd.read_file(join(
    polygons_dir,
    'prediction_area.gpkg' if use_prediction_area_polygon else 'project_area.gpkg'))
geodataframe_targets = gpd.GeoDataFrame(dataset_targets, geometry='geometry')
geodataframe_targets_clipped = gpd.clip(geodataframe_targets, project_area_polygon)
dataset_targets_clipped = pd.DataFrame(geodataframe_targets_clipped)
print(f"{len(dataset_targets) - len(dataset_targets_clipped)} out of {len(dataset_targets)} data points were outside the prediction area and removed.")

# Remove rows with an NA in any column, then keep only the columns needed for elevation
dataset_na_values = dataset_targets_clipped.isna().any(axis=1).sum()
dataset_na_dropped = dataset_targets_clipped.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")
columns_to_keep = ['shot_number','beam','geometry','elev_lowestmode','sensitivity']
dataset_na_dropped = dataset_na_dropped[[col for col in columns_to_keep if col in dataset_na_dropped.columns]]

# Keep only shots at or above the sensitivity threshold
print(f"Minimum sensitivity before filtering: {dataset_na_dropped['sensitivity'].min():.4f}")
rows_before_sensitivity = len(dataset_na_dropped)
dataset_filtered_sensitivity = dataset_na_dropped.loc[
    dataset_na_dropped['sensitivity'] >= sensitivity_threshold
].reset_index(drop=True)
print(f"Sensitivity filtering: {rows_before_sensitivity:,} -> {len(dataset_filtered_sensitivity):,} rows ({rows_before_sensitivity - len(dataset_filtered_sensitivity):,} removed)")

# Point coordinates, reused for both raster samples below
elevation_geom_x = np.array([point.x for point in dataset_filtered_sensitivity['geometry']])
elevation_geom_y = np.array([point.y for point in dataset_filtered_sensitivity['geometry']])

# Sample the geoid raster and convert ellipsoidal to orthometric height (H = h - N)
sample_raster_values(dataset_filtered_sensitivity, earth_gravitational_model_path, elevation_geom_x, elevation_geom_y)
dataset_filtered_sensitivity['gedi_elevation'] = (
    dataset_filtered_sensitivity['elev_lowestmode']
    - dataset_filtered_sensitivity['earth_gravitational_model'])

# Sample the base DEM and compute the GEDI-minus-DEM elevation difference
base_dem_path = join(areas_dir, 'base_dem_dsm.tif')
sample_raster_values(dataset_filtered_sensitivity, base_dem_path, elevation_geom_x, elevation_geom_y)
dataset_filtered_sensitivity['gedi_elevation_diff'] = (
    dataset_filtered_sensitivity['gedi_elevation']
    - dataset_filtered_sensitivity['base_dem_dsm'])

# Histogram-based outlier filtering
if histogram_based_outlier_filtering:
  elevation_diff_lower_bound, elevation_diff_upper_bound = histogram_outlier_bounds(
      np.array(dataset_filtered_sensitivity['gedi_elevation_diff']),
      title="GEDI − Base DEM elevation",
      sparse_threshold_percent=sparse_threshold_percent)
else:
  # Filtering disabled: define unbounded limits so the bounds are never
  # undefined, or left over from an earlier run, when they are applied later
  elevation_diff_lower_bound, elevation_diff_upper_bound = -np.inf, np.inf

In [None]:
# Manual override if histogram bounds are unsuitable
override_bounds = False
if override_bounds:
    elevation_diff_lower_bound, elevation_diff_upper_bound = -50, 5

# Keep rows whose elevation difference lies within the bounds (inclusive at both ends)
elevation_diff_within_bounds = (
    (dataset_filtered_sensitivity['gedi_elevation_diff'] >= elevation_diff_lower_bound)
    & (dataset_filtered_sensitivity['gedi_elevation_diff'] <= elevation_diff_upper_bound)
)
dataset_filtered_bounds = dataset_filtered_sensitivity.loc[elevation_diff_within_bounds].reset_index(drop=True)
# Drop correction and filtering columns
dataset_filtered_bounds = dataset_filtered_bounds.drop(columns=[
    'elev_lowestmode',
    'earth_gravitational_model',
    'base_dem_dsm',
    'gedi_elevation_diff',
])

# Export to .pkl, then read the file back so the in-memory frame is what was written
dataset_targets_path = join(datasets_tar_dir, 'gedi_elevation.pkl')
dataset_filtered_bounds.to_pickle(dataset_targets_path)
dataset_filtered_bounds = pd.read_pickle(dataset_targets_path)
print(f"The GEDI elevation dataset has been processed and exported to: {dataset_targets_path}.")

In [None]:
# Clear datasets from memory
del (
    dataset_targets,
    geodataframe_targets,
    geodataframe_targets_clipped,
    dataset_targets_clipped,
    dataset_na_dropped,
    dataset_filtered_sensitivity,
    dataset_filtered_bounds,
)

## GEDI vegetation indices

In [None]:
# Select the GEDI .pkl with the desired vegetation index (e.g. 'agbd')
# Each entry is printed as a ready-to-paste assignment for the next cell
available_target_pkls = os.listdir(targets_final_dir)
for pkl in available_target_pkls:
    print(f"targets_pkl = '{pkl}'")

In [None]:
targets_pkl = 'GEDI04_A.pkl'
dataset_name = 'agbd'

# Use the rectangular prediction area extent, instead of the potentially complex project area
use_prediction_area_polygon = True

# Sensitivity threshold. 0.98 is recommended for evergreen broadleaf tropical (EBT)
# strata tropical forests (following the aboveground biomass density algorithm theoretical
# basis document, Kellner et al., 2023).
sensitivity_threshold = 0.98

# Load the targets and clip them to the selected area polygon
dataset_targets = pd.read_pickle(join(targets_final_dir, targets_pkl))
project_area_polygon = gpd.read_file(join(
    polygons_dir,
    'prediction_area.gpkg' if use_prediction_area_polygon else 'project_area.gpkg'))
geodataframe_targets = gpd.GeoDataFrame(dataset_targets, geometry='geometry')
geodataframe_targets_clipped = gpd.clip(geodataframe_targets, project_area_polygon)
dataset_targets_clipped = pd.DataFrame(geodataframe_targets_clipped)
print(f"{len(dataset_targets) - len(dataset_targets_clipped)} out of {len(dataset_targets)} data points were outside the prediction area and removed.")

# Remove rows with an NA in any column
dataset_na_values = dataset_targets_clipped.isna().any(axis=1).sum()
dataset_na_dropped = dataset_targets_clipped.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

# Keep only shots at or above the sensitivity threshold
print(f"Minimum sensitivity before filtering: {dataset_na_dropped['sensitivity'].min():.4f}")
rows_before_sensitivity = len(dataset_na_dropped)
dataset_filtered_sensitivity = dataset_na_dropped.loc[
    dataset_na_dropped['sensitivity'] >= sensitivity_threshold
].reset_index(drop=True)
print(f"Sensitivity filtering: {rows_before_sensitivity:,} -> {len(dataset_filtered_sensitivity):,} rows ({rows_before_sensitivity - len(dataset_filtered_sensitivity):,} removed)")

# 'year' is the first four characters of the timestamp's string form, as an integer
timestamp_text = dataset_filtered_sensitivity['timestamp'].astype(str)
dataset_filtered_sensitivity['year'] = timestamp_text.str[:4].astype(int)

# Drop unused columns
dataset_filtered_sensitivity = dataset_filtered_sensitivity.drop(
    columns=['elev_lowestmode', 'timestamp'])

In [None]:
# Filter with TMF data
# Filter non-forest and 'new changes' in the collection year
# TMF provides annual disturbance, sub-annual timing relative to GEDI collection is unknown
# Points with same-year land-cover changes excluded due to temporal ambiguity
# Predictions use the previous year's disturbance state (Dec 31st of year prior to prediction)
filter_with_tmf = True

# Distance threshold in metres for new disturbance filtering,
# accounting for GEDI footprint geolocation inaccuracy.
# 12.5 m radius + 10 m GEDI L4A v3 uncertainty + 12 m Landsat Collection 2 uncertainty.
edge_distance_threshold = 34.5

# Histogram-based outlier filtering
histogram_based_outlier_filtering = True
# Bins with fewer than this percentage of total data are considered sparse
sparse_threshold_percent = 0.01

if filter_with_tmf:
    dataset_filtered_tmf = dataset_filtered_sensitivity.copy()
    gedi_years_present = set(dataset_filtered_sensitivity['year'].unique().tolist())
    gedi_year_list = dataset_filtered_sensitivity['year'].unique().tolist()
    # The year before the earliest GEDI year is needed for the 'new change' comparisons
    gedi_year_list.append(min(gedi_year_list) - 1)
    print(f"There are {len(dataset_filtered_sensitivity)} data points in the unfiltered dataset.")

    # Sample relevant forest and disturbance rasters
    # If there are not (yet) TMF feature years matching the GEDI year then these GEDI rows are dropped.
    # This is because we cannot know for certain whether there was new deforestation or disturbance
    # that would misalign the vegetation metric with the features during training.
    vegetation_geom_x = np.array([g.x for g in dataset_filtered_tmf['geometry']])
    vegetation_geom_y = np.array([g.y for g in dataset_filtered_tmf['geometry']])
    # Rows are only dropped after all sampling is done: the coordinate arrays
    # must stay the same length as the dataframe while rasters are being sampled
    missing_tmf_years = set()
    for year in gedi_year_list:
        for feature_type in ['forest_edge_distance', 'disturbance_edge_distance']:
            feature_path = join(edge_effects_dir, f"{feature_type}_{year}.tif")
            if not exists(feature_path):
                print(f"{feature_type}_{year}.tif does not exist, so GEDI data from this year have been removed.")
                missing_tmf_years.add(year)
            else: sample_raster_values(dataset_filtered_tmf, feature_path, vegetation_geom_x, vegetation_geom_y)
    # The 'new change' filters compare each GEDI year with the previous year,
    # so years whose previous-year rasters are missing cannot be evaluated either
    years_missing_previous = ({year + 1 for year in missing_tmf_years} & gedi_years_present) - missing_tmf_years
    for year in sorted(years_missing_previous):
        print(f"TMF rasters for {year - 1} are missing, so GEDI data from {year} (which require the previous year's state) have been removed.")
    years_to_drop = list(missing_tmf_years | years_missing_previous)
    dataset_filtered_tmf = dataset_filtered_tmf[~dataset_filtered_tmf['year'].isin(years_to_drop)]
    print(f"{len(dataset_filtered_sensitivity) - len(dataset_filtered_tmf)} data points were dropped due to missing TMF years.")

    # Filter non-forest (negative edge_distance = outside forest class)
    # Each row is tested against the raster column of its own GEDI year
    gedi_years = dataset_filtered_tmf['year']
    is_non_forest = pd.Series(False, index=dataset_filtered_tmf.index)
    for year in gedi_years.unique():
        is_non_forest |= (gedi_years == year) & (dataset_filtered_tmf[f"forest_edge_distance_{year}"] < 0)
    indices_to_filter_non_forest = dataset_filtered_tmf.index[is_non_forest].tolist()
    dataset_filtered_tmf = dataset_filtered_tmf.drop(index=indices_to_filter_non_forest)
    print(f"{len(indices_to_filter_non_forest)} non-forest data points were dropped.")

    # Filter new disturbance within threshold distance of disturbance edge
    # Positive or small negative edge_distance = close to or inside disturbance
    # "New" = within the threshold in the GEDI year but not in the previous year
    gedi_years = dataset_filtered_tmf['year']
    is_new_disturbance = pd.Series(False, index=dataset_filtered_tmf.index)
    for year in gedi_years.unique():
        near_disturbance_now = dataset_filtered_tmf[f"disturbance_edge_distance_{year}"] >= -edge_distance_threshold
        far_from_disturbance_before = dataset_filtered_tmf[f"disturbance_edge_distance_{year - 1}"] < -edge_distance_threshold
        is_new_disturbance |= (gedi_years == year) & near_disturbance_now & far_from_disturbance_before
    indices_to_filter_new_disturbance = dataset_filtered_tmf.index[is_new_disturbance].tolist()
    dataset_filtered_tmf = dataset_filtered_tmf.drop(index=indices_to_filter_new_disturbance)
    print(f"{len(indices_to_filter_new_disturbance)} 'new disturbance' data points were dropped.")

    # Filter new forest edge effects within threshold distance of forest edge
    # Small positive edge_distance = near forest edge (interior side)
    gedi_years = dataset_filtered_tmf['year']
    is_new_forest_edge = pd.Series(False, index=dataset_filtered_tmf.index)
    for year in gedi_years.unique():
        near_edge_now = dataset_filtered_tmf[f"forest_edge_distance_{year}"] <= edge_distance_threshold
        far_from_edge_before = dataset_filtered_tmf[f"forest_edge_distance_{year - 1}"] > edge_distance_threshold
        is_new_forest_edge |= (gedi_years == year) & near_edge_now & far_from_edge_before
    indices_to_filter_new_forest_edge = dataset_filtered_tmf.index[is_new_forest_edge].tolist()
    dataset_filtered_tmf = dataset_filtered_tmf.drop(index=indices_to_filter_new_forest_edge)
    print(f"{len(indices_to_filter_new_forest_edge)} 'new forest edge' data points were dropped.")
    print(f"There are {len(dataset_filtered_tmf)} data points remaining in the filtered dataset.")

    # Drop filtering columns
    dataset_filtered_tmf = dataset_filtered_tmf.loc[:,~dataset_filtered_tmf.columns.str.contains(
        'forest_edge_distance|disturbance_edge_distance')].reset_index(drop=True)
else:
    # TMF filtering disabled: pass the sensitivity-filtered data through unchanged
    # so that the steps below still have a dataset to work on
    dataset_filtered_tmf = dataset_filtered_sensitivity.copy()

# Histogram-based outlier filtering
if histogram_based_outlier_filtering:
  vegetation_lower_bound, vegetation_upper_bound = histogram_outlier_bounds(
      np.array(dataset_filtered_tmf[dataset_name]),
      title=f"GEDI {dataset_name.upper()} distribution",
      sparse_threshold_percent=sparse_threshold_percent)
else:
  # Filtering disabled: unbounded limits keep every point when bounds are applied
  vegetation_lower_bound, vegetation_upper_bound = -np.inf, np.inf

In [None]:
# Manual override if histogram bounds are unsuitable
override_bounds = False
if override_bounds:
    vegetation_lower_bound, vegetation_upper_bound = 0, 800

# Apply bounds (inclusive at both ends)
vegetation_within_bounds = (
    (dataset_filtered_tmf[dataset_name] >= vegetation_lower_bound)
    & (dataset_filtered_tmf[dataset_name] <= vegetation_upper_bound)
)
dataset_filtered_bounds = dataset_filtered_tmf.loc[vegetation_within_bounds].reset_index(drop=True)

# Export to .pkl, then read the file back so the in-memory frame is what was written
dataset_targets_path = join(datasets_tar_dir, f'{dataset_name}.pkl')
dataset_filtered_bounds.to_pickle(dataset_targets_path)
dataset_filtered_bounds = pd.read_pickle(dataset_targets_path)
print(f"The GEDI {dataset_name.upper()} dataset has been processed and exported to: {dataset_targets_path}.")

In [None]:
# Clear datasets from memory after verification
del (
    dataset_targets,
    geodataframe_targets,
    geodataframe_targets_clipped,
    dataset_targets_clipped,
    dataset_na_dropped,
    dataset_filtered_sensitivity,
    dataset_filtered_tmf,
    dataset_filtered_bounds,
)

## Uploaded data

In [None]:
# Select the uploaded target dataset to compile
# Each entry is printed as a ready-to-paste assignment for the next cell
available_target_pkls = os.listdir(targets_final_dir)
for pkl in available_target_pkls:
    print(f"targets_pkl = '{pkl}'")

In [None]:
targets_pkl = 'user_upload.pkl'

# Name of target column for outlier filtering
target_column = 'target'

# Use the rectangular prediction area extent, instead of the potentially complex project area
use_prediction_area_polygon = True

# Histogram-based outlier filtering
histogram_based_outlier_filtering = True
# Bins with fewer than this percentage of total data are considered sparse
sparse_threshold_percent = 0.01

dataset_targets = pd.read_pickle(join(targets_final_dir, targets_pkl))

# Clip the targets to the selected area polygon
project_area_polygon = gpd.read_file(join(
    polygons_dir,
    'prediction_area.gpkg' if use_prediction_area_polygon else 'project_area.gpkg'))
geodataframe_targets = gpd.GeoDataFrame(dataset_targets, geometry='geometry')
geodataframe_targets_clipped = gpd.clip(geodataframe_targets, project_area_polygon)
dataset_targets_clipped = pd.DataFrame(geodataframe_targets_clipped)
print(f"{len(dataset_targets) - len(dataset_targets_clipped)} out of {len(dataset_targets)} data points were outside the prediction area and removed.")

# Remove rows with an NA in any column
dataset_na_values = dataset_targets_clipped.isna().any(axis=1).sum()
dataset_na_dropped = dataset_targets_clipped.dropna().reset_index(drop=True)
print(f"{dataset_na_values} data points had NA values and were removed.")

# Histogram-based outlier filtering: compute (and plot) bounds on the target column
if histogram_based_outlier_filtering:
    target_values = np.array(dataset_na_dropped[target_column])
    target_lower_bound, target_upper_bound = histogram_outlier_bounds(
        target_values,
        title=f"{target_column} distribution",
        sparse_threshold_percent=sparse_threshold_percent)

In [None]:
new_dataset_name = 'user_upload'

# Manual override if histogram bounds are unsuitable
override_bounds = False
if override_bounds:
    target_lower_bound, target_upper_bound = 0, 100

# Apply bounds (inclusive at both ends), only when outlier filtering is enabled
if not histogram_based_outlier_filtering:
    dataset_filtered_bounds = dataset_na_dropped
else:
    target_within_bounds = (
        (dataset_na_dropped[target_column] >= target_lower_bound)
        & (dataset_na_dropped[target_column] <= target_upper_bound)
    )
    dataset_filtered_bounds = dataset_na_dropped.loc[target_within_bounds].reset_index(drop=True)
    print(f"{len(dataset_filtered_bounds)} data points remaining after outlier filtering.")

# Export to .pkl, then read the file back so the in-memory frame is what was written
dataset_targets_path = join(datasets_tar_dir, f'{new_dataset_name}.pkl')
dataset_filtered_bounds.to_pickle(dataset_targets_path)
dataset_filtered_bounds = pd.read_pickle(dataset_targets_path)
print(f"The {new_dataset_name} dataset has been processed and exported to: {dataset_targets_path}.")

In [None]:
# Clear datasets from memory
del (
    dataset_targets,
    geodataframe_targets,
    geodataframe_targets_clipped,
    dataset_targets_clipped,
    dataset_na_dropped,
    dataset_filtered_bounds,
)

# Spatial joining

## Feature lists

In [None]:
# Select the dataset targets .pkl to add features
# Only .pkl files are listed, each as a ready-to-paste assignment
for pkl in os.listdir(datasets_tar_dir):
    if not pkl.endswith('.pkl'):
        continue
    print(f"dataset_targets_final_name = '{pkl}'")

In [None]:
dataset_targets_final_name = 'agbd.pkl'

# Modify this if experimenting with incompatible feature sets
# E.g. TMF or Alpha Earth
dataset_add_features_pkl_name = 'agbd_alpha_earth.pkl'

use_alpha_earth_features = True

# Select features to add to the dataset.
# NOTE FOR GEDI DTM:
# Land-cover more recent than the base DSM shouldn't be used, as it's intended to
# measure discrepancies between the base surface data and GEDI terrain data.
# In the case of Copernicus DEM, this is > 2015.
# NOTE FOR AGBD:
# Land-cover more recent than or the same year as the most recent GEDI data (e.g. 2024)
# will be removed at the finalisation stage.
final_feature_rasters = []

if use_alpha_earth_features:
    # Alpha Earth features replace all other feature sets
    final_feature_rasters.extend(
        join(alpha_earth_dir, feature) for feature in os.listdir(alpha_earth_dir))
else:
    # DSM topographic, DTM topographic, geographic and LCLUC continuous features;
    # each directory is optional and skipped when it does not exist
    for feature_dir in (topo_dsm_final_dir, topo_dtm_final_dir, geographic_final_dir, continuous_final_dir):
        if exists(feature_dir):
            final_feature_rasters.extend(
                join(feature_dir, feature) for feature in os.listdir(feature_dir))
    # LCLUC binary edge effect features, excluding 'land_' rasters, which are
    # only used for alternative scenarios (e.g. all land = forest)
    if exists(edge_effects_dir):
        final_feature_rasters.extend(
            join(edge_effects_dir, feature)
            for feature in os.listdir(edge_effects_dir)
            if 'land_' not in feature)

final_feature_rasters = sorted(final_feature_rasters)

# Print the features as a ready-to-paste list of '<parent directory>/<name without extension>' entries
print("feature_list = [")
for feature in final_feature_rasters:
    print(f"'{feature.split('/')[-2]}/{feature.split('/')[-1][:-4]}',")
print(']')

In [None]:
# Elevation corrected

feature_list = [
'binary_edge_effects/disturbance_edge_distance_1990',
'binary_edge_effects/disturbance_edge_distance_1991',
'binary_edge_effects/disturbance_edge_distance_1992',
'binary_edge_effects/disturbance_edge_distance_1993',
'binary_edge_effects/disturbance_edge_distance_1994',
'binary_edge_effects/disturbance_edge_distance_1995',
'binary_edge_effects/disturbance_edge_distance_1996',
'binary_edge_effects/disturbance_edge_distance_1997',
'binary_edge_effects/disturbance_edge_distance_1998',
'binary_edge_effects/disturbance_edge_distance_1999',
'binary_edge_effects/disturbance_edge_distance_2000',
'binary_edge_effects/disturbance_edge_distance_2001',
'binary_edge_effects/disturbance_edge_distance_2002',
'binary_edge_effects/disturbance_edge_distance_2003',
'binary_edge_effects/disturbance_edge_distance_2004',
'binary_edge_effects/disturbance_edge_distance_2005',
'binary_edge_effects/disturbance_edge_distance_2006',
'binary_edge_effects/disturbance_edge_distance_2007',
'binary_edge_effects/disturbance_edge_distance_2008',
'binary_edge_effects/disturbance_edge_distance_2009',
'binary_edge_effects/disturbance_edge_distance_2010',
'binary_edge_effects/disturbance_edge_distance_2011',
'binary_edge_effects/disturbance_edge_distance_2012',
'binary_edge_effects/disturbance_edge_distance_2013',
'binary_edge_effects/disturbance_edge_distance_2014',
'binary_edge_effects/disturbance_edge_distance_2015',
# 'binary_edge_effects/disturbance_edge_distance_2016',
# 'binary_edge_effects/disturbance_edge_distance_2017',
# 'binary_edge_effects/disturbance_edge_distance_2018',
# 'binary_edge_effects/disturbance_edge_distance_2019',
# 'binary_edge_effects/disturbance_edge_distance_2020',
# 'binary_edge_effects/disturbance_edge_distance_2021',
# 'binary_edge_effects/disturbance_edge_distance_2022',
# 'binary_edge_effects/disturbance_edge_distance_2023',
# 'binary_edge_effects/disturbance_edge_distance_2024',
'binary_edge_effects/disturbance_local_density_1990',
'binary_edge_effects/disturbance_local_density_1991',
'binary_edge_effects/disturbance_local_density_1992',
'binary_edge_effects/disturbance_local_density_1993',
'binary_edge_effects/disturbance_local_density_1994',
'binary_edge_effects/disturbance_local_density_1995',
'binary_edge_effects/disturbance_local_density_1996',
'binary_edge_effects/disturbance_local_density_1997',
'binary_edge_effects/disturbance_local_density_1998',
'binary_edge_effects/disturbance_local_density_1999',
'binary_edge_effects/disturbance_local_density_2000',
'binary_edge_effects/disturbance_local_density_2001',
'binary_edge_effects/disturbance_local_density_2002',
'binary_edge_effects/disturbance_local_density_2003',
'binary_edge_effects/disturbance_local_density_2004',
'binary_edge_effects/disturbance_local_density_2005',
'binary_edge_effects/disturbance_local_density_2006',
'binary_edge_effects/disturbance_local_density_2007',
'binary_edge_effects/disturbance_local_density_2008',
'binary_edge_effects/disturbance_local_density_2009',
'binary_edge_effects/disturbance_local_density_2010',
'binary_edge_effects/disturbance_local_density_2011',
'binary_edge_effects/disturbance_local_density_2012',
'binary_edge_effects/disturbance_local_density_2013',
'binary_edge_effects/disturbance_local_density_2014',
'binary_edge_effects/disturbance_local_density_2015',
# 'binary_edge_effects/disturbance_local_density_2016',
# 'binary_edge_effects/disturbance_local_density_2017',
# 'binary_edge_effects/disturbance_local_density_2018',
# 'binary_edge_effects/disturbance_local_density_2019',
# 'binary_edge_effects/disturbance_local_density_2020',
# 'binary_edge_effects/disturbance_local_density_2021',
# 'binary_edge_effects/disturbance_local_density_2022',
# 'binary_edge_effects/disturbance_local_density_2023',
# 'binary_edge_effects/disturbance_local_density_2024',
'binary_edge_effects/forest_edge_distance_1990',
# 'binary_edge_effects/forest_edge_distance_1991',
# 'binary_edge_effects/forest_edge_distance_1992',
# 'binary_edge_effects/forest_edge_distance_1993',
# 'binary_edge_effects/forest_edge_distance_1994',
# 'binary_edge_effects/forest_edge_distance_1995',
# 'binary_edge_effects/forest_edge_distance_1996',
# 'binary_edge_effects/forest_edge_distance_1997',
# 'binary_edge_effects/forest_edge_distance_1998',
# 'binary_edge_effects/forest_edge_distance_1999',
'binary_edge_effects/forest_edge_distance_2000',
# 'binary_edge_effects/forest_edge_distance_2001',
# 'binary_edge_effects/forest_edge_distance_2002',
# 'binary_edge_effects/forest_edge_distance_2003',
# 'binary_edge_effects/forest_edge_distance_2004',
# 'binary_edge_effects/forest_edge_distance_2005',
# 'binary_edge_effects/forest_edge_distance_2006',
# 'binary_edge_effects/forest_edge_distance_2007',
# 'binary_edge_effects/forest_edge_distance_2008',
# 'binary_edge_effects/forest_edge_distance_2009',
'binary_edge_effects/forest_edge_distance_2010',
'binary_edge_effects/forest_edge_distance_2011',
'binary_edge_effects/forest_edge_distance_2012',
'binary_edge_effects/forest_edge_distance_2013',
'binary_edge_effects/forest_edge_distance_2014',
'binary_edge_effects/forest_edge_distance_2015',
# 'binary_edge_effects/forest_edge_distance_2016',
# 'binary_edge_effects/forest_edge_distance_2017',
# 'binary_edge_effects/forest_edge_distance_2018',
# 'binary_edge_effects/forest_edge_distance_2019',
# 'binary_edge_effects/forest_edge_distance_2020',
# 'binary_edge_effects/forest_edge_distance_2021',
# 'binary_edge_effects/forest_edge_distance_2022',
# 'binary_edge_effects/forest_edge_distance_2023',
# 'binary_edge_effects/forest_edge_distance_2024',
'binary_edge_effects/forest_local_density_1990',
# 'binary_edge_effects/forest_local_density_1991',
# 'binary_edge_effects/forest_local_density_1992',
# 'binary_edge_effects/forest_local_density_1993',
# 'binary_edge_effects/forest_local_density_1994',
# 'binary_edge_effects/forest_local_density_1995',
# 'binary_edge_effects/forest_local_density_1996',
# 'binary_edge_effects/forest_local_density_1997',
# 'binary_edge_effects/forest_local_density_1998',
# 'binary_edge_effects/forest_local_density_1999',
'binary_edge_effects/forest_local_density_2000',
# 'binary_edge_effects/forest_local_density_2001',
# 'binary_edge_effects/forest_local_density_2002',
# 'binary_edge_effects/forest_local_density_2003',
# 'binary_edge_effects/forest_local_density_2004',
# 'binary_edge_effects/forest_local_density_2005',
# 'binary_edge_effects/forest_local_density_2006',
# 'binary_edge_effects/forest_local_density_2007',
# 'binary_edge_effects/forest_local_density_2008',
# 'binary_edge_effects/forest_local_density_2009',
'binary_edge_effects/forest_local_density_2010',
'binary_edge_effects/forest_local_density_2011',
'binary_edge_effects/forest_local_density_2012',
'binary_edge_effects/forest_local_density_2013',
'binary_edge_effects/forest_local_density_2014',
'binary_edge_effects/forest_local_density_2015',
# 'binary_edge_effects/forest_local_density_2016',
# 'binary_edge_effects/forest_local_density_2017',
# 'binary_edge_effects/forest_local_density_2018',
# 'binary_edge_effects/forest_local_density_2019',
# 'binary_edge_effects/forest_local_density_2020',
# 'binary_edge_effects/forest_local_density_2021',
# 'binary_edge_effects/forest_local_density_2022',
# 'binary_edge_effects/forest_local_density_2023',
# 'binary_edge_effects/forest_local_density_2024',
'binary_edge_effects/lu_ais_edge_distance',
# 'binary_edge_effects/lu_ais_local_density',
'binary_edge_effects/lu_berkelah_jerantut_edge_distance',
# 'binary_edge_effects/lu_berkelah_jerantut_local_density',
'binary_edge_effects/lu_berkelah_kuantan_edge_distance',
# 'binary_edge_effects/lu_berkelah_kuantan_local_density',
'binary_edge_effects/lu_berkelah_temerloh_edge_distance',
# 'binary_edge_effects/lu_berkelah_temerloh_local_density',
'binary_edge_effects/lu_old-growth_protected_areas_edge_distance',
# 'binary_edge_effects/lu_old-growth_protected_areas_local_density',
'binary_edge_effects/lu_remen_chereh_edge_distance',
# 'binary_edge_effects/lu_remen_chereh_local_density',
'binary_edge_effects/lu_tekai_tembeling_edge_distance',
# 'binary_edge_effects/lu_tekai_tembeling_local_density',
'binary_edge_effects/lu_tekam_edge_distance',
# 'binary_edge_effects/lu_tekam_local_density',
'binary_edge_effects/lu_yong_edge_distance',
'binary_edge_effects/lu_yong_lipis_edge_distance',
# 'binary_edge_effects/lu_yong_lipis_local_density',
# 'binary_edge_effects/lu_yong_local_density',
'geographic_final/coast_proximity_km',
'geographic_final/latitude',
'geographic_final/longitude',
'topo_dsm_final/topo_dsm_smooth_aspect_cosine',
'topo_dsm_final/topo_dsm_smooth_aspect_sine',
'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_03',
'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_07',
'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_11',
'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_03',
'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_07',
'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_11',
'topo_dsm_final/topo_dsm_smooth_eastness',
'topo_dsm_final/topo_dsm_smooth_elevation',
'topo_dsm_final/topo_dsm_smooth_northness',
'topo_dsm_final/topo_dsm_smooth_profile_curvature',
'topo_dsm_final/topo_dsm_smooth_roughness_03',
'topo_dsm_final/topo_dsm_smooth_roughness_07',
'topo_dsm_final/topo_dsm_smooth_roughness_11',
'topo_dsm_final/topo_dsm_smooth_slope',
'topo_dsm_final/topo_dsm_smooth_stream_power_index_log10',
'topo_dsm_final/topo_dsm_smooth_surface_area_ratio',
'topo_dsm_final/topo_dsm_smooth_tangential_curvature',
'topo_dsm_final/topo_dsm_smooth_topographic_position_index_03',
'topo_dsm_final/topo_dsm_smooth_topographic_position_index_07',
'topo_dsm_final/topo_dsm_smooth_topographic_position_index_11',
'topo_dsm_final/topo_dsm_smooth_topographic_ruggedness_index',
'topo_dsm_final/topo_dsm_smooth_topographic_wetness_index',
'topo_dsm_final/topo_dsm_unsmooth_aspect_cosine',
'topo_dsm_final/topo_dsm_unsmooth_aspect_sine',
'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_03',
'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_07',
'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_11',
'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_03',
'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_07',
'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_11',
'topo_dsm_final/topo_dsm_unsmooth_eastness',
'topo_dsm_final/topo_dsm_unsmooth_elevation',
'topo_dsm_final/topo_dsm_unsmooth_northness',
'topo_dsm_final/topo_dsm_unsmooth_profile_curvature',
'topo_dsm_final/topo_dsm_unsmooth_roughness_03',
'topo_dsm_final/topo_dsm_unsmooth_roughness_07',
'topo_dsm_final/topo_dsm_unsmooth_roughness_11',
'topo_dsm_final/topo_dsm_unsmooth_slope',
'topo_dsm_final/topo_dsm_unsmooth_stream_power_index_log10',
'topo_dsm_final/topo_dsm_unsmooth_surface_area_ratio',
'topo_dsm_final/topo_dsm_unsmooth_tangential_curvature',
'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_03',
'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_07',
'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_11',
'topo_dsm_final/topo_dsm_unsmooth_topographic_ruggedness_index',
'topo_dsm_final/topo_dsm_unsmooth_topographic_wetness_index',
]

In [None]:
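# # AGBD
# # Disabled alternative feature_list. Uncommenting this cell by one level
# # (removing one leading "# " from each line) enables it; entries that are
# # prefixed with "# # " remain commented out, i.e. excluded from the list.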

# feature_list = [
# 'binary_edge_effects/disturbance_edge_distance_1990',
# 'binary_edge_effects/disturbance_edge_distance_1991',
# 'binary_edge_effects/disturbance_edge_distance_1992',
# 'binary_edge_effects/disturbance_edge_distance_1993',
# 'binary_edge_effects/disturbance_edge_distance_1994',
# 'binary_edge_effects/disturbance_edge_distance_1995',
# 'binary_edge_effects/disturbance_edge_distance_1996',
# 'binary_edge_effects/disturbance_edge_distance_1997',
# 'binary_edge_effects/disturbance_edge_distance_1998',
# 'binary_edge_effects/disturbance_edge_distance_1999',
# 'binary_edge_effects/disturbance_edge_distance_2000',
# 'binary_edge_effects/disturbance_edge_distance_2001',
# 'binary_edge_effects/disturbance_edge_distance_2002',
# 'binary_edge_effects/disturbance_edge_distance_2003',
# 'binary_edge_effects/disturbance_edge_distance_2004',
# 'binary_edge_effects/disturbance_edge_distance_2005',
# 'binary_edge_effects/disturbance_edge_distance_2006',
# 'binary_edge_effects/disturbance_edge_distance_2007',
# 'binary_edge_effects/disturbance_edge_distance_2008',
# 'binary_edge_effects/disturbance_edge_distance_2009',
# 'binary_edge_effects/disturbance_edge_distance_2010',
# 'binary_edge_effects/disturbance_edge_distance_2011',
# 'binary_edge_effects/disturbance_edge_distance_2012',
# 'binary_edge_effects/disturbance_edge_distance_2013',
# 'binary_edge_effects/disturbance_edge_distance_2014',
# 'binary_edge_effects/disturbance_edge_distance_2015',
# 'binary_edge_effects/disturbance_edge_distance_2016',
# 'binary_edge_effects/disturbance_edge_distance_2017',
# 'binary_edge_effects/disturbance_edge_distance_2018',
# 'binary_edge_effects/disturbance_edge_distance_2019',
# 'binary_edge_effects/disturbance_edge_distance_2020',
# 'binary_edge_effects/disturbance_edge_distance_2021',
# 'binary_edge_effects/disturbance_edge_distance_2022',
# 'binary_edge_effects/disturbance_edge_distance_2023',
# 'binary_edge_effects/disturbance_edge_distance_2024',
# 'binary_edge_effects/disturbance_local_density_1990',
# 'binary_edge_effects/disturbance_local_density_1991',
# 'binary_edge_effects/disturbance_local_density_1992',
# 'binary_edge_effects/disturbance_local_density_1993',
# 'binary_edge_effects/disturbance_local_density_1994',
# 'binary_edge_effects/disturbance_local_density_1995',
# 'binary_edge_effects/disturbance_local_density_1996',
# 'binary_edge_effects/disturbance_local_density_1997',
# 'binary_edge_effects/disturbance_local_density_1998',
# 'binary_edge_effects/disturbance_local_density_1999',
# 'binary_edge_effects/disturbance_local_density_2000',
# 'binary_edge_effects/disturbance_local_density_2001',
# 'binary_edge_effects/disturbance_local_density_2002',
# 'binary_edge_effects/disturbance_local_density_2003',
# 'binary_edge_effects/disturbance_local_density_2004',
# 'binary_edge_effects/disturbance_local_density_2005',
# 'binary_edge_effects/disturbance_local_density_2006',
# 'binary_edge_effects/disturbance_local_density_2007',
# 'binary_edge_effects/disturbance_local_density_2008',
# 'binary_edge_effects/disturbance_local_density_2009',
# 'binary_edge_effects/disturbance_local_density_2010',
# 'binary_edge_effects/disturbance_local_density_2011',
# 'binary_edge_effects/disturbance_local_density_2012',
# 'binary_edge_effects/disturbance_local_density_2013',
# 'binary_edge_effects/disturbance_local_density_2014',
# 'binary_edge_effects/disturbance_local_density_2015',
# 'binary_edge_effects/disturbance_local_density_2016',
# 'binary_edge_effects/disturbance_local_density_2017',
# 'binary_edge_effects/disturbance_local_density_2018',
# 'binary_edge_effects/disturbance_local_density_2019',
# 'binary_edge_effects/disturbance_local_density_2020',
# 'binary_edge_effects/disturbance_local_density_2021',
# 'binary_edge_effects/disturbance_local_density_2022',
# 'binary_edge_effects/disturbance_local_density_2023',
# 'binary_edge_effects/disturbance_local_density_2024',
# 'binary_edge_effects/forest_edge_distance_1990',
# 'binary_edge_effects/forest_edge_distance_1991',
# 'binary_edge_effects/forest_edge_distance_1992',
# 'binary_edge_effects/forest_edge_distance_1993',
# 'binary_edge_effects/forest_edge_distance_1994',
# 'binary_edge_effects/forest_edge_distance_1995',
# 'binary_edge_effects/forest_edge_distance_1996',
# 'binary_edge_effects/forest_edge_distance_1997',
# 'binary_edge_effects/forest_edge_distance_1998',
# 'binary_edge_effects/forest_edge_distance_1999',
# 'binary_edge_effects/forest_edge_distance_2000',
# 'binary_edge_effects/forest_edge_distance_2001',
# 'binary_edge_effects/forest_edge_distance_2002',
# 'binary_edge_effects/forest_edge_distance_2003',
# 'binary_edge_effects/forest_edge_distance_2004',
# 'binary_edge_effects/forest_edge_distance_2005',
# 'binary_edge_effects/forest_edge_distance_2006',
# 'binary_edge_effects/forest_edge_distance_2007',
# 'binary_edge_effects/forest_edge_distance_2008',
# 'binary_edge_effects/forest_edge_distance_2009',
# 'binary_edge_effects/forest_edge_distance_2010',
# 'binary_edge_effects/forest_edge_distance_2011',
# 'binary_edge_effects/forest_edge_distance_2012',
# 'binary_edge_effects/forest_edge_distance_2013',
# 'binary_edge_effects/forest_edge_distance_2014',
# 'binary_edge_effects/forest_edge_distance_2015',
# 'binary_edge_effects/forest_edge_distance_2016',
# 'binary_edge_effects/forest_edge_distance_2017',
# 'binary_edge_effects/forest_edge_distance_2018',
# 'binary_edge_effects/forest_edge_distance_2019',
# 'binary_edge_effects/forest_edge_distance_2020',
# 'binary_edge_effects/forest_edge_distance_2021',
# 'binary_edge_effects/forest_edge_distance_2022',
# 'binary_edge_effects/forest_edge_distance_2023',
# 'binary_edge_effects/forest_edge_distance_2024',
# 'binary_edge_effects/forest_local_density_1990',
# 'binary_edge_effects/forest_local_density_1991',
# 'binary_edge_effects/forest_local_density_1992',
# 'binary_edge_effects/forest_local_density_1993',
# 'binary_edge_effects/forest_local_density_1994',
# 'binary_edge_effects/forest_local_density_1995',
# 'binary_edge_effects/forest_local_density_1996',
# 'binary_edge_effects/forest_local_density_1997',
# 'binary_edge_effects/forest_local_density_1998',
# 'binary_edge_effects/forest_local_density_1999',
# 'binary_edge_effects/forest_local_density_2000',
# 'binary_edge_effects/forest_local_density_2001',
# 'binary_edge_effects/forest_local_density_2002',
# 'binary_edge_effects/forest_local_density_2003',
# 'binary_edge_effects/forest_local_density_2004',
# 'binary_edge_effects/forest_local_density_2005',
# 'binary_edge_effects/forest_local_density_2006',
# 'binary_edge_effects/forest_local_density_2007',
# 'binary_edge_effects/forest_local_density_2008',
# 'binary_edge_effects/forest_local_density_2009',
# 'binary_edge_effects/forest_local_density_2010',
# 'binary_edge_effects/forest_local_density_2011',
# 'binary_edge_effects/forest_local_density_2012',
# 'binary_edge_effects/forest_local_density_2013',
# 'binary_edge_effects/forest_local_density_2014',
# 'binary_edge_effects/forest_local_density_2015',
# 'binary_edge_effects/forest_local_density_2016',
# 'binary_edge_effects/forest_local_density_2017',
# 'binary_edge_effects/forest_local_density_2018',
# 'binary_edge_effects/forest_local_density_2019',
# 'binary_edge_effects/forest_local_density_2020',
# 'binary_edge_effects/forest_local_density_2021',
# 'binary_edge_effects/forest_local_density_2022',
# 'binary_edge_effects/forest_local_density_2023',
# 'binary_edge_effects/forest_local_density_2024',
# 'binary_edge_effects/lu_ais_edge_distance',
# # 'binary_edge_effects/lu_ais_local_density',
# 'binary_edge_effects/lu_berkelah_jerantut_edge_distance',
# # 'binary_edge_effects/lu_berkelah_jerantut_local_density',
# 'binary_edge_effects/lu_berkelah_kuantan_edge_distance',
# # 'binary_edge_effects/lu_berkelah_kuantan_local_density',
# 'binary_edge_effects/lu_berkelah_temerloh_edge_distance',
# # 'binary_edge_effects/lu_berkelah_temerloh_local_density',
# 'binary_edge_effects/lu_old-growth_protected_areas_edge_distance',
# # 'binary_edge_effects/lu_old-growth_protected_areas_local_density',
# 'binary_edge_effects/lu_remen_chereh_edge_distance',
# # 'binary_edge_effects/lu_remen_chereh_local_density',
# 'binary_edge_effects/lu_tekai_tembeling_edge_distance',
# # 'binary_edge_effects/lu_tekai_tembeling_local_density',
# 'binary_edge_effects/lu_tekam_edge_distance',
# # 'binary_edge_effects/lu_tekam_local_density',
# 'binary_edge_effects/lu_yong_edge_distance',
# 'binary_edge_effects/lu_yong_lipis_edge_distance',
# # 'binary_edge_effects/lu_yong_lipis_local_density',
# # 'binary_edge_effects/lu_yong_local_density',
# 'geographic_final/coast_proximity_km',
# 'geographic_final/latitude',
# 'geographic_final/longitude',
# # 'topo_dsm_final/topo_dsm_smooth_aspect_cosine',
# # 'topo_dsm_final/topo_dsm_smooth_aspect_sine',
# # 'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_03',
# # 'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_07',
# # 'topo_dsm_final/topo_dsm_smooth_circular_variance_aspect_11',
# # 'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_03',
# # 'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_07',
# # 'topo_dsm_final/topo_dsm_smooth_deviation_mean_elevation_11',
# # 'topo_dsm_final/topo_dsm_smooth_eastness',
# # 'topo_dsm_final/topo_dsm_smooth_elevation',
# # 'topo_dsm_final/topo_dsm_smooth_northness',
# # 'topo_dsm_final/topo_dsm_smooth_profile_curvature',
# # 'topo_dsm_final/topo_dsm_smooth_roughness_03',
# # 'topo_dsm_final/topo_dsm_smooth_roughness_07',
# # 'topo_dsm_final/topo_dsm_smooth_roughness_11',
# # 'topo_dsm_final/topo_dsm_smooth_slope',
# # 'topo_dsm_final/topo_dsm_smooth_stream_power_index_log10',
# # 'topo_dsm_final/topo_dsm_smooth_surface_area_ratio',
# # 'topo_dsm_final/topo_dsm_smooth_tangential_curvature',
# # 'topo_dsm_final/topo_dsm_smooth_topographic_position_index_03',
# # 'topo_dsm_final/topo_dsm_smooth_topographic_position_index_07',
# # 'topo_dsm_final/topo_dsm_smooth_topographic_position_index_11',
# # 'topo_dsm_final/topo_dsm_smooth_topographic_ruggedness_index',
# # 'topo_dsm_final/topo_dsm_smooth_topographic_wetness_index',
# # 'topo_dsm_final/topo_dsm_unsmooth_aspect_cosine',
# # 'topo_dsm_final/topo_dsm_unsmooth_aspect_sine',
# # 'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_03',
# # 'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_07',
# # 'topo_dsm_final/topo_dsm_unsmooth_circular_variance_aspect_11',
# # 'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_03',
# # 'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_07',
# # 'topo_dsm_final/topo_dsm_unsmooth_deviation_mean_elevation_11',
# # 'topo_dsm_final/topo_dsm_unsmooth_eastness',
# # 'topo_dsm_final/topo_dsm_unsmooth_elevation',
# # 'topo_dsm_final/topo_dsm_unsmooth_northness',
# # 'topo_dsm_final/topo_dsm_unsmooth_profile_curvature',
# # 'topo_dsm_final/topo_dsm_unsmooth_roughness_03',
# # 'topo_dsm_final/topo_dsm_unsmooth_roughness_07',
# # 'topo_dsm_final/topo_dsm_unsmooth_roughness_11',
# # 'topo_dsm_final/topo_dsm_unsmooth_slope',
# # 'topo_dsm_final/topo_dsm_unsmooth_stream_power_index_log10',
# # 'topo_dsm_final/topo_dsm_unsmooth_surface_area_ratio',
# # 'topo_dsm_final/topo_dsm_unsmooth_tangential_curvature',
# # 'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_03',
# # 'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_07',
# # 'topo_dsm_final/topo_dsm_unsmooth_topographic_position_index_11',
# # 'topo_dsm_final/topo_dsm_unsmooth_topographic_ruggedness_index',
# # 'topo_dsm_final/topo_dsm_unsmooth_topographic_wetness_index',
# 'topo_dtm_final/topo_dtm_smooth_aspect_cosine',
# 'topo_dtm_final/topo_dtm_smooth_aspect_sine',
# 'topo_dtm_final/topo_dtm_smooth_circular_variance_aspect_03',
# 'topo_dtm_final/topo_dtm_smooth_circular_variance_aspect_07',
# 'topo_dtm_final/topo_dtm_smooth_circular_variance_aspect_11',
# 'topo_dtm_final/topo_dtm_smooth_deviation_mean_elevation_03',
# 'topo_dtm_final/topo_dtm_smooth_deviation_mean_elevation_07',
# 'topo_dtm_final/topo_dtm_smooth_deviation_mean_elevation_11',
# 'topo_dtm_final/topo_dtm_smooth_eastness',
# 'topo_dtm_final/topo_dtm_smooth_elevation',
# 'topo_dtm_final/topo_dtm_smooth_northness',
# 'topo_dtm_final/topo_dtm_smooth_profile_curvature',
# 'topo_dtm_final/topo_dtm_smooth_roughness_03',
# 'topo_dtm_final/topo_dtm_smooth_roughness_07',
# 'topo_dtm_final/topo_dtm_smooth_roughness_11',
# 'topo_dtm_final/topo_dtm_smooth_slope',
# 'topo_dtm_final/topo_dtm_smooth_stream_power_index_log10',
# 'topo_dtm_final/topo_dtm_smooth_surface_area_ratio',
# 'topo_dtm_final/topo_dtm_smooth_tangential_curvature',
# 'topo_dtm_final/topo_dtm_smooth_topographic_position_index_03',
# 'topo_dtm_final/topo_dtm_smooth_topographic_position_index_07',
# 'topo_dtm_final/topo_dtm_smooth_topographic_position_index_11',
# 'topo_dtm_final/topo_dtm_smooth_topographic_ruggedness_index',
# 'topo_dtm_final/topo_dtm_smooth_topographic_wetness_index',
# 'topo_dtm_final/topo_dtm_unsmooth_aspect_cosine',
# 'topo_dtm_final/topo_dtm_unsmooth_aspect_sine',
# 'topo_dtm_final/topo_dtm_unsmooth_circular_variance_aspect_03',
# 'topo_dtm_final/topo_dtm_unsmooth_circular_variance_aspect_07',
# 'topo_dtm_final/topo_dtm_unsmooth_circular_variance_aspect_11',
# 'topo_dtm_final/topo_dtm_unsmooth_deviation_mean_elevation_03',
# 'topo_dtm_final/topo_dtm_unsmooth_deviation_mean_elevation_07',
# 'topo_dtm_final/topo_dtm_unsmooth_deviation_mean_elevation_11',
# 'topo_dtm_final/topo_dtm_unsmooth_eastness',
# 'topo_dtm_final/topo_dtm_unsmooth_elevation',
# 'topo_dtm_final/topo_dtm_unsmooth_northness',
# 'topo_dtm_final/topo_dtm_unsmooth_profile_curvature',
# 'topo_dtm_final/topo_dtm_unsmooth_roughness_03',
# 'topo_dtm_final/topo_dtm_unsmooth_roughness_07',
# 'topo_dtm_final/topo_dtm_unsmooth_roughness_11',
# 'topo_dtm_final/topo_dtm_unsmooth_slope',
# 'topo_dtm_final/topo_dtm_unsmooth_stream_power_index_log10',
# 'topo_dtm_final/topo_dtm_unsmooth_surface_area_ratio',
# 'topo_dtm_final/topo_dtm_unsmooth_tangential_curvature',
# 'topo_dtm_final/topo_dtm_unsmooth_topographic_position_index_03',
# 'topo_dtm_final/topo_dtm_unsmooth_topographic_position_index_07',
# 'topo_dtm_final/topo_dtm_unsmooth_topographic_position_index_11',
# 'topo_dtm_final/topo_dtm_unsmooth_topographic_ruggedness_index',
# 'topo_dtm_final/topo_dtm_unsmooth_topographic_wetness_index',
# ]

In [None]:
# # Alpha Earth

# feature_list = [
# 'alpha_earth/alpha_earth_A00_2017',
# 'alpha_earth/alpha_earth_A00_2018',
# 'alpha_earth/alpha_earth_A00_2019',
# 'alpha_earth/alpha_earth_A00_2020',
# 'alpha_earth/alpha_earth_A00_2021',
# 'alpha_earth/alpha_earth_A00_2022',
# 'alpha_earth/alpha_earth_A00_2023',
# 'alpha_earth/alpha_earth_A00_2024',
# 'alpha_earth/alpha_earth_A01_2017',
# 'alpha_earth/alpha_earth_A01_2018',
# 'alpha_earth/alpha_earth_A01_2019',
# 'alpha_earth/alpha_earth_A01_2020',
# 'alpha_earth/alpha_earth_A01_2021',
# 'alpha_earth/alpha_earth_A01_2022',
# 'alpha_earth/alpha_earth_A01_2023',
# 'alpha_earth/alpha_earth_A01_2024',
# 'alpha_earth/alpha_earth_A02_2017',
# 'alpha_earth/alpha_earth_A02_2018',
# 'alpha_earth/alpha_earth_A02_2019',
# 'alpha_earth/alpha_earth_A02_2020',
# 'alpha_earth/alpha_earth_A02_2021',
# 'alpha_earth/alpha_earth_A02_2022',
# 'alpha_earth/alpha_earth_A02_2023',
# 'alpha_earth/alpha_earth_A02_2024',
# 'alpha_earth/alpha_earth_A03_2017',
# 'alpha_earth/alpha_earth_A03_2018',
# 'alpha_earth/alpha_earth_A03_2019',
# 'alpha_earth/alpha_earth_A03_2020',
# 'alpha_earth/alpha_earth_A03_2021',
# 'alpha_earth/alpha_earth_A03_2022',
# 'alpha_earth/alpha_earth_A03_2023',
# 'alpha_earth/alpha_earth_A03_2024',
# 'alpha_earth/alpha_earth_A04_2017',
# 'alpha_earth/alpha_earth_A04_2018',
# 'alpha_earth/alpha_earth_A04_2019',
# 'alpha_earth/alpha_earth_A04_2020',
# 'alpha_earth/alpha_earth_A04_2021',
# 'alpha_earth/alpha_earth_A04_2022',
# 'alpha_earth/alpha_earth_A04_2023',
# 'alpha_earth/alpha_earth_A04_2024',
# 'alpha_earth/alpha_earth_A05_2017',
# 'alpha_earth/alpha_earth_A05_2018',
# 'alpha_earth/alpha_earth_A05_2019',
# 'alpha_earth/alpha_earth_A05_2020',
# 'alpha_earth/alpha_earth_A05_2021',
# 'alpha_earth/alpha_earth_A05_2022',
# 'alpha_earth/alpha_earth_A05_2023',
# 'alpha_earth/alpha_earth_A05_2024',
# 'alpha_earth/alpha_earth_A06_2017',
# 'alpha_earth/alpha_earth_A06_2018',
# 'alpha_earth/alpha_earth_A06_2019',
# 'alpha_earth/alpha_earth_A06_2020',
# 'alpha_earth/alpha_earth_A06_2021',
# 'alpha_earth/alpha_earth_A06_2022',
# 'alpha_earth/alpha_earth_A06_2023',
# 'alpha_earth/alpha_earth_A06_2024',
# 'alpha_earth/alpha_earth_A07_2017',
# 'alpha_earth/alpha_earth_A07_2018',
# 'alpha_earth/alpha_earth_A07_2019',
# 'alpha_earth/alpha_earth_A07_2020',
# 'alpha_earth/alpha_earth_A07_2021',
# 'alpha_earth/alpha_earth_A07_2022',
# 'alpha_earth/alpha_earth_A07_2023',
# 'alpha_earth/alpha_earth_A07_2024',
# 'alpha_earth/alpha_earth_A08_2017',
# 'alpha_earth/alpha_earth_A08_2018',
# 'alpha_earth/alpha_earth_A08_2019',
# 'alpha_earth/alpha_earth_A08_2020',
# 'alpha_earth/alpha_earth_A08_2021',
# 'alpha_earth/alpha_earth_A08_2022',
# 'alpha_earth/alpha_earth_A08_2023',
# 'alpha_earth/alpha_earth_A08_2024',
# 'alpha_earth/alpha_earth_A09_2017',
# 'alpha_earth/alpha_earth_A09_2018',
# 'alpha_earth/alpha_earth_A09_2019',
# 'alpha_earth/alpha_earth_A09_2020',
# 'alpha_earth/alpha_earth_A09_2021',
# 'alpha_earth/alpha_earth_A09_2022',
# 'alpha_earth/alpha_earth_A09_2023',
# 'alpha_earth/alpha_earth_A09_2024',
# 'alpha_earth/alpha_earth_A10_2017',
# 'alpha_earth/alpha_earth_A10_2018',
# 'alpha_earth/alpha_earth_A10_2019',
# 'alpha_earth/alpha_earth_A10_2020',
# 'alpha_earth/alpha_earth_A10_2021',
# 'alpha_earth/alpha_earth_A10_2022',
# 'alpha_earth/alpha_earth_A10_2023',
# 'alpha_earth/alpha_earth_A10_2024',
# 'alpha_earth/alpha_earth_A11_2017',
# 'alpha_earth/alpha_earth_A11_2018',
# 'alpha_earth/alpha_earth_A11_2019',
# 'alpha_earth/alpha_earth_A11_2020',
# 'alpha_earth/alpha_earth_A11_2021',
# 'alpha_earth/alpha_earth_A11_2022',
# 'alpha_earth/alpha_earth_A11_2023',
# 'alpha_earth/alpha_earth_A11_2024',
# 'alpha_earth/alpha_earth_A12_2017',
# 'alpha_earth/alpha_earth_A12_2018',
# 'alpha_earth/alpha_earth_A12_2019',
# 'alpha_earth/alpha_earth_A12_2020',
# 'alpha_earth/alpha_earth_A12_2021',
# 'alpha_earth/alpha_earth_A12_2022',
# 'alpha_earth/alpha_earth_A12_2023',
# 'alpha_earth/alpha_earth_A12_2024',
# 'alpha_earth/alpha_earth_A13_2017',
# 'alpha_earth/alpha_earth_A13_2018',
# 'alpha_earth/alpha_earth_A13_2019',
# 'alpha_earth/alpha_earth_A13_2020',
# 'alpha_earth/alpha_earth_A13_2021',
# 'alpha_earth/alpha_earth_A13_2022',
# 'alpha_earth/alpha_earth_A13_2023',
# 'alpha_earth/alpha_earth_A13_2024',
# 'alpha_earth/alpha_earth_A14_2017',
# 'alpha_earth/alpha_earth_A14_2018',
# 'alpha_earth/alpha_earth_A14_2019',
# 'alpha_earth/alpha_earth_A14_2020',
# 'alpha_earth/alpha_earth_A14_2021',
# 'alpha_earth/alpha_earth_A14_2022',
# 'alpha_earth/alpha_earth_A14_2023',
# 'alpha_earth/alpha_earth_A14_2024',
# 'alpha_earth/alpha_earth_A15_2017',
# 'alpha_earth/alpha_earth_A15_2018',
# 'alpha_earth/alpha_earth_A15_2019',
# 'alpha_earth/alpha_earth_A15_2020',
# 'alpha_earth/alpha_earth_A15_2021',
# 'alpha_earth/alpha_earth_A15_2022',
# 'alpha_earth/alpha_earth_A15_2023',
# 'alpha_earth/alpha_earth_A15_2024',
# 'alpha_earth/alpha_earth_A16_2017',
# 'alpha_earth/alpha_earth_A16_2018',
# 'alpha_earth/alpha_earth_A16_2019',
# 'alpha_earth/alpha_earth_A16_2020',
# 'alpha_earth/alpha_earth_A16_2021',
# 'alpha_earth/alpha_earth_A16_2022',
# 'alpha_earth/alpha_earth_A16_2023',
# 'alpha_earth/alpha_earth_A16_2024',
# 'alpha_earth/alpha_earth_A17_2017',
# 'alpha_earth/alpha_earth_A17_2018',
# 'alpha_earth/alpha_earth_A17_2019',
# 'alpha_earth/alpha_earth_A17_2020',
# 'alpha_earth/alpha_earth_A17_2021',
# 'alpha_earth/alpha_earth_A17_2022',
# 'alpha_earth/alpha_earth_A17_2023',
# 'alpha_earth/alpha_earth_A17_2024',
# 'alpha_earth/alpha_earth_A18_2017',
# 'alpha_earth/alpha_earth_A18_2018',
# 'alpha_earth/alpha_earth_A18_2019',
# 'alpha_earth/alpha_earth_A18_2020',
# 'alpha_earth/alpha_earth_A18_2021',
# 'alpha_earth/alpha_earth_A18_2022',
# 'alpha_earth/alpha_earth_A18_2023',
# 'alpha_earth/alpha_earth_A18_2024',
# 'alpha_earth/alpha_earth_A19_2017',
# 'alpha_earth/alpha_earth_A19_2018',
# 'alpha_earth/alpha_earth_A19_2019',
# 'alpha_earth/alpha_earth_A19_2020',
# 'alpha_earth/alpha_earth_A19_2021',
# 'alpha_earth/alpha_earth_A19_2022',
# 'alpha_earth/alpha_earth_A19_2023',
# 'alpha_earth/alpha_earth_A19_2024',
# 'alpha_earth/alpha_earth_A20_2017',
# 'alpha_earth/alpha_earth_A20_2018',
# 'alpha_earth/alpha_earth_A20_2019',
# 'alpha_earth/alpha_earth_A20_2020',
# 'alpha_earth/alpha_earth_A20_2021',
# 'alpha_earth/alpha_earth_A20_2022',
# 'alpha_earth/alpha_earth_A20_2023',
# 'alpha_earth/alpha_earth_A20_2024',
# 'alpha_earth/alpha_earth_A21_2017',
# 'alpha_earth/alpha_earth_A21_2018',
# 'alpha_earth/alpha_earth_A21_2019',
# 'alpha_earth/alpha_earth_A21_2020',
# 'alpha_earth/alpha_earth_A21_2021',
# 'alpha_earth/alpha_earth_A21_2022',
# 'alpha_earth/alpha_earth_A21_2023',
# 'alpha_earth/alpha_earth_A21_2024',
# 'alpha_earth/alpha_earth_A22_2017',
# 'alpha_earth/alpha_earth_A22_2018',
# 'alpha_earth/alpha_earth_A22_2019',
# 'alpha_earth/alpha_earth_A22_2020',
# 'alpha_earth/alpha_earth_A22_2021',
# 'alpha_earth/alpha_earth_A22_2022',
# 'alpha_earth/alpha_earth_A22_2023',
# 'alpha_earth/alpha_earth_A22_2024',
# 'alpha_earth/alpha_earth_A23_2017',
# 'alpha_earth/alpha_earth_A23_2018',
# 'alpha_earth/alpha_earth_A23_2019',
# 'alpha_earth/alpha_earth_A23_2020',
# 'alpha_earth/alpha_earth_A23_2021',
# 'alpha_earth/alpha_earth_A23_2022',
# 'alpha_earth/alpha_earth_A23_2023',
# 'alpha_earth/alpha_earth_A23_2024',
# 'alpha_earth/alpha_earth_A24_2017',
# 'alpha_earth/alpha_earth_A24_2018',
# 'alpha_earth/alpha_earth_A24_2019',
# 'alpha_earth/alpha_earth_A24_2020',
# 'alpha_earth/alpha_earth_A24_2021',
# 'alpha_earth/alpha_earth_A24_2022',
# 'alpha_earth/alpha_earth_A24_2023',
# 'alpha_earth/alpha_earth_A24_2024',
# 'alpha_earth/alpha_earth_A25_2017',
# 'alpha_earth/alpha_earth_A25_2018',
# 'alpha_earth/alpha_earth_A25_2019',
# 'alpha_earth/alpha_earth_A25_2020',
# 'alpha_earth/alpha_earth_A25_2021',
# 'alpha_earth/alpha_earth_A25_2022',
# 'alpha_earth/alpha_earth_A25_2023',
# 'alpha_earth/alpha_earth_A25_2024',
# 'alpha_earth/alpha_earth_A26_2017',
# 'alpha_earth/alpha_earth_A26_2018',
# 'alpha_earth/alpha_earth_A26_2019',
# 'alpha_earth/alpha_earth_A26_2020',
# 'alpha_earth/alpha_earth_A26_2021',
# 'alpha_earth/alpha_earth_A26_2022',
# 'alpha_earth/alpha_earth_A26_2023',
# 'alpha_earth/alpha_earth_A26_2024',
# 'alpha_earth/alpha_earth_A27_2017',
# 'alpha_earth/alpha_earth_A27_2018',
# 'alpha_earth/alpha_earth_A27_2019',
# 'alpha_earth/alpha_earth_A27_2020',
# 'alpha_earth/alpha_earth_A27_2021',
# 'alpha_earth/alpha_earth_A27_2022',
# 'alpha_earth/alpha_earth_A27_2023',
# 'alpha_earth/alpha_earth_A27_2024',
# 'alpha_earth/alpha_earth_A28_2017',
# 'alpha_earth/alpha_earth_A28_2018',
# 'alpha_earth/alpha_earth_A28_2019',
# 'alpha_earth/alpha_earth_A28_2020',
# 'alpha_earth/alpha_earth_A28_2021',
# 'alpha_earth/alpha_earth_A28_2022',
# 'alpha_earth/alpha_earth_A28_2023',
# 'alpha_earth/alpha_earth_A28_2024',
# 'alpha_earth/alpha_earth_A29_2017',
# 'alpha_earth/alpha_earth_A29_2018',
# 'alpha_earth/alpha_earth_A29_2019',
# 'alpha_earth/alpha_earth_A29_2020',
# 'alpha_earth/alpha_earth_A29_2021',
# 'alpha_earth/alpha_earth_A29_2022',
# 'alpha_earth/alpha_earth_A29_2023',
# 'alpha_earth/alpha_earth_A29_2024',
# 'alpha_earth/alpha_earth_A30_2017',
# 'alpha_earth/alpha_earth_A30_2018',
# 'alpha_earth/alpha_earth_A30_2019',
# 'alpha_earth/alpha_earth_A30_2020',
# 'alpha_earth/alpha_earth_A30_2021',
# 'alpha_earth/alpha_earth_A30_2022',
# 'alpha_earth/alpha_earth_A30_2023',
# 'alpha_earth/alpha_earth_A30_2024',
# 'alpha_earth/alpha_earth_A31_2017',
# 'alpha_earth/alpha_earth_A31_2018',
# 'alpha_earth/alpha_earth_A31_2019',
# 'alpha_earth/alpha_earth_A31_2020',
# 'alpha_earth/alpha_earth_A31_2021',
# 'alpha_earth/alpha_earth_A31_2022',
# 'alpha_earth/alpha_earth_A31_2023',
# 'alpha_earth/alpha_earth_A31_2024',
# 'alpha_earth/alpha_earth_A32_2017',
# 'alpha_earth/alpha_earth_A32_2018',
# 'alpha_earth/alpha_earth_A32_2019',
# 'alpha_earth/alpha_earth_A32_2020',
# 'alpha_earth/alpha_earth_A32_2021',
# 'alpha_earth/alpha_earth_A32_2022',
# 'alpha_earth/alpha_earth_A32_2023',
# 'alpha_earth/alpha_earth_A32_2024',
# 'alpha_earth/alpha_earth_A33_2017',
# 'alpha_earth/alpha_earth_A33_2018',
# 'alpha_earth/alpha_earth_A33_2019',
# 'alpha_earth/alpha_earth_A33_2020',
# 'alpha_earth/alpha_earth_A33_2021',
# 'alpha_earth/alpha_earth_A33_2022',
# 'alpha_earth/alpha_earth_A33_2023',
# 'alpha_earth/alpha_earth_A33_2024',
# 'alpha_earth/alpha_earth_A34_2017',
# 'alpha_earth/alpha_earth_A34_2018',
# 'alpha_earth/alpha_earth_A34_2019',
# 'alpha_earth/alpha_earth_A34_2020',
# 'alpha_earth/alpha_earth_A34_2021',
# 'alpha_earth/alpha_earth_A34_2022',
# 'alpha_earth/alpha_earth_A34_2023',
# 'alpha_earth/alpha_earth_A34_2024',
# 'alpha_earth/alpha_earth_A35_2017',
# 'alpha_earth/alpha_earth_A35_2018',
# 'alpha_earth/alpha_earth_A35_2019',
# 'alpha_earth/alpha_earth_A35_2020',
# 'alpha_earth/alpha_earth_A35_2021',
# 'alpha_earth/alpha_earth_A35_2022',
# 'alpha_earth/alpha_earth_A35_2023',
# 'alpha_earth/alpha_earth_A35_2024',
# 'alpha_earth/alpha_earth_A36_2017',
# 'alpha_earth/alpha_earth_A36_2018',
# 'alpha_earth/alpha_earth_A36_2019',
# 'alpha_earth/alpha_earth_A36_2020',
# 'alpha_earth/alpha_earth_A36_2021',
# 'alpha_earth/alpha_earth_A36_2022',
# 'alpha_earth/alpha_earth_A36_2023',
# 'alpha_earth/alpha_earth_A36_2024',
# 'alpha_earth/alpha_earth_A37_2017',
# 'alpha_earth/alpha_earth_A37_2018',
# 'alpha_earth/alpha_earth_A37_2019',
# 'alpha_earth/alpha_earth_A37_2020',
# 'alpha_earth/alpha_earth_A37_2021',
# 'alpha_earth/alpha_earth_A37_2022',
# 'alpha_earth/alpha_earth_A37_2023',
# 'alpha_earth/alpha_earth_A37_2024',
# 'alpha_earth/alpha_earth_A38_2017',
# 'alpha_earth/alpha_earth_A38_2018',
# 'alpha_earth/alpha_earth_A38_2019',
# 'alpha_earth/alpha_earth_A38_2020',
# 'alpha_earth/alpha_earth_A38_2021',
# 'alpha_earth/alpha_earth_A38_2022',
# 'alpha_earth/alpha_earth_A38_2023',
# 'alpha_earth/alpha_earth_A38_2024',
# 'alpha_earth/alpha_earth_A39_2017',
# 'alpha_earth/alpha_earth_A39_2018',
# 'alpha_earth/alpha_earth_A39_2019',
# 'alpha_earth/alpha_earth_A39_2020',
# 'alpha_earth/alpha_earth_A39_2021',
# 'alpha_earth/alpha_earth_A39_2022',
# 'alpha_earth/alpha_earth_A39_2023',
# 'alpha_earth/alpha_earth_A39_2024',
# 'alpha_earth/alpha_earth_A40_2017',
# 'alpha_earth/alpha_earth_A40_2018',
# 'alpha_earth/alpha_earth_A40_2019',
# 'alpha_earth/alpha_earth_A40_2020',
# 'alpha_earth/alpha_earth_A40_2021',
# 'alpha_earth/alpha_earth_A40_2022',
# 'alpha_earth/alpha_earth_A40_2023',
# 'alpha_earth/alpha_earth_A40_2024',
# 'alpha_earth/alpha_earth_A41_2017',
# 'alpha_earth/alpha_earth_A41_2018',
# 'alpha_earth/alpha_earth_A41_2019',
# 'alpha_earth/alpha_earth_A41_2020',
# 'alpha_earth/alpha_earth_A41_2021',
# 'alpha_earth/alpha_earth_A41_2022',
# 'alpha_earth/alpha_earth_A41_2023',
# 'alpha_earth/alpha_earth_A41_2024',
# 'alpha_earth/alpha_earth_A42_2017',
# 'alpha_earth/alpha_earth_A42_2018',
# 'alpha_earth/alpha_earth_A42_2019',
# 'alpha_earth/alpha_earth_A42_2020',
# 'alpha_earth/alpha_earth_A42_2021',
# 'alpha_earth/alpha_earth_A42_2022',
# 'alpha_earth/alpha_earth_A42_2023',
# 'alpha_earth/alpha_earth_A42_2024',
# 'alpha_earth/alpha_earth_A43_2017',
# 'alpha_earth/alpha_earth_A43_2018',
# 'alpha_earth/alpha_earth_A43_2019',
# 'alpha_earth/alpha_earth_A43_2020',
# 'alpha_earth/alpha_earth_A43_2021',
# 'alpha_earth/alpha_earth_A43_2022',
# 'alpha_earth/alpha_earth_A43_2023',
# 'alpha_earth/alpha_earth_A43_2024',
# 'alpha_earth/alpha_earth_A44_2017',
# 'alpha_earth/alpha_earth_A44_2018',
# 'alpha_earth/alpha_earth_A44_2019',
# 'alpha_earth/alpha_earth_A44_2020',
# 'alpha_earth/alpha_earth_A44_2021',
# 'alpha_earth/alpha_earth_A44_2022',
# 'alpha_earth/alpha_earth_A44_2023',
# 'alpha_earth/alpha_earth_A44_2024',
# 'alpha_earth/alpha_earth_A45_2017',
# 'alpha_earth/alpha_earth_A45_2018',
# 'alpha_earth/alpha_earth_A45_2019',
# 'alpha_earth/alpha_earth_A45_2020',
# 'alpha_earth/alpha_earth_A45_2021',
# 'alpha_earth/alpha_earth_A45_2022',
# 'alpha_earth/alpha_earth_A45_2023',
# 'alpha_earth/alpha_earth_A45_2024',
# 'alpha_earth/alpha_earth_A46_2017',
# 'alpha_earth/alpha_earth_A46_2018',
# 'alpha_earth/alpha_earth_A46_2019',
# 'alpha_earth/alpha_earth_A46_2020',
# 'alpha_earth/alpha_earth_A46_2021',
# 'alpha_earth/alpha_earth_A46_2022',
# 'alpha_earth/alpha_earth_A46_2023',
# 'alpha_earth/alpha_earth_A46_2024',
# 'alpha_earth/alpha_earth_A47_2017',
# 'alpha_earth/alpha_earth_A47_2018',
# 'alpha_earth/alpha_earth_A47_2019',
# 'alpha_earth/alpha_earth_A47_2020',
# 'alpha_earth/alpha_earth_A47_2021',
# 'alpha_earth/alpha_earth_A47_2022',
# 'alpha_earth/alpha_earth_A47_2023',
# 'alpha_earth/alpha_earth_A47_2024',
# 'alpha_earth/alpha_earth_A48_2017',
# 'alpha_earth/alpha_earth_A48_2018',
# 'alpha_earth/alpha_earth_A48_2019',
# 'alpha_earth/alpha_earth_A48_2020',
# 'alpha_earth/alpha_earth_A48_2021',
# 'alpha_earth/alpha_earth_A48_2022',
# 'alpha_earth/alpha_earth_A48_2023',
# 'alpha_earth/alpha_earth_A48_2024',
# 'alpha_earth/alpha_earth_A49_2017',
# 'alpha_earth/alpha_earth_A49_2018',
# 'alpha_earth/alpha_earth_A49_2019',
# 'alpha_earth/alpha_earth_A49_2020',
# 'alpha_earth/alpha_earth_A49_2021',
# 'alpha_earth/alpha_earth_A49_2022',
# 'alpha_earth/alpha_earth_A49_2023',
# 'alpha_earth/alpha_earth_A49_2024',
# 'alpha_earth/alpha_earth_A50_2017',
# 'alpha_earth/alpha_earth_A50_2018',
# 'alpha_earth/alpha_earth_A50_2019',
# 'alpha_earth/alpha_earth_A50_2020',
# 'alpha_earth/alpha_earth_A50_2021',
# 'alpha_earth/alpha_earth_A50_2022',
# 'alpha_earth/alpha_earth_A50_2023',
# 'alpha_earth/alpha_earth_A50_2024',
# 'alpha_earth/alpha_earth_A51_2017',
# 'alpha_earth/alpha_earth_A51_2018',
# 'alpha_earth/alpha_earth_A51_2019',
# 'alpha_earth/alpha_earth_A51_2020',
# 'alpha_earth/alpha_earth_A51_2021',
# 'alpha_earth/alpha_earth_A51_2022',
# 'alpha_earth/alpha_earth_A51_2023',
# 'alpha_earth/alpha_earth_A51_2024',
# 'alpha_earth/alpha_earth_A52_2017',
# 'alpha_earth/alpha_earth_A52_2018',
# 'alpha_earth/alpha_earth_A52_2019',
# 'alpha_earth/alpha_earth_A52_2020',
# 'alpha_earth/alpha_earth_A52_2021',
# 'alpha_earth/alpha_earth_A52_2022',
# 'alpha_earth/alpha_earth_A52_2023',
# 'alpha_earth/alpha_earth_A52_2024',
# 'alpha_earth/alpha_earth_A53_2017',
# 'alpha_earth/alpha_earth_A53_2018',
# 'alpha_earth/alpha_earth_A53_2019',
# 'alpha_earth/alpha_earth_A53_2020',
# 'alpha_earth/alpha_earth_A53_2021',
# 'alpha_earth/alpha_earth_A53_2022',
# 'alpha_earth/alpha_earth_A53_2023',
# 'alpha_earth/alpha_earth_A53_2024',
# 'alpha_earth/alpha_earth_A54_2017',
# 'alpha_earth/alpha_earth_A54_2018',
# 'alpha_earth/alpha_earth_A54_2019',
# 'alpha_earth/alpha_earth_A54_2020',
# 'alpha_earth/alpha_earth_A54_2021',
# 'alpha_earth/alpha_earth_A54_2022',
# 'alpha_earth/alpha_earth_A54_2023',
# 'alpha_earth/alpha_earth_A54_2024',
# 'alpha_earth/alpha_earth_A55_2017',
# 'alpha_earth/alpha_earth_A55_2018',
# 'alpha_earth/alpha_earth_A55_2019',
# 'alpha_earth/alpha_earth_A55_2020',
# 'alpha_earth/alpha_earth_A55_2021',
# 'alpha_earth/alpha_earth_A55_2022',
# 'alpha_earth/alpha_earth_A55_2023',
# 'alpha_earth/alpha_earth_A55_2024',
# 'alpha_earth/alpha_earth_A56_2017',
# 'alpha_earth/alpha_earth_A56_2018',
# 'alpha_earth/alpha_earth_A56_2019',
# 'alpha_earth/alpha_earth_A56_2020',
# 'alpha_earth/alpha_earth_A56_2021',
# 'alpha_earth/alpha_earth_A56_2022',
# 'alpha_earth/alpha_earth_A56_2023',
# 'alpha_earth/alpha_earth_A56_2024',
# 'alpha_earth/alpha_earth_A57_2017',
# 'alpha_earth/alpha_earth_A57_2018',
# 'alpha_earth/alpha_earth_A57_2019',
# 'alpha_earth/alpha_earth_A57_2020',
# 'alpha_earth/alpha_earth_A57_2021',
# 'alpha_earth/alpha_earth_A57_2022',
# 'alpha_earth/alpha_earth_A57_2023',
# 'alpha_earth/alpha_earth_A57_2024',
# 'alpha_earth/alpha_earth_A58_2017',
# 'alpha_earth/alpha_earth_A58_2018',
# 'alpha_earth/alpha_earth_A58_2019',
# 'alpha_earth/alpha_earth_A58_2020',
# 'alpha_earth/alpha_earth_A58_2021',
# 'alpha_earth/alpha_earth_A58_2022',
# 'alpha_earth/alpha_earth_A58_2023',
# 'alpha_earth/alpha_earth_A58_2024',
# 'alpha_earth/alpha_earth_A59_2017',
# 'alpha_earth/alpha_earth_A59_2018',
# 'alpha_earth/alpha_earth_A59_2019',
# 'alpha_earth/alpha_earth_A59_2020',
# 'alpha_earth/alpha_earth_A59_2021',
# 'alpha_earth/alpha_earth_A59_2022',
# 'alpha_earth/alpha_earth_A59_2023',
# 'alpha_earth/alpha_earth_A59_2024',
# 'alpha_earth/alpha_earth_A60_2017',
# 'alpha_earth/alpha_earth_A60_2018',
# 'alpha_earth/alpha_earth_A60_2019',
# 'alpha_earth/alpha_earth_A60_2020',
# 'alpha_earth/alpha_earth_A60_2021',
# 'alpha_earth/alpha_earth_A60_2022',
# 'alpha_earth/alpha_earth_A60_2023',
# 'alpha_earth/alpha_earth_A60_2024',
# 'alpha_earth/alpha_earth_A61_2017',
# 'alpha_earth/alpha_earth_A61_2018',
# 'alpha_earth/alpha_earth_A61_2019',
# 'alpha_earth/alpha_earth_A61_2020',
# 'alpha_earth/alpha_earth_A61_2021',
# 'alpha_earth/alpha_earth_A61_2022',
# 'alpha_earth/alpha_earth_A61_2023',
# 'alpha_earth/alpha_earth_A61_2024',
# 'alpha_earth/alpha_earth_A62_2017',
# 'alpha_earth/alpha_earth_A62_2018',
# 'alpha_earth/alpha_earth_A62_2019',
# 'alpha_earth/alpha_earth_A62_2020',
# 'alpha_earth/alpha_earth_A62_2021',
# 'alpha_earth/alpha_earth_A62_2022',
# 'alpha_earth/alpha_earth_A62_2023',
# 'alpha_earth/alpha_earth_A62_2024',
# 'alpha_earth/alpha_earth_A63_2017',
# 'alpha_earth/alpha_earth_A63_2018',
# 'alpha_earth/alpha_earth_A63_2019',
# 'alpha_earth/alpha_earth_A63_2020',
# 'alpha_earth/alpha_earth_A63_2021',
# 'alpha_earth/alpha_earth_A63_2022',
# 'alpha_earth/alpha_earth_A63_2023',
# 'alpha_earth/alpha_earth_A63_2024',
# ]

## Sample feature pixels

In [None]:
# Sample every raster in feature_list at the target point locations and add the
# values to the 'add features' dataset. With cache enabled the dataset is saved
# after each feature, so an interrupted run resumes where it stopped.
cache = True

# Set the number of threads for raster pixel sampling.
# Lower if the runtime is unstable.
# os.cpu_count() may return None and a single-CPU runtime would give 0,
# so always keep at least one thread.
n_threads = max(1, (os.cpu_count() or 2) - 1)

# Define paths
dataset_targets_final_path = join(datasets_tar_dir, dataset_targets_final_name)
dataset_targets_final = pd.read_pickle(dataset_targets_final_path)
dataset_add_features_path = join(datasets_add_fea_dir, dataset_add_features_pkl_name)

# Handle caching
if cache: print("Cache enabled.")
resume_existing = False
if exists(dataset_add_features_path):
    print(f"An 'add features' dataset already exists: {dataset_add_features_path}")
    if cache:
        print("Continuing to add features to existing dataset. Delete it to start again.")
        resume_existing = True
    else:
        print("Cache disabled. The 'add features' dataset will be overwritten in 10 seconds (interrupt if unintended).")
        sleep(10)
# Start from the final targets unless resuming a cached dataset
if not resume_existing:
    dataset_targets_final.to_pickle(dataset_add_features_path)
# Every branch must bind the same name: the rest of the cell only uses
# dataset_add_features (previously two branches bound dataset_add_fea instead).
dataset_add_features = pd.read_pickle(dataset_add_features_path)

# Initialise geometries (point coordinates passed to the sampler)
geom_x = np.array([g.x for g in dataset_add_features['geometry']])
geom_y = np.array([g.y for g in dataset_add_features['geometry']])

# Feature progress
feature_progress_index = 0
feature_progress_label = widgets.Label(f"feature progress: {feature_progress_index}/{len(feature_list)}")
display(feature_progress_label)

for feature in feature_list:
    feature_name = feature.split('/')[-1]
    # Skip features already present (e.g. when resuming from cache).
    # NOTE(review): relies on sample_raster_values adding a 'fea_<name>' column
    # to the dataframe in place - confirm against its definition.
    if f"fea_{feature_name}" not in dataset_add_features.columns:
        feature_path = join(features_dir, f"{feature}.tif")
        # Retry on read failures (e.g. gdrive connection issues).
        # There is no retry limit: a permanently unreadable raster loops until interrupted.
        while True:
            try:
                sample_raster_values(dataset_add_features, feature_path, geom_x, geom_y,
                                     feature=True, n_threads=n_threads)
                if cache: # Atomic save to prevent corruption
                    temp_path = dataset_add_features_path + '.tmp'
                    dataset_add_features.to_pickle(temp_path)
                    os.replace(temp_path, dataset_add_features_path)
                break
            except (RuntimeError, MemoryError) as e:
                print(f"  {feature_name} failed: {e}, retrying in 5 seconds")
                gc.collect()
                sleep(5)
    feature_progress_index += 1
    feature_progress_label.value = f"feature progress: {feature_progress_index}/{len(feature_list)}"
    # Periodic defragmentation and memory cleanup
    if feature_progress_index % 50 == 0:
        dataset_add_features = dataset_add_features.copy()
        gc.collect()

# Defragment and export (written to a temp file first, then swapped in)
dataset_add_features = dataset_add_features.copy()
temp_path = dataset_add_features_path + '.tmp'
dataset_add_features.to_pickle(temp_path)
os.replace(temp_path, dataset_add_features_path)

print(f"All features have been added to {dataset_add_features_path}.")

In [None]:
# Once the output has been verified, release both dataframes from memory
del dataset_targets_final
del dataset_add_features

# Drop columns (optional)

In [None]:
# Targets and features can be removed from the dataset in case of any issues.
# Select the 'add features' dataset.
# Each line is printed as a ready-to-paste assignment for the next cell,
# which reads the variable 'dataset_add_features_name'.
for pkl in os.listdir(datasets_add_fea_dir):
  print(f"dataset_add_features_name = '{pkl}'")

In [None]:
# File name of the 'add features' dataset whose columns may be dropped
dataset_add_features_name = 'gedi_elevation.pkl'

# Load it from the 'add features' directory
dataset_add_features_path = join(datasets_add_fea_dir, dataset_add_features_name)
dataset_add_features = pd.read_pickle(dataset_add_features_path)

# Show the current column names in sorted order
sorted(dataset_add_features.columns.tolist())

In [None]:
# Remove every column whose name contains a given substring and overwrite
# the 'add features' dataset with the result.
drop_dataset_columns = True

if drop_dataset_columns:
  # Anything containing the dropped_columns string will be removed
  dropped_columns = 'fea_topo_dsm_'
  # An empty string is contained in every name and would wipe the whole dataset
  if not dropped_columns:
    raise ValueError("dropped_columns must be a non-empty string.")
  # Drop columns (regex=False: match the literal substring, not a regular expression)
  keep_mask = ~dataset_add_features.columns.str.contains(dropped_columns, regex=False)
  dataset_drop_columns = dataset_add_features.loc[:, keep_mask]
  # Atomic save: write to a temp file, then swap it in, so an interrupted
  # write cannot corrupt the existing dataset
  temp_path = dataset_add_features_path + '.tmp'
  dataset_drop_columns.to_pickle(temp_path)
  os.replace(temp_path, dataset_add_features_path)
  # Inspect columns again
  dataset_add_features = pd.read_pickle(dataset_add_features_path)
sorted(dataset_add_features.columns)

# Finalise dataset

In [None]:
# Print one ready-to-paste 'dataset_final_name' assignment per file
# in the 'add features' directory; pick the dataset to finalise from these.
for pkl_name in os.listdir(datasets_add_fea_dir):
    print(f'dataset_final_name = "{pkl_name}"')

In [None]:
dataset_final_name = "agbd_alpha_earth.pkl"

dataset_to_finalise_path = join(datasets_add_fea_dir, dataset_final_name)
dataset_to_finalise = pd.read_pickle(dataset_to_finalise_path)

# Precision reduction for histogram-based XGBoost
# Fewer unique values = smaller max_bin = faster histogram construction
# XGBoost bins continuous features, excess precision wastes computation

# Print a 'precision_change_dict' skeleton (one entry per column, all None)
# to paste into the next cell and edit. Each entry is annotated with the
# column dtype and, for floats, its inferred precision, range and unique count.
print("precision_change_dict = {")
for col in sorted(dataset_to_finalise.columns):
    series = dataset_to_finalise[col]
    if 'geometry' in str(series.dtype).lower():
        print(f'    "{col}": None,  # geometry')
        continue
    col_unique = series.nunique()
    if series.dtype not in ['float32', 'float64']:
        print(f'    "{col}": None,  # {series.dtype}, unique={col_unique:,}')
        continue
    col_min = series.min()
    col_max = series.max()
    # Infer current decimal places from the string form of the first 10,000
    # non-null values.
    # NOTE(review): values rendered in scientific notation (e.g. '1e-05')
    # are not counted correctly by this split on '.'.
    sample = series.dropna().head(10000).astype(str)
    decimals = sample.apply(lambda x: len(x.split('.')[-1]) if '.' in x else 0)
    # An all-NaN column leaves no sample; max() would be NaN and int() would raise
    current_precision = 0 if decimals.empty else int(decimals.max())
    print(f'    "{col}": None,  # float, precision={current_precision}, min={col_min:.4g}, max={col_max:.4g}, unique={col_unique:,}')
print("}")

In [None]:
# Decimal places to round each column to when the dataset is finalised.
# A value of None leaves the column unchanged; the cells that consume this
# dict also skip any column that is not float32/float64.
# The trailing comments record each column's dtype, inferred precision,
# min/max and unique count at the time the skeleton was generated.
precision_change_dict = {
    "agbd": 0,  # float, precision=7, min=0.05733, max=889.7, unique=612,822
    "agbd_se": 2,  # float, precision=7, min=2.982, max=17.81, unique=34,814
    "beam": None,  # object, unique=8
    "fea_coast_proximity_km": None,  # float, precision=1, min=18.7, max=148.9, unique=1,303
    "fea_disturbance_edge_distance_1990": None,  # int16, unique=23
    "fea_disturbance_edge_distance_1991": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1992": None,  # int16, unique=20
    "fea_disturbance_edge_distance_1993": None,  # int16, unique=23
    "fea_disturbance_edge_distance_1994": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1995": None,  # int16, unique=18
    "fea_disturbance_edge_distance_1996": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1997": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1998": None,  # int16, unique=24
    "fea_disturbance_edge_distance_1999": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2000": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2001": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2002": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2003": None,  # int16, unique=23
    "fea_disturbance_edge_distance_2004": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2005": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2006": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2007": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2008": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2009": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2010": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2011": None,  # int16, unique=23
    "fea_disturbance_edge_distance_2012": None,  # int16, unique=23
    "fea_disturbance_edge_distance_2013": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2014": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2015": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2016": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2017": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2018": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2019": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2020": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2021": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2022": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2023": None,  # int16, unique=24
    "fea_disturbance_edge_distance_2024": None,  # int16, unique=24
    "fea_disturbance_local_density_1990": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1991": None,  # float, precision=2, min=0, max=1, unique=100
    "fea_disturbance_local_density_1992": None,  # float, precision=2, min=0, max=1, unique=99
    "fea_disturbance_local_density_1993": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1994": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1995": None,  # float, precision=2, min=0, max=0.99, unique=99
    "fea_disturbance_local_density_1996": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1997": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1998": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_1999": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2000": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2001": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2002": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2003": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2004": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2005": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2006": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2007": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2008": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2009": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2010": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2011": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2012": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2013": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2014": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2015": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2016": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2017": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2018": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2019": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2020": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2021": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2022": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2023": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_disturbance_local_density_2024": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_edge_distance_1990": None,  # int16, unique=12
    "fea_forest_edge_distance_1991": None,  # int16, unique=12
    "fea_forest_edge_distance_1992": None,  # int16, unique=12
    "fea_forest_edge_distance_1993": None,  # int16, unique=12
    "fea_forest_edge_distance_1994": None,  # int16, unique=12
    "fea_forest_edge_distance_1995": None,  # int16, unique=12
    "fea_forest_edge_distance_1996": None,  # int16, unique=12
    "fea_forest_edge_distance_1997": None,  # int16, unique=12
    "fea_forest_edge_distance_1998": None,  # int16, unique=12
    "fea_forest_edge_distance_1999": None,  # int16, unique=12
    "fea_forest_edge_distance_2000": None,  # int16, unique=12
    "fea_forest_edge_distance_2001": None,  # int16, unique=12
    "fea_forest_edge_distance_2002": None,  # int16, unique=12
    "fea_forest_edge_distance_2003": None,  # int16, unique=12
    "fea_forest_edge_distance_2004": None,  # int16, unique=12
    "fea_forest_edge_distance_2005": None,  # int16, unique=12
    "fea_forest_edge_distance_2006": None,  # int16, unique=12
    "fea_forest_edge_distance_2007": None,  # int16, unique=12
    "fea_forest_edge_distance_2008": None,  # int16, unique=12
    "fea_forest_edge_distance_2009": None,  # int16, unique=12
    "fea_forest_edge_distance_2010": None,  # int16, unique=12
    "fea_forest_edge_distance_2011": None,  # int16, unique=12
    "fea_forest_edge_distance_2012": None,  # int16, unique=12
    "fea_forest_edge_distance_2013": None,  # int16, unique=12
    "fea_forest_edge_distance_2014": None,  # int16, unique=12
    "fea_forest_edge_distance_2015": None,  # int16, unique=12
    "fea_forest_edge_distance_2016": None,  # int16, unique=12
    "fea_forest_edge_distance_2017": None,  # int16, unique=12
    "fea_forest_edge_distance_2018": None,  # int16, unique=12
    "fea_forest_edge_distance_2019": None,  # int16, unique=12
    "fea_forest_edge_distance_2020": None,  # int16, unique=24
    "fea_forest_edge_distance_2021": None,  # int16, unique=24
    "fea_forest_edge_distance_2022": None,  # int16, unique=24
    "fea_forest_edge_distance_2023": None,  # int16, unique=24
    "fea_forest_edge_distance_2024": None,  # int16, unique=24
    "fea_forest_local_density_1990": None,  # float, precision=2, min=0.14, max=1, unique=84
    "fea_forest_local_density_1991": None,  # float, precision=2, min=0.14, max=1, unique=84
    "fea_forest_local_density_1992": None,  # float, precision=2, min=0.14, max=1, unique=84
    "fea_forest_local_density_1993": None,  # float, precision=2, min=0.14, max=1, unique=85
    "fea_forest_local_density_1994": None,  # float, precision=2, min=0.14, max=1, unique=84
    "fea_forest_local_density_1995": None,  # float, precision=2, min=0.14, max=1, unique=85
    "fea_forest_local_density_1996": None,  # float, precision=2, min=0.14, max=1, unique=85
    "fea_forest_local_density_1997": None,  # float, precision=2, min=0.14, max=1, unique=85
    "fea_forest_local_density_1998": None,  # float, precision=2, min=0.14, max=1, unique=85
    "fea_forest_local_density_1999": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2000": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2001": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2002": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2003": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2004": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2005": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2006": None,  # float, precision=2, min=0.14, max=1, unique=87
    "fea_forest_local_density_2007": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2008": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2009": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2010": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2011": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2012": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2013": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2014": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2015": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2016": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2017": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2018": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2019": None,  # float, precision=2, min=0.13, max=1, unique=88
    "fea_forest_local_density_2020": None,  # float, precision=2, min=0, max=1, unique=97
    "fea_forest_local_density_2021": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2022": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2023": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_forest_local_density_2024": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_latitude": None,  # float, precision=3, min=3.586, max=4.948, unique=1,363
    "fea_longitude": None,  # float, precision=3, min=102, max=103.2, unique=1,155
    "fea_lu_ais_edge_distance": None,  # int16, unique=24
    "fea_lu_ais_local_density": None,  # float, precision=1, min=0, max=1, unique=100
    "fea_lu_berkelah_jerantut_edge_distance": None,  # int16, unique=24
    "fea_lu_berkelah_kuantan_edge_distance": None,  # int16, unique=24
    "fea_lu_berkelah_temerloh_edge_distance": None,  # int16, unique=24
    "fea_lu_old-growth_protected_areas_edge_distance": None,  # int16, unique=24
    "fea_lu_remen_chereh_edge_distance": None,  # int16, unique=24
    "fea_lu_tekai_tembeling_edge_distance": None,  # int16, unique=24
    "fea_lu_tekam_edge_distance": None,  # int16, unique=24
    "fea_lu_yong_edge_distance": None,  # int16, unique=24
    "fea_lu_yong_lipis_edge_distance": None,  # int16, unique=24
    "fea_topo_dtm_smooth_aspect_cosine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dtm_smooth_aspect_sine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dtm_smooth_circular_variance_aspect_03": None,  # float, precision=2, min=0, max=0.87, unique=87
    "fea_topo_dtm_smooth_circular_variance_aspect_07": None,  # float, precision=2, min=0, max=0.96, unique=97
    "fea_topo_dtm_smooth_circular_variance_aspect_11": None,  # float, precision=2, min=0, max=0.98, unique=99
    "fea_topo_dtm_smooth_deviation_mean_elevation_03": None,  # float, precision=1, min=-0.9, max=0.9, unique=19
    "fea_topo_dtm_smooth_deviation_mean_elevation_07": None,  # float, precision=2, min=-1.49, max=1.82, unique=320
    "fea_topo_dtm_smooth_deviation_mean_elevation_11": None,  # float, precision=2, min=-1.99, max=2.82, unique=412
    "fea_topo_dtm_smooth_eastness": None,  # float, precision=2, min=-0.85, max=0.85, unique=168
    "fea_topo_dtm_smooth_elevation": None,  # int16, unique=1,950
    "fea_topo_dtm_smooth_northness": None,  # float, precision=2, min=-0.84, max=0.85, unique=166
    "fea_topo_dtm_smooth_profile_curvature": None,  # float, precision=4, min=-0.0187, max=0.0198, unique=337
    "fea_topo_dtm_smooth_roughness_03": None,  # int16, unique=148
    "fea_topo_dtm_smooth_roughness_07": None,  # int16, unique=307
    "fea_topo_dtm_smooth_roughness_11": None,  # int16, unique=418
    "fea_topo_dtm_smooth_slope": None,  # float, precision=1, min=0.4, max=64, unique=574
    "fea_topo_dtm_smooth_stream_power_index_log10": None,  # float, precision=1, min=-10.4, max=0.6, unique=84
    "fea_topo_dtm_smooth_surface_area_ratio": None,  # float, precision=2, min=1, max=2.56, unique=119
    "fea_topo_dtm_smooth_tangential_curvature": None,  # float, precision=4, min=-0.0214, max=0.0204, unique=341
    "fea_topo_dtm_smooth_topographic_position_index_03": None,  # float, precision=1, min=-14.5, max=15.7, unique=234
    "fea_topo_dtm_smooth_topographic_position_index_07": None,  # int16, unique=98
    "fea_topo_dtm_smooth_topographic_position_index_11": None,  # int16, unique=154
    "fea_topo_dtm_smooth_topographic_ruggedness_index": None,  # float, precision=1, min=0, max=62.8, unique=446
    "fea_topo_dtm_smooth_topographic_wetness_index": None,  # float, precision=1, min=-9.1, max=15.4, unique=189
    "fea_topo_dtm_unsmooth_aspect_cosine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dtm_unsmooth_aspect_sine": None,  # float, precision=2, min=-1, max=1, unique=201
    "fea_topo_dtm_unsmooth_circular_variance_aspect_03": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_topo_dtm_unsmooth_circular_variance_aspect_07": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_topo_dtm_unsmooth_circular_variance_aspect_11": None,  # float, precision=2, min=0, max=1, unique=101
    "fea_topo_dtm_unsmooth_deviation_mean_elevation_03": None,  # float, precision=2, min=-2.83, max=2.67, unique=503
    "fea_topo_dtm_unsmooth_deviation_mean_elevation_07": None,  # float, precision=2, min=-4.33, max=3.98, unique=539
    "fea_topo_dtm_unsmooth_deviation_mean_elevation_11": None,  # float, precision=2, min=-4.4, max=5.02, unique=568
    "fea_topo_dtm_unsmooth_eastness": None,  # float, precision=2, min=-0.87, max=0.89, unique=174
    "fea_topo_dtm_unsmooth_elevation": None,  # int16, unique=1,942
    "fea_topo_dtm_unsmooth_northness": None,  # float, precision=2, min=-0.89, max=0.9, unique=178
    "fea_topo_dtm_unsmooth_profile_curvature": None,  # float, precision=4, min=-0.0372, max=0.0404, unique=627
    "fea_topo_dtm_unsmooth_roughness_03": None,  # int16, unique=158
    "fea_topo_dtm_unsmooth_roughness_07": None,  # int16, unique=314
    "fea_topo_dtm_unsmooth_roughness_11": None,  # int16, unique=419
    "fea_topo_dtm_unsmooth_slope": None,  # float, precision=1, min=0, max=69.7, unique=612
    "fea_topo_dtm_unsmooth_stream_power_index_log10": None,  # float, precision=1, min=-20.2, max=2.6, unique=148
    "fea_topo_dtm_unsmooth_surface_area_ratio": None,  # float, precision=2, min=1, max=2.96, unique=129
    "fea_topo_dtm_unsmooth_tangential_curvature": None,  # float, precision=4, min=-0.0535, max=0.0533, unique=706
    "fea_topo_dtm_unsmooth_topographic_position_index_03": None,  # int16, unique=63
    "fea_topo_dtm_unsmooth_topographic_position_index_07": None,  # int16, unique=127
    "fea_topo_dtm_unsmooth_topographic_position_index_11": None,  # int16, unique=181
    "fea_topo_dtm_unsmooth_topographic_ruggedness_index": None,  # int16, unique=64
    "fea_topo_dtm_unsmooth_topographic_wetness_index": None,  # float, precision=1, min=-9.7, max=34.8, unique=267
    "geometry": None,  # geometry
    "sensitivity": 4,  # float, precision=8, min=0.95, max=0.9981, unique=395,056
    "shot_number": None,  # object, unique=641,477
    "year": None,  # int64, unique=6
}

In [None]:
# Preview precision changes before applying
# Only columns with integer values (not None) will be rounded
# Review unique value reductions

dataset_to_finalise_path = join(datasets_add_fea_dir, dataset_final_name)
dataset_to_finalise = pd.read_pickle(dataset_to_finalise_path)
# Work on a copy so the loaded dataset is left untouched by the preview
dataset_preview = dataset_to_finalise.copy()

print(f"{'Column':<60} {'Old unique':>12} {'New unique':>12} {'Reduction':>10}")
print("-" * 96)

changed_columns = []
for col, precision in precision_change_dict.items():
    if precision is None:
        continue
    # The dict may list columns that this dataset does not contain
    if col not in dataset_preview.columns:
        print(f"{col:<60} {'SKIPPED (not in dataset)':<36}")
        continue
    if dataset_preview[col].dtype not in ['float32', 'float64']:
        print(f"{col:<60} {'SKIPPED (not float)':<36}")
        continue

    old_unique = dataset_preview[col].nunique()
    dataset_preview[col] = dataset_preview[col].round(precision)
    new_unique = dataset_preview[col].nunique()

    # Guard against division by zero for columns with no non-null values
    reduction_pct = 100 * (1 - new_unique / old_unique) if old_unique > 0 else 0
    print(f"{col:<60} {old_unique:>12,} {new_unique:>12,} {reduction_pct:>9.1f}%")
    changed_columns.append(col)

print("-" * 96)
print(f"{len(changed_columns)} columns will be modified")

# max_bin estimate from preview
feature_cols = [c for c in dataset_preview.columns if c.startswith('fea_')]
max_unique = dataset_preview[feature_cols].nunique().max()
print(f"Max unique values across features: {max_unique:,} (use for max_bin)")

# Preview is not saved; run the next block to apply the changes.

In [None]:
# Set True when targets carry a collection year, so that yearly features
# need to be temporally aligned to it
yearly_targets = True
# Name of the column holding each observation's year (only used if yearly_targets = True)
year_column = 'year'

# Patterns identifying yearly features that need temporal alignment
# (matched against column names ending in a 4-digit year)
# Multi-year patterns: several years are kept for temporal context (e.g., TMF land-cover)
yearly_patterns_multi = [
    'forest_edge_distance',
    'forest_local_density',
    'disturbance_edge_distance',
    'disturbance_local_density',
]
# Single-year patterns: only t-1 is kept
# Alpha Earth embeddings already encode landscape state; multi-year would cause feature bloat
yearly_patterns_single = [
    'alpha_earth',
]

# Read the chosen 'add features' dataset
dataset_add_features_path = join(datasets_add_fea_dir, dataset_final_name)
dataset_add_fea = pd.read_pickle(dataset_add_features_path)
print(f"Loaded dataset: {dataset_final_name} ({len(dataset_add_fea):,} rows, {len(dataset_add_fea.columns)} columns)")

# Round the requested columns: only entries with a precision set,
# present in the dataset and stored as float32/float64 are touched
float_dtypes = ['float32', 'float64']
columns_to_round = {
    name: places
    for name, places in precision_change_dict.items()
    if places is not None
    and name in dataset_add_fea.columns
    and dataset_add_fea[name].dtype in float_dtypes
}
for name, places in columns_to_round.items():
    dataset_add_fea[name] = dataset_add_fea[name].round(places)
precision_changes = len(columns_to_round)
if precision_changes > 0:
    print(f"Applied precision reduction to {precision_changes} columns")

# Temporal feature alignment.
# When yearly_targets is True, yearly feature columns are shifted per target year so that
# a given column name always refers to the same offset from the target year; columns that
# fall out of range, multi-year features of the latest year and single-year features other
# than t-1 are then dropped. Otherwise the loaded dataset is copied unchanged.
if yearly_targets:
    print(f"\nYearly targets enabled. Year column: '{year_column}'")

    # Combine yearly pattern lists for renaming
    yearly_patterns = yearly_patterns_multi + yearly_patterns_single

    # Reverse-sorted column names, used for the pattern checks and renaming below
    dataset_add_fea_column_list = sorted(dataset_add_fea.columns, reverse=True)

    # Report which yearly patterns are present (pattern in name AND 4-digit year suffix)
    for pattern in yearly_patterns_multi:
        found = any(pattern in col and col[-4:].isdigit() for col in dataset_add_fea_column_list)
        if found: print(f"  Multi-year pattern found: {pattern}")
        else: print(f"  Multi-year pattern not found: {pattern}")

    for pattern in yearly_patterns_single:
        found = any(pattern in col and col[-4:].isdigit() for col in dataset_add_fea_column_list)
        if found: print(f"  Single-year pattern found: {pattern}")
        else: print(f"  Single-year pattern not found: {pattern}")

    # Yearly feature columns do not depend on the target year, so find them once
    yearly_feature_columns = [
        col for col in dataset_add_fea_column_list
        if col[-4:].isdigit() and any(pattern in col for pattern in yearly_patterns)
    ]

    # Get list of target years
    target_year_list = [int(x) for x in dataset_add_fea[year_column].unique().tolist()]
    target_year_max = max(target_year_list)
    print(f"\nTarget years: {sorted(target_year_list)} (max: {target_year_max})")

    # Create an index identifier so the original row order can be restored after concatenation
    dataset_add_fea['index_record'] = dataset_add_fea.index

    # Create an empty list for storing yearly sub-datasets
    dataset_year_list = []

    # Iterate through each target year and shift the sample to appropriate feature year
    # Column names are kept the same to avoid issues when ordering features in model training/prediction
    # E.g. a '2021' feature for a 2022 target will actually be a 2019 feature for a 2020 target
    for target_year in target_year_list:
        dataset_year = dataset_add_fea[dataset_add_fea[year_column] == target_year].copy()
        sample_year_modifier = target_year_max - target_year
        # Rename all yearly columns in one call: the mapping is applied simultaneously,
        # so a shifted name can never be picked up and shifted a second time.
        rename_map = {
            col: f"{col[:-4]}{int(col[-4:]) + sample_year_modifier}"
            for col in yearly_feature_columns
        }
        dataset_year.rename(columns=rename_map, inplace=True)
        dataset_year_list.append(dataset_year)

    # Concatenate dataframes and restore the original row order
    dataset_final = pd.concat(dataset_year_list, ignore_index=True)
    dataset_final.sort_values('index_record', inplace=True)
    dataset_final.reset_index(drop=True, inplace=True)

    # Drop feature years out of the data range for one of the target years
    # NOTE(review): dropna(axis=1, how='any') removes every column containing an NA,
    # including non-yearly columns - confirm none of those are expected to hold NA.
    cols_before_na = len(dataset_final.columns)
    na_cols = dataset_final.columns[dataset_final.isna().any()].tolist()
    dataset_final.dropna(axis=1, how='any', inplace=True)
    cols_dropped_na = cols_before_na - len(dataset_final.columns)

    if cols_dropped_na > 0:
        # Years parsed from the 4-digit suffix of dropped / retained column names
        dropped_years = {int(col[-4:]) for col in na_cols if col[-4:].isdigit()}
        retained_years = {int(col[-4:]) for col in dataset_final.columns if col[-4:].isdigit()}
        print(f"\nDropped {cols_dropped_na} columns with NA values (out of range years)")
        if dropped_years:
            print(f"  Years dropped: {sorted(dropped_years)}")
        if retained_years:
            # \u2013 is an en dash; escaped so the source stays ASCII and cannot be mis-decoded
            print(f"  Years retained: {min(retained_years)}\u2013{max(retained_years)}")

    # Drop multi-year features from the most recent target year (timing cannot be certain)
    # any() lists each column once even if it matches more than one pattern
    cols_to_drop = [col for col in dataset_final.columns
                    if col.endswith(str(target_year_max))
                    and any(pattern in col for pattern in yearly_patterns_multi)]
    if cols_to_drop:
        dataset_final.drop(columns=cols_to_drop, inplace=True)
        print(f"\nDropped {len(cols_to_drop)} multi-year feature columns from year {target_year_max}")
        print(f"  Patterns: {yearly_patterns_multi}")

    # Drop single-year features except t-1
    previous_year = target_year_max - 1
    single_cols_to_drop = [col for col in dataset_final.columns
                           if not col.endswith(str(previous_year))
                           and any(pattern in col for pattern in yearly_patterns_single)]
    if single_cols_to_drop:
        # Years being dropped, parsed from the column name suffix
        single_dropped_years = {int(col[-4:]) for col in single_cols_to_drop if col[-4:].isdigit()}
        dataset_final.drop(columns=single_cols_to_drop, inplace=True)
        print(f"\nDropped {len(single_cols_to_drop)} single-year feature columns (keeping only year {previous_year})")
        print(f"  Patterns: {yearly_patterns_single}")
        print(f"  Years dropped: {sorted(single_dropped_years)}")

    # Drop the index identifier
    dataset_final.drop(columns=['index_record'], inplace=True)

else:
    print("\nYearly targets disabled. No temporal feature alignment applied.")
    dataset_final = dataset_add_fea.copy()

# Prefix every non-feature column with 'tar_'; 'fea_' columns keep their name
dataset_final.columns = [
    col if col.startswith('fea_') else 'tar_' + col
    for col in dataset_final.columns
]

# Partition the columns by prefix (after the step above each one is 'fea_' or 'tar_')
target_columns, feature_columns = [], []
for col in dataset_final.columns:
    if col.startswith('fea_'):
        feature_columns.append(col)
    else:
        target_columns.append(col)

# Targets first, then features, each group in alphabetical order
sorted_columns = [*sorted(target_columns), *sorted(feature_columns)]
dataset_final = dataset_final.reindex(columns=sorted_columns)

# Write the final dataset to the final datasets directory
dataset_final_path = join(datasets_final_dir, dataset_final_name)
dataset_final.to_pickle(dataset_final_path)

print(f"\nFinal dataset: {len(dataset_final):,} rows, {len(target_columns)} target columns, {len(feature_columns)} feature columns")
print(f"Exported to: {dataset_final_path}")

# Read the exported file back; as the last expression of the cell it is displayed for checking
pd.read_pickle(dataset_final_path)

In [None]:
# Clear datasets from memory after verification.
# Names are removed with globals().pop(name, None) rather than `del`, so the cell does not
# raise NameError when a dataset was never created: dataset_to_finalise and dataset_preview
# only exist if the preview cell was run, and dataset_year_list only exists when
# yearly_targets is True.
for _dataset_name in (
    'dataset_to_finalise',
    'dataset_preview',
    'dataset_add_fea',
    'dataset_year_list',
    'dataset_final',
):
    globals().pop(_dataset_name, None)

# Export to .gpkg

In [None]:
# For verification and visualisation in GIS software.
# Prints one ready-to-paste assignment per file in the final datasets directory;
# copy the line for the .pkl that should be exported as a .gpkg.
final_dataset_files = os.listdir(datasets_final_dir)
for pkl in final_dataset_files:
    print('dataset_gpkg_name = "' + pkl + '"')

In [None]:
# Final dataset to export (paste a line printed by the previous cell)
dataset_gpkg_name = "agbd.pkl"

# Load the selected dataset from the final datasets directory
dataset_gpkg_path = join(datasets_final_dir, dataset_gpkg_name)
dataset_gpkg = pd.read_pickle(dataset_gpkg_path)

# Print a ready-to-paste list of the columns that can be included.
# 'tar_geometry' is left out of the printed list because the export cell appends it itself.
print("selected_gpkg_columns = [")
for col in sorted(dataset_gpkg.columns):
    if col == "tar_geometry":
        continue
    print(f'  "{col}",')
print("]")

In [None]:
# Columns to include in the .gpkg (edit the list printed by the previous cell)
selected_gpkg_columns = [
  "fea_disturbance_local_density_2010",
  "fea_disturbance_local_density_2011",
  "fea_disturbance_local_density_2012",
  "fea_disturbance_local_density_2013",
  "fea_disturbance_local_density_2014",
  "fea_disturbance_local_density_2015",
  "fea_disturbance_local_density_2016",
  "fea_disturbance_local_density_2017",
  "fea_disturbance_local_density_2018",
  "fea_disturbance_local_density_2019",
  "fea_disturbance_local_density_2020",
  "fea_disturbance_local_density_2021",
  "fea_disturbance_local_density_2022",
  "fea_disturbance_local_density_2023",
  "tar_shot_number",
  "tar_year",
]

# The geometry column is always exported, so append it to the selection
selected_gpkg_columns = [*selected_gpkg_columns, 'tar_geometry']

# Build a GeoDataFrame from the selected columns, using 'tar_geometry' as its geometry
dataset_gpkg_subset = dataset_gpkg[selected_gpkg_columns]
dataset_gpkg_geodataframe = gpd.GeoDataFrame(dataset_gpkg_subset, geometry='tar_geometry')

# Output file name: the dataset name with its last four characters (".pkl") replaced by ".gpkg"
gpkg_stem = dataset_gpkg_name[:-4]
dataset_gpkg_export = join(datasets_gpkg_dir, gpkg_stem + ".gpkg")
dataset_gpkg_geodataframe.to_file(dataset_gpkg_export, driver="GPKG")

# Disconnect runtime

In [None]:
# Useful for stopping background execution: placed as the last cell so the runtime
# is released as soon as all preceding cells have finished.
# NOTE(review): anything held only in memory is presumably lost once the runtime is
# unassigned - confirm all outputs have been written to Drive before running this.
from google.colab import runtime
runtime.unassign()