<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/2_targets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports, directories and global functions

In [None]:
# Define base directory
# Use '/content/drive/MyDrive/' for a personal drive
# Use '/gdrive/Shareddrives/' for a shared drive (must be created first)

base_dir = "/gdrive/Shareddrives/masfi"
# base_dir = '/gdrive/Shareddrives/masfi'

# Mount Google Drive at the mount point that matches the base_dir prefix
from google.colab import drive
import os
import sys
if not base_dir.startswith(('/gdrive/Shareddrives/', '/content/drive/MyDrive/')):
  # Unsupported location: nothing is mounted, only a hint is printed
  print("Create a base_dir beginning with '/gdrive/Shareddrives/' or '/content/drive/MyDrive/'.")
elif base_dir.startswith('/gdrive/Shareddrives/'):
  # Shared drive: the folder itself is expected to exist already
  drive.mount('/gdrive', force_remount=True)
else:
  # Personal drive: mount, then create the project folder if needed
  drive.mount('/content/drive', force_remount=True)
  os.makedirs(base_dir, exist_ok=True)

# Put the resolved base directory on the import path (once)
_path_to_add = os.path.realpath(base_dir)
if _path_to_add not in sys.path:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs and upgrades
!pip install geopandas

In [None]:
# Imports
import geopandas as gpd
import h5py
import ipywidgets as widgets
import json
import numpy as np
import pandas as pd
import pprint
import re
import requests
from time import sleep
import urllib
from datetime import datetime
from getpass import getpass
from google.colab import runtime
from http.cookiejar import CookieJar
from os import listdir, makedirs, path
from osgeo import gdal
gdal.UseExceptions()
from os.path import exists, join
import re
from requests.auth import HTTPBasicAuth
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import orient
from shapely.strtree import STRtree

In [None]:
# 1_areas directories (outputs of the previous notebook)
areas_dir = join(base_dir, "1_areas")
polygons_dir = join(areas_dir, "polygons")
masks_dir = join(areas_dir, "masks")

# 2_targets directories
targets_dir = join(base_dir, "2_targets")
gedi_links_dir = join(targets_dir, "gedi_h5_links")
gedi_h5_downloads_dir = join(targets_dir, "gedi_h5_downloads")
gedi_raster_downloads_dir = join(targets_dir, "gedi_raster_downloads")
gedi_raster_final_dir = join(targets_dir, "gedi_raster_final")
h5_pkl_cache_dir = join(targets_dir, "gedi_h5_pkl_cache")
pkl_final_dir = join(targets_dir, "pkl_final")
targets_gpkg_dir = join(targets_dir, "gpkg_final")
targets_user_csv_dir = join(targets_dir, "csv")

# Earthdata credentials are cleared here; the login cells prompt when they are empty
username, password = None, None

# Create directories
for _targets_subdir in (
    targets_dir,
    gedi_links_dir,
    gedi_h5_downloads_dir,
    gedi_raster_final_dir,
    gedi_raster_downloads_dir,
    h5_pkl_cache_dir,
    pkl_final_dir,
    targets_gpkg_dir,
    targets_user_csv_dir,
):
    makedirs(_targets_subdir, exist_ok=True)

# Download GEDI data

## Get links

In [None]:
# Define Prediction area in which to download GEDI data.
# For every GEDI product this cell queries the CMR granule search with the
# prediction area polygon and writes the matching download URLs to
# '<gedi_links_dir>/<product>_links.txt'.
prediction_area_path = join(polygons_dir, "prediction_area.gpkg")
if not exists(prediction_area_path):
  print(f"Run '1_areas.ipynb' to add 'prediction_area.gpkg' to {polygons_dir}.")
else:
  prediction_area = gpd.read_file(prediction_area_path)
  # Force a consistent ring winding (sign=1) before the geometry is uploaded
  prediction_area.geometry = prediction_area.geometry.apply(orient, args=(1,))

  # https://github.com/ornldaac/gedi_tutorials/blob/main/1_gedi_l4a_search_download.ipynb
  # CMR API base url
  cmrurl='https://cmr.earthdata.nasa.gov/search/'
  granulesearch = cmrurl + 'granules.json'

  # GEDI product DOI list (only used when no concept ID is hardcoded below)
  gedi_doi_list = {
  'GEDI01_B': '10.5067/GEDI/GEDI01_B.002', # GEDI L1B
  'GEDI02_A': '10.5067/GEDI/GEDI02_A.002', # GEDI L2A
  'GEDI02_B': '10.5067/GEDI/GEDI02_B.002', # GEDI L2B
  'GEDI04_A': '10.3334/ORNLDAAC/2056', # GEDI L4A
  'GEDI04_B': '10.3334/ORNLDAAC/2299', # GEDI L4B (was a copy of the L4A DOI)
  'GEDI04_C': '10.3334/ORNLDAAC/2338', # GEDI L4C
  'GEDI04_D': '10.3334/ORNLDAAC/2455', # GEDI L4D
  }

  # Some concept IDs do not retrieve dynamically
  gedi_concept_ids = {
    'GEDI01_B': 'C2142749196-LPCLOUD',
    'GEDI02_A': 'C2142771958-LPCLOUD',
    'GEDI02_B': 'C2142776747-LPCLOUD',
    'GEDI04_A': 'C2237824918-ORNL_CLOUD',
    'GEDI04_B': 'C2792577683-ORNL_CLOUD',
    'GEDI04_C': 'C3049900163-ORNL_CLOUD',
    'GEDI04_D': 'C3904758954-ORNL_CLOUD',
  }

  # Provider mapping for different GEDI products
  gedi_providers = {
    'GEDI01_B': 'LPCLOUD',
    'GEDI02_A': 'LPCLOUD',
    'GEDI02_B': 'LPCLOUD',
    'GEDI04_A': 'ORNL_CLOUD',
    'GEDI04_B': 'ORNL_CLOUD',
    'GEDI04_C': 'ORNL_CLOUD',
    'GEDI04_D': 'ORNL_CLOUD',
  }

  # File extension per product
  gedi_file_extensions = {
      'GEDI01_B': '.h5',
      'GEDI02_A': '.h5',
      'GEDI02_B': '.h5',
      'GEDI04_A': '.h5',
      'GEDI04_B': '.tif',
      'GEDI04_C': '.h5',
      'GEDI04_D': '.tif',
  }

  # The uploaded search geometry is identical for every product and page
  geojson = {"shapefile": ("prediction_area.geojson", prediction_area.geometry.to_json(), "application/geo+json")}

  for gedi_product, doi in gedi_doi_list.items():
    # Use hardcoded concept_id if available, otherwise retrieve from DOI.
    # .get() keeps the DOI fallback reachable for products without an entry.
    concept_id = gedi_concept_ids.get(gedi_product)
    if not concept_id:
      response = requests.get(cmrurl + 'collections.json?doi=' + doi)
      response.raise_for_status()
      concept_id = response.json()['feed']['entry'][0]['id']

    provider = gedi_providers.get(gedi_product)
    file_ext = gedi_file_extensions[gedi_product]
    page_num, page_size = 1, 2000 # CMR page size limit
    granule_arr = []
    while True:
      cmr_param = {"collection_concept_id": concept_id, "page_size": page_size, "page_num": page_num,
          "simplify-shapefile": 'true', # Needed to bypass 5000 coordinates limit of CMR
      }
      # Only constrain the provider when one is configured for the product
      if provider:
        cmr_param["provider"] = provider

      response = requests.post(granulesearch, data=cmr_param, files=geojson)
      # Fail with the HTTP error instead of an opaque JSON/KeyError further down
      response.raise_for_status()
      granules = response.json()['feed']['entry']
      if not granules:
        break  # An empty page marks the end of the results

      for g in granules:
        granule_url, granule_size, granule_poly = None, None, None
        # Keep the last link with the product's extension that is a direct
        # download (not OPeNDAP, not S3)
        for links in g['links']:
          href = links['href']
          if href.endswith(file_ext) and not href.startswith(('https://opendap', 's3')):
            granule_url = href
        if granule_url is None:
          continue  # Some GEDI2A granules do not have links in the metadata

        # Read file size; it is optional in the metadata, NaN is skipped by sum()
        raw_size = g.get('granule_size')
        granule_size = float(raw_size) if raw_size is not None else np.nan

        # Reading bounding geometries: each polygon's first ring is a flat
        # "lat lon lat lon ..." string, converted to (lon, lat) vertices
        if 'polygons' in g:
          multipolygons = []
          for poly in g['polygons']:
            coords = [float(value) for value in poly[0].split()]
            multipolygons.append(Polygon(list(zip(coords[1::2], coords[0::2]))))
          granule_poly = MultiPolygon(multipolygons)

        granule_arr.append([granule_url, granule_size, granule_poly])
      page_num += 1

    # Create pandas dataframe
    doi_df = pd.DataFrame(granule_arr, columns=["granule_url", "granule_size", "granule_poly"])
    # NOTE(review): the previous `granule_poly != ""` filter never removed rows
    # (a missing geometry is stored as None, not ""), so every granule with a
    # download link is kept. That behaviour is preserved here; confirm whether
    # granules without 'polygons' should really be dropped before changing it.
    gedi_product_granules = doi_df
    print(f"Total granules found for {gedi_product}: {len(gedi_product_granules)}")
    print(f"Total file size for {gedi_product}: {round(gedi_product_granules['granule_size'].sum() / 1024, 3)}GB\n")

    # Export links list
    if len(gedi_product_granules) > 0:
      gedi_product_links_path = join(gedi_links_dir, f"{gedi_product}_links.txt")
      gedi_product_granules.to_csv(gedi_product_links_path, columns=['granule_url'], index=False, header=False)
    else:
      print(f"No valid granules for {gedi_product} link export.")

## H5 files: 1B, 2A, 2B, 4A, 4C

In [None]:
# Login to Earthdata account
# Prompts only while no credentials are stored, then installs a global urllib
# opener that answers basic-auth challenges and keeps session cookies.
if not (username and password):
  username, password = getpass("Username: "), getpass("Password: ")
  manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
  for earthdata_host in ("https://urs.earthdata.nasa.gov",
                         "https://data.ornldaac.earthdata.nasa.gov"):
    manager.add_password(None, earthdata_host, username, password)
  cookie_jar = CookieJar()
  basic_auth_handler = urllib.request.HTTPBasicAuthHandler(manager)
  cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
  opener = urllib.request.build_opener(basic_auth_handler, cookie_handler)
  urllib.request.install_opener(opener)

In [None]:
# Get list of .txt files in the 'links' directory
link_files = [file for file in os.listdir(gedi_links_dir) if file.endswith('.txt')]

# Print a ready-to-paste selection for the H5 download cell. Both raster
# products (GEDI04_B and GEDI04_D link to .tif files) are left out so they are
# not downloaded into the H5 directory; they have their own download cells.
print('# Select downloads')
print("downloads = [")
for link_file in sorted(link_files):
  if not any(raster_product in link_file for raster_product in ('GEDI04_B', 'GEDI04_D')):
    print(f'  "{link_file}",')
print("]")

In [None]:
# Select downloads
downloads = [
  # "GEDI01_B_links.txt",
  # "GEDI02_A_links.txt",
  # "GEDI02_B_links.txt",
  "GEDI04_A_links.txt",
  # "GEDI04_C_links.txt",
]

# Process URLs: download every listed granule that is not on disk yet
for text_file in downloads:
  text_file_path = join(gedi_links_dir, text_file)
  with open(text_file_path, 'r') as file:
    # Strip white space and ignore blank lines
    url_list = [url.strip() for url in file if url.strip()]
  product = text_file[:8]
  product_dir = join(gedi_h5_downloads_dir, product)
  makedirs(product_dir, exist_ok=True)
  # Display progress
  index = 0
  progress_label = widgets.Label(value=f"{product} progress: {index}/{len(url_list)} files downloaded.")
  display(progress_label)
  for url in url_list:
    product_filename = url.split('/')[-1]
    product_file_path = join(product_dir, product_filename)
    if not exists(product_file_path):
      # Stream into a temporary '.part' file and rename only when complete, so
      # an interrupted download is never mistaken for a finished granule.
      partial_file_path = product_file_path + '.part'
      try:
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request) as response, open(partial_file_path, 'wb') as partial_file:
          # 1 MiB chunks keep memory use flat for multi-GB granules
          for chunk in iter(lambda: response.read(1024 * 1024), b''):
            partial_file.write(chunk)
        os.replace(partial_file_path, product_file_path)
      except Exception as e:
        print(f"Failed URL in {product}: {url}")
        print(f"Error: {e}")
      finally:
        # Also runs on a manual interrupt; a completed download was already renamed
        if exists(partial_file_path):
          os.remove(partial_file_path)
    # Update progress (counts processed URLs, including skipped and failed ones)
    index += 1
    progress_label.value = f"{product} progress: {index}/{len(url_list)} files downloaded."

## Rasters: 4B, 4D

In [None]:
# Login to Earthdata account
# Skipped when credentials were already entered earlier in the session.
if not (username and password):
  username = getpass("Username: ")
  password = getpass("Password: ")
  # Register the same credentials for the login service and the data host
  manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
  earthdata_hosts = [
      "https://urs.earthdata.nasa.gov",
      "https://data.ornldaac.earthdata.nasa.gov",
  ]
  for earthdata_host in earthdata_hosts:
    manager.add_password(None, earthdata_host, username, password)
  # Cookies carry the authenticated session across requests
  cookie_jar = CookieJar()
  handlers = (
      urllib.request.HTTPBasicAuthHandler(manager),
      urllib.request.HTTPCookieProcessor(cookie_jar),
  )
  opener = urllib.request.build_opener(*handlers)
  urllib.request.install_opener(opener)

In [None]:
# Get available raster link files
# Prints a ready-to-paste 'raster_downloads' list holding whichever of the two
# raster link files exist in the links directory.
print("# Select raster downloads")
print("raster_downloads = [")
for file in sorted(name for name in os.listdir(gedi_links_dir)
                   if name in ("GEDI04_B_links.txt", "GEDI04_D_links.txt")):
    print(f'  "{file}",')
print("]")

In [None]:
# Select raster downloads
raster_downloads = [
    "GEDI04_B_links.txt",
    "GEDI04_D_links.txt",
]

# Print available raster types per product, as ready-to-paste lists.
# The raster type is everything after the sixth underscore of the file name.
for text_file in ["GEDI04_B_links.txt", "GEDI04_D_links.txt"]:
    text_file_path = join(gedi_links_dir, text_file)
    if exists(text_file_path):
        product = text_file[:8]
        with open(text_file_path, 'r') as file:
            url_list = [url.strip() for url in file.readlines()]

        raster_types = {
            '_'.join(url.split('/')[-1].replace('.tif', '').split('_')[6:])
            for url in url_list
        }

        print(f"{product.lower()}_rasters = [")
        for r in sorted(raster_types):
            print(f'  "{r}",')
        print("]\n")

In [None]:
# Raster types to download for each gridded product. Only uncommented entries
# are active; the raster download cell keeps a link when its URL contains
# '_<entry>.tif'.

# GEDI04_B raster types
gedi04_b_rasters = [
  # "R01000M_MI",
  "R01000M_MU",
  # "R01000M_NC",
  # "R01000M_NS",
  "R01000M_PE",
  # "R01000M_PS",
  # "R01000M_QF",
  "R01000M_SE",
  # "R01000M_V1",
  # "R01000M_V2",
]

# GEDI04_D raster types
gedi04_d_rasters = [
  # "QA",
  "agbd",
  # "cover_z_000",
  # "rh_010",
  # "rh_020",
  # "rh_030",
  # "rh_040",
  # "rh_050",
  # "rh_060",
  # "rh_070",
  # "rh_080",
  # "rh_090",
  # "rh_095",
  # "rh_098",
  # "sensitivity_a2",
  # "shot_number",
]


In [None]:
# Download rasters: for each selected link file, fetch the .tif files whose
# raster type was selected and that are not on disk yet.
for text_file in raster_downloads:
    product = text_file[:8]
    text_file_path = join(gedi_links_dir, text_file)
    # A product without granules has no link file; skip it instead of crashing
    if not exists(text_file_path):
        print(f"{text_file} not found in {gedi_links_dir}, skipping.")
        continue
    with open(text_file_path, 'r') as file:
        # Strip white space and ignore blank lines
        url_list = [url.strip() for url in file if url.strip()]

    selected_types = gedi04_b_rasters if product == "GEDI04_B" else gedi04_d_rasters
    selected_suffixes = tuple(f"_{r}.tif" for r in selected_types)
    filtered_urls = [url for url in url_list if any(suffix in url for suffix in selected_suffixes)]

    product_dir = join(gedi_raster_downloads_dir, product)
    makedirs(product_dir, exist_ok=True)

    index = 0
    progress_label = widgets.Label(value=f"{product} progress: {index}/{len(filtered_urls)} files downloaded.")
    display(progress_label)

    for url in filtered_urls:
        product_filename = url.split('/')[-1]
        product_file_path = join(product_dir, product_filename)
        if not exists(product_file_path):
            # Stream into a temporary '.part' file and rename only when
            # complete, so a broken download is retried on the next run
            # instead of being treated as a finished raster.
            partial_file_path = product_file_path + '.part'
            try:
                request = urllib.request.Request(url)
                with urllib.request.urlopen(request) as response, open(partial_file_path, 'wb') as partial_file:
                    for chunk in iter(lambda: response.read(1024 * 1024), b''):
                        partial_file.write(chunk)
                os.replace(partial_file_path, product_file_path)
            except Exception as e:
                print(f"Failed: {url}\nError: {e}")
            finally:
                # Also runs on a manual interrupt; a completed download was already renamed
                if exists(partial_file_path):
                    os.remove(partial_file_path)
        # Counts processed URLs, including skipped and failed ones
        index += 1
        progress_label.value = f"{product} progress: {index}/{len(filtered_urls)} files downloaded."

# Finalise GEDI rasters

In [None]:
# Clip GEDI rasters, output original and reprojected versions
nodatavalue = -11111


def _warp_if_missing(output_path, tiles, start_message, exists_message, **warp_kwargs):
    """Warp `tiles` into `output_path` unless that file already exists.

    Args:
        output_path: Destination GeoTIFF; its directory is created on demand.
        tiles: List of source raster paths passed to gdal.Warp.
        start_message: Printed just before a new output is written.
        exists_message: Printed when the output is already present.
        **warp_kwargs: Extra gdal.WarpOptions arguments (e.g. dstSRS, outputBounds).

    Returns:
        True if the output was written by this call, False if it already existed.
    """
    if exists(output_path):
        print(exists_message)
        return False
    makedirs(os.path.dirname(output_path), exist_ok=True)
    print(start_message)
    gdal.Warp(output_path, tiles, options=gdal.WarpOptions(
        dstNodata=nodatavalue, format='GTiff',
        creationOptions=['COMPRESS=ZSTD', 'ZSTD_LEVEL=1'], **warp_kwargs))
    return True


def _raster_has_data(raster_path):
    """Return True if band 1 of `raster_path` has any cell other than `nodatavalue`."""
    ds = gdal.Open(raster_path)
    arr = ds.GetRasterBand(1).ReadAsArray()
    ds = None  # Release the GDAL dataset handle
    return not (arr == nodatavalue).all()


# Define template area polygon
template_area = gpd.read_file(join(polygons_dir, "template.gpkg"))
# Geographic bounds and centroid must come from EPSG:4326 coordinates, whatever
# CRS the template file is stored in
template_area_4326 = template_area.to_crs(epsg=4326)
bbox_4326 = list(template_area_4326.total_bounds)

# Determine centroid UTM zone (capped at 60 so longitude 180 does not yield zone 61)
centroid = template_area_4326.union_all().centroid
utm_zone = min(int((centroid.x + 180) / 6) + 1, 60)
centroid_utm_epsg = 32600 + utm_zone if centroid.y >= 0 else 32700 + utm_zone
centroid_utm_name = f"UTM{utm_zone}_{'North' if centroid.y >= 0 else 'South'}"
print(f"Centroid UTM: {centroid_utm_name}, EPSG:{centroid_utm_epsg}")

# Pre-compute transformed bounds
bbox_6933 = list(template_area.to_crs(epsg=6933).total_bounds)
bbox_centroid_utm = list(template_area.to_crs(epsg=centroid_utm_epsg).total_bounds)

# Process GEDI04_B
gedi04b_dir = join(gedi_raster_downloads_dir, "GEDI04_B")
if exists(gedi04b_dir) and os.listdir(gedi04b_dir):
    print("\nGEDI04_B detected")
    gedi04b_final_dir = join(gedi_raster_final_dir, "GEDI04_B")
    # Group tiles by raster type (file name after the sixth underscore, resolution prefix removed)
    raster_groups = {}
    for file in sorted(os.listdir(gedi04b_dir)):
        if file.endswith(".tif"):
            raster_type = '_'.join(file.replace('.tif', '').split('_')[6:]).replace('R01000M_', '')
            raster_groups.setdefault(raster_type, []).append(join(gedi04b_dir, file))

    for raster_type, tiles in raster_groups.items():
        # Original EPSG:6933
        output_original = join(gedi04b_final_dir, f"GEDI04_B_original_epsg6933_EASE-Grid_{raster_type}.tif")
        _warp_if_missing(
            output_original, tiles,
            f"  {raster_type}: clipping in native CRS (EPSG:6933 EASE-Grid)",
            f"  {raster_type}: original exists",
            outputBounds=bbox_6933)

        # Reprojected to WGS84
        output_reproj = join(gedi04b_final_dir, f"GEDI04_B_reprojected_epsg4326_WGS84_{raster_type}.tif")
        _warp_if_missing(
            output_reproj, tiles,
            f"  {raster_type}: reprojecting to EPSG:4326 WGS84",
            f"  {raster_type}: WGS84 exists",
            dstSRS='EPSG:4326', outputBounds=bbox_4326)
else: print("\nGEDI04_B: not detected")

# Process GEDI04_D
gedi04d_dir = join(gedi_raster_downloads_dir, "GEDI04_D")
if exists(gedi04d_dir) and os.listdir(gedi04d_dir):
    print("\nGEDI04_D detected")
    gedi04d_final_dir = join(gedi_raster_final_dir, "GEDI04_D")

    # Group by raster type and UTM zone
    raster_groups = {}
    all_by_type = {}
    for file in sorted(os.listdir(gedi04d_dir)):
        if not file.endswith(".tif"):
            continue
        raster_type = '_'.join(file.replace('.tif', '').split('_')[6:])
        filepath = join(gedi04d_dir, file)
        all_by_type.setdefault(raster_type, []).append(filepath)
        match = re.search(r'(UTM\d+)_(North|South)', file)
        if not match:
            # Without a zone there is no native CRS to clip in; the tile still
            # takes part in the merged reprojections below.
            print(f"  {file}: no UTM zone in file name, skipping native CRS clip")
            continue
        utm_name = f"{match.group(1)}_{match.group(2)}"
        zone_num = int(match.group(1).replace('UTM', ''))
        utm_epsg = 32600 + zone_num if match.group(2) == 'North' else 32700 + zone_num
        raster_groups.setdefault((raster_type, utm_name, utm_epsg), []).append(filepath)

    # Original: clip each UTM zone separately, track which have data
    zones_with_data = {}
    bbox_by_epsg = {}  # Template bounds per UTM EPSG, computed once per zone
    for (raster_type, utm_name, utm_epsg), tiles in raster_groups.items():
        output_original = join(gedi04d_final_dir, f"GEDI04_D_original_epsg{utm_epsg}_{utm_name}_{raster_type}.tif")
        if utm_epsg not in bbox_by_epsg:
            bbox_by_epsg[utm_epsg] = list(template_area.to_crs(epsg=utm_epsg).total_bounds)
        created = _warp_if_missing(
            output_original, tiles,
            f"  {raster_type}: clipping in native CRS (EPSG:{utm_epsg} {utm_name})",
            f"  {raster_type} ({utm_name}): original exists",
            outputBounds=bbox_by_epsg[utm_epsg])
        if _raster_has_data(output_original):
            zones_with_data.setdefault(raster_type, []).append((utm_name, utm_epsg))
        elif created:
            # A freshly written clip without data is deleted; an existing empty file is left alone
            os.remove(output_original)
            print(f"    {utm_name}: no data in extent, removed")

    # Merged reprojections
    for raster_type, tiles in all_by_type.items():
        valid_zones = zones_with_data.get(raster_type, [])
        # Reprojected to centroid UTM (skip if only 1 zone with data)
        if len(valid_zones) > 1:
            output_utm = join(gedi04d_final_dir, f"GEDI04_D_reprojected_epsg{centroid_utm_epsg}_{centroid_utm_name}_{raster_type}.tif")
            _warp_if_missing(
                output_utm, tiles,
                f"  {raster_type}: merging {len(valid_zones)} zones and reprojecting to EPSG:{centroid_utm_epsg} {centroid_utm_name}",
                f"  {raster_type}: {centroid_utm_name} exists",
                dstSRS=f'EPSG:{centroid_utm_epsg}', outputBounds=bbox_centroid_utm)
        elif len(valid_zones) == 1:
            print(f"  {raster_type}: only 1 zone with data ({valid_zones[0][0]}), skipping centroid UTM reprojection")

        # Reprojected to EPSG:6933
        output_6933 = join(gedi04d_final_dir, f"GEDI04_D_reprojected_epsg6933_EASE-Grid_{raster_type}.tif")
        _warp_if_missing(
            output_6933, tiles,
            f"  {raster_type}: merging and reprojecting to EPSG:6933 EASE-Grid",
            f"  {raster_type}: EASE-Grid exists",
            dstSRS='EPSG:6933', outputBounds=bbox_6933)

        # Reprojected to WGS84
        output_4326 = join(gedi04d_final_dir, f"GEDI04_D_reprojected_epsg4326_WGS84_{raster_type}.tif")
        _warp_if_missing(
            output_4326, tiles,
            f"  {raster_type}: merging and reprojecting to EPSG:4326 WGS84",
            f"  {raster_type}: WGS84 exists",
            dstSRS='EPSG:4326', outputBounds=bbox_4326)
else: print("\nGEDI04_D: not detected")

# Convert GEDI .h5 to .pkl


In [None]:
# Load project and GEDI area polygons (filters to extent)
project_area_polygon_path = join(polygons_dir, 'project_area.gpkg')
project_area = gpd.read_file(project_area_polygon_path)
print("Project area polygon:")
display(project_area["geometry"].iloc[0])

# The GEDI (prediction) area is reduced to its first geometry; its bounding box
# (min lon, min lat, max lon, max lat) gives the rectangular extent filter.
prediction_area_polygon_path = join(polygons_dir, "prediction_area.gpkg")
prediction_area = gpd.read_file(prediction_area_polygon_path)
prediction_area = prediction_area["geometry"].iloc[0]
prediction_area_nw_lon, prediction_area_se_lat = prediction_area.bounds[:2]
prediction_area_se_lon, prediction_area_nw_lat = prediction_area.bounds[2:]
print("GEDI area polygon:")
display(prediction_area)

In [None]:
# Print GEDI products in the 'downloads' directory.
# Only sub-directories count as products; stray files in the downloads
# directory would otherwise be treated as product folders by later cells.
gedi_products = sorted(
    entry for entry in listdir(gedi_h5_downloads_dir)
    if os.path.isdir(join(gedi_h5_downloads_dir, entry))
)
print(f'Found the following GEDI products in the GEDI downloads directory:\n {gedi_products}')

In [None]:
# Print GEDI product parameters and data types

# Simplified function from pyGEDI
def getLayer(layer, files, beam='BEAM0000'):
    """List the datasets under one beam of each HDF5 file whose path contains `layer`.

    Args:
        layer: Substring to look for in the dataset paths; '' matches every dataset.
        files: Iterable of open h5py.File objects.
        beam: Name of the top-level beam group to walk (default 'BEAM0000').

    Returns:
        Dict mapping each file's `filename` to its matching dataset paths, given
        relative to the beam group. Files that lack the beam, or that have no
        matching dataset, are left out.
    """
    beam_prefix = f'/{beam}/'
    dictionary = {}
    for h5_file in files:
        # A file without the requested beam has nothing to list
        if beam not in h5_file:
            continue
        # Depth-first walk over the beam group using an explicit stack
        layers, stack = [], [h5_file[beam]]
        while stack:
            item = stack.pop()
            if isinstance(item, h5py.Dataset):
                layers.append(item.name.replace(beam_prefix, ''))
            elif isinstance(item, h5py.Group):
                stack.extend(item.values())
        filtered_layers = [l for l in layers if layer in l]
        if filtered_layers:
            dictionary[h5_file.filename] = filtered_layers
    return dictionary

# Build {product: {parameter: dtype}} from one sample file per product
dict_gedi_parameters = {}
for product in gedi_products:
  product_dir = join(gedi_h5_downloads_dir, product)
  # Sample a single .H5 file (ignore anything else stored in the folder)
  h5_names = sorted(name for name in listdir(product_dir) if name.lower().endswith('.h5'))
  if not h5_names:
    print(f"No .h5 files found for {product}, skipping.")
    continue
  sample_file_path = h5_names[0]
  # The context manager closes every sampled file, not only the last one
  with h5py.File(join(product_dir, sample_file_path), 'r') as h5_file:
    layers = list(getLayer('', [h5_file]).values())[0]
    # Sample from beam '0000': keep only 1-D layers with one value per shot
    compatible_shape = (len(h5_file['BEAM0000']['shot_number']), )
    dict_layer = {}
    for layer in layers:
      layer_shape = h5_file['BEAM0000'][layer].shape
      if layer_shape == compatible_shape:
        # Get data types as pandas reports them
        df_layer = pd.DataFrame()
        df_layer[layer] = h5_file['BEAM0000'][layer]
        # Add parameter name and data type to dictionary
        dict_layer[layer] = str(df_layer[layer].dtype)
  dict_gedi_parameters[product] = dict_layer
# Print config
print("Copy selected targets into the next cell.\n")
pp = pprint.PrettyPrinter(indent=1)
pp.pprint(dict_gedi_parameters)
# Release the last sampled column
df_layer = None

In [None]:
# Copy and paste GEDI parameters for inclusion in .pkl outputs.
# Each parameter requires additional memory and time to process.
# Changing the data type (e.g. float64 -> float32) may increase performance,
# but can reduce precision and cause other issues. For example, float32 rounds to
# about 7 significant digits, which is imprecise for coordinates (lat and lon
# lowest mode).
# Include at least beam, lat_lowestmode, lon_lowestmode, shot_number
# and ONE quality flag for each product.

supported_data_types = ["int8","int16","int32","int64","uint8","uint16","uint32","uint64","float16","float32","float64","str"]

selected_parameters = {

  'GEDI04_A': {
    'agbd': 'float32',
    'agbd_se': 'float32',
    'beam': 'str',
    'elev_lowestmode': 'float32',
    'lat_lowestmode': 'float64',
    'lon_lowestmode': 'float64',
    'l4_quality_flag': 'uint8',
    'sensitivity': 'float32',
    'shot_number': 'str',
  },

  # 'GEDI02_A': {
  #   'rh_parameters': 'float16', # e.g. rh25, rh50, rh75, rh95, rh98, modify in block below
  #   'beam': 'str',
  #   # 'elev_highestreturn': 'float32',
  #   'elev_lowestmode': 'float32',
  #   'lat_lowestmode': 'float64',
  #   'lon_lowestmode': 'float64',
  #   'quality_flag': 'uint8',
  #   'sensitivity': 'float32',
  #   'shot_number': 'str',
  # },

  # 'GEDI02_B': {
  #   'beam': 'str',
  #   'cover': 'float32',
  #   'fhd_normal': 'float32',
  #   'omega': 'float32',
  #   'pai': 'float32',
  #   'geolocation/lat_lowestmode': 'float64',
  #   'geolocation/lon_lowestmode': 'float64',
  #   'l2b_quality_flag': 'uint8',
  #   'sensitivity': 'float32',
  #   'shot_number': 'str',
  # },

}

# Check if GEDI products in the selected parameters match those in the downloads directory
assert set(selected_parameters.keys()).issubset(set(gedi_products)), f"GEDI products in selected parameters do not match those in {gedi_h5_downloads_dir}"
# Check if parameters match those in the selected_parameters dictionary
for product in selected_parameters.keys():
  parameter_list = list(dict_gedi_parameters[product].keys())
  if product == 'GEDI02_A': parameter_list.append("rh_parameters") # These are custom parameters added later
  assert set(selected_parameters[product]).issubset(set(parameter_list)), f"{product}'s selected parameters are not all available"
  # The conversion cell reads parameters['shot_number'] directly, so it must be present
  assert 'shot_number' in selected_parameters[product], f"{product}'s selected parameters must include 'shot_number'"
# Check that data types are supported (unique values, first-seen order)
data_types_list = list(dict.fromkeys(
    data_type
    for product_parameters in selected_parameters.values()
    for data_type in product_parameters.values()
))
assert set(data_types_list).issubset(set(supported_data_types)), "An unsupported data type has been selected."
print("All selected GEDI products, parameters and data types successfully applied for conversion from .h5 to .pkl format.")

In [None]:
# Convert .h5 files to .pkl

use_project_area = False  # Toggle to use complex project area bounds, will impact performance.
test_processing = False  # Toggle for testing mode to limit the number of files processed.
test_number = 20  # Number of files to process in test mode.
sensitivity_threshold = 0.95  # Sensitivity threshold for data filtering.
print_filename = True  # Toggle to print the filename being processed.
max_attempts = 3 # At trying to open an .h5 file if it initially fails
delay = 5 # Waiting between attempts

# Loop through products and their parameters.
for product, parameters in selected_parameters.items():
    # Set up directories for caching and downloading.
    gedi_h5_pkl_cache_product_dir = join(h5_pkl_cache_dir, product)
    makedirs(gedi_h5_pkl_cache_product_dir, exist_ok=True)
    gedi_downloads_product_dir = join(gedi_h5_downloads_dir, product)

    # List .h5 files to be processed.
    h5_data = [[join(gedi_downloads_product_dir, h5_file_dir), parameters, gedi_h5_pkl_cache_product_dir] for h5_file_dir in listdir(gedi_downloads_product_dir)]

    # Initialize progress display.
    index = 0
    progress_label = widgets.Label(value=f"{product} progress: {index}/{len(h5_data)} H5 files converted to a .pkl cache.")
    display(progress_label)

    # Processing loop for each file.
    for filename, parameters_dict, gedi_h5_pkl_cache_product_dir in h5_data:

        # Construct the destination filename for the .pkl file
        dst_filename = join(gedi_h5_pkl_cache_product_dir, path.splitext(path.split(filename)[-1])[0]) + '.pkl'

        # Check if .pkl file already exists to skip processing
        if not exists(dst_filename):
            if print_filename: print(f"Processing started: {filename.split('/')[-1]}")

            # Extract data from h5 first (keeping it open is unstable on Google Drive)
            h5_file = {}
            for attempt in range(max_attempts):
                try:
                    with h5py.File(filename, 'r') as h5:
                        for beam in ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011']:
                            if beam not in h5: continue  # Skip missing beams.
                            h5_file[beam] = {}
                            latlayer, lonlayer = ('lat_lowestmode', 'lon_lowestmode') if 'lat_lowestmode' in h5[beam] else ('geolocation/lat_lowestmode', 'geolocation/lon_lowestmode')
                            h5_file[beam][latlayer] = h5[beam][latlayer][:]
                            h5_file[beam][lonlayer] = h5[beam][lonlayer][:]
                            h5_file[beam]['shot_number'] = h5[beam]['shot_number'][:].astype(parameters_dict['shot_number'])
                            for layer in parameters_dict.keys():
                                if layer in h5[beam]: h5_file[beam][layer] = h5[beam][layer][:]
                                elif layer == 'rh_parameters': h5_file[beam]['rh'] = h5[beam]['rh'][:, :100]
                    break  # If successful, break the retry loop
                except Exception as e:
                    if attempt < max_attempts - 1:
                        print(f"Error opening file (attempt {attempt + 1}/{max_attempts}): {str(e)}")
                        print(f"Retrying in {delay} seconds")
                        sleep(delay)
                    else:
                        print(f"Failed to open file after {max_attempts} attempts. Skipping.")
                        continue  # Skip to the next file in the outer loop
            if attempt == max_attempts - 1: continue  # Skip to the next file if all attempts failed

            layer_df_list = []  # List to store data frames for each layer and beam combination.
            first_layer = True # For shot_number and beam to be added

            # Iterate through each parameter layer 'beam' and 'shot_number'.
            for layer in [layer for layer in parameters_dict.keys() if layer not in ('beam', 'shot_number')]:
                beam_df_list = []
                # Iterate through each beam.
                for beam in ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011']:
                    if beam not in h5_file: continue  # Skip missing beams.
                    # Define geospatial extent.
                    latlayer, lonlayer = 'lat_lowestmode', 'lon_lowestmode'
                    if latlayer not in h5_file[beam]:
                        latlayer, lonlayer = 'geolocation/lat_lowestmode', 'geolocation/lon_lowestmode'
                    lat_array = h5_file[beam][latlayer]
                    lon_array = h5_file[beam][lonlayer]

                    # Calculate index based on geographic filters.
                    if use_project_area:
                        project_polygon = project_area.geometry.iloc[0]
                        gedi_points = gpd.GeoDataFrame(geometry=gpd.points_from_xy(lon_array, lat_array), crs="EPSG:4326")
                        points_tree = STRtree(gedi_points.geometry.values)
                        points_tree_indices = points_tree.query(project_polygon, predicate='contains')
                        if len(points_tree_indices) > 0: geo_index = points_tree_indices
                        else: geo_index = np.array([], dtype=int)
                    else: geo_index = np.where((lat_array > prediction_area_se_lat) & (lat_array < prediction_area_nw_lat) & (lon_array > prediction_area_nw_lon) & (lon_array < prediction_area_se_lon))[0]

                    # Collect data for each layer and beam, handling special cases.
                    beam_df = pd.DataFrame()
                    if layer == 'rh_parameters':
                        # All rh values
                        data = h5_file[beam]['rh'][geo_index][:, :100]
                        columns = [f'rh{i+1}' for i in range(100)]
                        # rh values in intervals of 5
                        # data = h5_file[beam]['rh'][geo_index][:, [0, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 97, 99]]
                        # columns = ['rh1', 'rh5', 'rh10', 'rh15', 'rh20', 'rh25', 'rh30', 'rh35', 'rh40', 'rh45', 'rh50', 'rh55', 'rh60', 'rh65', 'rh70', 'rh75', 'rh80', 'rh85', 'rh90', 'rh95', 'rh98', 'rh100']
                        beam_df = pd.DataFrame(data, columns=columns)
                    elif h5_file[beam][layer].shape == (len(lon_array), ):
                        beam_df[layer] = h5_file[beam][layer][geo_index].astype(parameters_dict[layer])

                    if len(beam_df) > 0:
                      if first_layer:
                        beam_df.insert(0, 'shot_number', h5_file[beam]['shot_number'][geo_index].astype(parameters_dict['shot_number']))
                        beam_df.insert(1, 'beam', beam)

                    beam_df_list.append(beam_df)

                # Concatenate data frames for all beams within the same layer.
                if len(beam_df_list) > 0:
                    layer_df = pd.concat(beam_df_list)
                    layer_df_list.append(layer_df)
                    first_layer = False

            first_layer = True # Restart first layer for adding shot_number and beam

            # Concatenate all layer data frames for the file.
            if len(layer_df_list) > 0:
                h5_df = pd.concat(layer_df_list, axis=1)
                # Define timestamp and convert geodataframe.
                for part in filename.split('_'):
                  try: timestamp = pd.to_datetime(part, format='%Y%j%H%M%S', utc=True)
                  except: continue
                h5_df['timestamp'] = timestamp
                # Apply quality and sensitivity filters.
                quality_flag = [key for key in parameters_dict.keys() if 'quality_flag' in key][0]
                h5_df = h5_df.loc[(h5_df[quality_flag] == 1) & (h5_df['sensitivity'] >= sensitivity_threshold)]
                h5_df = h5_df.drop(columns=[quality_flag])
                # Final type check and conversion if necessary
                for column, dtype in parameters_dict.items():
                    if column in h5_df.columns:
                        h5_df[column] = h5_df[column].astype(dtype)
                    elif column == 'rh_parameters':
                        rh_columns = [col for col in h5_df.columns if col.startswith('rh')]
                        h5_df[rh_columns] = h5_df[rh_columns].astype(dtype)
                # Export to pickle
                h5_df.to_pickle(dst_filename)
                if print_filename: print(f"Processing complete: {filename.split('/')[-1]}")

        # Update progress display.
        index += 1
        progress_label.value = f"{product} progress: {index}/{len(h5_data)} H5 files converted to a .pkl cache."
        if test_processing and index == test_number: break  # Break after processing specified number of files if in test mode.

In [None]:
# Settings for converting downloaded .h5 files to .pkl caches.

# Filtering.
use_project_area = False  # True filters points with the project area polygon (slower than the bounding box comparison).
sensitivity_threshold = 0.95  # Rows with a sensitivity below this value are dropped.

# Test mode: stop after 'test_number' files.
test_processing, test_number = False, 20

# Retry behaviour when an .h5 file cannot be opened: number of attempts, seconds between them.
max_attempts, delay = 3, 5

# Logging.
print_filename = True  # Print each filename when processing starts and completes.

# Loop through products and their parameters.
# Each downloaded .h5 file is read fully into memory, reduced to the shots
# inside the prediction (or project) area, filtered by quality flag and
# sensitivity, and written to one .pkl cache per file. Files that already
# have a cache are skipped, so the cell can be re-run to resume.
gedi_beams = ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011']

for product, parameters in selected_parameters.items():
    # Set up directories for caching and downloading.
    gedi_h5_pkl_cache_product_dir = join(h5_pkl_cache_dir, product)
    makedirs(gedi_h5_pkl_cache_product_dir, exist_ok=True)
    gedi_downloads_product_dir = join(gedi_h5_downloads_dir, product)

    # Validate quality_flag parameter exists.
    quality_flags = [key for key in parameters.keys() if 'quality_flag' in key]
    if not quality_flags:
        print(f"No quality_flag parameter found in {product} parameters. Skipping product.")
        continue
    quality_flag = quality_flags[0]

    # List .h5 files to be processed.
    h5_data = [[join(gedi_downloads_product_dir, h5_file_dir), parameters, gedi_h5_pkl_cache_product_dir] for h5_file_dir in listdir(gedi_downloads_product_dir)]

    # Initialize progress display.
    index = 0
    progress_label = widgets.Label(value=f"{product} progress: {index}/{len(h5_data)} H5 files converted to a .pkl cache.")
    display(progress_label)

    # Processing loop for each file.
    for filename, parameters_dict, gedi_h5_pkl_cache_product_dir in h5_data:

        h5_name = path.basename(filename)

        # Construct the destination filename for the .pkl file.
        dst_filename = join(gedi_h5_pkl_cache_product_dir, path.splitext(h5_name)[0]) + '.pkl'

        # Check if .pkl file already exists to skip processing.
        if not exists(dst_filename):
            if print_filename: print(f"Processing started: {h5_name}")

            # Extract data from h5 first (keeping it open is unstable on Google Drive).
            h5_file = {}
            file_opened = False
            for attempt in range(max_attempts):
                try:
                    with h5py.File(filename, 'r') as h5:
                        for beam in gedi_beams:
                            if beam not in h5: continue  # Skip missing beams.
                            h5_file[beam] = {}
                            latlayer, lonlayer = ('lat_lowestmode', 'lon_lowestmode') if 'lat_lowestmode' in h5[beam] else ('geolocation/lat_lowestmode', 'geolocation/lon_lowestmode')
                            h5_file[beam][latlayer] = h5[beam][latlayer][:]
                            h5_file[beam][lonlayer] = h5[beam][lonlayer][:]
                            h5_file[beam]['shot_number'] = h5[beam]['shot_number'][:].astype(parameters_dict['shot_number'])
                            for layer in parameters_dict.keys():
                                if layer in h5[beam]: h5_file[beam][layer] = h5[beam][layer][:]
                                elif layer == 'rh_parameters': h5_file[beam]['rh'] = h5[beam]['rh'][:, :100]
                    file_opened = True
                    break
                except Exception as e:
                    if attempt < max_attempts - 1:
                        print(f"Error opening file (attempt {attempt + 1}/{max_attempts}): {str(e)}")
                        print(f"Retrying in {delay} seconds")
                        sleep(delay)
                    else:
                        print(f"Failed to open file after {max_attempts} attempts. Skipping.")

            if not file_opened:
                index += 1
                progress_label.value = f"{product} progress: {index}/{len(h5_data)} H5 files converted to a .pkl cache."
                # A failed file still counts towards the test limit; without this
                # check the limit could be stepped over and never trigger.
                if test_processing and index >= test_number: break
                continue

            # The geographic selection depends only on the beam, so compute it
            # once per beam instead of once per layer and beam.
            beam_selection = {}
            for beam in gedi_beams:
                if beam not in h5_file: continue  # Skip missing beams.
                # Define geospatial extent.
                latlayer, lonlayer = 'lat_lowestmode', 'lon_lowestmode'
                if latlayer not in h5_file[beam]:
                    latlayer, lonlayer = 'geolocation/lat_lowestmode', 'geolocation/lon_lowestmode'
                lat_array = h5_file[beam][latlayer]
                lon_array = h5_file[beam][lonlayer]

                # Calculate index based on geographic filters.
                if use_project_area:
                    project_polygon = project_area.geometry.iloc[0]
                    gedi_points = gpd.GeoDataFrame(geometry=gpd.points_from_xy(lon_array, lat_array), crs="EPSG:4326")
                    points_tree = STRtree(gedi_points.geometry.values)
                    points_tree_indices = points_tree.query(project_polygon, predicate='contains')
                    if len(points_tree_indices) > 0: geo_index = points_tree_indices
                    else: geo_index = np.array([], dtype=int)
                else: geo_index = np.where((lat_array > prediction_area_se_lat) & (lat_array < prediction_area_nw_lat) & (lon_array > prediction_area_nw_lon) & (lon_array < prediction_area_se_lon))[0]

                # Keep the selected row indices and the total shot count of the beam.
                beam_selection[beam] = (geo_index, len(lon_array))

            layer_df_list = []  # List to store data frames for each layer and beam combination.
            first_layer = True  # For shot_number and beam to be added.

            # Iterate through each parameter layer except 'beam' and 'shot_number',
            # which are inserted alongside the first layer instead.
            for layer in [layer for layer in parameters_dict.keys() if layer not in ('beam', 'shot_number')]:
                beam_df_list = []
                # Iterate through each beam present in the file.
                for beam, (geo_index, shot_count) in beam_selection.items():
                    # Collect data for each layer and beam, handling special cases.
                    beam_df = pd.DataFrame()
                    if layer == 'rh_parameters':
                        # All rh values.
                        data = h5_file[beam]['rh'][geo_index][:, :100]
                        columns = [f'rh{i+1}' for i in range(100)]
                        # rh values in intervals of 5.
                        # data = h5_file[beam]['rh'][geo_index][:, [0, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 97, 99]]
                        # columns = ['rh1', 'rh5', 'rh10', 'rh15', 'rh20', 'rh25', 'rh30', 'rh35', 'rh40', 'rh45', 'rh50', 'rh55', 'rh60', 'rh65', 'rh70', 'rh75', 'rh80', 'rh85', 'rh90', 'rh95', 'rh98', 'rh100']
                        beam_df = pd.DataFrame(data, columns=columns)
                    elif h5_file[beam][layer].shape == (shot_count, ):
                        # Only one-value-per-shot layers are transferred.
                        beam_df[layer] = h5_file[beam][layer][geo_index].astype(parameters_dict[layer])

                    if len(beam_df) > 0 and first_layer:
                        beam_df.insert(0, 'shot_number', h5_file[beam]['shot_number'][geo_index].astype(parameters_dict['shot_number']))
                        beam_df.insert(1, 'beam', beam)

                    beam_df_list.append(beam_df)

                # Concatenate data frames for all beams within the same layer.
                if len(beam_df_list) > 0:
                    layer_df = pd.concat(beam_df_list)
                    layer_df_list.append(layer_df)
                    first_layer = False

            # Concatenate all layer data frames for the file.
            if len(layer_df_list) > 0:
                h5_df = pd.concat(layer_df_list, axis=1)
                # Define timestamp from the filename part matching year, day of
                # year and time. Reset for every file so that a value parsed
                # from a previous file can never be reused.
                timestamp = None
                for part in h5_name.split('_'):
                    try: timestamp = pd.to_datetime(part, format='%Y%j%H%M%S', utc=True)
                    except (ValueError, TypeError, OverflowError): continue
                if timestamp is None:
                    # No cache is written, so the file is retried on the next run.
                    print(f"No timestamp found in filename: {h5_name}. Skipping.")
                else:
                    h5_df['timestamp'] = timestamp
                    # Apply quality and sensitivity filters.
                    h5_df = h5_df.loc[(h5_df[quality_flag] == 1) & (h5_df['sensitivity'] >= sensitivity_threshold)]
                    h5_df = h5_df.drop(columns=[quality_flag])
                    # Final type check and conversion if necessary.
                    for column, dtype in parameters_dict.items():
                        if column in h5_df.columns:
                            h5_df[column] = h5_df[column].astype(dtype)
                        elif column == 'rh_parameters':
                            rh_columns = [col for col in h5_df.columns if col.startswith('rh')]
                            h5_df[rh_columns] = h5_df[rh_columns].astype(dtype)
                    # Export to pickle.
                    h5_df.to_pickle(dst_filename)
                    if print_filename: print(f"Processing complete: {h5_name}")

        # Update progress display.
        index += 1
        progress_label.value = f"{product} progress: {index}/{len(h5_data)} H5 files converted to a .pkl cache."
        if test_processing and index >= test_number: break  # Break after processing specified number of files if in test mode.

In [None]:
# Print the products that have a cache directory as a paste-ready list.
print('products_to_finalise = [')
cached_products = os.listdir(h5_pkl_cache_dir)
if cached_products:
  print('\n'.join(f"'{cached_product}'," for cached_product in cached_products))
print(']')

In [None]:
# Products whose cached .pkl files will be merged into a final .pkl.
products_to_finalise = ["GEDI04_A"]

In [None]:
# Merge the per-file caches of each product into one final geodataframe .pkl.
for product in products_to_finalise:
  final_pkl_path = join(pkl_final_dir, f"{product}.pkl")
  product_cache_dir = join(h5_pkl_cache_dir, product)
  # Names containing ' (' (e.g. ' (1)') are treated as duplicates and left out.
  cache_files = [f for f in os.listdir(product_cache_dir) if ' (' not in f]
  if not cache_files:
    print(f"No cache data found for {product}.")
    continue

  # Progress.
  cache_total = len(cache_files)
  final_pkl_index = 0
  final_pkl_progress_label = widgets.Label(value=f"{product} progress: {final_pkl_index}/{cache_total} .pkl caches appended to list.")
  display(final_pkl_progress_label)

  # Read every cache file, keeping only those that contain rows.
  cache_dataframe_list = []
  for cache_name in cache_files:
    cache_df = pd.read_pickle(join(product_cache_dir, cache_name))
    if len(cache_df) > 0:
      cache_dataframe_list.append(cache_df)
    final_pkl_index += 1
    final_pkl_progress_label.value = f"{product} progress: {final_pkl_index}/{cache_total} .pkl caches appended to list."

  # Nothing to merge if every cache was empty.
  if not cache_dataframe_list:
    print(f"All cache files for {product} were empty.")
    continue

  # Concatenate, then keep the first row of any repeated shot_number.
  print("Concatenating list of cache dataframes.")
  final_pkl_df_concat = pd.concat(cache_dataframe_list)
  duplicate_count = final_pkl_df_concat.duplicated(subset=['shot_number']).sum()
  if duplicate_count > 0:
    print(f"Removing {duplicate_count} duplicate shot_numbers.")
    final_pkl_df_concat = final_pkl_df_concat.drop_duplicates(subset=['shot_number'], keep='first')
  cache_dataframe_list = []  # Drop the references to the individual caches.

  # Detect lat/lon column names.
  lat_col, lon_col = 'geolocation/lat_lowestmode', 'geolocation/lon_lowestmode'
  if 'lat_lowestmode' in final_pkl_df_concat.columns:
    lat_col, lon_col = 'lat_lowestmode', 'lon_lowestmode'

  # Build point geometry from the coordinates, which are then dropped as columns.
  print("Converting to a geodataframe.")
  point_geometry = gpd.points_from_xy(final_pkl_df_concat[lon_col], final_pkl_df_concat[lat_col], crs="EPSG:4326")
  final_pkl_gdf_concat = gpd.GeoDataFrame(final_pkl_df_concat, geometry=point_geometry)
  final_pkl_gdf_concat = final_pkl_gdf_concat.drop(columns=[lat_col, lon_col])

  # Save the final converted dataframe with a fresh index.
  final_pkl_gdf_concat = final_pkl_gdf_concat.reset_index(drop=True)
  final_pkl_gdf_concat.to_pickle(final_pkl_path)
  print(f"{product} final .pkl complete.")

# Convert GEDI .pkl to .gpkg

In [None]:
# For verification and visualisation in GIS software.
# Collect the files present in the final .pkl directory.
pkl_final_list = list(os.listdir(pkl_final_dir))

# Print each one, sorted, as a paste-ready selection of the .pkl to convert to .gpkg.
for pkl_final in sorted(pkl_final_list):
  print(f"pkl_to_convert = '{pkl_final}'")

In [None]:
pkl_to_convert = 'GEDI04_A.pkl'

# Load final .pkl
pkl_to_convert_dir = join(pkl_final_dir, pkl_to_convert)
pkl_to_convert_df = pd.read_pickle(pkl_to_convert_dir)
print(f"Final .pkl has {len(pkl_to_convert_df)} rows.\n\n")

# Print available parameters in final .pkl for selection, as a paste-ready list.
print('selected_parameters = [')
for column in pkl_to_convert_df.columns:
    # Compare by equality: ('geometry') is a plain string, so 'in' would be a
    # substring test and hide columns such as 'e' or 'geo'.
    if column == 'geometry':
        continue
    print(f"\t'{column}',")
print(']')

In [None]:
# Select from available parameters in .pkl

selected_parameters = [
	'shot_number',
	'beam',
	'agbd',
	'agbd_se',
	'elev_lowestmode',
	'sensitivity',
	'timestamp',
]

# Validate the selection explicitly so the error names the missing columns
# (an assert gives no detail and is removed when Python runs with -O).
missing_parameters = [selected_parameter for selected_parameter in selected_parameters if selected_parameter not in pkl_to_convert_df.columns]
if missing_parameters:
  raise ValueError(f"Selected parameters not found in {pkl_to_convert}: {missing_parameters}")

# The .gpkg takes the name of the .pkl with the extension replaced.
final_gpkg_file_dir = join(targets_gpkg_dir, f'{path.splitext(pkl_to_convert)[0]}.gpkg')
if exists(final_gpkg_file_dir):
  # Existing exports are never overwritten.
  print(f'File {final_gpkg_file_dir} already exists.')
else:
  print(f'Converting final {pkl_to_convert} to .gpkg.')
  # Keep the geometry plus the selected parameters only.
  df_selected_parameters = pkl_to_convert_df[['geometry', *selected_parameters]]
  gpkg_gdf = gpd.GeoDataFrame(df_selected_parameters, geometry='geometry')
  gpkg_gdf = gpkg_gdf.set_crs(4326) # WGS84
  gpkg_gdf.to_file(final_gpkg_file_dir, driver="GPKG")
  print(f'Saved to {final_gpkg_file_dir}.')

# User uploaded targets

In [None]:
# Upload of any spatial data for XGBoost prediction.
# Place .csv(s) with 'x' and 'y' columns in '2_targets/csv'.

print("Select 'user targets' csv to be compiled\n")
# Print every .csv in the user targets directory as a paste-ready assignment.
user_csv_names = [entry for entry in os.listdir(targets_user_csv_dir) if entry.endswith(".csv")]
for user_csv_name in user_csv_names:
  print(f'user_uploaded_targets = "{user_csv_name}"')

In [None]:
user_uploaded_targets = "user_uploaded.csv"

# Read the chosen csv from the user targets directory.
user_targets_csv = pd.read_csv(join(targets_user_csv_dir, user_uploaded_targets))

# Print its columns as a paste-ready list for selecting the targets.
print("Select targets columns to transfer to the dataset\n")
column_lines = [f'  "{csv_column}",' for csv_column in user_targets_csv.columns]
print("\n".join(["targets_columns = [", *column_lines, "]"]))

In [None]:
targets_columns = [

]

export_gpkg = False

# Compile the selected csv columns and point geometry into a final .pkl
# (and optionally a .gpkg). 'user_targets_csv' itself is left untouched so
# that the cell can be re-run with a different selection.

# Check the selection against the csv, rather than silently ignoring typos.
missing_targets_columns = [targets_column for targets_column in targets_columns if targets_column not in user_targets_csv.columns]
if missing_targets_columns:
    raise ValueError(f"Selected targets columns not found in {user_uploaded_targets}: {missing_targets_columns}")

# Add column flags.
dataset_targets_columns = [(targets_column, f"tar_{targets_column}") for targets_column in targets_columns]
dataset_targets_columns_dict = dict(dataset_targets_columns)

# Detect coordinate columns (case-insensitive).
# The first matching variant in each list wins.
cols_lower = {col.lower(): col for col in user_targets_csv.columns}
lon_variants = ['x', 'lon', 'long', 'longitude', 'lon_lowestmode']
lat_variants = ['y', 'lat', 'latitude', 'lat_lowestmode']

lon_col = next((cols_lower[lon_var] for lon_var in lon_variants if lon_var in cols_lower), None)
lat_col = next((cols_lower[lat_var] for lat_var in lat_variants if lat_var in cols_lower), None)

if lon_col is None or lat_col is None:
    raise ValueError(f"CSV must contain coordinate columns. Supported: {lon_variants} and {lat_variants} (case-insensitive).")

coord_cols = [lon_col, lat_col]

# Keep only the selected columns and the coordinate columns, in csv order.
cols_to_keep = list(dataset_targets_columns_dict.keys()) + coord_cols
user_targets_selected = user_targets_csv[[col for col in user_targets_csv.columns if col in cols_to_keep]]

# Rename target columns with tar_ prefix.
user_targets_selected = user_targets_selected.rename(columns=dataset_targets_columns_dict)

# Create geometry.
user_targets_geometry = gpd.GeoDataFrame(
    user_targets_selected,
    geometry=gpd.points_from_xy(user_targets_selected[lon_col], user_targets_selected[lat_col], crs="EPSG:4326")
)
user_targets_geometry = user_targets_geometry.drop(columns=coord_cols)

# Name the output after the csv (extension removed) and the current UTC time.
# datetime.utcnow() is deprecated, so an aware UTC datetime is used instead;
# the formatted string is the same.
from datetime import timezone
user_targets_final = f"user_{path.splitext(user_uploaded_targets)[0]}_{datetime.now(timezone.utc).strftime('%y%m%d_%H%M%S')}"

# Export to .pkl.
user_targets_geometry.to_pickle(join(pkl_final_dir, f"{user_targets_final}.pkl"))

# Check .pkl.
display(pd.read_pickle(join(pkl_final_dir, f"{user_targets_final}.pkl")))

# Export to .gpkg.
if export_gpkg:
  user_targets_gpkg = join(targets_gpkg_dir, f"{user_targets_final}.gpkg")
  user_targets_geometry.to_file(user_targets_gpkg, driver="GPKG")

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# Releases the Colab runtime once the preceding cells have finished.
# NOTE(review): presumably any unsaved runtime state is lost; confirm before running.
runtime.unassign()