In [1]:
!pip install fiona

[0m

In [1]:
import copy
import multiprocessing as mp
import os
import subprocess
from calendar import monthrange
from datetime import datetime
from pathlib import Path

import fiona
import geopandas as gpd

In [2]:
import sys
# Put the shared icesat2_boreal `lib` directory on the import path
# (presumably where plotlib and mosaiclib live — confirm).
sys.path.append('/projects/my-private-bucket/code-git-shared/icesat2_boreal/lib')
import plotlib
# NOTE(review): star import — which mosaiclib names this notebook relies on is not
# visible from here; replace with explicit imports once they are known.
from mosaiclib import *

In [3]:
import rasterio
from rasterio.plot import show_hist, show
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Report the compute resources available to this kernel (useful when choosing
# how many worker processes to launch later on).
import multiprocessing as mp

import psutil

num_cores = mp.cpu_count()
mem = psutil.virtual_memory()

print(f"Number of CPU cores: {num_cores}")
# psutil reports bytes; show decimal gigabytes
print(
    f"Total memory: {mem.total / 1e9:.2f} GB\n"
    f"Available memory: {mem.available / 1e9:.2f} GB"
)

Number of CPU cores: 32
Total memory: 267.33 GB
Available memory: 199.23 GB


In [5]:
from pathlib import Path

In [28]:
# --- minimal imports here so you can paste this as-is ---
from pathlib import Path
import os, shutil

In [29]:
# Functions 
def wrapper_composite(params):
    """Build one multispectral composite by running the builder script into OUTDIR.

    NOTE: the "OPTION 2" cell further down defines a function with this same
    name; when the cells are run top to bottom, that later definition replaces
    this one.

    The builder is started as a child process from an argument list (no shell),
    so parameter values are passed through verbatim instead of being re-parsed
    by ``eval``. Its combined stdout/stderr is echoed line by line.

    Parameters
    ----------
    params : dict
        Run settings, read with ``dict.get`` (a missing key becomes ``None``
        and is passed to the builder as the text "None"). Keys read:
        FOCAL_TILE, SAT_API, MS_COMP_TYPE, YEAR, MIN_N_FILT_RESULTS,
        SEASON_START, SEASON_STOP, STAT, STAT_PCT, TARGET_SPECTRAL,
        INDEX_FN, INDEX_LYR, HLS_PRODUCT, MAX_CLOUDS, OUTDIR.

    Returns
    -------
    str or None
        Path of the composite GeoTIFF when it already existed or is present
        after a successful builder run. ``None`` when the builder could not be
        started, exited with a non-zero status, or did not leave the expected
        file behind (same convention as the OPTION 2 variant).
    """
    FOCAL_TILE = params.get('FOCAL_TILE')
    SAT_API = params.get('SAT_API')
    MS_COMP_TYPE = params.get('MS_COMP_TYPE')
    YEAR = params.get('YEAR')
    MIN_N_FILT_RESULTS = params.get('MIN_N_FILT_RESULTS')
    SEASON_START = params.get('SEASON_START')
    SEASON_STOP = params.get('SEASON_STOP')

    STAT = params.get('STAT')
    STAT_PCT = params.get('STAT_PCT')
    TARGET_SPECTRAL = params.get('TARGET_SPECTRAL')

    INDEX_FN = params.get('INDEX_FN')    # e.g. 'https://maap-ops-workspace.s3.amazonaws.com/shared/montesano/databank/boreal_tiles_v004.gpkg'
    INDEX_LYR = params.get('INDEX_LYR')  # e.g. 'boreal_tiles_v004'

    # A single year is processed per call, so start and stop year coincide
    YEAR_START, YEAR_STOP = (YEAR, YEAR)
    HLS_PRODUCT = params.get('HLS_PRODUCT')  # e.g. 'H30'
    MAX_CLOUDS = params.get('MAX_CLOUDS')    # e.g. 0

    OUTDIR = params.get('OUTDIR')  # e.g. '/projects/my-private-bucket/tmp/mask_test_keep_snow'

    ###########################################
    # Expected output file name
    ###########################################
    # The percentile value is part of the name only for the 'percentile' statistic
    stat_tag = f'{STAT}{STAT_PCT}' if STAT == 'percentile' else f'{STAT}'
    fn = Path(
        f'{OUTDIR}/{MS_COMP_TYPE}_{FOCAL_TILE}_{SEASON_START}_{SEASON_STOP}'
        f'_{YEAR_START}_{YEAR_STOP}_{stat_tag}{TARGET_SPECTRAL}.tif'
    )

    ###########################################
    # Check if it exists, if it does return
    if fn.exists():
        print(f'Skipping: {fn} - file already exists')
        return str(fn)

    # Other builder variants that were used previously:
    #   .../icesat2_boreal/lib/build_ms_composite.py
    #   .../icesat2_boreal/lib/build_ms_composite_multip.py
    builder = '/projects/my-private-bucket/code-git-shared/icesat2_boreal/lib/build_ms_composite_addEVI2.py'

    cmd = [
        'python', builder,
        '--in_tile_fn', str(INDEX_FN),
        '--in_tile_layer', str(INDEX_LYR),
        '--sat_api', str(SAT_API),
        '--tile_buffer_m', '0',
        '--in_tile_num', str(FOCAL_TILE),
        '--output_dir', str(OUTDIR),
        '-sy', str(YEAR_START), '-ey', str(YEAR_STOP),
        '-smd', str(SEASON_START), '-emd', str(SEASON_STOP),
        '-mc', str(MAX_CLOUDS),
        '--composite_type', str(MS_COMP_TYPE),
        '--hls_product', str(HLS_PRODUCT),
        '--thresh_min_ndvi', '-1',
        '--min_n_filt_results', str(MIN_N_FILT_RESULTS),
        '--stat', str(STAT),
        '--stat_pct', str(STAT_PCT),
        '--target_spectral_index', str(TARGET_SPECTRAL),
        '--do_indices',
        # '--search_only',
        # '--rangelims_red', '0.01', '0.1',
        '--rangelims_red', '0.01', '1',  # the default is now effectively no limit [-1e9, 1e9]
    ]

    try:
        # Stream the builder's output into the notebook as it is produced
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                              text=True, errors='replace') as proc:
            for line in proc.stdout:
                print(line, end='')
    except OSError as e:
        print(f'[error] Could not start the builder for {fn}: {e}')
        return None

    # Do not report a path for a run that failed or produced nothing
    if proc.returncode != 0:
        print(f'[error] Builder exited with status {proc.returncode}; no output for {fn}')
        return None
    if not fn.exists():
        print(f'[warn] Expected output not found at {fn}')
        return None

    return str(fn)

In [30]:
# OPTION 2
def wrapper_composite(params):
    """Build one multispectral composite in a local tmp directory, then move it to OUTDIR.

    This redefines ``wrapper_composite`` from the previous cell. The builder
    script writes into ``/tmp/<USER>/ms_comp``; only after it exits
    successfully is the expected GeoTIFF moved into the requested ``OUTDIR``.

    The builder is started from an argument list (no shell) and its combined
    stdout/stderr is echoed line by line.

    Parameters
    ----------
    params : dict
        Run settings, read with ``dict.get`` (a missing key becomes ``None``).
        Keys read: FOCAL_TILE, SAT_API, MS_COMP_TYPE, YEAR,
        MIN_N_FILT_RESULTS, SEASON_START, SEASON_STOP, STAT, STAT_PCT,
        TARGET_SPECTRAL, INDEX_FN, INDEX_LYR, HLS_PRODUCT, MAX_CLOUDS, OUTDIR.

    Returns
    -------
    str or None
        * path in OUTDIR when the file already existed or was moved there;
        * path in the tmp directory when the composite was built but the move
          failed (the file is left in place for recovery);
        * ``None`` when the builder could not be started, exited with a
          non-zero status, or the expected file is missing afterwards.
    """
    FOCAL_TILE = params.get('FOCAL_TILE')
    SAT_API = params.get('SAT_API')
    MS_COMP_TYPE = params.get('MS_COMP_TYPE')
    YEAR = params.get('YEAR')
    MIN_N_FILT_RESULTS = params.get('MIN_N_FILT_RESULTS')
    SEASON_START = params.get('SEASON_START')
    SEASON_STOP = params.get('SEASON_STOP')

    STAT = params.get('STAT')
    STAT_PCT = params.get('STAT_PCT')
    TARGET_SPECTRAL = params.get('TARGET_SPECTRAL')

    INDEX_FN = params.get('INDEX_FN')
    INDEX_LYR = params.get('INDEX_LYR')

    # A single year is processed per call, so start and stop year coincide
    YEAR_START, YEAR_STOP = (YEAR, YEAR)
    HLS_PRODUCT = params.get('HLS_PRODUCT')
    MAX_CLOUDS = params.get('MAX_CLOUDS')

    # << the user's desired final directory >>
    OUTDIR = params.get('OUTDIR')

    # --------- always write to a safe tmp first ----------
    TMP_OUTDIR = f"/tmp/{os.getenv('USER','user')}/ms_comp"
    Path(TMP_OUTDIR).mkdir(parents=True, exist_ok=True)
    Path(OUTDIR).mkdir(parents=True, exist_ok=True)

    # Build the base output name once; the percentile value is part of the
    # name only for the 'percentile' statistic
    stat_tag = f'{STAT}{STAT_PCT}' if STAT == 'percentile' else f'{STAT}'
    base_name = (
        f'{MS_COMP_TYPE}_{FOCAL_TILE}_{SEASON_START}_{SEASON_STOP}'
        f'_{YEAR_START}_{YEAR_STOP}_{stat_tag}{TARGET_SPECTRAL}.tif'
    )

    tmp_fn = Path(TMP_OUTDIR) / base_name
    final_fn = Path(OUTDIR) / base_name

    # If final already exists, skip the run
    if final_fn.exists():
        print(f"Skipping: {final_fn} - file already exists")
        return str(final_fn)

    # ------- build CLI args but point output_dir to TMP_OUTDIR -------
    builder = '/projects/my-private-bucket/code-git-shared/icesat2_boreal/lib/build_ms_composite_addEVI2.py'
    cmd = [
        'python', builder,
        '--in_tile_fn', str(INDEX_FN),
        '--in_tile_layer', str(INDEX_LYR),
        '--sat_api', str(SAT_API),
        '--tile_buffer_m', '0',
        '--in_tile_num', str(FOCAL_TILE),
        '--output_dir', TMP_OUTDIR,
        '-sy', str(YEAR_START), '-ey', str(YEAR_STOP),
        '-smd', str(SEASON_START), '-emd', str(SEASON_STOP),
        '-mc', str(MAX_CLOUDS),
        '--composite_type', str(MS_COMP_TYPE),
        '--hls_product', str(HLS_PRODUCT),
        '--thresh_min_ndvi', '-1',
        '--min_n_filt_results', str(MIN_N_FILT_RESULTS),
        '--stat', str(STAT),
        '--stat_pct', str(STAT_PCT),
        '--target_spectral_index', str(TARGET_SPECTRAL),
        '--do_indices',
        '--rangelims_red', '0.01', '1',
    ]

    # ------- run the builder, echoing its output as it is produced -------
    try:
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                              text=True, errors='replace') as proc:
            for line in proc.stdout:
                print(line, end='')
    except OSError as e:
        print(f"[error] Could not start the builder for {base_name}: {e}")
        return None

    # A failed run may leave a partial (or stale) file at tmp_fn. Never promote
    # it to OUTDIR, otherwise later runs would skip it as "already exists".
    if proc.returncode != 0:
        print(f"[error] Builder exited with status {proc.returncode}; nothing moved for {base_name}")
        return None

    # ------- move from /tmp -> final OUTDIR -------
    if not tmp_fn.exists():
        print(f"[warn] Expected output not found at {tmp_fn}")
        return None

    try:
        # shutil.move either removes the source or raises, so no extra cleanup
        # of tmp_fn is needed after a successful move
        shutil.move(str(tmp_fn), str(final_fn))
    except OSError as e:
        print(f"[error] Could not move {tmp_fn} -> {final_fn}: {e}")
        # leave tmp file in place so you can recover it
        return str(tmp_fn)

    print(f"[io] moved to final: {final_fn}")
    return str(final_fn)


In [31]:
# Input tile index (GeoPackage) and base output directory for the composites.
#
# Alternative grid configurations (uncomment one pair to switch):
#
# -- 90 km grid --
# INDEX_FN = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/shp/atlantic_forest/tiles/br_af_grid90km_prj.gpkg'
# BASE_OUTDIR = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/tif/HLS_composites/monthly/br_af_grid90km_prj_evi2_max'
#
# -- 60 km grid (marked "Run THIS" in the original notes) --
# INDEX_FN = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/shp/atlantic_forest/tiles/br_af_grid60km_prj.gpkg'
# BASE_OUTDIR = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/tif/HLS_composites/monthly/br_af_grid60km_prj_evi2_max'
#
# -- 30 km grid --
# INDEX_FN = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/shp/atlantic_forest/tiles/br_af_grid30km_prj.gpkg'
# BASE_OUTDIR = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/tif/HLS_composites/monthly/br_af_grid30km_prj_evi2_max'

# -- ACTIVE: TEST run on the 60 km grid, written to a separate "_ndvi_max_test" directory --
INDEX_FN = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/shp/atlantic_forest/tiles/br_af_grid60km_prj.gpkg'
BASE_OUTDIR = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/tif/HLS_composites/monthly/br_af_grid60km_prj_ndvi_max_test'


In [32]:
# Default parameters shared by every composite job

# --- imagery source ---
SAT_API = 'https://cmr.earthdata.nasa.gov/stac/LPCLOUD'
MS_COMP_TYPE = 'HLS'
HLS_PRODUCT = 'H30'

# --- compositing rule ---
# Alternatives tried: STAT = 'percentile', TARGET_SPECTRAL = 'evi2'
STAT = 'max'
TARGET_SPECTRAL = 'ndvi'
# Always passed to the builder as --stat_pct; it appears in the output file
# name only when STAT == 'percentile'
STAT_PCT = 95.0

# --- scene search ---
MIN_N_FILT_RESULTS = 10
# Max-cloud threshold at which the search starts; per the original note, with 0
# the search starts at 0% cloud cover and goes up to 90%
MAX_CLOUDS = 0


In [33]:
# The tile layer is taken to be the first layer listed in the index file
layer_names = fiona.listlayers(INDEX_FN)
INDEX_LYR = first_layer_name = layer_names[0]

In [34]:
# Base settings shared by all jobs; the per-job keys (tile, year, season
# window, output directory) are added when the job list is built
params = dict(
    SAT_API=SAT_API,
    HLS_PRODUCT=HLS_PRODUCT,
    MS_COMP_TYPE=MS_COMP_TYPE,
    MAX_CLOUDS=MAX_CLOUDS,
    MIN_N_FILT_RESULTS=MIN_N_FILT_RESULTS,
    STAT=STAT,
    STAT_PCT=STAT_PCT,
    TARGET_SPECTRAL=TARGET_SPECTRAL,
    INDEX_FN=INDEX_FN,
    INDEX_LYR=INDEX_LYR,
)

In [35]:
# Define year (note: `years` is assigned again in a later cell before the job list is built)
years = [2022]

# Get all tile IDs from the index file.
# The file is read once with geopandas; a previous fiona-based read of the same
# "tile_num" values was immediately overwritten by this one and has been dropped.
tiles_gpkg = gpd.read_file(INDEX_FN)
tiles = tiles_gpkg["tile_num"].tolist()
# # tiles

In [36]:
# Number of tile IDs read from the index file
len(tiles)

454

In [37]:
#######################################
# Select tiles to run
#######################################
# tiles_run = tiles

# tiles_run = tiles[3:4]
# tiles_run = [89,99]
# tiles_run = [89]
# tiles_run


In [38]:
#######################################
# Select tiles based on overlap with a reference tile (done this way during tests)
ref_gpkg_fn = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/shp/atlantic_forest/tiles/br_af_grid90km_prj.gpkg'
# ref_gpkg_fn = '/projects/my-private-bucket/HLS-1DCNN-AGB/data/shp/atlantic_forest/tiles/br_af_grid60km_prj.gpkg'

ref_tile_num = 89

gdf_index = gpd.read_file(INDEX_FN)
gdf_ref = gpd.read_file(ref_gpkg_fn)

# Select the reference tile; fail loudly instead of silently producing an
# empty tiles_run when the tile number is not in the reference grid
ref_tile = gdf_ref[gdf_ref["tile_num"] == ref_tile_num]
if ref_tile.empty:
    raise ValueError(f"tile_num {ref_tile_num} not found in {ref_gpkg_fn}")

# The geometric test below compares raw coordinates, so both grids must be in
# the same CRS; reproject the reference tile when they differ
if (gdf_index.crs is not None and ref_tile.crs is not None
        and ref_tile.crs != gdf_index.crs):
    ref_tile = ref_tile.to_crs(gdf_index.crs)

# Tiles of the index grid that intersect the reference tile.
# Note: `intersects` is also true for tiles that only share an edge or corner.
matches = gdf_index[gdf_index.intersects(ref_tile.union_all())]

# Get the tile_num of matches
tiles_run = matches["tile_num"].tolist()



In [39]:
tiles_run

# Tile 89 of the 90 km grid is used as a reference to get GEDI data.
# These are the overlapping tiles from the 60 km grid:
# [176, 177, 178, 196, 197, 198, 210, 211, 212]

[176, 177, 178, 196, 197, 198, 210, 211, 212]

In [40]:
# import folium
# # Center map on the reference tile
# center = ref_tile.geometry.centroid.iloc[0].coords[0][::-1]  # (lat, lon)

# m = folium.Map(location=center, zoom_start=8)

# # Add reference tile (90 km) in red
# folium.GeoJson(
#     ref_tile,
#     style_function=lambda x: {"color": "red", "weight": 3, "fillOpacity": 0},
#     name="Reference tile (90 km)"
# ).add_to(m)

# # Add matching 30 km tiles in blue
# folium.GeoJson(
#     matches,
#     style_function=lambda x: {"color": "blue", "weight": 2, "fillOpacity": 0.2},
#     tooltip=folium.GeoJsonTooltip(fields=["tile_num"], aliases=["Tile #"])
# ).add_to(m)

# # Add layer control
# folium.LayerControl().add_to(m)

In [41]:
# Years to composite, most recent first.
# Other selections used previously: [2022], [2022, 2021, 2020], [2019, 2018], [2023]
years = list(range(2022, 2017, -1))

In [42]:
%%time
# Create parameter list and create output directories
# This is set up to do monthly composites
params_list = []
for tile in tiles_run: 
    for year in years:
        # for month in range(2, 4):
        for month in range(1, 13):
            start_day = f"{month:02d}-01"
            end_day = f"{month:02d}-{monthrange(year, month)[1]:02d}"
    
            # output directory for specific run
            # outdir = f'{BASE_OUTDIR}/tile_{tile:03d}/{year}/{month:02d}'
            outdir = f'{BASE_OUTDIR}/tile_{tile:03d}/{year}/'
            
            os.makedirs(outdir, exist_ok=True)
    
            run_params = copy.deepcopy(params)
            run_params.update({
                'FOCAL_TILE': tile,
                'YEAR': year,
                'SEASON_START': start_day,
                'SEASON_STOP': end_day,
                'OUTDIR': str(outdir)
            })
    
            params_list.append(run_params)



CPU times: user 8.49 ms, sys: 12.1 ms, total: 20.5 ms
Wall time: 1.41 s


In [43]:
# Number of queued jobs: one per (tile, year, month) combination
len(params_list)

540

In [47]:
# 

In [None]:
# Run a single job (the last entry of params_list) directly in the notebook —
# presumably a trial run before launching the full batch
wrapper_composite(params_list[-1])

In [44]:
# params_list[0]#['SEASON_START'].split('-')[0]
# params_list[10:13]

In [45]:
%%time
for params in params_list:
    try:
        print(f"Running tile {params['FOCAL_TILE']}, year {params['YEAR']}, month {params['SEASON_START'].split('-')[0]}")
        output_file = wrapper_composite(params)
        print(f"Output saved to: {output_file}")
    except Exception as e:
        print(f"Error processing tile {params['FOCAL_TILE']}, year {params['YEAR']}, month {params['SEASON_START'].split('-')[0]}: {e}")

Running tile 176, year 2022, month 01
shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
Traceback (most recent call last):
  File "/projects/my-private-bucket/code-git-shared/icesat2_boreal/lib/build_ms_composite_addEVI2.py", line 18, in <module>
    from rio_tiler.io import COGReader
  File "/opt/conda/envs/pangeo/lib/python3.12/site-packages/rio_tiler/__init__.py", line 5, in <module>
    from . import (  # noqa
  File "/opt/conda/envs/pangeo/lib/python3.12/site-packages/rio_tiler/io/__init__.py", line 5, in <module>
    from .stac import STACReader  # noqa
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/pangeo/lib/python3.12/site-packages/rio_tiler/io/stac.py", line 21, in <module>
    import httpx
  File "/opt/conda/envs/pangeo/lib/python3.12/site-packages/httpx/__init__.py", line 15, in <module>
    from ._main import main
  File "/opt/conda/envs/pangeo/lib/python3.12/site-packages/httpx/_main.py"

In [22]:
## NOTES

# 1 tile 1 year
# Wall time: 23min 42s

## MULTIPROCESSING - Error with multiple requests

In [None]:
%%time
# mp.cpu_count() - 1
# with mp.Pool(processes= 5) as pool:
#     fn_list = pool.map(wrapper_composite, params_list)
with mp.Pool(processes= 12) as pool:
    fn_list = pool.map(wrapper_composite, params_list)

In [None]:
# # Plotting to check false composite to check
# fn = fn_list[0]
# fn
# rescaled_multiband_fn = fn.replace('.tif', '_rescaled_3band_temp.tif')
# plotlib.rescale_multiband_for_plot(fn, rescaled_multiband_fn, bandlist = [6,4,3], pct=[20,98], nodata=-9999.0) 

# fig,ax=plt.subplots(figsize=(10,10))
# with rasterio.open(rescaled_multiband_fn) as src:
#     #print(src.profile)
#     show(src.read(),transform=src.transform, ax=ax, title=os.path.basename(rescaled_multiband_fn))

In [None]:
### Notes

# Time to run 1 tile 1 year 12 processors



In [None]:
## CHECK ERRORS

## No output is created sometimes when running multiple jobs