In [1]:
from maap.maap import MAAP
# MAAP API client pointed at the ops host; used further down for maap.submitJob and maap.getJobResult.
maap = MAAP(maap_host='api.ops.maap-project.org')

In [2]:
!pip install -U -r /projects/Developer/icesat2_boreal/dps/requirements_main.txt

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m[33m
[0m

# Launch DPS for build_stack.py

In [3]:
import os
import geopandas
import pandas as pd
import glob
import datetime
!pip install xmltodict
import xmltodict
import sys
sys.path.append('/projects/Developer/icesat2_boreal/lib')
import ExtractUtils

  shapely_geos_version, geos_capi_version_string


You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m[33m
[0m

# Test (locally) the script for DPS

##### To run build_stack.py across a tiled raster dataset you need a bunch of args that we'll gather into a dictionary

For a tiled raster dataset on s3, you need to have a vector footprint of that dataset

In [4]:
# Tile id used for the local test run; stored in BUILD_STACK_DICT['TILE_NUM'] and passed to build_stack.py as --in_tile_num.
TILE_NUM = 1615

#### Dictionary preparation makes this script very flexible and transferable to another s3 dataset
This dictionary is specific to the ESA Worldcover dataset.  
To run `build_stack.py` across another dataset, just prepare another dictionary here and everything below should be exactly the same.  

In [5]:
# ESA Worldcover 2020
# Settings for the local build_stack.py test run; the args cell below turns most of these into command-line options.
BUILD_STACK_DICT = {
            # Vector tile index (INDEX_FN), its id column, the tile to process and the layer name
            # -> --in_tile_fn, --in_tile_id_col, --in_tile_num, --in_tile_layer
            #'INDEX_FN': '/projects/my-public-bucket/boreal_tiles_v003.gpkg',
            'INDEX_FN': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v003.gpkg',
            'ID_COL_NAME': 'tile_num',
            'TILE_NUM':TILE_NUM,
            'INDEX_LYR': 'boreal_tiles_v003',
            # Worldcover data is accessed via its footprint, with a 's3_path' col identifying the s3 locations of each tile
            # -> --covar_src_name, --covar_tile_fn, --in_covar_s3_col
            'RASTER_NAME': 'esa_worldcover_v100_2020',
            #'COVAR_TILE_FN': '/projects/my-public-bucket/analyze_agb/footprints_v100_2020_v100_2020_map-s3.gpkg',
            'COVAR_TILE_FN': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/analyze_agb/footprints_v100_2020_v100_2020_map-s3.gpkg',
            'IN_COVAR_S3_COL': 's3_path',
            # Output directory (-o) and input nodata value (--input_nodata_value)
            'OUTDIR': '/projects/my-public-bucket/DPS_ESA_LC',
            'NODATA_VAL': 0,
            # NOTE(review): the two keys below are not referenced by the args cell in this notebook - confirm whether they are still needed
            'OUTPUT_CLIP_COG_FN':'',
            'CREDENTIALS_FN': None
        }

In [6]:
# INDEX clip shapes should be in equal area
# COVAR_TILE_FN tiles (of raster COGs) should be in 4326
#
# One entry per build_stack.py command-line option, filled in from BUILD_STACK_DICT.
_cli_options = [
    f"--in_tile_fn {BUILD_STACK_DICT['INDEX_FN']}",
    f"--in_tile_id_col {BUILD_STACK_DICT['ID_COL_NAME']}",
    f"--in_tile_num {BUILD_STACK_DICT['TILE_NUM']}",
    "--tile_buffer_m 0",
    f"--in_tile_layer {BUILD_STACK_DICT['INDEX_LYR']}",
    f"-o {BUILD_STACK_DICT['OUTDIR']}",
    "--topo_off",
    f"--covar_src_name {BUILD_STACK_DICT['RASTER_NAME']}",
    f"--covar_tile_fn {BUILD_STACK_DICT['COVAR_TILE_FN']}",
    f"--in_covar_s3_col {BUILD_STACK_DICT['IN_COVAR_S3_COL']}",
    f"--input_nodata_value {BUILD_STACK_DICT['NODATA_VAL']}",
    "--clip",
]
# Each option is prefixed with four spaces and the options are separated by one space,
# which yields a single-line option string for the shell.
args = " ".join("    " + option for option in _cli_options)

In [8]:
# Run build_stack.py locally in a shell; IPython expands $args to the option string held in the `args` variable.
!python /projects/Developer/icesat2_boreal/lib/build_stack.py $args

  shapely_geos_version, geos_capi_version_string

---Running build_stack()---

The covariate's filename(s) intersecting the 0 m bbox for tile id 1615:
 s3://esa-worldcover/v100/2020/map/ESA_WorldCover_10m_2020_v100_N45W084_Map.tif
bbox: [5048522.564673773, 5193303.643996402, 5138522.529666357, 5283303.645755118]
Writing stack as cloud-optimized geotiff: /projects/my-public-bucket/DPS_ESA_LC/esa_worldcover_v100_2020_1615_cog.tif
Clipping to feature polygon...
Orig stack shape:		 (1, 3000, 3000)
Output resolution:		 (30, 30)
Writing img to memory...
  input_nodata_value=input_nodata_value
{'driver': 'VRT', 'dtype': 'uint8', 'nodata': 0, 'width': 3000, 'height': 3000, 'count': 1, 'crs': CRS.from_wkt('PROJCS["unnamed",GEOGCS["GRS 1980(IUGG, 1980)",DATUM["unknown",SPHEROID["GRS80",6378137,298.257222101],TOWGS84[0,0,0,0,0,0,0]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433]],PROJECTION["Albers_Conic_Equal_Area"],PARAMETER["standard_parallel_1",50],PARAMETER["standard_parallel_2",70]

In [8]:
# Tile numbers to submit to DPS; starts empty and is populated further below.
DPS_INPUT_TILE_NUM_LIST = []

# Register DPS algorithm
We need to register a DPS algorithm called 'run_build_stack' before proceeding to the chunks below...

In [73]:
# Register the DPS algorithm described by algorithm_config.yaml; the cell output shows the algorithm configuration.
!python /projects/register-algorithm /projects/Developer/icesat2_boreal/dps/alg_3-1-3/algorithm_config.yaml

{
  "algorithm_name": "run_build_stack",
  "code_version": "master",
  "environment_name": "ubuntu",
  "repo_url": "https://github.com/lauraduncanson/icesat2_boreal.git",
  "docker_container_url": "mas.dit.maap-project.org/root/maap-workspaces/base_images/vanilla:dit",
  "queue": "8GB",
  "algorithm_description": "DPS run to create ESA LandCover masks",
  "build_command": "icesat2_boreal/dps/build_command_main.sh",
  "script_command": "icesat2_boreal/dps/alg_3-1-3/run.sh",
  "disk_space": "10GB",
  "algorithm_params": [
    {
      "field": "covar_tile_url",
      "download": true
    },
    {
      "field": "in_tile_url",
      "download": true
    },
    {
      "field": "covar_tile_fn",
      "download": false
    },
    {
      "field": "in_tile_fn",
      "download": false
    },
    {
      "field": "in_tile_id_col",
      "download": false
    },
    {
      "field": "in_tile_num",
      "download": false
    },
    {
      "field": "tile_buffer_m",
      "download": false
    }

# Build a DPS list

In [74]:
# Label written to the 'run_name' column of the submission results table.
RUN_NAME = 'build_stack_test'

In [107]:
HLS_tindex_master_fn = 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/HLS_test_redo/spring2022/HLS_tindex_master.csv'
Topo_tindex_master_fn = 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv'

# Use the tile index (HLS or Topo) that lists the most tiles as the DPS input list.
DPS_INPUT_TILE_NUM_LIST = []
for tindex_fn in [HLS_tindex_master_fn, Topo_tindex_master_fn]:
    tindex = pd.read_csv(tindex_fn)

    # Extract the column once, as a plain Python list, so DPS_INPUT_TILE_NUM_LIST has the
    # same type here as everywhere else in the notebook ([] / .to_list()) instead of a numpy array.
    tindex_tile_nums = tindex['tile_num'].to_list()
    n_tindex_tiles = len(tindex_tile_nums)

    print(n_tindex_tiles)

    # Strictly greater: on a tie the earlier file in the list is kept.
    if n_tindex_tiles > len(DPS_INPUT_TILE_NUM_LIST):
        print(f"{n_tindex_tiles} tiles in {tindex_fn}")
        print(f'{os.path.basename(tindex_fn)} has the most tiles. Using this for DPS tiles list.')
        DPS_INPUT_TILE_NUM_LIST = tindex_tile_nums


5245
5245 tiles in s3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/HLS_test_redo/spring2022/HLS_tindex_master.csv
HLS_tindex_master.csv has the most tiles. Using this for DPS tiles list.
5220


#### Note: make sure the `in_params_dict` coincides with the args of `build_stack.py`

In [108]:
#'INDEX_FN': '/projects/my-public-bucket/boreal_tiles_v003.gpkg',
# Keyword parameters passed to maap.submitJob (via **in_params_dict) for every tile.
# 'in_tile_num' is a placeholder here: the submission loop overwrites it with the current tile number.
in_params_dict = {
            # Remote locations of the covariate footprint and the tile index
            'covar_tile_url': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/analyze_agb/footprints_v100_2020_v100_2020_map-s3.gpkg',
            'in_tile_url': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v003.gpkg',
            # Basenames of the two files above
            'covar_tile_fn': 'footprints_v100_2020_v100_2020_map-s3.gpkg',
            'in_tile_fn': 'boreal_tiles_v003.gpkg',
            'in_tile_id_col': 'tile_num',
            'in_tile_num':"",
            'tile_buffer_m': 0,
            'in_tile_layer': 'boreal_tiles_v003',
            #'output_dir': 'dummy',  # a dummy dir so i dont have to change the .yaml now
            #'topo_off': 'dummy', # functionality to turn off build a 'topo' stack is FALSE by default - so we have to set this flag (which turns OFF topo building) - this is hardcoded in the .sh now - if we want to build a topo stack with this script, need to learn how to set this flag with params dict 
            'covar_src_name': 'esa_worldcover_v100_2020',
            'in_covar_s3_col': 's3_path',
            'input_nodata_value': 0,
            # NOTE(review): 'shape' is not among the algorithm_params fields visible in the (truncated) registration output - confirm it is a registered parameter
            'shape': 3000
            #'clip': 'dummy' # this is hardcoded in the .sh now - if we want to build a topo stack with this script, need to learn how to set this flag with params dict 
        }

In [110]:
# Display the parameter dictionary for a visual check before submitting.
in_params_dict

{'covar_tile_url': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/analyze_agb/footprints_v100_2020_v100_2020_map-s3.gpkg',
 'in_tile_url': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v003.gpkg',
 'covar_tile_fn': 'footprints_v100_2020_v100_2020_map-s3.gpkg',
 'in_tile_fn': 'boreal_tiles_v003.gpkg',
 'in_tile_id_col': 'tile_num',
 'in_tile_num': '',
 'tile_buffer_m': 0,
 'in_tile_layer': 'boreal_tiles_v003',
 'covar_src_name': 'esa_worldcover_v100_2020',
 'in_covar_s3_col': 's3_path',
 'input_nodata_value': 0}

## Run a DPS job across the list

In [175]:
%%time
# import logging
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)
import json

submit_results_df_list = []
len_input_list = len(DPS_INPUT_TILE_NUM_LIST)
print(f"# of input tiles for DPS: {len_input_list}")

for i, INPUT_TILE_NUM in enumerate(DPS_INPUT_TILE_NUM_LIST):
    
    DPS_num = i+1
    IDENTIFIER = 'run_build_stack'
    ALGO_ID = f'{IDENTIFIER}_ubuntu'
    USER = 'nathanmthomas'
    WORKER_TYPE = 'maap-dps-worker-8gb'
    
    # Update the in_params_dict with th current INPUT_TILE_NUM
    in_params_dict['in_tile_num'] = INPUT_TILE_NUM
    
    submit_result = maap.submitJob(
            identifier=IDENTIFIER,
            algo_id=ALGO_ID,
            version='master',
            username=USER, # username needs to be the same as whoever created the workspace
            queue=WORKER_TYPE,
            **in_params_dict
        )
    
    #print(submit_result)
    #break
    
    # Build a dataframe of submission details
    submit_result['dps_num'] = DPS_num
    submit_result['tile_num'] = INPUT_TILE_NUM
    submit_result['submit_time'] = datetime.datetime.now()
    submit_result['dbs_job_hour'] =datetime.datetime.now().hour
    submit_result['algo_id'] = ALGO_ID
    submit_result['user'] = USER
    submit_result['worker_type'] = WORKER_TYPE

    # Append to a list of data frames of submission results
    submit_results_df_list.append(pd.DataFrame([submit_result]))
    
    if DPS_num in [1, 25,50, 100,200,300,400,500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000, 3250, 3500, 3750, 4000, 4250, 5000, 7000, 9000, 11000, 13000, 15000, 17000, 19000, 21000, 24000, len_input_list]:
        print(f"DPS run #: {DPS_num}\t| tile num: {INPUT_TILE_NUM}\t| submit status: {submit_result['status']}\t| job id: {submit_result['job_id']}") 
        
# Build a final submission results df and save
submit_results_df = pd.concat(submit_results_df_list)
submit_results_df['run_name'] = RUN_NAME
nowtime = pd.Timestamp.now().strftime('%Y%m%d%H%M')
print(f"Current time:\t{nowtime}")
submit_results_df.to_csv(f'/projects/my-public-bucket/dps_submission_results/DPS_{IDENTIFIER}_submission_results_{len_input_list}_{nowtime}.csv')
submit_results_df


# of input tiles for DPS: 2
DPS run #: 1	| tile num: 3855	| submit status: success	| job id: 8c922119-0f49-4965-92cd-ba1fc26d0b35
DPS run #: 2	| tile num: 24389	| submit status: success	| job id: cd4e13a0-5bea-41ab-bba8-d0094a3da955
Current time:	202209161759
CPU times: user 29.5 ms, sys: 361 µs, total: 29.9 ms
Wall time: 455 ms


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type,run_name
0,success,200,8c922119-0f49-4965-92cd-ba1fc26d0b35,1,3855,2022-09-16 17:59:07.241520,17,run_build_stack_ubuntu,nathanmthomas,maap-dps-worker-8gb,build_stack_test
0,success,200,cd4e13a0-5bea-41ab-bba8-d0094a3da955,2,24389,2022-09-16 17:59:07.307532,17,run_build_stack_ubuntu,nathanmthomas,maap-dps-worker-8gb,build_stack_test


After almost any DPS job, you have to assess what succeeded and failed. This involves:
1. building a table of job status based on the job ids captured in `submit_results_df` from the DPS run chunk (this takes 40 mins for ~47k jobs) --> this tells you how many jobs failed
2. merging the job status table with the submission results df --> this tells you which specific granules (or tile nums) failed
3. building another input list of granules for a follow-up DPS
## Assess DPS results
Build a table of job status based on job id - how many jobs failed?

In [181]:
%%time
LIST_SUBMISSIONS = sorted(glob.glob(f'/projects/my-public-bucket/dps_submission_results/DPS_{IDENTIFIER}_submission_results_*.csv'),key=ExtractUtils.func, reverse=True)
for DPS_DATETIME in [nowtime]:
    for fn in LIST_SUBMISSIONS:
        if DPS_DATETIME in fn and not 'job_status' in fn:
            DPS_alg_id = os.path.basename(fn.split('_submission_results_')[0].replace('DPS_',''))
            thentime = fn.split('_')[-1].replace('.csv','')
            print(f'DPS alg:\t\t{DPS_alg_id}')
            print(f'DPS launch time:\t{thentime}')
            z = ExtractUtils.BUILD_TABLE_JOBSTATUS(pd.read_csv(fn))
            # Save job status table
            z.to_csv(f'/projects/my-public-bucket/dps_submission_results/DPS_{IDENTIFIER}_submission_results_job_status_{len(z)}_{thentime}.csv')

DPS alg:		run_build_stack
DPS launch time:	202209161759
Count total jobs:	2
Count pending jobs:	0
Count running jobs:	0
Count succeeded jobs:	2
Count failed jobs:	0
% of failed jobs:	Nothing has failed...yet

CPU times: user 29.5 ms, sys: 3.61 ms, total: 33.1 ms
Wall time: 975 ms


In [179]:
# Tile numbers of the jobs whose status is 'Succeeded'.
z.loc[z['wps:Status'] == 'Succeeded', 'tile_num'].to_list()

[24389]

In [157]:
# Fetch and parse the DPS result of one failed job so it can be inspected.
# FAILED_JOB_INDEX is the position within the table of failed jobs (1 = second failed job).
FAILED_JOB_INDEX = 1
failed_jobs = z[z['wps:Status'] == 'Failed']
if len(failed_jobs) > FAILED_JOB_INDEX:
    failed_job_result = xmltodict.parse(maap.getJobResult(failed_jobs.iloc[FAILED_JOB_INDEX].job_id).content)
else:
    # .iloc would raise IndexError when there are not enough failed jobs (e.g. none failed).
    failed_job_result = None
    print(f"{len(failed_jobs)} failed job(s) found; no failed job at position {FAILED_JOB_INDEX} to inspect.")
failed_job_result

{'wps:Result': {'@xmlns:ows': 'http://www.opengis.net/ows/2.0',
  '@xmlns:schemaLocation': 'http://schemas.opengis.net/wps/2.0/wps.xsd',
  '@xmlns:wps': 'http://www.opengis.net/wps/2.0',
  '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
  'wps:JobID': '4087d811-f95f-4f47-9f31-59923ff80e51',
  'wps:Output': {'@id': 'traceback',
   'wps:Data': 'activate does not accept more than one argument:\n[\'/app/icesat2_boreal/dps/alg_3-1-3/run.sh\', \'footprints_v100_2020_v100_2020_map-s3.gpkg\', \'boreal_tiles_v003.gpkg\', \'tile_num\', \'980\', \'0\', \'boreal_tiles_v003\', \'esa_worldcover_v100_2020\', \'s3_path\', \'0\']\n\n+ /app/icesat2_boreal/dps/alg_3-1-3/run.sh footprints_v100_2020_v100_2020_map-s3.gpkg boreal_tiles_v003.gpkg tile_num 980 0 boreal_tiles_v003 esa_worldcover_v100_2020 s3_path 0\n+ unset PROJ_LIB\n+ mkdir output\n+++ dirname /app/icesat2_boreal/dps/alg_3-1-3/run.sh\n++ cd /app/icesat2_boreal/dps/alg_3-1-3\n++ pwd -P\n+ basedir=/app/icesat2_boreal/dps/alg_3-1-3\n+ 

## Update the DPS input tiles list with only the tiles that failed - then run the DPS submit chunk above

In [160]:
# Re-queue only the tiles whose jobs failed, then display them.
# (A bare len(...) in the middle of a cell displays nothing, so it is not repeated here.)
DPS_INPUT_TILE_NUM_LIST = z.loc[z['wps:Status'] == 'Failed', 'tile_num'].to_list()
DPS_INPUT_TILE_NUM_LIST

[2917, 980, 25175, 25740, 2981, 3879]

In [162]:
# Hard-coded list of straggler tile numbers to (re)submit.
STRAGGLER_TILE_LIST = [3269, 3088, 2917, 2513, 2228, 1554, 1274, 1200, 620, 980, 790, 23828, 24389, 24108, 23501, 23830, 108, 25175, 25458, 25740, 163, 28967, 1172, 1431, 2981, 3531, 4463, 3879, 4207, 4445, 2888, 3790]
# Note: this binds both names to the same list object (no copy is made).
DPS_INPUT_TILE_NUM_LIST = STRAGGLER_TILE_LIST
# Displayed as the cell output: number of tiles queued.
len(DPS_INPUT_TILE_NUM_LIST)

32

In [169]:
# Tiles whose jobs already succeeded.
FINISHED_LIST = z[z['wps:Status'] =='Succeeded'].tile_num.to_list()
# Remaining stragglers = stragglers that have not succeeded yet.
# Kept as a de-duplicated list in the stragglers' original order (dict.fromkeys preserves
# insertion order), so DPS_INPUT_TILE_NUM_LIST stays a list with a reproducible submission
# order instead of becoming an unordered set.
finished_tiles = set(FINISHED_LIST)
DPS_INPUT_TILE_NUM_LIST = [tile for tile in dict.fromkeys(STRAGGLER_TILE_LIST) if tile not in finished_tiles]
len(DPS_INPUT_TILE_NUM_LIST)

6

In [174]:
# Second hard-coded list of straggler tile numbers; queued as the next DPS input list.
STRAGGLER_TILE_LIST2 = [3855,24389]
# Note: this binds both names to the same list object (no copy is made).
DPS_INPUT_TILE_NUM_LIST = STRAGGLER_TILE_LIST2