In [1]:
from maap.maap import MAAP
maap = MAAP(maap_host='api.ops.maap-project.org')

# Launch DPS for tile_atl08.py

In [2]:
import os
import geopandas
import pandas as pd
import glob
import datetime
!pip install xmltodict
import xmltodict

  shapely_geos_version, geos_capi_version_string


Collecting xmltodict
  Using cached xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0
[0m

In [3]:
def get_stack_fn(stack_list_fn, in_tile_num):
    # Find most recent topo/Landsat stack path for tile in list of stack paths from *tindex_master.csv
    all_stacks_df = pd.read_csv(stack_list_fn)
    stack_for_tile = all_stacks_df[all_stacks_df['location'].str.contains("_"+str(in_tile_num))]
    [print(i) for i in stack_for_tile.path.to_list()]
    stack_for_tile_fn = stack_for_tile.path.to_list()[0]
    if len(stack_for_tile)==0:
        stack_for_tile_fn = None
    return(stack_for_tile_fn)

# nmt added: code that returns df of landsat locations and tile number
# This is basically CountOutput.py
def get_stack_df(dps_dir, TYPE, dps_year):
    
    if "Landsat" in TYPE:
        root = f"/projects/my-private-bucket/dps_output/do_landsat_stack_3-1-2_ubuntu/ops/{dps_year}/"
        ends_with_str = "_dps.tif"
    if "Topo" in TYPE:
        root = f"/projects/my-private-bucket/dps_output/do_topo_stack_3-1-5_ubuntu/ops/{dps_year}/"
        ends_with_str = "_stack.tif"
    if "ATL08" in TYPE:
        root = f"/projects/my-private-bucket/dps_output/run_extract_ubuntu/ops/{dps_year}/"
        ends_with_str = "0m.csv"
            
    df = pd.DataFrame(columns=['location', 'tile_num'])

    for dir, subdir, files in os.walk(root):
        for fname in files:
            if fname.endswith(ends_with_str): 
                 
                tile_num = fname.split('_')[1]
                   
                if "ATL08" in TYPE:
                    df = df.append({'location':os.path.join(dir+"/", fname)},ignore_index=True)
                else:
                    df = df.append({'location':os.path.join(dir+"/", fname), 'tile_num':tile_num},ignore_index=True)
        
    return df

#### Set the names of the data frames to create

In [4]:
# Topo and Landsat tindex_master csvs from build_tindex_master.py
topo_tindex    = "/projects/my-public-bucket/DPS_tile_lists/Topo_tindex_master.csv"
landsat_tindex = "/projects/my-public-bucket/DPS_tile_lists/Landsat_tindex_master.csv"
HLS_tindex     = "/projects/my-public-bucket/DPS_tile_lists/HLS_tindex_master.csv"

# Model-ready subset of tiles for which Topo and Landsat coincide
model_ready_tiles_topo = "/projects/my-public-bucket/DPS_tile_lists/model_ready_tiles_topo_paths.csv"
model_ready_tiles_landsat = "/projects/my-public-bucket/DPS_tile_lists/model_ready_tiles_landsat_paths.csv"

## Make the data frames from build_tindex_master.py csvs for Topo and Landsat tiles
python lib/build_tindex_master.py

In [5]:
if os.path.isfile(landsat_tindex) and os.path.isfile(topo_tindex):
    print('Reading existing...')
    ls8_df = pd.read_csv(landsat_tindex)
    topo_df = pd.read_csv(topo_tindex)
else:
    s3_stem = 'https://s3.console.aws.amazon.com/s3/buckets/maap-ops-workspace/nathanmthomas'
    local_stem = '/projects/my-private-bucket'

    ls8_root =  s3_stem + '/dps_output/do_landsat_stack_3-1-2_ubuntu'
    topo_root = s3_stem + '/dps_output/do_topo_stack_3-1-5_ubuntu'
    
    ls8_df = get_stack_df(ls8_root, "Landsat")
    topo_df = get_stack_df(topo_root, "Topo")
topo_df.head()

Reading existing...


Unnamed: 0.1,Unnamed: 0,local_path,tile_num
0,0,/projects/my-private-bucket/dps_output/do_topo...,421
1,1,/projects/my-private-bucket/dps_output/do_topo...,455
2,2,/projects/my-private-bucket/dps_output/do_topo...,456
3,3,/projects/my-private-bucket/dps_output/do_topo...,491
4,4,/projects/my-private-bucket/dps_output/do_topo...,492


In [5]:

topo_df = pd.read_csv(topo_tindex)
topo_df[topo_df.tile_num == 3457].local_path.tolist()[0].replace('/projects/my-private-bucket', 'https://s3.console.aws.amazon.com/s3/buckets/maap-ops-workspace/nathanmthomas')


'https://s3.console.aws.amazon.com/s3/buckets/maap-ops-workspace/nathanmthomas/dps_output/do_topo_stack_3-1-5_ubuntu/ops/2021/07/23/23/32/27/934649/Copernicus_3457_covars_cog_topo_stack.tif'

## Get tile ids for which both Topo and Landsat stacks exist

In [6]:
# added by nmt: get filenames of co-incident landsat and topo
if False:
    topo_sub_df = pd.DataFrame(columns=['local_path','tile_num'])
    ls8_sub_df = pd.DataFrame(columns=['local_path','tile_num'])

    for i in range(len(ls8_df['tile_num'])):
        ls_tile_num = ls8_df['tile_num'][i]
        for j in range(len(topo_df['tile_num'])):
            topo_tile_num = topo_df['tile_num'][j]
            if ls_tile_num == topo_tile_num:
                # Only need to choose one, but we'll do 2 and then check
                ls8_sub_df = ls8_sub_df.append({'local_path':ls8_df['local_path'][i],'tile_num':ls8_df['tile_num'][i].astype(int)}, ignore_index=True)
                topo_sub_df = topo_sub_df.append({'local_path':topo_df['local_path'][j],'tile_num':topo_df['tile_num'][j].astype(int)}, ignore_index=True)

    #ls8_sub_df['tile_num'] = ls8_sub_df['tile_num'].astype(float, errors = 'raise')
    print(ls8_sub_df.head())
    print(topo_sub_df.head())
    print(len(ls8_sub_df),len(topo_sub_df))

    topo_sub_df.to_csv( model_ready_tiles_topo, index=False, encoding='utf-8-sig')
    ls8_sub_df.to_csv( model_ready_tiles_landsat, index=False, encoding='utf-8-sig')

#### Now you have a set of tile ids for which both Landsat and Topo stacks exist

In [7]:
topo_sub_df = pd.read_csv("/projects/my-public-bucket/DPS_tile_lists/model_ready_tiles_topo_paths.csv")
INPUT_TILE_NUM_LIST = topo_sub_df['tile_num'].values.astype(int).tolist()
len(INPUT_TILE_NUM_LIST)

4465

##### Test: get a subset of tile ids for test tiles (Norway and others in NA)

In [8]:
DO_EXPERIMENT = True

NORWAY_TILE_LIST = pd.read_csv('/projects/shared-buckets/lduncanson/misc_files/norway_tiles.csv').layer.tolist()
    
DELTA_TILE_LIST = [3365,3366,3367,3458,3459,3460,3353,3354,3355] + [3361, 3362]
BONA_TILE_LIST  = [3270,3271,3272, 3456,3457,  3363,3364,3365] + [3268, 3269]
HEALY_TILE_LIST = [ 3551,3552,3553,3645,3646,3647] + [3648, 3649, 3555, 3554]

#DELTA_TILE_LIST = [3365,3366,3367,3458,3460,3353,3354,3355,3549]
#BONA_TILE_LIST  = [3270,3271,3272,3364,3456,3457,3458,3364,3365]
#HEALY_TILE_LIST = [3456,3457,3458,3551,3552, 3553,3645,3646,3647]
INPUT_EXPERIMENT_TILE_NUM_LIST = NORWAY_TILE_LIST + DELTA_TILE_LIST + BONA_TILE_LIST + HEALY_TILE_LIST
ALASKA_TILE_LIST =  list(range(3268,3272+1))+\
                    list(range(3361,3366+1))+\
                    list(range(3454,3459+1))+\
                    list(range(3549,3555+1))+\
                    list(range(3643,3648+1))

INPUT_EXPERIMENT_TILE_NUM_LIST = NORWAY_TILE_LIST + ALASKA_TILE_LIST
len(INPUT_EXPERIMENT_TILE_NUM_LIST)

153

#### Read in the latest tindex and compare with a previous set of completed tiles to see which ones still need to be run

In [129]:
import numpy as np
#tindex_master_fn = '/projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/run_no_LC_height_thresholds/ATL08_filt_tindex_master.csv'
NAME_TEST_SUBDIR = 'run_LC_height_thresholds_v2'
tindex_master_DIR = f'/projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/{NAME_TEST_SUBDIR}'
!python /projects/icesat2_boreal/lib/build_tindex_master.py -t ATL08_filt -y 2022 -m $NAME_TEST_SUBDIR -o $tindex_master_DIR

tindex_master_fn = os.path.join(tindex_master_DIR, 'ATL08_filt_tindex_master.csv')
tiles_completed = pd.read_csv(tindex_master_fn)

print(f'Tiles completed: {len(tiles_completed)}')
tile_nums_missing = np.setdiff1d(DPS_INPUT_TILE_NUM_LIST, tiles_completed.tile_num)
print(f'Tiles missing: {len(tile_nums_missing)}')
INPUT_EXPERIMENT_TILE_NUM_LIST = tile_nums_missing.tolist()
print(len(INPUT_EXPERIMENT_TILE_NUM_LIST))

DO_EXPERIMENT = True

  shapely_geos_version, geos_capi_version_string

Building a list of tiles:
MAAP version:		master
Type:		ATL08_filt
Year:		2022
Month:		run_LC_height_thresholds_v2
Days:		1-31

Output dir:  /projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/run_LC_height_thresholds_v2
                                             s3_path  ...                                               file
0  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220322_0013.csv
2  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220322_0012.csv
4  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220322_0020.csv
6  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220322_0011.csv
8  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220322_0018.csv

[5 rows x 3 columns]
# of duplicate tiles: 0
Final # o

In [9]:
len(INPUT_EXPERIMENT_TILE_NUM_LIST)

153

In [10]:
tindex_master_fn = f'/projects/shared-buckets/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv'
tiles = pd.read_csv(tindex_master_fn)
len(tiles)

46166

In [11]:
tiles_completed_no_LC = pd.read_csv('/projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/run_no_LC_height_thresholds/ATL08_filt_tindex_master.csv')
tiles_completed_LC = pd.read_csv('/projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/run_LC_height_thresholds/ATL08_filt_tindex_master.csv')
print(f"# tiles for no LC:\t{len(tiles_completed_no_LC)}")
print(f"# tiles for LC:\t\t{len(tiles_completed_LC)}")
tile_nums_missing_no_LC = np.setdiff1d(INPUT_TEST_TILE_NUM_LIST, tiles_completed_no_LC.tile_num)
tile_nums_missing_LC = np.setdiff1d(INPUT_TEST_TILE_NUM_LIST, tiles_completed_LC.tile_num)
tile_nums_missing_no_LC = set(INPUT_TEST_TILE_NUM_LIST) - set(tiles_completed_no_LC.tile_num)
tile_nums_missing_LC = set(INPUT_TEST_TILE_NUM_LIST) - set(tiles_completed_LC.tile_num)
print(f"tiles missing for no LC:\t{tile_nums_missing_no_LC}")
print(f"tiles missing for LC:\t\t{tile_nums_missing_LC}")

#print(f"tiles for no LC:\t{tiles_completed_no_LC.tile_num}")
#print(f"tiles for LC:\t\t{tiles_completed_LC.tile_num}")

# The missing tiles common to both runs probably wont process b/c they have no ATL08 over land, or no corresponding Landsat or Topo tiles.
# Those missing that are different in each set need to be run
DPS_INPUT_TILE_NUM_LIST_no_LC = list(set(tiles_completed_LC.tile_num) - set(tiles_completed_no_LC.tile_num))
DPS_INPUT_TILE_NUM_LIST_LC = list(set(tiles_completed_no_LC.tile_num) - set(tiles_completed_LC.tile_num))
print(f"Tiles still needed for no LC run: {DPS_INPUT_TILE_NUM_LIST_no_LC}")
print(f"Tiles still needed for LC run: {DPS_INPUT_TILE_NUM_LIST_LC}")


# tiles for no LC:	133
# tiles for LC:		133


NameError: name 'np' is not defined

In [202]:
month_dir_str = 'run_LC_height_thresholds'
index_out_dir = os.path.join('/projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022', month_dir_str)
!python /projects/icesat2_boreal/lib/build_tindex_master.py -t ATL08_filt -y 2022 -m $month_dir_str -o $index_out_dir

  shapely_geos_version, geos_capi_version_string

Building a list of tiles:  ATL08_filt

Output dir:  /projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/run_LC_height_thresholds
                                              s3_path  ...                                               file
0   s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220312_0043.csv
2   s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220312_0054.csv
5   s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220312_0065.csv
7   s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220312_0030.csv
10  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220312_0042.csv

[5 rows x 3 columns]
# of duplicate tiles: 23
Final # of tiles: 133
df shape :                                               s3_path  ... t

In [12]:
TEST_DPS  = False

if TEST_DPS:
    DPS_INPUT_TILE_NUM_LIST = INPUT_TEST_TILE_NUM_LIST
    
    if True:
        #!python /projects/icesat2_boreal/lib/build_tindex_master_v2.py -t ATL08_filt -y 2022 -m $month_dir_str -o $index_out_dir
        t = pd.read_csv(os.path.join(index_out_dir,'ATL08_filt_tindex_master.csv'))
        COMPLETED_TILES = t.tile_num.to_list()
        NEED_TILES = list(set(DPS_INPUT_TILE_NUM_LIST) - set(COMPLETED_TILES))

        print(NEED_TILES)
        DPS_INPUT_TILE_NUM_LIST = NEED_TILES
    
else:
    if DO_EXPERIMENT:
        print('Running DPS on the FULL list of EXPERIMENT input')
        DPS_INPUT_TILE_NUM_LIST = INPUT_EXPERIMENT_TILE_NUM_LIST
    else:
        print('Running DPS on the FULL list of input')
        DPS_INPUT_TILE_NUM_LIST = INPUT_TILE_NUM_LIST
    
print(f"List length: {len(DPS_INPUT_TILE_NUM_LIST)}")

Running DPS on the FULL list of EXPERIMENT input
List length: 153


## Customize the DPS run (choose params, set up the params dictionary)

In [13]:
# Set a default params dict
in_param_dict = {
                        'in_tile_num': '',
                        'in_tile_fn': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v003.gpkg',
                        'in_tile_layer': 'boreal_tiles_v003',
                        'csv_list_fn': 's3://maap-ops-workspace/shared/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv',
                        'topo_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv',
                        'landsat_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Landsat_tindex_master.csv',
                        'landsat_cols_list': 'Blue Green Red NIR SWIR NDVI SAVI MSAVI NDMI EVI NBR NBR2 TCB TCG TCW ValidMask Xgeo Ygeo', 
                        'years_list': '2020',
                        'user_stacks': 'nathanmthomas',
                        'user_atl08': 'lduncanson',
                        'thresh_sol_el': 0,
                        'v_ATL08': 5,
                        'minmonth': 6,
                        'maxmonth': 9,
                        'LC_filter': False
    }

# Norway test 01 --> run_no_LC_height_thresholds_v2
# Just include sol_el so we can use sol_el < 5
in_param_dict_norway01 = in_param_dict
in_param_dict_norway01['years_list']    = '2019 2020 2021'
in_param_dict_norway01['thresh_sol_el'] = 5
in_param_dict_norway01['minmonth']      = 4
in_param_dict_norway01['maxmonth']      = 10
in_param_dict_norway01['LC_filter']     = False

# Norway test 02 --> run_LC_height_thresholds_v2
# Use v005 ATL08, which will apply lc-based thresholds, extend to all months
# NOTE!! make sure you manually update to use the correct filter in tile_atl08.py
in_param_dict_norway02 = in_param_dict
in_param_dict_norway02['years_list']    = '2019 2020 2021'
in_param_dict_norway02['thresh_sol_el'] = 5
in_param_dict_norway02['minmonth']      = 4
in_param_dict_norway02['maxmonth']      = 10
in_param_dict_norway02['LC_filter']     = True

# Norway test 03 --> run_LC_height_thresholds_HLS
# same as test 02, but with HLS composites from 2019 - 2021
# NOTE: HLS composites have SWIR2 JulianDate yearDate
in_param_dict_norway03 = in_param_dict
in_param_dict_norway03['landsat_stack_list_fn'] = 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/HLS_tindex_master.csv'
in_param_dict_norway03['landsat_cols_list']     = 'Blue Green Red NIR SWIR SWIR2 NDVI SAVI MSAVI NDMI EVI NBR NBR2 TCB TCG TCW ValidMask Xgeo Ygeo JulianDate yearDate'
in_param_dict_norway03['years_list']            = '2019 2020 2021'
in_param_dict_norway03['thresh_sol_el']         = 5
in_param_dict_norway03['minmonth']              = 4
in_param_dict_norway03['maxmonth']              = 10
in_param_dict_norway03['LC_filter']             = True

In [14]:
in_param_dict = in_param_dict_norway03
in_param_dict

{'in_tile_num': '',
 'in_tile_fn': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v003.gpkg',
 'in_tile_layer': 'boreal_tiles_v003',
 'csv_list_fn': 's3://maap-ops-workspace/shared/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv',
 'topo_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv',
 'landsat_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/HLS_tindex_master.csv',
 'landsat_cols_list': 'Blue Green Red NIR SWIR SWIR2 NDVI SAVI MSAVI NDMI EVI NBR NBR2 TCB TCG TCW ValidMask Xgeo Ygeo JulianDate yearDate',
 'years_list': '2019 2020 2021',
 'user_stacks': 'nathanmthomas',
 'user_atl08': 'lduncanson',
 'thresh_sol_el': 5,
 'v_ATL08': 5,
 'minmonth': 4,
 'maxmonth': 10,
 'LC_filter': True}

## Run a DPS job across the list

In [15]:
%%time
submit_results_df_list = []
len_input_list = len(DPS_INPUT_TILE_NUM_LIST)
print(f"# of input tiles for DPS: {len_input_list}")

for i, INPUT_TILE_NUM in enumerate(DPS_INPUT_TILE_NUM_LIST):

    DPS_num = i+1
    IDENTIFIER = 'run_tile_atl08'
    ALGO_ID = f'{IDENTIFIER}_ubuntu'
    USER = 'lduncanson'
    WORKER_TYPE = 'maap-dps-worker-16gb'
    
    in_param_dict['in_tile_num'] = INPUT_TILE_NUM
    
    submit_result = maap.submitJob(
            identifier=IDENTIFIER,
            algo_id=ALGO_ID,
            version='master',
            username=USER, # username needs to be the same as whoever created the workspace
            queue=WORKER_TYPE,
            **in_param_dict
        )
    
    # Build a dataframe of submission details
    submit_result['dps_num'] = DPS_num
    submit_result['tile_num'] = INPUT_TILE_NUM
    submit_result['submit_time'] = datetime.datetime.now()
    submit_result['dbs_job_hour'] =datetime.datetime.now().hour
    submit_result['algo_id'] = ALGO_ID
    submit_result['user'] = USER
    submit_result['worker_type'] = WORKER_TYPE

    # Append to a list of data frames of submission results
    submit_results_df_list.append(pd.DataFrame([submit_result]))
    
    if DPS_num in [1, 25, 50, 100, 500, 1000, 1500, 2000, 3000, 5000, 7000, 9000, 11000, 13000, 15000, 17000, 19000, 21000, 24000, len_input_list]:
        print(f"DPS run #: {DPS_num}\t| tile num: {INPUT_TILE_NUM}\t| submit status: {submit_result['status']}\t| job id: {submit_result['job_id']}") 

# Build a final submission results df and save
submit_results_df = pd.concat(submit_results_df_list)
nowtime = pd.Timestamp.now().strftime('%Y%m%d%H%M')
print(f"Current time:\t{nowtime}")
OUT_SUBMISSION_CSV_FN = f'/projects/my-public-bucket/DPS_{IDENTIFIER}_submission_results_{len_input_list}_{nowtime}.csv'
submit_results_df.to_csv(OUT_SUBMISSION_CSV_FN)
print(OUT_SUBMISSION_CSV_FN)
submit_results_df

# of input tiles for DPS: 153
DPS run #: 1	| tile num: 131	| submit status: success	| job id: d24767f8-c73c-4f12-adda-3926cdd6cc54
DPS run #: 25	| tile num: 151	| submit status: success	| job id: ec788068-498a-4240-bc19-6fbad723cd84
DPS run #: 50	| tile num: 301	| submit status: success	| job id: 6e1d0e64-de2a-40bd-a019-3b35ae87c0c0
DPS run #: 100	| tile num: 354	| submit status: success	| job id: 1f786e45-691a-43bd-ad71-b29dfa781c17
DPS run #: 153	| tile num: 3648	| submit status: success	| job id: 2e13e554-a5ee-4da5-967b-d5ff4de28db6
Current time:	202203291816
/projects/my-public-bucket/DPS_run_tile_atl08_submission_results_153_202203291816.csv
CPU times: user 1.74 s, sys: 162 ms, total: 1.91 s
Wall time: 36.5 s


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type
0,success,200,d24767f8-c73c-4f12-adda-3926cdd6cc54,1,131,2022-03-29 18:15:36.419619,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,b997d8a0-3cfc-4446-b088-77ba35e9837c,2,132,2022-03-29 18:15:36.525031,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,4a5a70ee-d1c9-4dc6-b608-9acc22317003,3,133,2022-03-29 18:15:36.634964,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,ab3798b4-1e99-4b71-98b2-84b1dafae681,4,4,2022-03-29 18:15:36.869841,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,6a8bf3f0-a149-4bf6-941f-9ee44fa00482,5,5,2022-03-29 18:15:37.099964,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
...,...,...,...,...,...,...,...,...,...,...
0,success,200,848329da-c06d-49eb-954f-739903942909,149,3644,2022-03-29 18:16:11.663888,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,9bfac526-dd55-4928-afec-575567af3b9d,150,3645,2022-03-29 18:16:11.888976,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,b437ded6-0576-4d9e-821d-0786448bfacb,151,3646,2022-03-29 18:16:12.114767,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb
0,success,200,ed6b090b-995e-43c3-abcc-98f8de09c0ad,152,3647,2022-03-29 18:16:12.376203,18,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-16gb


After almost any DPS job, you have to assess what succeeded and failed. This involves:
1. building a table of job status based on job ids captured in the job_results_df from the DPS run chunk (this takes 40 mins for ~47k jobs) --> this tells you how many jobs failed
2. merging the job status table with the job results df --> this tells you which specific granules (or tile nums) failed
3. building another input list of granules for a follow-up DPS
## Assess DPS results
Build a table of job status based on job id - how many jobs failed?

In [16]:
def BUILD_TABLE_JOBSTATUS(submit_results_df):
    import xmltodict
    job_status_df = pd.concat([pd.DataFrame(xmltodict.parse(maap.getJobStatus(job_id).content)).transpose() for job_id in submit_results_df.job_id.to_list()])
    job_status_df = submit_results_df.merge(job_status_df, how='left', left_on='job_id',  right_on='wps:JobID')
    return job_status_df

In [24]:
%%time
z = BUILD_TABLE_JOBSTATUS(submit_results_df)

print(f'Count total jobs:\t{len(z)}')
print(f"Count pending jobs:\t{z[z['wps:Status'] =='Accepted'].shape[0]}")
print(f"Count running jobs:\t{z[z['wps:Status'] =='Running'].shape[0]}")
print(f"Count succeeded jobs:\t{z[z['wps:Status'] =='Succeeded'].shape[0]}")
print(f"Count failed jobs:\t{z[z['wps:Status'] =='Failed'].shape[0]}")
print(f"% of failed jobs:\t{round(z[z['wps:Status'] =='Failed'].shape[0] / ( z[z['wps:Status'] =='Failed'].shape[0] + z[z['wps:Status'] =='Succeeded'].shape[0] ), 4) * 100}")

Count total jobs:	153
Count pending jobs:	0
Count running jobs:	0
Count succeeded jobs:	136
Count failed jobs:	17
% of failed jobs:	11.110000000000001
CPU times: user 1.63 s, sys: 123 ms, total: 1.75 s
Wall time: 4.68 s


In [100]:
xmltodict.parse(maap.getJobResult(z[z['wps:Status'] =='Succeeded'].iloc[0].job_id).content)

OrderedDict([('wps:Result',
              OrderedDict([('@xmlns:ows', 'http://www.opengis.net/ows/2.0'),
                           ('@xmlns:schemaLocation',
                            'http://schemas.opengis.net/wps/2.0/wps.xsd'),
                           ('@xmlns:wps', 'http://www.opengis.net/wps/2.0'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('wps:JobID',
                            'fd103a8d-83b1-41d7-b795-f2f328f16269'),
                           ('wps:Output',
                            OrderedDict([('@id',
                                          'output-2022-03-22T19:20:58.003202'),
                                         ('wps:Data',
                                          ['http://maap-ops-workspace.s3-website-us-west-2.amazonaws.com/lduncanson/dps_output/run_tile_atl08_ubuntu/master/2022/03/22/19/20/58/003202',
                                           's3://s3.us

In [64]:
xmltodict.parse(maap.getJobResult(z[z['wps:Status'] =='Failed'].iloc[0].job_id).content)

OrderedDict([('wps:Result',
              OrderedDict([('@xmlns:ows', 'http://www.opengis.net/ows/2.0'),
                           ('@xmlns:schemaLocation',
                            'http://schemas.opengis.net/wps/2.0/wps.xsd'),
                           ('@xmlns:wps', 'http://www.opengis.net/wps/2.0'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('wps:JobID',
                            '9e3ec766-af50-4d54-9412-2d966c141a3d'),
                           ('wps:Output',
                            OrderedDict([('@id', 'traceback'),
                                         ('wps:Data',
                                          'Traceback (most recent call last):\n  File "/home/ops/verdi/ops/hysds-0.3.11/hysds/job_worker.py", line 1126, in run_job\n    raise RuntimeError("Got non-zero exit code: {}".format(status))\nRuntimeError: Got non-zero exit code: 143')]))]))])

In [25]:
!python /projects/icesat2_boreal/lib/build_tindex_master.py -t ATL08_filt -y 2022 -m 03 -o /projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/03

  shapely_geos_version, geos_capi_version_string

Building a list of tiles:
MAAP version:		master
Type:		ATL08_filt
Year:		2022
Month:		03
Days:		1-31

Output dir:  /projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/03
                                             s3_path  ...                                               file
0  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220329_0004.csv
2  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220329_0133.csv
4  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220329_0131.csv
6  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220329_0132.csv
8  s3://maap-ops-workspace/lduncanson/dps_output/...  ...  atl08_005_30m_filt_topo_landsat_20220329_0009.csv

[5 rows x 3 columns]
# of duplicate tiles: 0
Final # of tiles: 128
df shape :                           

[387, 388, 5, 6, 7, 10, 14, 16, 275, 21, 25, 27, 28, 29, 416, 37, 38, 296, 26025, 299, 300, 301, 48, 177, 50, 178, 60, 198, 3270, 72, 326, 327, 328, 26574, 354, 355, 356, 357, 247]


In [153]:
%%time
TILE_NUM = 131 #NEED_TILES[6]
args = f"\
-LC_filter True \
--extract_covars \
--do_30m \
--do_dps \
-years_list 2019 2020 2021 \
-o /projects/my-public-bucket/atl08_filt_covar_tiles \
-in_tile_num {TILE_NUM} \
-in_tile_fn /projects/shared-buckets/nathanmthomas/boreal_tiles_v003.gpkg \
-in_tile_layer boreal_tiles_v003 \
-in_tile_id_col tile_num \
-csv_list_fn /projects/shared-buckets/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv \
-topo_stack_list_fn /projects/shared-buckets/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv \
-landsat_stack_list_fn /projects/shared-buckets/nathanmthomas/DPS_tile_lists/Landsat_tindex_master.csv \
-user_stacks nathanmthomas \
-user_atl08 lduncanson \
-thresh_sol_el 5 \
-v_ATL08 5 -minmonth 4 -maxmonth 10"
print(args)
!python /projects/icesat2_boreal/lib/tile_atl08.py $args

-LC_filter True --extract_covars --do_30m --do_dps -years_list 2019 2020 2021 -o /projects/my-public-bucket/atl08_filt_covar_tiles -in_tile_num 131 -in_tile_fn /projects/shared-buckets/nathanmthomas/boreal_tiles_v003.gpkg -in_tile_layer boreal_tiles_v003 -in_tile_id_col tile_num -csv_list_fn /projects/shared-buckets/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv -topo_stack_list_fn /projects/shared-buckets/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv -landsat_stack_list_fn /projects/shared-buckets/nathanmthomas/DPS_tile_lists/Landsat_tindex_master.csv -user_stacks nathanmthomas -user_atl08 lduncanson -thresh_sol_el 5 -v_ATL08 5 -minmonth 4 -maxmonth 10
  shapely_geos_version, geos_capi_version_string

Land cover filtering set to: True

Working on tile:	 131
From layer:		 boreal_tiles_v003
In vector file:		 /projects/shared-buckets/nathanmthomas/boreal_tiles_v003.gpkg
ATL08 version:		 5
Season start:		 04-01
Season end:		 10-31
Years:			 [2019, 2020, 2021]
ATL08 bin length:	 3