In [1]:
from maap.maap import MAAP
maap = MAAP(maap_host='api.ops.maap-project.org')

# Launch DPS for extract_filter_atl08.py
using the rebinned 30m h5 granules

In [2]:
from os import path
import os, glob
import datetime
import pandas as pd
import geopandas as gpd

  shapely_geos_version, geos_capi_version_string


In [3]:
curr_dir = wk_dir = os.path.dirname(os.path.realpath('__file__'))
print(curr_dir)

/projects/icesat2_boreal/dps/alg_2-3


## Test extract_filter_atl08.py code on Test Data
We are running extract_filter_atl08.py (but *FILTERING* is turned off):

python /projects/icesat2_boreal/lib/extract_filter_atl08.py -o "/projects/test_data/test_data_30m" -i "/projects/my-private-bucket/dps_output/run_rebinning_ubuntu/master/2022/02/25/18/51/17/542274/ATL08_30m_20190721220156_03640403_005_01.h5" --no-filter-qual --do_30m

python /projects/icesat2_boreal/lib/extract_filter_atl08.py -o "/projects/test_data/test_data_30m" -i "https://maap-ops-workspace.s3.amazonaws.com.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/02/25/18/51/17/542274/ATL08_30m_20190721220156_03640403_005_01.h5" --no-filter-qual --do_30m

python lib/extract_filter_atl08.py -i "path/to/h5file" -o "path/of/out/dir" --no-filter-qual --do_30m

### Build the input list of granules - s3fs implementation

In [5]:
%%time
import s3fs
s3 = s3fs.S3FileSystem()
bucket = "s3://maap-ops-workspace"
searchkey_h5_list = [f'lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/{DAY}/**/*.h5' for DAY in ['03','04','05']]
searchkey_csv_list = [f'lduncanson/dps_output/run_extract_filter_atl08_ubuntu/master/2022/03/{DAY}/**/*.csv' for DAY in ['06','07','08', '09', '10','11']]

# Concat list of lists to data frame
atl0830m_h5_path_df = pd.concat([pd.DataFrame(s3.glob(os.path.join(bucket, searchkey)), columns=['maap_path']) for searchkey in searchkey_h5_list])
atl0830m_csv_path_df = pd.concat([pd.DataFrame(s3.glob(os.path.join(bucket, searchkey)), columns=['maap_path']) for searchkey in searchkey_csv_list])

#Convert data frame to list
ATL08_h5_GRANULE_LIST = [i.replace("maap-ops-workspace", "https://maap-ops-workspace.s3.amazonaws.com") for i in atl0830m_h5_path_df.maap_path.to_list()]
ATL08_csv_GRANULE_LIST  = [i.replace("maap-ops-workspace", "https://maap-ops-workspace.s3.amazonaws.com") for i in atl0830m_csv_path_df.maap_path.to_list()]

CPU times: user 54.1 s, sys: 831 ms, total: 54.9 s
Wall time: 1min 32s


### Find h5 granules that dont yet have csvs

In [6]:
grans = [os.path.split(i)[-1].replace('.h5','') for i in atl0830m_h5_path_df.maap_path.to_list()]
ATL08_h5_GRANULE_df = pd.DataFrame({'h5_path': ATL08_h5_GRANULE_LIST, 'granule_name': grans})

csvs  = [os.path.split(i)[-1].replace('_30m.csv','') for i in atl0830m_csv_path_df.maap_path.to_list()]
ATL08_csv_GRANULE_df = pd.DataFrame({'csv_path': ATL08_csv_GRANULE_LIST, 'granule_name': csvs})

z = ATL08_h5_GRANULE_df.merge(ATL08_csv_GRANULE_df, how='left', on='granule_name')

#print(z[z.csv_path.isnull()])

INPUT_ATL08_GRANULE_LIST = z[z.csv_path.isnull()].h5_path.to_list()
len(INPUT_ATL08_GRANULE_LIST)

25053

In [7]:
TEST_DPS  = True

if TEST_DPS:
    print('Running DPS on a SUBSET list of input')
    DPS_INPUT_ATL08_GRANULE_LIST = INPUT_ATL08_GRANULE_LIST[-10:]
else:
    print('Running DPS on the FULL list of input')
    DPS_INPUT_ATL08_GRANULE_LIST = INPUT_ATL08_GRANULE_LIST
    
DPS_INPUT_ATL08_GRANULE_LIST[-1]

Running DPS on a SUBSET list of input


'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/05/04/57/42/150243/ATL08_30m_20210707203601_02171206_005_01.h5'

## Run a DPS job across the list

In [8]:
job_results_list = []

for i, INPUT_ATL08_GRANULE in enumerate(DPS_INPUT_ATL08_GRANULE_LIST):

    DPS_num = i+1
    IDENTIFIER = 'run_extract_filter_atl08'
    ALGO_ID = f'{IDENTIFIER}_ubuntu'
    USER = 'lduncanson'
    WORKER_TYPE = 'maap-dps-worker-8gb'
    
    in_param_dict = {
                        'input_file': INPUT_ATL08_GRANULE
                    }

    submit_result = maap.submitJob(
            identifier=IDENTIFIER,
            algo_id=ALGO_ID,
            version='master',
            username=USER, # username needs to be the same as whoever created the workspace
            queue=WORKER_TYPE,
            **in_param_dict
        )
    
    # Build a dataframe of submission details
    submit_result['dps_num'] = DPS_num
    submit_result['tile_num'] = INPUT_ATL08_GRANULE
    submit_result['submit_time'] = datetime.datetime.now()
    submit_result['dbs_job_hour'] =datetime.datetime.now().hour
    submit_result['algo_id'] = ALGO_ID
    submit_result['user'] = USER
    submit_result['worker_type'] = WORKER_TYPE
    job_results_list.append(pd.DataFrame([submit_result]))
    
    if DPS_num in [1, 100, 500, 1000, 3000, 5000, 7000, 9000, 11000, 13000, 15000, 17000, 19000, 21000, 24000, len(DPS_INPUT_ATL08_GRANULE_LIST)]:
        print(f"DPS run #: {DPS_num} | granule name: {os.path.basename(INPUT_ATL08_GRANULE)} | sumbit status: {submit_result['status']} | job id: {submit_result['job_id']}") 
        #print(submit_result)
        
job_results_df = pd.concat(job_results_list)
job_results_df

DPS run #: 1 | granule name: ATL08_30m_20210621224339_13611106_005_01.h5 | sumbit status: success | job id: 6b7866ad-aa99-4b7c-84df-8daab6e1def6
DPS run #: 10 | granule name: ATL08_30m_20210707203601_02171206_005_01.h5 | sumbit status: success | job id: ebe486e6-a7f4-4d4a-90b6-333751bf1971


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type
0,success,200,6b7866ad-aa99-4b7c-84df-8daab6e1def6,1,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:42.102770,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,ea31aa55-2921-4fdb-af8c-eeb073891ec7,2,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:42.233879,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,d09425e2-4638-43db-9461-726520d84f4a,3,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:42.379710,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,538c7cd2-1c43-41c4-8531-c54aacfb9109,4,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:42.571315,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,0a259372-6ab4-406f-a7db-dd9c589e19bb,5,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:42.847262,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,962dc014-e8d4-484c-bdc4-602a8a31ee76,6,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:43.090748,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,9e567569-074e-44fc-8f01-85f7e26a9b11,7,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:43.259073,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,0c2065df-8249-4023-874a-f35de4aadd30,8,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:43.502665,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,45846ce4-9a58-43a1-aafe-c1b8057d5195,9,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:43.683376,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,ebe486e6-a7f4-4d4a-90b6-333751bf1971,10,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 13:28:43.854852,13,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb


In [9]:
!pip install xmltodict

Collecting xmltodict
  Using cached xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0
[0m

In [39]:
pd.__version__

'1.2.2'

After almost any DPS job, you have to assess what succeeded and failed. This involves:
1. building a table of job status based on job ids captured in the job_results_df from the DPS run chunk (this takes 40 mins for ~47k jobs) --> this tells you how many jobs failed
2. merging the job status table with the job results df --> this tells you which specific granules (or tile nums) failed
3. building another input list of granules for a follow-up DPS
### Build a table of job status based on job id - how many jobs failed?

In [13]:
%%time
def BUILD_TABLE_JOBSTATUS(job_results_df):
    import xmltodict
    #tmp_job_results_df = job_results_df#.iloc[47300:47310]
    df = pd.concat([pd.DataFrame(xmltodict.parse(maap.getJobStatus(job_id).content)).transpose() for job_id in job_results_df.job_id.to_list()])
    return df

df = BUILD_TABLE_JOBSTATUS(job_results_df)
df

CPU times: user 117 ms, sys: 7.59 ms, total: 125 ms
Wall time: 362 ms


Unnamed: 0,@xmlns:ows,@xmlns:schemaLocation,@xmlns:wps,@xmlns:xsi,wps:JobID,wps:Status
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,6b7866ad-aa99-4b7c-84df-8daab6e1def6,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,ea31aa55-2921-4fdb-af8c-eeb073891ec7,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,d09425e2-4638-43db-9461-726520d84f4a,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,538c7cd2-1c43-41c4-8531-c54aacfb9109,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,0a259372-6ab4-406f-a7db-dd9c589e19bb,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,962dc014-e8d4-484c-bdc4-602a8a31ee76,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,9e567569-074e-44fc-8f01-85f7e26a9b11,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,0c2065df-8249-4023-874a-f35de4aadd30,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,45846ce4-9a58-43a1-aafe-c1b8057d5195,Failed
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,ebe486e6-a7f4-4d4a-90b6-333751bf1971,Failed


#### Join job_results_df with job status info - which granules failed?

In [11]:
num_jobs_round1 = job_results_df.shape[0]
z = job_results_df.merge(df, how='left', left_on='job_id', right_on='wps:JobID')
fails_df = z[z['wps:Status'] =='Failed']
num_fails_round1 = fails_df.shape[0]
print(f'Count Round 1 total jobs:\t{num_jobs_round1}')
print(f'Count Round 1 failed jobs:\t{num_fails_round1}')
print(f'Count Round 1 jobs for Round 2:\t{num_jobs_round1-num_fails_round1}')

Count Round 1 total jobs:	10
Count Round 1 failed jobs:	2
Count Round 1 jobs for Round 2:	8


##### Pick a job and manually test - does it produce output as expected?
if so, then AWS had some issue during the DPS: contact MAAP dev about this

In [12]:
f = fails_df.iloc[1].tile_num.replace('https://maap-ops-workspace.s3.amazonaws.com/lduncanson','/projects/my-private-bucket')
!python /projects/icesat2_boreal/dps/alg_2-3/extract_filter_atl08.py --i $f --no-filter-qual --do_30m -o /projects/my-public-bucket/test_output


Written by:
	Nathan Thomas	| @Nmt28
	Paul Montesano	| paul.m.montesano@nasa.gov

Min lat: 45.0
Max lat: 75.0
Min lon: -180.0
Max lon: 180.0
Month range: 6-9

ATL08 granule name: 	ATL08_30m_20210714235338_03261206_005_01
Input dir: 		/projects/my-private-bucket/dps_output/run_rebinning_ubuntu/master/2022/03/05/04/51/06/341661

Segment length: 30m
Find src nodata value using max of h_can: 	nan

Building pandas dataframe...
Setting pandas df nodata values to np.nan for some basic eval.
# of ATL08 obs: 		211006
# of ATL08 obs (can pho.>=0): 	77997
# of ATL08 obs (toc pho.>=0): 	77997
# of ATL08 obs (h_can>=0): 	77905
# of ATL08 obs (h_can<0): 	92
Setting out pandas df nodata values: 	3.4028234663852886e+38
Quality Filtering: 	[OFF] (do downstream)
Geographic Filtering: 	[ON] xmin = -180.0, xmax = 180.0, ymin = 45.0, ymax = 75.0
Creating CSV: 		/projects/my-public-bucket/test_output/ATL08_30m_20210714235338_03261206_005_01_30m.csv


### Compile granules of the fails for another DPS batch (Round2)

In [117]:
DPS_INPUT_ATL08_GRANULE_LIST_ROUND2 = fails_df.tile_num.to_list()
DPS_INPUT_ATL08_GRANULE_LIST_ROUND2[0]

'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/03/01/08/50/075739/ATL08_30m_20181116094717_07450103_005_01.h5'

In [181]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [189]:
import pprint
DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_1, DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_2, DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_3, DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_4, DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_5 = list(chunks(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4, 5000))
print(
    len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_1), len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_2), len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_3), len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_4), len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_5)
)

5000 5000 5000 5000 280


In [190]:
#job_results_list_round2 = []
DPS_INPUT_ATL08_GRANULE_LIST = DPS_INPUT_ATL08_GRANULE_LIST_ROUND4_1
for i, INPUT_ATL08_GRANULE in enumerate(DPS_INPUT_ATL08_GRANULE_LIST):

    DPS_num = i+1
    IDENTIFIER = 'run_extract_filter_atl08'
    ALGO_ID = f'{IDENTIFIER}_ubuntu'
    USER = 'lduncanson'
    WORKER_TYPE = 'maap-dps-worker-8gb'
    
    in_param_dict = {
                        'input_file': INPUT_ATL08_GRANULE
                    }

    submit_result = maap.submitJob(
            identifier=IDENTIFIER,
            algo_id=ALGO_ID,
            version='master',
            username=USER, # username needs to be the same as whoever created the workspace
            queue=WORKER_TYPE,
            **in_param_dict
        )
    
    # Build a dataframe of submission details
    submit_result['dps_num'] = DPS_num
    submit_result['tile_num'] = INPUT_ATL08_GRANULE
    submit_result['submit_time'] = datetime.datetime.now()
    submit_result['dbs_job_hour'] =datetime.datetime.now().hour
    submit_result['algo_id'] = ALGO_ID
    submit_result['user'] = USER
    submit_result['worker_type'] = WORKER_TYPE
    job_results_list.append(pd.DataFrame([submit_result]))
    
    if DPS_num in [1, 100, 500, 1000, 3000, len(DPS_INPUT_ATL08_GRANULE_LIST)]:
        #print(f"DPS run #: {DPS_num} | granule name: {os.path.basename(INPUT_ATL08_GRANULE)} | job info: {submit_result}") 
        print(submit_result)
        
#job_results_df_round3 = pd.concat(job_results_list)
#job_results_df_round3

{'status': 'success', 'http_status_code': 200, 'job_id': 'b7670534-aa0f-4e98-9916-4338b17de8df', 'dps_num': 1, 'tile_num': 'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/05/03/42/03/243629/ATL08_30m_20201212034903_12060906_005_01.h5', 'submit_time': datetime.datetime(2022, 3, 9, 0, 29, 52, 808637), 'dbs_job_hour': 0, 'algo_id': 'run_extract_filter_atl08_ubuntu', 'user': 'lduncanson', 'worker_type': 'maap-dps-worker-8gb'}
{'status': 'success', 'http_status_code': 200, 'job_id': 'b425b3aa-960b-4ea8-9f3c-2e52a6152a1d', 'dps_num': 100, 'tile_num': 'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/05/03/42/58/464148/ATL08_30m_20201122041440_09010903_005_01.h5', 'submit_time': datetime.datetime(2022, 3, 9, 0, 30, 12, 777351), 'dbs_job_hour': 0, 'algo_id': 'run_extract_filter_atl08_ubuntu', 'user': 'lduncanson', 'worker_type': 'maap-dps-worker-8gb'}
{'status': 'success', 'http_status_c

##### The round2 job results list was appended to the round1 list: build a table with all

In [1]:
job_results_df_all = pd.concat(job_results_list)
job_results_df_all

NameError: name 'pd' is not defined

#### Update the job status table with job status results from Round 2

In [165]:
%%time
# The index of the last job from round 1: num_jobs_round1-1
# Confirm by checking the submit_time difference between the last round1 and the first round2 job
print(
    job_results_df_all.iloc[num_jobs_round1-1] ,
    job_results_df_all.iloc[num_jobs_round1]
)

job_status_df_all = BUILD_TABLE_JOBSTATUS(job_results_df_all)
job_status_df_all

status                                                        success
http_status_code                                                  200
job_id                           df9753b8-dd90-486b-8bbf-90eb1ad1a41c
dps_num                                                         47310
tile_num            https://maap-ops-workspace.s3.amazonaws.com/ld...
submit_time                                2022-03-08 00:05:30.819628
dbs_job_hour                                                        0
algo_id                               run_extract_filter_atl08_ubuntu
user                                                       lduncanson
worker_type                                       maap-dps-worker-8gb
Name: 0, dtype: object status                                                        success
http_status_code                                                  200
job_id                           a0303acd-fe1d-4ade-affa-2326a6aca7f1
dps_num                                                            

Unnamed: 0,@xmlns:ows,@xmlns:schemaLocation,@xmlns:wps,@xmlns:xsi,wps:JobID,wps:Status,ows:Exception
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,28522faa-f1b2-489f-b578-36c445a9a159,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,ad3d5b38-61a8-4283-a2ae-63b827c309e1,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,1ab341b7-c4c4-49e9-b6d3-ec5f9a8beb83,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,4309a4e8-02a0-4ae8-9928-32ebfb883c4c,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,dba94bc2-7ba4-4a1a-8164-7c8a653cb2b7,Failed,
...,...,...,...,...,...,...,...
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,7a00e4a7-d54b-48b8-b7ee-d91b4d802357,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,c4fa432e-7b14-4c5d-8ca0-ad7c724c5d3b,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,1893451b-28ff-4de9-8b7d-29331db1d98d,Failed,
wps:StatusInfo,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,8180012f-d9dc-4f67-83fe-609f13cd1599,Failed,


In [172]:
print(
    job_results_df_all.iloc[num_jobs_round1:].shape,
    job_results_df_all.iloc[-len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND3):].shape
)
job_results_df_all.iloc[-len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND3):].head()

(51279, 10) (24524, 10)


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type
0,success,200,03a32d3e-05a5-485c-ba4d-9803e71f6831,26756,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:44.660007,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,dedff53d-2936-4fcb-8da5-54c775e2c0a0,26757,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.045439,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,abfdb407-178e-4eb5-8c6e-2e69f0883c68,26758,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.203085,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,a32cb133-7b01-4710-a106-b387d53a11b3,26759,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.295580,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,d17cd083-9149-4ff1-a1de-dd9584a9d4bf,26760,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.406540,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb


##### Isolate the granules associated with the last round's fails from the updated 'all' job status table

In [175]:
j_results_status_last_round = job_results_df_all.iloc[-len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND3):].merge(job_status_df_all, how='left', left_on='job_id', right_on='wps:JobID')
fails_df = j_results_status_last_round[j_results_status_last_round['wps:Status'] =='Failed']
num_fails_last_round = fails_df.shape[0]
print(f"Count total jobs:\t{num_jobs_round1}")
print(f"Count last round's failed jobs:\t{num_fails_last_round}")
fails_df.head()

Count total jobs:	47310
Count last round's failed jobs:	20280


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type,@xmlns:ows,@xmlns:schemaLocation,@xmlns:wps,@xmlns:xsi,wps:JobID,wps:Status,ows:Exception
1,success,200,dedff53d-2936-4fcb-8da5-54c775e2c0a0,26757,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.045439,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,dedff53d-2936-4fcb-8da5-54c775e2c0a0,Failed,
2,success,200,abfdb407-178e-4eb5-8c6e-2e69f0883c68,26758,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.203085,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,abfdb407-178e-4eb5-8c6e-2e69f0883c68,Failed,
5,success,200,d6f25679-c8ee-4931-816f-d82be164d81f,26761,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:45.625142,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,d6f25679-c8ee-4931-816f-d82be164d81f,Failed,
7,success,200,2c62acfd-1b35-4464-b456-5370c281dfd2,26763,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:46.101489,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,2c62acfd-1b35-4464-b456-5370c281dfd2,Failed,
8,success,200,3299d6bf-3c3e-4974-a627-ef42910e9209,26764,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-08 05:10:46.283356,5,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb,http://www.opengis.net/ows/2.0,http://schemas.opengis.net/wps/2.0/wps.xsd,http://www.opengis.net/wps/2.0,http://www.w3.org/2001/XMLSchema-instance,3299d6bf-3c3e-4974-a627-ef42910e9209,Failed,


### Compile granules from last round's fails for a new DPS batch

In [179]:
DPS_INPUT_ATL08_GRANULE_LIST_ROUND4 = fails_df.tile_num.to_list()
print(len(DPS_INPUT_ATL08_GRANULE_LIST_ROUND4))
DPS_INPUT_ATL08_GRANULE_LIST_ROUND4[0]


20280


'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/05/03/42/03/243629/ATL08_30m_20201212034903_12060906_005_01.h5'

In [162]:
DPS_INPUT_ATL08_GRANULE_LIST_ROUND3[1]

'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/03/01/12/41/622031/ATL08_30m_20181115083009_07290102_005_01.h5'

### Examine output of DPS - but replace glob.glob with s3.glob

In [19]:
#print(f"The data frame show you submitted {len(job_results_df)} jobs. Check the returned results to see if the total returned = total submitted...")
for JOB_HOUR in range(20,21):
    returned_results_list = glob.glob(f"/projects/my-private-bucket/dps_output/run_extract_filter_atl08_ubuntu/master/2022/03/04/{JOB_HOUR}/**/*.csv", recursive=True)
    print(f"For DPS job that returned results in hour {JOB_HOUR}, # granules that ran: {len(returned_results_list)}")

For DPS job that returned results in hour 20, # granules that ran: 10


In [32]:
# Merge all files in the list
print("Creating pandas data frame...")
atl08_gdf = pd.concat((pd.read_csv(f) for f in returned_results_list ), sort=False, ignore_index=True) # <--generator is (), list is []
atl08_gdf = gpd.GeoDataFrame(atl08_gdf, geometry=gpd.points_from_xy(atl08_gdf.lon, atl08_gdf.lat), crs='epsg:4326')

Creating pandas data frame...


In [34]:
atl08_gdf.head()

Unnamed: 0,fid,lon,lat,dt,orb_orient,orb_num,rgt,gt,segid_beg,segid_end,...,asr,h_dif_ref,ter_flg,ph_rem_flg,dem_rem_flg,seg_wmask,lyr_flg,seg_cover,granule_name,geometry
0,14887,75.620664,45.00049,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249813.842915,249817.842915,...,0.158199,-1.342682,1,0,0,0,0,48,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62066 45.00049)
1,14888,75.620628,45.000758,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249815.336919,249819.336919,...,0.158199,-1.342682,1,0,0,0,0,48,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62063 45.00076)
2,14889,75.620401,45.002371,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249824.331298,249828.331298,...,0.158199,-1.073212,1,0,0,0,0,42,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62040 45.00237)
3,14890,75.620364,45.00264,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249825.83088,249829.83088,...,0.158199,-1.073212,1,0,0,0,0,42,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62036 45.00264)
4,14891,75.620291,45.003178,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249828.83088,249832.83088,...,0.129356,-1.743439,1,0,0,0,0,24,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62029 45.00318)


In [27]:
os.system( "python /projects/icesat2_boreal/lib/build_tindex_master.py -t ATL08 -dps_year 2022 -m 3 --start_day '04' --outdir /projects/my-public-bucket/test_output")

512