In [50]:
from maap.maap import MAAP
maap = MAAP(maap_host='api.ops.maap-project.org')
!pip install xmltodict
import xmltodict

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

# Launch DPS for extract_filter_atl08.py
using the rebinned 30m h5 granules

In [51]:
from os import path
import os, glob
import datetime
import pandas as pd
import geopandas as gpd

In [52]:
curr_dir = wk_dir = os.path.dirname(os.path.realpath('__file__'))
print(curr_dir)

/projects/icesat2_boreal/dps/alg_2-3


## Test extract_filter_atl08.py code on Test Data
We are running extract_filter_atl08.py (but *FILTERING* is turned off):

python /projects/icesat2_boreal/lib/extract_filter_atl08.py -o "/projects/test_data/test_data_30m" -i "/projects/my-private-bucket/dps_output/run_rebinning_ubuntu/master/2022/02/25/18/51/17/542274/ATL08_30m_20190721220156_03640403_005_01.h5" --no-filter-qual --do_30m

python /projects/icesat2_boreal/lib/extract_filter_atl08.py -o "/projects/test_data/test_data_30m" -i "https://maap-ops-workspace.s3.amazonaws.com.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/02/25/18/51/17/542274/ATL08_30m_20190721220156_03640403_005_01.h5" --no-filter-qual --do_30m

python lib/extract_filter_atl08.py -i "path/to/h5file" -o "path/of/out/dir" --no-filter-qual --do_30m

### Build the input DPS list of granules - s3fs implementation
by finding h5 granules that dont yet have csvs

In [53]:
%%time
import s3fs
s3 = s3fs.S3FileSystem()
bucket = "s3://maap-ops-workspace"
searchkey_h5_list = [f'lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/{DAY}/**/*.h5' for DAY in ['03','04','05']]
searchkey_csv_list = [f'lduncanson/dps_output/run_extract_filter_atl08_ubuntu/master/2022/03/{DAY}/**/*.csv' for DAY in ['06','07','08', '09', '10','11']]

# Concat list of lists to data frame
atl0830m_h5_path_df = pd.concat([pd.DataFrame(s3.glob(os.path.join(bucket, searchkey)), columns=['maap_path']) for searchkey in searchkey_h5_list])
atl0830m_csv_path_df = pd.concat([pd.DataFrame(s3.glob(os.path.join(bucket, searchkey)), columns=['maap_path']) for searchkey in searchkey_csv_list])

#Convert data frame to list
ATL08_h5_GRANULE_LIST = [i.replace("maap-ops-workspace", "https://maap-ops-workspace.s3.amazonaws.com") for i in atl0830m_h5_path_df.maap_path.to_list()]
ATL08_csv_GRANULE_LIST  = [i.replace("maap-ops-workspace", "https://maap-ops-workspace.s3.amazonaws.com") for i in atl0830m_csv_path_df.maap_path.to_list()]

grans = [os.path.split(i)[-1].replace('.h5','') for i in atl0830m_h5_path_df.maap_path.to_list()]
ATL08_h5_GRANULE_df = pd.DataFrame({'h5_path': ATL08_h5_GRANULE_LIST, 'granule_name': grans})

csvs  = [os.path.split(i)[-1].replace('_30m.csv','') for i in atl0830m_csv_path_df.maap_path.to_list()]
ATL08_csv_GRANULE_df = pd.DataFrame({'csv_path': ATL08_csv_GRANULE_LIST, 'granule_name': csvs})

print('Matching extracted csv to ATL08 30m h5 granules..')
merged = ATL08_h5_GRANULE_df.merge(ATL08_csv_GRANULE_df, how='left', on='granule_name')

print('Finding the null matches (the granules that still need extracted csvs...')
INPUT_ATL08_GRANULE_LIST = merged[merged.csv_path.isnull()].h5_path.to_list()
print(f'# of extractions still needed:\t{len(INPUT_ATL08_GRANULE_LIST)}\n')

Matching extracted csv to ATL08 30m h5 granules..
Finding the null matches (the granules that still need extracted csvs...
# of extractions still needed:	12283

CPU times: user 1min 5s, sys: 653 ms, total: 1min 6s
Wall time: 1min 56s


In [54]:
TEST_DPS  = False

if TEST_DPS:
    print('Running DPS on a SUBSET list of input')
    DPS_INPUT_ATL08_GRANULE_LIST = INPUT_ATL08_GRANULE_LIST[-100:]
else:
    LIST_SIZE = 12283
    print(f'Running DPS on the {LIST_SIZE} SUBSET list of input')
    DPS_INPUT_ATL08_GRANULE_LIST = INPUT_ATL08_GRANULE_LIST[0:LIST_SIZE]
    
DPS_INPUT_ATL08_GRANULE_LIST[-1]

Running DPS on the 12283 SUBSET list of input


'https://maap-ops-workspace.s3.amazonaws.com/lduncanson/dps_output/run_rebinning_ubuntu/master/2022/03/05/04/47/47/547100/ATL08_30m_20210715133745_03351202_005_01.h5'

## Run a DPS job across the list

In [32]:
%%time
submit_results_df_list = []
len_input_list = len(DPS_INPUT_ATL08_GRANULE_LIST)

for i, INPUT_ATL08_GRANULE in enumerate(DPS_INPUT_ATL08_GRANULE_LIST):

    DPS_num = i+1
    IDENTIFIER = 'run_extract_filter_atl08'
    ALGO_ID = f'{IDENTIFIER}_ubuntu'
    USER = 'lduncanson'
    WORKER_TYPE = 'maap-dps-worker-8gb'
    
    in_param_dict = {
                        'input_file': INPUT_ATL08_GRANULE
                    }

    submit_result = maap.submitJob(
            identifier=IDENTIFIER,
            algo_id=ALGO_ID,
            version='master',
            username=USER, # username needs to be the same as whoever created the workspace
            queue=WORKER_TYPE,
            **in_param_dict
        )
    
    # Build a dataframe of submission details
    submit_result['dps_num'] = DPS_num
    submit_result['tile_num'] = INPUT_ATL08_GRANULE
    submit_result['submit_time'] = datetime.datetime.now()
    submit_result['dbs_job_hour'] = datetime.datetime.now().hour
    submit_result['algo_id'] = ALGO_ID
    submit_result['user'] = USER
    submit_result['worker_type'] = WORKER_TYPE
    
    # Append to a list of data frames of submission results
    submit_results_df_list.append(pd.DataFrame([submit_result]))
    
    if DPS_num in [1, 100, 500, 1000, 1500, 2000, 3000, 5000, 7000, 9000, 11000, 13000, 15000, 17000, 19000, 21000, 24000, len_input_list]:
        print(f"DPS run #: {DPS_num}\t| granule name: {os.path.basename(INPUT_ATL08_GRANULE)}\t| submit status: {submit_result['status']}\t| job id: {submit_result['job_id']}") 

# Build a final submission results df and save
submit_results_df = pd.concat(submit_results_df_list)
nowtime = pd.Timestamp.now().strftime('%Y%m%d%H%M')
submit_results_df.to_csv(f'/projects/my-public-bucket/DPS_{IDENTIFIER}_submission_results_{len_input_list}_{nowtime}.csv')
submit_results_df

DPS run #: 1	| granule name: ATL08_30m_20181014051811_02380106_005_01.h5	| submit status: success	| job id: 14300080-dbb9-48f6-b09a-b2da43dd71ef
DPS run #: 100	| granule name: ATL08_30m_20181206074142_10490105_005_01.h5	| submit status: success	| job id: 5ca5820c-fee4-40bd-a0bc-8d007707db27
CPU times: user 1.2 s, sys: 102 ms, total: 1.31 s
Wall time: 32.5 s


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type
0,success,200,14300080-dbb9-48f6-b09a-b2da43dd71ef,1,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:44:41.138078,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,cc3a1c14-5814-4b7f-8767-02c45976debf,2,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:44:41.303006,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,dc77ede4-ceee-4991-8830-a525cb3fa648,3,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:44:49.258201,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,7d3be3a7-cc97-41dd-8bd5-0476571558c0,4,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:44:49.335834,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,aa0c915b-f434-4c66-9a53-33a1dda2679d,5,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:44:49.456240,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
...,...,...,...,...,...,...,...,...,...,...
0,success,200,1e134e59-1341-4858-98eb-7656434010fb,96,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:45:12.320036,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,d9348359-a4cd-4775-8402-eb9eac1750e9,97,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:45:12.509542,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,1c42e394-d1a0-467b-a82b-b172deb7acf7,98,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:45:12.760632,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,a2f41b23-50b7-4384-8404-84f3926c0b60,99,https://maap-ops-workspace.s3.amazonaws.com/ld...,2022-03-09 23:45:13.022425,23,run_extract_filter_atl08_ubuntu,lduncanson,maap-dps-worker-8gb


After almost any DPS job, you have to assess what succeeded and failed. This involves:
1. building a table of job status based on job ids captured in the job_results_df from the DPS run chunk (this takes 40 mins for ~47k jobs) --> this tells you how many jobs failed
2. merging the job status table with the job results df --> this tells you which specific granules (or tile nums) failed
3. building another input list of granules for a follow-up DPS
## Assess DPS results
Build a table of job status based on job id - how many jobs failed?

In [49]:
%%time
def BUILD_TABLE_JOBSTATUS(submit_results_df):
    import xmltodict
    df = pd.concat([pd.DataFrame(xmltodict.parse(maap.getJobStatus(job_id).content)).transpose() for job_id in submit_results_df.job_id.to_list()])
    return df

job_status_df = BUILD_TABLE_JOBSTATUS(submit_results_df)
#print(job_status_df.head())

num_jobs = submit_results_df.shape[0]
z = submit_results_df.merge(job_status_df, how='left', left_on='job_id', right_on='wps:JobID')

print(f'Count total jobs:\t{num_jobs}')
print(f"Count pending jobs:\t{z[z['wps:Status'] =='Accepted'].shape[0]}")
print(f"Count running jobs:\t{z[z['wps:Status'] =='Running'].shape[0]}")
print(f"Count succeeded jobs:\t{z[z['wps:Status'] =='Succeeded'].shape[0]}")
print(f"Count failed jobs:\t{z[z['wps:Status'] =='Failed'].shape[0]}")
print(f"% of failed jobs:\t{round(z[z['wps:Status'] =='Failed'].shape[0] / ( z[z['wps:Status'] =='Failed'].shape[0] + z[z['wps:Status'] =='Succeeded'].shape[0] ), 4) * 100}")

Count total jobs:	100
Count pending jobs:	0
Count running jobs:	39
Count succeeded jobs:	43
Count failed jobs:	18
% of failed jobs:	29.509999999999998
CPU times: user 1.07 s, sys: 67.9 ms, total: 1.13 s
Wall time: 3.08 s


In [47]:
xmltodict.parse(maap.getJobResult(z[z['wps:Status'] =='Failed'].iloc[15].job_id).content)

OrderedDict([('wps:Result',
              OrderedDict([('@xmlns:ows', 'http://www.opengis.net/ows/2.0'),
                           ('@xmlns:schemaLocation',
                            'http://schemas.opengis.net/wps/2.0/wps.xsd'),
                           ('@xmlns:wps', 'http://www.opengis.net/wps/2.0'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('wps:JobID',
                            'f45e0863-6de1-4d69-95cb-971ff74cf9b9'),
                           ('wps:Output',
                            OrderedDict([('@id', 'traceback'),
                                         ('wps:Data',
                                          'Not a conda environment: /app/icesat2_boreal/dps/alg_2-3/run_extract_filter_atl08.sh\n+ /app/icesat2_boreal/dps/alg_2-3/run_extract_filter_atl08.sh\nERROR: Exception:\nTraceback (most recent call last):\n  File "/opt/conda/lib/python3.9/site-packages/pip/_vendor

##### Pick a job and manually test - does it produce output as expected?
if so, then AWS had some issue during the DPS: contact MAAP dev about this

In [12]:
f = fails_df.iloc[1].tile_num.replace('https://maap-ops-workspace.s3.amazonaws.com/lduncanson','/projects/my-private-bucket')
!python /projects/icesat2_boreal/dps/alg_2-3/extract_filter_atl08.py --i $f --no-filter-qual --do_30m -o /projects/my-public-bucket/test_output


Written by:
	Nathan Thomas	| @Nmt28
	Paul Montesano	| paul.m.montesano@nasa.gov

Min lat: 45.0
Max lat: 75.0
Min lon: -180.0
Max lon: 180.0
Month range: 6-9

ATL08 granule name: 	ATL08_30m_20210714235338_03261206_005_01
Input dir: 		/projects/my-private-bucket/dps_output/run_rebinning_ubuntu/master/2022/03/05/04/51/06/341661

Segment length: 30m
Find src nodata value using max of h_can: 	nan

Building pandas dataframe...
Setting pandas df nodata values to np.nan for some basic eval.
# of ATL08 obs: 		211006
# of ATL08 obs (can pho.>=0): 	77997
# of ATL08 obs (toc pho.>=0): 	77997
# of ATL08 obs (h_can>=0): 	77905
# of ATL08 obs (h_can<0): 	92
Setting out pandas df nodata values: 	3.4028234663852886e+38
Quality Filtering: 	[OFF] (do downstream)
Geographic Filtering: 	[ON] xmin = -180.0, xmax = 180.0, ymin = 45.0, ymax = 75.0
Creating CSV: 		/projects/my-public-bucket/test_output/ATL08_30m_20210714235338_03261206_005_01_30m.csv


### Compile granules of the fails for another DPS batch (Round2)

In [181]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [189]:
import pprint
DPS_INPUT_ATL08_GRANULE_LIST_1, DPS_INPUT_ATL08_GRANULE_LIST_2, DPS_INPUT_ATL08_GRANULE_LIST_3, DPS_INPUT_ATL08_GRANULE_LIST_4, DPS_INPUT_ATL08_GRANULE_LIST_5 = list(chunks(DPS_INPUT_ATL08_GRANULE_LIST, 5000))
print(
    len(DPS_INPUT_ATL08_GRANULE_LIST_1), len(DPS_INPUT_ATL08_GRANULE_LIST_2), len(DPS_INPUT_ATL08_GRANULE_LIST_3), len(DPS_INPUT_ATL08_GRANULE_LIST_4), len(DPS_INPUT_ATL08_GRANULE_LIST_5)
)

5000 5000 5000 5000 280


### Examine output of DPS - but replace glob.glob with s3.glob

In [19]:
#print(f"The data frame show you submitted {len(job_results_df)} jobs. Check the returned results to see if the total returned = total submitted...")
for JOB_HOUR in range(20,21):
    returned_results_list = glob.glob(f"/projects/my-private-bucket/dps_output/run_extract_filter_atl08_ubuntu/master/2022/03/04/{JOB_HOUR}/**/*.csv", recursive=True)
    print(f"For DPS job that returned results in hour {JOB_HOUR}, # granules that ran: {len(returned_results_list)}")

For DPS job that returned results in hour 20, # granules that ran: 10


In [32]:
# Merge all files in the list
print("Creating pandas data frame...")
atl08_gdf = pd.concat((pd.read_csv(f) for f in returned_results_list ), sort=False, ignore_index=True) # <--generator is (), list is []
atl08_gdf = gpd.GeoDataFrame(atl08_gdf, geometry=gpd.points_from_xy(atl08_gdf.lon, atl08_gdf.lat), crs='epsg:4326')

Creating pandas data frame...


In [34]:
atl08_gdf.head()

Unnamed: 0,fid,lon,lat,dt,orb_orient,orb_num,rgt,gt,segid_beg,segid_end,...,asr,h_dif_ref,ter_flg,ph_rem_flg,dem_rem_flg,seg_wmask,lyr_flg,seg_cover,granule_name,geometry
0,14887,75.620664,45.00049,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249813.842915,249817.842915,...,0.158199,-1.342682,1,0,0,0,0,48,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62066 45.00049)
1,14888,75.620628,45.000758,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249815.336919,249819.336919,...,0.158199,-1.342682,1,0,0,0,0,48,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62063 45.00076)
2,14889,75.620401,45.002371,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249824.331298,249828.331298,...,0.158199,-1.073212,1,0,0,0,0,42,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62040 45.00237)
3,14890,75.620364,45.00264,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249825.83088,249829.83088,...,0.158199,-1.073212,1,0,0,0,0,42,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62036 45.00264)
4,14891,75.620291,45.003178,b'2019-04-08T02:04:27.000000Z',0,3125,150,b'gt1r',249828.83088,249832.83088,...,0.129356,-1.743439,1,0,0,0,0,24,ATL08_30m_20190408015557_01500302_005_01.h5,POINT (75.62029 45.00318)


In [27]:
os.system( "python /projects/icesat2_boreal/lib/build_tindex_master.py -t ATL08 -dps_year 2022 -m 3 --start_day '04' --outdir /projects/my-public-bucket/test_output")

512