In [145]:
from maap.maap import MAAP
maap = MAAP(maap_host='api.ops.maap-project.org')

# Launch DPS for tile_atl08.py

In [146]:
import os
import geopandas
import pandas as pd
import glob
import datetime

In [147]:
def get_stack_fn(stack_list_fn, in_tile_num):
    # Find most recent topo/Landsat stack path for tile in list of stack paths from *tindex_master.csv
    all_stacks_df = pd.read_csv(stack_list_fn)
    stack_for_tile = all_stacks_df[all_stacks_df['location'].str.contains("_"+str(in_tile_num))]
    [print(i) for i in stack_for_tile.path.to_list()]
    stack_for_tile_fn = stack_for_tile.path.to_list()[0]
    if len(stack_for_tile)==0:
        stack_for_tile_fn = None
    return(stack_for_tile_fn)

# nmt added: code that returns df of landsat locations and tile number
# This is basically CountOutput.py
def get_stack_df(dps_dir, TYPE, dps_year):
    
    if "Landsat" in TYPE:
        root = f"/projects/my-private-bucket/dps_output/do_landsat_stack_3-1-2_ubuntu/ops/{dps_year}/"
        ends_with_str = "_dps.tif"
    if "Topo" in TYPE:
        root = f"/projects/my-private-bucket/dps_output/do_topo_stack_3-1-5_ubuntu/ops/{dps_year}/"
        ends_with_str = "_stack.tif"
    if "ATL08" in TYPE:
        root = f"/projects/my-private-bucket/dps_output/run_extract_ubuntu/ops/{dps_year}/"
        ends_with_str = "0m.csv"
            
    df = pd.DataFrame(columns=['location', 'tile_num'])

    for dir, subdir, files in os.walk(root):
        for fname in files:
            if fname.endswith(ends_with_str): 
                 
                tile_num = fname.split('_')[1]
                   
                if "ATL08" in TYPE:
                    df = df.append({'location':os.path.join(dir+"/", fname)},ignore_index=True)
                else:
                    df = df.append({'location':os.path.join(dir+"/", fname), 'tile_num':tile_num},ignore_index=True)
        
    return df

#### Set the names of the data frames to create

In [148]:
# Topo and Landsat tindex_master csvs from build_tindex_master.py
topo_tindex = "/projects/my-public-bucket/DPS_tile_lists/Topo_tindex_master.csv"
landsat_tindex = "/projects/my-public-bucket/DPS_tile_lists/Landsat_tindex_master.csv"

# Model-ready subset of tiles for which Topo and Landsat coincide
model_ready_tiles_topo = "/projects/my-public-bucket/DPS_tile_lists/model_ready_tiles_topo_paths.csv"
model_ready_tiles_landsat = "/projects/my-public-bucket/DPS_tile_lists/model_ready_tiles_landsat_paths.csv"

## Make the data frames from build_tindex_master.py csvs for Topo and Landsat tiles
python lib/build_tindex_master.py

In [5]:
if os.path.isfile(landsat_tindex) and os.path.isfile(topo_tindex):
    print('Reading existing...')
    ls8_df = pd.read_csv(landsat_tindex)
    topo_df = pd.read_csv(topo_tindex)
else:
    s3_stem = 'https://s3.console.aws.amazon.com/s3/buckets/maap-ops-workspace/nathanmthomas'
    local_stem = '/projects/my-private-bucket'

    ls8_root =  s3_stem + '/dps_output/do_landsat_stack_3-1-2_ubuntu'
    topo_root = s3_stem + '/dps_output/do_topo_stack_3-1-5_ubuntu'
    
    ls8_df = get_stack_df(ls8_root, "Landsat")
    topo_df = get_stack_df(topo_root, "Topo")
topo_df.head()

Reading existing...


Unnamed: 0.1,Unnamed: 0,local_path,tile_num
0,0,/projects/my-private-bucket/dps_output/do_topo...,421
1,1,/projects/my-private-bucket/dps_output/do_topo...,455
2,2,/projects/my-private-bucket/dps_output/do_topo...,456
3,3,/projects/my-private-bucket/dps_output/do_topo...,491
4,4,/projects/my-private-bucket/dps_output/do_topo...,492


## Get tile ids for which both Topo and Landsat stacks exist

In [149]:
# added by nmt: get filenames of co-incident landsat and topo

topo_sub_df = pd.DataFrame(columns=['local_path','tile_num'])
ls8_sub_df = pd.DataFrame(columns=['local_path','tile_num'])

for i in range(len(ls8_df['tile_num'])):
    ls_tile_num = ls8_df['tile_num'][i]
    for j in range(len(topo_df['tile_num'])):
        topo_tile_num = topo_df['tile_num'][j]
        if ls_tile_num == topo_tile_num:
            # Only need to choose one, but we'll do 2 and then check
            ls8_sub_df = ls8_sub_df.append({'local_path':ls8_df['local_path'][i],'tile_num':ls8_df['tile_num'][i].astype(int)}, ignore_index=True)
            topo_sub_df = topo_sub_df.append({'local_path':topo_df['local_path'][j],'tile_num':topo_df['tile_num'][j].astype(int)}, ignore_index=True)

#ls8_sub_df['tile_num'] = ls8_sub_df['tile_num'].astype(float, errors = 'raise')
print(ls8_sub_df.head())
print(topo_sub_df.head())
print(len(ls8_sub_df),len(topo_sub_df))

topo_sub_df.to_csv( model_ready_tiles_topo, index=False, encoding='utf-8-sig')
ls8_sub_df.to_csv( model_ready_tiles_landsat, index=False, encoding='utf-8-sig')

KeyboardInterrupt: 

#### Now you have a set of tile ids for which both Landsat and Topo stacks exist

In [150]:
topo_sub_df = pd.read_csv("/projects/my-public-bucket/DPS_tile_lists/model_ready_tiles_topo_paths.csv")
INPUT_TILE_NUM_LIST = topo_sub_df['tile_num'].values.astype(int).tolist()
len(INPUT_TILE_NUM_LIST)

4465

##### Test: get a subset of tile ids for Norway tiles

In [151]:
INPUT_TILE_NUM_LIST_NORWAY = pd.read_csv('/projects/my-public-bucket/misc_files/norway_tiles.csv').layer.tolist()
len(INPUT_TILE_NUM_LIST_NORWAY)

123

#### Read in the latest tindex and compare with a previous set of completed tiles to see which ones still need to be run

In [152]:
import numpy as np
tiles_completed = pd.read_csv('/projects/my-public-bucket/DPS_tile_lists/ATL08_filt_tindex_master.csv')
print(f'Tiles completed: {len(tiles_completed)}')
tile_nums_missing = np.setdiff1d(INPUT_TILE_NUM_LIST, tiles_completed.tile_num)
print(f'Tiles missing: {len(tile_nums_missing)}')
INPUT_TILE_NUM_LIST = tile_nums_missing.tolist()
len(INPUT_TILE_NUM_LIST)
#print(INPUT_TILE_NUM_LIST)

Tiles completed: 106
Tiles missing: 4325


4325

In [153]:
tindex_master_fn = f'/projects/shared-buckets/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv'
tiles = pd.read_csv(tindex_master_fn)
len(tiles)

5921

In [154]:
TEST_DPS  = True

if TEST_DPS:
    DPS_INPUT_TILE_NUM_LIST = INPUT_TILE_NUM_LIST_NORWAY #INPUT_TILE_NUM_LIST[-10:]
    DPS_INPUT_TILE_NUM_LIST = DPS_INPUT_TILE_NUM_LIST[0:10]
else:
    DPS_INPUT_TILE_NUM_LIST = INPUT_TILE_NUM_LIST
    
print(DPS_INPUT_TILE_NUM_LIST)


[131, 132, 133, 4, 5, 6, 7, 270, 271, 9]


#### Customize the DPS run: set up the parameters dictionary

In [155]:
# Norway test 01
# Just include sol_el so we can use sol_el < 5
in_param_dict_norway01 = {
                        'in_tile_num': '',
                        'in_tile_fn': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v002.gpkg',
                        'in_tile_layer': 'boreal_tiles_v002',
                        'csv_list_fn': 's3://maap-ops-workspace/shared/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv',
                        'topo_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv',
                        'landsat_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Landsat_tindex_master.csv',
                        'years_list': '2019 2020 2021',
                        'user_stacks': 'nathanmthomas',
                        'user_atl08': 'lduncanson',
                        'thresh_sol_el': 5,
                        'v_ATL08': 4,
                        'minmonth': 6,
                        'maxmonth': 9
    }
# Norway test 02
# Use v005 ATL08, which will apply lc-based thresholds, extend to all months
# NOTE!! make sure you manually update to use the correct filter in tile_atl08.py
in_param_dict_norway02 = {
                        'in_tile_num': '',
                        'in_tile_fn': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v002.gpkg',
                        'in_tile_layer': 'boreal_tiles_v002',
                        'csv_list_fn': 's3://maap-ops-workspace/shared/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv',
                        'topo_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv',
                        'landsat_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Landsat_tindex_master.csv',
                        'years_list': '2019 2020 2021',
                        'user_stacks': 'nathanmthomas',
                        'user_atl08': 'lduncanson',
                        'thresh_sol_el': 5,
                        'v_ATL08': 5,
                        'minmonth': 1,
                        'maxmonth': 12
    }

In [156]:
in_param_dict = in_param_dict_norway01
in_param_dict

{'in_tile_num': '',
 'in_tile_fn': 'https://maap-ops-workspace.s3.amazonaws.com/shared/nathanmthomas/boreal_tiles_v002.gpkg',
 'in_tile_layer': 'boreal_tiles_v002',
 'csv_list_fn': 's3://maap-ops-workspace/shared/lduncanson/DPS_tile_lists/ATL08_tindex_master.csv',
 'topo_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Topo_tindex_master.csv',
 'landsat_stack_list_fn': 's3://maap-ops-workspace/shared/nathanmthomas/DPS_tile_lists/Landsat_tindex_master.csv',
 'years_list': '2019 2020 2021',
 'user_stacks': 'nathanmthomas',
 'user_atl08': 'lduncanson',
 'thresh_sol_el': 5,
 'v_ATL08': 4,
 'minmonth': 6,
 'maxmonth': 9}

## Run a DPS job across the list

In [157]:
job_results_list = []

print(f"# of input tiles for DPS: {len(DPS_INPUT_TILE_NUM_LIST)}")
for i, INPUT_TILE_NUM in enumerate(DPS_INPUT_TILE_NUM_LIST):

    DPS_num = i+1
    IDENTIFIER = 'run_tile_atl08'
    ALGO_ID = f'{IDENTIFIER}_ubuntu'
    USER = 'lduncanson'
    WORKER_TYPE = 'maap-dps-worker-8gb'
    
    in_param_dict['in_tile_num'] = INPUT_TILE_NUM
    
    submit_result = maap.submitJob(
            identifier=IDENTIFIER,
            algo_id=ALGO_ID,
            version='master',
            username=USER, # username needs to be the same as whoever created the workspace
            queue=WORKER_TYPE,
            **in_param_dict
        )
    
    # Build a dataframe of submission details
    submit_result['dps_num'] = DPS_num
    submit_result['tile_num'] = INPUT_TILE_NUM
    submit_result['submit_time'] = datetime.datetime.now()
    submit_result['dbs_job_hour'] =datetime.datetime.now().hour
    submit_result['algo_id'] = ALGO_ID
    submit_result['user'] = USER
    submit_result['worker_type'] = WORKER_TYPE
    job_results_list.append(pd.DataFrame([submit_result]))
    
    if DPS_num in [1, 100, 500, 1000, 3000, len(DPS_INPUT_TILE_NUM_LIST)]:
        #print(f"DPS run #: {DPS_num} | tile num: {INPUT_TILE_NUM} | job info: {submit_result}") 
        print(f"job info: {submit_result}") 
    
job_results_df = pd.concat(job_results_list)
job_results_df

# of input tiles for DPS: 10
job info: {'status': 'success', 'http_status_code': 200, 'job_id': '93540a8a-2f42-4582-9cfa-d12f1b67693d', 'dps_num': 1, 'tile_num': 131, 'submit_time': datetime.datetime(2022, 3, 2, 16, 13, 3, 825700), 'dbs_job_hour': 16, 'algo_id': 'run_tile_atl08_ubuntu', 'user': 'lduncanson', 'worker_type': 'maap-dps-worker-8gb'}
job info: {'status': 'success', 'http_status_code': 200, 'job_id': '87a27ec8-b21b-42cb-aadf-3efb7c5cdc8f', 'dps_num': 10, 'tile_num': 9, 'submit_time': datetime.datetime(2022, 3, 2, 16, 13, 4, 778421), 'dbs_job_hour': 16, 'algo_id': 'run_tile_atl08_ubuntu', 'user': 'lduncanson', 'worker_type': 'maap-dps-worker-8gb'}


Unnamed: 0,status,http_status_code,job_id,dps_num,tile_num,submit_time,dbs_job_hour,algo_id,user,worker_type
0,success,200,93540a8a-2f42-4582-9cfa-d12f1b67693d,1,131,2022-03-02 16:13:03.825700,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,73f800cf-3483-490b-9d3f-09a1cd8a0e81,2,132,2022-03-02 16:13:03.907653,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,f84f6e64-8a02-40b8-b5a8-b6ed108e81f6,3,133,2022-03-02 16:13:04.005227,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,368b139f-4673-4bd0-a74b-3ec53147d2da,4,4,2022-03-02 16:13:04.127023,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,97f27dfd-2781-4ff6-8143-3c7ac3b2f53b,5,5,2022-03-02 16:13:04.268813,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,06cb2b1f-cb91-4ce8-8fd7-d9b77ddd4667,6,6,2022-03-02 16:13:04.373147,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,1f476ce7-cf98-4927-8646-c9922ffb8891,7,7,2022-03-02 16:13:04.450345,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,eebf6abe-324c-4baf-b194-1164ab0860bc,8,270,2022-03-02 16:13:04.560460,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,f03d20ac-12e7-4be3-b143-d188e04b6b5c,9,271,2022-03-02 16:13:04.684745,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb
0,success,200,87a27ec8-b21b-42cb-aadf-3efb7c5cdc8f,10,9,2022-03-02 16:13:04.778421,16,run_tile_atl08_ubuntu,lduncanson,maap-dps-worker-8gb


#### Quick check of a deep DPS dir to see what was returned from the run above

In [158]:
print(f"The data frame show you submitted {len(job_results_df)} jobs. Check the returned results to see if the total returned = total submitted...")
for JOB_HOUR in range(1,25):
    returned_results_list = glob.glob(f"/projects/my-private-bucket/dps_output/run_tile_atl08_ubuntu/master/2022/03/02/{JOB_HOUR}/**/_stdout.txt", recursive=True)
    print(f"For DPS job that returned results in hour {JOB_HOUR}, # tiles that ran: {len(returned_results_list)}")

The data frame show you submitted 10 jobs. Check the returned results to see if the total returned = total submitted...
For DPS job that returned results in hour 1, # tiles that ran: 0
For DPS job that returned results in hour 2, # tiles that ran: 0
For DPS job that returned results in hour 3, # tiles that ran: 0
For DPS job that returned results in hour 4, # tiles that ran: 0
For DPS job that returned results in hour 5, # tiles that ran: 0
For DPS job that returned results in hour 6, # tiles that ran: 0
For DPS job that returned results in hour 7, # tiles that ran: 0
For DPS job that returned results in hour 8, # tiles that ran: 0
For DPS job that returned results in hour 9, # tiles that ran: 0
For DPS job that returned results in hour 10, # tiles that ran: 0
For DPS job that returned results in hour 11, # tiles that ran: 0
For DPS job that returned results in hour 12, # tiles that ran: 0
For DPS job that returned results in hour 13, # tiles that ran: 0
For DPS job that returned resul

In [163]:
!pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0
[0m

In [167]:
import xmltodict
[maap.getJobStatus(job_id).content for job_id in job_results_df.job_id.to_list()]
#print(xmltodict.unparse(list_dicts, pretty=True))
#maap.listJobs(username='lduncanson')


[b'<wps:StatusInfo xmlns:ows="http://www.opengis.net/ows/2.0" xmlns:schemaLocation="http://schemas.opengis.net/wps/2.0/wps.xsd" xmlns:wps="http://www.opengis.net/wps/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><wps:JobID>93540a8a-2f42-4582-9cfa-d12f1b67693d</wps:JobID><wps:Status>Succeeded</wps:Status></wps:StatusInfo>',
 b'<wps:StatusInfo xmlns:ows="http://www.opengis.net/ows/2.0" xmlns:schemaLocation="http://schemas.opengis.net/wps/2.0/wps.xsd" xmlns:wps="http://www.opengis.net/wps/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><wps:JobID>73f800cf-3483-490b-9d3f-09a1cd8a0e81</wps:JobID><wps:Status>Succeeded</wps:Status></wps:StatusInfo>',
 b'<wps:StatusInfo xmlns:ows="http://www.opengis.net/ows/2.0" xmlns:schemaLocation="http://schemas.opengis.net/wps/2.0/wps.xsd" xmlns:wps="http://www.opengis.net/wps/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><wps:JobID>f84f6e64-8a02-40b8-b5a8-b6ed108e81f6</wps:JobID><wps:Status>Succeeded</wps:Status></wps:

In [8]:
need_df_fn = pd.read_csv("/projects/my-public-bucket/DPS_tile_lists/Need_ATL08_filt_tindex_master.csv")
INPUT_TILE_NUM_LIST = need_df_fn['tile_num'].values.astype(int).tolist()
len(INPUT_TILE_NUM_LIST)

0

In [45]:
DATE_START = '01-01' + 'T00:00:00Z' # SUMMER start
DATE_END = '12-31' + 'T23:59:59Z' # SUMMER end
version = 5

date_filters = [f'{year}-{DATE_START},{year}-{DATE_END}' for year in YEARS]
version = str(f'{version:03}')

base_query = {
'short_name':"ATL08",
'version':version, 
'bounding_box':in_bbox
}

#q3 = [build_query(copy.copy(base_query), date_filter) for date_filter in date_filters]
queries = [dict(base_query, temporal=date_filter) for date_filter in date_filters]
print(f"\tSearching MAAP for granules using these parameters: \n\t{queries}")

# query CMR as many seasons as necessary
result_chain = itertools.chain.from_iterable([maap.searchGranule(**query) for query in queries])