# DPS to extract raster covars to ATL08 geodataframes
Paul Montesano, PhD

In [5]:
from maap.maap import MAAP
maap = MAAP()

In [6]:
import sys
ICESAT2_BOREAL_REPO_PATH = '/projects/code/icesat2_boreal'               #'/projects/icesat2_boreal' # /projects/Developer/icesat2_boreal/lib
ICESAT2_BOREAL_LIB_PATH = ICESAT2_BOREAL_REPO_PATH + '/lib'
sys.path.append(ICESAT2_BOREAL_LIB_PATH)
#!pip install -U -r $ICESAT2_BOREAL_REPO_PATH/dps/requirements_main.txt

In [7]:
import geopandas as gpd
import pandas as pd
import os
import json
import collections
import numpy as np
import sys
import s3fs
import matplotlib.pyplot as plt
sys.path.append(ICESAT2_BOREAL_LIB_PATH)
import maplib_folium
import ExtractUtils
from folium import TileLayer
print("Importing packages complete.")

NASA MAAP
Importing packages complete.


In [8]:
import importlib
import mosaiclib
importlib.reload(mosaiclib)

<module 'mosaiclib' from '/projects/code/icesat2_boreal/lib/mosaiclib.py'>

In [9]:
from mosaiclib import *

### Use MAAP Registration call in notebook chunk to register DPS algorithm
 - We need to register a DPS algorithm called `run_extract_covars` before proceeding to the chunks below...

In [193]:
maap.register_algorithm_from_yaml_file("/projects/code/icesat2_boreal/dps/registered/run_extract_covars.yml").text

'{"code": 200, "message": {"id": "27320fd65bcf65b12d6badf40ebd3a3559d3367f", "short_id": "27320fd6", "created_at": "2024-02-17T21:03:04.000+00:00", "parent_ids": ["9f13ea03043f86f6cd4cd57c76b0fecbcc5b1f56"], "title": "Registering algorithm: run_tile_atl08", "message": "Registering algorithm: run_tile_atl08", "author_name": "root", "author_email": "root@156a1941fa17", "authored_date": "2024-02-17T21:03:04.000+00:00", "committer_name": "root", "committer_email": "root@156a1941fa17", "committed_date": "2024-02-17T21:03:04.000+00:00", "trailers": {}, "web_url": "https://repo.maap-project.org/root/register-job-hysds-v4/-/commit/27320fd65bcf65b12d6badf40ebd3a3559d3367f", "stats": {"additions": 2, "deletions": 2, "total": 4}, "status": "pending", "project_id": 3, "last_pipeline": {"id": 12191, "iid": 811, "project_id": 3, "sha": "27320fd65bcf65b12d6badf40ebd3a3559d3367f", "ref": "main", "status": "pending", "source": "push", "created_at": "2024-02-17T21:03:05.393Z", "updated_at": "2024-02-17T

In [11]:
s3 = s3fs.S3FileSystem(anon=True)

In [14]:
DPS_FILE_LIST = ["s3://"+f for f in s3.glob("s3://maap-ops-workspace/montesano/data/atl08.v006/030m/*.parquet")]
DPS_FILE_LIST[100]

's3://maap-ops-workspace/montesano/data/atl08.v006/030m/atl08_006_030m_2020_2020_06_09_filt_00127.parquet'

In [10]:
in_param_dict = {
    's3_atl08_gdf_fn': '',
    'tindex_fn_list': [TOPO_TINDEX_FN_DICT['c2020updated'], HLS_TINDEX_FN_DICT['2020'], SAR_TINDEX_FN_DICT['2020'] ]
 }
in_param_dict

{'in_tile_num': '',
 'tindex_fn_list': ['s3://maap-ops-workspace/shared/montesano/DPS_tile_lists/run_build_stack_topo/build_stack_v2023_2/CopernicusGLO30/Topo_tindex_master.csv',
  's3://maap-ops-workspace/shared/montesano/DPS_tile_lists/HLS/HLS_stack_2023_v1/HLS_H30_2020/HLS_tindex_master.csv',
  's3://maap-ops-workspace/shared/montesano/DPS_tile_lists/run_build_stack/build_stack_v2023_2/build_stack_S1/SAR_S1_2020/S1_tindex_master.csv']}

## Run a DPS job across the list

In [45]:
IDENTIFIER = '2020'

In [46]:
# MAAP algorithm version name (same as the repo TAG)
MAAP_VERSION = "extract_covars" 

In [47]:
ALGO_ID = "run_extract_covars"
USER = 'montesano'
WORKER_TYPE = 'maap-dps-worker-8gb'

In [56]:
%%time
submit_results_df_list = []
len_input_list = len(DPS_FILE_LIST)
print(f"# of input tiles for DPS: {len_input_list}")

for i, INPUT_FILE in enumerate(DPS_FILE_LIST):

    DPS_num = i+1
    
    in_param_dict['s3_atl08_gdf_fn'] = INPUT_FILE
    INPUT_TILE_NUM = int(INPUT_FILE.split('_')[-1].split('.')[0])
    
    submit_result = maap.submitJob(
                                    identifier=IDENTIFIER,
                                    algo_id=ALGO_ID,
                                    version=MAAP_VERSION, 
                                    username=USER,
                                    queue=WORKER_TYPE,
                                    # Args that match yaml
                                    **in_param_dict
        )
    
    # Build a dataframe of submission details
    submit_result_df = pd.DataFrame( 
        {
                'dps_num':[DPS_num],
                'tile_num':[INPUT_TILE_NUM],
                'submit_time':[datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%s')],
                'dbs_job_hour': [datetime.datetime.now().hour],
                'algo_id': [ALGO_ID],
                'user': [USER],
                'worker_type': [WORKER_TYPE],
                'job_id': [submit_result.id],
                'submit_status': [submit_result.status],
            
        } 
    )
    
    # Append to a list of data frames of submission results
    submit_results_df_list.append(submit_result_df)
    
    if DPS_num in [1, 5, 10, 50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 7000, 9000, 11000, 13000, 15000, 17000, 19000, 21000, 24000, len_input_list]:
        print(f"DPS run #: {DPS_num}\t| tile num: {INPUT_TILE_NUM}\t| submit status: {submit_result.status}\t| job id: {submit_result.id}") 
        
# Build a final submission results df and save
submit_results_df = pd.concat(submit_results_df_list)
submit_results_df['run_name'] = IDENTIFIER
nowtime = pd.Timestamp.now().strftime('%Y%m%d%H%M')
print(f"Current time:\t{nowtime}")
submit_results_df.to_csv(f'/projects/my-public-bucket/dps_submission_results/DPS_{ALGO_ID}_{IDENTIFIER}_submission_results_{len_input_list}_{nowtime}.csv')
submit_results_df.info()

# of input tiles for DPS: 4
DPS run #: 1	| tile num: 41807	| submit status: success	| job id: 938b3ea6-1381-49b0-9a66-745767e68e06
DPS run #: 4	| tile num: 3916	| submit status: success	| job id: e6b305da-2b85-4952-8d53-9d0c5252c07e
Current time:	202402181737
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 0
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   dps_num        4 non-null      int64 
 1   tile_num       4 non-null      int64 
 2   submit_time    4 non-null      object
 3   dbs_job_hour   4 non-null      int64 
 4   algo_id        4 non-null      object
 5   user           4 non-null      object
 6   worker_type    4 non-null      object
 7   job_id         4 non-null      object
 8   submit_status  4 non-null      object
 9   run_name       4 non-null      object
dtypes: int64(3), object(7)
memory usage: 352.0+ bytes
CPU times: user 86.8 ms, sys: 11.3 ms, total: 98.2 ms
Wall time: 665 ms
