<hr>
<font size="6"> <b>Archive Watermaps</b></font>

<font size="5">  Transfer successful HyP3 jobs to an S3 bucket </font>
<br>
<br>

In [None]:
# Import relevant python packages
import json
from os import environ
from tqdm import tqdm

import boto3
import hyp3_sdk
from boto3.s3.transfer import TransferConfig

# Create the boto3 S3 resource handle that the bucket listing and copy steps
# later in this notebook use via `S3.Bucket(...)`.
S3 = boto3.resource('s3')

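Select the configuration file and decide whether to run interactively. When `prompt` is `True`, you will be asked to confirm or override the project name, destination bucket, and destination prefix read from the configuration file.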

In [None]:
# Location of the JSON configuration describing the project and its transfer spec.
config_file = '/Users/jrsmale/GitHub/hyp3-nasa-disasters/data_management/hkh_watermaps.json'
# Set to True to confirm or override the configured values interactively.
prompt = False

with open(config_file) as config_stream:
    config = json.load(config_stream)

project_name = config["project_name"]
transfer_spec = config["transfer_spec"]
target_bucket = transfer_spec["target_bucket"]
# When the configuration gives no prefix, the project name is used instead.
target_prefix = transfer_spec.get("target_prefix", project_name)

if prompt:
    # An empty answer keeps the value taken from the configuration file.
    project_name = input(f'HyP3 project name [{project_name}]: ') or project_name
    target_bucket = input(f'Destination bucket: [{target_bucket}]') or target_bucket
    target_prefix = input(f'Destination prefix: [{target_prefix}]') or target_prefix

Connect to HyP3 using HyP3 SDK to find a list of HyP3 jobs that are associated with our project's name.

In [None]:
# Open a HyP3 session against the configured host. The Earthdata Login
# credentials are read from the environment; `environ.get` yields None for
# any variable that is not set.
hyp3 = hyp3_sdk.HyP3(
    config['host'],
    username=environ.get('EDL_USERNAME'),
    password=environ.get('EDL_PASSWORD'),
    prompt=prompt,
)

# Look up the jobs submitted under the project's name and show them.
jobs = hyp3.find_jobs(name=project_name)
print('\n' + project_name)
print(jobs)

List the objects that already exist in our S3 bucket under the target prefix.

In [None]:
# Collect the key of every object already stored under the target prefix of
# the destination bucket, so that later steps can test membership by key.
existing_objects = S3.Bucket(target_bucket).objects.filter(Prefix=f'{target_prefix}/')
project_contents = {existing.key for existing in existing_objects}

Then, we will check all succeeded jobs and find which of their files aren't yet present in our S3 bucket. This prevents us from uploading the same files repeatedly to the cloud.

In [None]:
# Build one copy record per (succeeded job, extension) pair whose target key
# is not already present in `project_contents`.
objects_to_copy = []
for job in tqdm(jobs):
    if job.succeeded():
        # Only the job's first file entry is consulted; its key is presumably
        # the product zip, since '.zip' is swapped for each extension below.
        s3_location = job.files[0]['s3']
        source_bucket = s3_location['bucket']
        zip_key = s3_location['key']
        for ext in config["transfer_spec"]["extensions"]:
            source_key = zip_key.replace('.zip', ext)
            # The job id inside the key is replaced by the destination prefix.
            target_key = source_key.replace(job.job_id, target_prefix)
            if target_key in project_contents:
                continue
            objects_to_copy.append(dict(
                source_bucket=source_bucket,
                source_key=source_key,
                target_bucket=target_bucket,
                target_key=target_key,
            ))

We now copy these new files to our S3 bucket.

In [None]:
print(f'\nFound {len(objects_to_copy)} new files to copy to s3://{target_bucket}/{target_prefix}/')
if prompt:
    input('Press Enter to continue, Ctrl-c to cancel')

# 104857600 bytes = 100 MiB. The same value is used as both the multipart
# threshold and the multipart chunk size for the managed copy.
chunk_size = 104857600
# The transfer configuration does not vary per object, so build it once.
transfer_config = TransferConfig(multipart_threshold=chunk_size, multipart_chunksize=chunk_size)

for object_to_copy in objects_to_copy:
    # Take source and destination from the record being processed. Relying on
    # loop variables left over from the cell that built `objects_to_copy`
    # would copy the same (last-seen) object on every iteration.
    copy_source = {
        'Bucket': object_to_copy['source_bucket'],
        'Key': object_to_copy['source_key'],
    }
    bucket = S3.Bucket(object_to_copy['target_bucket'])
    bucket.copy(CopySource=copy_source, Key=object_to_copy['target_key'], Config=transfer_config)