In [1]:
import os
import tempfile
import numpy as np

import boto3 as boto
import iris

# --- AWS / SQS configuration -------------------------------------------------
SQS_ENDPOINT_URL = "https://sqs.eu-west-2.amazonaws.com/"
JOB_QUEUE_NAME = "mogreps-conversion"
# Credentials are read from the standard AWS environment variables when they
# are set, so real secrets never have to be typed into (and saved with) the
# notebook. The literal fallbacks are the original placeholder values.
AWS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "FOO")
AWS_SECRET_KEY_ID = os.environ.get("AWS_SECRET_ACCESS_KEY", "BAR")
# AWS account id; it forms the path component of the queue URL below.
USER_ID = "021908831235"
# Names of the queues that receive the URI of each failed / completed job.
FAILED_QUEUE_NAME = "mogreps-conversion-failed"
PASSED_QUEUE_NAME = "mogreps-conversion-completed"
# Full URL of the queue that conversion jobs are read from.
JOB_QUEUE_URL = SQS_ENDPOINT_URL + USER_ID + "/" + JOB_QUEUE_NAME


# Opt in to iris' newer netCDF behaviour through its FUTURE flags.
# NOTE(review): presumably `netcdf_no_unlimited` stops iris writing an
# unlimited dimension on save and `netcdf_promote` promotes extra netCDF
# variables to cubes on load - confirm against the installed iris version.
iris.FUTURE.netcdf_no_unlimited = True
iris.FUTURE.netcdf_promote = True


def setup_aws_session():
    """Install the default boto3 session for eu-west-2 using the module credentials."""
    session_options = {
        "region_name": "eu-west-2",
        "aws_access_key_id": AWS_KEY_ID,
        "aws_secret_access_key": AWS_SECRET_KEY_ID,
    }
    boto.setup_default_session(**session_options)


def parse_s3_uri(s3_uri):
    """Split an ``s3://bucket/key`` URI into ``(bucket, key, extension)``.

    The bucket is the third ``/``-separated segment of the URI, the key is
    everything after it, and the extension is whatever ``os.path.splitext``
    reports for the whole URI (including the leading dot, or ``""``).

    Raises:
        IndexError: if the URI has fewer than three ``/``-separated segments.
    """
    # Only the first three separators matter; the remainder is the key.
    segments = s3_uri.split("/", 3)
    extension = os.path.splitext(s3_uri)[1]
    return segments[2], "/".join(segments[3:]), extension


def download_object(s3_uri):
    """Download the S3 object at ``s3_uri`` into a named temporary file.

    The temporary file's suffix is the extension parsed from the URI. The open
    file object is returned (use its ``.name`` for the path); with
    ``NamedTemporaryFile``'s defaults the file is removed once that object is
    closed.
    """
    bucket_name, object_key, suffix = parse_s3_uri(s3_uri)
    local_copy = tempfile.NamedTemporaryFile(mode='w+b', suffix=suffix)
    remote_object = boto.resource('s3').Object(bucket_name, object_key)
    remote_object.download_file(local_copy.name)
    return local_copy


def assert_cube_lists_equivalent(cubes_in, cubes_out):
    """Check that every named cube in ``cubes_in`` has an equal cube in ``cubes_out``.

    For each input cube, exactly one cube with the same name, the same cell
    methods and the same coordinate names is extracted from ``cubes_out``. The
    pair must then have equal data and compare equal as cubes.

    Side effects: ``attributes`` and ``var_name`` are cleared on both cubes of
    each pair, and ``var_name`` is cleared on the output cube's coordinates,
    so those fields do not take part in the comparison.

    Raises:
        ValueError: if an input cube does not match exactly one output cube.
        AssertionError: if the data or the cubes themselves differ.
    """
    for cube_in in cubes_in:
        # Skip cubes iris could not identify. The comparison is by value (the
        # original used `is not` on a string literal) and case-insensitive,
        # because iris' default cube name is the lowercase 'unknown'.
        if cube_in.name().lower() == 'unknown':
            continue

        name_con = iris.Constraint(name=cube_in.name())
        # `ref=cube_in` binds the current cube eagerly instead of relying on
        # the loop variable still pointing at it when the lambda is called.
        cell_method_con = iris.Constraint(
            cube_func=lambda c, ref=cube_in: c.cell_methods == ref.cell_methods)
        coord_names_con = iris.Constraint(
            cube_func=lambda c, ref=cube_in: all(
                p.name() == q.name()
                for (p, q) in zip(c.coords(), ref.coords())))

        matches = cubes_out.extract(name_con & cell_method_con & coord_names_con)
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one output cube matching {!r}, found {}".format(
                    cube_in.name(), len(matches)))
        [cube_out] = matches

        # Clear the fields that are excluded from the comparison.
        for coord in cube_out.coords():
            coord.var_name = None
        cube_in.attributes = None
        cube_out.attributes = None
        cube_in.var_name = None
        cube_out.var_name = None

        np.testing.assert_equal(cube_in.data, cube_out.data)

        # Raised explicitly so the check survives `python -O`.
        if not (cube_in == cube_out):
            raise AssertionError(
                "cube {!r} differs between input and output".format(cube_in.name()))
    
    
def replace_s3_obj(s3_uri, new_file_name):
    """Upload ``new_file_name`` to its MOGREPS bucket and delete the original object.

    The destination bucket is ``mogreps-g`` or ``mogreps-uk``, chosen by which
    of those strings appears in ``s3_uri``. The destination key is the base
    name of the original key with a ``.pp`` extension swapped for ``.nc``;
    other names are kept unchanged.

    Args:
        s3_uri: URI of the object being replaced.
        new_file_name: local path of the file to upload.

    Raises:
        ValueError: if ``s3_uri`` names neither ``mogreps-g`` nor ``mogreps-uk``.
    """
    old_bucket, old_s3_key, _ = parse_s3_uri(s3_uri)

    if 'mogreps-g' in s3_uri:
        new_bucket = 'mogreps-g'
    elif 'mogreps-uk' in s3_uri:
        new_bucket = 'mogreps-uk'
    else:
        raise ValueError(
            "cannot choose a destination bucket for {!r}: expected "
            "'mogreps-g' or 'mogreps-uk' in the URI".format(s3_uri))

    # Swap only the extension: a blanket str.replace('pp', 'nc') would also
    # rewrite any other 'pp' occurring inside the file name.
    base_name = old_s3_key.split("/")[-1]
    stem, extension = os.path.splitext(base_name)
    new_s3_key = stem + '.nc' if extension == '.pp' else base_name

    s3 = boto.resource('s3')
    s3.Object(new_bucket, new_s3_key).upload_file(new_file_name)
    # Never delete the object that was just uploaded: only remove the original
    # when it lives at a different bucket/key.
    if (old_bucket, old_s3_key) != (new_bucket, new_s3_key):
        s3.Object(old_bucket, old_s3_key).delete()
    
    
def add_to_queue(s3_uri, queue):
    """Send ``s3_uri`` as the message body to the SQS queue named ``queue``."""
    setup_aws_session()
    target_queue = boto.resource('sqs').get_queue_by_name(QueueName=queue)
    target_queue.send_message(MessageBody=s3_uri)
    
    
def save_to_netcdf(s3_uri, complevel=1):
    """Download the object at ``s3_uri`` and re-save its cubes as NetCDF4.

    Args:
        s3_uri: URI of the source object.
        complevel: compression level passed to ``iris.save`` (zlib is enabled).

    Returns:
        ``(source_file, netcdf_file)``: the open temporary file holding the
        download and the open temporary ``.nc`` file that was written.
    """
    source_file = download_object(s3_uri)
    loaded_cubes = iris.load(source_file.name)

    netcdf_file = tempfile.NamedTemporaryFile(mode='w+b', suffix=".nc")
    save_options = dict(netcdf_format="NETCDF4", zlib=True, complevel=complevel)
    iris.save(loaded_cubes, netcdf_file.name, **save_options)

    return source_file, netcdf_file
    
    
def convert_object_from_s3(message_handle, s3_uri):
    """Convert the S3 object at ``s3_uri`` to NetCDF and swap it in if it verifies.

    The object is downloaded and saved as NetCDF, then both files are loaded
    and compared with ``assert_cube_lists_equivalent``.

    * On success the NetCDF file replaces the original object, the job message
      identified by ``message_handle`` is deleted from the job queue, and the
      URI is sent to the completed queue.
    * On a failed comparison the URI is sent to the failed queue and the
      exception is re-raised so the caller still sees the failure.

    Args:
        message_handle: receipt handle of the SQS job message.
        s3_uri: URI of the object to convert.
    """
    setup_aws_session()
    sqs = boto.resource('sqs')

    data_file_in, data_file_out = save_to_netcdf(s3_uri)
    try:
        cubes_in = iris.load(data_file_in.name)
        cubes_out = iris.load(data_file_out.name)

        try:
            assert_cube_lists_equivalent(cubes_in, cubes_out)
        except Exception:
            # Record the failure *before* re-raising; with the raise first the
            # failed-queue notification was unreachable.
            add_to_queue(s3_uri, FAILED_QUEUE_NAME)
            raise
        else:
            replace_s3_obj(s3_uri, data_file_out.name)
            msg = sqs.Message(JOB_QUEUE_URL, message_handle)
            msg.delete()
            add_to_queue(s3_uri, PASSED_QUEUE_NAME)
    finally:
        # Closing the NamedTemporaryFiles releases (and removes) the local
        # copies promptly rather than whenever they are garbage-collected.
        data_file_in.close()
        data_file_out.close()

In [2]:
import iris
import boto3 as boto
import json
import itertools

def get_jobs():
    """Yield ``(receipt_handle, s3_uri)`` for each message on the job queue.

    Messages are received one at a time and iteration stops at the first
    empty receive. Each message body is parsed as JSON and its ``"Message"``
    entry is appended to ``"s3://mogreps/"`` to form the URI.
    """
    setup_aws_session()
    job_queue = boto.resource('sqs').Queue(JOB_QUEUE_URL)

    while True:
        batch = job_queue.receive_messages(MaxNumberOfMessages=1)
        if not batch:
            return
        [message] = batch
        s3_uri = "s3://mogreps/" + json.loads(message.body)["Message"]
        yield message.receipt_handle, s3_uri

In [73]:
import dask
import distributed
# `Executor` is the deprecated former name of `Client`; use `Client` directly.
# The address is the dask scheduler that the conversion jobs are submitted to.
e = distributed.Client("ec2-52-56-198-147.eu-west-2.compute.amazonaws.com:8786")
e

<Client: scheduler="ec2-52-56-198-147.eu-west-2.compute.amazonaws.com:8786" processes=20 cores=40>

In [None]:
# Interactive help lookup (IPython `?` syntax) left over from exploration.
# NOTE(review): dask does not appear to expose a top-level `map`; the
# distributed client's `e.map` is probably what was being looked up -
# confirm, or drop this cell.
dask.map?

In [None]:
futures = []
jobs = get_jobs()
# Take at most 12 jobs from the queue for this batch.
some_jobs = itertools.islice(jobs, 12)
# Jobs are submitted through the distributed client only: `dask` has no
# top-level `map`, and mapping over `some_jobs` first would exhaust the
# iterator before this loop could see any job.
for message_handle, s3_uri in some_jobs:
    future = e.submit(convert_object_from_s3, message_handle, s3_uri)
    futures.append(future)

In [74]:
# Submit up to 12 queued conversion jobs to the cluster, keeping each future.
jobs = get_jobs()
some_jobs = itertools.islice(jobs, 12)
futures = []
for message_handle, s3_uri in some_jobs:
    # Appending as we go keeps already-submitted futures if the loop is interrupted.
    futures.append(e.submit(convert_object_from_s3, message_handle, s3_uri))

KeyboardInterrupt: 

In [None]:
# Restart the cluster through the distributed client.
# NOTE(review): presumably run to clear work left over from an interrupted
# submission - confirm this cell is still wanted.
e.restart()

In [60]:
# Pull the next (receipt_handle, s3_uri) pair from the `jobs` generator
# (which must already exist from an earlier cell) and submit that single
# conversion to the cluster.
message_handle, s3_uri = next(jobs)
f = e.submit(convert_object_from_s3, message_handle, s3_uri)

In [66]:
# Inspect the state of the single future submitted as `f`.
f.status

'finished'