In [14]:
from __future__ import print_function

import os
import tempfile

import boto3 as boto
import numpy as np
from datetime import datetime
import iris

iris.FUTURE.netcdf_no_unlimited = True


def setup_credentials():
#     os.environ["AWS_ACCESS_KEY_ID"] = ""
#     os.environ["AWS_SECRET_ACCESS_KEY"] = ""

def parse_s3_uri(s3_uri):
    nasty_stuff = s3_uri.split("/")
    _, extension = os.path.splitext(s3_uri)
    bucket = nasty_stuff[2]
    key = "/".join(nasty_stuff[3:])

    return bucket, key, extension


def download_object(s3_uri):
    bucket, key, extension = parse_s3_uri(s3_uri)
    data_file = tempfile.NamedTemporaryFile(mode='w+b', suffix=extension)
    s3 = boto.resource('s3', 'eu-west-2')
    s3.Object(bucket, key).download_file(data_file.name)

    return data_file


def sanitize_cubes(cube_in, cube_out):
    for coord in cube_out.coords():
            coord.var_name = None
    cube_in.attributes = None
    cube_out.attributes = None
    cube_out.var_name = None
    if cube_in.units == 'unknown':
        cube_in.units = cube_out.units
        
    return cube_in, cube_out


def assert_cube_lists_equivalent(cubes_in, cubes_out):
    for cube_in in cubes_in:
        if cube_in.name() is not 'unknown':
            name_con = iris.Constraint(name=cube_in.name())
            cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==cube_in.cell_methods)  
            coord_name_con = iris.Constraint(cube_func=lambda c: all([a.name()==b.name() for a, b in zip(c.coords(), cube_in.coords())]))

            [cube_out] = cubes_out.extract(name_con&cell_method_con&coord_name_con)
            sanitized_cube_in, sanitized_cube_out = sanitize_cubes(cube_in, cube_out)

            np.testing.assert_equal(sanitized_cube_in.data, sanitized_cube_out.data)
            assert sanitized_cube_in == sanitized_cube_out
    
    
    
def replace_s3_obj(s3_uri, new_file_name):
    old_bucket, old_s3_key, extension = parse_s3_uri(s3_uri)
    
    if "mogreps-uk" in old_s3_key:
        new_bucket = "mogreps-uk"
    elif "mogreps-g" in old_s3_key:
        new_bucket = "mogreps-g"
    else:
        raise ValueError

    s3 = boto.resource('s3', 'eu-west-2')
    s3.Object(new_bucket, new_s3_key).upload_file(new_file_name)
    s3.Object(old_bucket, old_s3_key).delete()
    
    
def add_to_queue(s3_uri, queue_name):
    sqs = boto.resource('sqs', 'eu-west-2')
    queue = sqs.get_queue_by_name(QueueName=queue_name)
    queue.send_message(MessageBody=s3_uri)
    
    
def save_to_netcdf(s3_uri, complevel=5):
    data_file_in = download_object(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    
    data_file_out = tempfile.NamedTemporaryFile(mode='w+b', suffix=".nc")
    iris.save(cubes_in, data_file_out.name, netcdf_format="NETCDF4", zlib=True, complevel=complevel)
    return data_file_in, data_file_out
    
    
def convert_object_from_s3(message):
    """ Converts file to NetCDF """
    setup_credentials()
    s3_uri = message.body
    data_file_in, data_file_out = save_to_netcdf(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    cubes_out = iris.load(data_file_out.name)
    
    try:
        assert_cube_lists_equivalent(cubes_in, cubes_out)
    except:
        add_to_queue(s3_uri, "mogreps-conversion-failed")
    else:
        replace_s3_obj(s3_uri, data_file_out.name)
        add_to_queue(s3_uri, "mogreps-conversion-completed")
        message.delete()
        
        
def test_file_size(s3_uri, complevel=5):
    now = datetime.now()
    data_file_in, data_file_out = save_to_netcdf(s3_uri, complevel)
    comp_time = (datetime.now() - now).total_seconds()
    now = datetime.now()
    cubes_in = iris.load(data_file_in.name)
    [cube.data for cube in cubes_in]
    load_time = (datetime.now() - now).total_seconds()
    return (os.stat(data_file_in.name).st_size, 
            os.stat(data_file_out.name).st_size,
            comp_time, load_time)

In [15]:
import distributed

e = distributed.Executor('ec2-52-56-198-147.eu-west-2.compute.amazonaws.com:8786')
e

<Client: scheduler="ec2-52-56-198-147.eu-west-2.compute.amazonaws.com:8786" processes=1 cores=2>

In [28]:
import iris
import dask
import boto3 as boto
import itertools


def pickup_jobs(queue_name="mogreps-conversion"):
    sqs = boto.resource('sqs', 'eu-west-2')
    queue = sqs.get_queue_by_name(QueueName=queue_name, QueueOwnerAWSAccountId="metofficepublicdata")
    for message in queue.receive_messages():
        yield message

        
jobs = pickup_jobs()
some_jobs = itertools.islice(jobs, 5)
e.map(convert_object_from_s3, some_jobs)

<generator object queue_to_iterator at 0x7f495cb22990>

Exception in thread Thread-20:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.5/site-packages/distributed/client.py", line 745, in _threaded_map
    args = [get(q) for q in qs_in]
  File "/opt/conda/lib/python3.5/site-packages/distributed/client.py", line 745, in <listcomp>
    args = [get(q) for q in qs_in]
  File "<ipython-input-28-c1eb59aac0fc>", line 9, in pickup_jobs
    queue = sqs.get_queue_by_name(QueueName=queue_name, QueueOwnerAWSAccountId="metofficepublicdata")
  File "/opt/conda/lib/python3.5/site-packages/boto3/resources/factory.py", line 520, in do_action
    response = action(self, *args, **kwargs)
  File "/opt/conda/lib/python3.5/site-packages/boto3/resources/action.py", line 83, in __call__
    response = getattr(parent.meta.client, operation_name)(*