In [94]:
!conda install pandas -y

Fetching package metadata .........
Solving package specifications: .

Package plan for installation in environment /opt/conda:

The following packages will be UPDATED:

    conda: 4.3.14-py35_0 --> 4.3.15-py35_0

conda-4.3.15-p 100% |################################| Time: 0:00:00  24.22 MB/s


In [4]:
from __future__ import print_function

import os
import tempfile

import boto3 as boto
import numpy as np
from datetime import datetime
import pandas as pd
import iris

iris.FUTURE.netcdf_no_unlimited = True


def human_bytes(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def parse_s3_uri(s3_uri):
    nasty_stuff = s3_uri.split("/")
    _, extension = os.path.splitext(s3_uri)
    bucket = nasty_stuff[2]
    key = "/".join(nasty_stuff[3:])

    return bucket, key, extension


def download_object(s3_uri):
    bucket, key, extension = parse_s3_uri(s3_uri)
#     print(bucket, key, extension)
    data_file = tempfile.NamedTemporaryFile(mode='w+b', suffix=extension)
    s3 = boto.resource('s3', 'eu-west-2')
    s3.Object(bucket, key).download_file(data_file.name)

    return data_file


def assert_cube_lists_equivalent(cubes_in, cubes_out):
    for cube_in in cubes_in:
        name_con = iris.Constraint(name=cube_in.name())
        cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==cube_in.cell_methods)
        
        [cube_out] = cubes_out.extract(name_con&cell_method_con)
        
        for coord in cube_out.coords():
            coord.var_name = None
        cube_in.attributes = None
        cube_out.attributes = None
            
        np.testing.assert_almost_equal(cube_in.data, cube_out.data)
        np.testing.assert_equal(cube_in.data, cube_out.data)
      
        print(cube_in)
        print(cube_out)
        assert cube_in == cube_out
    
    
def replace_s3_obj(s3_uri, new_file_name):
    bucket, old_s3_key, extension = parse_s3_uri(s3_uri)
    new_s3_key = "/".join(old_s3_key.split("/")[-1] + new_file_name)

    s3 = boto.resource('s3')
#     s3.Object(bucket, new_s3_key).upload_file(new_file_name)
    print("UPLOADING...")
    
#     s3.Object(bucket, old_s3_key).delete()
    
    
def add_to_dead_letter(s3_uri, dlq_name="dlq"):
    sqs = boto.resource('sqs', 'eu-west-2')
    dlq = sqs.get_queue_by_name(QueueName=dlq_name)
    dlq.send_message(MessageBody=s3_uri)
    
def save_to_netcdf(s3_uri, complevel=5):
    data_file_in = download_object(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    
    data_file_out = tempfile.NamedTemporaryFile(mode='w+b', suffix=".nc")
    iris.save(cubes_in, data_file_out.name, netcdf_format="NETCDF4", zlib=True, complevel=complevel)
    return data_file_in, data_file_out
    
def convert_object_from_s3(s3_uri):
    """ Converts file to NetCDF """
    data_file_in, data_file_out = save_to_netcdf(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    cubes_out = iris.load(data_file_out.name)
    
    try:
        assert_cube_lists_equivalent(cubes_in, cubes_out)
    except:
#         add_to_dead_letter(s3_uri)
        raise
    else:
        replace_s3_obj(s3_uri, data_file_out.name)
        
def test_file_size(s3_uri, complevel=5):
    now = datetime.now()
    data_file_in, data_file_out = save_to_netcdf(s3_uri, complevel)
    comp_time = (datetime.now() - now).total_seconds()
    now = datetime.now()
    cubes_in = iris.load(data_file_in.name)
    [cube.data for cube in cubes_in]
    load_time = (datetime.now() - now).total_seconds()
    return (os.stat(data_file_in.name).st_size, 
            os.stat(data_file_out.name).st_size,
            comp_time, load_time)

In [9]:
s3 = boto.client('s3', 'eu-west-2')
s3.list_objects(
    Bucket='mogreps',
    MaxKeys=100
)

ClientError: An error occurred (AccessDenied) when calling the ListObjects operation: Access Denied

In [2]:
input_files = [
    "s3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_003.pp"
]
compression_tests = []

for input_file in input_files:
    for complevel in range(6):
        print('.', end='')
        pre_size, post_size, comp_time, load_time = test_file_size(input_file, complevel)
        compression_tests.append({
            "complevel": complevel,
            "comp_time": comp_time,
            "load_time": load_time,
            "pre_size":  pre_size,
            "post_size": post_size

        })
print("Done")

.



.....Done


In [5]:
pd.DataFrame(compression_tests)

Unnamed: 0,comp_time,complevel,load_time,post_size,pre_size
0,3.822688,0,2.326409,340146635,93496528
1,8.199276,1,2.340681,71301468,93496528
2,8.038863,2,2.302426,69387855,93496528
3,9.698466,3,2.307762,67591033,93496528
4,11.674659,4,2.307426,66521354,93496528
5,13.174583,5,2.305073,65253371,93496528


In [6]:
from IPython.display import Markdown

body = "Compression Level | Compression Time | Iris Load Time| Input File Size  | Output File Size | Compression Ratio \n"
body += "--- | --- | --- | --- | --- | --- \n"
for test in compression_tests:
    body += "{} | {} | {} | {} | {} | {} \n".format(
        test["complevel"], 
        test["comp_time"], 
        test["load_time"], 
        human_bytes(test["pre_size"]),
        human_bytes(test["post_size"]), 
        float(test["post_size"]) / float(test["pre_size"]))


Markdown(body)


Compression Level | Compression Time | Iris Load Time| Input File Size  | Output File Size | Compression Ratio 
--- | --- | --- | --- | --- | --- 
0 | 3.822688 | 2.326409 | 89.2MiB | 324.4MiB | 3.6380670199860257 
1 | 8.199276 | 2.340681 | 89.2MiB | 68.0MiB | 0.7626108640098379 
2 | 8.038863 | 2.302426 | 89.2MiB | 66.2MiB | 0.7421436547889778 
3 | 9.698466 | 2.307762 | 89.2MiB | 64.5MiB | 0.7229255935578699 
4 | 11.674659 | 2.307426 | 89.2MiB | 63.4MiB | 0.7114847516048938 
5 | 13.174583 | 2.305073 | 89.2MiB | 62.2MiB | 0.697922932496488 


In [85]:
# convert_object_from_s3("s3://mogreps-g/201612/prods_op_mogreps-g_20161203_00_00_048.pp")


In [86]:
import iris
import boto3 as boto

def pickup_job(queue_name="new_files"):
    sqs = boto.resource('sqs')
    queue = sqs.get_queue_by_name(QueueName=queue_name)

if __name__ == "__main__":
    job = pickup_job()
    if job:
        ingest_job(job)

NoRegionError: You must specify a region.

In [13]:
import boto3 as boto
s3 = boto.resource('s3')
s3.Object("mogreps-g", "201612/prods_op_mogreps-g_20161203_00_00_048.pp").download_file("test.pp")

In [35]:
_ = s3.Object("mogreps-g", "201612/prods_op_mogreps-g_20161203_00_00_048.pp")

In [38]:
_.upload_file?

In [10]:
import iris
d = iris.load("test.pp")



In [23]:
name_con = iris.Constraint(name=d[0].name())
cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==d[0].cell_methods)

In [66]:
d.extract(name_con&cell_method_con)

[<iris 'Cube' of atmosphere_optical_thickness_due_all_ambient_aerosol / (1) (pseudo_level: 6; latitude: 600; longitude: 800)>]