In [94]:
!conda install pandas -y

Fetching package metadata .........
Solving package specifications: .

Package plan for installation in environment /opt/conda:

The following packages will be UPDATED:

    conda: 4.3.14-py35_0 --> 4.3.15-py35_0

conda-4.3.15-p 100% |################################| Time: 0:00:00  24.22 MB/s


In [4]:
from __future__ import print_function

import os
import tempfile

import boto3 as boto
import numpy as np
from datetime import datetime
import pandas as pd
import iris

iris.FUTURE.netcdf_no_unlimited = True


def human_bytes(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def parse_s3_uri(s3_uri):
    nasty_stuff = s3_uri.split("/")
    _, extension = os.path.splitext(s3_uri)
    bucket = nasty_stuff[2]
    key = "/".join(nasty_stuff[3:])

    return bucket, key, extension


def download_object(s3_uri):
    bucket, key, extension = parse_s3_uri(s3_uri)
#     print(bucket, key, extension)
    data_file = tempfile.NamedTemporaryFile(mode='w+b', suffix=extension)
    s3 = boto.resource('s3', 'eu-west-2')
    s3.Object(bucket, key).download_file(data_file.name)

    return data_file


def assert_cube_lists_equivalent(cubes_in, cubes_out):
    for cube_in in cubes_in:
        name_con = iris.Constraint(name=cube_in.name())
        cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==cube_in.cell_methods)
        
        [cube_out] = cubes_out.extract(name_con&cell_method_con)
        
        for coord in cube_out.coords():
            coord.var_name = None
        cube_in.attributes = None
        cube_out.attributes = None
            
        np.testing.assert_almost_equal(cube_in.data, cube_out.data)
        np.testing.assert_equal(cube_in.data, cube_out.data)
      
        print(cube_in)
        print(cube_out)
        assert cube_in == cube_out
    
    
def replace_s3_obj(s3_uri, new_file_name):
    bucket, old_s3_key, extension = parse_s3_uri(s3_uri)
    new_s3_key = "/".join(old_s3_key.split("/")[-1] + new_file_name)

    s3 = boto.resource('s3')
#     s3.Object(bucket, new_s3_key).upload_file(new_file_name)
    print("UPLOADING...")
    
#     s3.Object(bucket, old_s3_key).delete()
    
    
def add_to_dead_letter(s3_uri, dlq_name="dlq"):
    sqs = boto.resource('sqs', 'eu-west-2')
    dlq = sqs.get_queue_by_name(QueueName=dlq_name)
    dlq.send_message(MessageBody=s3_uri)
    
def save_to_netcdf(s3_uri, complevel=5):
    data_file_in = download_object(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    
    data_file_out = tempfile.NamedTemporaryFile(mode='w+b', suffix=".nc")
    iris.save(cubes_in, data_file_out.name, netcdf_format="NETCDF4", zlib=True, complevel=complevel)
    return data_file_in, data_file_out
    
def convert_object_from_s3(s3_uri):
    """ Converts file to NetCDF """
    data_file_in, data_file_out = save_to_netcdf(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    cubes_out = iris.load(data_file_out.name)
    
    try:
        assert_cube_lists_equivalent(cubes_in, cubes_out)
    except:
#         add_to_dead_letter(s3_uri)
        raise
    else:
        replace_s3_obj(s3_uri, data_file_out.name)
        
def test_file_size(s3_uri, complevel=5):
    now = datetime.now()
    data_file_in, data_file_out = save_to_netcdf(s3_uri, complevel)
    comp_time = (datetime.now() - now).total_seconds()
    now = datetime.now()
    cubes_in = iris.load(data_file_in.name)
    [cube.data for cube in cubes_in]
    load_time = (datetime.now() - now).total_seconds()
    return (os.stat(data_file_in.name).st_size, 
            os.stat(data_file_out.name).st_size,
            comp_time, load_time)

In [12]:
bucket = 'mogreps'
s3 = boto.client('s3', 'eu-west-2')
objects = s3.list_objects(
    Bucket=bucket,
    MaxKeys=25
)
input_files = ["s3://{}/{}".format(bucket, x["Key"]) for x in objects["Contents"]]
input_files

In [15]:
compression_tests = []
max_compression_level = 5
counter = 0.0
total_tests = float(max_compression_level * len(input_files))

for input_file in input_files:
    for complevel in range(max_compression_level+1):
        counter = counter + 1.0
        print('{0:.1f}%...'.format((counter / total_tests) * 100), end='')
        pre_size, post_size, comp_time, load_time = test_file_size(input_file, complevel)
        compression_tests.append({
            "complevel": complevel,
            "comp_time": comp_time,
            "load_time": load_time,
            "pre_size":  pre_size,
            "post_size": post_size

        })
print("Done")

0.8%.



1.6%.2.4%.3.2%.4.0%.4.8%.5.6000000000000005%.6.4%.7.199999999999999%.8.0%.8.799999999999999%.9.6%.10.4%.11.200000000000001%.12.0%.12.8%.13.600000000000001%.14.399999999999999%.15.2%.16.0%.16.8%.17.599999999999998%.18.4%.19.2%.20.0%.20.8%.21.6%.22.400000000000002%.23.200000000000003%.24.0%.24.8%.25.6%.26.400000000000002%.27.200000000000003%.28.000000000000004%.28.799999999999997%.29.599999999999998%.30.4%.31.2%.32.0%.32.800000000000004%.33.6%.34.4%.35.199999999999996%.36.0%.36.8%.37.6%.38.4%.39.2%.40.0%.40.8%.41.6%.42.4%.43.2%.44.0%.44.800000000000004%.45.6%.46.400000000000006%.47.199999999999996%.48.0%.48.8%.49.6%.50.4%.51.2%.52.0%.52.800000000000004%.53.6%.54.400000000000006%.55.2%.56.00000000000001%.56.8%.57.599999999999994%.58.4%.59.199999999999996%.60.0%.60.8%.61.6%.62.4%.63.2%.64.0%.64.8%.65.60000000000001%.66.4%.67.2%.68.0%.68.8%.69.6%.70.39999999999999%.71.2%.72.0%.72.8%.73.6%.74.4%.75.2%.76.0%.76.8%.77.60000000000001%.78.4%.79.2%.80.0%.80.80000000000001%.81.6%.82.39999999999999

In [16]:
dataframe = pd.DataFrame(compression_tests)
dataframe

Unnamed: 0,comp_time,complevel,load_time,post_size,pre_size
0,3.515517,0,2.323199,340146635,93496528
1,7.694350,1,2.301639,71301468,93496528
2,8.041861,2,2.307549,69387855,93496528
3,9.832034,3,2.311481,67591033,93496528
4,10.373593,4,2.317381,66521354,93496528
5,13.170756,5,2.368005,65253371,93496528
6,2.353394,0,1.284342,188359427,52465088
7,4.595000,1,1.300967,39439599,52465088
8,5.048035,2,1.308908,38392763,52465088
9,6.282737,3,1.307669,37385138,52465088


In [18]:
mean_results = dataframe.groupby(['complevel']).mean()
mean_results

Unnamed: 0_level_0,comp_time,load_time,post_size,pre_size
complevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2.086316,1.188775,170757408,48049195
1,4.187863,1.180632,35838674,48049195
2,4.438591,1.175563,34885209,48049195
3,5.241384,1.188722,33967232,48049195
4,5.483121,1.183906,33351000,48049195
5,6.996478,1.175321,32712771,48049195


In [28]:
for index, test in mean_results.iterrows():
    print(index)

0
1
2
3
4
5


In [29]:
from IPython.display import Markdown

body = "Compression Level | Compression Time | Iris Load Time| Input File Size  | Output File Size | Compression Ratio \n"
body += "--- | --- | --- | --- | --- | --- \n"
# for test in compression_tests:
for index, test in mean_results.iterrows():
    body += "{} | {} | {} | {} | {} | {} \n".format(
        index, 
        test["comp_time"], 
        test["load_time"], 
        human_bytes(test["pre_size"]),
        human_bytes(test["post_size"]), 
        float(test["post_size"]) / float(test["pre_size"]))


Markdown(body)


Compression Level | Compression Time | Iris Load Time| Input File Size  | Output File Size | Compression Ratio 
--- | --- | --- | --- | --- | --- 
0 | 2.0863164800000003 | 1.1887751199999999 | 45.8MiB | 162.8MiB | 3.55380372137348 
1 | 4.18786332 | 1.1806322000000002 | 45.8MiB | 34.2MiB | 0.7458745978990907 
2 | 4.43859104 | 1.17556292 | 45.8MiB | 33.3MiB | 0.726031081269936 
3 | 5.2413843600000005 | 1.1887221599999998 | 45.8MiB | 32.4MiB | 0.7069261410102708 
4 | 5.48312112 | 1.1839056 | 45.8MiB | 31.8MiB | 0.6941011186555779 
5 | 6.996478279999999 | 1.175321 | 45.8MiB | 31.2MiB | 0.6808182946665392 


In [85]:
# convert_object_from_s3("s3://mogreps-g/201612/prods_op_mogreps-g_20161203_00_00_048.pp")


In [86]:
import iris
import boto3 as boto

def pickup_job(queue_name="new_files"):
    sqs = boto.resource('sqs')
    queue = sqs.get_queue_by_name(QueueName=queue_name)

if __name__ == "__main__":
    job = pickup_job()
    if job:
        ingest_job(job)

NoRegionError: You must specify a region.

In [13]:
import boto3 as boto
s3 = boto.resource('s3')
s3.Object("mogreps-g", "201612/prods_op_mogreps-g_20161203_00_00_048.pp").download_file("test.pp")

In [35]:
_ = s3.Object("mogreps-g", "201612/prods_op_mogreps-g_20161203_00_00_048.pp")

In [38]:
_.upload_file?

In [10]:
import iris
d = iris.load("test.pp")



In [23]:
name_con = iris.Constraint(name=d[0].name())
cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==d[0].cell_methods)

In [66]:
d.extract(name_con&cell_method_con)

[<iris 'Cube' of atmosphere_optical_thickness_due_all_ambient_aerosol / (1) (pseudo_level: 6; latitude: 600; longitude: 800)>]