In [94]:
!conda install pandas -y

Fetching package metadata .........
Solving package specifications: .

Package plan for installation in environment /opt/conda:

The following packages will be UPDATED:

    conda: 4.3.14-py35_0 --> 4.3.15-py35_0

conda-4.3.15-p 100% |################################| Time: 0:00:00  24.22 MB/s


In [1]:
from __future__ import print_function

import os
import tempfile

import boto3 as boto
import numpy as np
from datetime import datetime
import pandas as pd
import iris

iris.FUTURE.netcdf_no_unlimited = True


def human_bytes(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def parse_s3_uri(s3_uri):
    nasty_stuff = s3_uri.split("/")
    _, extension = os.path.splitext(s3_uri)
    bucket = nasty_stuff[2]
    key = "/".join(nasty_stuff[3:])

    return bucket, key, extension


def download_object(s3_uri):
    bucket, key, extension = parse_s3_uri(s3_uri)
#     print(bucket, key, extension)
    data_file = tempfile.NamedTemporaryFile(mode='w+b', suffix=extension)
    s3 = boto.resource('s3', 'eu-west-2')
    s3.Object(bucket, key).download_file(data_file.name)

    return data_file


def assert_cube_lists_equivalent(cubes_in, cubes_out):
    for cube_in in cubes_in:
        name_con = iris.Constraint(name=cube_in.name())
        cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==cube_in.cell_methods)
        
        [cube_out] = cubes_out.extract(name_con&cell_method_con)
        
        for coord in cube_out.coords():
            coord.var_name = None
        cube_in.attributes = None
        cube_out.attributes = None
            
        np.testing.assert_almost_equal(cube_in.data, cube_out.data)
        np.testing.assert_equal(cube_in.data, cube_out.data)
      
        print(cube_in)
        print(cube_out)
        assert cube_in == cube_out
    
    
def replace_s3_obj(s3_uri, new_file_name):
    bucket, old_s3_key, extension = parse_s3_uri(s3_uri)
    new_s3_key = "/".join(old_s3_key.split("/")[-1] + new_file_name)

    s3 = boto.resource('s3')
#     s3.Object(bucket, new_s3_key).upload_file(new_file_name)
    print("UPLOADING...")
    
#     s3.Object(bucket, old_s3_key).delete()
    
    
def add_to_dead_letter(s3_uri, dlq_name="dlq"):
    sqs = boto.resource('sqs', 'eu-west-2')
    dlq = sqs.get_queue_by_name(QueueName=dlq_name)
    dlq.send_message(MessageBody=s3_uri)
    
def save_to_netcdf(s3_uri, complevel=5):
    data_file_in = download_object(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    
    data_file_out = tempfile.NamedTemporaryFile(mode='w+b', suffix=".nc")
    iris.save(cubes_in, data_file_out.name, netcdf_format="NETCDF4", zlib=True, complevel=complevel)
    return data_file_in, data_file_out
    
def convert_object_from_s3(s3_uri):
    """ Converts file to NetCDF """
    data_file_in, data_file_out = save_to_netcdf(s3_uri)
    cubes_in = iris.load(data_file_in.name)
    cubes_out = iris.load(data_file_out.name)
    
    try:
        assert_cube_lists_equivalent(cubes_in, cubes_out)
    except:
#         add_to_dead_letter(s3_uri)
        raise
    else:
        replace_s3_obj(s3_uri, data_file_out.name)
        
def test_file_size(s3_uri, complevel=5):
    now = datetime.now()
    data_file_in, data_file_out = save_to_netcdf(s3_uri, complevel)
    comp_time = (datetime.now() - now).total_seconds()
    now = datetime.now()
    cubes_in = iris.load(data_file_in.name)
    [cube.data for cube in cubes_in]
    load_time = (datetime.now() - now).total_seconds()
    return (os.stat(data_file_in.name).st_size, 
            os.stat(data_file_out.name).st_size,
            comp_time, load_time)

In [29]:
bucket = 'mogreps'
page = 3
s3 = boto.client('s3', 'eu-west-2')
objects = s3.list_objects(
    Bucket=bucket,
    MaxKeys=page*25
)
totalsize = sum([x["Size"] for x in objects["Contents"][-25:]])
print("Objects: {}, Total Size: {}".format(len(objects["Contents"][-25:]), human_bytes(totalsize)))
input_files = ["s3://{}/{}".format(bucket, x["Key"]) for x in objects["Contents"][-25:]]
input_files

Objects: 25, Total Size: 1.1GiB


['s3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_153.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_156.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_159.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_162.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_165.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_168.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_171.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_00_174.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_003.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_006.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_009.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_012.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_015.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_018.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_021.pp',
 's3://mogreps/2016/prods_op_mogreps-g_20160101_00_01_0

In [30]:
# Reset tests
compression_tests = []

In [25]:
max_compression_level = 5
counter = 0.0
total_tests = float((max_compression_level+1) * len(input_files))

for input_file in input_files:
    for complevel in range(max_compression_level+1):
        print('{0:.1f}%...'.format((counter / total_tests) * 100), end='')
        counter = counter + 1.0
        pre_size, post_size, comp_time, load_time = test_file_size(input_file, complevel)
        compression_tests.append({
            "complevel": complevel,
            "comp_time": comp_time,
            "load_time": load_time,
            "pre_size":  pre_size,
            "post_size": post_size

        })
print("Done")

0.0%...



0.7%...1.3%...2.0%...2.7%...3.3%...4.0%...4.7%...5.3%...6.0%...6.7%...7.3%...8.0%...8.7%...9.3%...10.0%...10.7%...11.3%...12.0%...12.7%...13.3%...14.0%...14.7%...15.3%...16.0%...16.7%...17.3%...18.0%...18.7%...19.3%...20.0%...20.7%...21.3%...22.0%...22.7%...23.3%...24.0%...24.7%...25.3%...26.0%...26.7%...27.3%...28.0%...28.7%...29.3%...30.0%...30.7%...31.3%...32.0%...32.7%...33.3%...34.0%...34.7%...35.3%...36.0%...36.7%...37.3%...38.0%...38.7%...39.3%...40.0%...40.7%...41.3%...42.0%...42.7%...43.3%...44.0%...44.7%...45.3%...46.0%...46.7%...47.3%...48.0%...48.7%...49.3%...50.0%...50.7%...51.3%...52.0%...52.7%...53.3%...54.0%...54.7%...55.3%...56.0%...56.7%...57.3%...58.0%...58.7%...59.3%...60.0%...60.7%...61.3%...62.0%...62.7%...63.3%...64.0%...64.7%...65.3%...66.0%...66.7%...67.3%...68.0%...68.7%...69.3%...70.0%...70.7%...71.3%...72.0%...72.7%...73.3%...74.0%...74.7%...75.3%...76.0%...76.7%...77.3%...78.0%...78.7%...79.3%...80.0%...80.7%...81.3%...82.0%...82.7%...83.3%...84.0%...84.7%.

In [26]:
dataframe = pd.DataFrame(compression_tests)
dataframe

Unnamed: 0,comp_time,complevel,load_time,post_size,pre_size
0,3.474704,0,2.324943,340146635,93496528
1,7.652931,1,2.315330,71301468,93496528
2,8.142890,2,2.326435,69387855,93496528
3,9.995425,3,2.321791,67591033,93496528
4,10.444520,4,2.369661,66521354,93496528
5,13.436164,5,2.318558,65253371,93496528
6,2.517968,0,1.292555,188359427,52465088
7,4.411094,1,1.295569,39439599,52465088
8,5.183987,2,1.300917,38392763,52465088
9,5.524792,3,1.295871,37385138,52465088


In [27]:
mean_results = dataframe.groupby(['complevel']).mean()
mean_results

Unnamed: 0_level_0,comp_time,load_time,post_size,pre_size
complevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.982155,1.057695,152656705,43086834
1,3.746239,1.051971,32094323,43086834
2,3.957869,1.059384,31246996,43086834
3,4.67017,1.047276,30423863,43086834
4,4.989844,1.058676,29852172,43086834
5,6.217529,1.050251,29280602,43086834


In [28]:
from IPython.display import Markdown

body = "Compression Level | Compression Time | Iris Load Time| Input File Size  | Output File Size | Compression Ratio \n"
body += "--- | --- | --- | --- | --- | --- \n"
# for test in compression_tests:
for index, test in mean_results.iterrows():
    body += "{} | {} | {} | {} | {} | {} \n".format(
        index, 
        '{0:.4f}s'.format(test["comp_time"]), 
        '{0:.4f}s'.format(test["load_time"]), 
        human_bytes(test["pre_size"]),
        human_bytes(test["post_size"]), 
        '{0:.5f}'.format(float(test["post_size"]) / float(test["pre_size"])))


Markdown(body)


Compression Level | Compression Time | Iris Load Time| Input File Size  | Output File Size | Compression Ratio 
--- | --- | --- | --- | --- | --- 
0 | 1.9822s | 1.0577s | 41.1MiB | 145.6MiB | 3.54300 
1 | 3.7462s | 1.0520s | 41.1MiB | 30.6MiB | 0.74488 
2 | 3.9579s | 1.0594s | 41.1MiB | 29.8MiB | 0.72521 
3 | 4.6702s | 1.0473s | 41.1MiB | 29.0MiB | 0.70611 
4 | 4.9898s | 1.0587s | 41.1MiB | 28.5MiB | 0.69284 
5 | 6.2175s | 1.0503s | 41.1MiB | 27.9MiB | 0.67957 


In [85]:
# convert_object_from_s3("s3://mogreps-g/201612/prods_op_mogreps-g_20161203_00_00_048.pp")


In [86]:
import iris
import boto3 as boto

def pickup_job(queue_name="new_files"):
    sqs = boto.resource('sqs')
    queue = sqs.get_queue_by_name(QueueName=queue_name)

if __name__ == "__main__":
    job = pickup_job()
    if job:
        ingest_job(job)

NoRegionError: You must specify a region.

In [13]:
import boto3 as boto
s3 = boto.resource('s3')
s3.Object("mogreps-g", "201612/prods_op_mogreps-g_20161203_00_00_048.pp").download_file("test.pp")

In [35]:
_ = s3.Object("mogreps-g", "201612/prods_op_mogreps-g_20161203_00_00_048.pp")

In [38]:
_.upload_file?

In [10]:
import iris
d = iris.load("test.pp")



In [23]:
name_con = iris.Constraint(name=d[0].name())
cell_method_con = iris.Constraint(cube_func=lambda c: c.cell_methods==d[0].cell_methods)

In [66]:
d.extract(name_con&cell_method_con)

[<iris 'Cube' of atmosphere_optical_thickness_due_all_ambient_aerosol / (1) (pseudo_level: 6; latitude: 600; longitude: 800)>]