In [None]:
import copy
import json
import os
import urllib
from datetime import datetime

import skimage.io as io
from skimage.transform import resize

import csv
import glob
import json
import os

import cv2
import numpy as np



In [None]:
def get_matching_s3_keys(s3_client, bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    :param s3_client: Object exposing ``list_objects_v2(**kwargs)``
        (presumably a boto3 S3 client -- confirm against the caller).
    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
        May be a string or a tuple of strings; a tuple is filtered
        client-side only.
    :param suffix: Only fetch keys that end with this suffix (optional).
        May be a string or a tuple of strings.
    :return: Generator yielding every matching key, across all pages.
    """
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:
        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects; the
        # field is absent when a page holds no objects, so default to an
        # empty list instead of raising KeyError on an empty listing.
        resp = s3_client.list_objects_v2(**kwargs)
        for obj in resp.get('Contents', []):
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next request, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

In [None]:

def _ensure_dir(path):
    """Create ``path`` (and any missing parents) if it is not already a directory."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _scale_annotation(ann, xfactor, yfactor):
    """Scale one annotation's ``bbox`` and first segmentation polygon in place.

    Values are truncated to ``int`` after scaling. Only
    ``ann['segmentation'][0]`` is kept; any further polygons are dropped.
    """
    bbox = ann['bbox']
    ann['bbox'] = [int(bbox[0] * xfactor), int(bbox[1] * yfactor),
                   int(bbox[2] * xfactor), int(bbox[3] * yfactor)]

    # The polygon is a flat list whose even indices are scaled with the
    # x factor and odd indices with the y factor.
    polygon = ann['segmentation'][0]
    ann['segmentation'] = [[int(value * (xfactor if idx % 2 == 0 else yfactor))
                            for (idx, value) in enumerate(polygon)]]


def download_crops(base_folder, s3_client, new_size, original_size=(3000, 4096)):
    """Fetch new annotation files from S3, download their images and build resized copies.

    Lists the ``.json`` keys under ``cogito/to-be-processed`` in the
    ``aquabyte-annotations`` bucket. Every file not yet present in
    ``<base_folder>/processed`` is downloaded; each image it references is
    retrieved from its ``coco_url`` and a resized copy is written next to a
    mirrored path in which ``aquabyte-images`` is replaced by
    ``aquabyte-images-resized``. Two annotation files are written: one with
    local paths added, and one with local paths plus rescaled sizes, bounding
    boxes and segmentations.

    The function performs a single pass; it does not schedule itself (the
    original note said the folder is checked every hour).

    :param base_folder: Root folder for images, ``processed`` and ``cocofiles``.
        NOTE(review): the resized outputs only land in a separate location if
        this path contains ``aquabyte-images`` -- confirm with the caller.
    :param s3_client: Client passed to ``get_matching_s3_keys`` and used for
        ``download_file``.
    :param new_size: ``(height, width)`` of the resized images.
    :param original_size: ``(height, width)`` of the source images, used to
        compute the annotation scale factors. Defaults to ``(3000, 4096)``.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    bucket = 'aquabyte-annotations'
    resized_base_folder = base_folder.replace('aquabyte-images', 'aquabyte-images-resized')
    processed_dir = os.path.join(base_folder, 'processed')
    coco_dir = os.path.join(base_folder, 'cocofiles')
    coco_resized_dir = os.path.join(resized_base_folder, 'cocofiles')

    # float() keeps true division on Python 2 as well.
    yfactor = new_size[0] / float(original_size[0])
    xfactor = new_size[1] / float(original_size[1])

    generator = get_matching_s3_keys(s3_client,
                                     bucket,
                                     prefix='cogito/to-be-processed',
                                     suffix='.json')

    for key in generator:
        json_file = os.path.basename(key)
        json_destination = os.path.join(processed_dir, json_file)

        # A file already present in 'processed' has been handled before.
        if os.path.isfile(json_destination):
            continue

        print('A new json file has been found {}. Downloading it!!'.format(key))
        _ensure_dir(processed_dir)
        s3_client.download_file(bucket, key, json_destination)

        # Close the handle deterministically instead of leaking it.
        with open(json_destination) as f:
            annotations = json.load(f)
        annotations_resized = copy.deepcopy(annotations)

        # step 0 - download the images into the corresponding folders
        image_count = len(annotations['images'])
        for (i, (annotation, annotation_res)) in enumerate(zip(annotations['images'],
                                                               annotations_resized['images'])):
            if i % 1000 == 0:
                print('Image {} out of {} downloaded and added'.format(i, image_count))
            url = annotation['coco_url']
            assert annotation['coco_url'] == annotation_res['coco_url'], "Problem!!"

            # The file name is the last '%2F'-separated URL component, without
            # the query string. Its '_'-separated fields give the farm (index 1),
            # the pen (index 2) and a millisecond epoch timestamp (last field).
            image_name = url.split('%2F')[-1].split('?')[0]
            name_fields = image_name.split('_')
            farm = name_fields[1]
            pen = name_fields[2]
            timestamp_ms = int(name_fields[-1].split('.')[0])
            date = str(datetime.utcfromtimestamp(timestamp_ms / 1000.0).date())

            image_dir = os.path.join(base_folder, farm, date, pen)
            _ensure_dir(image_dir)
            image_destination = os.path.join(image_dir, image_name)
            if not os.path.isfile(image_destination):
                urlretrieve(url, image_destination)

            image_resized_destination = image_destination.replace("aquabyte-images",
                                                                  "aquabyte-images-resized")
            _ensure_dir(os.path.dirname(image_resized_destination))
            if not os.path.isfile(image_resized_destination):
                image = io.imread(image_destination)
                image_resized = resize(image, new_size)
                io.imsave(image_resized_destination, image_resized)

            annotation["local_path"] = image_destination
            annotation_res['height'] = new_size[0]
            # Width is the second entry of new_size (the original wrote the
            # height here).
            annotation_res['width'] = new_size[1]
            annotation_res["local_path"] = image_resized_destination

        _ensure_dir(coco_dir)
        with open(os.path.join(coco_dir, 'coco_body_parts_' + json_file), 'w') as f:
            json.dump(annotations, f)

        # step 3 - rescale the annotations to match the resized images
        annotation_count = len(annotations_resized['annotations'])
        for (j, ann) in enumerate(annotations_resized['annotations']):
            if j % 50 == 0:
                print('Annotation {} out of {} resized'.format(j, annotation_count))
            _scale_annotation(ann, xfactor, yfactor)

        _ensure_dir(coco_resized_dir)
        with open(os.path.join(coco_resized_dir, 'coco_body_parts_' + json_file), 'w') as f:
            json.dump(annotations_resized, f)
