In [None]:
import argparse
import json
import glob
from multiprocessing import Pool
import os
import shutil

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import cv2
from PIL import Image, ImageDraw
from urllib.parse import urlparse
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils

In [None]:
# Shared data-access handles and output location used by the cells below.
# NOTE(review): '/root/data' is presumably the local root S3AccessUtils works
# under -- confirm against research.utils.data_access_utils.
s3_access_utils = S3AccessUtils('/root/data')

# Read the warehouse credentials inside a `with` block so the file handle is
# closed deterministically instead of being left to the garbage collector.
# The path comes from the DATA_WAREHOUSE_SQL_CREDENTIALS environment variable;
# a missing variable raises KeyError here.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as _credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(_credentials_file))

# Local directory (relative to the working directory) that receives the CSV.
OUTPUT_BASE_DIR = 'generated_csv'

In [None]:
def _refresh_directory(dirname):
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.makedirs(dirname)

In [None]:
def _get_bucket_key(url):
    parsed_url = urlparse(url, allow_fragments=False)
    if parsed_url.netloc.startswith('s3'):
        url_components = parsed_url.path.lstrip('/').split('/')
        bucket, key = url_components[0], os.path.join(*url_components[1:])
    else:
        bucket = parsed_url.netloc.split('.')[0]
        key = parsed_url.path.lstrip('/')
    return bucket, key


def _captured_in_hour_range(key, start_hour, end_hour):
    hour = int([component for component in key.split('/') if component.startswith('hour=')][0].split('=')[-1])
    return start_hour <= hour <= end_hour



def extract_s3_keys(pen_id, date, start_hour, end_hour, inbound_bucket='aquabyte-frames-resized-inbound'):
    """List the S3 "directories" holding captures for a pen, date and hour range.

    One crop annotation row for the pen/date is fetched from the warehouse.
    The part of its ``left_crop_url`` key that precedes ``date`` is reused as
    the prefix under which ``capture.json`` objects are listed in
    ``inbound_bucket``.

    Args:
        pen_id: Pen identifier, interpolated into the SQL ``WHERE`` clause.
        date: Date string in ``YYYY-MM-DD`` form (compared against
            ``to_char(captured_at, 'YYYY-MM-DD')``).
        start_hour: First hour to keep (inclusive).
        end_hour: Last hour to keep (inclusive).
        inbound_bucket: Bucket that is listed for ``capture.json`` objects.

    Returns:
        Sorted list of unique parent "directories" of the matching
        ``capture.json`` keys.

    Raises:
        IndexError: If the warehouse returns no row for ``pen_id``/``date``.
        ValueError: If the sample key does not contain ``date``.
    """
    # NOTE(review): pen_id and date are formatted straight into the SQL text.
    # Only pass trusted values; switch to bound parameters if
    # extract_from_database supports them.
    query = """
        SELECT captured_at, left_crop_url
        FROM prod.crop_annotation ca
        WHERE ca.pen_id={} AND ca.service_id = 2
        AND to_char(ca.captured_at, 'YYYY-MM-DD') IN ('{}')
        LIMIT 1;
    """.format(pen_id, date)

    df = rds_access_utils.extract_from_database(query)
    if len(df) == 0:
        # Same exception type the bare `.iloc[0]` used to raise, but with a
        # message that says what is actually missing.
        raise IndexError(
            'No crop annotation found for pen_id={} on date={}'.format(pen_id, date)
        )

    image_url = df.left_crop_url.iloc[0]
    # Only the key is needed; the listing is done in `inbound_bucket`.
    _, sample_key = _get_bucket_key(image_url)
    s3_folder = os.path.join(sample_key[:sample_key.index('date')], 'date={}'.format(date))

    candidate_keys = s3_access_utils.get_matching_s3_keys(inbound_bucket, s3_folder, suffixes=['capture.json'])
    # A set comprehension de-duplicates the parent directories before sorting.
    s3_key_dirs = sorted({
        os.path.dirname(candidate)
        for candidate in candidate_keys
        if _captured_in_hour_range(candidate, start_hour, end_hour)
    })
    return s3_key_dirs

In [None]:

def get_resize_url(s3_key_dir, base_url="https://aquabyte-frames-resized-inbound.s3-eu-west-1.amazonaws.com/"):
    """Build the URLs of the resized frames and crop metadata in a key directory.

    The URLs are assembled by plain string concatenation rather than
    ``os.path.join``: that function discards ``base_url`` whenever
    ``s3_key_dir`` starts with ``/`` and uses the OS-specific separator.

    Args:
        s3_key_dir: Key prefix ("directory") that contains the frame files.
            It is appended to ``base_url`` verbatim.
        base_url: URL the key prefix is appended to. A trailing ``/`` is
            added when missing.

    Returns:
        Tuple ``(left_frame_url, right_frame_url, crop_metadata_url)``.
    """
    prefix = base_url if base_url.endswith('/') else base_url + '/'
    prefix += s3_key_dir
    # Separate the directory from the file name, unless the directory is
    # empty or already ends with a slash.
    if s3_key_dir and not s3_key_dir.endswith('/'):
        prefix += '/'

    left_f = prefix + 'left_frame.resize_512_512.jpg'
    right_f = prefix + 'right_frame.resize_512_512.jpg'
    crop_metadata_f = prefix + 'crops.json'

    return left_f, right_f, crop_metadata_f


In [None]:
def generate_dataframe(pen_id, date, start_hour, end_hour, has_algae, num_processes=12):
    """Build a frame of resized-frame URLs for one pen, date and hour range.

    Side effect: ``OUTPUT_BASE_DIR`` is wiped and recreated on every call.

    Args:
        pen_id: Pen identifier, forwarded to ``extract_s3_keys`` and stored
            in the ``pen_id`` column.
        date: Date string, forwarded to ``extract_s3_keys`` and stored in the
            ``date`` column.
        start_hour: First hour to include (inclusive).
        end_hour: Last hour to include (inclusive).
        has_algae: Label stored unchanged in the ``has_algae`` column of
            every row.
        num_processes: Number of worker processes used to build the URLs.

    Returns:
        DataFrame with columns ``left_frame_resized_url``,
        ``right_frame_resized_url``, ``crop_metadata_url``, ``base_key``,
        ``pen_id``, ``date`` and ``has_algae``; one row per S3 key directory.
    """
    _refresh_directory(OUTPUT_BASE_DIR)
    # extract s3 keys
    print('Extracting s3 keys...')
    s3_key_dirs = extract_s3_keys(pen_id, date, start_hour, end_hour)
    print('S3 keys extraction complete!')

    print('extract s3 keys..')
    # Use the pool as a context manager so its worker processes are shut down
    # once the map has finished; otherwise they linger after every call.
    with Pool(num_processes) as pool:
        results = pool.map(get_resize_url, s3_key_dirs)
    print('s3 keys complete!')

    print("convert to dataframe")
    df = pd.DataFrame(results, columns=['left_frame_resized_url', 'right_frame_resized_url', 'crop_metadata_url'])
    # pool.map preserves input order, so row i corresponds to s3_key_dirs[i].
    df['base_key'] = np.array(s3_key_dirs)
    df['pen_id'] = pen_id
    df['date'] = date
    df['has_algae'] = has_algae
    return df

In [None]:
# Frames for pen 56 on 2020-04-22, hours 10-12, labelled has_algae=False.
# This assignment must run: later cells read df_2020_04_22, and with the line
# commented out a fresh-kernel "Run All" fails with NameError.
df_2020_04_22 = generate_dataframe(56, "2020-04-22", 10, 12, False, num_processes=12)

In [None]:
# Sanity check: (rows, columns) of df_2020_04_22, which must already exist in the kernel.
df_2020_04_22.shape

In [None]:
# Preview the first three rows of df_2020_04_22.
df_2020_04_22.head(3)

In [None]:
# Spot-check one right-frame URL (fifth row by position; needs at least five rows).
df_2020_04_22.right_frame_resized_url.iloc[4]

In [None]:
# Frames for pen 56 on 2020-05-04, hours 10-12, labelled has_algae=True.
# This assignment must run: later cells read df_2020_05_04, and with the line
# commented out a fresh-kernel "Run All" fails with NameError.
df_2020_05_04 = generate_dataframe(56, "2020-05-04", 10, 12, True, num_processes=12)

In [None]:
# Sanity check: (rows, columns) of df_2020_05_04, which must already exist in the kernel.
df_2020_05_04.shape

In [None]:
# Stack both per-date frames into one. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each input's own row labels.
output = pd.concat([df_2020_04_22, df_2020_05_04], ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the combined frame.
output.shape

In [None]:
# S3 bucket that receives the generated CSV.
output_bucket = 'aquabyte-images-adhoc'

In [None]:
# Write the combined frame to a local CSV and upload it to S3.
output_pen_id = 56  # pen id used in the CSV file name
_refresh_directory(OUTPUT_BASE_DIR)  # start from an empty local output directory
output_path = os.path.join(OUTPUT_BASE_DIR, 'pen_id_{}.csv'.format(output_pen_id))
output.to_csv(output_path, index=False)
# S3 keys always use '/' as the separator, so join explicitly rather than with
# os.path.join, whose separator depends on the operating system.
output_key = '/'.join(["water_turbidity/algae_binary_adhoc", os.path.basename(output_path)])
s3_access_utils.s3_client.upload_file(output_path, output_bucket, output_key)