In [None]:
import glob
import json
import numpy as np
import os

import boto3
from matplotlib import pyplot as plt
import pandas as pd
from PIL import Image
from pycocotools.coco import COCO
from skimage.io import imread

%matplotlib inline

# Raise pandas' display limits (total width, column count, per-cell width)
# to 500 so wide frames are rendered with less truncation.
for option_name in ('display.width', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, 500)


<h1> Load raw database dump </h1>

In [None]:
# historical_annotations_df = pd.read_csv('/root/data/alok/crop_data/jan_week_cogito_annotations.csv')
# Read the reconciled annotation CSV into a DataFrame; every later cell that
# samples or inspects annotations works from this frame.
reconciled_csv_path = '/root/data/alok/crop_data/jan_23_reconciled.csv'
historical_reconciled_annotations_df = pd.read_csv(reconciled_csv_path)

In [None]:
# Load the AWS key pair from the JSON file whose path is stored in the
# AWS_CREDENTIALS environment variable. Using a context manager closes the
# file handle as soon as parsing finishes instead of leaving it to the
# garbage collector.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)

# S3 client used by the download code further down the notebook.
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_credentials['aws_access_key_id'],
    aws_secret_access_key=aws_credentials['aws_secret_access_key']
)


In [None]:
# Show (row count, column count) of the loaded annotation table.
historical_reconciled_annotations_df.shape

<h1> Dataset Construction - Write crops, images, and metadata to disk </h1>

In [None]:
def extract_sample_from_dataset(df, N, crop_bucket, crop_dir, image_bucket, image_dir, metadata_dir,
                                s3=None, random_state=None):
    """Download a random sample of crops, their source images, and row metadata.

    For every sampled row of ``df`` up to three files are written; any file
    that already exists on disk is left untouched:

    * the crop, downloaded from ``crop_bucket`` under the key
      ``row.image_key`` into ``crop_dir``;
    * the source image, downloaded from ``image_bucket`` into ``image_dir``.
      Its file name is the crop's file name with the last four
      underscore-separated fields removed (plus ``.jpg``), and its key sits
      under the same key prefix as the crop;
    * the row itself, serialized as JSON into ``metadata_dir`` under the
      crop's file name with ``.jpg`` replaced by ``.json``.

    The zero-based position within the sample is printed every 10 rows.

    Args:
        df: DataFrame with an ``image_key`` column holding each crop's S3 key.
        N: Number of rows to sample. Values larger than ``len(df)`` are
            clamped to ``len(df)`` instead of raising ``ValueError``.
        crop_bucket: Bucket the crops are downloaded from.
        crop_dir: Local directory receiving the crops.
        image_bucket: Bucket the source images are downloaded from.
        image_dir: Local directory receiving the source images.
        metadata_dir: Local directory receiving the per-crop JSON files.
        s3: Optional object exposing ``download_file(bucket, key, filename)``.
            When omitted, the notebook-level ``s3_client`` is used.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same sample can be drawn again.

    Returns:
        None. The function is used for its effect on the local file system.
    """
    client = s3_client if s3 is None else s3

    # Downloads and open() both fail when the target directory is missing,
    # so create the output directories up front.
    for out_dir in (crop_dir, image_dir, metadata_dir):
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    # DataFrame.sample (without replacement) raises when asked for more rows
    # than the frame holds; clamp so small frames are simply taken whole.
    sample_df = df.sample(min(N, len(df)), random_state=random_state)

    for i, (_, row) in enumerate(sample_df.iterrows()):
        # write crop to disk
        crop_key = row.image_key
        crop_f_name = os.path.basename(crop_key)
        crop_f = os.path.join(crop_dir, crop_f_name)
        if not os.path.exists(crop_f):
            client.download_file(crop_bucket, crop_key, crop_f)

        # write image to disk
        image_f_name = '_'.join(crop_f_name.split('_')[:-4]) + '.jpg'
        image_key = os.path.join(os.path.dirname(crop_key), image_f_name)
        image_f = os.path.join(image_dir, image_f_name)
        if not os.path.exists(image_f):
            client.download_file(image_bucket, image_key, image_f)

        # write metadata to disk
        metadata_f = os.path.join(metadata_dir, crop_f_name.replace('.jpg', '.json'))
        if not os.path.exists(metadata_f):
            # Serialize before opening the file: if a value cannot be encoded,
            # no empty file is left behind for later runs to skip over.
            payload = json.dumps(dict(row))
            with open(metadata_f, 'w') as f:
                f.write(payload)

        if i % 10 == 0:
            print(i)


In [None]:
# How many rows are drawn at random from a dataset per extraction run.
N = 146

# Source buckets: one for the crops, one for the images they were cut from.
crop_bucket, image_bucket = 'aquabyte-crops', 'aquabyte-images-raw'

# Local output directories for crops, full images and per-crop metadata.
crop_dir, image_dir, metadata_dir = (
    '/root/data/alok/crop_data/crops/',
    '/root/data/alok/crop_data/images/',
    '/root/data/alok/crop_data/crop_metadata/',
)

In [None]:


# extract_sample_from_dataset(
#     rdf, 
#     10, 
#     crop_bucket, 
#     crop_dir, 
#     image_bucket, 
#     image_dir, 
#     metadata_dir
# )


# Draw N rows from the reconciled annotations and write their crops, source
# images and metadata to the configured local directories.
extract_sample_from_dataset(
    df=historical_reconciled_annotations_df,
    N=N,
    crop_bucket=crop_bucket,
    crop_dir=crop_dir,
    image_bucket=image_bucket,
    image_dir=image_dir,
    metadata_dir=metadata_dir,
)

# extract_sample_from_dataset(
#     historical_annotations_df, 
#     N, 
#     crop_bucket, 
#     crop_dir, 
#     image_bucket, 
#     image_dir, 
#     metadata_dir
# )



<h1> Extract luminance information </h1>

In [None]:
# Bare COCO object with no annotation file; later cells call only
# coco.annToMask on it.
coco = COCO()

# Single image record at position 0, 3000 high by 4096 wide. The annotations
# built later in this notebook all use image_id 0, which selects this entry.
# NOTE(review): presumably annToMask takes the mask size from this record --
# confirm against pycocotools, and confirm every image really is 3000x4096.
coco.imgs = [dict(height=3000, width=4096)]

In [None]:
# List what is currently on disk: every .jpg in the crop directory and every
# .json in the metadata directory.
crop_pattern = os.path.join(crop_dir, '*.jpg')
metadata_pattern = os.path.join(metadata_dir, '*.json')
crop_fs = glob.glob(crop_pattern)
metadata_fs = glob.glob(metadata_pattern)

In [None]:
# Build (or extend) the analysis table: one row per crop, holding the crop's
# metadata plus the mean grayscale value of the source image inside the
# annotated segmentation.
analysis_df_path, key = '/root/data/alok/crop_data/analysis_df.h5', 'df'

# Resume from the cached table when it exists; on a first run start from an
# empty frame instead of failing inside read_hdf.
if os.path.exists(analysis_df_path):
    analysis_df = pd.read_hdf(analysis_df_path, key=key)
else:
    analysis_df = pd.DataFrame()

# Crops that already have a row are skipped. Building the lookup once as a
# set replaces a full `.tolist()` scan on every iteration.
if 'crop_path' in analysis_df.columns:
    processed_crops = set(analysis_df['crop_path'])
else:
    processed_crops = set()

# Pair each crop with its metadata through the shared file stem rather than
# by sorted position, so one missing or extra file cannot shift every later
# pair.
metadata_by_stem = {
    os.path.basename(path).replace('.json', ''): path for path in metadata_fs
}

new_rows = []
for crop_f in sorted(crop_fs):
    if crop_f in processed_crops:
        continue
    crop_stem = os.path.basename(crop_f).replace('.jpg', '')
    assert crop_stem in metadata_by_stem, 'Mismatch!'
    metadata_f = metadata_by_stem[crop_stem]

    with open(metadata_f) as metadata_file:
        metadata = json.load(metadata_file)

    # get luminance value
    # The source image name is the crop name minus its last four
    # underscore-separated fields.
    image_f_name = '_'.join(os.path.basename(crop_f).split('_')[:-4]) + '.jpg'
    image_f = os.path.join(image_dir, image_f_name)
    im = Image.open(image_f).convert('L')
    # metadata['segmentation'] is itself a JSON string wrapping a
    # 'segmentation' entry; image_id 0 selects the single record in coco.imgs.
    ann = {
        'image_id': 0,
        'segmentation': json.loads(metadata['segmentation'])['segmentation']
    }
    segmentation_mask = coco.annToMask(ann)
    # Average only the pixels where the mask is set. An all-zero mask yields
    # NaN here.
    mean_luminance = np.mean(np.array(im)[np.where(segmentation_mask > 0)])

    metadata['image_path'] = image_f
    metadata['crop_path'] = crop_f
    metadata['mean_luminance'] = mean_luminance

    # Progress: number of crops finished before this one, every 10th crop.
    if len(new_rows) % 10 == 0:
        print(len(new_rows))
    new_rows.append(metadata)

# Add all new rows in one step; growing the frame row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.0.
if new_rows:
    new_rows_df = pd.DataFrame(new_rows)
    if analysis_df.empty:
        analysis_df = new_rows_df
    else:
        analysis_df = pd.concat([analysis_df, new_rows_df], ignore_index=True)
    
    
    
        

In [None]:
# Ratio of the image_width_px column to the image_height_px column, per row.
analysis_df['aspect_ratio'] = analysis_df['image_width_px'].div(analysis_df['image_height_px'])

In [None]:
# Persist the analysis table so later runs can resume from it. `key` is passed
# by keyword: newer pandas releases deprecate passing it positionally.
analysis_df.to_hdf('/root/data/alok/crop_data/analysis_df.h5', key='df')

In [None]:
# Keep rows that were not skipped and were created by one specific account,
# then show them from lowest to highest mean luminance.
not_skipped = analysis_df['is_skipped'].eq(False)
from_selected_account = analysis_df['created_by'].eq('gunnar@aquabyte.ai')
mask = not_skipped & from_selected_account
analysis_df[mask].sort_values(by='mean_luminance')





In [None]:
# Save the analysis table again to the same HDF5 file. `key` is passed by
# keyword: newer pandas releases deprecate passing it positionally.
analysis_df.to_hdf('/root/data/alok/crop_data/analysis_df.h5', key='df')

In [None]:
# Show the whole analysis table ordered from smallest to largest aspect ratio.
analysis_df.sort_values(by='aspect_ratio')

In [None]:
# Pick one row of the filtered table, ranked from highest to lowest mean
# luminance, and rebuild its grayscale image and segmentation mask.
i = 328
ranked = analysis_df[mask].sort_values('mean_luminance', ascending=False)

image_f = ranked.image_path.iloc[i]
im = Image.open(image_f).convert('L')

# The segmentation column holds a JSON string wrapping a 'segmentation' entry.
segmentation = json.loads(ranked.segmentation.iloc[i])['segmentation']
ann = dict(image_id=0, segmentation=segmentation)
m = coco.annToMask(ann)



In [None]:
# Display the annotation table that was read from the reconciled CSV.
historical_reconciled_annotations_df

In [None]:
# Multiply the grayscale image by the mask `m`, so pixels where the mask is 0
# become 0, and render the result as an image.
masked_pixels = np.multiply(np.array(im), m)
Image.fromarray(masked_pixels)