In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer
import random
from scipy.stats import norm
from PIL import Image, ImageDraw
from urllib.parse import urlparse
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
import random

# Raise pandas' display limits so that up to 500 rows and up to 500 characters
# per cell are rendered before truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

<h1> Get lice annotation data across all pens </h1>

In [None]:
# S3 helper used by every download in this notebook. '/root/data' is its
# constructor argument (presumably the local download root - confirm against
# S3AccessUtils).
s3_access_utils = S3AccessUtils('/root/data')

In [None]:
# Build the database helper from the JSON credentials file whose path is given
# by the PROD_SQL_CREDENTIALS environment variable. The file is read inside a
# `with` block so the handle is closed deterministically instead of leaking.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))


In [None]:
def get_bucket_key(url):
    """Split an S3 HTTP(S) URL into ``(bucket, key)``.

    Two URL layouts are recognised:

    * host starts with ``s3`` (path-style): the first path segment is the
      bucket and the remainder of the path is the key;
    * anything else (virtual-hosted style): the first dot-separated label of
      the host is the bucket and the whole path is the key.

    The key is taken verbatim from the URL path (always '/'-separated), so a
    path-style URL that names only a bucket yields an empty key instead of
    raising.

    Args:
        url: URL string to split.

    Returns:
        Tuple ``(bucket, key)`` of strings.
    """
    parsed_url = urlparse(url, allow_fragments=False)
    path = parsed_url.path.lstrip('/')
    if parsed_url.netloc.startswith('s3'):
        # NOTE(review): a virtual-hosted bucket whose name itself starts with
        # 's3' would be routed into this branch - confirm this cannot occur.
        bucket, _, key = path.partition('/')
    else:
        bucket = parsed_url.netloc.split('.')[0]
        key = path
    return bucket, key

def get_reconciled_df(pen_id, start_date, end_date):
    """Fetch a random sample of reconciled lice annotations for one pen.

    Queries ``lati_fish_detections_lice_annotations_reconciled`` for rows of
    ``pen_id`` captured between ``start_date`` and ``end_date`` with
    ``adult_female_count >= 0``, randomly ordered and limited to 800 rows.
    Rows flagged ``is_skipped`` are then removed and the
    ``lati_fish_detections_lice_annotations_id`` column is dropped.

    NOTE(review): the SQL text is assembled with ``str.format``; only pass
    trusted values.
    """
    sql_template = """
        select * from lati_fish_detections_lice_annotations_reconciled where pen_id={} 
        and captured_at between '{}' and '{}' and adult_female_count >= 0
        order by random() limit 800;
    """
    sql = sql_template.format(pen_id, start_date, end_date)
    annotations = rds_access_utils.extract_from_database(sql)
    kept = annotations[annotations.is_skipped == False]
    return kept.drop(columns=['lati_fish_detections_lice_annotations_id'])


def get_blurry_df(pen_id, start_date, end_date):
    """Fetch a random sample of annotations skipped as blurry for one pen.

    Selects rows of ``lati_fish_detections_lice_annotations`` for ``pen_id``
    captured between ``start_date`` and ``end_date`` where
    ``is_skipped != False``, ``is_blurry = True`` and ``is_bad_crop = False``
    (random order, at most 800 rows). The ``completed_at`` column is dropped
    from the result.

    NOTE(review): the SQL text is assembled with ``str.format``; only pass
    trusted values.
    """
    sql = """
        select * from lati_fish_detections_lice_annotations where pen_id={} 
        and captured_at between '{}' and '{}'
        and is_skipped != False and is_blurry = True and is_bad_crop = False
        order by random() limit 800;
    """.format(pen_id, start_date, end_date)
    return rds_access_utils.extract_from_database(sql).drop(columns=['completed_at'])


def get_dark_df(pen_id, start_date, end_date):
    """Fetch a random sample of annotations skipped as too dark for one pen.

    Selects rows of ``lati_fish_detections_lice_annotations`` for ``pen_id``
    captured between ``start_date`` and ``end_date`` where
    ``is_skipped != False``, ``is_too_dark = True`` and ``is_bad_crop = False``
    (random order, at most 800 rows). The ``completed_at`` column is dropped
    from the result.

    NOTE(review): the SQL text is assembled with ``str.format``; only pass
    trusted values.
    """
    sql = """
        select * from lati_fish_detections_lice_annotations where pen_id={} 
        and captured_at between '{}' and '{}'
        and is_skipped != False and is_too_dark = True and is_bad_crop = False 
        order by random() limit 800;
    """.format(pen_id, start_date, end_date)
    too_dark = rds_access_utils.extract_from_database(sql)
    return too_dark.drop(columns=['completed_at'])

def get_bad_crop_df(pen_id, start_date, end_date):
    """Fetch a random sample of annotations skipped as bad crops for one pen.

    Selects rows of ``lati_fish_detections_lice_annotations`` for ``pen_id``
    captured between ``start_date`` and ``end_date`` where
    ``is_skipped != False`` and ``is_bad_crop = True`` (random order, at most
    800 rows). The ``completed_at`` column is dropped from the result.

    NOTE(review): the SQL text is assembled with ``str.format``; only pass
    trusted values.
    """
    sql = """
        select * from lati_fish_detections_lice_annotations where pen_id={} 
        and captured_at between '{}' and '{}'
        and is_skipped != False and is_bad_crop = True  
        order by random() limit 800;
    """.format(pen_id, start_date, end_date)
    bad_crops = rds_access_utils.extract_from_database(sql)
    return bad_crops.drop(columns=['completed_at'])

def get_cleaner_fish_df(pen_id, start_date, end_date):
    """Fetch a random sample of annotations flagged as cleaner fish for one pen.

    Selects rows of ``lati_fish_detections_lice_annotations`` for ``pen_id``
    captured between ``start_date`` and ``end_date`` where
    ``is_skipped != False`` and ``is_cleaner_fish = True`` (random order, at
    most 800 rows). The ``completed_at`` column is dropped from the result.

    NOTE(review): the SQL text is assembled with ``str.format``; only pass
    trusted values.
    """
    sql = """
        select * from lati_fish_detections_lice_annotations where pen_id={} 
        and captured_at between '{}' and '{}'
        and is_skipped != False and is_cleaner_fish = True  
        order by random() limit 800;
    """.format(pen_id, start_date, end_date)
    cleaner_fish = rds_access_utils.extract_from_database(sql)
    return cleaner_fish.drop(columns=['completed_at'])


def process_data_for_pen_id(pen_id, start_date, end_date, df_dict, data_root):
    """Collect cleaner-fish annotations for one pen and download their full frames.

    For every row returned by ``get_cleaner_fish_df`` the crop's ``image_url``
    is turned into the key of the matching resized full frame
    (``left_frame.resize_512_512.jpg`` in the same key directory, bucket
    ``aquabyte-frames-resized-inbound``), the frame is downloaded, and the row
    plus three extra fields (``full_image_bucket``, ``full_image_key``,
    ``full_image_f``) are appended column-wise to ``df_dict``.

    Earlier revisions ran this same loop over the reconciled, blurry, dark and
    bad-crop frames as well (see ``get_reconciled_df``, ``get_blurry_df``,
    ``get_dark_df``, ``get_bad_crop_df``); those disabled copies were removed
    from the body.

    Args:
        pen_id: pen to query.
        start_date, end_date: capture window, interpolated into the SQL query.
        df_dict: mapping of column name -> list that is appended to in place;
            missing keys must be created on access (the notebook passes a
            ``defaultdict(list)``).
        data_root: directory that is created if missing. Nothing is written to
            it by this version; the download location is whatever
            ``download_from_s3`` returns.

    Returns:
        None. Progress is printed every 100 rows.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(data_root, exist_ok=True)

    print(pen_id)
    print('---'*10)

    cleaner_fish_df = get_cleaner_fish_df(pen_id, start_date, end_date)

    full_image_bucket = 'aquabyte-frames-resized-inbound'
    for count, (_, row) in enumerate(cleaner_fish_df.iterrows()):
        # Only the key of the crop URL is needed; the frame lives in a fixed bucket.
        _, key = get_bucket_key(row.image_url)
        # NOTE(review): replaces every occurrence of 'dev' in the key, not just
        # an environment path segment - confirm keys cannot contain 'dev' elsewhere.
        key = key.replace('dev', 'environment=production')
        full_image_key = os.path.join(os.path.dirname(key), 'left_frame.resize_512_512.jpg')
        full_image_f = s3_access_utils.download_from_s3(full_image_bucket, full_image_key)

        df_dict['full_image_bucket'].append(full_image_bucket)
        df_dict['full_image_key'].append(full_image_key)
        df_dict['full_image_f'].append(full_image_f)
        for k, v in dict(row).items():
            df_dict[k].append(v)

        if count % 100 == 0:
            print(count)

    print('Cleaner fish df complete!')
            

 

In [None]:
# enumerate all valid pens
# Pen ids and the capture-date window that are passed to
# process_data_for_pen_id in the collection loop below.
pen_ids = [56, 57, 58, 59, 60, 37, 65]
start_date, end_date = '2019-11-05', '2020-01-01'

In [None]:
# Seed Python's and NumPy's global RNGs.
# NOTE(review): the row sampling itself happens in SQL (`order by random()`),
# which these seeds presumably do not control - the sampled rows may differ
# between runs.
random.seed(0)
np.random.seed(0)

# Column-wise accumulator shared by all pens; process_data_for_pen_id appends to it in place.
df_dict = defaultdict(list)
data_root = '/root/data/alok/yolo_multiclassifier_data_cleaner_fish'
# NOTE: `pen_id` stays bound to the last pen after this loop and is read again
# by a later query cell.
for pen_id in pen_ids:
    process_data_for_pen_id(pen_id, start_date, end_date, df_dict, data_root)


In [None]:
# Turn the column-wise accumulator into a DataFrame (one list per column).
df = pd.DataFrame(df_dict)

In [None]:
# Persist the collected annotations (the DataFrame index is written as the first CSV column).
df.to_csv('/root/data/alok/yolo_multiclass_data.csv')

In [None]:
# NOTE(review): this reads 'yolo_multiclass_data_cleaner_fish.csv', which is not
# the file written by the preceding to_csv cell ('yolo_multiclass_data.csv') -
# confirm which file is intended.
pdf = pd.read_csv('/root/data/alok/yolo_multiclass_data_cleaner_fish.csv')

In [None]:
# Display the frame loaded from CSV (rendering is capped by display.max_rows).
pdf

In [None]:
# Writes the current `df` to a second location.
# NOTE(review): `df` is unchanged since the earlier to_csv cell, so this
# duplicates 'yolo_multiclass_data.csv' under a new name - confirm it is needed.
df.to_csv('/root/data/alok/yolo_training_data.csv')

In [None]:
# Rebinds `df` to a different dataset; the frame collected above is no longer
# reachable under this name from here on.
df = pd.read_csv('/root/data/alok/cropper_experiment_data.csv')

In [None]:
# Download the crop behind every `image_url` and store the value returned by
# download_from_s3 (used here as the local file path) in a new `crop_image_f`
# column. Each URL is printed as it is processed.
image_fs = []
for image_url in df.image_url:
    bucket, key = get_bucket_key(image_url)
    print(image_url)
    image_f = s3_access_utils.download_from_s3(bucket, key)
    image_fs.append(image_f)

df['crop_image_f'] = image_fs

In [None]:
# Save the frame, now including the `crop_image_f` column, under a new file name.
df.to_csv('/root/data/alok/cropper_experiment_data_v2.csv')

In [None]:
# Pull up to 2000 raw annotation rows for a single pen.
# NOTE(review): `pen_id` is not assigned in this cell; it is whatever the last
# `for pen_id in pen_ids` loop left behind (hidden state) - set it explicitly
# if a specific pen is intended.
query = 'select * from lati_fish_detections_lice_annotations where pen_id={} limit 2000;'.format(pen_id)
cogito_df = rds_access_utils.extract_from_database(query)

In [None]:
# Print the bucket parsed from the first 100 image URLs. Iterating over
# `.head(100)` instead of indexing `iloc[i]` for i < 100 keeps the cell from
# raising IndexError when the query returned fewer than 100 rows.
for image_url in cogito_df.image_url.head(100):
    print(get_bucket_key(image_url)[0])

In [None]:
# NOTE(review): re-creates the helper with the same argument as the cell near
# the top of the notebook; this looks redundant unless the kernel was restarted
# part-way through.
s3_access_utils = S3AccessUtils('/root/data')

In [None]:
# Assumed fish dimensions in metres and the focal length used to convert a
# pixel length into a depth estimate (FOCAL_LENGTH is presumably in pixels -
# confirm against the camera calibration).
FISH_WIDTH_M = 0.065
FISH_LENGTH_M = 0.294
FOCAL_LENGTH = 4015

def depth_fn(x):
    """Estimate the fish length in pixels from a crop's width and height.

    Despite the name, the return value is a pixel length, not a depth; the
    caller converts it to a depth.

    The angle of the crop diagonal (``theta``) is compared with the diagonal
    angle of a box of the assumed fish size (``phi``):

    * ``theta < phi``: the crop is flatter than the fish box, return the width;
    * ``theta > 90 - phi``: the crop is taller than the rotated box, return the height;
    * otherwise return the crop diagonal.

    Args:
        x: mapping with numeric ``'width'`` and ``'height'`` entries.

    Returns:
        ``width``, ``height`` or ``sqrt(height**2 + width**2)``.
    """
    w, h = x['width'], x['height']
    # arctan2 gives the same angle as arctan(h / w) for positive sizes but is
    # also defined for w == 0 (90 degrees) instead of raising ZeroDivisionError.
    theta = np.arctan2(h, w) * (180.0 / np.pi)
    phi = np.arctan(FISH_WIDTH_M / FISH_LENGTH_M) * (180.0 / np.pi)
    if theta < phi:
        return w
    elif theta > 90.0 - phi:
        return h
    else:
        return (h**2 + w**2)**0.5

def process_data_df(df):
    """Drop cleaner-fish rows and derive image-size and depth columns.

    Rows whose ``is_cleaner_fish`` is True are removed; the remaining rows get:

    * ``image_width`` / ``image_height``: ``metadata['width']`` / ``metadata['height']``;
    * ``length_px``: ``depth_fn(metadata)``;
    * ``single_image_depth_m``: ``FOCAL_LENGTH * FISH_LENGTH_M / length_px``;
    * ``stereo_depth_m``: ``metadata.get('depth_m')`` (missing when absent).

    ``metadata`` values may be mappings or their text form (JSON or a Python
    dict literal), as happens once a frame has been written to CSV and read
    back. The ``metadata`` column itself is left untouched.

    Args:
        df: frame with ``is_cleaner_fish`` and ``metadata`` columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    import ast

    def _as_mapping(metadata):
        # Parse text metadata back into a dict; pass anything else through as-is.
        if isinstance(metadata, str):
            try:
                return json.loads(metadata)
            except ValueError:
                return ast.literal_eval(metadata)
        return metadata

    # Explicit copy: adding columns to a filtered slice triggers pandas'
    # SettingWithCopyWarning and is ambiguous about what gets modified.
    processed = df[df.is_cleaner_fish != True].copy()
    metadata = processed.metadata.apply(_as_mapping)
    processed['image_width'] = metadata.apply(lambda m: m['width'])
    processed['image_height'] = metadata.apply(lambda m: m['height'])
    processed['length_px'] = metadata.apply(depth_fn)
    processed['single_image_depth_m'] = FOCAL_LENGTH * FISH_LENGTH_M / processed.length_px
    processed['stereo_depth_m'] = metadata.apply(lambda m: m.get('depth_m'))
    return processed

# Replace `df` with its processed version (cleaner-fish rows removed, depth columns added).
df = process_data_df(df)


In [None]:
# Cache the processed frame on disk.
df.to_csv('/root/data/alok/aggregate_vikane_df.csv')

In [None]:
# Reload the cached frame. After the CSV round trip column dtypes are
# re-inferred by pandas (presumably nested values such as `metadata` come back
# as text - verify before indexing into them).
df = pd.read_csv('/root/data/alok/aggregate_vikane_df.csv')

In [None]:
# Stereo-depth window (metres) counted as "in focus". The summary text below is
# formatted from these bounds so the printed range can no longer disagree with
# the mask (it previously said "45 cm - 55 cm" while the mask used 0.8 - 1.0 m).
IN_FOCUS_MIN_M, IN_FOCUS_MAX_M = 0.8, 1.0

in_focus_mask = (df.stereo_depth_m > IN_FOCUS_MIN_M) & (df.stereo_depth_m < IN_FOCUS_MAX_M)
# Compare with False instead of applying `~`: `df` was reloaded from CSV, and on
# an object-dtype column `~` is not a reliable logical NOT (it fails on missing
# values). With this form a missing `is_skipped` counts as "not accepted".
accept_mask = df.is_skipped == False
skip_masks = {}
# NOTE(review): 'is_accepted_in_qa' is not a skip reason by name but is treated
# as one wherever `skip_reasons` / `skip_masks` are used - confirm this is intended.
skip_reasons = [
    'is_accepted_in_qa', 
    'is_blurry', 
    'is_bad_crop', 
    'is_too_dark', 
    'is_bad_crop_many_fish', 
    'is_bad_orientation', 
    'is_bad_crop_cut_off', 
    'is_obstructed'
]
for skip_reason in skip_reasons:
    skip_masks[skip_reason] = df[skip_reason] == True

# Counting True values in a mask equals the row count of the filtered frame.
n = df.shape[0]
n_in_focus = int(in_focus_mask.sum())
n_in_focus_accepted = int((in_focus_mask & accept_mask).sum())
n_not_in_focus_accepted = int((~in_focus_mask & accept_mask).sum())


# NOTE(review): both "accepted" counts use the same `accept_mask` although one
# line says "in QA" and the other "by Cogito" - confirm the wording.
print('Total number of images inspected by Cogito over the weekend: {}'.format(n))
print('Total number of these images within in-focus range ({:.0f} cm - {:.0f} cm): {}'.format(
    IN_FOCUS_MIN_M * 100, IN_FOCUS_MAX_M * 100, n_in_focus))
print('Total number of in-focus images accepted in QA: {}'.format(n_in_focus_accepted))
print('Total number of not-in-focus images accepted by Cogito: {}'.format(n_not_in_focus_accepted))

In [None]:
# Histogram (20 bins) of stereo depths strictly between 0 and 2 metres.
valid_depth_mask = (df.stereo_depth_m > 0.0) & (df.stereo_depth_m < 2.0)
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(df.loc[valid_depth_mask, 'stereo_depth_m'], bins=20)
plt.show()

In [None]:
# Inspect a random sample of skipped rows. The sample size is capped at the
# number of skipped rows because DataFrame.sample raises ValueError when asked
# for more rows than exist (without replacement).
skipped_df = df[df.is_skipped == True]
skipped_df.sample(min(2000, len(skipped_df)))

In [None]:
def process_row(row, skip_reason):
    """Draw a reference line on one crop and save it under its depth bucket.

    Relies on notebook globals: ``depth_field``, ``object_length_m``,
    ``FOCAL_LENGTH``, ``modified_images_dir``, ``s3_access_utils`` and the
    current depth-bucket bounds ``lo`` / ``hi``.

    The crop is downloaded from bucket ``aquabyte-crops`` (key = path of
    ``row.image_url``), a horizontal line of
    ``object_length_m * FOCAL_LENGTH / depth`` pixels is drawn starting at
    (100, 100), and the image is written to
    ``modified_images_dir/<lo>_<hi>/<skip_reason>/<file name>``.

    Args:
        row: record with ``image_url`` and the ``depth_field`` column.
        skip_reason: sub-directory name for the output image.
    """
    depth_m = row[depth_field]
    # Pixel length an object of `object_length_m` metres would have at this depth
    # (assumes FOCAL_LENGTH is expressed in pixels - TODO confirm).
    line_segment_length_px = object_length_m * FOCAL_LENGTH / depth_m
    image_url = row.image_url
    bucket, key = 'aquabyte-crops', urlparse(image_url, allow_fragments=False).path.lstrip('/')
    image_f = s3_access_utils.download_from_s3(bucket, key)

    f_name = os.path.basename(key)
    f = os.path.join(modified_images_dir, '{}_{}'.format(lo, hi), skip_reason, f_name)

    # Context manager closes the underlying file handle once the copy is saved.
    with Image.open(image_f) as im:
        draw = ImageDraw.Draw(im)
        draw.line((100, 100, 100+line_segment_length_px, 100))

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(os.path.dirname(f), exist_ok=True)
        im.save(f)


# Export up to N example crops per (depth bucket, outcome) so that accepted and
# rejected images can be compared at each distance from the camera.
modified_images_dir = '/root/data/alok/lice_counting/bremnes_tittelsnes_image_breakdown'
object_length_m = 0.01
N = 20

# Rows not skipped by Cogito. Compared with False (rather than `~`) so that a
# column reloaded from CSV with missing values does not break the inversion.
cogito_accept_mask = df.is_skipped == False
# NOTE(review): the original cell also computed
#     qa_accept_mask = ~reconciled_df.is_skipped
# but `reconciled_df` is never defined at notebook scope (NameError on a fresh
# kernel) and `qa_accept_mask` is not used by this cell, so it is disabled here.
depth_values = [round(x, 1) for x in np.arange(0.2, 1.5, 0.1)]

depth_field = 'stereo_depth_m'

# Skip flags that are isolated one at a time, in export order. The masks are
# built from `df` directly; the original referenced `is_blurry_mask`,
# `is_bad_crop_mask`, `is_too_dark_mask` and `is_bad_orientation_mask`, none of
# which are defined anywhere in the notebook.
exclusive_skip_reasons = ['is_blurry', 'is_too_dark', 'is_bad_crop', 'is_bad_orientation']
reason_masks = {reason: df[reason] == True for reason in exclusive_skip_reasons}

for i in range(len(depth_values)-1):
    print(i)
    # process_row reads `lo` and `hi` from module scope to name the output folder.
    lo, hi = depth_values[i], depth_values[i+1]
    # Both bounds are inclusive, so a depth exactly on a boundary falls into two buckets.
    depth_mask = (df[depth_field] >= lo) & (df[depth_field] <= hi)

    # accepted images
    for idx, row in df[depth_mask & cogito_accept_mask].head(N).iterrows():
        process_row(row, 'accepted')

    # images rejected for exactly one of the tracked reasons
    for reason in exclusive_skip_reasons:
        only_this_reason = depth_mask & reason_masks[reason]
        for other_reason in exclusive_skip_reasons:
            if other_reason != reason:
                only_this_reason = only_this_reason & ~reason_masks[other_reason]
        for idx, row in df[only_this_reason].head(N).iterrows():
            process_row(row, reason)


In [None]:
def process_row(row, skip_reason):
    """Download one crop and save a copy under ``modified_images_dir/<skip_reason>/``.

    This definition replaces the earlier ``process_row`` from this point of the
    notebook onwards. No reference line is drawn in this variant, so the row's
    depth is not read (the previous unused depth computation could fail on rows
    without a depth).

    Relies on notebook globals ``modified_images_dir`` and ``s3_access_utils``.

    Bucket/key resolution:

    * URL does not contain ``aquabyte-crops-lati``: bucket ``aquabyte-crops``,
      key = URL path;
    * otherwise: first path segment is the bucket, the rest is the key.

    Args:
        row: record with an ``image_url`` field.
        skip_reason: sub-directory name for the output image.
    """
    image_url = row.image_url
    url_path = urlparse(image_url, allow_fragments=False).path.lstrip('/')
    if 'aquabyte-crops-lati' not in image_url:
        bucket, key = 'aquabyte-crops', url_path
    else:
        # partition keeps the key '/'-separated and tolerates a URL with no key part.
        bucket, _, key = url_path.partition('/')
    print(bucket, key)
    image_f = s3_access_utils.download_from_s3(bucket, key)

    f_name = os.path.basename(key)
    f = os.path.join(modified_images_dir, skip_reason, f_name)

    # Context manager closes the underlying file handle once the copy is saved.
    with Image.open(image_f) as im:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(os.path.dirname(f), exist_ok=True)
        im.save(f)


# Export up to N example crops per skip reason, independent of depth.
modified_images_dir = '/root/data/alok/lice_counting/bremnes_tittelsnes_breakdown_depth_independent'
object_length_m = 0.01
N = 50


# rejected images due to skip reason
for target_skip_reason in skip_reasons:
    # Start from rows carrying the target flag, then remove rows that also
    # carry any other flag so each exported image has exactly one reason.
    mask = skip_masks[target_skip_reason]
    for skip_reason, skip_mask in skip_masks.items():
        if skip_reason != target_skip_reason:
            mask = mask & ~skip_mask
    # The export runs once, after the mask is fully refined, and files go under
    # the target reason. Previously this loop sat inside the refinement loop,
    # so rows were exported from partially refined masks and filed under
    # whichever reason was currently being excluded.
    for idx, row in df[mask].head(N).iterrows():
        process_row(row, target_skip_reason)

# # rejected images due to darkness
# for idx, row in df[(~is_blurry_mask) & (~is_bad_crop_mask) & is_too_dark_mask & (~is_bad_orientation_mask)].head(N).iterrows():
#     process_row(row, 'is_too_dark')

# # rejected images due to bad crop
# for idx, row in df[(~is_blurry_mask) & is_bad_crop_mask & (~is_too_dark_mask) & (~is_bad_orientation_mask)].head(N).iterrows():
#     process_row(row, 'is_bad_crop')

# # rejected images due to bad orientation
# for idx, row in df[(~is_blurry_mask) & (~is_bad_crop_mask) & (~is_too_dark_mask) & (is_bad_orientation_mask)].head(N).iterrows():
#     process_row(row, 'is_bad_orientation')


In [None]:
# Spot check: stereo depth of the row(s) whose image URL contains this file-name fragment.
df[df.image_url.str.contains('702_1953_3290_3000')].stereo_depth_m

In [None]:
# Spot check: stereo depth of the row(s) whose image URL contains this file-name fragment.
df[df.image_url.str.contains('366_1350_2442_2229')].stereo_depth_m

In [None]:
# Spot check: stereo depth of the row(s) whose image URL contains this file-name fragment.
df[df.image_url.str.contains('0_1127_2674_2012')].stereo_depth_m

<h1> Generate depth values </h1>

In [None]:
# Histogram (20 bins) of stereo depth for accepted reconciled rows, restricted
# to depths strictly between 0.2 and 0.7.
# NOTE(review): `reconciled_df` and `reconciled_accept_mask` are not defined in
# any visible cell (hidden kernel state) - this cell raises NameError on a
# fresh kernel until they are defined.
depth_field = 'stereo_depth_m'
valid_mask = (reconciled_df[depth_field] > 0.2) & (reconciled_df[depth_field] < 0.7)
plt.figure(figsize=(20, 10))
plt.hist(reconciled_df.loc[valid_mask & reconciled_accept_mask, depth_field], bins=20)
plt.show()

In [None]:
# Histogram (20 bins) of the single-image depth estimate for accepted reconciled rows.
# NOTE(review): `reconciled_df` and `reconciled_accept_mask` are not defined in
# any visible cell (hidden kernel state) - this cell raises NameError on a
# fresh kernel until they are defined. It also rebinds the global `depth_field`.
depth_field = 'single_image_depth_m'
plt.figure(figsize=(20, 10))
plt.hist(reconciled_df.loc[reconciled_accept_mask, depth_field], bins=20)
plt.show()

In [None]:
# Bare string literal kept for reference: presumably an example of the public
# URL layout (<depth bucket>/<skip reason>/<file>) of the uploaded breakdown images.
'https://aquabyte-images-adhoc-public.s3.amazonaws.com/bremnes_tittelsnes_image_breakdown/0.9_1.0/is_blurry/left_frame_crop_1070_1016_3242_2018.jpg'

In [None]:
# List the '.jpg' keys under the 'bremnes_tittelsnes_image_breakdown' prefix of
# the public bucket. The result is iterated once in the next cell (presumably a
# one-shot generator - re-run this cell before iterating again).
gen = s3_access_utils.get_matching_s3_keys('aquabyte-images-adhoc-public', prefix='bremnes_tittelsnes_image_breakdown', suffixes='.jpg')

In [None]:
# Turn every listed key into its public HTTPS URL.
prefix = 'https://aquabyte-images-adhoc-public.s3.amazonaws.com'
bucket = 'aquabyte-images-adhoc-public'
# URLs are always '/'-separated, so the parts are joined explicitly:
# os.path.join uses the OS path separator and silently drops `prefix` when the
# key starts with '/'.
urls = ['{}/{}'.format(prefix, key.lstrip('/')) for key in gen]
    

In [None]:
# Write the URLs to disk, one per line.
# NOTE(review): the ',\n' separator leaves a trailing comma on every line
# except the last - confirm the consumer of urls.csv expects that.
with open('/root/data/alok/urls.csv', 'w') as f:
    f.write(',\n'.join(urls))