In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from wpca import WPCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer
import random
from scipy.stats import norm
from PIL import Image, ImageDraw
from urllib.parse import urlparse
import datetime as dt
import heapq 

# Raise pandas' display limits so long frames and long cell values are shown
# in full instead of being truncated.
for _display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, 500)

<h1> Get lice annotation data </h1>

In [None]:
# Build the database helper from the JSON credentials file named by the
# PROD_SQL_CREDENTIALS environment variable. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
def _simulate_priority_queue(cogito_df, delay_hours):
    """Replay the annotation work through a quality-score priority queue.

    Every row's completion is shifted by ``delay_hours``. Walking the shifted
    completion times in ascending order, all images captured up to that time
    are queued, and the queued image with the highest ``quality_score`` is
    taken. Ties go to the image queued first (earlier step, then earlier row
    position), which matches a stable descending sort followed by ``pop(0)``.

    Args:
        cogito_df: frame with ``captured_at``, ``completed_at``,
            ``quality_score`` and ``is_submitted`` columns.
        delay_hours: number of hours added to every ``completed_at``.

    Returns:
        List of ``is_submitted`` flags in the order the images were taken off
        the queue (one entry per row of ``cogito_df``).

    Raises:
        IndexError: if a completion is reached while the queue is empty, i.e.
            fewer images were captured by then than have been completed.
    """
    # Capture times by row position, oldest first. Rows without a capture
    # time can never satisfy "captured_at <= t", so they are never queued.
    captures = (
        cogito_df.captured_at
        .reset_index(drop=True)
        .dropna()
        .sort_values(kind='mergesort')
    )
    capture_positions = captures.index.tolist()
    capture_times = captures.tolist()
    scores = cogito_df.quality_score.tolist()
    submitted = cogito_df.is_submitted.tolist()

    completion_times = (cogito_df.completed_at + dt.timedelta(hours=delay_hours)).sort_values()

    heap, submits = [], []
    cursor = 0
    for step, completed_at in enumerate(completion_times):
        # Queue everything captured since the previous completion.
        while cursor < len(capture_times) and capture_times[cursor] <= completed_at:
            position = capture_positions[cursor]
            # heapq is a min-heap: negate the score so the best image pops
            # first; (step, position) reproduces first-in-first-out on ties.
            heapq.heappush(heap, (-scores[position], step, position, submitted[position]))
            cursor += 1
        submits.append(heapq.heappop(heap)[-1])
    return submits


def generate_annotation_performance(pen_id, start_date, end_date):
    """Plot cumulative QA submissions for one pen under several queue policies.

    Loads the rows of ``lati_fish_detections_lice_annotations`` for ``pen_id``
    with ``start_date <= captured_at < end_date`` and draws, against the number
    of images analysed, the cumulative count of non-skipped images:

    * in capture order ('Without PQ no delay'),
    * in actual completion order ('With PQ no delay'),
    * for a simulated quality-score priority queue whose completions are
      delayed by 4, 2 and 1 hours,

    plus a dashed horizontal line at 50 labelled 'KPI'.

    Args:
        pen_id: pen identifier interpolated into the SQL query.
        start_date: inclusive lower bound on ``captured_at``.
        end_date: exclusive upper bound on ``captured_at``.

    Returns:
        The queried DataFrame with three added columns: ``is_submitted``
        (negation of ``is_skipped``), ``quality_score`` (taken from
        ``metadata``) and ``simulated_completed_at`` (``completed_at`` with a
        zero-hour offset).
    """
    # NOTE(review): the query is assembled with str.format; only pass trusted,
    # notebook-authored values for pen_id / start_date / end_date.
    query = "select * from lati_fish_detections_lice_annotations where pen_id = {0} and captured_at >= '{1}' and captured_at < '{2}'".format(pen_id, start_date, end_date)
    cogito_df = rds_access_utils.extract_from_database(query)
    cogito_df['is_submitted'] = ~cogito_df.is_skipped
    cogito_df['quality_score'] = cogito_df.metadata.apply(lambda x: x['quality_score'])

    n_images = cogito_df.shape[0]
    plt.figure(figsize=(20, 10))
    plt.scatter(range(n_images), np.cumsum(cogito_df.sort_values('captured_at')['is_submitted']), color='b', s=3, alpha=0.6, label='Without PQ no delay')
    plt.scatter(range(n_images), np.cumsum(cogito_df.sort_values('completed_at')['is_submitted']), color='r', s=3, alpha=0.6, label='With PQ no delay')

    # One simulated curve per delay; a single helper replaces the repeated
    # copy-pasted loops.
    for delay_hours, color in ((4, 'black'), (2, 'orange'), (1, 'purple')):
        submits = _simulate_priority_queue(cogito_df, delay_hours)
        plt.scatter(range(n_images), np.cumsum(np.array(submits)), color=color, s=3, alpha=0.6,
                    label='With PQ {} hour delay'.format(delay_hours))

    # Callers receive the frame with this column holding the zero-delay
    # completion time, so keep providing it.
    cogito_df['simulated_completed_at'] = cogito_df.completed_at + dt.timedelta(hours=0)

    plt.axhline(50, linestyle='dashed', label='KPI')
    plt.xlabel('Number of images analyzed by Cogito')
    plt.ylabel('Number of images submitted to QA')
    plt.title('PQ Performance for PEN ID = {0} on {1}'.format(pen_id, start_date))
    plt.legend()
    plt.grid()
    plt.show()
    return cogito_df


In [None]:
pen_id=37

In [None]:
cogito_df = generate_annotation_performance(pen_id, '2019-09-22', '2019-09-23')

In [None]:
cogito_df = generate_annotation_performance(pen_id, '2019-09-21', '2019-09-22')

In [None]:
cogito_df = generate_annotation_performance(37, '2019-09-23', '2019-09-24')

In [None]:
# Scratch data: pairs consumed by the next cell, which sorts them on their
# first element (descending) and pops the head.
x = [(3, 6), (2, 7), (9, 3), (4, 1), (1, 10)]

In [None]:
# Order the pairs by their first element, largest first, then remove the head
# of the list and unpack it into a and b.
x.sort(key=lambda pair: pair[0], reverse=True)
a, b = x[0]
del x[0]

In [None]:
# aggregate_df.to_csv('/root/data/alok/aggregate_df_bremnes_tittelsnes.csv')

In [None]:
# S3 helper used by process_row (download_from_s3) in the cells below.
# '/root/data' is passed to the constructor - presumably the local directory
# downloads are written to (TODO confirm against S3AccessUtils).
s3_access_utils = S3AccessUtils('/root/data')

In [None]:
# Reference fish dimensions and camera focal length used by the single-image
# depth estimate. NOTE(review): the _M suffix suggests metres and FOCAL_LENGTH
# is used as if expressed in pixels - confirm against the camera calibration.
FISH_WIDTH_M = 0.065
FISH_LENGTH_M = 0.294
FOCAL_LENGTH = 4015

def depth_fn(x):
    """Estimate the fish length in pixels from a crop's width and height.

    Despite its name this returns a pixel length, not a depth; the caller
    stores it as ``length_px`` and converts it to a depth afterwards.

    The angle of the crop's diagonal (theta) is compared with the diagonal
    angle of the reference fish rectangle (phi):

    * theta < phi: the crop is flatter than the reference fish, so the length
      is taken to be the crop width;
    * theta > 90 - phi: the crop is taller than the rotated reference fish, so
      the length is taken to be the crop height;
    * otherwise the crop diagonal is used.

    Args:
        x: mapping with numeric ``'width'`` and ``'height'`` entries.

    Returns:
        ``width``, ``height`` or ``sqrt(width**2 + height**2)``.
    """
    w, h = x['width'], x['height']
    # arctan2 gives the same angle as arctan(h / w) for positive sizes but
    # does not divide, so a zero-width box yields 90 degrees instead of
    # raising ZeroDivisionError.
    theta = np.arctan2(h, w) * (180.0 / np.pi)
    phi = np.arctan(FISH_WIDTH_M / FISH_LENGTH_M) * (180.0 / np.pi)
    if theta < phi:
        return w
    elif theta > 90.0 - phi:
        return h
    else:
        return (h**2 + w**2)**0.5

def process_data_df(df):
    """Drop cleaner-fish rows and add crop-size and depth columns.

    Args:
        df: frame with an ``is_cleaner_fish`` column and a ``metadata`` column
            of mappings providing ``'width'`` and ``'height'`` (and optionally
            ``'depth_m'``).

    Returns:
        A new frame holding the rows where ``is_cleaner_fish`` is not True,
        with these columns added:

        * ``image_width`` / ``image_height``: from ``metadata``;
        * ``length_px``: ``depth_fn(metadata)``;
        * ``single_image_depth_m``: ``FOCAL_LENGTH * FISH_LENGTH_M / length_px``;
        * ``stereo_depth_m``: ``metadata.get('depth_m')`` (missing when the
          key is absent).
    """
    # .copy() makes the filtered frame independent of the input, so the column
    # assignments below do not write to a slice (SettingWithCopyWarning).
    df = df[df.is_cleaner_fish != True].copy()
    df['image_width'] = df.metadata.apply(lambda x: x['width'])
    df['image_height'] = df.metadata.apply(lambda x: x['height'])
    df['length_px'] = df.metadata.apply(depth_fn)
    df['single_image_depth_m'] = FOCAL_LENGTH * FISH_LENGTH_M / df.length_px
    df['stereo_depth_m'] = df.metadata.apply(lambda x: x.get('depth_m'))
    return df

# NOTE(review): `df` is not assigned in any visible cell before this line, so
# this only works with a frame left in the kernel from elsewhere - confirm
# where the raw annotations are loaded and load them explicitly above.
df = process_data_df(df)


In [None]:
# Write the processed frame to CSV at a fixed path.
df.to_csv('/root/data/alok/aggregate_vikane_df.csv')

In [None]:
# Stereo-depth window counted as "in focus" (exclusive bounds). The printed
# summary is derived from these values so the message cannot disagree with
# the mask.
IN_FOCUS_MIN_M, IN_FOCUS_MAX_M = 0.8, 1.0

in_focus_mask = (df.stereo_depth_m > IN_FOCUS_MIN_M) & (df.stereo_depth_m < IN_FOCUS_MAX_M)
accept_mask = ~df.is_skipped

# Boolean flag columns turned into masks; later cells look them up by name.
skip_reasons = [
    'is_accepted_in_qa',
    'is_blurry',
    'is_bad_crop',
    'is_too_dark',
    'is_bad_crop_many_fish',
    'is_bad_orientation',
    'is_bad_crop_cut_off',
    'is_obstructed'
]
skip_masks = {reason: df[reason] == True for reason in skip_reasons}

n = df.shape[0]
n_in_focus = df[in_focus_mask].shape[0]
n_in_focus_accepted = df[in_focus_mask & accept_mask].shape[0]
# Rows with a missing stereo depth fail both comparisons above, so they are
# counted as "not in focus" here.
n_not_in_focus_accepted = df[~in_focus_mask & accept_mask].shape[0]


# NOTE(review): both "accepted" counts use the same mask (~is_skipped) although
# one message says "in QA" and the other "by Cogito" - confirm the wording.
print('Total number of images inspected by Cogito over the weekend: {}'.format(n))
print('Total number of these images within in-focus range ({:.0f} cm - {:.0f} cm): {}'.format(
    IN_FOCUS_MIN_M * 100, IN_FOCUS_MAX_M * 100, n_in_focus))
print('Total number of in-focus images accepted in QA: {}'.format(n_in_focus_accepted))
print('Total number of not-in-focus images accepted by Cogito: {}'.format(n_not_in_focus_accepted))

In [None]:
# Histogram of stereo depth, restricted to values strictly between 0 and 2 so
# out-of-range (and missing) depths do not stretch the axis.
valid_depth_mask = (df.stereo_depth_m > 0.0) & (df.stereo_depth_m < 2.0)
plt.figure(figsize=(20, 10))
plt.hist(df[valid_depth_mask].stereo_depth_m, bins=20)
plt.xlabel('stereo_depth_m')
plt.ylabel('Number of rows')
plt.title('Distribution of stereo_depth_m (0.0 < depth < 2.0)')
plt.show()

In [None]:
# Random sample of skipped rows for eyeballing. The size is capped at the
# number of skipped rows because DataFrame.sample raises ValueError when asked
# for more rows than exist; the fixed seed makes re-runs show the same rows.
skipped_df = df[df.is_skipped == True]
skipped_df.sample(min(2000, len(skipped_df)), random_state=0)

In [None]:
def process_row(row, skip_reason):
    """Download one crop, draw a reference line on it and save it by depth bin.

    The line starts at pixel (100, 100) and runs horizontally for
    ``object_length_m * FOCAL_LENGTH / depth`` pixels, where the depth is read
    from ``row[depth_field]``. The result is written to
    ``<modified_images_dir>/<lo>_<hi>/<skip_reason>/<file name>``.

    Reads the notebook globals ``depth_field``, ``object_length_m``,
    ``FOCAL_LENGTH``, ``s3_access_utils``, ``modified_images_dir``, ``lo`` and
    ``hi``; ``lo``/``hi`` are set by the depth-bin loop that calls this.

    NOTE(review): a later cell in this notebook defines another function with
    the same name, which replaces this one once that cell has run.

    Args:
        row: frame row providing ``image_url`` and the ``depth_field`` column.
        skip_reason: name of the sub-folder the image is saved under.
    """
    depth_m = row[depth_field]
    line_segment_length_px = object_length_m * FOCAL_LENGTH / depth_m
    image_url = row.image_url
    # The URL path (without its leading slash) is used as the key in the
    # fixed 'aquabyte-crops' bucket.
    bucket, key = 'aquabyte-crops', urlparse(image_url, allow_fragments=False).path.lstrip('/')
    image_f = s3_access_utils.download_from_s3(bucket, key)

    # The context manager closes the image file once the copy is written.
    with Image.open(image_f) as im:
        draw = ImageDraw.Draw(im)
        draw.line((100, 100, 100+line_segment_length_px, 100))

        f_name = os.path.basename(key)
        f = os.path.join(modified_images_dir, '{}_{}'.format(lo, hi), skip_reason, f_name)
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(os.path.dirname(f), exist_ok=True)
        im.save(f)


modified_images_dir = '/root/data/alok/lice_counting/bremnes_tittelsnes_image_breakdown'
object_length_m = 0.01
N = 20

cogito_accept_mask = ~df.is_skipped
# NOTE(review): `reconciled_df` is not defined in any visible cell and
# `qa_accept_mask` is not used below - confirm whether this line is needed.
qa_accept_mask = ~reconciled_df.is_skipped
depth_values = [round(x, 1) for x in np.arange(0.2, 1.5, 0.1)]

# Skip reasons exported to their own folder. A row is exported for a reason
# only when none of the other reasons in this list is set as well. The masks
# are built here from df's flag columns so the cell does not rely on mask
# variables defined elsewhere.
exclusive_skip_reasons = ['is_blurry', 'is_too_dark', 'is_bad_crop', 'is_bad_orientation']
exclusive_skip_masks = {reason: df[reason] == True for reason in exclusive_skip_reasons}

depth_field = 'stereo_depth_m'
for i in range(len(depth_values)-1):
    print(i)
    # process_row reads lo / hi as globals to name the output folder.
    lo, hi = depth_values[i], depth_values[i+1]
    # Both bounds are inclusive, so a depth exactly on a bin edge is exported
    # in both neighbouring bins.
    depth_mask = (df[depth_field] >= lo) & (df[depth_field] <= hi)

    # accepted images
    for idx, row in df[depth_mask & cogito_accept_mask].head(N).iterrows():
        process_row(row, 'accepted')

    # rejected images, one folder per exclusive skip reason
    for reason in exclusive_skip_reasons:
        reason_mask = depth_mask & exclusive_skip_masks[reason]
        for other_reason in exclusive_skip_reasons:
            if other_reason != reason:
                reason_mask = reason_mask & ~exclusive_skip_masks[other_reason]
        for idx, row in df[reason_mask].head(N).iterrows():
            process_row(row, reason)


In [None]:
def process_row(row, skip_reason):
    """Download one crop and save an unmodified copy grouped by skip reason.

    The image is written to ``<modified_images_dir>/<skip_reason>/<file name>``.
    No reference line is drawn and the row's depth is not used.

    Bucket and key are derived from ``row.image_url``:

    * URLs containing ``'aquabyte-crops-lati'``: the first path component is
      the bucket and the remaining components form the key;
    * any other URL: the bucket is ``'aquabyte-crops'`` and the whole path is
      the key.

    Reads the notebook globals ``s3_access_utils`` and ``modified_images_dir``.

    Args:
        row: frame row providing ``image_url``.
        skip_reason: name of the sub-folder the image is saved under.
    """
    image_url = row.image_url
    url_path = urlparse(image_url, allow_fragments=False).path.lstrip('/')
    if 'aquabyte-crops-lati' not in image_url:
        bucket, key = 'aquabyte-crops', url_path
    else:
        components = url_path.split('/')
        # S3 keys are always '/'-separated, so join with '/' rather than
        # os.path.join (which also fails when there is no key component).
        bucket, key = components[0], '/'.join(components[1:])
    print(bucket, key)
    image_f = s3_access_utils.download_from_s3(bucket, key)

    # The context manager closes the image file once the copy is written.
    with Image.open(image_f) as im:
        f_name = os.path.basename(key)
        f = os.path.join(modified_images_dir, skip_reason, f_name)
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(os.path.dirname(f), exist_ok=True)
        im.save(f)


modified_images_dir = '/root/data/alok/lice_counting/bremnes_tittelsnes_breakdown_depth_independent'
object_length_m = 0.01
N = 50


# rejected images due to skip reason
# For each target reason, keep the rows flagged with that reason and with none
# of the other entries of skip_masks, then save up to N of them in a folder
# named after the target reason. Note that skip_masks also contains
# 'is_accepted_in_qa', so that flag gets its own folder and excludes rows from
# every other folder.
for target_skip_reason in skip_reasons:
    mask = skip_masks[target_skip_reason]
    for skip_reason, skip_mask in skip_masks.items():
        if skip_reason != target_skip_reason:
            mask = mask & ~skip_mask
    # Export only after the exclusive mask is complete, and file the images
    # under the target reason (not under the reason that was just excluded).
    for idx, row in df[mask].head(N).iterrows():
        process_row(row, target_skip_reason)

# # rejected images due to darkness
# for idx, row in df[(~is_blurry_mask) & (~is_bad_crop_mask) & is_too_dark_mask & (~is_bad_orientation_mask)].head(N).iterrows():
#     process_row(row, 'is_too_dark')

# # rejected images due to bad crop
# for idx, row in df[(~is_blurry_mask) & is_bad_crop_mask & (~is_too_dark_mask) & (~is_bad_orientation_mask)].head(N).iterrows():
#     process_row(row, 'is_bad_crop')

# # rejected images due to bad orientation
# for idx, row in df[(~is_blurry_mask) & (~is_bad_crop_mask) & (~is_too_dark_mask) & (is_bad_orientation_mask)].head(N).iterrows():
#     process_row(row, 'is_bad_orientation')


In [None]:
# Spot checks: show stereo_depth_m for the row(s) whose image_url contains the
# given substring. str.contains treats the pattern as a regular expression by
# default; these patterns contain only digits and underscores.
df[df.image_url.str.contains('702_1953_3290_3000')].stereo_depth_m

In [None]:
df[df.image_url.str.contains('366_1350_2442_2229')].stereo_depth_m

In [None]:
df[df.image_url.str.contains('0_1127_2674_2012')].stereo_depth_m

<h1> Generate depth values </h1>

In [None]:
# Histogram of the stereo depth of reconciled rows that pass
# reconciled_accept_mask, restricted to depths strictly between 0.2 and 0.7.
# NOTE(review): `reconciled_df` and `reconciled_accept_mask` are not defined in
# any visible cell - confirm where they come from.
depth_field = 'stereo_depth_m'
valid_mask = (reconciled_df[depth_field] > 0.2) & (reconciled_df[depth_field] < 0.7)
plt.figure(figsize=(20, 10))
plt.hist(reconciled_df.loc[valid_mask & reconciled_accept_mask, depth_field], bins=20)
plt.xlabel(depth_field)
plt.ylabel('Number of rows')
plt.title('Distribution of {} (0.2 < depth < 0.7)'.format(depth_field))
plt.show()

In [None]:
# Histogram of the single-image depth estimate for reconciled rows that pass
# reconciled_accept_mask (no depth range filter is applied here).
# NOTE(review): `reconciled_df` and `reconciled_accept_mask` are not defined in
# any visible cell. This also rebinds the global `depth_field`, which the
# first process_row definition reads if that cell is run again afterwards.
depth_field = 'single_image_depth_m'
plt.figure(figsize=(20, 10))
plt.hist(reconciled_df.loc[reconciled_accept_mask, depth_field], bins=20)
plt.xlabel(depth_field)
plt.ylabel('Number of rows')
plt.title('Distribution of {}'.format(depth_field))
plt.show()