In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer
import random
from scipy.stats import norm
from PIL import Image, ImageDraw
from urllib.parse import urlparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import normalize
import seaborn as sns

# Raise pandas' display limits (rows, column width, columns) to 500 each.
for _display_option in ('display.max_rows', 'display.max_colwidth', 'display.max_columns'):
    pd.set_option(_display_option, 500)

<h1> Load LATI data for Blom Kjeppevikholmen Pen ID 5 joined with keypoint annotations </h1>

In [None]:
# Build the RDS client from the JSON credentials file whose path is stored in the
# PROD_SQL_CREDENTIALS environment variable. The file is read inside a context manager
# so its handle is closed as soon as the JSON has been parsed.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_f:
    rds_access_utils = RDSAccessUtils(json.load(credentials_f))

# Lice-annotation rows for pen 5 joined to QA keypoint annotations on captured_at.
# The WHERE conditions on b.* drop unmatched rows, so the LEFT JOIN keeps matched rows only.
query = """
    select * from lati_fish_detections_lice_annotations a left join 
    (select keypoints, left_image_url, right_image_url, captured_at, camera_metadata, is_qa from keypoint_annotations) b
    on a.captured_at = b.captured_at
    where b.keypoints is not null
    and b.is_qa = true
    and a.pen_id = 5;
"""
cogito_df = rds_access_utils.extract_from_database(query)


In [None]:
# S3 helper used further down (download_from_s3) to fetch crop images.
# '/root/data' is presumably the local directory downloads are stored under — TODO confirm.
s3_access_utils = S3AccessUtils('/root/data')

In [None]:
# Work on a deep copy so the raw query result in cogito_df is left untouched.
df = cogito_df.copy(deep=True)

# The join is on capture timestamp only, so keep a row only when the keypoint annotation's
# left image URL (with 'aquabyte-crops' rewritten to 'aquabyte-crops-lati') equals the
# lice annotation's image URL.
df['is_match'] = [
    left_url.replace('aquabyte-crops', 'aquabyte-crops-lati') == image_url
    for left_url, image_url in zip(df.left_image_url, df.image_url)
]

# .copy() detaches the filtered frame from its parent, so the column assignments made in
# later cells do not raise pandas' SettingWithCopyWarning.
df = df[df.is_match].copy()

<h1> Generate Stereo Depth Values </h1>

In [None]:
def get_world_keypoints(row):
    """Convert a row's left/right crop keypoints to world coordinates via pixel2world.

    Returns None when row.keypoints lacks either the 'leftCrop' or the 'rightCrop' entry.
    """
    keypoints = row.keypoints
    if not ('leftCrop' in keypoints and 'rightCrop' in keypoints):
        return None
    return pixel2world(keypoints['leftCrop'], keypoints['rightCrop'], row.camera_metadata)
    
# Attach world keypoints to every row of the raw query result (None where a crop is missing).
cogito_df['world_keypoints'] = cogito_df.apply(get_world_keypoints, axis=1)

def get_centroid_depth(world_keypoints):
    """Median of the index-1 coordinate over all world keypoints.

    This function treats index 1 of each keypoint as its depth. Returns None when
    world_keypoints is None or empty.
    """
    if not world_keypoints:
        return None
    per_keypoint_depths = [point[1] for point in world_keypoints.values()]
    return np.median(np.array(per_keypoint_depths))

def get_length(world_keypoints):
    """euclidean_distance between the UPPER_LIP and ANAL_FIN world keypoints.

    Returns None when world_keypoints is None or empty.
    """
    if not world_keypoints:
        return None
    upper_lip, anal_fin = world_keypoints['UPPER_LIP'], world_keypoints['ANAL_FIN']
    return euclidean_distance(upper_lip, anal_fin)

def get_x_length(world_keypoints):
    """UPPER_LIP-to-ANAL_FIN distance with the index-1 coordinate of both points set to 0.

    Returns None when world_keypoints is None or empty.
    """
    if not world_keypoints:
        return None
    upper_lip, anal_fin = world_keypoints['UPPER_LIP'], world_keypoints['ANAL_FIN']
    # Zero out index 1 so only the index-0 and index-2 components contribute to the distance.
    lip_flat = np.array([upper_lip[0], 0, upper_lip[2]])
    fin_flat = np.array([anal_fin[0], 0, anal_fin[2]])
    return euclidean_distance(lip_flat, fin_flat)
    

# Per-row geometry derived from the stereo keypoints (None / NaN where keypoints are missing).
df['world_keypoints'] = df.apply(get_world_keypoints, axis=1)
df['centroid_depth'] = df.world_keypoints.apply(get_centroid_depth)
df['length'] = df.world_keypoints.apply(get_length)
df['x_length'] = df.world_keypoints.apply(get_x_length)

# Crop dimensions are read from the per-row metadata dict.
df['image_width'] = df.metadata.apply(lambda meta: meta['width'])
df['image_height'] = df.metadata.apply(lambda meta: meta['height'])

# A row counts as accepted when it was explicitly not skipped.
df['is_accepted'] = df.is_skipped.eq(False)

<h1> Plot Accepts vs. Rejects by Depth </h1>

In [None]:
# Overlay the centroid-depth histogram of accepted rows (red) on that of all rows (blue).
# A legend and axis labels are included so the figure can be read on its own.
plt.figure(figsize=(20, 10))
plt.hist(df.centroid_depth, color='blue', alpha=0.5, bins=20, label='all')
plt.hist(df[df.is_accepted].centroid_depth, color='red', alpha=0.5, bins=20, label='accepted')
plt.xlabel('centroid_depth')
plt.ylabel('count')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Restrict to rows whose centroid depth lies strictly between 0.75 and 0.95.
depth_mask = (df.centroid_depth > 0.75) & (df.centroid_depth < 0.95)

# x_length of rejected rows that are not bad crops (blue) versus accepted rows (red).
# Both histograms are semi-transparent so the second one does not hide the first.
plt.figure(figsize=(20, 10))
plt.hist(
    df[~df.is_accepted & (df.is_bad_crop == False) & depth_mask].x_length,
    color='blue', alpha=0.5, label='rejected (not bad crop)',
)
plt.hist(df[df.is_accepted & depth_mask].x_length, color='red', alpha=0.5, label='accepted')
plt.xlabel('x_length')
plt.ylabel('count')
plt.legend()
plt.show()


In [None]:
# Mean x_length of accepted rows inside the current depth_mask.
df.loc[df.is_accepted & depth_mask, 'x_length'].mean()

In [None]:
# Accepted rows inside the current depth_mask whose x_length exceeds 0.45.
df.loc[df.is_accepted & depth_mask & (df.x_length > 0.45)]

<h1> Download Kjeppevikholmen Images Locally </h1>

In [None]:
# Labels to sample images for, and one boolean row mask per label (True where the column is True).
skip_reasons = ['is_accepted', 'is_too_dark', 'is_blurry']
skip_masks = {reason: df[reason] == True for reason in skip_reasons}

In [None]:
# Seed NumPy's global RNG. The sampling loop below calls DataFrame.sample without a
# random_state, which presumably draws from this global state — TODO confirm.
np.random.seed(0)
# Scale factor used in process_row: length_px = length_m * FOCAL_LENGTH / depth_m.
# Presumably the camera focal length expressed in pixels — TODO confirm.
FOCAL_LENGTH = 4015

def process_row(row, skip_reason, lo, hi):
    """Download the row's image from S3 and re-save it under a per-bucket, per-label folder.

    The image is written to
    ``<modified_images_dir>/<lo>_<hi>/<skip_reason>/<original file name>``.

    Args:
        row: record exposing ``image_url`` and ``centroid_depth``.
        skip_reason: label used as the innermost output directory name.
        lo, hi: depth-bucket bounds; used only to name the output directory.

    Reads the notebook globals ``s3_access_utils``, ``modified_images_dir``,
    ``object_length_m`` and ``FOCAL_LENGTH``.
    """
    depth_m = row['centroid_depth']
    # Pixel length of an object_length_m-long segment at this depth. Only the (currently
    # disabled) scale-bar overlay below consumes it.
    line_segment_length_px = object_length_m * FOCAL_LENGTH / depth_m

    image_url = row.image_url
    url_path = urlparse(image_url, allow_fragments=False).path.lstrip('/')
    if 'aquabyte-crops-lati' not in image_url:
        bucket, key = 'aquabyte-crops', url_path
    else:
        # The first path component is the bucket and the remainder is the key. S3 keys
        # always use '/', so split on it directly rather than going through os.path.join.
        bucket, _, key = url_path.partition('/')
    print(bucket, key)
    image_f = s3_access_utils.download_from_s3(bucket, key)

    f_name = os.path.basename(key)
    f = os.path.join(modified_images_dir, '{}_{}'.format(lo, hi), skip_reason, f_name)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(os.path.dirname(f), exist_ok=True)

    # Context manager closes the underlying file handle once the copy has been written.
    with Image.open(image_f) as im:
#         draw = ImageDraw.Draw(im)
#         draw.line((100, 100, 100+line_segment_length_px, 100))
        im.save(f)


# Output root and parameters read by process_row.
modified_images_dir = '/root/data/alok/lice_counting/blom_kjeppevikholmen_breakdown_v3'
object_length_m = 0.01
N = 20  # maximum number of images saved per (depth bucket, label) pair

depth_values = [round(x, 1) for x in np.arange(0.5, 1.4, 0.1)]

# For each pair of consecutive depth values and each label in skip_reasons (which includes
# 'is_accepted'), save up to N randomly sampled images.
for i in range(len(depth_values)-1):
    print(i)
    lo, hi = depth_values[i], depth_values[i+1]
    # Named bucket_mask rather than depth_mask so the depth_mask defined by the x_length
    # histogram cell is not silently overwritten by this loop.
    bucket_mask = (df['centroid_depth'] >= lo) & (df['centroid_depth'] <= hi)
    for target_skip_reason in skip_reasons:
        print(target_skip_reason)
        mask = skip_masks[target_skip_reason]
#         for skip_reason, skip_mask in skip_masks.items():
#             if skip_reason != target_skip_reason:
#                 mask = mask & ~skip_mask
        # Filter once and reuse the result for both the sample size and the sampling itself.
        candidates = df[mask & bucket_mask]
        for idx, row in candidates.sample(min(N, candidates.shape[0])).iterrows():
            process_row(row, target_skip_reason, lo, hi)


In [None]:
# Centroid-depth histogram of all rows (blue) with the rows not marked as skipped drawn on top (red).
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(df.centroid_depth, bins=20, color='blue')
ax.hist(df.loc[df.is_skipped != True, 'centroid_depth'], bins=20, color='red')
plt.show()

In [None]:
def generate_center_coordinate(metadata, x_direction=True):
    """Return the crop's centre coordinate along one axis.

    Computes ``x_coord + 0.5 * width`` when x_direction is true, otherwise
    ``y_coord + 0.5 * height``, reading the values from the metadata mapping.
    """
    origin_key, extent_key = ('x_coord', 'width') if x_direction else ('y_coord', 'height')
    return metadata[origin_key] + 0.5 * metadata[extent_key]

# Crop-centre coordinates; Series.apply forwards the keyword argument to the helper.
df['centroid_x'] = df.metadata.apply(generate_center_coordinate, x_direction=True)
df['centroid_y'] = df.metadata.apply(generate_center_coordinate, x_direction=False)

# Convenience columns used by the acceptance-rate cells.
df['is_submitted'] = df.is_skipped.eq(False)
df['depth'] = df['centroid_depth']
df['crop_area'] = df.metadata.apply(lambda meta: meta['crop_area'])

In [None]:
# Acceptance rate within each crop-area decile.
accept_rates = []
is_submitted_mask = df.is_submitted == True
crop_area_list = list(np.percentile(df.crop_area, range(0, 110, 10)))
for idx in range(len(crop_area_list) - 1):
    low_ca, high_ca = crop_area_list[idx], crop_area_list[idx+1]
    # NOTE: both bounds are strict, so rows lying exactly on a decile edge fall in no bin.
    mask = (df.crop_area > low_ca) & (df.crop_area < high_ca)
    bin_size = df[mask].shape[0]
    if bin_size > 0:
        accept_rate = df[mask & is_submitted_mask].shape[0] / bin_size
    else:
        # Empty bin: record NaN instead of re-appending the previous bin's rate
        # (or raising NameError when the very first bin is empty).
        accept_rate = np.nan
    accept_rates.append(accept_rate)

plt.xlabel('Crop area decile')
plt.ylabel('Accept rate')
plt.bar(x=range(len(accept_rates)), height=accept_rates)


In [None]:
# Acceptance rate within each depth decile (deciles computed over non-null depths).
accept_rates = []
is_submitted_mask = df.is_submitted == True
depth_list = list(np.percentile(df[df.depth.notnull()].depth, range(0, 110, 10)))
for idx in range(len(depth_list) - 1):
    low_d, high_d = depth_list[idx], depth_list[idx+1]
    # NOTE: both bounds are strict, so rows lying exactly on a decile edge fall in no bin.
    mask = (df.depth > low_d) & (df.depth < high_d)

    bin_size = df[mask].shape[0]
    if bin_size > 0:
        accept_rate = df[mask & is_submitted_mask].shape[0] / bin_size
    else:
        # Empty bin: record NaN instead of re-appending the previous bin's rate
        # (or raising NameError when the very first bin is empty).
        accept_rate = np.nan
    accept_rates.append(accept_rate)

plt.xlabel('Depth decile')
plt.ylabel('Accept rate')
plt.bar(x=range(len(accept_rates)), height=accept_rates)


In [None]:
# Acceptance rate on a grid of crop-area percentile bins x depth percentile bins, where the
# depth bins are recomputed inside each crop-area bin.
percentile_size = 10
# One percentile grid drives both axes, so the result matrix always matches the number of
# inner bins even when percentile_size is changed.
percentiles = range(0, 100 + percentile_size, percentile_size)
is_submitted_mask = df.is_submitted == True
crop_area_list = list(np.percentile(df.crop_area, percentiles))
n_bins = len(crop_area_list) - 1
accept_rates = np.zeros([n_bins, n_bins])
for i in range(n_bins):
    low_ca, high_ca = crop_area_list[i], crop_area_list[i+1]
    ca_mask = (df.crop_area > low_ca) & (df.crop_area < high_ca)
    depth_list = list(np.percentile(df[ca_mask & df.depth.notnull()].depth, percentiles))
    print(depth_list)
    for j in range(len(depth_list) - 1):
        low_d, high_d = depth_list[j], depth_list[j+1]
        d_mask = (df.depth > low_d) & (df.depth < high_d)
        mask = ca_mask & d_mask
        # Cells with no rows keep their initial value of 0.
        if df[mask].shape[0] > 0:
            accept_rates[i, j] = df[mask & is_submitted_mask].shape[0] / df[mask].shape[0]

plt.figure(figsize=(20, 10))
sns.heatmap(accept_rates.T, annot=True)
plt.xlabel('Crop Area Percentiles')
plt.ylabel('Depth Percentiles (conditional on crop area)')
plt.show()


In [None]:
# Acceptance rate on a grid of crop-area percentile bins x theta percentile bins, where the
# theta bins are recomputed inside each crop-area bin.
# NOTE: df.theta is only assigned in a later cell of this notebook, so on a fresh
# top-to-bottom run this cell fails until that cell has been executed.
percentile_size = 10
# One percentile grid drives both axes, so the result matrix always matches the number of
# inner bins even when percentile_size is changed.
percentiles = range(0, 100 + percentile_size, percentile_size)
is_submitted_mask = df.is_submitted == True
crop_area_list = list(np.percentile(df.crop_area, percentiles))
n_bins = len(crop_area_list) - 1
accept_rates = np.zeros([n_bins, n_bins])
for i in range(n_bins):
    low_ca, high_ca = crop_area_list[i], crop_area_list[i+1]
    ca_mask = (df.crop_area > low_ca) & (df.crop_area < high_ca)
    theta_list = list(np.percentile(df[ca_mask & df.theta.notnull()].theta, percentiles))
    for j in range(len(theta_list) - 1):
        low_t, high_t = theta_list[j], theta_list[j+1]
        t_mask = (df.theta > low_t) & (df.theta < high_t)
        mask = ca_mask & t_mask
        # Cells with no rows keep their initial value of 0.
        if df[mask].shape[0] > 0:
            accept_rates[i, j] = df[mask & is_submitted_mask].shape[0] / df[mask].shape[0]

plt.figure(figsize=(20, 10))
sns.heatmap(accept_rates.T, annot=True)
plt.xlabel('Crop Area Percentiles')
plt.ylabel('Theta Percentiles (conditional on crop area)')
plt.show()


In [None]:
# Rows whose crop area is above the 90th percentile of crop_area.
tdf = df.loc[df.crop_area > np.percentile(df.crop_area, 90)]

In [None]:
# Depth percentiles (step = percentile_size) among the rows in tdf, ignoring null depths.
list(np.percentile(tdf.depth.dropna(), range(0, 100+percentile_size, percentile_size)))

In [None]:
# World keypoints of the first row, kept for ad-hoc inspection.
wkp = df['world_keypoints'].iloc[0]


In [None]:
def compute_angle(wkp):
    """Angle in degrees of the vector from TAIL_NOTCH to UPPER_LIP.

    Computed as arctan(v[1] / v[0]) on the difference of the two keypoints, so the
    result lies within [-90, 90]. Returns None when wkp is None or empty.
    """
    if not wkp:
        return None
    direction = wkp['UPPER_LIP'] - wkp['TAIL_NOTCH']
    slope = direction[1] / direction[0]
    return np.arctan(slope) * 180 / np.pi
    
def generate_center_coordinate(metadata, x_direction=True):
    """Centre of the crop along x (default) or y, read from the metadata mapping.

    Note: this re-declares the function of the same name from an earlier cell with
    the same logic.
    """
    if not x_direction:
        return metadata['y_coord'] + 0.5 * metadata['height']
    return metadata['x_coord'] + 0.5 * metadata['width']




In [None]:
# Pixel offsets subtracted from the crop centroid so centroid_x / centroid_y are measured
# from a fixed reference point (presumably the full-frame image centre — TODO confirm).
IMAGE_CENTER_X = 2048
IMAGE_CENTER_Y = 1500

# Raw features plus squared and interaction terms for the acceptance model.
df['theta'] = df.world_keypoints.apply(lambda x: compute_angle(x))
df['depth'] = df.centroid_depth
df['square_depth'] = df.depth**2
df['square_length'] = df.length**2
df['square_theta'] = df.theta**2
df['length_theta'] = df.length * df.theta
df['length_depth'] = df.length * df.depth
df['length_square_theta'] = df.length * df.square_theta
df['depth_theta'] = df.depth * df.theta
df['depth_square_theta'] = df.depth * df.theta**2
df['centroid_x'] = df.metadata.apply(lambda x: generate_center_coordinate(x, x_direction=True)) - IMAGE_CENTER_X
df['centroid_y'] = df.metadata.apply(lambda x: generate_center_coordinate(x, x_direction=False)) - IMAGE_CENTER_Y
df['square_centroid_x'] = df.centroid_x**2
df['square_centroid_y'] = df.centroid_y**2
# NOTE: this replaces the boolean is_accepted column with a 0.0 / 1.0 float label.
df['is_accepted'] = 1.0 - df.is_skipped.astype(int)

# features = ['theta', 'square_length', 'square_theta', 'length_theta', 'length_square_theta', 'centroid_x', 
#             'centroid_y', 'square_centroid_x', 'square_centroid_y', 'depth', 'square_depth']

# features = ['depth', 'square_depth', 'centroid_x', 'centroid_y', 'square_centroid_x', 'square_centroid_y']
# features = ['centroid_x', 'centroid_y', 'square_centroid_x', 'square_centroid_y']
features = ['theta', 'square_theta', 'depth', 'square_depth', 'centroid_x', 'square_centroid_x', 'centroid_y', 'square_centroid_y']
# features = ['theta', 'square_theta', 'depth', 'square_depth']

# Drop rows with a null in any feature or in the label, then standardize each feature.
# StandardScaler (already imported by this notebook) centres and scales like the manual
# (X - mean) / std, but leaves a zero-variance column at 0 instead of dividing by zero.
null_mask = df[features + ['is_accepted']].isnull().any(axis=1)
X = StandardScaler().fit_transform(df.loc[~null_mask, features].values)
y = df.loc[~null_mask, 'is_accepted'].values

In [None]:
# Fit a logistic regression (default constructor arguments) of the accepted label y on
# the standardized feature matrix X.
clf = LogisticRegression().fit(X, y)


In [None]:
# Score the same rows the model was fitted on and plot precision against recall.
preds = clf.predict_proba(X)[:, 1]
precision, recall, thresholds = precision_recall_curve(y, preds)
plt.figure(figsize=(20, 10))
# Precision is on the x-axis here; the axes are labelled so the orientation is explicit.
plt.plot(precision, recall)
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.title('Logistic regression (evaluated on its training rows)')
plt.grid()
plt.show()

In [None]:
# Chance-level baseline: score every row with a uniform random number and draw the same
# precision/recall plot. The results are kept in baseline_* names so that `preds` (the
# model's probabilities) is not overwritten — the threshold cells after this one read `preds`.
baseline_preds = np.random.random(len(y))
baseline_precision, baseline_recall, baseline_thresholds = precision_recall_curve(y, baseline_preds)
plt.figure(figsize=(20, 10))
plt.plot(baseline_precision, baseline_recall)
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.title('Random-score baseline')
plt.grid()
plt.show()

In [None]:
# Share of rows scored above the threshold that carry the accepted label (precision).
threshold = 0.15
y[preds > threshold].sum() / np.count_nonzero(preds > threshold)

In [None]:
# Share of all accepted rows that are scored above the threshold (recall).
np.sum(y[preds > threshold]) / np.sum(y)