In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from weight_estimation.body_parts import core_body_parts
from weight_estimation.utils import convert_to_world_point_arr, get_left_right_keypoint_arrs, CameraMetadata

from weight_estimation.weight_estimator import WeightEstimator


<h1> Load Datasets </h1>

<h2> Load Kjeppevikholmen AKPD annotations </h2>

In [None]:
# Biomass-output CSV exports for pen 5, one file per date range, in chronological order.
biomass_csv_paths = [
    '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-05,2019-06-12).csv',
    '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-12,2019-06-19).csv',
    '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-19,2019-06-26).csv',
    '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-26,2019-07-03).csv',
    '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-07-03,2019-07-04).csv',
]

# Read every export, stack them into one frame and order the rows by capture time.
df = pd.concat([pd.read_csv(path) for path in biomass_csv_paths]).sort_values('captured_at')

# Expose the estimated weight under an explicit name, index the frame by capture
# timestamp, and derive the hour of day from that index.
df['estimated_weight_g'] = df['weight']
df.index = pd.to_datetime(df['captured_at'])
df['hour'] = df.index.hour


<h2> Load Kjeppevikholmen Manual Annotations & Merge </h2>

In [None]:
# Read the credential files through context managers so the file handles are
# closed as soon as the JSON has been parsed (a bare json.load(open(...)) leaves
# them open until garbage collection).
with open(os.environ['AWS_CREDENTIALS']) as aws_credentials_file:
    aws_credentials = json.load(aws_credentials_file)
with open(os.environ['PROD_SQL_CREDENTIALS']) as sql_credentials_file:
    sql_credentials = json.load(sql_credentials_file)

# NOTE(review): '/root/data' is presumably the local root directory used by
# S3AccessUtils for downloads -- confirm against the helper's definition.
s3 = S3AccessUtils('/root/data', aws_credentials)
rds = RDSAccessUtils(sql_credentials)

# Non-QA keypoint annotations for pen 5 within the study window.
query = """
    SELECT * FROM keypoint_annotations
    WHERE pen_id=5
    AND captured_at BETWEEN '2019-06-05' AND '2019-07-02'
    AND is_qa = FALSE;
"""

mdf = rds.extract_from_database(query)

In [None]:
# URLs that have both an automatic (AKPD) result in df and a manual annotation in mdf.
url_intersection = sorted(set(mdf.left_image_url).intersection(df.left_crop_url))

# One manual-annotation row per image URL, indexed by that URL, so the columns
# below can be attached by key. Positional assignment via `.values` silently
# misaligns (or raises on a length mismatch) as soon as either frame contains a
# repeated URL. If a URL was annotated more than once, the first row returned by
# the query is kept.
manual_by_url = (
    mdf[mdf.left_image_url.isin(url_intersection)]
    .drop_duplicates('left_image_url')
    .set_index('left_image_url')
)

# .copy() gives tdf its own data, so adding columns does not write into a slice of df.
tdf = df[df.left_crop_url.isin(url_intersection)].sort_values('left_crop_url').copy()
tdf['manual_keypoints'] = tdf['left_crop_url'].map(manual_by_url['keypoints'])
tdf['camera_metadata'] = tdf['left_crop_url'].map(manual_by_url['camera_metadata'])


<h1> Compute Jitter Column </h1>

In [None]:
# Parallel accumulators: each list receives one entry per (fish, body part) pair.
body_parts, depths, weights, lengths, akpd_scores, diffs_l_x, diffs_r_x, diffs_l_y, diffs_r_y = \
    [], [], [], [], [], [], [], [], []


def _keypoints_by_type(crop_keypoints):
    """Return {keypointType: [xFrame, yFrame]} for one crop's list of keypoint dicts."""
    return {item['keypointType']: [item['xFrame'], item['yFrame']] for item in crop_keypoints}


# Positions of the two keypoints that define the 2D fish length. They do not
# change between rows, so they are looked up once instead of on every iteration.
upper_lip_idx = core_body_parts.index('UPPER_LIP')
tail_notch_idx = core_body_parts.index('TAIL_NOTCH')

for idx, row in tdf.iterrows():

    # ignore fish whose automatic keypoints scored below the AKPD cutoff
    if row.akpd_score < 0.01:
        continue

    # Skip rows without a manual annotation for both crops. The weight-comparison
    # cell further down applies the same guard; without it such a row would crash
    # the keypoint conversion below.
    manual_keypoints = row.manual_keypoints
    if not manual_keypoints or not manual_keypoints.get('leftCrop') or not manual_keypoints.get('rightCrop'):
        continue

    akpd_keypoints = json.loads(row.annotation)
    weight = row.estimated_weight_g
    akpd_score = row.akpd_score

    # compute depth from manual keypoints
    cm = row.camera_metadata
    camera_metadata = CameraMetadata(
        baseline_m=cm['baseline'],
        focal_length=cm['focalLength'],
        focal_length_pixel=cm['focalLengthPixel'],
        pixel_count_width=cm['pixelCountWidth'],
        pixel_count_height=cm['pixelCountHeight'],
        image_sensor_width=cm['imageSensorWidth'],
        image_sensor_height=cm['imageSensorHeight'],
    )

    left_kps, right_kps = get_left_right_keypoint_arrs(manual_keypoints)
    wkps = convert_to_world_point_arr(left_kps, right_kps, camera_metadata)
    # the median of column 1 of the world-point array is used as the fish depth
    depth = np.median(wkps[:, 1])

    # 2D length: upper lip to tail notch distance, averaged over the two crops
    length_2d_left = np.linalg.norm(left_kps[upper_lip_idx] - left_kps[tail_notch_idx])
    length_2d_right = np.linalg.norm(right_kps[upper_lip_idx] - right_kps[tail_notch_idx])
    mean_length_2d = 0.5 * (length_2d_left + length_2d_right)

    # frame coordinates per keypoint type: manual (m) vs automatic (a)
    ann_dict_left_kps_m = _keypoints_by_type(manual_keypoints['leftCrop'])
    ann_dict_right_kps_m = _keypoints_by_type(manual_keypoints['rightCrop'])
    ann_dict_left_kps_a = _keypoints_by_type(akpd_keypoints['leftCrop'])
    ann_dict_right_kps_a = _keypoints_by_type(akpd_keypoints['rightCrop'])

    for body_part in core_body_parts:
        # Manual minus automatic coordinate, per crop and per axis. All four are
        # computed before anything is appended so the lists stay the same length
        # even if a lookup fails.
        diff_l_x = ann_dict_left_kps_m[body_part][0] - ann_dict_left_kps_a[body_part][0]
        diff_r_x = ann_dict_right_kps_m[body_part][0] - ann_dict_right_kps_a[body_part][0]
        diff_l_y = ann_dict_left_kps_m[body_part][1] - ann_dict_left_kps_a[body_part][1]
        diff_r_y = ann_dict_right_kps_m[body_part][1] - ann_dict_right_kps_a[body_part][1]

        body_parts.append(body_part)
        depths.append(depth)
        weights.append(weight)
        lengths.append(mean_length_2d)
        akpd_scores.append(akpd_score)
        diffs_l_x.append(diff_l_x)
        diffs_r_x.append(diff_r_x)
        diffs_l_y.append(diff_l_y)
        diffs_r_y.append(diff_r_y)

In [None]:
# Long-format jitter table assembled from the parallel accumulator lists:
# one row per (fish, body part) pair, columns in the order given here.
rdf = pd.DataFrame(dict(
    body_part=body_parts,
    depth=depths,
    weight=weights,
    length_2d=lengths,
    akpd_score=akpd_scores,
    diff_l_x=diffs_l_x,
    diff_r_x=diffs_r_x,
    diff_l_y=diffs_l_y,
    diff_r_y=diffs_r_y,
))

<h1> Visualizations </h1>

<h2> Visualize x-axis diff standard deviation (i.e. jitter) broken down by depth bucket </h2>

In [None]:
from sklearn.linear_model import LinearRegression

def compute_regression_line(buckets, stds):
    """Fit a straight line to per-bucket standard deviations.

    The regressor is the bucket's position (0, 1, 2, ...) in ``buckets``, not the
    bucket's numeric bounds.

    Args:
        buckets: sequence of bucket labels; only its length is used.
        stds: sequence of standard deviations, one per bucket. Non-finite
            entries (e.g. NaN from a bucket with fewer than two samples) are
            left out of the fit instead of making ``LinearRegression.fit`` raise.

    Returns:
        ``(slope, intercept)`` as Python floats, or ``(nan, nan)`` when no
        bucket has a finite standard deviation.
    """
    X = np.arange(len(buckets)).reshape(-1, 1)
    y = np.asarray(stds, dtype=float)

    # keep only buckets that actually produced a standard deviation
    valid = np.isfinite(y)
    if not valid.any():
        return float('nan'), float('nan')

    reg = LinearRegression().fit(X[valid], y[valid])
    # coef_ is a one-element array here; index it explicitly, because float() on
    # an array with ndim > 0 is deprecated in recent NumPy releases.
    return float(reg.coef_[0]), float(reg.intercept_)

# One panel per body part on a 2x4 grid: x-axis jitter (std of diff_l_x) per
# depth bucket, with a fitted trend line on top.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))

# Bucket edges and the outlier filter are the same for every body part.
depth_buckets = np.arange(0.5, 2.2, 0.1)
is_inlier = rdf.diff_l_x.abs() < 50

for idx, body_part in enumerate(core_body_parts):
    is_body_part = rdf.body_part == body_part

    buckets, stds = [], []
    for low_depth, high_depth in zip(depth_buckets[:-1], depth_buckets[1:]):
        # both bucket edges are inclusive
        in_bucket = rdf.depth.between(low_depth, high_depth)
        buckets.append('{}-{}'.format(round(low_depth, 2), round(high_depth, 2)))
        stds.append(rdf.loc[in_bucket & is_body_part & is_inlier, 'diff_l_x'].std())

    # empirical standard deviation for each depth bucket
    row, col = divmod(idx, 4)
    ax = axes[row, col]
    ax.plot(stds, label='empirical jitter std values')
    ax.set_xticks(range(len(buckets)))
    ax.set_xticklabels(buckets, rotation=90)
    ax.grid()
    ax.set(title=body_part, xlabel='Depth bucket (m)', ylabel='Jitter standard deviation')

    # straight-line fit of std against bucket position, drawn over the same ticks
    m, b = compute_regression_line(buckets, stds)
    x_values = np.arange(len(buckets))
    y_values = m * x_values + b
    regression_label = 'Regression line: std = {}x + {}'.format(round(m, 2), round(b, 2))
    ax.plot(x_values, y_values, linestyle='dashed', color='red', label=regression_label)
    ax.legend()

fig.subplots_adjust(hspace=0.5)

plt.show()

In [None]:
# stats.probplot is used below, but scipy.stats is not imported by any of the
# visible cells, so this cell raised NameError on a fresh kernel.
from scipy import stats

# For every body part, draw a normal Q-Q plot of the left-crop x-axis diffs in
# each depth bucket (one 4x4 figure per body part).
for body_part in core_body_parts:
    depth_buckets = np.arange(0.5, 2.2, 0.1)
    fig, axes = plt.subplots(4, 4, figsize=(20, 10))

    idx = 0
    print('QQ plots by depth bucket for body part: {}'.format(body_part))
    for low_depth, high_depth in zip(depth_buckets, depth_buckets[1:]):
        # both bucket edges are inclusive; unlike the jitter plots, no |diff| < 50
        # outlier filter is applied here
        mask = (rdf.depth >= low_depth) & (rdf.depth <= high_depth) & (rdf.body_part == body_part)
        vals = rdf[mask].diff_l_x.values

        # fill the grid row by row
        row, col = idx // 4, idx % 4
        ax = axes[row, col]
        stats.probplot(vals, dist='norm', plot=ax)
        ax.set_title('Depth bucket (m): {}-{}'.format(round(low_depth, 2), round(high_depth, 2)))

        idx += 1

    fig.subplots_adjust(hspace=0.5)
    plt.show()
        

<h2> Examine Jitter standard deviation as a function of weight </h2>

In [None]:
# One panel per body part on a 2x4 grid: x-axis jitter (std of diff_l_x) per
# estimated-weight bucket.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))

# Bucket edges and the outlier filter are the same for every body part.
weight_buckets = np.arange(1000, 10000, 1000)
is_inlier = rdf.diff_l_x.abs() < 50

for idx, body_part in enumerate(core_body_parts):
    is_body_part = rdf.body_part == body_part

    buckets, stds = [], []
    for low_weight, high_weight in zip(weight_buckets[:-1], weight_buckets[1:]):
        # both bucket edges are inclusive
        in_bucket = rdf.weight.between(low_weight, high_weight)
        buckets.append('{}-{}'.format(round(low_weight, 2), round(high_weight, 2)))
        stds.append(rdf.loc[in_bucket & is_body_part & is_inlier, 'diff_l_x'].std())

    # empirical standard deviation for each weight bucket
    row, col = divmod(idx, 4)
    ax = axes[row, col]
    ax.plot(stds, label='empirical jitter std values')
    ax.set_xticks(range(len(buckets)))
    ax.set_xticklabels(buckets, rotation=90)
    ax.grid()
    ax.set(title=body_part, xlabel='Weight bucket (g)', ylabel='Jitter standard deviation')

fig.subplots_adjust(hspace=0.5)

plt.show()

<h2> Examine AKPD score as a function of weight </h2>

In [None]:
# rdf repeats a fish's akpd_score on every one of its body-part rows, so the mean
# is taken over the rows of a single reference body part (one row per fish).
# The body part is chosen explicitly rather than read from a loop variable left
# behind by an earlier cell; the last entry of core_body_parts is the value such
# a leaked variable holds after a top-to-bottom run.
reference_body_part = core_body_parts[-1]

fig, ax = plt.subplots(figsize=(10, 5))
weight_buckets = np.arange(0, 9000, 1000)

# Named mean_akpd_scores so the per-row `akpd_scores` accumulator that feeds rdf
# is not overwritten by this cell.
buckets, mean_akpd_scores = [], []
for low_weight, high_weight in zip(weight_buckets, weight_buckets[1:]):
    bucket = '{}-{}'.format(round(low_weight, 2), round(high_weight, 2))
    # inclusive bucket edges, reference body part only, |x diff| outliers removed
    mask = (rdf.weight >= low_weight) & (rdf.weight <= high_weight) & \
           (rdf.body_part == reference_body_part) & (rdf.diff_l_x.abs() < 50)
    mean_akpd_score = rdf[mask].akpd_score.mean()

    buckets.append(bucket)
    mean_akpd_scores.append(mean_akpd_score)

ax.plot(mean_akpd_scores, label='per-bucket mean akpd score')
ax.set_xticks(range(len(buckets)))
ax.set_xticklabels(buckets, rotation=90)
ax.grid()
ax.set_title('AKPD score vs. weight bucket')
ax.set_xlabel('Weight bucket (g)')
ax.set_ylabel('AKPD score')

# render the figure instead of echoing the repr of the last set_* call
plt.show()

<h2> Examine AKPD score versus 2D fish length </h2>

In [None]:
# rdf repeats a fish's akpd_score on every one of its body-part rows, so the mean
# is taken over the rows of a single reference body part (one row per fish).
# The body part is chosen explicitly rather than read from a loop variable left
# behind by an earlier cell; the last entry of core_body_parts is the value such
# a leaked variable holds after a top-to-bottom run.
reference_body_part = core_body_parts[-1]

fig, ax = plt.subplots(figsize=(10, 5))
length_cutoffs = np.arange(800, 4000, 200)

# Named mean_akpd_scores so the per-row `akpd_scores` accumulator that feeds rdf
# is not overwritten by this cell.
buckets, mean_akpd_scores = [], []
for low_length, high_length in zip(length_cutoffs, length_cutoffs[1:]):
    bucket = '{}-{}'.format(round(low_length, 2), round(high_length, 2))
    # inclusive bucket edges, reference body part only, |x diff| outliers removed
    mask = (rdf.length_2d >= low_length) & (rdf.length_2d <= high_length) & \
           (rdf.body_part == reference_body_part) & (rdf.diff_l_x.abs() < 50)
    mean_akpd_score = rdf[mask].akpd_score.mean()

    buckets.append(bucket)
    mean_akpd_scores.append(mean_akpd_score)

ax.plot(mean_akpd_scores, label='per-bucket mean akpd score')
ax.set_xticks(range(len(buckets)))
ax.set_xticklabels(buckets, rotation=90)
ax.grid()
ax.set_title('AKPD score vs. 2D length bucket')
ax.set_xlabel('2D length bucket (pixels)')
ax.set_ylabel('AKPD score')

# render the figure instead of echoing the repr of the last set_* call
plt.show()

<h1> Compare average weight between manual pipeline and automatic pipeline </h1>

In [None]:
from research.weight_estimation.akpd_utils.akpd_scorer import generate_confidence_score
from keras.models import load_model
from typing import Dict, List

In [None]:
def compute_akpd_score(akpd_scorer_network, keypoints: Dict, camera_metadata: Dict) -> float:
    """Score one keypoint annotation with the AKPD scorer network.

    Wraps the annotation and its camera metadata in the sample dict passed to
    ``generate_confidence_score`` (keys ``keypoints``, ``cm``, ``stereo_pair_id``
    fixed to 0, ``single_point_inference`` set to True) and returns that
    function's result unchanged.

    Args:
        akpd_scorer_network: scorer model handed through to ``generate_confidence_score``.
        keypoints: keypoint annotation to score.
        camera_metadata: camera metadata associated with the annotation.

    Returns:
        Whatever ``generate_confidence_score`` returns for the sample.
    """
    return generate_confidence_score(
        dict(
            keypoints=keypoints,
            cm=camera_metadata,
            stereo_pair_id=0,
            single_point_inference=True,
        ),
        akpd_scorer_network,
    )



# Weight estimator: fetch the weight model and the k-factor model, then build the
# estimator from the first element of each download result.
# NOTE(review): download_from_url returns a 3-tuple; the first element is
# presumably the local file path -- confirm against S3AccessUtils.
weight_model_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/trained_models/2020-11-27T00-00-00/weight_model_synthetic_data.pb'
kf_model_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/k-factor/playground/kf_predictor_v2.pb'
weight_model_f, _, _ = s3.download_from_url(weight_model_url)
kf_model_f, _, _ = s3.download_from_url(kf_model_url)
weight_estimator = WeightEstimator(weight_model_f, kf_model_f)

# AKPD scorer: fetch the Keras .h5 model and load it.
akpd_scorer_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/keypoint-detection-scorer/akpd_scorer_model_TF.h5'
akpd_scorer_f, _, _ = s3.download_from_url(akpd_scorer_url)
akpd_scorer_network = load_model(akpd_scorer_f)

# Per-row results, appended in tdf row order so they can be attached as columns afterwards.
weights_manual_pipeline, weights_auto, manual_akpd_scores = [], [], []
count = 0  # number of rows that had a usable manual annotation
for idx, row in tdf.iterrows():

    camera_metadata = row.camera_metadata
    cm = CameraMetadata(
        focal_length=camera_metadata['focalLength'],
        focal_length_pixel=camera_metadata['focalLengthPixel'],
        baseline_m=camera_metadata['baseline'],
        pixel_count_width=camera_metadata['pixelCountWidth'],
        pixel_count_height=camera_metadata['pixelCountHeight'],
        image_sensor_width=camera_metadata['imageSensorWidth'],
        image_sensor_height=camera_metadata['imageSensorHeight']
    )

    # automatic pipeline: weight predicted from the AKPD keypoints
    weight_auto, length, kf = weight_estimator.predict(json.loads(row.annotation), cm)
    weights_auto.append(weight_auto)

    # manual pipeline: needs a manual annotation with both crops present
    ann = row.manual_keypoints
    if not (ann and ann.get('leftCrop') and ann.get('rightCrop')):
        # keep the lists aligned with tdf by recording placeholders
        weights_manual_pipeline.append(None)
        manual_akpd_scores.append(None)
    else:
        # AKPD score of the manual keypoints
        manual_akpd_score = compute_akpd_score(akpd_scorer_network, row.manual_keypoints, camera_metadata)
        manual_akpd_scores.append(manual_akpd_score)

        # weight predicted from the manual keypoints
        weight, length, kf = weight_estimator.predict(ann, cm)
        weights_manual_pipeline.append(weight)

        # progress report every 1000 processed manual annotations
        count += 1
        if count % 1000 == 0:
            print(count)


tdf['weight_manual'] = weights_manual_pipeline
tdf['weight_auto'] = weights_auto
tdf['manual_akpd_score'] = manual_akpd_scores


In [None]:
# Mean manual-pipeline weight over fish whose manual keypoints pass the AKPD score cutoff.
tdf.loc[tdf['manual_akpd_score'] > 0.01, 'weight_manual'].mean()

In [None]:
# Mean automatic-pipeline weight over fish whose AKPD keypoints pass the score cutoff.
tdf.loc[tdf['akpd_score'] > 0.01, 'weight_auto'].mean()

In [None]:
# Compare the two pipelines inside 1 kg buckets of the manual-pipeline weight.
weight_cutoffs = np.arange(0, 10000, 1000)
avg_weight_auto_list, avg_weight_manual_list, manual_sample_sizes, auto_sample_sizes = [], [], [], []

# Score filters do not depend on the bucket, so evaluate them once.
auto_score_ok = tdf.akpd_score > 0.01
manual_score_ok = tdf.manual_akpd_score > 0.01

for low_weight, high_weight in zip(weight_cutoffs[:-1], weight_cutoffs[1:]):
    # bucket membership is decided by the manual weight; both edges are exclusive
    mask = (tdf.weight_manual > low_weight) & (tdf.weight_manual < high_weight)
    auto_rows = tdf[mask & auto_score_ok]
    manual_rows = tdf[mask & manual_score_ok]

    avg_weight_auto_list.append(auto_rows.weight_auto.mean())
    avg_weight_manual_list.append(manual_rows.weight_manual.mean())
    manual_sample_sizes.append(len(manual_rows))
    auto_sample_sizes.append(len(auto_rows))
    


In [None]:
# One tuple per weight bucket: (mean auto weight, mean manual weight, manual n, auto n).
[*zip(avg_weight_auto_list, avg_weight_manual_list, manual_sample_sizes, auto_sample_sizes)]

In [None]:
# Distribution of AKPD scores assigned to the manual annotations (100 bins).
fig, ax = plt.subplots()
ax.hist(tdf['manual_akpd_score'], bins=100)
plt.show()

In [None]:
# Share of fish passing the automatic score cutoff that also pass the manual one.
manual_pass_urls = set(tdf.loc[tdf['manual_akpd_score'] > 0.01, 'left_crop_url'])
auto_pass_urls = set(tdf.loc[tdf['akpd_score'] > 0.01, 'left_crop_url'])
intersection = manual_pass_urls & auto_pass_urls
len(intersection) / len(tdf[tdf['akpd_score'] > 0.01])

In [None]:
# Mean estimated weight of fish that pass the manual score cutoff but fail the automatic one.
mask = (tdf['manual_akpd_score'] > 0.01) & (tdf['akpd_score'] <= 0.01)
tdf.loc[mask, 'estimated_weight_g'].mean()

In [None]:
# Mean estimated weight of fish that pass the automatic AKPD score cutoff.
tdf.loc[tdf['akpd_score'] > 0.01, 'estimated_weight_g'].mean()