In [None]:
import json, os
import cv2
import torch
from multiprocessing import Pool, Manager
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.akpd import AKPD
from aquabyte.template_matching import find_matches_and_homography
from aquabyte.biomass_estimator import NormalizeCentered2D, NormalizedStabilityTransform, ToTensor, Network
from aquabyte.akpd_scorer import generate_confidence_score
from keras.models import load_model
import boto3
import pandas as pd
import numpy as np
import plotly.express as px
import time
from matplotlib import pyplot as plt
from copy import copy

from collections import defaultdict
import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression
from collections import defaultdict



In [None]:
# Biomass-output CSVs for pen 5; each (start, end) pair names one file in BIOMASS_OUTPUT_DIR.
BIOMASS_OUTPUT_DIR = '/root/data/alok/biomass_estimation/playground/output-pen=5'
BIOMASS_DATE_RANGES = [
    ('2019-06-05', '2019-06-12'),
    ('2019-06-12', '2019-06-19'),
    ('2019-06-19', '2019-06-26'),
    ('2019-06-26', '2019-07-03'),
    ('2019-07-03', '2019-07-04'),
]

df = pd.concat([
    pd.read_csv(os.path.join(
        BIOMASS_OUTPUT_DIR,
        'biomass_output,pen=5,range=({},{}).csv'.format(range_start, range_end)
    ))
    for range_start, range_end in BIOMASS_DATE_RANGES
])

df = df.sort_values('captured_at')
df['estimated_weight_g'] = df.weight

# keep only rows with an AKPD score above 0.9
df = df[df.akpd_score > 0.9].copy(deep=True)

# index by capture time and keep rows captured during hours 8 through 15
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour
hour_mask = (df.hour > 7) & (df.hour < 16)
df = df[hour_mask].copy(deep=True)

# collect the list of weights for every calendar day, keyed by 'YYYY-MM-DD'.
# Grouping on the calendar date avoids `df[date]` row lookup by string, which
# pandas deprecated in 1.2 and no longer supports in 2.0.
records = defaultdict(list)
for day, day_df in df.groupby(df.index.date):
    records[str(day)].extend(day_df.weight.values.tolist())

records_json = json.dumps(records)




<h1> How do AKPD keypoints compare to precise manual keypoints? </h1>

In [None]:
s3_access_utils = S3AccessUtils('/root/data')

# The PROD_SQL_CREDENTIALS environment variable names a JSON file with the
# database credentials; the context manager makes sure the file is closed.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# Non-QA keypoint annotations for pen 5 that have both a left and a right crop
query = """
SELECT * FROM keypoint_annotations
WHERE pen_id=5
AND captured_at BETWEEN '2019-06-05' AND '2019-07-02'
AND keypoints is not null
AND keypoints -> 'leftCrop' is not null
AND keypoints -> 'rightCrop' is not null
AND is_qa = FALSE;
"""

mdf = rds_access_utils.extract_from_database(query)

In [None]:
# URLs present both in the manual annotations and in the AKPD output
url_intersection = sorted(set(mdf.left_image_url).intersection(df.left_crop_url))

# Both frames are restricted to the shared URLs and sorted by URL so that the
# manual columns can be attached positionally.
# NOTE(review): this assumes every shared URL occurs exactly once in each frame - confirm.
tdf = df[df.left_crop_url.isin(url_intersection)].sort_values('left_crop_url')
manual_rows = mdf[mdf.left_image_url.isin(url_intersection)].sort_values('left_image_url')
tdf['manual_keypoints'] = manual_rows.keypoints.values
tdf['camera_metadata'] = manual_rows.camera_metadata.values


In [None]:
# Keypoint types compared between the AKPD and the manual annotations; each
# name is looked up as a `keypointType` in the left/right crop keypoints.
BODY_PARTS = [
    'UPPER_LIP', 'EYE', 'TAIL_NOTCH', 'PELVIC_FIN',
    'DORSAL_FIN', 'ADIPOSE_FIN', 'PECTORAL_FIN', 'ANAL_FIN',
]

In [None]:
def generate_disparity(x, body_part):
    """Return the absolute horizontal disparity of one keypoint in a stereo pair.

    Args:
        x: the annotation, either as a dict or as its JSON encoding (str or
            bytes). It must hold 'leftCrop' and 'rightCrop' lists whose items
            carry 'keypointType' and 'xFrame' entries.
        body_part: the keypointType to look up in both crops.

    Returns:
        abs(left xFrame - right xFrame) for the requested keypoint.

    Raises:
        KeyError: if body_part is missing from either crop.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # (e.g. numpy.str_) and raw bytes are decoded as well
    if isinstance(x, (str, bytes, bytearray)):
        x = json.loads(x)

    # only the x-coordinate matters for disparity; if a keypoint type is
    # repeated, the last occurrence wins
    left_x = {item['keypointType']: item['xFrame'] for item in x['leftCrop']}
    right_x = {item['keypointType']: item['xFrame'] for item in x['rightCrop']}
    return abs(left_x[body_part] - right_x[body_part])

# For every body part, summarise how far the AKPD disparity is from the manual
# disparity: percentiles of the absolute difference across all matched fish.
diffs_dict = {'percentile': ['5th', '25th', '50th', '75th', '95th', '99th']}
for body_part in BODY_PARTS:
    akpd_disparity = tdf.annotation.apply(lambda ann: generate_disparity(ann, body_part))
    manual_disparity = tdf.manual_keypoints.apply(lambda ann: generate_disparity(ann, body_part))
    abs_gap = abs(akpd_disparity.values - manual_disparity.values)
    diffs_dict[body_part] = np.percentile(abs_gap, [5, 25, 50, 75, 95, 99])


In [None]:
# Display the percentile table: one row per percentile, one column per body part
pd.DataFrame(diffs_dict)

<h1> How do AKPD weights compare to precise manual weights? </h1>

In [None]:
# Data transforms used to prepare a keypoint sample for the biomass network
normalize_centered_2D_transform_biomass = NormalizeCentered2D()
normalized_stability_transform = NormalizedStabilityTransform()
to_tensor_transform = ToTensor()

# Trained networks read from disk.
# NOTE(review): torch.load unpickles the file by default - only load trusted checkpoints.
biomass_network = torch.load(
    '/root/data/alok/biomass_estimation/results/neural_network/2019-11-08T00:13:09/nn_epoch_798.pb'
)
akpd_scorer_network = load_model(
    '/root/data/alok/biomass_estimation/playground/akpd_scorer_model_TF.h5'
)  # make this better

In [None]:
def generate_weight_prediction(row_id, keypoints, cm):
    """Run the biomass network on one set of stereo keypoints.

    Args:
        row_id: identifier stored in the sample as 'stereo_pair_id'.
        keypoints: keypoint annotation passed to the transforms as 'keypoints'.
        cm: camera metadata passed to the transforms as 'cm'.

    Returns:
        The scalar network output multiplied by 1e4.
        NOTE(review): presumably a weight in grams, since callers compare it
        with `estimated_weight_g` - confirm.
    """
    input_sample = {
        'keypoints': keypoints,
        'cm': cm,
        'stereo_pair_id': row_id,
        'single_point_inference': True
    }

    # apply the module-level transforms in order, calling them directly
    # instead of through an explicit `.__call__`
    normalized_centered_2D_kps = normalize_centered_2D_transform_biomass(input_sample)
    normalized_stability_kps = normalized_stability_transform(normalized_centered_2D_kps)
    tensorized_kps = to_tensor_transform(normalized_stability_kps)

    # inference only: no autograd graph is needed
    with torch.no_grad():
        network_output = biomass_network(tensorized_kps['kp_input']).item()

    return network_output * 1e4


In [None]:
# Predict a weight from the manual keypoints of every matched fish,
# printing the running row count every 100 rows.
weight_predictions = []
for count, (idx, row) in enumerate(tdf.iterrows()):
    weight_predictions.append(
        generate_weight_prediction(idx, row.manual_keypoints, row.camera_metadata)
    )
    if count % 100 == 0:
        print(count)

In [None]:
# Attach the predictions made from manual keypoints; `weight_predictions` was
# filled in `tdf` row order, so the assignment is positional
tdf['manual_weight'] = weight_predictions

In [None]:
# Histogram of per-fish differences between the AKPD-based weight and the
# weight predicted from manual keypoints
plt.hist(tdf.weight - tdf.manual_weight, bins=100)
plt.title('AKPD weight minus manual-keypoint weight')
plt.xlabel('Weight difference')
plt.ylabel('Number of fish')
plt.show()

In [None]:
def print_accuracy_metrics(x, y):
    """Print relative-error statistics of `x` measured against `y`.

    The relative error is (x - y) / y, reported as a fraction (it is not
    multiplied by 100). The mean signed error, the mean absolute error and the
    5th/25th/50th/75th/95th percentiles of the absolute error are printed.

    Args:
        x: predicted values.
        y: reference values, used as the denominator.
    """
    relative_error = (x - y) / y
    abs_relative_error = np.abs(relative_error)

    percentiles = [5, 25, 50, 75, 95]
    mean_error_pct = np.mean(relative_error)
    mean_abs_error_pct = np.mean(abs_relative_error)
    percentile_values = np.percentile(abs_relative_error, percentiles)

    print('Mean error percentage: {}'.format(mean_error_pct))
    print('Mean absolute error percentage: {}'.format(mean_abs_error_pct))
    for rank, value in zip(percentiles, percentile_values):
        print('{}th percentile absolute error percentage: {}'.format(rank, value))

In [None]:
# Relative error of the AKPD weights, using the manual-keypoint weights as the reference
print_accuracy_metrics(tdf.weight.values, tdf.manual_weight.values)

<h1> How do AKPD trendlines compare to Manual trendlines? </h1>

In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Estimate the daily growth rate by fitting an exponential curve.

    A linear regression of log(daily value) on the number of days since
    `start_date` is fitted, weighting each day by its sample size.

    Args:
        tdf: daily values indexed by date (their log is the regression target).
        rdf: per-day sample sizes, used as regression weights.
        start_date: 'YYYY-MM-DD' string marking day zero.
        end_date: accepted for signature compatibility; not used here.

    Returns:
        (growth_rate, trend_score): the fitted slope and the weighted score
        (`LinearRegression.score`) of the fit.
    """
    day_zero = dt.datetime.strptime(start_date, '%Y-%m-%d')
    day_offsets = [
        (dt.datetime.strptime(day, '%Y-%m-%d') - day_zero).days
        for day in tdf.index.date.astype(str)
    ]
    X = np.array(day_offsets).reshape(-1, 1)
    log_values = np.log(tdf.values)
    sample_weights = rdf.values

    model = LinearRegression().fit(X, log_values, sample_weight=sample_weights)
    slope = model.coef_[0]
    fit_score = model.score(X, log_values, sample_weight=sample_weights)
    return slope, fit_score


def compute_distribution_confidence(df, start_date, end_date, date):
    """Measure how much one day's weight distribution deviates from its window.

    The 0th..99th percentiles of the weights on `date` are compared with the
    percentiles of all weights in [start_date, end_date] (shifted so that both
    samples share the same mean). The result is the root mean square of the
    relative deviations over percentiles 1..98.

    Args:
        df: frame with a datetime index and an `estimated_weight_g` column.
        start_date, end_date: bounds of the window, as date strings.
        date: the day being assessed, as a date string.

    Returns:
        The RMS relative deviation (larger means less alike).
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    day_weights = df[date:date].estimated_weight_g

    # shift the window sample so its mean matches the day's mean
    mean_adjustment = day_weights.mean() - window_weights.mean()

    ranks = list(range(100))
    window_pcts = np.percentile(window_weights + mean_adjustment, ranks)
    day_pcts = np.percentile(day_weights, ranks)

    # the extreme percentiles (0 and 99) are left out of the comparison
    relative_dev = (window_pcts[1:99] - day_pcts[1:99]) / day_pcts[1:99]
    return np.mean(np.square(relative_dev)) ** 0.5


# NOTE: we need to think more carefully about this to understand how distribution
# confidence and trend score affect the minimum sample size we want. Hardcoded for now.
def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum sample size wanted in the smart-average window.

    Both arguments are currently ignored; the value is hardcoded.
    """
    return 5000


# Smart average is defined as a lookback to a maximum of window_size_d days (currently set to 7),
# or until the minimum sample size is achieved
def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Compute the growth-adjusted, sample-weighted average weight around `date`.

    Days within the lookback distance of `date` contribute their daily mean,
    scaled by exp(days_to_date * growth_rate) and weighted by their sample
    size. The lookback starts at 3 days and widens until the minimum sample
    size is reached, up to `window_size_d - 1` days.

    Args:
        df: per-fish frame with a datetime index and `estimated_weight_g`.
        tdf: daily mean weights indexed by date.
        rdf: daily sample sizes, aligned with `tdf`. It is not modified.
        date: 'YYYY-MM-DD' string the average is computed for.
        distribution_confidence, trend_score: forwarded to
            `compute_minimum_sample_size`.
        growth_rate: daily growth rate; replaced by 0.0 when `tdf` has one day.
        window_size_d: maximum window size in days.
        bucket_size: width of the distribution buckets, in thousands of the
            weight unit. Bucket keys are rounded to one decimal.

    Returns:
        dict with keys 'weightMovingAvg', 'weightMovingDist',
        'numMovingAvgBatiFish', 'numMovingAvgLookbackDays', 'dailyGrowthRate'.

    Raises:
        ValueError: if no samples fall inside the lookback window.
    """
    # keep the date strings in `tdf` order so they stay aligned with the
    # offsets, means and counts derived from the same index
    date_strs = list(tdf.index.date.astype(str))
    if len(date_strs) == 1:
        growth_rate = 0.0
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    target_day = dt.datetime.strptime(date, '%Y-%m-%d')
    day_offsets = np.array(
        [(target_day - dt.datetime.strptime(k, '%Y-%m-%d')).days for k in date_strs]
    )
    daily_means = np.asarray(tdf.values)
    # work on a copy: zeroing entries below must not write through to `rdf`
    daily_counts = np.array(rdf.values, copy=True)
    abs_offsets = np.abs(day_offsets)

    # widest lookback allowed; also the value used when the search range
    # below is empty (window_size_d <= 3)
    lookback_days = window_size_d - 1
    for candidate in range(3, window_size_d):
        if daily_counts[abs_offsets <= candidate].sum() >= minimum_sample_size:
            lookback_days = candidate
            break
    daily_counts[abs_offsets > lookback_days] = 0

    smart_average = 0.0
    sample_size = 0.0
    adj_weights = []
    total_days = 0
    for offset, mean_weight, n, this_date in zip(day_offsets, daily_means, daily_counts, date_strs):
        growth_factor = np.exp(offset * growth_rate)
        smart_average += growth_factor * mean_weight * n
        sample_size += n
        if n > 0:
            day_weights = df.loc[this_date:this_date, 'estimated_weight_g'].values
            adj_weights.extend(growth_factor * day_weights)
            total_days += 1

    if sample_size == 0:
        raise ValueError('no samples within {} days of {}'.format(lookback_days, date))
    smart_average /= sample_size

    adj_weights = np.array(adj_weights)
    distribution = {}
    buckets = [round(x, 1) for x in np.arange(0.0, 1e-3 * adj_weights.max(), bucket_size)]
    for b in buckets:
        low, high = 1e3 * b, 1e3 * (b + bucket_size)
        count = int(np.count_nonzero((adj_weights >= low) & (adj_weights < high)))
        distribution[b] = count / sample_size

    output = {
        'weightMovingAvg': float(smart_average),
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }

    return output


def compute_date_range(historical_dates, date, window_size_d):
    """Pick the `window_size_d`-day window around `date` that covers the most data.

    Candidate windows start with the one ending on `date` and are shifted
    forward one day at a time, up to `window_size_d // 2` days. The window
    containing the most historical dates wins; on ties the later (more
    centered) window is kept.

    Args:
        historical_dates: 'YYYY-MM-DD' strings for the days that have data.
        date: the current day as a 'YYYY-MM-DD' string.
        window_size_d: window length in days.

    Returns:
        (start_date, end_date) as 'YYYY-MM-DD' strings, both inclusive.
    """
    FMT = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, FMT)
    known_dates = np.array(historical_dates)

    best_count = 0
    start_date = end_date = None
    for shift in range(window_size_d // 2 + 1):
        window_end = anchor + dt.timedelta(days=shift)
        window_start = anchor - dt.timedelta(days=window_size_d - 1) + dt.timedelta(days=shift)
        lower, upper = window_start.strftime(FMT), window_end.strftime(FMT)

        # ISO date strings compare in chronological order
        covered = ((known_dates >= lower) & (known_dates <= upper)).sum()
        if covered >= best_count:
            best_count = covered
            start_date, end_date = lower, upper

    return start_date, end_date


def compute_metrics(date, records_json, window_size_d=7):
    """Compute raw and smart-average weight metrics for one day.

    Args:
        date: the day of interest as a 'YYYY-MM-DD' string.
        records_json: JSON object mapping date strings to lists of weights.
        window_size_d: window length in days used for the smart average.

    Returns:
        (raw_avg_weight, raw_sample_size, smart_average, metadata) where
        `smart_average` is the dict produced by `compute_smart_average` and
        `metadata` holds 'trend_score' and 'distribution_confidence' (both
        None when the window spans a single day).
    """
    records = json.loads(records_json)

    # flatten {date: [weights]} into one row per weight, indexed by its date
    dts = [iter_date for iter_date in records for _ in records[iter_date]]
    vals = [val for iter_date in records for val in records[iter_date]]
    df = pd.DataFrame(vals, index=pd.to_datetime(dts), columns=['estimated_weight_g'])

    # raw statistics for the requested day
    day_rows = df[date:date]
    raw_avg_weight = day_rows.estimated_weight_g.mean()
    raw_sample_size = day_rows.shape[0]

    # window of days to aggregate over
    historical_dates = sorted(set(df.index.date.astype(str)))
    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)

    # per-day sample sizes (rdf) and means (tdf), dropping days without samples
    window_weights = df[start_date:end_date].estimated_weight_g
    rdf = window_weights.resample('D').agg(lambda x: x.shape[0])
    tdf = window_weights.resample('D').agg(lambda x: x.mean())
    tdf = tdf[rdf > 0].copy(deep=True)
    rdf = rdf[rdf > 0].copy(deep=True)

    growth_rate = trend_score = distribution_confidence = None
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)

    smart_average = compute_smart_average(
        df, tdf, rdf, date,
        distribution_confidence, growth_rate,
        trend_score, window_size_d,
    )
    metadata = {
        'trend_score': trend_score,
        'distribution_confidence': distribution_confidence
    }

    return raw_avg_weight, raw_sample_size, smart_average, metadata

In [None]:
# Trend metrics from the AKPD weights, restricted to the fish matched with manual keypoints
tdf['estimated_weight_g'] = tdf.weight
tdf.index = pd.to_datetime(tdf.captured_at)
tdf['hour'] = tdf.index.hour
hour_mask = (tdf.hour > 7) & (tdf.hour < 16)
kdf = tdf[hour_mask].copy(deep=True)

# collect the list of weights for every calendar day, keyed by 'YYYY-MM-DD'.
# Grouping on the calendar date avoids `kdf[date]` row lookup by string, which
# pandas deprecated in 1.2 and no longer supports in 2.0.
records = defaultdict(list)
for day, day_df in kdf.groupby(kdf.index.date):
    records[str(day)].extend(day_df.estimated_weight_g.values.tolist())

records_json = json.dumps(records)

dates = sorted(list(set(kdf.index.date.astype(str))))
raw_avg_weights, raw_sample_sizes, growth_rates, trend_scores, smart_averages, distribution_confidences = [], [], [], [], [], []
for date in dates:
    print(date)
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics(date, records_json)
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    distribution_confidences.append(metadata['distribution_confidence'])

fig, axes = plt.subplots(5, 1, figsize=(10, 20))
x_values = kdf.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights, label='Raw Avg.')
axes[0].plot(x_values, smart_averages, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
# every line carries a label so that legend() below has an entry on each axis
axes[2].plot(x_values, growth_rates, label='Daily Growth Rate')
axes[3].plot(x_values, trend_scores, label='Trend Score')
axes[4].plot(x_values, distribution_confidences, label='Distribution Instability')
for i, title in zip([0, 1, 2, 3, 4], ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']):
    axes[i].set_title(title)
    axes[i].grid()
    axes[i].legend()
plt.show()


In [None]:
# Trend metrics from the weights predicted with manual keypoints
tdf['estimated_weight_g'] = tdf.manual_weight
tdf.index = pd.to_datetime(tdf.captured_at)
tdf['hour'] = tdf.index.hour
hour_mask = (tdf.hour > 7) & (tdf.hour < 16)
kdf = tdf[hour_mask].copy(deep=True)

# collect the list of weights for every calendar day, keyed by 'YYYY-MM-DD'.
# Grouping on the calendar date avoids `kdf[date]` row lookup by string, which
# pandas deprecated in 1.2 and no longer supports in 2.0.
records = defaultdict(list)
for day, day_df in kdf.groupby(kdf.index.date):
    records[str(day)].extend(day_df.estimated_weight_g.values.tolist())

records_json = json.dumps(records)

dates = sorted(list(set(kdf.index.date.astype(str))))
raw_avg_weights, raw_sample_sizes, growth_rates, trend_scores, smart_averages, distribution_confidences = [], [], [], [], [], []
for date in dates:
    print(date)
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics(date, records_json)
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    distribution_confidences.append(metadata['distribution_confidence'])

fig, axes = plt.subplots(5, 1, figsize=(10, 20))
x_values = kdf.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights, label='Raw Avg.')
axes[0].plot(x_values, smart_averages, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
# every line carries a label so that legend() below has an entry on each axis
axes[2].plot(x_values, growth_rates, label='Daily Growth Rate')
axes[3].plot(x_values, trend_scores, label='Trend Score')
axes[4].plot(x_values, distribution_confidences, label='Distribution Instability')
for i, title in zip([0, 1, 2, 3, 4], ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']):
    axes[i].set_title(title)
    axes[i].grid()
    axes[i].legend()
plt.show()


In [None]:
# Trend metrics from the AKPD weights of the full (unmatched) data set
df['estimated_weight_g'] = df.weight
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour
hour_mask = (df.hour > 7) & (df.hour < 16)
kdf = df[hour_mask].copy(deep=True)

# collect the list of weights for every calendar day, keyed by 'YYYY-MM-DD'.
# Grouping on the calendar date avoids `kdf[date]` row lookup by string, which
# pandas deprecated in 1.2 and no longer supports in 2.0.
records = defaultdict(list)
for day, day_df in kdf.groupby(kdf.index.date):
    records[str(day)].extend(day_df.estimated_weight_g.values.tolist())

records_json = json.dumps(records)

dates = sorted(list(set(kdf.index.date.astype(str))))
raw_avg_weights_2, raw_sample_sizes_2, growth_rates_2, trend_scores_2, smart_averages_2, distribution_confidences_2 = \
    [], [], [], [], [], []
for date in dates:
    print(date)
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics(date, records_json)
    growth_rates_2.append(smart_average['dailyGrowthRate'])
    trend_scores_2.append(metadata['trend_score'])
    raw_avg_weights_2.append(raw_avg_weight)
    raw_sample_sizes_2.append(raw_sample_size)
    smart_averages_2.append(smart_average['weightMovingAvg'])
    distribution_confidences_2.append(metadata['distribution_confidence'])

fig, axes = plt.subplots(5, 1, figsize=(10, 20))
x_values = kdf.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights_2, label='Raw Avg.')
axes[0].plot(x_values, smart_averages_2, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages_2), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages_2), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes_2, label='Raw Daily Sample Size')
# every line carries a label so that legend() below has an entry on each axis
axes[2].plot(x_values, growth_rates_2, label='Daily Growth Rate')
axes[3].plot(x_values, trend_scores_2, label='Trend Score')
axes[4].plot(x_values, distribution_confidences_2, label='Distribution Instability')
for i, title in zip([0, 1, 2, 3, 4], ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']):
    axes[i].set_title(title)
    axes[i].grid()
    axes[i].legend()
plt.show()


In [None]:
# Overlay the daily sample sizes of the two runs on one figure.
# When the cells are run top to bottom, `raw_sample_sizes` holds the values of
# the manual-keypoint run (the last cell that filled it), while `x_values` and
# `raw_sample_sizes_2` come from the full AKPD run.
# NOTE(review): the [:-1] slices assume the full AKPD run covers exactly one
# more day (the last one) than the manual run - confirm before relying on it.
plt.figure(figsize=(20, 10))
plt.plot(x_values[:-1], raw_sample_sizes, label='Manual Daily Sample Size')
plt.plot(x_values[:-1], raw_sample_sizes_2[:-1], label='AKPD Daily Sample Size')
# plt.plot(x_values[:-1], 1.02 * np.array(smart_averages_2[:-1]), color='red', linestyle='--', label='Smart Avg. +/-2%')
# plt.plot(x_values[:-1], 0.98 * np.array(smart_averages_2[:-1]), color='red', linestyle='--', label='Smart Avg. +/-2%')
plt.title('AKPD vs. Manual on Blom Kjeppevikholmen')
# plt.plot(x_values[:-1], smart_averages, label='Manual Raw Average Weights')
# plt.plot(x_values[:-1], smart_averages_2[:-1], label='AKPD Raw Average Weights')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Compare the last smart average with a sample-weighted reference average.
# Each row is (weight, days to project back, sample count); every weight is
# scaled by exp(-days * growth rate) before averaging.
# NOTE(review): the figures are hard-coded; their source is not shown in this notebook.
i = -1
reference_points = [
    (4.73, 7, 17651),
    (4.78, 8, 13524),
    (4.85, 9, 3960),
    (5.36, 29, 15259),
    (5.52, 35, 23111),
]
g = growth_rates[i]
x = sum(w * np.exp(-d * g) * n for w, d, n in reference_points) / \
    sum(n for _, _, n in reference_points)

# relative gap between the smart average and the reference (scaled by 1e3)
print((smart_averages[-1] - x * 1e3) / (x * 1e3))

In [None]:
# Compare the last full-AKPD smart average with a sample-weighted reference average.
# Each row is (weight, days to project back, sample count); every weight is
# scaled by exp(-days * growth rate) before averaging.
# NOTE(review): the figures are hard-coded; their source is not shown in this notebook.
i = -1
reference_points = [
    (4.73, 6, 17651),
    (4.78, 7, 13524),
    (4.85, 8, 3960),
    (5.36, 28, 15259),
    (5.52, 34, 23111),
]
g = growth_rates_2[i]
x = sum(w * np.exp(-d * g) * n for w, d, n in reference_points) / \
    sum(n for _, _, n in reference_points)

# relative gap between the smart average and the reference (scaled by 1e3)
print((smart_averages_2[-1] - x * 1e3) / (x * 1e3))