In [None]:
import math
import json
import os
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta, time
from research.utils.data_access_utils import RDSAccessUtils
from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point
import matplotlib.pyplot as plt
from matplotlib.dates import AutoDateFormatter, AutoDateLocator

# Build the RDS access helper from the JSON credentials file whose path is
# given by the DATA_WAREHOUSE_SQL_CREDENTIALS environment variable. The
# context manager closes the file handle as soon as the JSON has been parsed
# (the previous bare open() left it to the garbage collector).
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
# Load the duplicate-annotation export, order it chronologically and index it
# by capture timestamp so calendar fields can be derived from the index.
duplicates = (
    pd.read_csv('tittelsnes_p37_duplicate.csv')
    .sort_values('captured_at')
    .copy(deep=True)
)
duplicates.index = pd.to_datetime(duplicates.captured_at)

# Calendar date (as a string) and hour of day of each capture.
duplicates['date'] = duplicates.index.date.astype(str)
duplicates['hour'] = duplicates.index.hour

In [None]:
# ground_truth = pd.read_csv('blom_vikane_singleweights.csv')

In [None]:
# Preview the first rows of the loaded frame, including the derived date/hour columns.
duplicates.head()

In [None]:
# Compare the hour-of-day distribution of rows flagged as duplicates against
# the rest of the rows.
mask = (duplicates['is_duplicate'] == 1)

plt.figure(figsize=(20, 10))

# Explicit one-hour-wide edges 0..24 shared by both series. With `bins = 24`
# each call derived its own edges from that subset's min/max hour, so the two
# histograms were not drawn on the same bins and the bins did not line up with
# whole hours.
hour_bins = np.arange(25)

plt.hist(duplicates[mask]['hour'], alpha = 0.5, color = 'blue', density = True, bins = hour_bins, label = 'Duplicate')
plt.hist(duplicates[~mask]['hour'], alpha = 0.5, color = 'red', density = True, bins = hour_bins, label = 'Not duplicate')
plt.xlabel('Hour of day')
plt.ylabel('Density')
plt.legend()



In [None]:
import ast

# Orientation angles (in degrees) of the ANAL_FIN -> PECTORAL_FIN vector,
# one pair per row of `duplicates`.
thetas, phis = [], []

for _, record in duplicates.iterrows():
    # annotation / camera_metadata are stored as Python-literal strings.
    annotation = ast.literal_eval(record.annotation)
    camera_metadata = ast.literal_eval(record.camera_metadata)

    world_keypoints = pixel2world(annotation['leftCrop'], annotation['rightCrop'], camera_metadata)

    # Unit vector between the two fin keypoints.
    fin_axis = world_keypoints['PECTORAL_FIN'] - world_keypoints['ANAL_FIN']
    ux, uy, uz = fin_axis / np.linalg.norm(fin_axis)

    # theta: atan(y / x) multiplied by the sign of y; phi: 90 degrees minus acos(z).
    # NOTE(review): the axis convention of pixel2world is not visible here - confirm
    # which physical directions x, y and z correspond to.
    theta_deg = math.degrees(math.atan(uy / ux) * np.sign(uy))
    phi_deg = 90 - math.degrees(math.acos(uz))

    thetas.append(theta_deg)
    phis.append(phi_deg)

plt.figure(figsize=(20, 10))
plt.scatter(thetas, phis, color = 'orange', label = 'Normal')
plt.xlabel('Theta degree')
plt.ylabel('Phi degree')
plt.legend()

# Keep the angles next to the source rows for later filtering.
duplicates['theta'] = thetas
duplicates['phi'] = phis


In [None]:
# Fit normal distributions to the estimated weights of (a) all rows in a
# date/hour window and (b) the non-duplicate rows in that window, then plot
# both histograms, the fitted pdf and an extrapolated curve.
from scipy.stats import t, norm

# Rows whose captured_at lies strictly between the two date strings.
mask0 = (duplicates['captured_at'] > '2020-06-20') & (duplicates['captured_at'] < '2020-06-23')

#mask1 = mask0 & (np.abs(duplicates['theta']) < 10) & (np.abs(duplicates['phi']) < 10)
# Restrict further to hours 8..16 (both comparisons are strict).
mask1 = mask0 & (np.abs(duplicates['hour']) > 7) & (np.abs(duplicates['hour']) < 17)
# Same window, keeping only rows not flagged as duplicates.
mask2 = mask1 & (duplicates['is_duplicate'] == 0)

# Row counts surviving each successive filter.
print(sum(mask0), sum(mask1), sum(mask2))

# dist1: every row in the window; dist2: the non-duplicate subset.
dist1 = duplicates[mask1]
dist2 = duplicates[mask2]

# df, mean, std = t.fit(dist1['estimated_weight_g'])
# df2, mean2, std2 = t.fit(dist2['estimated_weight_g'])
# gt_df, gt_mean, gt_std = t.fit(gt_weights)
# Normal-distribution fits of the estimated weights of each subset.
mean, std = norm.fit(dist1['estimated_weight_g'])
mean2, std2 = norm.fit(dist2['estimated_weight_g'])
# gt_mean, gt_std = norm.fit(gt_weights)

print(len(dist1), len(dist2))

plt.figure(figsize=(20, 10))

plt.hist(dist1['estimated_weight_g'], color = 'blue', alpha = 0.5, density = True, bins = 30)
plt.hist(dist2['estimated_weight_g'], color = 'red', alpha = 0.5, density = True, bins = 30)
# plt.hist(gt_weights, color = 'red', alpha = 0.5, density = True, bins = 30)

# Evaluate the fitted pdfs on a grid spanning the current x-axis limits
# (so this must run after the histograms above have been drawn).
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 1000)

#y = t.pdf(x, df, mean, std)
y = norm.pdf(x, mean, std)
plt.plot(x, y)
y2 = norm.pdf(x, mean2, std2)
# plt.plot(x, y2)
#plt.plot(x, y + 10 * (y - y2), color = 'green', linestyle = '-')

# Extrapolated curve: the dist1 pdf pushed 10x further along (y - y2), drawn on
# a grid shifted by 5x the difference of the two sample means.
new_x = x - 5 * (np.mean(dist1['estimated_weight_g']) - np.mean(dist2['estimated_weight_g']))
plt.plot(new_x, y + 10 * (y - y2), color = 'red', linestyle = '-')

# gt_x = np.linspace(xmin, xmax, 1000)
# #gt_y = t.pdf(gt_x, gt_df, gt_mean, gt_std)
# gt_y = norm.pdf(gt_x, gt_mean, gt_std)
# plt.plot(gt_x, gt_y, color = 'black', linewidth = 4)

# Second figure: histogram of dist1 on its own.
plt.figure(figsize=(20, 10))
plt.hist(dist1['estimated_weight_g'], color = 'blue', alpha = 0.5, density = True, bins = 30)
# plt.hist(gt_weights, color = 'red', alpha = 0.5, density = True, bins = 30)

In [None]:
# Build an "extrapolated" weight histogram: take the density of all samples
# (dist1), push it further along the direction (dist1 - dist2), and shift the
# bin centres by a fraction of the difference between the two sample means.
plt.figure(figsize=(20, 10))

loss_factor = 0.16

# Weights scaled by (1 - loss_factor), computed once and reused below. NaNs are
# dropped so the histogram, the per-bin means and the overall means all see the
# same samples.
adj_weights1 = (dist1['estimated_weight_g'] * (1 - loss_factor)).dropna()
adj_weights2 = (dist2['estimated_weight_g'] * (1 - loss_factor)).dropna()

# alpha = 0 keeps these helper histograms invisible; only the returned
# densities and bin edges are used. The second call reuses the first call's edges.
density, bins, _ = plt.hist(adj_weights1, alpha = 0, density = True, bins = 30)
density2, bins, _ = plt.hist(adj_weights2, bins = bins, alpha = 0, density = True)

print(density)

bin_width = bins[1] - bins[0]

factor = 0.5

# Extrapolate the density and clamp negative values to zero.
new_density = density + factor * (density - density2)
new_density[new_density < 0] = 0

# Assign every dist1 sample to its histogram bin using the same convention as
# the histogram itself: [start, end) for all bins, with the last bin also
# including its right edge. The previous (start, end] test dropped the minimum
# sample from the first bin.
n_bins = len(bins) - 1
samples1 = adj_weights1.values
bin_index = np.clip(np.digitize(samples1, bins) - 1, 0, n_bins - 1)
bin_midpoints = (bins[:-1] + bins[1:]) / 2

# Centre of each bin = mean of the samples that fall in it. An empty bin falls
# back to its geometric midpoint; the mean of an empty selection is NaN, which
# previously made any later weighted sum over the bin centres NaN as well.
_bins_adj = []
for i in range(n_bins):
    members = samples1[bin_index == i]
    _bins_adj.append(members.mean() if members.size else bin_midpoints[i])

#bins_adj = bins[1:] - bin_width / 2
bins_adj = np.array(_bins_adj)

factor2 = factor * -0.5
#factor2 = 0

# Shift the bin centres by factor2 times the difference of the two sample means.
new_bins_adj = bins_adj - factor2 * (np.mean(samples1) - np.mean(adj_weights2.values))

plt.bar(new_bins_adj, new_density, color = 'blue', alpha = 0.5, width = bin_width)
# gt_density, gt_bins, _ = plt.hist(gt_weights, bins = bins, color = 'red', alpha = 0.5, density = True)

In [None]:
# Per-bin difference between the two densities, scaled by the total of the
# extrapolated density, drawn at the shifted bin centres.
scaled_density_diff = (density - density2) / np.sum(new_density)

plt.figure(figsize=(20, 10))
plt.bar(new_bins_adj, scaled_density_diff, width = bin_width, alpha = 0.5, color = 'blue')

In [None]:
# Compare, per 1000 g bucket, the share of samples in the original set, the
# deduplicated set and the extrapolated histogram against hard-coded
# ground-truth shares.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[1:])
gt_pcts = [0, 0.0174, .1711, .3285, .2777, .1459, .0477, .0104, .0013, .0001]

d1 = dist1['estimated_weight_g'] * (1 - loss_factor)
d2 = dist2['estimated_weight_g'] * (1 - loss_factor)
new_density_adj = new_density / np.sum(new_density)

new_pcts = []
pcts1 = []
pcts2 = []

# Buckets are (lower, upper]. Bucket-local mask names are used so the loop no
# longer overwrites the time-window masks `mask1` / `mask2` built earlier.
for lower, upper in zip(buckets[:-1], buckets[1:]):
    in_bucket1 = (d1 > lower) & (d1 <= upper)
    in_bucket2 = (d2 > lower) & (d2 <= upper)
    in_bucket_new = (new_bins_adj > lower) & (new_bins_adj <= upper)

    # Extrapolated histogram: total normalised density of the bins whose
    # centre falls inside the bucket.
    new_pcts.append(np.sum(new_density_adj[in_bucket_new]))
    # Sample sets: fraction of rows inside the bucket.
    pcts1.append(np.sum(in_bucket1) / len(in_bucket1))
    pcts2.append(np.sum(in_bucket2) / len(in_bucket2))

pcts1 = np.array(pcts1)
pcts2 = np.array(pcts2)

gt_avg = 4944.34

# Density-weighted mean of the shifted bin centres and its relative error
# against the ground-truth average.
result = np.sum(new_bins_adj * new_density_adj)
print(result, gt_avg)
print((result - gt_avg) / gt_avg)

plt.figure(figsize=(20, 10))
# Each series previously carried a label but no legend was drawn, and two
# different series shared the label 'Dedup diff'.
plt.bar(x_buckets - 300, new_pcts, color = 'orange', width = 150, label = 'Dedup adjusted')
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Dedup')
plt.bar(x_buckets + 300, 10 * (pcts1 - pcts2), color = 'purple', width = 150, label = 'Dedup diff (x10)')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'Ground truth')
plt.xlabel('Weight bucket upper edge (g)')
plt.ylabel('Share of distribution')
plt.legend()

In [None]:
# Per-bucket share of the extrapolated density versus the hard-coded
# reference shares, followed by the weighted mean and its relative error.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
pcts = [0, 0.0174, .1711, .3285, .2777, .1459, .0477, .0104, .0013, .0001]

new_density_adj = new_density / np.sum(new_density)

# Buckets are (lower, upper]; a bin counts towards the bucket its shifted
# centre falls in.
for lower, upper, pct2 in zip(buckets[:-1], buckets[1:], pcts):
    mask1 = (new_bins_adj > lower) & (new_bins_adj <= upper)
    bucket_share = np.sum(new_density_adj[mask1])

    print('%i: %0.3f, %0.3f vs %0.3f' % (lower, bucket_share - pct2, bucket_share, pct2))

gt_avg = 3892

result = np.sum(new_bins_adj * new_density_adj)
relative_error = (result - gt_avg) / gt_avg
print(result, gt_avg)
print(relative_error)

In [None]:
# Same per-bucket report as for the extrapolated density, but using the
# normalised density of the first (all-samples) histogram.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
pcts = [0, 0.0174, .1711, .3285, .2777, .1459, .0477, .0104, .0013, .0001]

density_adj = density / np.sum(density)

# Buckets are (lower, upper]; bins are placed by their shifted centres.
for lower, upper, pct2 in zip(buckets[:-1], buckets[1:], pcts):
    mask1 = (new_bins_adj > lower) & (new_bins_adj <= upper)
    bucket_share = np.sum(density_adj[mask1])

    print('%i: %0.3f, %0.3f vs %0.3f' % (lower, bucket_share - pct2, bucket_share, pct2))

gt_avg = 3892

result = np.sum(new_bins_adj * density_adj)
relative_error = (result - gt_avg) / gt_avg
print(result, gt_avg)
print(relative_error)

In [None]:
# Same per-bucket report, this time using the normalised density of the
# second (non-duplicate) histogram.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
pcts = [0, 0.0174, .1711, .3285, .2777, .1459, .0477, .0104, .0013, .0001]

density_adj = density2 / np.sum(density2)

# Buckets are (lower, upper]; bins are placed by their shifted centres.
for lower, upper, pct2 in zip(buckets[:-1], buckets[1:], pcts):
    mask1 = (new_bins_adj > lower) & (new_bins_adj <= upper)
    bucket_share = np.sum(density_adj[mask1])

    print('%i: %0.3f, %0.3f vs %0.3f' % (lower, bucket_share - pct2, bucket_share, pct2))

gt_avg = 3892

result = np.sum(new_bins_adj * density_adj)
relative_error = (result - gt_avg) / gt_avg
print(result, gt_avg)
print(relative_error)

In [None]:
# Extrapolate the fitted pdf 15x along (y - y2), zero out negative values and
# normalise it to sum to one over the grid.
_raw_pdf = y + 15 * (y - y2)
_new_pdf = np.where(_raw_pdf < 0, 0, _raw_pdf)
new_pdf = _new_pdf / np.sum(_new_pdf)

# Grid shifted by 7.5x the difference between the two sample means.
mean_gap = np.mean(dist1['estimated_weight_g']) - np.mean(dist2['estimated_weight_g'])
new_x = x - 7.5 * mean_gap

# Expected weight under the extrapolated pdf, compared with the ground truth.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook
# (the ground-truth load near the top is commented out) - confirm its source.
result = np.sum(new_x * new_pdf)
gt_mean = np.mean(gt_weights)
print(result, np.mean(dist1['estimated_weight_g']), gt_mean)
print((result - gt_mean) / gt_mean)

In [None]:
# Per-bucket probability mass of the extrapolated pdf versus the fraction of
# ground-truth weights in the same (lower, upper] bucket.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook - confirm its source.
buckets = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]

for lower, upper in zip(buckets[:-1], buckets[1:]):
    mask1 = (new_x > lower) & (new_x <= upper)
    mask2 = (gt_weights > lower) & (gt_weights <= upper)

    pdf_share = np.sum(new_pdf[mask1])
    gt_share = sum(mask2) / len(mask2)

    print('%i: %0.3f, %0.3f vs %0.3f' % (lower, pdf_share - gt_share, pdf_share, gt_share))
    

In [None]:
# Fraction of dist1 estimated weights per (lower, upper] bucket versus the
# fraction of ground-truth weights in the same bucket.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook - confirm its source.
buckets = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]

for lower, upper in zip(buckets[:-1], buckets[1:]):
    mask1 = (dist1['estimated_weight_g'] > lower) & (dist1['estimated_weight_g'] <= upper)
    mask2 = (gt_weights > lower) & (gt_weights <= upper)

    est_share = sum(mask1) / len(mask1)
    gt_share = sum(mask2) / len(mask2)

    print('%i: %0.2f, %0.2f vs %0.2f' % (lower, est_share - gt_share, est_share, gt_share))

In [None]:
# Fraction of dist2 (non-duplicate) estimated weights per (lower, upper]
# bucket versus the fraction of ground-truth weights in the same bucket.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook - confirm its source.
buckets = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]

for lower, upper in zip(buckets[:-1], buckets[1:]):
    mask1 = (dist2['estimated_weight_g'] > lower) & (dist2['estimated_weight_g'] <= upper)
    mask2 = (gt_weights > lower) & (gt_weights <= upper)

    est_share = sum(mask1) / len(mask1)
    gt_share = sum(mask2) / len(mask2)

    print('%i: %0.2f, %0.2f vs %0.2f' % (lower, est_share - gt_share, est_share, gt_share))

In [None]:
# Per (lower, upper] bucket: fraction of dist1 rows, fraction of dist2 rows,
# and the difference between the two.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

for lower, upper in zip(buckets[:-1], buckets[1:]):
    weights1 = dist1['estimated_weight_g']
    weights2 = dist2['estimated_weight_g']
    mask1 = (weights1 > lower) & (weights1 <= upper)
    mask2 = (weights2 > lower) & (weights2 <= upper)

    share1 = sum(mask1) / len(mask1)
    share2 = sum(mask2) / len(mask2)

    print('%i: %0.3f, %0.3f vs %0.3f' % (lower, (share1 - share2), share1, share2))
    
    

In [None]:
# Per (lower, upper] bucket: the dist1 share extrapolated by 10x the
# (dist1 - dist2) share difference, compared with the ground-truth share.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook - confirm its source.
buckets = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]

for lower, upper in zip(buckets[:-1], buckets[1:]):
    mask1 = (dist1['estimated_weight_g'] > lower) & (dist1['estimated_weight_g'] <= upper)
    mask2 = (dist2['estimated_weight_g'] > lower) & (dist2['estimated_weight_g'] <= upper)
    mask_gt = (gt_weights > lower) & (gt_weights <= upper)

    share1 = sum(mask1) / len(mask1)
    share2 = sum(mask2) / len(mask2)
    gt_share = sum(mask_gt) / len(mask_gt)
    extrapolation = 10 * (share1 - share2)

    print('%i: %0.2f, %0.2f vs %0.2f' % (lower, share1 - gt_share + extrapolation, share1 + extrapolation, gt_share))

In [None]:
# Print, in order: the sample mean of dist1, the value of `mean`, the sample
# mean of dist2, and the mean of the ground-truth weights.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook
# (the ground-truth load near the top is commented out) - confirm its source.
print(np.mean(dist1['estimated_weight_g']), mean, np.mean(dist2['estimated_weight_g']), np.mean(gt_weights))

In [None]:
# Relative error of the dist1 and dist2 sample means against the ground-truth mean.
# NOTE(review): gt_weights is not assigned in any cell shown in this notebook - confirm its source.
for subset in (dist1, dist2):
    print((np.mean(subset['estimated_weight_g']) - np.mean(gt_weights)) / np.mean(gt_weights))

In [None]:
import json
import os
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
from filter_optimization.filter_optimization_task import extract_biomass_data
from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point

from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils

from scipy import stats
import statsmodels.api as sm

# Set matplotlib's global default font size; applies to figures created after this point.
plt.rcParams['font.size'] = 18

In [None]:
# In-memory cache of loaded frames, nested as queryCache[pen_id][start_date][end_date].
# Re-running this cell rebinds the name to an empty dict and so discards every cached frame.
queryCache = {}

In [None]:
# Load biomass data for one pen and date range, reusing the cached frame from
# queryCache when the same (pen, start, end) has already been fetched.
pen_id = 95
df_start_date = '2020-07-21'
df_end_date = '2020-07-24'

cached_df = queryCache.get(pen_id, {}).get(df_start_date, {}).get(df_end_date)

if cached_df is not None:
    df2 = cached_df
else:
    df2 = extract_biomass_data(pen_id, df_start_date, df_end_date, 0.95)
    # df = extract_biomass_data(pen_id, '2020-08-24', '2020-09-03', 0.99)

    df2.date = pd.to_datetime(df2.date)
#     df['week'] = df.date.apply(lambda x: x.weekofyear)

    depths = []
    new_lengths = []
    for _, row in df2.iterrows():
        ann, cm = row.annotation, row.camera_metadata
        wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
        # Median of the second coordinate over all world keypoints.
        # NOTE(review): presumably the distance from the camera - confirm the
        # axis convention of pixel2world.
        depth = np.median([wkp[1] for wkp in wkps.values()])
        # Straight-line distance between the UPPER_LIP and TAIL_NOTCH keypoints.
        vector = wkps['UPPER_LIP'] - wkps['TAIL_NOTCH']
        depths.append(depth)
        new_lengths.append(np.linalg.norm(vector))
    df2['depth'] = depths
    df2['new_lengths'] = new_lengths

    # Insert into the nested cache without discarding other date ranges that
    # are already cached for this pen (the previous assignment replaced the
    # whole queryCache[pen_id] dict).
    queryCache.setdefault(pen_id, {}).setdefault(df_start_date, {})[df_end_date] = df2


In [None]:
# Show how the samples are spread over the hours of the day ...
plt.hist(df2.hour)

# ... then keep only rows captured between hour 5 and hour 15, both inclusive.
df2 = df2[df2.hour.between(5, 15)]

In [None]:
# Keep only rows with 5 <= hour <= 15. The preceding cell applies this same filter,
# so re-applying it here leaves df2 unchanged.
df2 = df2[((df2.hour >= 5) & (df2.hour <= 15))]

In [None]:
# e1: all estimated weights.
e1 = df2['estimated_weight_g']

# e2: estimated weights of the rows whose depth is above the 75th percentile.
depth_p75 = np.percentile(df2.depth, 75)
e2 = df2.estimated_weight_g[df2.depth > depth_p75]

# e4: the part of e1 below median(e2), together with its mirror image
# reflected about median(e2).
e2_median = np.median(e2)
below_median = e1[e1 < e2_median]
e4 = np.concatenate([below_median, e2_median + (e2_median - below_median)])


In [None]:
# Mean of e1 scaled down by the hard-coded factor 0.1753.
# NOTE(review): the origin of 0.1753 is not shown in this notebook - presumably a loss factor; confirm.
a1 = np.mean(e1) * (1 - 0.1753)

In [None]:
# Relative difference between a1 and the hard-coded reference value 4949.
(a1 - 4949) / 4949

In [None]:
# Load biomass data for one pen and date range, reusing the cached frame from
# queryCache when the same (pen, start, end) has already been fetched.
pen_id = 167
df_start_date = '2020-10-20'
df_end_date = '2020-10-23'

cached_df = queryCache.get(pen_id, {}).get(df_start_date, {}).get(df_end_date)

if cached_df is not None:
    df2 = cached_df
else:
    df2 = extract_biomass_data(pen_id, df_start_date, df_end_date, 0.95)
    # df = extract_biomass_data(pen_id, '2020-08-24', '2020-09-03', 0.99)

    df2.date = pd.to_datetime(df2.date)
#     df['week'] = df.date.apply(lambda x: x.weekofyear)

    depths = []
    new_lengths = []
    for _, row in df2.iterrows():
        ann, cm = row.annotation, row.camera_metadata
        wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
        # Median of the second coordinate over all world keypoints.
        # NOTE(review): presumably the distance from the camera - confirm the
        # axis convention of pixel2world.
        depth = np.median([wkp[1] for wkp in wkps.values()])
        # Straight-line distance between the UPPER_LIP and TAIL_NOTCH keypoints.
        vector = wkps['UPPER_LIP'] - wkps['TAIL_NOTCH']
        depths.append(depth)
        new_lengths.append(np.linalg.norm(vector))
    df2['depth'] = depths
    df2['new_lengths'] = new_lengths

    # Insert into the nested cache without discarding other date ranges that
    # are already cached for this pen (the previous assignment replaced the
    # whole queryCache[pen_id] dict).
    queryCache.setdefault(pen_id, {}).setdefault(df_start_date, {})[df_end_date] = df2


In [None]:
# Histogram of the hour column of df2, drawn on a new 20x10 figure.
plt.figure(figsize=(20, 10))
plt.hist(df2.hour)

In [None]:
# Keep only rows captured between hour 7 and hour 12, both inclusive.
df2 = df2[df2.hour.between(7, 12)]

In [None]:
# e1: all estimated weights.
e1 = df2['estimated_weight_g']

# e2: estimated weights of the rows whose depth is above the 75th percentile.
depth_p75 = np.percentile(df2.depth, 75)
e2 = df2.estimated_weight_g[df2.depth > depth_p75]

# e4: the part of e1 below median(e2), together with its mirror image
# reflected about median(e2).
e2_median = np.median(e2)
below_median = e1[e1 < e2_median]
e4 = np.concatenate([below_median, e2_median + (e2_median - below_median)])

# Raw mean of e1, then the means of e1 and e4 each scaled by (1 - 0.18).
e1_mean = np.mean(e1)
print(e1_mean, e1_mean * (1 - 0.18), np.mean(e4) * (1 - 0.18))

In [None]:
# Relative difference between two hard-coded values, 4192 and 4330.
# NOTE(review): neither number is derived in this notebook - presumably an estimate and a reference average; confirm.
(4192 - 4330) / 4330

In [None]:
# Compare, per 1000 g bucket, the share of samples in e1 (all weights), e2
# (weights of rows with depth above the 75th percentile) and e4 (the mirrored
# distribution) against hard-coded ground-truth shares.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0 ]


# new_pcts = []
pcts1 = []
pcts2 = []
pcts3 = []

# Buckets are (lower, upper]; each share is the fraction of that sample set
# falling inside the bucket.
for lower, upper in zip(buckets[:-1], buckets[1:]):
    mask1 = (e1 > lower) & (e1 <= upper)
    mask2 = (e2 > lower) & (e2 <= upper)
    mask3 = (e4 > lower) & (e4 <= upper)

    pcts1.append(np.sum(mask1) / len(mask1))
    pcts2.append(np.sum(mask2) / len(mask2))
    pcts3.append(np.sum(mask3) / len(mask3))


plt.figure(figsize=(20, 10))
# plt.bar(x_buckets - 300, new_pcts, color = 'orange', width = 150, label = 'Dedup diff')
# The labels 'Dedup' / 'Dedup diff' were copied from an earlier plot and did
# not describe e2 / e4; no legend was drawn either.
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets + 300, pcts2, color = 'blue', width = 150, label = 'Depth > 75th percentile')
plt.bar(x_buckets + 150, pcts3, color = 'purple', width = 150, label = 'Mirrored about median')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'Ground truth')
plt.xlabel('Weight bucket lower edge (g)')
plt.ylabel('Share of distribution')
plt.legend()