In [None]:
import json
import os
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
from filter_optimization.filter_optimization_task import extract_biomass_data, _add_date_hour_columns
from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point

from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils

from scipy import stats
import statsmodels.api as sm

# Larger default font so labels stay readable on the 20x10 figures drawn in this notebook.
plt.rcParams.update({'font.size': 18})

In [None]:
# Build the data-warehouse client from the JSON credentials file whose path is given by
# the DATA_WAREHOUSE_SQL_CREDENTIALS environment variable. The context manager closes
# the file handle as soon as the JSON has been parsed.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
# In-memory cache for get_df results, nested as
# pen_id -> start_date -> end_date -> akpd_cutoff. Re-running this cell empties it.
queryCache = dict()

In [None]:
def get_df(pen_id, start_date, end_date, akpd_cutoff):
    """Return biomass data for a pen and date range, with depth and length columns added.

    Results are memoised in the module-level ``queryCache`` dict, nested as
    ``pen_id -> start_date -> end_date -> akpd_cutoff``, so a repeated call with the
    same arguments skips ``extract_biomass_data`` and the per-row keypoint work.

    Args:
        pen_id: Pen identifier, forwarded to ``extract_biomass_data``.
        start_date: Start of the date range, forwarded unchanged.
        end_date: End of the date range, forwarded unchanged.
        akpd_cutoff: AKPD cutoff, forwarded unchanged.

    Returns:
        The DataFrame produced by ``extract_biomass_data`` with ``date`` converted by
        ``pd.to_datetime`` and two extra columns: ``depth`` (median of element 1 of
        every world keypoint) and ``new_lengths`` (norm of UPPER_LIP - TAIL_NOTCH).
        The cached object itself is returned, so callers should not mutate it in place.
    """
    cached_by_cutoff = queryCache.get(pen_id, {}).get(start_date, {}).get(end_date, {})
    if akpd_cutoff in cached_by_cutoff:
        return cached_by_cutoff[akpd_cutoff]

    # Forward the end_date argument; the previous code referenced an undefined name
    # here, which made every cache miss fail with NameError.
    df = extract_biomass_data(pen_id, start_date, end_date, akpd_cutoff)
    df.date = pd.to_datetime(df.date)

    depths = []
    new_lengths = []
    for _, row in df.iterrows():
        ann, cm = row.annotation, row.camera_metadata
        wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
        # Element 1 of each world keypoint is used as the depth value.
        # NOTE(review): presumably the camera-axis coordinate - confirm against pixel2world.
        depths.append(np.median([wkp[1] for wkp in wkps.values()]))
        new_lengths.append(np.linalg.norm(wkps['UPPER_LIP'] - wkps['TAIL_NOTCH']))
    df['depth'] = depths
    df['new_lengths'] = new_lengths

    # Insert into the nested cache without discarding entries already stored for this
    # pen under other dates or cutoffs.
    by_end_date = queryCache.setdefault(pen_id, {}).setdefault(start_date, {})
    by_end_date.setdefault(end_date, {})[akpd_cutoff] = df

    return df

In [None]:
# In-memory cache for get_df2 results. Re-running this cell empties it.
queryCache2 = dict()

In [None]:
# Data-warehouse client used by get_df2, built from the JSON credentials file named by
# the DATA_WAREHOUSE_SQL_CREDENTIALS environment variable. The context manager closes
# the file handle once the JSON has been parsed.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    RDS = RDSAccessUtils(json.load(credentials_file))

def get_df2(pen_id, group_id, start_date, end_date, akpd_cutoff):
    """Query ``prod.biomass_computations`` for one pen/group and add depth and length columns.

    Rows are selected where ``pen_id`` and ``group_id`` match, ``akpd_score`` is at
    least ``akpd_cutoff``, ``captured_at`` lies between ``start_date`` and
    ``end_date``, and ``estimated_weight_g`` is positive. Results are memoised in the
    module-level ``queryCache2`` dict, nested as
    ``(pen_id, group_id) -> start_date -> end_date -> akpd_cutoff``.

    Args:
        pen_id: Pen identifier interpolated into the query.
        group_id: Group identifier interpolated into the query (quoted as a string).
        start_date: Lower bound for ``captured_at``.
        end_date: Upper bound for ``captured_at``.
        akpd_cutoff: Minimum ``akpd_score``.

    Returns:
        The query result with duplicated column names dropped, passed through
        ``_add_date_hour_columns``, ``date`` converted by ``pd.to_datetime``, and two
        extra columns: ``depth`` (median of element 1 of every world keypoint) and
        ``new_lengths`` (norm of UPPER_LIP - TAIL_NOTCH). The cached object itself is
        returned, so callers should not mutate it in place.
    """
    # group_id is part of the key: two groups on the same pen and dates must not
    # share a cache entry.
    cache_root = (pen_id, group_id)
    cached_by_cutoff = queryCache2.get(cache_root, {}).get(start_date, {}).get(end_date, {})
    if akpd_cutoff in cached_by_cutoff:
        return cached_by_cutoff[akpd_cutoff]

    # NOTE(review): values are interpolated straight into the SQL text. That is only
    # acceptable for trusted notebook constants; do not pass external input here.
    query = '''
        SELECT * FROM
            prod.biomass_computations bc
            WHERE bc.pen_id={}
            AND bc.group_id ='{}'
            AND bc.akpd_score >= {}
            AND bc.captured_at BETWEEN '{}' and '{}'
            AND bc.estimated_weight_g > 0.0
        '''.format(pen_id, group_id, akpd_cutoff, start_date, end_date)

    df = RDS.extract_from_database(query)
    # Keep only the first occurrence of any repeated column name.
    df = df.loc[:, ~df.columns.duplicated()]
    df = _add_date_hour_columns(df)

    df.date = pd.to_datetime(df.date)

    depths = []
    new_lengths = []
    for _, row in df.iterrows():
        ann, cm = row.annotation, row.camera_metadata
        wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
        # Element 1 of each world keypoint is used as the depth value.
        # NOTE(review): presumably the camera-axis coordinate - confirm against pixel2world.
        depths.append(np.median([wkp[1] for wkp in wkps.values()]))
        new_lengths.append(np.linalg.norm(wkps['UPPER_LIP'] - wkps['TAIL_NOTCH']))
    df['depth'] = depths
    df['new_lengths'] = new_lengths

    # Insert without discarding entries already cached for other dates or cutoffs.
    by_end_date = queryCache2.setdefault(cache_root, {}).setdefault(start_date, {})
    by_end_date.setdefault(end_date, {})[akpd_cutoff] = df

    return df

In [None]:
# Histogram of the `hour` column for pen 60, 2020-08-21 to 2020-08-25, fetched with
# an AKPD cutoff of 0.
pen_id, start_date, end_date = 60, '2020-08-21', '2020-08-25'
df = get_df(pen_id, start_date, end_date, 0)

plt.figure(figsize=(20, 10))
plt.hist(df['hour'], bins=24)

In [None]:
# Scatter AKPD score against estimated weight, then regress AKPD score on estimated
# weight and depth (with an intercept) for rows whose AKPD score exceeds 0.01.
plt.figure(figsize=(20, 10))

mask = df.akpd_score > 0.01
scored = df[mask]

plt.scatter(scored.estimated_weight_g, scored.akpd_score)

X = sm.add_constant(scored[['estimated_weight_g', 'depth']])
model = sm.OLS(scored.akpd_score, X)
results = model.fit()

print(results.summary())

In [None]:
# Ground-truth single-fish weights, read from a CSV in the working directory.
# NOTE(review): the `weight` column is later compared against bucket edges divided by
# 1000, so it is presumably in kilograms - confirm.
pen5 = pd.read_csv(filepath_or_buffer='blom_vikane_singleweights.csv')

In [None]:
# AKPD cutoff sweep for pen 60 (plot title: Vikane Pen 5).
# For each cutoff, compare the bucketed distribution of loss-adjusted predicted weights
# with the bucketed distribution of the pen5 ground-truth weights, and compare the
# predicted mean with gt_avg_weight.
pen_id = 60
start_date = '2020-08-21'
end_date = '2020-08-25'

df = get_df(pen_id, start_date, end_date, 0)

# Keep rows whose hour value lies in [7, 17].
mask = (df.hour >= 7) & (df.hour <= 17)
df = df[mask]

akpd_cutoffs = np.arange(0.01, 1, 0.01)

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])
# NOTE(review): gt_pcts is not read in this cell (the ground truth comes from pen5);
# it is kept only so the kernel state after this cell is unchanged.
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0 ]

gt_avg_weight = 3490

loss_factor = 0.17

# Ground-truth share per bucket. It does not depend on the AKPD cutoff, so it is
# computed once here instead of once per cutoff. Bucket edges are divided by 1000
# before being compared with pen5.weight.
pen5_pcts = []
for i in range(len(buckets) - 1):
    gt_mask = (pen5.weight > buckets[i] / 1000) & (pen5.weight <= buckets[i + 1] / 1000)
    gt_pct = np.sum(gt_mask) / len(gt_mask)
    pen5_pcts.append(gt_pct)

percentages = []
errors = []
mean_errors = []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for i in range(len(buckets) - 1):
        mask1 = (d1 > buckets[i]) & (d1 <= buckets[i + 1])
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - pen5_pcts[i])
    # Fraction of the hour-filtered rows that survive this cutoff.
    percentages.append(len(d1) / len(df))
    # Distribution error: mean absolute difference in bucket share.
    errors.append(np.mean(np.abs(np.array(errors1))))
    # Signed relative error of the mean predicted weight.
    mean_errors.append((np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, errors, label = 'Distribution Error')
plt.plot(akpd_cutoffs, mean_errors, label = 'Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Vikane Pen 5 - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# AKPD cutoff sweep for pen 186 (plot title: Varholmen Pen 12).
# Only the signed relative error of the mean weight is plotted; the distribution
# error list `errors` is still filled in but not drawn.
pen_id, start_date, end_date = 186, '2020-10-26', '2020-10-29'
df = get_df(pen_id, start_date, end_date, 0)

# Keep rows whose hour value lies in [6, 14].
mask = (df.hour >= 6) & (df.hour <= 14)
df = df[mask]

akpd_cutoffs = np.arange(0.01, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0]
gt_avg_weight = 4550
loss_factor = 0.16

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append((np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Varholmen Pen 12 - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# AKPD cutoff sweep for pen 124 (plot title: Mowi Huenquillahue Pen 1).
# No hour filter and no loss adjustment (loss_factor = 0) are applied here. Only the
# signed relative error of the mean weight is plotted; `errors` is filled but not drawn.
pen_id, start_date, end_date = 124, '2020-09-23', '2020-09-27'
df = get_df(pen_id, start_date, end_date, 0)

akpd_cutoffs = np.arange(0.01, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0]
gt_avg_weight = 5326
loss_factor = 0

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append((np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Mowi Huenquillahue Pen 1 - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# AKPD cutoff sweep for pen 125 (plot title: Mowi Huenquillahue Pen 2).
# No hour filter and no loss adjustment (loss_factor = 0) are applied here. Only the
# signed relative error of the mean weight is plotted; `errors` is filled but not drawn.
pen_id, start_date, end_date = 125, '2020-09-25', '2020-09-29'
df = get_df(pen_id, start_date, end_date, 0)

akpd_cutoffs = np.arange(0.01, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0]
gt_avg_weight = 5237
loss_factor = 0

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append((np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Mowi Huenquillahue Pen 2 - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# AKPD cutoff sweep for pen 143 (plot title: Grieg Dale).
# No hour filter is applied here. Only the signed relative error of the mean weight
# is plotted; `errors` is filled but not drawn.
pen_id, start_date, end_date = 143, '2020-10-13', '2020-10-17'
df = get_df(pen_id, start_date, end_date, 0)

akpd_cutoffs = np.arange(0.01, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0]
gt_avg_weight = 3440
loss_factor = 0.13

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append((np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Grieg Dale - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# AKPD cutoff sweep for pen 164 (plot title: Eldviktaren).
# Plots both the distribution error and the absolute relative error of the mean weight.
pen_id, start_date, end_date = 164, '2020-09-27', '2020-10-01'
df = get_df(pen_id, start_date, end_date, 0)

# Keep rows whose hour value lies in [6, 15].
mask = (df.hour >= 6) & (df.hour <= 15)
df = df[mask]

akpd_cutoffs = np.arange(0, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, .0062, .2281, .6490, .1143, .0023, .0001, 0, 0, 0]
gt_avg_weight = 3365.32
loss_factor = 0.16

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append(np.abs(np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, errors, label='Distribution Error')
plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Eldviktaren - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# AKPD cutoff sweep for pen 116 (plot title: Slapoya).
# Plots both the distribution error and the absolute relative error of the mean weight.
pen_id, start_date, end_date = 116, '2020-10-26', '2020-10-30'
df = get_df(pen_id, start_date, end_date, 0)

# Keep rows whose hour value lies in [7, 15].
mask = (df.hour >= 7) & (df.hour <= 15)
df = df[mask]

akpd_cutoffs = np.arange(0, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, 0, .0055, .0410, .1686, .3253, .2729, .1323, .0411, .0133]
gt_avg_weight = 5860
loss_factor = 0.16

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

for akpd_cutoff in akpd_cutoffs:
    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append(np.abs(np.mean(d1) - gt_avg_weight) / gt_avg_weight)

plt.plot(akpd_cutoffs, errors, label='Distribution Error')
plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Slapoya - AKPD Cutoff vs Error')
plt.legend()

In [None]:
# Largest estimated weight among rows of the current `df` with AKPD score above 0.01.
above_cutoff = df.akpd_score > 0.01
df1 = df[above_cutoff]
np.max(df1['estimated_weight_g'])

In [None]:
# AKPD cutoff sweep for pen 151, group '151-ENGALL-1455' (plot title: Varholmen P5).
# First figure: distribution error and absolute relative mean-weight error per cutoff.
# Second figure: bucket shares at cutoffs ~0.01 and ~0.95 next to the ground truth.
pen_id, start_date, end_date = 151, '2020-10-08', '2020-10-12'
df = get_df2(pen_id, '151-ENGALL-1455', start_date, end_date, 0)

# Keep rows whose hour value lies in [7, 17].
mask = (df.hour >= 7) & (df.hour <= 17)
df = df[mask]

akpd_cutoffs = np.arange(0, 1, 0.01)
buckets = list(range(0, 10001, 1000))
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, 0.0039, .0216, .1490, .3591, .3284, .1163, .0200, .0016, .0001]
gt_avg_weight = 4932.62
loss_factor = 0.16

percentages, errors, mean_errors = [], [], []

plt.figure(figsize=(20, 10))

# Despite their names, these hold the predicted bucket shares (not errors) at the
# two highlighted cutoffs.
errors01 = []
errors95 = []

for akpd_cutoff in akpd_cutoffs:
    # Tolerance comparison because the cutoffs come from a floating-point arange.
    is_cutoff_01 = np.abs(akpd_cutoff - 0.01) < .001
    is_cutoff_95 = np.abs(akpd_cutoff - 0.95) < .001

    # Predicted weights at or above the cutoff, scaled by (1 - loss_factor).
    d1 = df.estimated_weight_g[df.akpd_score >= akpd_cutoff] * (1 - loss_factor)
    errors1 = []
    for low, high, gt_pct in zip(buckets[:-1], buckets[1:], gt_pcts):
        mask1 = (d1 > low) & (d1 <= high)
        pct1 = np.sum(mask1) / len(mask1)
        errors1.append(pct1 - gt_pct)

        if is_cutoff_01:
            errors01.append(pct1)
        elif is_cutoff_95:
            errors95.append(pct1)
    percentages.append(len(d1) / len(df))
    errors.append(np.mean(np.abs(np.array(errors1))))
    mean_errors.append(np.abs(np.mean(d1) - gt_avg_weight) / gt_avg_weight)
    # Report the sample size at the two highlighted cutoffs.
    if is_cutoff_01 or is_cutoff_95:
        print(akpd_cutoff, len(d1))

plt.plot(akpd_cutoffs, errors, label='Distribution Error')
plt.plot(akpd_cutoffs, mean_errors, label='Avg Weight Error')
plt.xlabel('AKPD Cutoff')
plt.ylabel('Error')
plt.title('Varholmen P5 - AKPD Cutoff vs Error')
plt.legend()

plt.figure(figsize=(20, 10))
bar_width = 150
plt.bar(x_buckets - 300, errors01, color='orange', width=bar_width, label='AKPD = 0.01')
plt.bar(x_buckets - 150, errors95, color='red', width=bar_width, label='AKPD = 0.95')
plt.bar(x_buckets, gt_pcts, color='green', width=bar_width, label='Ground truth')
plt.legend()

# for index, akpd_cutoff in enumerate(akpd_cutoffs):
#     print(akpd_cutoff, mean_errors[index])

In [None]:
# Compare the bucketed weight distributions of several series against the ground-truth
# shares in gt_pcts and draw them side by side as a grouped bar chart.
#
# NOTE(review): this cell depends on hidden kernel state. `df2`, `df3` and `d4` are not
# defined anywhere in the visible notebook (`d4` appears only in a commented-out line
# below), so on a fresh kernel this cell raises NameError unless they are defined
# elsewhere.
# NOTE(review): `df` is whatever the last executed cell left behind. In source order that
# is the pen 151 frame, while gt_pcts below equals the list used in the pen 116 cell -
# confirm which pen this comparison is meant for.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])
gt_pcts = [0, 0, .0055, .0410, .1686, .3253, .2729, .1323, .0411, .0133]

loss_factor = 0.16

# Estimated weights scaled by (1 - loss_factor).
d1 = df['estimated_weight_g'] * (1 - loss_factor)
d2 = df2['estimated_weight_g'] * (1 - loss_factor)
d3 = df3['estimated_weight_g'] * (1 - loss_factor)
# d2 = df.estimated_weight_g[df.depth > np.percentile(df.depth, 75)] * (1 - loss_factor)
# d3 = np.concatenate([d1[d1 < np.mean(d2)], np.mean(d2) + (np.mean(d2) - d1[d1 < np.mean(d2)])])
# d4 = np.concatenate([d1[d1 < np.median(d2)], np.median(d2) + (np.median(d2) - d1[d1 < np.median(d2)])])
# d2 = dist2['estimated_weight_g'] * (1 - loss_factor)
# new_density_adj = new_density / np.sum(new_density)

# NOTE(review): d4 is referenced here but never assigned in live code.
print(np.mean(d1), np.mean(d2), np.mean(d3), np.mean(d4))

# new_pcts = []
# Share of each series falling in each (low, high] bucket.
pcts1 = []
pcts2 = []
pcts3 = []

for i in range(len(buckets) - 1):
    # NOTE(review): mask1 is built from d4, yet pcts1 is plotted as 'Original' below;
    # d1 is only used in the print above. Confirm whether d1 was intended here.
    mask1 = (d4 > buckets[i]) & (d4 <= buckets[i + 1])
    mask2 = (d2 > buckets[i]) & (d2 <= buckets[i + 1])
    mask3 = (d3 > buckets[i]) & (d3 <= buckets[i + 1])
#     mask_new = (new_bins_adj > buckets[i]) & (new_bins_adj <= buckets[i + 1])
    # gt_pct is assigned but not read by any live code in this cell.
    gt_pct = gt_pcts[i]
#     dist = dist1['estimated_weight_g'][mask1]
#     gt = gt_weights[mask2]

#     new_pcts.append(np.sum(new_density_adj[mask_new]))
    pct1 = np.sum(mask1) / len(mask1)
    pcts1.append(pct1)
    pct2 = np.sum(mask2) / len(mask2)
    pcts2.append(pct2)
    pct3 = np.sum(mask3) / len(mask3)
    pcts3.append(pct3)
#     print('%i: %0.3f, %0.3f vs %0.3f' % (buckets[i], np.sum(new_density_adj[mask1]) - gt_pct, np.sum(new_density_adj[mask1]), gt_pct))

# pcts1 = np.array(pcts1)
# pcts2 = np.array(pcts2)

# gt_avg = 4944.34

# result = np.sum(new_bins_adj * new_density_adj) 
# (result - gt_avg) / gt_avg
# print(result, gt_avg)
# print((result - gt_avg) / gt_avg)

# Grouped bars: each series is offset horizontally from the bucket's lower edge.
plt.figure(figsize=(20, 10))
# plt.bar(x_buckets - 300, new_pcts, color = 'orange', width = 150, label = 'Dedup diff')
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets + 300, pcts2, color = 'blue', width = 150, label = 'Dedup')
plt.bar(x_buckets + 150, pcts3, color = 'purple', width = 150, label = 'Dedup diff')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'Ground truth')