In [None]:
import json
import os
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils
from report_generation.report_generator import generate_ts_data, SamplingFilter
from research.utils.datetime_utils import add_days
from report_generation.report_generator import gen_pm_base
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, ValidationError
from filter_optimization.filter_optimization_task import _add_date_hour_columns
from research.weight_estimation.keypoint_utils.optics import pixel2world
import numpy as np

pd.set_option('display.max_rows', 500)

In [None]:
# Read the AWS credentials from the JSON file named by $AWS_CREDENTIALS.
# The context manager closes the file handle instead of leaking it for the
# lifetime of the kernel.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)

s3 = S3AccessUtils('/root/data', aws_credentials)

In [None]:
# Cohorts analysed in this notebook. Each entry is also used as the S3
# sub-directory name when the per-cohort files are downloaded, and as the key
# of the `dfs` / `gt_metadatas` dictionaries.
# NOTE(review): names appear to follow <site>_pen_id_<pen id>_<start>_<end> - confirm.
cohort_names = [
    'seglberget_pen_id_66_2020-05-13_2020-06-13',
    'bolaks_pen_id_88_2020-02-10_2020-03-10',
    'langoy_pen_id_108_2020-05-07_2020-05-17',
    'tittelsnes_pen_id_37_2020-05-23_2020-06-24',
    'aplavika_pen_id_95_2020-06-26_2020-07-26',
    'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02',
    'silda_pen_id_86_2020-06-19_2020-07-19',
    'vikane_pen_id_60_2020-08-05_2020-08-30',
    'eldviktaren_pen_id_164_2020-09-06_2020-10-06',
    'habranden_pen_id_100_2020-08-10_2020-08-31'
]

In [None]:
# Download, for every cohort, the ground-truth metadata JSON and the annotation
# CSV, and keep them in memory keyed by cohort name.
ROOT_DIR = '/root/data/alok/biomass_estimation/playground'
batch_name = 'test'
dfs, gt_metadatas = {}, {}
for cohort_name in cohort_names:
    s3_dir = os.path.join(
        'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets',
        cohort_name
    )

    # Ground truth is stored locally under ROOT_DIR/<batch>/<cohort>/.
    ground_truth_metadata_url = os.path.join(s3_dir, 'ground_truth_metadata.json')
    ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata.json')
    ground_truth_f = os.path.join(ROOT_DIR, ground_truth_key_base)
    s3.download_from_url(ground_truth_metadata_url, custom_location=ground_truth_f)
    # Close the file once parsed; the bare open() used previously leaked one
    # handle per cohort.
    with open(ground_truth_f) as ground_truth_file:
        gt_metadata = json.load(ground_truth_file)
    gt_metadatas[cohort_name] = gt_metadata

    # The annotation dataset goes to the downloader's default location.
    data_url = os.path.join(s3_dir, 'annotation_dataset.csv')
    data_f, _, _ = s3.download_from_url(data_url)
    df = pd.read_csv(data_f)
    df = _add_date_hour_columns(df)
    dfs[cohort_name] = df
    
    

In [None]:
gt_metadatas

<h1> Generate average weight accuracy </h1>

In [None]:
def generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding):
    """Return the smart individual weight values for a date relative to the last feeding day.

    The target date is ``gt_metadata['last_feeding_date']`` shifted forward by
    ``days_post_feeding`` days. ``pm_base``, ``max_day_diff`` and
    ``apply_growth_rate`` are forwarded to ``generate_smart_individual_values``.

    ``start_hour``, ``end_hour`` and ``final_days_post_feeding`` are accepted
    but not used by this function.

    Only the first element of the pair returned by
    ``generate_smart_individual_values`` is returned; the second is discarded.
    """
    target_date = add_days(gt_metadata['last_feeding_date'], days_post_feeding)
    # NOTE(review): the positional `True` and `0.9` are passed through as-is;
    # their meaning is defined by generate_smart_individual_values - confirm there.
    individual_weights, _discarded = generate_smart_individual_values(
        pm_base,
        target_date,
        max_day_diff,
        True,
        apply_growth_rate,
        0.9,
    )
    return individual_weights


def generate_average_weight_accuracy(weights, gt_metadata, loss_factor):
    """Return the signed relative error of the predicted gutted average weight.

    The mean of ``weights`` is scaled by ``(1 - loss_factor)`` to obtain the
    predicted gutted weight, which is compared against
    ``gt_metadata['gutted_average_weight']``. A positive result means the
    prediction is above the ground truth.
    """
    ground_truth = gt_metadata['gutted_average_weight']
    predicted_gutted = np.mean(weights) * (1.0 - loss_factor)
    return (predicted_gutted - ground_truth) / ground_truth




In [None]:
# Grid of parameters to sweep for every cohort.
start_hours = [6, 7, 8]
end_hours = [13, 14, 15]
apply_growth_rate_list = [False, True]
max_day_diff_list = [1, 2, 3]
days_post_feeding_list = [0, 1, 2, 3]
max_final_days_post_feeding = 5  # not read by this cell
loss_factors = [0.16, 0.17]

# One list per output column; a row is appended to all of them together.
cohort_name_col = []
start_hour_col = []
end_hour_col = []
apply_growth_rate_col = []
max_day_diff_col = []
days_post_feeding_col = []
final_days_post_feeding_col = []
loss_factor_col = []
avg_weight_error_col = []

for cohort_name in sorted(list(dfs.keys())):
    print(cohort_name)
    gt_metadata = gt_metadatas[cohort_name]
    for start_hour in start_hours:
        for end_hour in end_hours:
            for final_days_post_feeding in days_post_feeding_list:
                sampling_filter = SamplingFilter(
                    start_hour=start_hour,
                    end_hour=end_hour,
                    kf_cutoff=0.0,
                    akpd_score_cutoff=0.95
                )
                df = dfs[cohort_name]
                # Only use data up to `final_days_post_feeding` days after the
                # last feeding date.
                final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
                tdf = df[df.date <= final_date_post_feeding]
                pm_base = gen_pm_base(tdf, sampling_filter)
                for apply_growth_rate in apply_growth_rate_list:
                    for max_day_diff in max_day_diff_list:
                        for days_post_feeding in range(0, final_days_post_feeding + 1):
                            # The individual weights do not depend on the loss
                            # factor, so compute them once per parameter
                            # combination instead of once per loss factor.
                            try:
                                weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
                            except ValidationError:
                                # No row is recorded for combinations that fail validation.
                                continue

                            for loss_factor in loss_factors:
                                avg_weight_err = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)

                                cohort_name_col.append(cohort_name)
                                start_hour_col.append(start_hour)
                                end_hour_col.append(end_hour)
                                apply_growth_rate_col.append(apply_growth_rate)
                                max_day_diff_col.append(max_day_diff)
                                days_post_feeding_col.append(days_post_feeding)
                                final_days_post_feeding_col.append(final_days_post_feeding)
                                loss_factor_col.append(loss_factor)
                                avg_weight_error_col.append(avg_weight_err)
                        
    

In [None]:
# Assemble the grid-search results: one row per (cohort, parameter combination).
# The 'start_hour_col' / 'end_hour_col' column names are kept as-is because
# other cells filter on them.
grid_search_columns = dict(
    cohort_name=cohort_name_col,
    start_hour_col=start_hour_col,
    end_hour_col=end_hour_col,
    apply_growth_rate=apply_growth_rate_col,
    max_day_diff=max_day_diff_col,
    days_post_feeding=days_post_feeding_col,
    final_days_post_feeding=final_days_post_feeding_col,
    loss_factor=loss_factor_col,
    avg_weight_error=avg_weight_error_col,
)
tdf = pd.DataFrame(grid_search_columns)

# Magnitude of the signed relative error, used for ranking.
tdf['avg_weight_error_abs'] = tdf['avg_weight_error'].abs()

In [None]:
# For every cohort, print the ten parameter combinations with the smallest
# absolute average-weight error.
for cohort_name in cohort_names:
    mask = tdf.cohort_name == cohort_name
    cohort_rows = tdf[mask]
    best_first = cohort_rows.sort_values('avg_weight_error_abs', ascending=True)
    print(best_first.head(10))

In [None]:
gt_metadatas['vikane_pen_id_60_2020-08-05_2020-08-30']

In [None]:
# Kjeppevikholmen rows for day 0 after feeding with data cut off one day after
# feeding, smallest absolute error first.
is_kjeppevikholmen = tdf.cohort_name == 'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02'
is_day_zero = tdf.days_post_feeding == 0
is_one_day_cutoff = tdf.final_days_post_feeding == 1
mask = is_kjeppevikholmen & is_day_zero & is_one_day_cutoff
tdf[mask].sort_values('avg_weight_error_abs')



In [None]:
# Aggregate the per-cohort errors in `tdf` across cohorts for every parameter
# combination. The column lists are reset and refilled here, so the cell that
# builds `tdf` must not be re-run after this one without re-running the grid
# search first.
cohort_name_col = []
start_hour_col = []
end_hour_col = []
apply_growth_rate_col = []
max_day_diff_col = []
days_post_feeding_col = []
final_days_post_feeding_col = []
loss_factor_col = []
std_avg_weight_error_col = []
abs_avg_weight_error_col = []
mean_avg_weight_error_col = []

for start_hour in start_hours:
    for end_hour in end_hours:
        for apply_growth_rate in apply_growth_rate_list:
            for max_day_diff in max_day_diff_list:
                for days_post_feeding in days_post_feeding_list:
                    for final_days_post_feeding in days_post_feeding_list:
                        for loss_factor in loss_factors:
                            mask = (tdf.start_hour_col == start_hour) & \
                            (tdf.end_hour_col == end_hour) & \
                            (tdf.apply_growth_rate == apply_growth_rate) & \
                            (tdf.max_day_diff == max_day_diff) & \
                            (tdf.days_post_feeding == days_post_feeding) & \
                            (tdf.final_days_post_feeding == final_days_post_feeding) & \
                            (tdf.loss_factor == loss_factor)

                            # Select the matching rows once rather than
                            # re-filtering the frame for each statistic.
                            # Combinations with no rows in `tdf` produce NaN
                            # statistics.
                            matching_rows = tdf[mask]

                            start_hour_col.append(start_hour)
                            end_hour_col.append(end_hour)
                            apply_growth_rate_col.append(apply_growth_rate)
                            max_day_diff_col.append(max_day_diff)
                            days_post_feeding_col.append(days_post_feeding)
                            final_days_post_feeding_col.append(final_days_post_feeding)
                            loss_factor_col.append(loss_factor)
                            std_avg_weight_error_col.append(matching_rows.avg_weight_error.std())
                            abs_avg_weight_error_col.append(matching_rows.avg_weight_error_abs.mean())
                            mean_avg_weight_error_col.append(matching_rows.avg_weight_error.mean())

In [None]:
# Cross-cohort summary: one row per parameter combination with the mean
# absolute error, the standard deviation and the mean of the signed error.
summary_columns = dict(
    start_hour_col=start_hour_col,
    end_hour_col=end_hour_col,
    apply_growth_rate=apply_growth_rate_col,
    max_day_diff=max_day_diff_col,
    days_post_feeding=days_post_feeding_col,
    final_days_post_feeding=final_days_post_feeding_col,
    loss_factor=loss_factor_col,
    abs_avg_weight_error=abs_avg_weight_error_col,
    std_avg_weight_error=std_avg_weight_error_col,
    mean_avg_weight_error=mean_avg_weight_error_col,
)
rdf = pd.DataFrame(summary_columns)



In [None]:
rdf

In [None]:
mask = (rdf.loss_factor == 0.16)
rdf[mask].sort_values('abs_avg_weight_error')

In [None]:
# NOTE(review): this re-assigns `cohort_names` to the same ten entries, in the
# same order, as the list defined near the top of the notebook. It looks
# redundant - confirm before removing.
cohort_names = [
    'seglberget_pen_id_66_2020-05-13_2020-06-13',
    'bolaks_pen_id_88_2020-02-10_2020-03-10',
    'langoy_pen_id_108_2020-05-07_2020-05-17',
    'tittelsnes_pen_id_37_2020-05-23_2020-06-24',
    'aplavika_pen_id_95_2020-06-26_2020-07-26',
    'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02',
    'silda_pen_id_86_2020-06-19_2020-07-19',
    'vikane_pen_id_60_2020-08-05_2020-08-30',
    'eldviktaren_pen_id_164_2020-09-06_2020-10-06',
    'habranden_pen_id_100_2020-08-10_2020-08-31'
]

In [None]:
# Persist the per-cohort grid-search results next to the other playground
# artefacts. ROOT_DIR is the same directory the hard-coded path pointed to.
tdf.to_csv(os.path.join(ROOT_DIR, 'smart_average_param_grid_search_bryton.csv'))

In [None]:
tdf[(tdf.cohort_name == 'seglberget_pen_id_66_2020-05-13_2020-06-13')].sort_values('avg_weight_error_abs')



In [None]:
start_hours = [6, 7, 8]
end_hours = [13, 14, 15]
apply_growth_rate_list = [False, True]
max_day_diff_list = [1, 2, 3]
days_post_feeding_list = [0, 1, 2, 3]
max_final_days_post_feeding = 5
loss_factors = [0.16, 0.17]

# Fixed slice of the grid that is ranked below. Naming the values means the
# printed `max_day_diff` is the one actually used in the filter, not whatever
# an earlier loop happened to leave in a variable of that name.
RANKED_DAYS_POST_FEEDING = 1
RANKED_MAX_DAY_DIFF = 3
RANKED_APPLY_GROWTH_RATE = True

for start_hour in start_hours:
    avgs = []
    avg_accuracies = []

    for cohort_name in cohort_names:

        # Rows of this cohort for the fixed slice, best (smallest error) first.
        a = tdf[((tdf.cohort_name == cohort_name) & (tdf.days_post_feeding == RANKED_DAYS_POST_FEEDING) & (tdf.max_day_diff == RANKED_MAX_DAY_DIFF) & (tdf.apply_growth_rate == RANKED_APPLY_GROWTH_RATE))].sort_values('avg_weight_error_abs')

        try:
            vals = a.index[(a.start_hour_col == start_hour)]
            # Rank score: 1.0 for the best row of the slice, approaching 0 for the worst.
            ranks = [1 - a.index.get_loc(val) / len(a) for val in vals]
            accuracies = [a.avg_weight_error_abs[val] for val in vals]
            # NOTE(review): the slice starts at 1, so the best row for this
            # start hour is excluded from both averages - confirm that [:50]
            # was not intended.
            avg = np.mean(ranks[1:50])
            avg_accuracy = np.mean(accuracies[1:50])

            if np.isnan(avg):
                continue

            avgs.append(avg)
            avg_accuracies.append(avg_accuracy)

            print(RANKED_MAX_DAY_DIFF, avg, avg_accuracy)
        except Exception as e:
            # Best effort: report the problem and move on to the next cohort.
            print(e)

    print('total', start_hour, np.mean(avgs), np.mean(avg_accuracies))

In [None]:
# Show the grid-search rows of every cohort after the first one (the first
# cohort is shown in its own cell), smallest absolute error first. This
# replaces nine copy-pasted cells that differed only in the cohort name.
for displayed_cohort in cohort_names[1:]:
    display(tdf[(tdf.cohort_name == displayed_cohort)].sort_values('avg_weight_error_abs'))

In [None]:
# generate Vikane average weight and distribution error - explore basic parameters

# Look the cohort up explicitly. The previous version read `ground_truth_f` and
# `df`, which are whatever the last iterations of earlier loops left behind: in
# a top-to-bottom run that is the Habranden metadata file together with the
# Vikane dataframe.
VIKANE_COHORT = 'vikane_pen_id_60_2020-08-05_2020-08-30'
ground_truth_metadata = gt_metadatas[VIKANE_COHORT]
vikane_df = dfs[VIKANE_COHORT]

day_after_feeding_stop = add_days(ground_truth_metadata['last_feeding_date'], 1)
start_date, end_date = add_days(day_after_feeding_stop, -2), add_days(day_after_feeding_stop, -1)
# Kept under its own name so the grid-search results in `tdf` are not overwritten.
vikane_window_df = vikane_df[(vikane_df.date >= start_date) & (vikane_df.date <= end_date)].copy(deep=True)

sampling_filter = SamplingFilter(
    start_hour=7,
    end_hour=15,
    akpd_score_cutoff=0.95,
    kf_cutoff=0.0
)
pm_base = gen_pm_base(vikane_window_df, sampling_filter)
weights, _ = generate_smart_individual_values(pm_base, day_after_feeding_stop, 3, True, True, 0.9)


In [None]:
np.mean(weights)

In [None]:
from filter_optimization.filter_optimization_task import extract_biomass_data

# Ground truth for the Movikodden pen, entered by hand. The three distribution
# dictionaries are keyed by weight bucket labels such as '4.0-5.0'.
movikodden_gt = {'pen_id': 167,
  'gutted_weight_distribution': {'0.0-1.0': 0.0,
   '1.0-2.0': 1.53,
   '2.0-3.0': 10.67,
   '3.0-4.0': 19.87,
   '4.0-5.0': 42.36,
   '5.0-6.0': 22.28,
   '6.0-7.0': 3.15,
   '7.0-8.0': 0.13,
   '8.0-9.0': 0.00,
   '9.0-10.0': 0.00},
  'gutted_avg_weight_distribution': {'0.0-1.0': 0.0,
   '1.0-2.0': 1675.00,
   '2.0-3.0': 2522.77,
   '3.0-4.0': 3438.65,
   '4.0-5.0': 4560.83,
   '5.0-6.0': 5268.86,
   '6.0-7.0': 6206.66,
   '7.0-8.0': 7205.90,
   '8.0-9.0': 0.00,
   '9.0-10.0': 0.00},
  'gutted_unit_weight_distribution': {'0.0-1.0': 0.0,
   '1.0-2.0': 0.60,
   '2.0-3.0': 6.28,
   '3.0-4.0': 15.93,
   '4.0-5.0': 45.05,
   '5.0-6.0': 27.37,
   '6.0-7.0': 4.56,
   '7.0-8.0': 0.23,
   '8.0-9.0': 0.00,
   '9.0-10.0': 0.00},
  'gutted_average_weight': 4289.31,
  'expected_loss_factor': 18,
  'last_feeding_date': '2020-10-23',
  'undeployment_date': '2020-10-28',
  'harvest_date': '2020-10-29',
  'slaughter_date': '2020-10-30'}

pen_id = movikodden_gt['pen_id']
df_start_date = '2020-10-18'
df_end_date = '2020-10-26'

start_hour = 6
end_hour = 15

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.95
)

df = extract_biomass_data(pen_id, df_start_date, df_end_date, 0.95)

# Only use data up to three days after the last feeding date.
final_date_post_feeding = add_days(movikodden_gt['last_feeding_date'], 3)
tdf = df[df.date <= final_date_post_feeding]
pm_base = gen_pm_base(tdf, sampling_filter)

apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 3

loss_factor = 0.18

# A ValidationError is deliberately left to propagate. Swallowing it would let
# the next line silently reuse `weights` from a previously analysed cohort (or
# fail with a NameError on a fresh kernel).
weights = generate_raw_individual_values(pm_base, movikodden_gt, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
avg_weight_err = generate_average_weight_accuracy(weights, movikodden_gt, loss_factor)

print(avg_weight_err)



In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm  # previously only imported by a later cell
from scipy import stats  # `stats.percentileofscore` was used without any import
from scipy.special import logit

# Sweep the loss factor for Movikodden and, for each value, compare the
# predicted gutted-weight distribution with the ground truth.

# Bucket edges in grams; bucket i is [buckets[i], buckets[i + 1]).
buckets = [1000, 2000, 3000, 4000, 5000, 6000, 7000]

avg_weight = movikodden_gt['gutted_average_weight']

bucket_translation = {
    1000: '1.0-2.0',
    2000: '2.0-3.0',
    3000: '3.0-4.0',
    4000: '4.0-5.0',
    5000: '5.0-6.0',
    6000: '6.0-7.0',
    7000: '7.0-8.0',
    8000: '8.0-9.0',
    9000: '9.0-10.0',
}

loss_factors = np.arange(0.10, 0.2, 0.001)
avg_weight_accuracy = []
dist_accuracy = []
dist_accuracy_median = []
dist_accuracy_new = []
dist_accuracy_new_median = []

n_buckets = len(buckets) - 1
bucket_lower_bounds = np.array(buckets[:-1])
bucket_labels = [bucket_translation[lower] for lower in buckets[:-1]]

# Ground-truth share of fish (by count) and of biomass (by unit weight) per bucket.
gt_pcts = np.array([movikodden_gt['gutted_weight_distribution'][label] for label in bucket_labels])
gt_unit_pcts = np.array([movikodden_gt['gutted_unit_weight_distribution'][label] for label in bucket_labels])

# NOTE(review): this cell used to read `new_buckets`, which is only defined by
# a later cell and holds Tittelsnes percentages. That fails on a fresh kernel
# and mixes cohorts. No revised distribution is given for Movikodden here, so
# the reported distribution is used for the "new" series as well - confirm.
gt_pcts_new = gt_pcts.copy()

# One error history per bucket (previously initialised twice).
bucket_accuracies = [[] for _ in range(n_buckets)]
percentile_accuracies = {q: [] for q in np.arange(50, 90, 10)}

pct_diff = []
pct_diff2 = []
pct_unit_diff = []
pct_unit_diff2 = []
pct_plots = []

# Design matrix with an intercept; it does not depend on the loss factor.
X = sm.add_constant(bucket_lower_bounds)

for loss_factor in loss_factors:
    weights_adj = weights * (1.0 - loss_factor)
    weights_adj.sort()
    # Cumulative share of total biomass (in percent) along the sorted fish.
    weights_sum = np.cumsum(weights_adj) / np.sum(weights_adj) * 100

    dist_accuracies = []
    dist_accuracies_new = []

    pcts = []
    avg_weights = []
    percentiles = []
    percentiles2 = []

    for i in range(n_buckets):
        mask1 = (weights_adj >= buckets[i]) & (weights_adj < buckets[i + 1])

        gt_pct = gt_pcts[i]
        gt_pct_new = gt_pcts_new[i]

        pct = np.sum(mask1) * 100 / len(mask1)
        pcts.append(pct)

        # An empty bucket contributes a mean of 0 rather than NaN, so a single
        # empty bucket no longer turns every entry of `unit_pcts` into NaN.
        avg_weights.append(np.mean(weights_adj[mask1]) if mask1.any() else 0.0)

        dist_accuracies.append(np.abs(pct - gt_pct))
        dist_accuracies_new.append(np.abs(pct - gt_pct_new))

        bucket_accuracies[i].append(np.abs(pct - gt_pct))

        # Predicted weight at the percentile given by the ground-truth share
        # of fish in the buckets below bucket i.
        equivalent_gt_percentile = np.percentile(weights_adj, np.sum(gt_pcts[0:i]))
        percentiles.append(equivalent_gt_percentile)

        # Same idea on a biomass basis: find where the predicted cumulative
        # biomass share reaches the ground-truth share of the buckets below
        # bucket i, then read off the predicted weight at that percentile.
        equivalent_gt_unit_percentile = stats.percentileofscore(weights_sum, np.sum(gt_unit_pcts[0:i]))
        equivalent_gt_unit_percentile_weight = np.percentile(weights_adj, equivalent_gt_unit_percentile)
        percentiles2.append(equivalent_gt_unit_percentile_weight)

    pcts = np.array(pcts)
    avg_weights = np.array(avg_weights)
    percentiles = np.array(percentiles)
    percentiles2 = np.array(percentiles2)
    unit_pcts = pcts * avg_weights / np.sum(pcts * avg_weights) * 100

    pct_plots.append(percentiles2)

    # R-squared of the percentile offsets regressed on the bucket lower
    # bounds, without an intercept ...
    pct_diff.append(sm.OLS(percentiles - bucket_lower_bounds, bucket_lower_bounds).fit().rsquared)
    pct_unit_diff.append(sm.OLS(percentiles2 - bucket_lower_bounds, bucket_lower_bounds).fit().rsquared)

    # ... and with an intercept.
    pct_unit_diff2.append(sm.OLS(percentiles2 - bucket_lower_bounds, X).fit().rsquared)
    pct_diff2.append(sm.OLS(percentiles - bucket_lower_bounds, X).fit().rsquared)

    avg_weight_accuracy.append(np.abs(np.mean(weights_adj) - avg_weight) / avg_weight * 100)
    dist_accuracy.append(np.mean(dist_accuracies))
    dist_accuracy_median.append(np.percentile(dist_accuracies, 50))
    dist_accuracy_new.append(np.mean(dist_accuracies_new))
    dist_accuracy_new_median.append(np.percentile(dist_accuracies_new, 50))

    for q in np.arange(50, 90, 10):
        percentile_accuracies[q].append(np.percentile(dist_accuracies, q))

In [None]:
plt.scatter(buckets[:-1], percentiles2)

In [None]:
plt.scatter(np.cumsum(unit_pcts), np.cumsum(gt_unit_pcts))

In [None]:
# Compare the predicted gutted-weight distribution under two loss factors with
# the Movikodden ground truth.
loss_factor1 = 0.18
loss_factor2 = 0.139

weights_adj1 = weights * (1.0 - loss_factor1)
weights_adj2 = weights * (1.0 - loss_factor2)

# Bars are positioned at the upper edge of each bucket.
x_buckets = np.array(buckets[1:])


def _bucket_shares_and_means(adjusted_weights, bucket_edges):
    """Return (share in percent, mean weight) arrays, one entry per bucket.

    Bucket i is [bucket_edges[i], bucket_edges[i + 1]). A bucket without any
    weights gets a share and a mean of 0.
    """
    shares, means = [], []
    for lower, upper in zip(bucket_edges[:-1], bucket_edges[1:]):
        in_bucket = (adjusted_weights >= lower) & (adjusted_weights < upper)
        if np.sum(in_bucket) > 0:
            shares.append(np.sum(in_bucket) * 100 / len(in_bucket))
            means.append(np.mean(adjusted_weights[in_bucket]))
        else:
            shares.append(0)
            means.append(0)
    return np.array(shares), np.array(means)


pcts1, mean_weights1 = _bucket_shares_and_means(weights_adj1, buckets)
pcts2, mean_weights2 = _bucket_shares_and_means(weights_adj2, buckets)

bucket_labels = [bucket_translation[lower] for lower in buckets[:-1]]
gt_pcts = np.array([movikodden_gt['gutted_weight_distribution'][label] for label in bucket_labels])
gt_weights = np.array([movikodden_gt['gutted_avg_weight_distribution'][label] for label in bucket_labels])

# Biomass share per bucket: count share weighted by the bucket's mean weight,
# renormalised to 100.
adj_pcts1 = pcts1 * mean_weights1
adj_pcts2 = pcts2 * mean_weights2

adj_pcts1 = adj_pcts1 / np.sum(adj_pcts1) * 100
adj_pcts2 = adj_pcts2 / np.sum(adj_pcts2) * 100

errors1 = pcts1 - gt_pcts
errors2 = pcts2 - gt_pcts

label1 = '%0.2f%%' % (loss_factor1 * 100, )
label2 = '%0.2f%%' % (loss_factor2 * 100, )

# Both loss factors against the ground truth.
plt.figure(figsize=(10, 5))
plt.bar(x_buckets - 200, pcts1, color = 'red', width = 200, label = label1)
plt.bar(x_buckets + 200, pcts2, color = 'blue', width = 200, label = label2)
plt.bar(x_buckets, gt_pcts, color = 'green', width = 200, label = 'Ground truth')
plt.title('Share of fish per gutted-weight bucket')
plt.xlabel('Bucket upper edge (g)')
plt.ylabel('Share of fish (%)')
plt.legend()

# First loss factor against the ground truth.
plt.figure(figsize=(10, 5))
plt.bar(x_buckets - 200, pcts1, color = 'red', width = 200, label = label1)
plt.bar(x_buckets, gt_pcts, color = 'green', width = 200, label = 'Ground truth')
plt.title('Share of fish per gutted-weight bucket, loss factor ' + label1)
plt.xlabel('Bucket upper edge (g)')
plt.ylabel('Share of fish (%)')
plt.legend()

# Second loss factor against the ground truth.
plt.figure(figsize=(10, 5))
plt.bar(x_buckets - 200, pcts2, color = 'blue', width = 200, label = label2)
plt.bar(x_buckets, gt_pcts, color = 'green', width = 200, label = 'Ground truth')
plt.title('Share of fish per gutted-weight bucket, loss factor ' + label2)
plt.xlabel('Bucket upper edge (g)')
plt.ylabel('Share of fish (%)')
plt.legend()

# Signed error (prediction minus ground truth) for both loss factors.
plt.figure(figsize=(10, 5))
plt.bar(x_buckets + 200, errors1, color = 'red', width = 200, label = '%0.2f%% error' % (loss_factor1 * 100, ))
plt.bar(x_buckets, errors2, color = 'blue', width = 200, label = '%0.2f%% error' % (loss_factor2 * 100, ))
plt.title('Bucket share error vs ground truth')
plt.xlabel('Bucket upper edge (g)')
plt.ylabel('Predicted minus ground-truth share (percentage points)')
plt.legend()

In [None]:
# One curve per swept loss factor (the arrays stored in `pct_plots`) against
# the bucket lower bounds, plus the identity line in green as a reference.
fig = plt.figure(figsize=(10, 10))

lower_bounds = buckets[:-1]

for i, loss_factor in enumerate(loss_factors):
    curve = pct_plots[i]
    plt.plot(lower_bounds, curve, marker = 'o')
#     plt.plot(np.cumsum(gt_pcts), np.cumsum(pct_plots[i]), marker = 'o', label = loss_factor)

# plt.plot(np.cumsum(gt_pcts), np.cumsum(gt_pcts), color = 'green', linewidth = 4)
plt.plot(lower_bounds, lower_bounds, color = 'green', linewidth = 4)

# plt.legend()

In [None]:
from scipy.special import logit

plt.plot(logit(np.cumsum(gt_pcts / 100)), logit(np.cumsum(gt_pcts / 100)), 'o')

In [None]:
plt.plot(np.cumsum(gt_pcts / 100), np.cumsum(gt_pcts / 100), 'o')

In [None]:
# Plot the four per-loss-factor series on the left axis and the negated
# average-weight error on a twin right axis, then print the loss factor at
# which each series is smallest.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
ax2 = ax.twinx()

dotted_red = dict(color = 'red', linestyle = 'dotted')
thick_blue = dict(color = 'blue', linewidth = 4)
series_and_styles = [
    (pct_diff, dotted_red),
    (pct_diff2, dotted_red),
    (pct_unit_diff, thick_blue),
    (pct_unit_diff2, thick_blue),
]
for series, line_style in series_and_styles:
    ax.plot(loss_factors, series, **line_style)

# Negated so that a smaller error plots higher.
negated_avg_weight_error = -1 * np.array(avg_weight_accuracy)
ax2.plot(loss_factors, negated_avg_weight_error, color = 'green')
ax.set_xlabel('Loss factor')
ax.set_ylabel('Avg dist accuracy')
ax.set_title('Avg dist accuracy vs loss factor')

for series, _ in series_and_styles:
    print(loss_factors[np.argmin(series)])
# print(loss_factors[np.argmax(pct_diff)])
# print(loss_factors[np.argmax(pct_diff2)])
# print(loss_factors[np.argmax(pct_unit_diff)])

In [None]:
# Ground truth for the Tittelsnes pen, entered by hand. The commented-out
# block is an alternative set of 'gutted_weight_distribution' values.
tittelsnes_gt = {'pen_id': 37,
#   'gutted_weight_distribution': {'0.0-1.0': 0.0,
#    '1.0-2.0': 1.74,
#    '2.0-3.0': 17.11,
#    '3.0-4.0': 32.85,
#    '4.0-5.0': 27.77,
#    '5.0-6.0': 14.59,
#    '6.0-7.0': 4.77,
#    '7.0-8.0': 1.04,
#    '8.0-9.0': 0.13,
#    '9.0-10.0': 0.01},
'gutted_weight_distribution': {'0.0-1.0': 0.0,
   '1.0-2.0': 3.65,
   '2.0-3.0': 21.14,
   '3.0-4.0': 32.32,
   '4.0-5.0': 25.38,
   '5.0-6.0': 12.54,
   '6.0-7.0': 3.99,
   '7.0-8.0': 0.87,
   '8.0-9.0': 0.10,
   '9.0-10.0': 0.01},
  'gutted_unit_weight_distribution': {'0.0-1.0': 0.0,
   '1.0-2.0': 1.58,
   '2.0-3.0': 14.16,
   '3.0-4.0': 29.00,
   '4.0-5.0': 29.25,
   '5.0-6.0': 17.50,
   '6.0-7.0': 6.59,
   '7.0-8.0': 1.66,
   '8.0-9.0': 0.23,
   '9.0-10.0': 0.02},
  'gutted_average_weight': 3900,
  'expected_loss_factor': 16.5,
  'last_feeding_date': '2020-06-17',
  'undeployment_date': '2020-06-22',
  'harvest_date': '2020-06-23',
  'slaughter_date': '2020-06-24'}

start_hour = 7
end_hour = 17
cohort_name = 'tittelsnes_pen_id_37_2020-05-23_2020-06-24'

sampling_filter = SamplingFilter(
    start_hour=start_hour,
    end_hour=end_hour,
    kf_cutoff=0.0,
    akpd_score_cutoff=0.95
)

df = dfs[cohort_name]

# Only use data up to three days after the last feeding date.
final_date_post_feeding = add_days(tittelsnes_gt['last_feeding_date'], 3)
tdf = df[df.date <= final_date_post_feeding]
pm_base = gen_pm_base(tdf, sampling_filter)

apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 3

loss_factor = 0.16

# A ValidationError is deliberately left to propagate. Swallowing it would let
# the next line silently reuse `weights` from a previously analysed cohort.
weights = generate_raw_individual_values(pm_base, tittelsnes_gt, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
avg_weight_err = generate_average_weight_accuracy(weights, tittelsnes_gt, loss_factor)

print(avg_weight_err)



In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
# `stats.percentileofscore` is used by the loss-factor sweep below; import it here with the
# cell's other imports so the cell does not rely on a `stats` name left over in the kernel.
from scipy import stats
from scipy.special import logit

# Bucket boundaries; consecutive pairs form the half-open buckets [buckets[i], buckets[i + 1]).
buckets = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]

avg_weight = tittelsnes_gt['gutted_average_weight']

# Maps a bucket's lower boundary to the matching key of the ground-truth distributions,
# e.g. 1000 -> '1.0-2.0', ..., 9000 -> '9.0-10.0'.
bucket_translation = {
    lower: '%0.1f-%0.1f' % (lower / 1000, lower / 1000 + 1)
    for lower in range(1000, 10000, 1000)
}

# Second set of ground-truth percentages, keyed by lower bucket boundary.
new_buckets = {
    1000: 3.65,
    2000: 21.14,
    3000: 32.32,
    4000: 25.38,
    5000: 12.54,
    6000: 3.99,
    7000: 0.87,
    8000: 0.10,
    9000: 0.01
}

# Per-loss-factor accumulators; the sweep appends one entry per loss factor.
avg_weight_accuracy = []
dist_accuracy = []
dist_accuracy_median = []
dist_accuracy_new = []
dist_accuracy_new_median = []

# One list per bucket and one per tracked percentile (50, 60, 70, 80), filled by the sweep.
bucket_accuracies = [[] for _ in range(len(buckets) - 1)]
percentile_accuracies = [[] for _ in np.arange(50, 90, 10)]

# Ground-truth percentages for each bucket, in bucket order.
bucket_keys = [bucket_translation[lower] for lower in buckets[:-1]]
gt_pcts = np.array([tittelsnes_gt['gutted_weight_distribution'][key] for key in bucket_keys])
gt_pcts_new = [new_buckets[lower] for lower in buckets[:-1]]
gt_unit_pcts = np.array([tittelsnes_gt['gutted_unit_weight_distribution'][key] for key in bucket_keys])

pct_diff = []
pct_diff2 = []

loss_factors = np.arange(0.12, 0.18, 0.001)
# loss_factors = np.arange(0.14, 0.16, 0.001)
pct_plots = []
percentile_plots = []
    
# Sweep over candidate loss factors. For each one, scale the raw weights by (1 - loss_factor),
# bucket them, and record how closely the result matches the ground-truth distributions.

# Lower boundary of every bucket; used as regression input and as the reference for percentiles.
lower_edges = np.array(buckets[:-1])

for loss_factor in loss_factors:
    weights_adj = weights * (1.0 - loss_factor)
    weights_adj.sort()
    # Cumulative share (in percent) of the total adjusted weight, in ascending weight order.
    weights_sum = np.cumsum(weights_adj) / np.sum(weights_adj) * 100

    pcts = []
    avg_weights = []

    dist_accuracies = []
    dist_accuracies_new = []

    percentiles = []
    percentiles2 = []

    for i in range(len(buckets) - 1):
        mask1 = (weights_adj >= buckets[i]) & (weights_adj < buckets[i + 1])

        gt_pct = gt_pcts[i]
        gt_pct_new = gt_pcts_new[i]

        n_in_bucket = np.count_nonzero(mask1)
        pct = n_in_bucket * 100 / len(mask1)
        pcts.append(pct)

        # An empty bucket has no mean; record 0 (as the two-loss-factor comparison cell does)
        # instead of NaN, which would otherwise turn every entry of `unit_pcts` into NaN.
        avg_weights.append(np.mean(weights_adj[mask1]) if n_in_bucket > 0 else 0)

        dist_accuracies.append(np.abs(pct - gt_pct))
        dist_accuracies_new.append(np.abs(pct - gt_pct_new))

        bucket_accuracies[i].append(np.abs(pct - gt_pct))

        # Weight at the percentile equal to the ground-truth share lying below bucket i.
        # With a perfect match this equals the bucket's lower boundary, buckets[i].
        equivalent_gt_percentile = np.percentile(weights_adj, np.sum(gt_pcts[0:i]))
        percentiles.append(equivalent_gt_percentile)

        # Same idea using the unit-weight distribution: find the percentile at which the
        # cumulative weight share reaches the ground-truth cumulative unit share, then read
        # the weight at that percentile.
        equivalent_gt_unit_percentile = stats.percentileofscore(weights_sum, np.sum(gt_unit_pcts[0:i]))
        equivalent_gt_unit_percentile_weight = np.percentile(weights_adj, equivalent_gt_unit_percentile)
        percentiles2.append(equivalent_gt_unit_percentile_weight)

    percentiles = np.array(percentiles)
    percentiles2 = np.array(percentiles2)
    pcts = np.array(pcts)
    avg_weights = np.array(avg_weights)

    # Share of total weight per bucket, in percent.
    unit_pcts = pcts * avg_weights / np.sum(pcts * avg_weights) * 100

    # Regress the gap between matched percentile weights and the bucket boundaries on the
    # boundaries themselves; R-squared is stored without (pct_diff) and with (pct_diff2) an
    # intercept. Alternatives tried earlier: the relative gap as target, regressing cumulative
    # percentages, and regressing logit-transformed cumulative unit percentages.
    edge_residuals = percentiles2 - lower_edges

    model = sm.OLS(edge_residuals, lower_edges)
    results = model.fit()
    pct_diff.append(results.rsquared)

    X = sm.add_constant(lower_edges)
    model = sm.OLS(edge_residuals, X)
    results = model.fit()
    pct_diff2.append(results.rsquared)

    pct_plots.append(pcts)
    percentile_plots.append(percentiles)

    avg_weight_accuracy.append(np.abs(np.mean(weights_adj) - avg_weight) / avg_weight * 100)
    dist_accuracy.append(np.mean(dist_accuracies))
    # NOTE(review): despite the "_median" names these store the 90th percentile - confirm intent.
    dist_accuracy_median.append(np.percentile(dist_accuracies, 90))
    dist_accuracy_new.append(np.mean(dist_accuracies_new))
    dist_accuracy_new_median.append(np.percentile(dist_accuracies_new, 90))

    for index, i in enumerate(np.arange(50, 90, 10)):
        percentile_accuracies[index].append(np.percentile(dist_accuracies_new, i))

In [None]:
# Show both percentile-weight arrays as left by the most recent sweep iteration, one per line.
print(percentiles, percentiles2, sep='\n')

In [None]:
# Compare the bucketed distribution produced by two specific loss factors with the ground truth.

BAR_WIDTH = 200


def _bucket_percentages(values, edges):
    """Return the percentage of ``values`` in each half-open bucket ``[edges[i], edges[i + 1])``.

    Args:
        values: 1-D array-like of numbers.
        edges: ascending bucket boundaries; ``len(edges) - 1`` buckets are produced.

    Returns:
        ``np.ndarray`` of length ``len(edges) - 1``; all zeros when ``values`` is empty.
    """
    values = np.asarray(values)
    n_buckets = len(edges) - 1
    if len(values) == 0:
        return np.zeros(n_buckets)
    return np.array([
        np.count_nonzero((values >= edges[i]) & (values < edges[i + 1])) * 100 / len(values)
        for i in range(n_buckets)
    ])


def _grouped_bar_chart(x, series, title, ylabel):
    """Draw one bar chart holding several series.

    Args:
        x: array of bar positions shared by all series.
        series: iterable of ``(x_offset, heights, color, label)`` tuples.
        title: figure title.
        ylabel: y-axis label.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    for x_offset, heights, color, label in series:
        ax.bar(x + x_offset, heights, color=color, width=BAR_WIDTH, label=label)
    ax.set_xlabel('Weight bucket (upper boundary)')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()


loss_factor1 = 0.165
loss_factor2 = 0.13

weights_adj1 = weights * (1.0 - loss_factor1)
weights_adj2 = weights * (1.0 - loss_factor2)

# Bars are positioned at each bucket's upper boundary.
x_buckets = np.array(buckets[1:])

pcts1 = _bucket_percentages(weights_adj1, buckets)
pcts2 = _bucket_percentages(weights_adj2, buckets)
gt_pcts = np.array([
    tittelsnes_gt['gutted_weight_distribution'][bucket_translation[lower]]
    for lower in buckets[:-1]
])

errors1 = pcts1 - gt_pcts
errors2 = pcts2 - gt_pcts

label1 = '%0.2f%%' % (loss_factor1 * 100, )
label2 = '%0.2f%%' % (loss_factor2 * 100, )
ground_truth_series = (0, gt_pcts, 'green', 'Ground truth')

_grouped_bar_chart(
    x_buckets,
    [(-BAR_WIDTH, pcts1, 'red', label1), (BAR_WIDTH, pcts2, 'blue', label2), ground_truth_series],
    'Bucket distribution: both loss factors vs ground truth',
    'Percent in bucket',
)

_grouped_bar_chart(
    x_buckets,
    [(-BAR_WIDTH, pcts1, 'red', label1), ground_truth_series],
    'Bucket distribution: loss factor %s vs ground truth' % (label1, ),
    'Percent in bucket',
)

_grouped_bar_chart(
    x_buckets,
    [(-BAR_WIDTH, pcts2, 'blue', label2), ground_truth_series],
    'Bucket distribution: loss factor %s vs ground truth' % (label2, ),
    'Percent in bucket',
)

_grouped_bar_chart(
    x_buckets,
    [
        (BAR_WIDTH, errors1, 'red', '%0.2f%% error' % (loss_factor1 * 100, )),
        (0, errors2, 'blue', '%0.2f%% error' % (loss_factor2 * 100, )),
    ],
    'Bucket distribution error (estimate - ground truth)',
    'Percentage-point difference',
)

In [None]:
# Scatter the logit of the cumulative `pcts` fractions. Dividing by 101 rather than 100
# presumably keeps the cumulative fraction strictly below 1 so that logit stays finite.
# NOTE(review): both axes use the same series, so every point lies on y = x - confirm whether
# one axis was meant to be the ground truth.
# plt.plot(logit(np.cumsum(gt_pcts / 101)), logit(np.cumsum(gt_pcts / 101)), 'o')
pcts_cum_logit = logit(np.cumsum(pcts / 101))
plt.plot(pcts_cum_logit, pcts_cum_logit, 'o')

In [None]:
# Plot, for every loss factor in the sweep, the percentile-matched weights against the lower
# bucket boundaries. The thick green line is the identity (a perfect match).
fig = plt.figure(figsize=(10, 10))

bucket_edges = np.array(buckets[:-1])

for i, loss_factor in enumerate(loss_factors):
    plt.plot(bucket_edges, percentile_plots[i], marker = 'o', label = loss_factor)

plt.plot(bucket_edges, bucket_edges, color = 'green', linewidth = 4)

# Per loss factor: R-squared of a no-intercept OLS fit, median relative error and median
# absolute error of the matched weights versus the bucket boundaries.
for i, loss_factor in enumerate(loss_factors):
    model = sm.OLS(percentile_plots[i], bucket_edges)
    results = model.fit()
    abs_err = np.abs(percentile_plots[i] - bucket_edges)
    # '%0.3f' replaces the original '%0 3f', which printed six decimals with a leading space.
    print('%0.3f, %0.6f, %0.6f, %0.2f' % (
        loss_factor,
        results.rsquared,
        np.median(abs_err / bucket_edges),
        np.median(abs_err),
    ))


In [None]:
# Plot the logit of the cumulative bucket percentages for every swept loss factor against the
# same transform of the ground truth; the thick green line is the ground truth against itself.
fig = plt.figure(figsize=(10, 10))

# The ground-truth transforms do not depend on the loss factor, so compute them once.
gt_cum = np.cumsum(gt_pcts)
gt_cum_logit = logit(np.cumsum(gt_pcts / 101))

for i, loss_factor in enumerate(loss_factors):
    plt.plot(gt_cum_logit, logit(np.cumsum(pct_plots[i] / 101)), marker='o', label=loss_factor)

plt.plot(gt_cum_logit, gt_cum_logit, color='green', linewidth=4)

# Per loss factor: R-squared of a no-intercept OLS fit in logit space, mean absolute error in
# logit space, and mean absolute error of the raw cumulative percentages.
for i, loss_factor in enumerate(loss_factors):
    est_cum_logit = logit(np.cumsum(pct_plots[i] / 101))
    model = sm.OLS(est_cum_logit, gt_cum_logit)
    results = model.fit()
    logit_mae = np.mean(np.abs(est_cum_logit - gt_cum_logit))
    cum_mae = np.mean(np.abs(np.cumsum(pct_plots[i]) - gt_cum))
    print('%0.3f, %0.2f, %0.6f, %0.2f' % (loss_factor, results.rsquared, logit_mae, cum_mae))

In [None]:


# Bar chart of one sweep result against the ground truth.
# The legend label is derived from the loss factor actually plotted; the previous hard-coded
# '15.5%' did not correspond to loss_factors[4].
plot_index = 4

plt.figure(figsize=(10, 5))

plt.bar(
    np.asarray(x_buckets) + 200,
    pct_plots[plot_index],
    color = 'blue',
    width = 200,
    label = '%0.2f%%' % (loss_factors[plot_index] * 100, ),
)
plt.bar(x_buckets, gt_pcts, color = 'green', width = 200, label = 'Ground truth')

plt.legend()

In [None]:
# Sanity check: the plotted series and the bar positions should have the same length.
tuple(len(seq) for seq in (pct_plots[4], x_buckets))

In [None]:

# Plot the OLS R-squared recorded by the sweep against the loss factor.
# `pct_diff` comes from the fit without an intercept, `pct_diff2` from the fit with one.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
ax.plot(loss_factors, pct_diff, color = 'red', linestyle = 'dotted', label = 'R-squared (no intercept)')
ax.plot(loss_factors, pct_diff2, color = 'red', label = 'R-squared (with intercept)')

# The plotted quantity is R-squared, so label it as such rather than "Avg dist accuracy".
ax.set_xlabel('Loss factor')
ax.set_ylabel('OLS R-squared')
ax.set_title('OLS R-squared vs loss factor')
ax.legend()

# Loss factors at which each R-squared curve peaks, then at which each bottoms out.
print(loss_factors[np.argmax(pct_diff)])
print(loss_factors[np.argmax(pct_diff2)])
print(loss_factors[np.argmin(pct_diff)])
print(loss_factors[np.argmin(pct_diff2)])

In [None]:
# Display the R-squared values (one per swept loss factor) from the OLS fit with an intercept.
pct_diff2

In [None]:
# Mean absolute bucket error (thick red) versus loss factor, overlaid with one curve per
# tracked percentile of the per-bucket errors.
fig, ax = plt.subplots(figsize=(20, 10))
ax2 = ax.twinx()  # secondary axis; nothing is currently drawn on it

ax.plot(loss_factors, dist_accuracy, color='red', linewidth=4)

for bucket in percentile_accuracies:
    ax.plot(loss_factors, bucket)

ax.set(xlabel='Loss factor', ylabel='Avg dist accuracy')
ax.set_title('Avg dist accuracy vs loss factor')

In [None]:
# Mean absolute bucket error (thick red) versus loss factor, overlaid with one curve per
# weight bucket showing that bucket's absolute error.
fig, ax = plt.subplots(figsize=(20, 10))
ax2 = ax.twinx()  # secondary axis; nothing is currently drawn on it

ax.plot(loss_factors, dist_accuracy, color='red', linewidth=4)

for bucket in bucket_accuracies:
    ax.plot(loss_factors, bucket)

ax.set(xlabel='Loss factor', ylabel='Avg dist accuracy')
ax.set_title('Avg dist accuracy vs loss factor')