In [None]:
from collections import defaultdict
import json, os
import cv2
import torch
import pandas as pd
from multiprocessing import Pool, Manager
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from research.weight_estimation.akpd import AKPD
from research.weight_estimation.biomass_estimator import NormalizeCentered2D, NormalizedStabilityTransform, ToTensor, Network
from research.weight_estimation.akpd_scorer import generate_confidence_score
from keras.models import load_model
import boto3

import numpy as np
import plotly.express as px
import time
from matplotlib import pyplot as plt
from copy import copy


import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression
from collections import defaultdict



In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Estimate the daily growth rate by fitting an exponential curve to the daily
# mean weights, with every day weighted by its sample size.
def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Weighted linear fit of log(daily mean weight) against day offset.

    Args:
        tdf: Series of daily mean weights; its index must expose ``.date``
            (i.e. a DatetimeIndex).
        rdf: Series of daily sample sizes aligned with ``tdf``; its values are
            used as the regression sample weights.
        start_date: 'YYYY-MM-DD' string from which day offsets are measured.
        end_date: not used by this function.

    Returns:
        (growth_rate, trend_score): the fitted slope of log-weight per day and
        the score of the fit as reported by ``LinearRegression.score``.
    """
    origin = dt.datetime.strptime(start_date, '%Y-%m-%d')
    day_offsets = []
    for day_label in tdf.index.date.astype(str):
        day_offsets.append((dt.datetime.strptime(day_label, '%Y-%m-%d') - origin).days)

    features = np.array(day_offsets).reshape(-1, 1)
    # exponential growth is linear in log space
    log_weights = np.log(tdf.values)
    day_weights = rdf.values

    model = LinearRegression().fit(features, log_weights, sample_weight=day_weights)
    slope = model.coef_[0]
    fit_score = model.score(features, log_weights, sample_weight=day_weights)
    return slope, fit_score


# Distribution confidence: RMS of the percent deviations along a QQ plot of the
# given day's weights against the weights in the [start_date, end_date] window.
def compute_distribution_confidence(df, start_date, end_date, date):
    """Compare one day's weight distribution with the window's distribution.

    Args:
        df: frame with a DatetimeIndex and an ``estimated_weight_g`` column.
        start_date, end_date: window bounds used to slice ``df``.
        date: the day whose distribution is compared against the window.

    Returns:
        Root-mean-square of the relative differences between the window's and
        the day's percentiles (0 means the two QQ curves coincide).
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    day_weights = df[date:date].estimated_weight_g

    # Shift the window so both samples share the same mean; only the shapes
    # of the two distributions are then compared.
    mean_adjustment = day_weights.mean() - window_weights.mean()

    percentile_grid = list(range(100))
    window_quantiles = np.percentile(window_weights + mean_adjustment, percentile_grid)
    day_quantiles = np.percentile(day_weights, percentile_grid)

    # The 0th and 99th percentiles are excluded from the comparison.
    relative_deviation = (window_quantiles[1:99] - day_quantiles[1:99]) / day_quantiles[1:99]
    distribution_confidence = np.mean(np.square(relative_deviation)) ** 0.5
    return distribution_confidence


# NOTE: the minimum sample size should eventually depend on the distribution
# confidence and the trend score; until that relationship is understood the
# value is hardcoded.
def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum sample size wanted for a smart average.

    Both arguments are currently ignored; the result is always 5000.
    """
    minimum_sample_size = 5000
    return minimum_sample_size
    
# Smart average is defined as a lookback to a maximum of window_size_d days (currently set to 7),
# or until the minimum sample size is achieved
def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Growth-adjusted, sample-size-weighted average weight around ``date``.

    Days are included outward from ``date`` until the minimum sample size is
    reached (checked for lookbacks of 3 days and more) or the lookback reaches
    ``window_size_d - 1`` days. Each included day's weights are scaled by
    ``exp(day_offset * growth_rate)``, where ``day_offset`` is the number of
    days from that day to ``date``, and then averaged and histogrammed.

    Args:
        df: frame with a DatetimeIndex and an ``estimated_weight_g`` column
            holding the individual weights.
        tdf: Series of daily mean weights (DatetimeIndex, assumed ascending).
        rdf: Series of daily sample sizes aligned with ``tdf``. It is left
            unmodified.
        date: target day as a 'YYYY-MM-DD' string.
        distribution_confidence: forwarded to ``compute_minimum_sample_size``.
        growth_rate: daily exponential growth rate; replaced by 0.0 when
            ``tdf`` holds a single day.
        trend_score: forwarded to ``compute_minimum_sample_size``.
        window_size_d: maximum window length in days.
        bucket_size: histogram bucket width in units of 1000 weight units.
            Bucket keys are rounded to one decimal, so widths other than 0.1
            may produce colliding keys.

    Returns:
        dict with keys 'weightMovingAvg', 'weightMovingDist',
        'numMovingAvgBatiFish', 'numMovingAvgLookbackDays' and
        'dailyGrowthRate'.

    Raises:
        ValueError: if no samples fall inside the lookback window.
    """
    date_fmt = '%Y-%m-%d'
    dates = [str(d) for d in sorted(tdf.index.date.astype(str))]
    if len(dates) == 1:
        # a single day carries no trend information
        growth_rate = 0.0
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    target_day = dt.datetime.strptime(date, date_fmt)
    # 1-D integer offsets (positive = before `date`); keeping them scalar per
    # day avoids accumulating shape-(1,) arrays further down.
    day_offsets = np.array(
        [(target_day - dt.datetime.strptime(k, date_fmt)).days
         for k in tdf.index.date.astype(str)],
        dtype=int,
    )
    day_distance = np.abs(day_offsets)
    daily_means = tdf.values
    # Copy before zeroing: `rdf.values` can be a view of the caller's Series
    # (or a read-only array under pandas copy-on-write).
    daily_counts = rdf.values.copy()

    # Default to the widest lookback so the value is defined even when
    # window_size_d <= 3 and the search loop below never runs.
    lookback = window_size_d - 1
    for candidate in range(3, window_size_d):
        if daily_counts[day_distance <= candidate].sum() >= minimum_sample_size:
            lookback = candidate
            break
    daily_counts[day_distance > lookback] = 0

    weighted_sum = 0.0
    sample_size = 0.0
    adj_weights = []
    total_days = 0
    for offset, day_mean, n, this_date in zip(day_offsets, daily_means, daily_counts, dates):
        growth_factor = np.exp(offset * growth_rate)
        weighted_sum += growth_factor * day_mean * n
        sample_size += n
        if n > 0:
            adj_weights.extend(growth_factor * df[this_date:this_date].estimated_weight_g.values)
            total_days += 1

    if sample_size == 0:
        raise ValueError('no samples available within the lookback window for %s' % date)
    smart_average = weighted_sum / sample_size

    adj_weights = np.array(adj_weights)
    distribution = {}
    buckets = [round(x, 1) for x in np.arange(0.0, 1e-3 * adj_weights.max(), bucket_size)]
    for b in buckets:
        # bucket keys are in thousands; the adjusted weights are not
        low, high = 1e3 * b, 1e3 * (b + bucket_size)
        count = adj_weights[(adj_weights >= low) & (adj_weights < high)].shape[0]
        distribution[b] = count / sample_size

    output = {
        'weightMovingAvg': float(smart_average),
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }

    return output


# Pick the window of window_size_d days around `date` that covers the most
# historical dates. When data after `date` exists, ties go to the window in
# which `date` is more centred.
def compute_date_range(historical_dates, date, window_size_d):
    """Return ``(start_date, end_date)`` strings for the best window.

    Candidate windows all span ``window_size_d`` days; candidate ``k`` ends
    ``k`` days after ``date`` for ``k`` in ``0 .. window_size_d // 2``.

    Args:
        historical_dates: 'YYYY-MM-DD' strings for the days that have data.
        date: the current day as a 'YYYY-MM-DD' string.
        window_size_d: window length in days.

    Returns:
        Tuple of 'YYYY-MM-DD' strings, or ``(None, None)`` when no candidate
        window is evaluated.
    """
    FMT = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, FMT)
    known_days = np.array(historical_dates)

    best_window = (None, None)
    best_count = 0
    for shift in range(window_size_d // 2 + 1):
        lower = (anchor - dt.timedelta(days=window_size_d-1) + dt.timedelta(days=shift)).strftime(FMT)
        upper = (anchor + dt.timedelta(days=shift)).strftime(FMT)
        # ISO-formatted date strings order the same way as the dates themselves
        covered = ((known_days >= lower) & (known_days <= upper)).sum()
        # '>=' lets a later (more centred) window replace an equally full one
        if covered >= best_count:
            best_window = (lower, upper)
            best_count = covered

    return best_window


def compute_metrics(date, records_json, window_size_d=7):
    """Compute raw and smart-average weight metrics for one day.

    Args:
        date: target day as a 'YYYY-MM-DD' string.
        records_json: JSON object mapping date strings to lists of weights.
        window_size_d: window length in days passed to ``compute_date_range``
            and ``compute_smart_average``.

    Returns:
        (raw_avg_weight, raw_sample_size, smart_average, metadata) where
        ``smart_average`` is the dict returned by ``compute_smart_average`` and
        ``metadata`` holds 'trend_score' and 'distribution_confidence'.
    """
    records = json.loads(records_json)

    dts, vals = [], []
    for iter_date, day_vals in records.items():
        dts.extend([iter_date] * len(day_vals))
        vals.extend(day_vals)

    df = pd.DataFrame(vals, index=pd.to_datetime(dts), columns=['estimated_weight_g'])
    # The date-string slices below need a monotonic index, and JSON object key
    # order is not guaranteed to be chronological. A stable sort keeps the
    # within-day order of the weights.
    df = df.sort_index(kind='mergesort')

    # get raw statistics
    day_df = df[date:date]
    raw_avg_weight = day_df.estimated_weight_g.mean()
    raw_sample_size = day_df.shape[0]

    # compute relevant date range
    historical_dates = sorted(set(df.index.date.astype(str)))
    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)
    window_weights = df[start_date:end_date].estimated_weight_g
    rdf = window_weights.resample('D').agg(lambda x: x.shape[0])
    tdf = window_weights.resample('D').agg(lambda x: x.mean())
    # drop calendar days without samples (resample emits every day in range)
    tdf = tdf[rdf > 0].copy(deep=True)
    rdf = rdf[rdf > 0].copy(deep=True)

    growth_rate, trend_score, distribution_confidence = None, None, None
    # ISO date strings compare like dates, so this checks for a multi-day window
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)
    smart_average = compute_smart_average(df, tdf, rdf, date,
                                          distribution_confidence, growth_rate,
                                          trend_score, window_size_d)
    metadata = {
        'trend_score': trend_score,
        'distribution_confidence': distribution_confidence
    }

    return raw_avg_weight, raw_sample_size, smart_average, metadata

In [None]:
# Load the pen-5 biomass outputs; there is one CSV per capture-date range.
csv_template = '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=({},{}).csv'
date_ranges = [
    ('2019-06-05', '2019-06-12'),
    ('2019-06-12', '2019-06-19'),
    ('2019-06-19', '2019-06-26'),
    ('2019-06-26', '2019-07-03'),
    ('2019-07-03', '2019-07-04'),
]
df = pd.concat([pd.read_csv(csv_template.format(start, end)) for start, end in date_ranges])

df = df.sort_values('captured_at')
df['estimated_weight_g'] = df.weight
# keep only rows with an akpd_score above 0.9
df = df[df.akpd_score > 0.9].copy(deep=True)
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour

# collect the individual weights per calendar day

records = defaultdict(list)
for date in sorted(set(df.index.date.astype(str))):
    # `df[date]` (row selection by a date string) was removed in pandas 2.0;
    # a .loc slice selects the same day's rows and always yields a Series.
    records[date].extend(df.loc[date:date, 'weight'].values.tolist())

records_json = json.dumps(records)




In [None]:
# Write the filtered frame to a local CSV; `f` is reused by the upload cell that follows.
f = '/root/data/alok/biomass_estimation/playground/kjeppevikholmen_data.csv'
df.to_csv(f)

In [None]:
# Upload the CSV at `f` under the key 'alok/production_datasets/kjeppevikholmen_data.csv'
# in 'aquabyte-images-adhoc' (s3_client is presumably a boto3 S3 client -- confirm).
s3_access_utils = S3AccessUtils('/root/data')
s3_access_utils.s3_client.upload_file(f, 'aquabyte-images-adhoc', 'alok/production_datasets/kjeppevikholmen_data.csv')

In [None]:
# Hardcoded camera parameters (baseline, focal length, sensor and pixel dimensions).
# NOTE(review): units are not stated here -- confirm against the source of these values.
cm = {"baseline": 0.10079791852561114, "focalLength": 0.013842509663066934, "pixelCountWidth": 4096, "focalLengthPixel": 4012.3216414686767, "imageSensorWidth": 0.01412, "pixelCountHeight": 3000, "imageSensorHeight": 0.01035}

In [None]:
# One entry per row of df; every entry references the same `cm` dict object.
cms = [cm] * df.shape[0]

In [None]:
# Scratch check of str.endswith; its result is not assigned or used.
'hello'.endswith('o')

In [None]:
# Run the smart-average pipeline for every day present in the data.
dates = sorted(set(df.index.date.astype(str)))
raw_avg_weights, raw_sample_sizes, growth_rates = [], [], []
trend_scores, smart_averages, distribution_confidences = [], [], []
for date in dates:
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics(date, records_json)
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    distribution_confidences.append(metadata['distribution_confidence'])

# One panel per metric. Every line carries a label so that the legend() call
# on each axis has something to show (an unlabelled axis makes legend() warn).
fig, axes = plt.subplots(5, 1, figsize=(10, 20))
x_values = df.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights, label='Raw Avg.')
axes[0].plot(x_values, smart_averages, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
axes[2].plot(x_values, growth_rates, label='Daily Growth Rate')
axes[3].plot(x_values, trend_scores, label='Trend Score')
axes[4].plot(x_values, distribution_confidences, label='Distribution Instability')
panel_titles = ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']
for i, title in zip([0, 1, 2, 3, 4], panel_titles):
    axes[i].set_title(title)
    axes[i].grid()
    axes[i].legend()
plt.show()

In [None]:
# Reload the pen-5 biomass outputs (one CSV per capture-date range), this time
# keeping only captures taken between hour 8 and hour 15 inclusive.
csv_template = '/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=({},{}).csv'
date_ranges = [
    ('2019-06-05', '2019-06-12'),
    ('2019-06-12', '2019-06-19'),
    ('2019-06-19', '2019-06-26'),
    ('2019-06-26', '2019-07-03'),
    ('2019-07-03', '2019-07-04'),
]
df = pd.concat([pd.read_csv(csv_template.format(start, end)) for start, end in date_ranges])

df = df.sort_values('captured_at')
df['estimated_weight_g'] = df.weight
# keep only rows with an akpd_score above 0.9
df = df[df.akpd_score > 0.9].copy(deep=True)
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour
hour_mask = (df.hour > 7) & (df.hour < 16)
df = df[hour_mask].copy(deep=True)

# collect the individual weights per calendar day

records = defaultdict(list)
for date in sorted(set(df.index.date.astype(str))):
    # `df[date]` (row selection by a date string) was removed in pandas 2.0;
    # a .loc slice selects the same day's rows and always yields a Series.
    records[date].extend(df.loc[date:date, 'weight'].values.tolist())

records_json = json.dumps(records)




In [None]:
# Re-run the smart-average pipeline on the currently loaded frame.
dates = sorted(set(df.index.date.astype(str)))
raw_avg_weights, raw_sample_sizes, growth_rates = [], [], []
trend_scores, smart_averages, distribution_confidences = [], [], []
for date in dates:
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics(date, records_json)
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    distribution_confidences.append(metadata['distribution_confidence'])

# One panel per metric. Every line carries a label so that the legend() call
# on each axis has something to show (an unlabelled axis makes legend() warn).
fig, axes = plt.subplots(5, 1, figsize=(10, 20))
x_values = df.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights, label='Raw Avg.')
axes[0].plot(x_values, smart_averages, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
axes[2].plot(x_values, growth_rates, label='Daily Growth Rate')
axes[3].plot(x_values, trend_scores, label='Trend Score')
axes[4].plot(x_values, distribution_confidences, label='Distribution Instability')
panel_titles = ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']
for i, title in zip([0, 1, 2, 3, 4], panel_titles):
    axes[i].set_title(title)
    axes[i].grid()
    axes[i].legend()
plt.show()

In [None]:
# Raw vs. smart daily averages on a shared axis with a fixed 0-5000 y-range.
plt.figure(figsize=(20, 10))
for daily_series in (raw_avg_weights, smart_averages):
    plt.plot(x_values, daily_series)
plt.ylim([0, 5000])
plt.grid()
plt.show()

In [None]:
# Display the smart averages next to the dates they were computed for.
smart_averages, dates

In [None]:
# Extend the observed dates with the 36 calendar days starting at
# `additional_dates_start`, for use as the x-axis of the extrapolation.
FMT = '%Y-%m-%d'
dates = list(x_values.date.astype(str))
extended_dates = copy(dates)
additional_dates_start = '2019-07-03'
first_additional_day = dt.datetime.strptime(additional_dates_start, FMT)
for i in range(36):
    additional_date = dt.datetime.strftime(first_additional_day + dt.timedelta(i), FMT)
    # Skip days that already have observations: a duplicated date would make
    # `extended_dates` longer than any series that is padded once per date
    # missing from `dates`.
    if additional_date not in dates:
        extended_dates.append(additional_date)
    

In [None]:
# Pad copies of the observed series with None for every extended date that
# has no observation.
extended_raw_weights = list(raw_avg_weights)
extended_smart_averages = list(smart_averages)
for date in extended_dates:
    if date in dates:
        continue
    extended_smart_averages.append(None)
    extended_raw_weights.append(None)
    

In [None]:
# Project the last smart average forward with the last daily growth rate;
# dates before the last observed day get None.
last_date = dates[-1]
last_observed_day = dt.datetime.strptime(last_date, FMT)
extrapolated_data = []
for idx, date in enumerate(extended_dates):
    days_elapsed = (dt.datetime.strptime(date, FMT) - last_observed_day).days
    if days_elapsed < 0:
        extrapolated_data.append(None)
        continue
    extrapolated_data.append(smart_averages[-1] * np.exp(growth_rates[-1] * days_elapsed))
        
    

In [None]:
# Export the observed daily raw and smart averages.
observed_columns = {
    'date': dates,
    'raw_avg_weight': raw_avg_weights,
    'smart_avg_weight': smart_averages,
}
pd.DataFrame(observed_columns).to_csv('/root/data/alok/biomass_estimation/playground/blom_kjeppevikholmen_data.csv')

In [None]:
# Observed series, their extrapolation and the slaughter-weight markers on one axis.
plt.figure(figsize=(20, 10))
date_index = pd.to_datetime(extended_dates)
line_specs = [
    (extended_raw_weights, {'label': 'Raw Average Weight'}),
    (extended_smart_averages, {'label': 'Smart Average Weight'}),
    (extrapolated_data, {'linestyle': '--', 'label': 'Extrapolated Smart Average Weight'}),
]
for series, line_kwargs in line_specs:
    plt.plot(date_index, series, **line_kwargs)

slaughter_points = [
    ('2019-07-08', 4730),
    ('2019-07-09', 4780),
    ('2019-07-10', 4850),
    ('2019-07-30', 5360),
    ('2019-08-05', 5520),
]
for position, (slaughter_day, slaughter_weight) in enumerate(slaughter_points):
    # only the first marker carries the legend entry
    label_kwargs = {'label': 'Slaughter Weight'} if position == 0 else {}
    plt.scatter([slaughter_day], slaughter_weight, marker='x', color='b', **label_kwargs)

plt.ylim([0, 6000])
plt.legend()
plt.grid()
plt.show()

In [None]:
# Export the observed series together with the extrapolated smart average.
extrapolated_columns = {
    'date': extended_dates,
    'raw_avg_weight': extended_raw_weights,
    'smart_avg_weight': extended_smart_averages,
    'extrapolated_avg_weight': extrapolated_data,
}
pd.DataFrame(extrapolated_columns).to_csv('/root/data/alok/biomass_estimation/playground/extrapolated_blom_kjeppevikholmen_data.csv')

In [None]:
# Relative difference of 5.41 with respect to 5.52.
(5.41-5.52)/5.52

In [None]:
# Inspect the captured_at values selected by the date string '2019-06-07'.
df.captured_at['2019-06-07']

In [None]:
# Tabulate the extrapolated weight for every extended date.
pd.DataFrame({'date': extended_dates, 'weight': extrapolated_data})

In [None]:
# Last smart average grown for 8 days at the last daily growth rate.
smart_averages[-1] * np.exp(8*growth_rates[-1])

In [None]:
# Display the per-day growth rates collected above.
growth_rates

In [None]:
# Discount each reference weight back by its day offset using growth rate `i`.
# NOTE(review): the weights match the slaughter weights plotted earlier divided
# by 1000; the day offsets are presumably days after the last observation -- confirm.
i = -1
reference_points = [(4.73, 6), (4.78, 7), (4.85, 8), (5.36, 28), (5.52, 34)]
print(*[ref_weight*np.exp(-ref_days*growth_rates[i]) for ref_weight, ref_days in reference_points])

In [None]:
# Relative error of a growth-adjusted prediction against the sample-size-weighted
# mean of three reference weights.
# NOTE(review): ss_2 is 13542 here but 13524 in the later weighted-average cell;
# one of the two is presumably a typo -- confirm.
ss_1, ss_2, ss_3 = 17651, 13542, 3960
gt_1, gt_2, gt_3 = 4.73, 4.78, 4.85
total_ss = ss_1 + ss_2 + ss_3
gt = (gt_1*ss_1 + gt_2*ss_2 + gt_3*ss_3) / total_ss
# prediction: 4.68 grown for 2 days at a daily rate of .0043
pred = 4.68 * np.exp(.0043*2)
e = (pred - gt) / gt
print(e)


In [None]:
# Relative difference of 4.68 with respect to 4.76.
(4.68-4.76)/4.76

In [None]:
# Sample-size-weighted mean of the reference weights after discounting each
# one back by its day offset with growth rate `i`.
i = -1
weighted_points = [
    # (weight, day offset, sample size)
    (4.73, 6, 17651),
    (4.78, 7, 13524),
    (4.85, 8, 3960),
    (5.36, 28, 15259),
    (5.52, 34, 23111),
]
weighted_total = 0.0
total_samples = 0
for ref_weight, ref_days, ref_samples in weighted_points:
    weighted_total = weighted_total + ref_weight*np.exp(-ref_days*growth_rates[i])*ref_samples
    total_samples = total_samples + ref_samples
x = weighted_total / total_samples

In [None]:
# Relative difference of the last smart average with respect to x scaled by 1e3.
(smart_averages[-1]-x*1e3)/(x*1e3)


In [None]:
# Scatter of the individual weight estimates inside a fixed date window.
# NOTE(review): the CSVs loaded in this notebook are named for 2019-06-05 to
# 2019-07-04 ranges, so this 2020 window may select no rows -- confirm.
start_date, end_date = '2020-01-05', '2020-01-11'
window_df = df[start_date:end_date]
plt.figure(figsize=(20, 10))
plt.scatter(window_df.index, window_df.estimated_weight_g)
plt.grid()
plt.show()

<h1> Add in Length / K-Factor </h1>

In [None]:
# Scatter of the individual weight estimates against capture time for the
# selected date window.
start_date, end_date = '2019-06-03', '2019-06-20'
window_df = df[start_date:end_date]
plt.figure(figsize=(20, 10))
plt.scatter(window_df.index, window_df.estimated_weight_g)

plt.grid()
plt.xlabel('Date')
plt.ylabel('Estimated Weight (g)')
plt.title('Weight Predictions vs. Time')
plt.show()