In [None]:
import json, os
import cv2
import torch
from multiprocessing import Pool, Manager
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.akpd import AKPD
from aquabyte.template_matching import find_matches_and_homography
from aquabyte.biomass_estimator import NormalizeCentered2D, NormalizedStabilityTransform, ToTensor, Network
from aquabyte.data_loader import KeypointsDataset, NormalizeCentered2D, ToTensor, BODY_PARTS
from torch.utils.data import Dataset, DataLoader
from aquabyte.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point

from aquabyte.akpd_scorer import generate_confidence_score
from keras.models import load_model
import boto3
import pandas as pd
import numpy as np
import plotly.express as px
import time
from matplotlib import pyplot as plt

from collections import defaultdict
import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression
from collections import defaultdict
from matplotlib.ticker import PercentFormatter



In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Estimate the daily growth rate by fitting an exponential curve.

    The fit is a linear regression of log(daily average weight) on the number
    of days since ``start_date``, with each day weighted by its sample size.

    Args:
        tdf: Series of daily average weights with a DatetimeIndex.
        rdf: Series of daily sample counts aligned with ``tdf``; used as the
            regression sample weights.
        start_date: 'YYYY-MM-DD' string treated as day zero.
        end_date: not used by the computation; kept in the signature.

    Returns:
        Tuple ``(growth_rate, trend_score)``: the slope of the log-linear fit
        and the value of ``LinearRegression.score`` for that fit under the
        same sample weights.
    """
    date_format = '%Y-%m-%d'
    day_zero = dt.datetime.strptime(start_date, date_format)

    day_offsets = []
    for day in tdf.index.date.astype(str):
        day_offsets.append((dt.datetime.strptime(day, date_format) - day_zero).days)

    features = np.array(day_offsets).reshape(-1, 1)
    log_weights = np.log(tdf.values)
    sample_weights = rdf.values

    model = LinearRegression().fit(features, log_weights, sample_weight=sample_weights)
    slope = model.coef_[0]
    fit_quality = model.score(features, log_weights, sample_weight=sample_weights)
    return slope, fit_quality


def compute_distribution_confidence(df, start_date, end_date, date):
    """Score how far today's weight distribution deviates from the window's.

    The window distribution (``start_date`` through ``end_date``) is shifted so
    that its mean matches the mean on ``date``; percentiles 0..99 of both
    distributions are then compared (a QQ comparison) and the root mean square
    of the relative deviations over percentiles 1..98 is returned. Larger
    values therefore mean a larger mismatch.

    Args:
        df: DataFrame with a DatetimeIndex and an ``estimated_weight_g`` column.
        start_date: first day of the window, 'YYYY-MM-DD'.
        end_date: last day of the window, 'YYYY-MM-DD'.
        date: the day being scored, 'YYYY-MM-DD'.

    Returns:
        RMS of the percent deviations between the two percentile curves.
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    today_weights = df[date:date].estimated_weight_g
    percentile_levels = list(range(100))

    # Align the window to today's mean so only the shape is compared.
    mean_shift = today_weights.mean() - window_weights.mean()
    window_q = np.percentile(window_weights + mean_shift, percentile_levels)
    today_q = np.percentile(today_weights, percentile_levels)

    # Drop the extreme percentiles (0 and 99) before averaging.
    relative_gap = (window_q[1:99] - today_q[1:99]) / today_q[1:99]
    return np.mean(np.square(relative_gap)) ** 0.5


# NOTE: we need to think more carefully about this to understand how distribution
# confidence and trend score affect the minimum sample size we want. Hardcoded for now.
def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum number of samples the smart average should pool.

    Both arguments are currently ignored; the threshold is a fixed 5000.
    """
    return 5000


def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Compute the growth-adjusted ("smart") average weight for ``date``.

    Days are pooled around ``date`` out to a lookback distance that starts at
    3 days and grows until the pooled sample count reaches the minimum sample
    size, capped at ``window_size_d - 1`` days. Every pooled day is projected
    onto ``date`` by multiplying its weights by ``exp(day_offset * growth_rate)``.

    Args:
        df: DataFrame with a DatetimeIndex and an ``estimated_weight_g`` column
            holding the individual weights.
        tdf: Series of daily average weights (DatetimeIndex).
        rdf: Series of daily sample counts aligned with ``tdf``. It is not
            modified.
        date: 'YYYY-MM-DD' day the average is computed for.
        distribution_confidence: forwarded to ``compute_minimum_sample_size``.
        growth_rate: daily exponential growth rate; ``None`` (or a single
            available day) is treated as 0.0.
        trend_score: forwarded to ``compute_minimum_sample_size``.
        window_size_d: maximum window size in days.
        bucket_size: width of the distribution buckets, in units of 1000 x the
            weight unit (0.1 -> buckets of 100 when weights are in grams).

    Returns:
        ``(output, adj_weights)`` where ``output`` is a dict with keys
        'weightMovingAvg', 'weightMovingDist', 'numMovingAvgBatiFish',
        'numMovingAvgLookbackDays' and 'dailyGrowthRate', and ``adj_weights``
        is a numpy array of the growth-adjusted individual weights.

    Raises:
        ValueError: if no samples fall inside the lookback window.
    """
    # Keep the index order (no re-sorting) so the dates stay aligned with the
    # values of tdf / rdf when they are zipped together below.
    dates = list(tdf.index.date.astype(str))
    if growth_rate is None or len(dates) == 1:
        growth_rate = 0.0
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    target_day = dt.datetime.strptime(date, '%Y-%m-%d')
    day_offsets = np.array(
        [(target_day - dt.datetime.strptime(k, '%Y-%m-%d')).days for k in dates],
        dtype=float)
    distances = np.abs(day_offsets)
    daily_means = tdf.values
    # Copy: the counts are zeroed below and the caller's series must survive.
    daily_counts = np.array(rdf.values)

    # Widen the lookback until enough samples are pooled. The fallback covers
    # both "never enough samples" and windows too small for the loop to run.
    lookback = max(window_size_d - 1, 0)
    for candidate in range(3, window_size_d):
        if daily_counts[distances <= candidate].sum() >= minimum_sample_size:
            lookback = candidate
            break
    daily_counts[distances > lookback] = 0

    growth_factors = np.exp(day_offsets * growth_rate)
    weighted_sum = 0.0
    sample_size = 0.0
    adj_weights = []
    total_days = 0
    for factor, day_mean, n, this_date in zip(growth_factors, daily_means, daily_counts, dates):
        weighted_sum += factor * day_mean * n
        sample_size += n
        if n > 0:
            adj_weights.extend(factor * df[this_date:this_date].estimated_weight_g.values)
            total_days += 1

    if sample_size == 0:
        raise ValueError('no samples available within the lookback window for {}'.format(date))
    smart_average = weighted_sum / sample_size

    adj_weights = np.array(adj_weights, dtype=float)
    # Assign each weight to exactly one bucket through an integer index; this
    # avoids double counting at bucket edges and keeps the maximum weight.
    bucket_width = 1e3 * bucket_size
    valid = adj_weights[np.isfinite(adj_weights) & (adj_weights >= 0)]
    bucket_counts = np.bincount(np.floor(valid / bucket_width).astype(int))
    # round(..., 6) only strips float noise (e.g. 3 * 0.1 -> 0.3) from the keys.
    distribution = {
        round(i * bucket_size, 6): int(count) / sample_size
        for i, count in enumerate(bucket_counts)
    }

    output = {
        'weightMovingAvg': float(smart_average),
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }

    return output, adj_weights


def compute_date_range(historical_dates, date, window_size_d):
    """Pick the window of ``window_size_d`` days that best covers the data.

    Candidate windows start as the ``window_size_d`` days ending on ``date``
    and are shifted forward one day at a time, up to ``window_size_d // 2``
    days. The window containing the most ``historical_dates`` wins; on ties the
    later (more centered on ``date``) window is preferred.

    Args:
        historical_dates: list of 'YYYY-MM-DD' strings for days that have data.
        date: the current day, 'YYYY-MM-DD'.
        window_size_d: window length in days.

    Returns:
        Tuple ``(start_date, end_date)`` of 'YYYY-MM-DD' strings.
    """
    FMT = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, FMT)
    available = np.array(historical_dates)

    best_window = (None, None)
    best_count = 0
    for shift in range(window_size_d // 2 + 1):
        forward = dt.timedelta(days=shift)
        window_start = (anchor - dt.timedelta(days=window_size_d-1) + forward).strftime(FMT)
        window_end = (anchor + forward).strftime(FMT)
        # ISO date strings compare in chronological order.
        inside = (available >= window_start) & (available <= window_end)
        covered = inside.sum()
        if covered >= best_count:
            best_window = (window_start, window_end)
            best_count = covered

    return best_window


def compute_metrics(date, records_json, window_size_d=7):
    """Compute the raw and smart-average weight statistics for one day.

    Args:
        date: 'YYYY-MM-DD' day to report on.
        records_json: JSON object mapping 'YYYY-MM-DD' strings to lists of
            weight values (stored in the ``estimated_weight_g`` column).
        window_size_d: maximum window length in days.

    Returns:
        Tuple ``(raw_avg_weight, raw_sample_size, smart_average, metadata,
        adj_weights)``. ``smart_average`` and ``adj_weights`` are the two
        values returned by ``compute_smart_average``; ``metadata`` is a dict
        with 'trend_score' and 'distribution_confidence' (both ``None`` when
        the selected window is a single day).
    """
    records = json.loads(records_json)

    dts = [iter_date for iter_date, day_vals in records.items() for _ in day_vals]
    vals = [val for day_vals in records.values() for val in day_vals]

    # Sort by date: the string-based date slicing used below (and in the
    # helper functions) needs a monotonic DatetimeIndex, and the JSON keys are
    # not guaranteed to arrive in chronological order.
    df = pd.DataFrame(vals, index=pd.to_datetime(dts), columns=['estimated_weight_g'])
    df = df.sort_index(kind='mergesort')

    # get raw statistics
    raw_avg_weight = df[date:date].estimated_weight_g.mean()
    raw_sample_size = df[date:date].shape[0]

    # compute relevant date range
    historical_dates = sorted(set(df.index.date.astype(str)))
    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)

    # daily sample counts (rdf) and daily means (tdf), keeping only days with data
    daily = df[start_date:end_date].estimated_weight_g.resample('D')
    rdf = daily.size()
    tdf = daily.mean()
    has_data = rdf > 0
    tdf = tdf[has_data].copy(deep=True)
    rdf = rdf[has_data].copy(deep=True)

    growth_rate, trend_score, distribution_confidence = None, None, None
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)
    smart_average, adj_weights = compute_smart_average(df, tdf, rdf, date,
                                          distribution_confidence, growth_rate,
                                          trend_score, window_size_d)
    metadata = {
        'trend_score': trend_score,
        'distribution_confidence': distribution_confidence
    }

    return raw_avg_weight, raw_sample_size, smart_average, metadata, adj_weights

In [None]:
# Data-access helpers. The credentials file is opened in a `with` block so the
# handle is closed as soon as the JSON has been parsed.
s3_access_utils = S3AccessUtils('/root/data')
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# CSV exports for pen 61, concatenated into a single frame.
PEN_61_DIR = '/root/data/alok/biomass_estimation/playground/pen=61'
PEN_61_FILES = [
    'data_dump_1.csv',
    'biomass.csv-61-06-from-2019-10-25-to-2019-11-01.csv',
    'biomass.csv-61-07-from-2019-11-01-to-2019-11-08.csv',
    'biomass.csv-61-08-from-2019-11-08-to-2019-11-15.csv',
    'biomass.csv-61-09-from-2019-11-15-to-2019-11-22.csv',
    'biomass.csv-61-10-from-2019-11-22-to-2019-11-29.csv',
    'biomass.csv-61-11-from-2019-11-29-to-2019-12-06.csv',
    'biomass.csv-61-12-from-2019-12-06-to-2019-12-13.csv',
    'biomass.csv-61-13-from-2019-12-13-to-2019-12-20.csv',
    'biomass.csv-61-14-from-2019-12-20-to-2019-12-27.csv',
]
df = pd.concat([pd.read_csv(os.path.join(PEN_61_DIR, file_name)) for file_name in PEN_61_FILES])



In [None]:
# Index by capture time and keep only rows that have a weight and an AKPD
# score above 0.9.
df.index = pd.to_datetime(df.captured_at)
df['estimated_weight_g'] = df.weight
df = df[(~df.estimated_weight_g.isnull()) & (df.akpd_score > 0.9)].copy(deep=True)

# Collect the weights per calendar day ('YYYY-MM-DD' -> list of weights).
# Grouping on the day string replaces `df[date]`: selecting rows of a
# DataFrame with a bare date string is no longer supported by pandas (the key
# is looked up as a column), and the groupby is a single pass over the data.
records = defaultdict(list)
for date, day_weights in df.estimated_weight_g.groupby(df.index.date.astype(str)):
    records[date].extend(day_weights.values.tolist())

records_json = json.dumps(records)


In [None]:
# Run compute_metrics for every day that has data, handing it only the records
# that lie close to that day, and collect the per-day results in parallel lists.
FMT = '%Y-%m-%d'
dates = sorted(set(df.index.date.astype(str)))
raw_avg_weights, raw_sample_sizes, growth_rates = [], [], []
trend_scores, smart_averages, distribution_confidences = [], [], []
adj_weights_dict = {}
for date in dates:
    print(date)
    anchor = dt.datetime.strptime(date, FMT)
    # Offsets -10..9 are subtracted, so the neighbourhood runs from 10 days
    # after `date` back to 9 days before it.
    dates_to_include = [
        dt.datetime.strftime(anchor - dt.timedelta(days=i), FMT)
        for i in range(-10, 10)
    ]
    nearby_records = {k: v for k, v in records.items() if k in dates_to_include}
    this_records_json = json.dumps(nearby_records)

    raw_avg_weight, raw_sample_size, smart_average, metadata, adj_weights = \
        compute_metrics(date, this_records_json)

    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    distribution_confidences.append(metadata['distribution_confidence'])
    adj_weights_dict[date] = adj_weights


In [None]:
# Five stacked panels, one per daily metric, sharing the same daily x values.
fig, axes = plt.subplots(5, 1, figsize=(10, 20))
x_values = df.estimated_weight_g.resample('D').mean().dropna().index
ax_avg, ax_count, ax_growth, ax_trend, ax_dist = axes

ax_avg.plot(x_values, raw_avg_weights, label='Raw Avg.')
ax_avg.plot(x_values, smart_averages, label='Smart Avg.')
# ax_avg.plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
# ax_avg.plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
ax_count.plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
ax_growth.plot(x_values, growth_rates)
ax_trend.plot(x_values, trend_scores)
ax_dist.plot(x_values, distribution_confidences)

panel_titles = ['Avg. weight', 'Raw Sample Size', 'Growth rate',
                'Local trend score', 'Distribution Instability']
for panel, title in zip(axes, panel_titles):
    panel.set_title(title)
    panel.grid()
    panel.legend()
plt.show()

In [None]:
# Per-day table of the smart average, the raw average and the growth rate.
pd.DataFrame({
    'date': dates,
    'avg': smart_averages,
    'raw_avg': raw_avg_weights,
    'growth_rate': growth_rates,
})

In [None]:
# Scratch check: relative difference between 2139 scaled by exp(-6 * 0.016)
# and 2048.
# NOTE(review): presumably two average weights taken 6 days apart and a
# 1.6%/day growth rate -- confirm where these numbers come from.
(2139 * np.exp(-6 * 0.016) - 2048) / 2048

In [None]:
# Average daily exponential growth rate implied by the first and last smart
# averages. The divisor is the number of calendar days between the first and
# last date rather than len(dates): n dates span n - 1 intervals, and days
# without data do not appear in `dates` at all.
elapsed_days = (pd.Timestamp(dates[-1]) - pd.Timestamp(dates[0])).days
np.log(smart_averages[-1] / smart_averages[0]) / elapsed_days if elapsed_days > 0 else float('nan')

In [None]:
# Histogram of the growth-adjusted weights pooled for 2019-12-05.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(adj_weights_dict['2019-12-05'])
ax.grid()
plt.show()

In [None]:
# Weighing data used as ground truth in the comparison cells below.
gt_csv_path = '/root/data/alok/biomass_estimation/playground/imr_dec_weighing.csv'
gt_df = pd.read_csv(gt_csv_path)

In [None]:
# Overlay the ground-truth and predicted weight distributions. Every sample is
# weighted by 1/N so the bar heights are fractions of each population.
predicted_weights = adj_weights_dict['2019-12-05']
weight_bins = np.arange(0, 9000, 1000)

fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(gt_df.W, bins=weight_bins, color='red', alpha=0.8,
        weights=np.ones(len(gt_df.W)) / len(gt_df.W), label='Ground Truth')
ax.hist(predicted_weights, bins=weight_bins, color='blue', alpha=0.5,
        weights=np.ones(len(predicted_weights)) / len(predicted_weights), label='Prediction')

ax.grid()
ax.legend(fontsize=18)
ax.yaxis.set_major_formatter(PercentFormatter(1))
ax.set_title('IMR December Weighing - Distribution Comparison', fontsize=20)
ax.set_xlabel('Weight Bucket (g)', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
ax.tick_params(axis='both', which='major', labelsize=18)
plt.show()


In [None]:
# Fraction of ground-truth fish per 1000-wide weight bucket, plus the bin edges.
gt_weights = gt_df.W
gt_share = np.ones(len(gt_weights)) / len(gt_weights)
hist = np.histogram(gt_weights, bins=np.arange(0, 9000, 1000), weights=gt_share)
pct_1, bins = hist

In [None]:
# Fraction of predicted (growth-adjusted) weights per 1000-wide weight bucket.
predicted_weights = adj_weights_dict['2019-12-05']
predicted_share = np.ones(len(predicted_weights)) / len(predicted_weights)
pct_2 = np.histogram(predicted_weights, bins=np.arange(0, 9000, 1000), weights=predicted_share)[0]

In [None]:
# Predicted vs ground-truth share (in percent) for every weight bucket.
bucket_labels = ['{}-{}'.format(low, high) for low, high in zip(bins[:-1], bins[1:])]
pd.DataFrame({
    'weight_bucket': bucket_labels,
    'predicted_pct': pct_2 * 100,
    'ground_truth_pct': pct_1 * 100
})

In [None]:
# Mean ground-truth weight inside the 4000-5000 bucket. The bucket is
# half-open, [4000, 5000), to match how np.histogram assigns values to this
# bin, so a weight of exactly 4000 is included.
gt_df[(gt_df.W >= 4000) & (gt_df.W < 5000)].W.mean()

In [None]:
# Mean predicted weight inside the 4000-5000 bucket. The bucket is half-open,
# [4000, 5000), to match how np.histogram assigns values to this bin.
x = adj_weights_dict['2019-12-05']
x[(x >= 4000) & (x < 5000)].mean()

In [None]:
def _bucket_mean(values, low, high, include_high):
    """Mean of the values in [low, high), or [low, high] when include_high is set."""
    values = np.asarray(values, dtype=float)
    below_high = values <= high if include_high else values < high
    selected = values[(values >= low) & below_high]
    # An empty bucket gives NaN, without numpy's empty-slice warning.
    return selected.mean() if selected.size else np.nan


# Mean predicted and ground-truth weight per bucket. The bucket bounds come
# straight from the histogram edges (no round trip through the label strings)
# and use np.histogram's convention: every bin is half-open except the last
# one, which also includes its upper edge. This keeps the means consistent
# with the bucket percentages.
weight_buckets = ['{}-{}'.format(bins[i], bins[i+1]) for i in range(len(bins) - 1)]
predicted_means, ground_truth_means = [], []
last_bucket = len(bins) - 2
for idx, (low, high) in enumerate(zip(bins[:-1], bins[1:])):
    include_high = idx == last_bucket
    predicted_means.append(_bucket_mean(x, low, high, include_high))
    ground_truth_means.append(_bucket_mean(gt_df.W, low, high, include_high))

    

In [None]:
# Per-bucket accuracy table: bucket shares, bucket means, and their errors.
bucket_labels = ['{}-{}'.format(low, high) for low, high in zip(bins[:-1], bins[1:])]
predicted_mean_arr = np.array(predicted_means)
ground_truth_mean_arr = np.array(ground_truth_means)
# Difference of the two shares, expressed in percentage points.
share_gap = np.array(pct_2) - np.array(pct_1)
# Error of the predicted bucket mean relative to the ground-truth bucket mean.
relative_mean_gap = (predicted_mean_arr - ground_truth_mean_arr) / ground_truth_mean_arr

results_df = pd.DataFrame({
    'weight_bucket': bucket_labels,
    'predicted_pct': pct_2 * 100,
    'ground_truth_pct': pct_1 * 100,
    'predicted_mean': predicted_means,
    'ground_truth_mean': ground_truth_means,
    'count_error_pct': 100 * share_gap,
    'weight_prediction_error_pct': 100 * relative_mean_gap
})

In [None]:
# Save the accuracy table, rounded to two decimals, as CSV.
accuracy_csv_path = '/root/data/alok/biomass_estimation/playground/distribution_accuracy.csv'
rounded_results = results_df.round(2)
rounded_results.to_csv(accuracy_csv_path)

In [None]:
# Display results_df as the cell output.
results_df

In [None]:
# Overlap between the two bucket distributions: sum of the element-wise minima
# divided by the sum of the element-wise maxima (1.0 means identical shares).
shared_mass = np.minimum(pct_1, pct_2).sum()
covered_mass = np.maximum(pct_1, pct_2).sum()
shared_mass / covered_mass