In [None]:
import json, os
import cv2
import torch
from multiprocessing import Pool, Manager
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.akpd import AKPD
from aquabyte.template_matching import find_matches_and_homography
from aquabyte.biomass_estimator import NormalizeCentered2D, NormalizedStabilityTransform, ToTensor, Network
from aquabyte.akpd_scorer import generate_confidence_score
from keras.models import load_model
import boto3
import pandas as pd
import numpy as np
import plotly.express as px
import time
from matplotlib import pyplot as plt

from collections import defaultdict
import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression
from collections import defaultdict



In [None]:
# Shared data-access clients used by the cells below.
s3_access_utils = S3AccessUtils('/root/data')

# The path to the SQL credentials JSON comes from the environment; read it inside a
# `with` block so the file handle is closed as soon as the credentials are parsed.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))



In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.linear_model import LinearRegression

def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Estimate the daily growth rate by fitting an exponential curve.

    The log of each day's mean weight is regressed linearly on the number of
    days elapsed since `start_date`, with each day weighted by its sample size.

    Args:
        tdf: daily mean weights, indexed by a DatetimeIndex.
        rdf: daily sample sizes, aligned with `tdf`; used as regression weights.
        start_date: 'YYYY-MM-DD' string taken as day zero.
        end_date: accepted for symmetry with the callers; not used here.

    Returns:
        (growth_rate, trend_score): the fitted slope and the value returned by
        `LinearRegression.score` on the same weighted data.
    """
    day_zero = dt.datetime.strptime(start_date, '%Y-%m-%d')
    elapsed_days = []
    for day_label in tdf.index.date.astype(str):
        elapsed_days.append((dt.datetime.strptime(day_label, '%Y-%m-%d') - day_zero).days)

    day_column = np.array(elapsed_days).reshape(-1, 1)
    log_mean_weights = np.log(tdf.values)
    day_sample_sizes = rdf.values

    model = LinearRegression().fit(day_column, log_mean_weights, sample_weight=day_sample_sizes)
    slope = model.coef_[0]
    fit_quality = model.score(day_column, log_mean_weights, sample_weight=day_sample_sizes)
    return slope, fit_quality


def compute_distribution_confidence(df, start_date, end_date, date):
    """Score how closely the weights on `date` match the weights of the whole window.

    The window's weights are shifted so that their mean equals the mean on
    `date`; integer percentiles 0..99 of both samples are then compared
    (a QQ comparison). The result is the root-mean-square of the relative
    deviations over percentiles 1..98, so 0.0 means identical quantiles.

    Args:
        df: DataFrame with a DatetimeIndex and an `estimated_weight_g` column.
        start_date, end_date: inclusive window bounds as 'YYYY-MM-DD' strings.
        date: the day being compared against the window, 'YYYY-MM-DD'.
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    day_weights = df[date:date].estimated_weight_g

    # align the window's mean with the day's mean before comparing shapes
    mean_shift = day_weights.mean() - window_weights.mean()

    percentile_levels = list(range(100))
    window_quantiles = np.percentile(window_weights + mean_shift, percentile_levels)
    day_quantiles = np.percentile(day_weights, percentile_levels)

    relative_gap = (window_quantiles[1:99] - day_quantiles[1:99]) / day_quantiles[1:99]
    return np.mean(np.square(relative_gap)) ** 0.5


def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum sample size wanted before the lookback stops.

    NOTE: we need to think more carefully about how distribution confidence
    and trend score should affect this value. Both arguments are currently
    ignored and the result is hardcoded.
    """
    hardcoded_minimum = 5000
    return hardcoded_minimum
    
def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Compute the growth-adjusted, sample-weighted "smart" average weight for `date`.

    The smart average looks back (by absolute day offset from `date`) one day at a
    time, up to `window_size_d` days, or until the minimum sample size returned by
    compute_minimum_sample_size(...) is reached. Every included day's weights are
    scaled by exp(day_offset * growth_rate) before being averaged.

    Args:
        df: DataFrame with a DatetimeIndex and an `estimated_weight_g` column holding
            the individual weights.
        tdf: daily mean weights, indexed by a DatetimeIndex.
        rdf: daily sample sizes, aligned with `tdf`. It is NOT modified.
        date: reference day as a 'YYYY-MM-DD' string.
        distribution_confidence, trend_score: forwarded to compute_minimum_sample_size.
        growth_rate: daily exponential growth rate. Treated as 0.0 when it is None
            or when `tdf` holds a single day.
        window_size_d: maximum number of lookback days.
        bucket_size: width of the distribution buckets, in the unit obtained by
            multiplying weights by 1e-3 (kg if the weights are grams). Must be > 0.

    Returns:
        dict with keys 'weightMovingAvg', 'weightMovingDist' (bucket lower edge ->
        fraction of the sample), 'numMovingAvgBatiFish', 'numMovingAvgLookbackDays'
        and 'dailyGrowthRate'. When no samples fall inside the lookback, every value
        is None.

    Raises:
        ValueError: if `bucket_size` is not a positive finite number.
    """
    from decimal import Decimal

    if not (bucket_size > 0 and np.isfinite(bucket_size)):
        raise ValueError('bucket_size must be a positive finite number')

    empty_output = {
        'weightMovingAvg': None,
        'weightMovingDist': None,
        'numMovingAvgBatiFish': None,
        'numMovingAvgLookbackDays': None,
        'dailyGrowthRate': None
    }
    if len(tdf) == 0:
        return empty_output

    # keep the labels in tdf order so they stay aligned with the offsets/means/counts
    day_labels = list(tdf.index.date.astype(str))
    if growth_rate is None or len(day_labels) == 1:
        growth_rate = 0.0
    growth_rate = float(growth_rate)
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    reference_day = dt.datetime.strptime(date, '%Y-%m-%d')
    day_offsets = np.array(
        [(reference_day - dt.datetime.strptime(label, '%Y-%m-%d')).days for label in day_labels],
        dtype=float)
    daily_means = np.asarray(tdf.values, dtype=float)
    # np.array(...) copies: zeroing counts below must not write through to the caller's rdf
    daily_counts = np.array(rdf.values, dtype=float)

    # widen the lookback until enough samples are collected, capped at window_size_d - 1
    abs_offsets = np.abs(day_offsets)
    lookback_d = max(window_size_d - 1, 0)
    for candidate in range(window_size_d):
        if daily_counts[abs_offsets <= candidate].sum() >= minimum_sample_size:
            lookback_d = candidate
            break
    daily_counts[abs_offsets > lookback_d] = 0

    included = daily_counts > 0
    sample_size = float(daily_counts.sum())
    if sample_size == 0:
        return empty_output

    growth_factors = np.exp(day_offsets * growth_rate)
    weighted_sum = (growth_factors * daily_means * daily_counts)[included].sum()
    smart_average = float(weighted_sum / sample_size)

    # growth-adjusted individual weights of every included day
    adjusted_by_day = [
        factor * df[label:label].estimated_weight_g.values
        for label, factor, keep in zip(day_labels, growth_factors, included) if keep
    ]
    total_days = len(adjusted_by_day)
    adj_weights = np.concatenate(adjusted_by_day).astype(float)

    # Assign each weight to exactly one bucket [k * width, (k + 1) * width). Indexing by
    # floor() keeps the heaviest fish even when it sits exactly on a bucket edge and
    # avoids double counting caused by float-rounded bucket bounds.
    bucket_width = 1e3 * bucket_size
    key_decimals = max(1, -Decimal(str(float(bucket_size))).as_tuple().exponent)
    bucket_index = np.floor(adj_weights / bucket_width).astype(int)
    bucket_counts = np.bincount(bucket_index[bucket_index >= 0])
    distribution = {
        round(float(k * bucket_size), key_decimals): int(count) / sample_size
        for k, count in enumerate(bucket_counts)
    }

    output = {
        'weightMovingAvg': smart_average,
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }

    return output


def compute_date_range(historical_dates, date, window_size_d):
    """Pick the `window_size_d`-day window around `date` that covers the most known dates.

    Candidate windows start with the one ending on `date` and are shifted forward
    one day at a time, up to `window_size_d // 2` days. Ties are resolved in
    favour of the later candidate, so when future data is available relative to
    `date`, windows in which `date` is more centered are preferred.

    Args:
        historical_dates: 'YYYY-MM-DD' strings for the days that have data.
        date: current day as a 'YYYY-MM-DD' string.
        window_size_d: window length in days.

    Returns:
        (start_date, end_date) as 'YYYY-MM-DD' strings, both inclusive.
    """
    date_format = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, date_format)
    known_dates = np.array(historical_dates)

    best_start, best_end = None, None
    best_coverage = 0
    for shift in range(window_size_d // 2 + 1):
        window_start = (anchor - dt.timedelta(days=window_size_d - 1)
                        + dt.timedelta(days=shift)).strftime(date_format)
        window_end = (anchor + dt.timedelta(days=shift)).strftime(date_format)
        # ISO date strings order lexicographically, so string comparison is enough
        coverage = ((known_dates >= window_start) & (known_dates <= window_end)).sum()
        if coverage >= best_coverage:
            best_start, best_end = window_start, window_end
            best_coverage = coverage

    return best_start, best_end


def compute_metrics(date, records_json, window_size_d=7):
    """Compute the smart-average metrics for `date` from JSON-encoded daily weights.

    Args:
        date: day to evaluate, as a 'YYYY-MM-DD' string. If no data lies within the
            window around it, the closest day that has data is used instead.
        records_json: JSON object mapping 'YYYY-MM-DD' strings to lists of weights.
            The dates may appear in any order.
        window_size_d: window length in days.

    Returns:
        A JSON string encoding the dict produced by compute_smart_average.
        When there are no measurements at all (no dates, or only empty lists), a
        plain dict with the same keys and None values is returned instead of a JSON
        string; the no-data check in this notebook indexes that result directly.
    """
    records = json.loads(records_json)

    # flatten {date: [weights]} into parallel lists of dates and values
    dts, vals = [], []
    for iter_date, day_vals in records.items():
        dts.extend([iter_date] * len(day_vals))
        vals.extend(day_vals)

    if len(vals) == 0:
        output = {
            'weightMovingAvg': None,
            'weightMovingDist': None,
            'numMovingAvgBatiFish': None,
            'numMovingAvgLookbackDays': None,
            'dailyGrowthRate': None
        }
        return output

    df = pd.DataFrame(vals, index=pd.to_datetime(dts), columns=['estimated_weight_g'])
    # label-based date slicing below needs a monotonic index; JSON key order is arbitrary
    df = df.sort_index(kind='mergesort')

    historical_dates = sorted(set(df.index.date.astype(str)))

    # compute required date window (NOTE: NEED TO ADD LOGIC THAT CAUSES THIS TO INFLUENCE CONFIDENCE SCORE)
    requested_day = dt.datetime.strptime(date, '%Y-%m-%d')
    day_deltas = np.array([(dt.datetime.strptime(k, '%Y-%m-%d') - requested_day).days
                           for k in historical_dates])
    in_window = (day_deltas > -window_size_d) & (day_deltas <= window_size_d // 2)
    if not in_window.any():
        # nothing near the requested day: fall back to the closest day with data
        date = historical_dates[int(np.argmin(np.abs(day_deltas)))]

    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)

    # daily sample sizes (rdf) and daily mean weights (tdf), restricted to days with data
    daily = df[start_date:end_date].estimated_weight_g.resample('D')
    rdf = daily.size()
    tdf = daily.mean()
    has_samples = rdf > 0
    tdf = tdf[has_samples].copy(deep=True)
    rdf = rdf[has_samples].copy(deep=True)

    growth_rate, trend_score, distribution_confidence = None, None, None
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        if date in historical_dates:
            distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)

    smart_average = compute_smart_average(df, tdf, rdf, date,
                                          distribution_confidence, growth_rate,
                                          trend_score, window_size_d)

    return json.dumps(smart_average)

if __name__ == '__main__':

    # Smoke checks for compute_metrics. Each case is
    # (records, date, expected average, decimals for the average, expected growth rate).
    smoke_cases = [
        # gap in data
        ({'2020-01-01': [1000, 2000, 3000], '2020-01-07': [5000, 4000, 3000]},
         '2020-01-07', 4000, 6, 0.11552),
        # no historical data in 7 day window
        ({'2020-01-01': [1000, 2000, 3000], '2020-01-08': [5000, 4000, 3000]},
         '2020-01-08', 4000, 6, 0.0),
        # no data for today
        ({'2020-01-05': [1000, 2000, 3000], '2020-01-07': [1010, 2030, 3050]},
         '2020-01-09', 2060.45, 6, 0.007444),
        # no data in window
        ({'2020-01-05': [1000, 2000, 3000], '2020-01-12': [1010, 2030, 3050]},
         '2020-01-19', 2030, 4, 0.0),
    ]

    for records, date, expected_avg, avg_decimals, expected_growth in smoke_cases:
        records_json = json.dumps(records)
        smart_average = compute_metrics(date, records_json)
        smart_average = json.loads(smart_average)
        np.testing.assert_almost_equal(smart_average['weightMovingAvg'], expected_avg, avg_decimals)
        np.testing.assert_almost_equal(smart_average['dailyGrowthRate'], expected_growth, 4)

    # no data at all in array: the result is indexed directly, without json.loads
    records = {}
    records_json = json.dumps(records)
    date = '2020-01-19'

    smart_average = compute_metrics(date, records_json)
    np.testing.assert_equal(smart_average['weightMovingAvg'], None)
    np.testing.assert_equal(smart_average['dailyGrowthRate'], None)


In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Estimate the daily growth rate by fitting an exponential curve.

    The log of each day's mean weight is regressed linearly on the number of
    days elapsed since `start_date`, with each day weighted by its sample size.

    Args:
        tdf: daily mean weights, indexed by a DatetimeIndex.
        rdf: daily sample sizes, aligned with `tdf`; used as regression weights.
        start_date: 'YYYY-MM-DD' string taken as day zero.
        end_date: accepted for symmetry with the callers; not used here.

    Returns:
        (growth_rate, trend_score): the fitted slope and the value returned by
        `LinearRegression.score` on the same weighted data.
    """
    day_zero = dt.datetime.strptime(start_date, '%Y-%m-%d')
    elapsed_days = []
    for day_label in tdf.index.date.astype(str):
        elapsed_days.append((dt.datetime.strptime(day_label, '%Y-%m-%d') - day_zero).days)

    day_column = np.array(elapsed_days).reshape(-1, 1)
    log_mean_weights = np.log(tdf.values)
    day_sample_sizes = rdf.values

    model = LinearRegression().fit(day_column, log_mean_weights, sample_weight=day_sample_sizes)
    slope = model.coef_[0]
    fit_quality = model.score(day_column, log_mean_weights, sample_weight=day_sample_sizes)
    return slope, fit_quality


def compute_distribution_confidence(df, start_date, end_date, date):
    """Score how closely the weights on `date` match the weights of the whole window.

    The window's weights are shifted so that their mean equals the mean on
    `date`; integer percentiles 0..99 of both samples are then compared
    (a QQ comparison). The result is the root-mean-square of the relative
    deviations over percentiles 1..98, so 0.0 means identical quantiles.

    Args:
        df: DataFrame with a DatetimeIndex and an `estimated_weight_g` column.
        start_date, end_date: inclusive window bounds as 'YYYY-MM-DD' strings.
        date: the day being compared against the window, 'YYYY-MM-DD'.
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    day_weights = df[date:date].estimated_weight_g

    # align the window's mean with the day's mean before comparing shapes
    mean_shift = day_weights.mean() - window_weights.mean()

    percentile_levels = list(range(100))
    window_quantiles = np.percentile(window_weights + mean_shift, percentile_levels)
    day_quantiles = np.percentile(day_weights, percentile_levels)

    relative_gap = (window_quantiles[1:99] - day_quantiles[1:99]) / day_quantiles[1:99]
    return np.mean(np.square(relative_gap)) ** 0.5


def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum sample size wanted before the lookback stops.

    NOTE: we need to think more carefully about how distribution confidence
    and trend score should affect this value. Both arguments are currently
    ignored and the result is hardcoded.
    """
    hardcoded_minimum = 5000
    return hardcoded_minimum
    
def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Compute the growth-adjusted, sample-weighted "smart" average weight for `date`.

    The smart average looks back (by absolute day offset from `date`) one day at a
    time, up to `window_size_d` days, or until the minimum sample size returned by
    compute_minimum_sample_size(...) is reached. Every included day's weights are
    scaled by exp(day_offset * growth_rate) before being averaged.

    Args:
        df: DataFrame with a DatetimeIndex and an `estimated_weight_g` column holding
            the individual weights.
        tdf: daily mean weights, indexed by a DatetimeIndex.
        rdf: daily sample sizes, aligned with `tdf`. It is NOT modified.
        date: reference day as a 'YYYY-MM-DD' string.
        distribution_confidence, trend_score: forwarded to compute_minimum_sample_size.
        growth_rate: daily exponential growth rate. Treated as 0.0 when it is None
            or when `tdf` holds a single day.
        window_size_d: maximum number of lookback days.
        bucket_size: width of the distribution buckets, in the unit obtained by
            multiplying weights by 1e-3 (kg if the weights are grams). Must be > 0.

    Returns:
        dict with keys 'weightMovingAvg', 'weightMovingDist' (bucket lower edge ->
        fraction of the sample), 'numMovingAvgBatiFish', 'numMovingAvgLookbackDays'
        and 'dailyGrowthRate'. When no samples fall inside the lookback, every value
        is None.

    Raises:
        ValueError: if `bucket_size` is not a positive finite number.
    """
    from decimal import Decimal

    if not (bucket_size > 0 and np.isfinite(bucket_size)):
        raise ValueError('bucket_size must be a positive finite number')

    empty_output = {
        'weightMovingAvg': None,
        'weightMovingDist': None,
        'numMovingAvgBatiFish': None,
        'numMovingAvgLookbackDays': None,
        'dailyGrowthRate': None
    }
    if len(tdf) == 0:
        return empty_output

    # keep the labels in tdf order so they stay aligned with the offsets/means/counts
    day_labels = list(tdf.index.date.astype(str))
    if growth_rate is None or len(day_labels) == 1:
        growth_rate = 0.0
    growth_rate = float(growth_rate)
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    reference_day = dt.datetime.strptime(date, '%Y-%m-%d')
    day_offsets = np.array(
        [(reference_day - dt.datetime.strptime(label, '%Y-%m-%d')).days for label in day_labels],
        dtype=float)
    daily_means = np.asarray(tdf.values, dtype=float)
    # np.array(...) copies: zeroing counts below must not write through to the caller's rdf
    daily_counts = np.array(rdf.values, dtype=float)

    # widen the lookback until enough samples are collected, capped at window_size_d - 1
    abs_offsets = np.abs(day_offsets)
    lookback_d = max(window_size_d - 1, 0)
    for candidate in range(window_size_d):
        if daily_counts[abs_offsets <= candidate].sum() >= minimum_sample_size:
            lookback_d = candidate
            break
    daily_counts[abs_offsets > lookback_d] = 0

    included = daily_counts > 0
    sample_size = float(daily_counts.sum())
    if sample_size == 0:
        return empty_output

    growth_factors = np.exp(day_offsets * growth_rate)
    weighted_sum = (growth_factors * daily_means * daily_counts)[included].sum()
    smart_average = float(weighted_sum / sample_size)

    # growth-adjusted individual weights of every included day
    adjusted_by_day = [
        factor * df[label:label].estimated_weight_g.values
        for label, factor, keep in zip(day_labels, growth_factors, included) if keep
    ]
    total_days = len(adjusted_by_day)
    adj_weights = np.concatenate(adjusted_by_day).astype(float)

    # Assign each weight to exactly one bucket [k * width, (k + 1) * width). Indexing by
    # floor() keeps the heaviest fish even when it sits exactly on a bucket edge and
    # avoids double counting caused by float-rounded bucket bounds.
    bucket_width = 1e3 * bucket_size
    key_decimals = max(1, -Decimal(str(float(bucket_size))).as_tuple().exponent)
    bucket_index = np.floor(adj_weights / bucket_width).astype(int)
    bucket_counts = np.bincount(bucket_index[bucket_index >= 0])
    distribution = {
        round(float(k * bucket_size), key_decimals): int(count) / sample_size
        for k, count in enumerate(bucket_counts)
    }

    output = {
        'weightMovingAvg': smart_average,
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }

    return output


def compute_date_range(historical_dates, date, window_size_d):
    """Pick the `window_size_d`-day window around `date` that covers the most known dates.

    Candidate windows start with the one ending on `date` and are shifted forward
    one day at a time, up to `window_size_d // 2` days. Ties are resolved in
    favour of the later candidate, so when future data is available relative to
    `date`, windows in which `date` is more centered are preferred.

    Args:
        historical_dates: 'YYYY-MM-DD' strings for the days that have data.
        date: current day as a 'YYYY-MM-DD' string.
        window_size_d: window length in days.

    Returns:
        (start_date, end_date) as 'YYYY-MM-DD' strings, both inclusive.
    """
    date_format = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, date_format)
    known_dates = np.array(historical_dates)

    best_start, best_end = None, None
    best_coverage = 0
    for shift in range(window_size_d // 2 + 1):
        window_start = (anchor - dt.timedelta(days=window_size_d - 1)
                        + dt.timedelta(days=shift)).strftime(date_format)
        window_end = (anchor + dt.timedelta(days=shift)).strftime(date_format)
        # ISO date strings order lexicographically, so string comparison is enough
        coverage = ((known_dates >= window_start) & (known_dates <= window_end)).sum()
        if coverage >= best_coverage:
            best_start, best_end = window_start, window_end
            best_coverage = coverage

    return best_start, best_end


def compute_metrics(date, records_json, window_size_d=7):
    """Compute the smart-average metrics for `date` from JSON-encoded daily weights.

    Args:
        date: day to evaluate, as a 'YYYY-MM-DD' string. If no data lies within the
            window around it, the closest day that has data is used instead.
        records_json: JSON object mapping 'YYYY-MM-DD' strings to lists of weights.
            The dates may appear in any order.
        window_size_d: window length in days.

    Returns:
        A JSON string encoding the dict produced by compute_smart_average.
        When there are no measurements at all (no dates, or only empty lists), a
        plain dict with the same keys and None values is returned instead of a JSON
        string, matching the earlier definition of compute_metrics in this notebook.
    """
    records = json.loads(records_json)

    # flatten {date: [weights]} into parallel lists of dates and values
    dts, vals = [], []
    for iter_date, day_vals in records.items():
        dts.extend([iter_date] * len(day_vals))
        vals.extend(day_vals)

    if len(vals) == 0:
        output = {
            'weightMovingAvg': None,
            'weightMovingDist': None,
            'numMovingAvgBatiFish': None,
            'numMovingAvgLookbackDays': None,
            'dailyGrowthRate': None
        }
        return output

    df = pd.DataFrame(vals, index=pd.to_datetime(dts), columns=['estimated_weight_g'])
    # label-based date slicing below needs a monotonic index; JSON key order is arbitrary
    df = df.sort_index(kind='mergesort')

    historical_dates = sorted(set(df.index.date.astype(str)))

    # if nothing lies near the requested day, fall back to the closest day with data
    requested_day = dt.datetime.strptime(date, '%Y-%m-%d')
    day_deltas = np.array([(dt.datetime.strptime(k, '%Y-%m-%d') - requested_day).days
                           for k in historical_dates])
    in_window = (day_deltas > -window_size_d) & (day_deltas <= window_size_d // 2)
    if not in_window.any():
        date = historical_dates[int(np.argmin(np.abs(day_deltas)))]

    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)

    # daily sample sizes (rdf) and daily mean weights (tdf), restricted to days with data
    daily = df[start_date:end_date].estimated_weight_g.resample('D')
    rdf = daily.size()
    tdf = daily.mean()
    has_samples = rdf > 0
    tdf = tdf[has_samples].copy(deep=True)
    rdf = rdf[has_samples].copy(deep=True)

    growth_rate, trend_score, distribution_confidence = None, None, None
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        # the comparison needs weights on `date` itself
        if date in historical_dates:
            distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)

    smart_average = compute_smart_average(df, tdf, rdf, date,
                                          distribution_confidence, growth_rate,
                                          trend_score, window_size_d)

    return json.dumps(smart_average)

# The guard is active again: with it commented out, the indented statements below
# were parsed as unreachable code after the `return` of compute_metrics.
if __name__ == '__main__':

    # test function with gap in data
    records = {
        '2020-01-01': [1000, 2000, 3000],
        '2020-01-07': [5000, 4000, 3000]
    }

    records_json = json.dumps(records)
    date = '2020-01-07'

    smart_average = compute_metrics(date, records_json)
    smart_average = json.loads(smart_average)
    np.testing.assert_almost_equal(smart_average['weightMovingAvg'], 4000, 6)
    np.testing.assert_almost_equal(smart_average['dailyGrowthRate'], 0.11552, 4)

    # test function with no historical data in 7 day window
    records = {
        '2020-01-01': [1000, 2000, 3000],
        '2020-01-08': [5000, 4000, 3000]
    }

    records_json = json.dumps(records)
    date = '2020-01-08'

    smart_average = compute_metrics(date, records_json)
    smart_average = json.loads(smart_average)
    np.testing.assert_almost_equal(smart_average['weightMovingAvg'], 4000, 6)
    np.testing.assert_almost_equal(smart_average['dailyGrowthRate'], 0.0, 4)


In [None]:
# Sanity check for the "gap in data" test above: compounding a daily growth rate of
# ~0.11552 over the 6 days between 2020-01-01 and 2020-01-07 should give a factor
# close to 2 (daily mean 2000 -> 4000).
np.exp(6*.1155245)

In [None]:
# extract dataframe
pen_id, group_id = 59, '59_t3_akpd'
# NOTE: the query text is built with str.format from the two constants above; do not
# feed externally supplied values through this without parameterizing the query.
query = """
    SELECT * FROM
    prod.biomass_computations bc
    WHERE bc.pen_id={0}
    AND (bc.group_id='{1}' OR bc.group_id='{0}')
    AND bc.captured_at between '2019-10-01' and '2020-01-31'
    AND bc.akpd_score > 0.9;
""".format(pen_id, group_id)

df = rds_access_utils.extract_from_database(query)
df = df.sort_values('captured_at')


# collect the individual weight estimates of each day, keyed by 'YYYY-MM-DD'
df.index = pd.to_datetime(df.captured_at)
records = defaultdict(list)
for date in sorted(list(set(df.index.date.astype(str)))):
    # select the day's rows with an explicit label slice; row lookup via df[date]
    # is not supported by current pandas
    records[date].extend(df.loc[date:date].estimated_weight_g.values.tolist())

records_json = json.dumps(records)




In [None]:
# Re-run compute_metrics for every day, using only the records available up to that day.
dates = sorted(list(set(df.index.date.astype(str))))
all_records = json.loads(records_json)  # parse once instead of once per day
raw_avg_weights, raw_sample_sizes, growth_rates, trend_scores, smart_averages, distribution_confidences = [], [], [], [], [], []
for date in dates:
    print(date)
    this_records_json = json.dumps({k: v for k, v in all_records.items() if k <= date})

    # compute_metrics returns a JSON string (or a plain dict when there is no data),
    # not a (raw_avg_weight, raw_sample_size, smart_average, metadata) tuple
    result = compute_metrics(date, this_records_json)
    smart_average = json.loads(result) if isinstance(result, str) else result

    # raw statistics come straight from the day's own records
    todays_weights = all_records.get(date, [])
    raw_sample_size = len(todays_weights)
    raw_avg_weight = float(np.mean(todays_weights)) if todays_weights else np.nan

    growth_rates.append(smart_average['dailyGrowthRate'])
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    # TODO: compute_metrics does not expose the trend score or the distribution
    # confidence in its output; NaN placeholders keep these series aligned with `dates`.
    trend_scores.append(np.nan)
    distribution_confidences.append(np.nan)


In [None]:
# One panel per metric, all sharing the days that have at least one measurement.
fig, axes = plt.subplots(4, 1, figsize=(10, 27))
x_values = df.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights, label='Raw avg. weight')
axes[0].plot(x_values, smart_averages, label='Smart avg. weight')
axes[0].legend()
axes[1].plot(x_values, growth_rates)
axes[2].plot(x_values, trend_scores)
axes[3].plot(x_values, distribution_confidences)
for ax, title in zip(axes, ['Raw avg. weight', 'Growth rate', 'Local trend score', 'Distribution Confidence']):
    # grid(True) switches the grid on explicitly; an argument-less grid() call only
    # toggles it, and the former trailing plt.grid() toggled the last panel's grid off
    ax.grid(True)
    ax.set_title(title)
plt.show()


<h1> Unit Tests </h1>

In [None]:
# Manual check: four sparse days of small weights, evaluated on the last day.
# compute_metrics hands back the smart-average output as a JSON string.
date = '2020-01-07'
records = {
    '2020-01-01': [1, 2, 3],
    '2020-01-04': [5, 2, 3],
    '2020-01-05': [5, 6, 37],
    '2020-01-07': [5, 4, 3]
}
records_json = json.dumps(records)

smart_average = compute_metrics(date, records_json)

In [None]:
# Display the value currently bound to smart_average.
smart_average

In [None]:
# NOTE(review): compute_metrics as defined in this notebook returns only the smart-average
# output (its `return smart_average, metadata` line is commented out), so no cell shown
# here appears to bind `metadata` successfully. Presumably this raises NameError on a
# fresh kernel - confirm, then drop this cell or expose the metadata again.
metadata

In [None]:
# Manual check: a single day of data.
records = {
    '2020-01-01': [1, 2, 3],
}

records_json = json.dumps(records)
date = '2020-01-01'
# compute_metrics returns one value (the JSON-encoded smart-average output), so it
# cannot be unpacked into four names
smart_average = compute_metrics(date, records_json)

In [None]:
# Display the value currently bound to smart_average.
smart_average

In [None]:
# NOTE(review): compute_metrics as defined in this notebook returns only the smart-average
# output (its `return smart_average, metadata` line is commented out), so no cell shown
# here appears to bind `metadata` successfully. Presumably this raises NameError on a
# fresh kernel - confirm, then drop this cell or expose the metadata again.
metadata