In [None]:
import datetime as dt
import json, os
import random
import time
from collections import defaultdict
from multiprocessing import Pool, Manager

import boto3
import cv2
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import torch
from keras.models import load_model
from matplotlib import pyplot as plt

from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.akpd import AKPD
from aquabyte.template_matching import find_matches_and_homography
from aquabyte.biomass_estimator import NormalizeCentered2D, NormalizedStabilityTransform, ToTensor, Network
from aquabyte.akpd_scorer import generate_confidence_score

In [None]:
class AKPD(object):
    """Small client for the ``auto-keypoints`` SageMaker endpoint.

    NOTE: this definition shadows the ``AKPD`` name imported from
    ``aquabyte.akpd`` in the imports cell.
    """

    def __init__(self, aws_credentials):
        """Create the SageMaker runtime client.

        Args:
            aws_credentials: mapping providing ``aws_access_key_id`` and
                ``aws_secret_access_key``.
        """
        self.client = boto3.client(
            "sagemaker-runtime",
            region_name="eu-west-1",
            aws_access_key_id=aws_credentials['aws_access_key_id'],
            aws_secret_access_key=aws_credentials['aws_secret_access_key']
        )

    def predict_keypoints(self, left_crop_url, right_crop_url, left_crop_metadata, right_crop_metadata, camera_metadata):
        """Invoke the endpoint for one stereo crop pair and return the decoded response.

        The five arguments are sent as a one-element JSON list (with a fixed
        ``id`` of 1); every argument must therefore be JSON-serialisable.

        Returns:
            The endpoint's response body, UTF-8 decoded and parsed from JSON.
        """
        body = [{
            'leftCropUrl': left_crop_url,
            'rightCropUrl': right_crop_url,
            'leftCropMetadata': left_crop_metadata,
            'rightCropMetadata': right_crop_metadata,
            'cameraMetadata': camera_metadata,
            'id': 1
        }]

        # json.dumps already emits valid, double-quoted JSON. Rewriting apostrophes
        # afterwards would break any value that contains one (e.g. a URL).
        body_str = json.dumps(body)

        resp = self.client.invoke_endpoint(EndpointName='auto-keypoints', ContentType='application/json', Body=body_str)
        akpd_keypoints_str = resp['Body'].read()
        return json.loads(akpd_keypoints_str.decode("utf-8"))

In [None]:
def _load_json_file(path):
    """Parse the JSON file at ``path``, closing the file handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


s3_access_utils = S3AccessUtils('/root/data')
rds_access_utils = RDSAccessUtils(_load_json_file(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']))

# CSV exports under the pen=61 directory, concatenated in the listed order.
PEN_61_DATA_DIR = '/root/data/alok/biomass_estimation/playground/pen=61'
PEN_61_CSV_NAMES = [
    'data_dump_1.csv',
    'biomass.csv-61-06-from-2019-10-25-to-2019-11-01.csv',
    'biomass.csv-61-07-from-2019-11-01-to-2019-11-08.csv',
    'biomass.csv-61-08-from-2019-11-08-to-2019-11-15.csv',
    'biomass.csv-61-09-from-2019-11-15-to-2019-11-22.csv',
    'biomass.csv-61-10-from-2019-11-22-to-2019-11-29.csv',
    'biomass.csv-61-11-from-2019-11-29-to-2019-12-06.csv',
    'biomass.csv-61-12-from-2019-12-06-to-2019-12-13.csv',
    'biomass.csv-61-13-from-2019-12-13-to-2019-12-20.csv',
    'biomass.csv-61-14-from-2019-12-20-to-2019-12-27.csv',
]
df = pd.concat([pd.read_csv(os.path.join(PEN_61_DATA_DIR, name)) for name in PEN_61_CSV_NAMES])

aws_credentials = _load_json_file(os.environ['AWS_CREDENTIALS'])
akpd = AKPD(aws_credentials)

to_tensor_transform = ToTensor()

# initialize data transforms so that we can run inference with biomass neural network
normalize_centered_2D_transform_biomass = NormalizeCentered2D()
normalized_stability_transform = NormalizedStabilityTransform()

# load neural network weights
# NOTE(review): torch.load unpickles the file; only point it at trusted checkpoints.
biomass_network = torch.load('/root/data/alok/biomass_estimation/results/neural_network/2019-11-08T00:13:09/nn_epoch_798.pb')
akpd_scorer_network = load_model('/root/data/alok/biomass_estimation/playground/akpd_scorer_model_TF.h5') # make this better

<h1> Function to generate weight prediction and confidence score </h1>

In [None]:
def generate_weight_score(row_id, left_crop_url, right_crop_url, left_crop_metadata, right_crop_metadata, akpd_keypoints, cm):
    """Return ``(akpd_score, akpd_weight_prediction)`` for one stereo pair.

    Args:
        row_id: identifier passed on as ``stereo_pair_id``.
        left_crop_url, right_crop_url, left_crop_metadata, right_crop_metadata:
            accepted for interface compatibility; not read by this function.
        akpd_keypoints: keypoints passed on as ``keypoints``.
        cm: camera metadata passed on as ``cm``.

    Returns:
        Tuple of the confidence score from ``generate_confidence_score`` and the
        biomass network output multiplied by 1e4.
    """

    def build_input_sample():
        # A fresh dict per consumer, so neither the scorer nor the transforms
        # can see changes the other made to its input.
        return {
            'keypoints': akpd_keypoints,
            'cm': cm,
            'stereo_pair_id': row_id,
            'single_point_inference': True
        }

    # run AKPD scoring network
    akpd_score = generate_confidence_score(build_input_sample(), akpd_scorer_network)

    # run biomass estimation
    normalized_centered_2D_kps = normalize_centered_2D_transform_biomass(build_input_sample())
    normalized_stability_kps = normalized_stability_transform(normalized_centered_2D_kps)
    tensorized_kps = to_tensor_transform(normalized_stability_kps)
    # presumably 1e4 rescales the network output to grams — TODO confirm
    akpd_weight_prediction = biomass_network(tensorized_kps['kp_input']).item() * 1e4

    return akpd_score, akpd_weight_prediction


In [None]:
# Index rows by capture timestamp; later cells rely on the resulting datetime
# index for resample('D') and for selecting rows by date string.
df.index = pd.to_datetime(df.captured_at)
# Hour of capture, taken from the new index.
df['hour'] = df.index.hour

In [None]:
# Daily mean weight of high-confidence samples: all hours vs. captures between 07:00 and 15:59.
score_mask = df.akpd_score > 0.9
hour_mask = (df.hour >= 7) & (df.hour < 16)
# Aggregate on the resampler object; the ``how=`` keyword of ``resample`` is not
# accepted by current pandas.
tdf1 = df[score_mask].weight.resample('D').mean()
tdf2 = df[score_mask & hour_mask].weight.resample('D').mean()

plt.figure(figsize=(20, 10))
plt.plot(tdf1.index, tdf1.values, label='akpd_score > 0.9')
plt.plot(tdf2.index, tdf2.values, label='akpd_score > 0.9, hours 7-15')
plt.grid()
plt.legend()
plt.title('Biomass progression over time (pen_id = 61)')
plt.xlabel('Date')
plt.ylabel('Avg. Weight')
plt.show()

<h1> Perform Smart Averaging </h1>

In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Fit an exponential growth curve to the daily mean weights.

    A linear regression of log(daily mean) on the number of days since
    ``start_date`` is fitted, each day weighted by its sample size.

    Args:
        tdf: daily mean weights (datetime-indexed series).
        rdf: daily sample sizes, positionally aligned with ``tdf``.
        start_date: 'YYYY-mm-dd' origin for the day offsets.
        end_date: unused by this function.

    Returns:
        ``(growth_rate, trend_score)``: the fitted slope and the weighted
        score of the fit.
    """
    day_fmt = '%Y-%m-%d'
    origin = dt.datetime.strptime(start_date, day_fmt)
    day_offsets = []
    for day_str in tdf.index.date.astype(str):
        day_offsets.append((dt.datetime.strptime(day_str, day_fmt) - origin).days)

    features = np.array(day_offsets).reshape(-1, 1)
    log_means = np.log(tdf.values)
    day_counts = rdf.values

    model = LinearRegression().fit(features, log_means, sample_weight=day_counts)
    slope = model.coef_[0]
    fit_quality = model.score(features, log_means, sample_weight=day_counts)
    return slope, fit_quality


def compute_distribution_confidence(df, start_date, end_date, date):
    """Measure how far one day's weight distribution is from the window's.

    The window ``start_date``..``end_date`` is shifted so that its mean matches
    the mean on ``date``; percentiles 0..99 of both samples are then compared
    (a QQ-plot comparison) and the RMS of the relative deviations over
    percentiles 1..98 is returned.

    Args:
        df: datetime-indexed frame with an ``estimated_weight_g`` column.
        start_date, end_date, date: 'YYYY-mm-dd' strings used to slice ``df``.
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    day_weights = df[date:date].estimated_weight_g

    # align the window's mean with the day's mean before comparing shapes
    shift = day_weights.mean() - window_weights.mean()

    levels = list(range(100))
    window_q = np.percentile(window_weights + shift, levels)
    day_q = np.percentile(day_weights, levels)

    relative_gap = (window_q[1:99] - day_q[1:99]) / day_q[1:99]
    return np.mean(np.square(relative_gap)) ** 0.5


def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum sample size required for a smart average.

    NOTE: the two arguments are currently ignored. How distribution confidence
    and trend score should influence the minimum still needs to be worked out,
    so the value is hardcoded for now.
    """
    hardcoded_minimum = 5000
    return hardcoded_minimum
    
def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Growth-adjusted average weight over a look-back window around ``date``.

    The window is widened one day at a time (up to ``window_size_d - 1`` days
    away from ``date``) until the minimum sample size is reached. Every
    contributing day is scaled by ``exp(days_from_date * growth_rate)``.

    Args:
        df: datetime-indexed frame with an ``estimated_weight_g`` column.
        tdf: daily mean weights for the candidate window.
        rdf: daily sample sizes, positionally aligned with ``tdf``. Not modified.
        date: 'YYYY-mm-dd' target date.
        distribution_confidence, trend_score: forwarded to
            ``compute_minimum_sample_size``.
        growth_rate: daily growth rate; treated as 0.0 when it is None or when
            only one day is available.
        window_size_d: maximum window size in days.
        bucket_size: width (in kg) of the distribution buckets.

    Returns:
        dict with ``weightMovingAvg``, ``weightMovingDist``,
        ``numMovingAvgBatiFish``, ``numMovingAvgLookbackDays`` and
        ``dailyGrowthRate``.

    Raises:
        ValueError: if the window contains no samples.
    """
    day_fmt = '%Y-%m-%d'
    dates = sorted(list(tdf.index.date.astype(str)))
    if growth_rate is None or len(dates) == 1:
        growth_rate = 0.0
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    target = dt.datetime.strptime(date, day_fmt)
    # flat array of signed day distances from `date` (positive = in the past)
    day_offsets = np.array([(target - dt.datetime.strptime(k, day_fmt)).days
                            for k in tdf.index.date.astype(str)])
    daily_means = tdf.values
    # copy: the counts are zeroed below and the caller's series must stay intact
    daily_counts = rdf.values.copy()

    # widen the window until enough samples are covered
    reach = 0
    for reach in range(window_size_d):
        if daily_counts[np.abs(day_offsets) <= reach].sum() >= minimum_sample_size:
            break
    daily_counts[np.abs(day_offsets) > reach] = 0

    weighted_total = 0.0
    sample_size = 0.0
    adj_weights = []
    total_days = 0
    for offset, mean_weight, n, this_date in zip(day_offsets, daily_means, daily_counts, dates):
        factor = np.exp(offset * growth_rate)
        weighted_total += factor * mean_weight * n
        sample_size += n
        if n > 0:
            adj_weights.extend(factor * df[this_date:this_date].estimated_weight_g.values)
            total_days += 1

    if sample_size == 0:
        raise ValueError('no samples available in the look-back window for {}'.format(date))

    smart_average = weighted_total / sample_size

    # histogram of adjusted weights; bucket keys are in kg, weights in g
    adj_weights = np.array(adj_weights)
    distribution = {}
    buckets = [round(x, 1) for x in np.arange(0.0, 1e-3 * adj_weights.max(), bucket_size)]
    for b in buckets:
        low, high = 1e3 * b, 1e3 * (b + bucket_size)
        count = adj_weights[(adj_weights >= low) & (adj_weights < high)].shape[0]
        distribution[b] = count / sample_size

    return {
        'weightMovingAvg': float(smart_average),
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }


def compute_date_range(historical_dates, date, window_size_d):
    """Pick the ``window_size_d``-day window around ``date`` that covers the most data.

    Candidate windows start as a pure look-back ending on ``date`` and are
    shifted forward one day at a time, up to ``window_size_d // 2`` days. The
    window covering the most entries of ``historical_dates`` wins; on ties the
    window shifted furthest forward (more centred on ``date``) is preferred.

    Args:
        historical_dates: list of 'YYYY-mm-dd' strings with available data.
        date: 'YYYY-mm-dd' target date.
        window_size_d: window length in days.

    Returns:
        ``(start_date, end_date)`` as 'YYYY-mm-dd' strings.
    """
    day_fmt = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, day_fmt)
    known_days = np.array(historical_dates)

    best_window = (None, None)
    best_count = 0
    for shift in range(window_size_d // 2 + 1):
        window_start = (anchor - dt.timedelta(days=window_size_d - 1)
                        + dt.timedelta(days=shift)).strftime(day_fmt)
        window_end = (anchor + dt.timedelta(days=shift)).strftime(day_fmt)
        covered = ((known_days >= window_start) & (known_days <= window_end)).sum()
        if covered < best_count:
            continue
        best_window = (window_start, window_end)
        best_count = covered

    return best_window


def compute_metrics(date, records_json, window_size_d=7):
    """Compute raw and smart-averaged weight statistics for ``date``.

    Args:
        date: 'YYYY-mm-dd' target date.
        records_json: JSON string mapping date strings to lists of weights.
        window_size_d: maximum window size in days.

    Returns:
        ``(raw_avg_weight, raw_sample_size, smart_average, metadata)`` where
        ``smart_average`` is the dict produced by ``compute_smart_average`` and
        ``metadata`` holds ``trend_score`` and ``distribution_confidence``
        (both None when the window is a single day).
    """
    weights_by_day = json.loads(records_json)

    # flatten {day: [weights]} into one row per weight
    stamps, weights = [], []
    for day, day_weights in weights_by_day.items():
        stamps.extend([day] * len(day_weights))
        weights.extend(day_weights)
    df = pd.DataFrame(weights, index=pd.to_datetime(stamps), columns=['estimated_weight_g'])

    # get raw statistics
    rows_on_date = df[date:date]
    raw_avg_weight = rows_on_date.estimated_weight_g.mean()
    raw_sample_size = rows_on_date.shape[0]

    # compute relevant date range
    historical_dates = sorted(set(df.index.date.astype(str)))
    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)
    window_by_day = df[start_date:end_date].estimated_weight_g.resample('D')
    rdf = window_by_day.agg(lambda x: x.shape[0])
    tdf = window_by_day.agg(lambda x: x.mean())
    # keep only days that actually have samples
    has_samples = rdf > 0
    tdf = tdf[has_samples].copy(deep=True)
    rdf = rdf[has_samples].copy(deep=True)

    growth_rate = trend_score = distribution_confidence = None
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)

    smart_average = compute_smart_average(df, tdf, rdf, date,
                                          distribution_confidence, growth_rate,
                                          trend_score, window_size_d)
    metadata = {
        'trend_score': trend_score,
        'distribution_confidence': distribution_confidence
    }
    return raw_avg_weight, raw_sample_size, smart_average, metadata

In [None]:
# Build the {date: [weights]} payload consumed by compute_metrics.
df['estimated_weight_g'] = df.weight
records = defaultdict(list)
for date in sorted(list(set(df.index.date.astype(str)))):
    # Select the day's rows with .loc: current pandas treats df['YYYY-MM-DD']
    # as a column lookup, not a row selection.
    records[date].extend(df.loc[date].estimated_weight_g.dropna().values.tolist())

records_json = json.dumps(records)

In [None]:
# Run compute_metrics for every date and collect each statistic in its own list.
dates = sorted(set(df.index.date.astype(str)))
raw_avg_weights, raw_sample_sizes = [], []
growth_rates, trend_scores = [], []
smart_averages, distribution_confidences = [], []
for date in dates:
    print(date)
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics(date, records_json)
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    distribution_confidences.append(metadata['distribution_confidence'])


In [None]:
# Five stacked panels: averages, sample size, growth rate, trend score, distribution instability.
fig, axes = plt.subplots(5, 1, figsize=(10, 30))
x_values = df.estimated_weight_g.resample('D').agg(lambda x: x.mean()).dropna().index
axes[0].plot(x_values, raw_avg_weights, label='Raw Avg.')
axes[0].plot(x_values, smart_averages, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
axes[2].plot(x_values, growth_rates)
axes[3].plot(x_values, trend_scores)
axes[4].plot(x_values, distribution_confidences)

panel_titles = ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']
for ax, title in zip(axes, panel_titles):
    ax.set_title(title)
    ax.grid()
# Only the first two panels contain labelled lines; a legend on the others would be empty.
for ax in axes[:2]:
    ax.legend()
plt.show()

In [None]:
# 3500 divided by the volume of a cylinder of radius 6 and height 12 (pi * r**2 * h).
# NOTE(review): presumably a density; units and the meaning of 3500 are not stated — confirm.
3500 / (np.pi * (6**2) * 12)

In [None]:
# 100000 divided by the volume of a cylinder of radius 25 and height 40 (pi * r**2 * h).
# NOTE(review): presumably a density; units and the meaning of 100000 are not stated — confirm.
100000 / (np.pi * (25**2) * 40)

In [None]:
from collections import defaultdict
import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression

def generate_biomass_estimate(date, records_json, min_lookback=3, max_lookback=5, min_sample_size=3000, bucket_size=0.1):
    """Generate a biomass estimate for ``date`` from a historical record of weight estimates.

    NOTE: a later cell of this notebook defines another function with the same
    name that takes a DataFrame; whichever cell ran last wins.

    Assumptions:
        - records_json is a JSON string encoding a list of objects with the
          keys 'date' ('YYYY-mm-dd') and 'estimatedWeightG'
        - date is of the format 'YYYY-mm-dd' and occurs in the records
        - records_json contains up to 30 days of data. The latest date in
          records_json equals the 'date' input

    The daily growth rate is the slope of a linear fit of log(daily mean weight)
    against the day number (0.0 when only one day is present). The moving
    average looks back from ``date`` until at least ``min_lookback`` days and
    ``min_sample_size`` samples are covered, or ``max_lookback`` days, or the
    first recorded day is reached; older days are scaled by
    ``exp(growth_rate * days_back)``.

    Returns:
        dict with ``weightMovingAvg``, ``weightMovingDist``,
        ``numMovingAvgBatiFish``, ``numMovingAvgLookbackDays`` and
        ``dailyGrowthRate``.

    Raises:
        ValueError: if ``date`` does not occur in the records.
    """
    day_fmt = '%Y-%m-%d'
    records = json.loads(records_json)

    # get list of weights by date
    weights_by_date = defaultdict(list)
    for r in records:
        weights_by_date[r['date']].append(r['estimatedWeightG'])

    # calculate daily growth rate (if more than one day is provided)
    historical_dates = sorted(weights_by_date)
    if len(historical_dates) > 1:
        avg_weights = [np.mean(weights_by_date[h_date]) for h_date in historical_dates]
        # day offsets are measured from the first recorded day of *this* payload,
        # not from any notebook-level variable
        first_day = dt.datetime.strptime(historical_dates[0], day_fmt)
        x_values = [(dt.datetime.strptime(h_date, day_fmt) - first_day).days
                    for h_date in historical_dates]
        X = np.array(x_values).reshape(-1, 1)
        y = np.log(np.array(avg_weights))
        reg = LinearRegression().fit(X, y)
        growth_rate = reg.coef_[0]
    else:
        growth_rate = 0.0

    # calculate moving average weight
    date_idx = historical_dates.index(date)
    lookback, sample_size, adj_weights = 0, 0, []
    while True:
        lookback += 1
        day_weights = weights_by_date[historical_dates[date_idx - lookback + 1]]
        sample_size += len(day_weights)
        adj_weights.extend(np.exp(growth_rate * (lookback - 1)) * np.array(day_weights))
        if date_idx - lookback + 1 == 0:
            # reached the first recorded day
            break
        if (lookback >= min_lookback and sample_size >= min_sample_size) or lookback >= max_lookback:
            break

    adj_weights = np.array(adj_weights)
    ma_weight = adj_weights.sum() / sample_size

    # histogram of adjusted weights; bucket keys are in kg, weights in g
    distribution = {}
    buckets = [round(x, 1) for x in np.arange(0.0, 1e-3 * adj_weights.max(), bucket_size)]
    for b in buckets:
        low, high = 1e3 * b, 1e3 * (b + bucket_size)
        count = adj_weights[(adj_weights >= low) & (adj_weights < high)].shape[0]
        distribution[b] = count / sample_size

    output = {
        'weightMovingAvg': ma_weight,
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': lookback,
        'dailyGrowthRate': growth_rate
    }

    return output
    

In [None]:
# One {'date', 'estimatedWeightG'} record per high-confidence sample.
# Built without iterrows and without printing every row's date, which
# produced one output line per sample.
high_conf_df = df[(df.akpd_score > 0.9)]
elements = [
    {'date': str(stamp.date()), 'estimatedWeightG': weight}
    for stamp, weight in zip(high_conf_df.index, high_conf_df.weight)
]
    

In [None]:
tdf

In [None]:
# Moving-average estimate for every date present in df.
dates = sorted(list(set(df.index.date.astype(str).tolist())))
# serialise once; the payload is identical for every date
elements_json = json.dumps(elements)
weights = []
for date in dates:
    weight = generate_biomass_estimate(date, elements_json)
    print(date, weight['weightMovingAvg'])
    weights.append(weight['weightMovingAvg'])

In [None]:
df[df.akpd_score > 0.9].weight.resample('D').agg(lambda x: x.shape[0])

In [None]:
# Shape of the high-confidence rows captured on 2019-12-01
# (.loc selects rows by date string; df['...'] would be a column lookup).
day_df = df.loc['2019-12-01']
day_df[day_df.akpd_score > 0.9].shape

In [None]:
# Shape of all rows captured on 2019-12-01
# (.loc selects rows by date string; df['...'] would be a column lookup).
df.loc['2019-12-01'].shape

In [None]:
# Moving-average weights against the daily index, leaving out 2019-12-12.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(tdf[tdf.index != '2019-12-12'].index, weights)
myFmt = mdates.DateFormatter('%Y-%m-%d')
# attach the formatter so the tick labels actually use YYYY-MM-DD
ax.xaxis.set_major_formatter(myFmt)
fig.autofmt_xdate()
ax.grid()
plt.show()

In [None]:
# Hour-of-capture column derived from the datetime index
# (the same assignment is already made in the earlier indexing cell).
df['hour'] = df.index.hour

In [None]:
# Mean weight for each hour of the day, across all dates.
mean_weight_by_hour = df.groupby('hour')['weight'].mean()
plt.plot(mean_weight_by_hour)
plt.show()

In [None]:
# Per-date panels: mean weight by hour for high-confidence samples (first 100 dates).
rows, cols = 20, 5
fig, axes = plt.subplots(rows, cols, figsize=(30, 60))
for idx, date in enumerate(dates[:100]):
    row, col = idx // cols, idx % cols
    # .loc selects the day's rows; df[date] would be a column lookup
    day_df = df.loc[date]
    axes[row, col].plot(day_df[day_df.akpd_score > 0.9].groupby('hour')['weight'].mean())
    axes[row, col].set_title(date)
plt.show()

In [None]:
# Per-date panels: number of samples by hour (first 100 dates).
rows, cols = 20, 5
fig, axes = plt.subplots(rows, cols, figsize=(30, 60))
for idx, date in enumerate(dates[:100]):
    row, col = idx // cols, idx % cols
    # .loc selects the day's rows; df[date] would be a column lookup
    axes[row, col].plot(df.loc[date].groupby('hour')['weight'].size())
    axes[row, col].set_title(date)
plt.show()

In [None]:
# Weight histogram for each hour 8..14 on one date (high-confidence samples only);
# each title is that hour's mean weight.
date = '2019-11-10'
hours = list(range(8, 15))
# .loc selects the day's rows; df[date] would be a column lookup
day_df = df.loc[date]
high_conf_day_df = day_df[day_df.akpd_score > 0.9]
for hour in hours:
    hour_weights = high_conf_day_df[high_conf_day_df.hour == hour].weight
    plt.hist(hour_weights)
    plt.title(hour_weights.mean())
    plt.show()


In [None]:
# Daily number of samples with akpd_score > 0.95, divided by 1e5.
mask = df.akpd_score > 0.95
plt.figure(figsize=(20, 10))
# Aggregate on the resampler object; the ``how=`` keyword of ``resample`` is not
# accepted by current pandas.
tdf = df[mask].weight.resample('D').size() / 1e5
plt.plot(tdf.index, tdf.values)
plt.grid()
# NOTE(review): the title and y-label describe average weight, but the plotted
# series is a scaled sample count — confirm which was intended.
plt.title('Biomass progression over time (pen_id = 61)')
plt.xlabel('Date')
plt.ylabel('Avg. Weight')
plt.show()

In [None]:
df.head()

In [None]:
# Daily mean weight over all samples (resampler method instead of the removed ``how=`` keyword).
tdf = df.weight.resample('D').mean()

In [None]:
date = '2019-11-26'

In [None]:
# NOTE(review): `historical_dates` is only assigned inside function bodies in this
# notebook, so this cell presumably relies on leftover kernel state — confirm or remove.
historical_dates

In [None]:
# Daily mean weight over all samples (resampler method instead of the removed ``how=`` keyword).
df.weight.resample('D').mean()

In [None]:
from collections import defaultdict
import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression

def generate_biomass_estimate(date, df, min_lookback=3, max_lookback=5, min_sample_size=3000, bucket_size=0.1):
    """Generate a growth-adjusted moving-average weight for ``date`` from a DataFrame.

    NOTE: this redefines the records-based function of the same name from an
    earlier cell.

    Args:
        date: 'YYYY-mm-dd' target date; must lie within the date span of ``df``.
        df: datetime-indexed frame with a ``weight`` column.
        min_lookback, max_lookback, min_sample_size: look-back stops once at
            least ``min_lookback`` days and ``min_sample_size`` samples are
            covered, or after ``max_lookback`` days, or at the start of the
            regression window.
        bucket_size: unused by this function.

    The growth rate is the slope of a linear fit of log(daily mean weight)
    against the day number over (at most) the 15 days before ``date``. Days
    without a usable mean are left out of the fit, and the rate is 0.0 when
    fewer than two usable days remain.

    Returns:
        dict with ``weightMovingAvg``, ``numMovingAvgBatiFish``,
        ``numMovingAvgLookbackDays`` and ``dailyGrowthRate``.

    Raises:
        ValueError: if ``date`` is outside the date span of ``df``.
    """
    day_fmt = '%Y-%m-%d'

    # daily means on a gap-free daily index (days without rows are NaN)
    tdf = df.weight.resample('D').mean()
    historical_dates = tdf.index.date.astype(str).tolist()
    # raw weights per calendar day; days without rows are simply absent
    weights_by_day = {day: grp.values for day, grp in df.weight.groupby(df.index.date.astype(str))}

    start_date = (dt.datetime.strptime(date, day_fmt) - dt.timedelta(days=15)).strftime(day_fmt)
    if start_date not in historical_dates:
        start_date = historical_dates[0]
    h_dates = historical_dates[historical_dates.index(start_date):historical_dates.index(date) + 1]

    # calculate daily growth rate (if at least two days in the window have data)
    origin = dt.datetime.strptime(start_date, day_fmt)
    offsets = np.array([(dt.datetime.strptime(h_date, day_fmt) - origin).days for h_date in h_dates])
    log_means = np.log(tdf[start_date:date].values)
    usable = np.isfinite(log_means)
    if usable.sum() > 1:
        reg = LinearRegression().fit(offsets[usable].reshape(-1, 1), log_means[usable])
        growth_rate = reg.coef_[0]
    else:
        growth_rate = 0.0

    # calculate moving average weight
    date_idx = h_dates.index(date)
    lookback, sample_size, adj_weights = 0, 0, []
    no_rows = np.array([])
    while True:
        lookback += 1
        day_weights = weights_by_day.get(h_dates[date_idx - lookback + 1], no_rows)
        sample_size += len(day_weights)
        adj_weights.extend(np.exp(growth_rate * (lookback - 1)) * day_weights)
        if date_idx - lookback + 1 == 0:
            # reached the start of the window
            break
        if (lookback >= min_lookback and sample_size >= min_sample_size) or lookback >= max_lookback:
            break

    ma_weight = np.nanmean(adj_weights)

    output = {
        'weightMovingAvg': ma_weight,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': lookback,
        'dailyGrowthRate': growth_rate
    }

    return output
    
    
    

In [None]:
generate_biomass_estimate('2019-12-05', df)

In [None]:
# Moving-average estimate for every day in tdf; None marks days that could not be estimated.
dates = tdf.index.date.astype(str).tolist()
weights = []
for date in dates:
    try:
        weight = generate_biomass_estimate(date, df, min_sample_size=3000)['weightMovingAvg']
        weights.append(weight)
    except Exception as err:
        # best effort: keep going, but say why the day was skipped and do not
        # swallow KeyboardInterrupt / SystemExit like a bare ``except`` would
        print('no estimate for {}: {!r}'.format(date, err))
        weights.append(None)
    

In [None]:
# Top: daily averages vs. growth-rate-adjusted moving average. Bottom: daily sample sizes.
fig, ax = plt.subplots(2, 1, figsize=(15, 10))
# Aggregate on the resampler object; the ``how=`` keyword of ``resample`` is not
# accepted by current pandas.
# NOTE(review): this cell reads `akpd_weight` while the other cells read `weight` — confirm the column.
high_conf_daily = df[(df.akpd_score > 0.9)].akpd_weight.resample('D')
tdf = high_conf_daily.mean()
rdf = high_conf_daily.size()
ax[0].plot(dates, tdf.dropna().values, label='daily averages', color='blue')
ax[0].plot(dates, weights, label='growth-rate-adjusted moving average', color='red')
ax[0].grid()
ax[0].legend()
ax[0].set_title('Vikane Pen ID 60 Biomass Progression')
ax[1].plot(rdf.index, rdf.values)
ax[1].grid()
ax[1].set_title('Vikane Pen ID 60 Sample Sizes')
plt.show()

In [None]:
# Histogram of predicted weights for rows whose score exceeds 0.8.
# NOTE(review): this cell filters on `score` while the other cells use `akpd_score` — confirm the column.
mask = df.score > 0.8
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(df[mask].weight, bins=10)
ax.set_xlabel('Predicted weight (grams)')
ax.set_ylabel('Frequency')
ax.set_title('Predicted weight distribution for IMR - 11/09-11/13')
ax.grid()
plt.show()

In [None]:
def display_crops(left_image_f, right_image_f, left_keypoints, right_keypoints, side='both', overlay_keypoints=True, show_labels=False):
    """Show the left and/or right crop image, optionally with keypoints drawn on top.

    Args:
        left_image_f, right_image_f: image files readable by ``plt.imread``.
        left_keypoints, right_keypoints: mapping of keypoint name to a sequence
            whose first two items are the x and y position in the crop.
        side: 'left', 'right' or 'both' (two stacked panels).
        overlay_keypoints: draw a translucent red marker per keypoint.
        show_labels: additionally annotate each marker with its name.

    Raises:
        AssertionError: if ``side`` is not one of the three accepted values.
    """
    assert side in ('left', 'right', 'both'), \
        'Invalid side value: {}'.format(side)

    def draw_panel(axis, image_path, keypoints):
        # image first, then (optionally) the keypoint markers and labels
        axis.imshow(plt.imread(image_path))
        if not overlay_keypoints:
            return
        for body_part, kp in keypoints.items():
            kp_x, kp_y = kp[0], kp[1]
            axis.scatter([kp_x], [kp_y], color='red', s=200, alpha=0.3)
            if show_labels:
                axis.annotate(body_part, (kp_x, kp_y), color='red')

    if side == 'both':
        fig, axes = plt.subplots(2, 1, figsize=(20, 20))
        draw_panel(axes[0], left_image_f, left_keypoints)
        draw_panel(axes[1], right_image_f, right_keypoints)
    else:
        fig, ax = plt.subplots(figsize=(20, 10))
        if side == 'left':
            draw_panel(ax, left_image_f, left_keypoints)
        else:
            draw_panel(ax, right_image_f, right_keypoints)
    plt.show()

In [None]:
# Inspect one sample: the idx-th heaviest row among those selected by `mask`.
idx = 24
row = df[mask].sort_values('weight', ascending=False).iloc[idx]

left_crop_url, right_crop_url = row.left_crop_url, row.right_crop_url
left_image_f, _, _ = s3_access_utils.download_from_url(left_crop_url)
right_image_f, _, _ = s3_access_utils.download_from_url(right_crop_url)
left_crop_metadata, right_crop_metadata = row.left_crop_metadata, row.right_crop_metadata
cm = row.camera_metadata
row_id = idx

# run AKPD (predict_keypoints requires the camera metadata as its fifth argument)
akpd_keypoints = akpd.predict_keypoints(left_crop_url, right_crop_url, left_crop_metadata, right_crop_metadata, cm)
left_keypoints = {item['keypointType']: [item['xCrop'], item['yCrop']] for item in akpd_keypoints[0]['leftCrop']}
right_keypoints = {item['keypointType']: [item['xCrop'], item['yCrop']] for item in akpd_keypoints[0]['rightCrop']}

# run AKPD scoring network through the same helper generate_weight_score uses;
# the scorer is a Keras model and no AKPD-specific transforms are defined in this notebook
input_sample = {
    'keypoints': akpd_keypoints[0],
    'cm': cm,
    'stereo_pair_id': row_id,
    'single_point_inference': True
}
score = generate_confidence_score(input_sample, akpd_scorer_network)
display_crops(left_image_f, right_image_f, left_keypoints, right_keypoints, show_labels=True)


In [None]:
left_crop_url

In [None]:
# Solve 3250 = 2070 * (1 + r)**60 for the per-period growth rate r.
# (The equation itself is not valid Python: a literal cannot be assigned to.)
r = (3250 / 2070) ** (1 / 60) - 1
r

In [None]:
# Per-period growth rate r that satisfies 3250 = 2070 * (1 + r)**63.
((3250/2070)**(1/63.0)) - 1

In [None]:
# 2070 compounded at 0.718% per period for 51 periods.
2070*(1.00718)**(51)

In [None]:
# Synthetic population: each integer weight 0..49 repeated 100 times (5000 values).
ground_truth_weights = [weight for weight in range(50) for _ in range(100)]

In [None]:
# Relative error of the sample mean for increasing sample sizes.
sample_sizes = np.arange(1000, 200000, 1000)
err_pcts = []
for sample_size in sample_sizes:
    # one batch of independent draws with replacement, equivalent to repeating
    # random.sample(population, 1)[0] sample_size times
    samples = random.choices(ground_truth_weights, k=int(sample_size))
    # NOTE: the reference value 4.5 is kept as-is because the following cell
    # re-centres these errors onto 24.5.
    err_pct = (np.mean(samples) - 4.5) / 4.5
    err_pcts.append(err_pct)
    
    

In [None]:
# Re-express the errors relative to 24.5 instead of 4.5:
# err_pcts * 4.5 + 4.5 recovers each sample mean, which is then compared against 24.5
# (the mean of ground_truth_weights, i.e. of 0..49).
x = (np.array(err_pcts) * 4.5 + 4.5 - 24.5) / 24.5

In [None]:
# Relative error of the sample mean as a function of sample size.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(sample_sizes, x)
plt.show()

In [None]:
# Daily mean weight and daily sample size for high-confidence samples
# captured between 2019-11-01 and 2019-11-08.
score_mask = df.akpd_score > 0.9
date_mask = (df.captured_at > '2019-11-01') & (df.captured_at < '2019-11-08')
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
# Aggregate on the resampler object; the ``how=`` keyword of ``resample`` is not
# accepted by current pandas.
selected_daily = df[date_mask & score_mask].weight.resample('D')
tdf = selected_daily.mean()
rdf = selected_daily.size()
axes[0].plot(tdf.index, tdf.values)
axes[0].grid()
axes[0].set_title('Biomass progression over time (pen_id = 61)')
axes[0].set_xlabel('Date')
axes[0].set_ylabel('Avg. Weight')
axes[1].plot(rdf.index, rdf.values)
axes[1].grid()
axes[1].set_title('Daily Sample Size over time (pen_id = 61)')
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Num. Samples')
plt.show()

In [None]:
# Project the daily mean weight from end_date to harvest_date and compare it
# with the ground-truth weight.
fmt = '%Y-%m-%d'
start_date, end_date = '2019-11-02', '2019-11-07'
harvest_date = '2019-11-16'
gt_weight = 3370

start_weight, end_weight = tdf[start_date], tdf[end_date]
days = (dt.datetime.strptime(end_date, fmt) - dt.datetime.strptime(start_date, fmt)).days
days_to_harvest = (dt.datetime.strptime(harvest_date, fmt) - dt.datetime.strptime(end_date, fmt)).days

# NOTE(review): the rate is a simple per-day average of the relative change,
# but it is applied as a compounding rate below — confirm this is intended.
growth_rate = ((end_weight - start_weight) / start_weight) / days
projected_weight = end_weight * (1 + growth_rate)**days_to_harvest
err_pct = (projected_weight - gt_weight) / gt_weight
print('Error percentage: {}'.format(err_pct))

In [None]:
# For each date: QQ scatter and overlaid histograms of that day's weights
# against the preceding window of N+1 dates (high-confidence samples only).
score_mask = df.akpd_score > 0.9
scored_df = df[score_mask]
dates = sorted(list(set(df.index.date.astype(str))))
N = 4
percentile_levels = list(range(100))
for idx in range(len(dates)-N-1):
    start_date, end_date, date = dates[idx], dates[idx+N], dates[idx+N+1]
    print(start_date, end_date, date)
    window_weights = scored_df[start_date:end_date].weight
    # .loc selects the day's rows; indexing with [date] would be a column lookup
    day_weights = scored_df.loc[date].weight
    x = np.percentile(window_weights, percentile_levels)
    y = np.percentile(day_weights, percentile_levels)
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    axes[0].scatter(x[1:99], y[1:99])
    axes[0].set_title(date)
    axes[0].grid()

    # histogram range comes from all samples in the span, not only the high-confidence ones
    lower_bound = int(df[start_date:date].weight.min() * 0.8)
    upper_bound = int(df[start_date:date].weight.max() * 1.2)
    bins = list(np.arange(lower_bound, upper_bound, 300))

    axes[1].hist(window_weights, bins=bins, color='blue', alpha=0.5, density=True)
    axes[1].hist(day_weights, bins=bins, color='red', alpha=0.5, density=True)
    axes[1].set_title(date)
    axes[1].grid()
    plt.show()
    # release the figure so that memory does not grow with the number of dates
    plt.close(fig)