In [None]:
import json, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point

In [None]:
from collections import defaultdict
import json, os
import cv2
import torch
import pandas as pd
from multiprocessing import Pool, Manager
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from keras.models import load_model
import boto3


import numpy as np
import plotly.express as px
import time
from matplotlib import pyplot as plt
from copy import copy


import datetime as dt
import json
import numpy as np
from sklearn.linear_model import LinearRegression
from collections import defaultdict



In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# compute daily growth rate via fitting an exponential curve,
# weighting each day by its sample size
def compute_growth_rate(tdf, rdf, start_date, end_date):
    """Fit a weighted log-linear trend to the daily values in ``tdf``.

    Args:
        tdf: Series indexed by a DatetimeIndex; its values are log-transformed
            and used as the regression target.
        rdf: Series aligned with ``tdf``; its values are the per-day
            regression weights (sample sizes, per the callers in this file).
        start_date: 'YYYY-MM-DD' string; day offsets are measured from it.
        end_date: accepted for signature symmetry but not used.

    Returns:
        (growth_rate, trend_score): the fitted slope per day of log-weight
        and the weighted R^2 of the fit.
    """
    fmt = '%Y-%m-%d'
    origin = dt.datetime.strptime(start_date, fmt)

    # Regressor: whole days elapsed since start_date for every row of tdf.
    day_offsets = []
    for day_str in tdf.index.date.astype(str):
        day_offsets.append((dt.datetime.strptime(day_str, fmt) - origin).days)
    X = np.array(day_offsets).reshape(-1, 1)

    # Exponential growth is linear in log space.
    log_values = np.log(tdf.values)
    day_weights = rdf.values

    model = LinearRegression()
    model.fit(X, log_values, sample_weight=day_weights)
    return model.coef_[0], model.score(X, log_values, sample_weight=day_weights)


# compute distribution confidence via looking at RMS of percent deviations for qq plot
# of today's distribution against distribution in the remainder of the window
def compute_distribution_confidence(df, start_date, end_date, date):
    """RMS relative deviation between today's and the window's weight quantiles.

    The window distribution is shifted so its mean matches the mean of the
    ``date`` slice, then integer percentiles 0..99 of both are compared
    (q-q style); percentiles 1..98 enter the RMS.

    Args:
        df: frame with a DatetimeIndex and an ``estimated_weight_g`` column.
        start_date, end_date: bounds of the window slice (inclusive labels).
        date: the day whose distribution is compared against the window.

    Returns:
        The root-mean-square of ``(window_q - today_q) / today_q``.
    """
    window_weights = df[start_date:end_date].estimated_weight_g
    today_weights = df[date:date].estimated_weight_g

    # Align the two distributions on their means before comparing shapes.
    shift = today_weights.mean() - window_weights.mean()

    quantiles = list(range(100))
    window_q = np.percentile(window_weights + shift, quantiles)
    today_q = np.percentile(today_weights, quantiles)

    relative_dev = (window_q[1:99] - today_q[1:99]) / today_q[1:99]
    return np.mean(np.square(relative_dev)) ** 0.5


# NOTE: we need to think more carefully about this to understand how distribution 
# confidence and trend score affect the minimum sample size we want. Hardcoded for now. 
def compute_minimum_sample_size(distribution_confidence, trend_score):
    """Return the minimum number of samples the smart average should cover.

    Both arguments are currently ignored; the threshold is hardcoded.
    """
    return 5000


def compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                          trend_score, window_size_d, bucket_size=0.1):
    """Growth-adjusted, sample-weighted average weight around ``date``.

    The lookback starts at 3 days and widens up to ``window_size_d - 1`` days
    until the retained days hold at least the minimum sample size. Every
    retained day is scaled by ``exp(days_from_date * growth_rate)`` so that
    all samples are expressed at ``date``.

    Args:
        df: frame with a DatetimeIndex and an ``estimated_weight_g`` column.
        tdf: Series of daily mean weights, indexed by day.
        rdf: Series of daily sample counts aligned with ``tdf``. It is not
            modified by this function.
        date: 'YYYY-MM-DD' string of the day being evaluated.
        distribution_confidence, trend_score: forwarded to
            ``compute_minimum_sample_size``.
        growth_rate: daily growth rate; treated as 0.0 when ``None`` or when
            only one day is available.
        window_size_d: maximum window size in days.
        bucket_size: histogram bucket width in kg. Bucket keys are rounded to
            one decimal, which matches the default width of 0.1.

    Returns:
        dict with keys ``weightMovingAvg``, ``weightMovingDist``,
        ``numMovingAvgBatiFish``, ``numMovingAvgLookbackDays`` and
        ``dailyGrowthRate``.

    Raises:
        ValueError: if no samples fall inside the lookback window.
    """
    fmt = '%Y-%m-%d'
    # Day labels are taken in tdf's own order so they stay aligned with the
    # offsets, means and counts below even if the index is not sorted.
    day_strs = [str(d) for d in tdf.index.date.astype(str)]
    if growth_rate is None or len(day_strs) == 1:
        growth_rate = 0.0
    minimum_sample_size = compute_minimum_sample_size(distribution_confidence, trend_score)

    target_day = dt.datetime.strptime(date, fmt)
    day_offsets = np.array([(target_day - dt.datetime.strptime(k, fmt)).days
                            for k in day_strs])
    abs_offsets = np.abs(day_offsets)
    daily_means = tdf.values
    # Work on a copy: days outside the lookback are zeroed below and the
    # caller's rdf must not be altered (its buffer may also be read-only).
    daily_counts = rdf.values.copy()

    # Default to the widest lookback; this also covers window_size_d <= 3,
    # where the search loop below does not run at all.
    lookback_d = max(window_size_d - 1, 0)
    for candidate_d in range(3, window_size_d):
        if daily_counts[abs_offsets <= candidate_d].sum() >= minimum_sample_size:
            lookback_d = candidate_d
            break
    daily_counts[abs_offsets > lookback_d] = 0

    weighted_sum = 0.0
    sample_size = 0.0
    adj_weights = []
    total_days = 0
    for offset, day_mean, n, day in zip(day_offsets, daily_means, daily_counts, day_strs):
        growth_factor = np.exp(offset * growth_rate)
        weighted_sum += growth_factor * day_mean * n
        sample_size += n
        if n > 0:
            adj_weights.extend(growth_factor * df[day:day].estimated_weight_g.values)
            total_days += 1

    if sample_size == 0:
        raise ValueError('no samples within the lookback window around %s' % date)
    smart_average = weighted_sum / sample_size

    # Histogram of growth-adjusted weights. Bucket k spans
    # [k * width, (k + 1) * width) grams, so the maximum weight always lands
    # in a bucket, including when it sits exactly on a bucket boundary.
    adj_weights = np.array(adj_weights)
    bucket_width_g = 1e3 * bucket_size
    bucket_idx = np.floor(adj_weights / bucket_width_g).astype(int)
    bucket_counts = np.bincount(bucket_idx[bucket_idx >= 0])
    distribution = {round(k * bucket_size, 1): count / sample_size
                    for k, count in enumerate(bucket_counts)}

    output = {
        'weightMovingAvg': float(smart_average),
        'weightMovingDist': distribution,
        'numMovingAvgBatiFish': sample_size,
        'numMovingAvgLookbackDays': total_days,
        'dailyGrowthRate': growth_rate
    }

    return output


def compute_smart_average_new(df, tdf, rdf, date, distribution_confidence, growth_rate,
                              trend_score, window_size_d, bucket_size=0.1):
    """Alias of ``compute_smart_average``.

    The two implementations were identical; this wrapper keeps the name
    available to existing callers while sharing a single code path.
    """
    return compute_smart_average(df, tdf, rdf, date, distribution_confidence, growth_rate,
                                 trend_score, window_size_d, bucket_size=bucket_size)

# generate date range given current date and window size. If future data
# is available relative to current date, windows where the current date
# is centered are preferred
def compute_date_range(historical_dates, date, window_size_d):
    """Pick the ``window_size_d``-day window around ``date`` with most data.

    Candidate windows start as a pure lookback ending on ``date`` and are
    shifted forward one day at a time, up to ``window_size_d // 2`` days.
    The window covering the most entries of ``historical_dates`` wins; on a
    tie the later (more centered) candidate replaces the earlier one.

    Args:
        historical_dates: 'YYYY-MM-DD' strings of days that have data.
        date: 'YYYY-MM-DD' string of the current day.
        window_size_d: window length in days.

    Returns:
        (start_date, end_date) as 'YYYY-MM-DD' strings.
    """
    fmt = '%Y-%m-%d'
    anchor = dt.datetime.strptime(date, fmt)
    known_days = np.array(historical_dates)

    best_range = (None, None)
    best_count = 0
    for shift in range(window_size_d // 2 + 1):
        window_end = anchor + dt.timedelta(days=shift)
        window_start = window_end - dt.timedelta(days=window_size_d - 1)
        lo, hi = window_start.strftime(fmt), window_end.strftime(fmt)
        # ISO date strings order lexicographically the same as chronologically.
        covered = np.logical_and(known_days >= lo, known_days <= hi).sum()
        if covered >= best_count:
            best_range = (lo, hi)
            best_count = covered

    return best_range


def get_new_average(df, num_bins=50, depth_bandwidth=0.1):
    """Average weight re-weighted by the peak depth density of each weight bin.

    Weights are split into ``num_bins`` equal-width bins. For every bin, a sum
    of Gaussian kernels over the depths of its fish is evaluated on a grid of
    100 points between 0 and 2, and the peak of that curve becomes the bin's
    weight in the final average of per-bin mean weights.

    Args:
        df: frame with ``estimated_weight_g`` and ``depth`` columns.
        num_bins: number of equal-width weight bins.
        depth_bandwidth: scale of the Gaussian kernel applied to depth.

    Returns:
        The density-weighted average weight, or NaN when every bin has zero
        density.
    """
    weights = df['estimated_weight_g'].values
    fish_depths = df['depth'].values

    # Only the bin edges are needed, so compute them without drawing a
    # figure and without touching pyplot's global state.
    _, bins = np.histogram(weights, bins=num_bins)
    bin_count = len(bins) - 1

    x_d = np.linspace(0, 2, 100)

    average_weights = np.zeros(bin_count)
    max_densities = np.zeros(bin_count)

    for index in range(bin_count):
        lower_edge, upper_edge = bins[index], bins[index + 1]
        if index == bin_count - 1:
            # The last histogram bin is closed on the right; without this
            # the heaviest fish would belong to no bin at all.
            in_bin = (weights >= lower_edge) & (weights <= upper_edge)
        else:
            in_bin = (weights >= lower_edge) & (weights < upper_edge)
        if not in_bin.any():
            # Empty bins contribute zero density and zero weight.
            continue

        bin_depths = fish_depths[in_bin]
        density = norm.pdf((x_d[None, :] - bin_depths[:, None]) / depth_bandwidth).sum(axis=0)

        average_weights[index] = np.mean(weights[in_bin])
        max_densities[index] = np.max(density)

    total_density = np.sum(max_densities)
    if total_density == 0:
        return np.nan
    return np.sum(max_densities * average_weights) / total_density

def compute_metrics_new(date, df, window_size_d=7):
    """Compute raw and smart-average weight metrics for one day.

    Args:
        date: 'YYYY-MM-DD' string of the day to evaluate.
        df: frame with a DatetimeIndex and an ``estimated_weight_g`` column.
        window_size_d: size in days of the window used for the trend fit
            and the smart average.

    Returns:
        (raw_avg_weight, raw_sample_size, smart_average, metadata) where
        ``smart_average`` is the dict built by ``compute_smart_average`` and
        ``metadata`` holds ``trend_score`` and ``distribution_confidence``
        (both ``None`` when the window does not span more than one day).
    """
    # get raw statistics
    day_df = df[date:date]
    raw_avg_weight = day_df.estimated_weight_g.mean()
    raw_sample_size = day_df.shape[0]

    # compute relevant date range
    historical_dates = sorted(set(df.index.date.astype(str)))
    start_date, end_date = compute_date_range(historical_dates, date, window_size_d)

    # daily sample counts (rdf) and daily mean weights (tdf) inside the
    # window, restricted to days that actually have samples
    daily = df[start_date:end_date].estimated_weight_g.resample('D')
    rdf = daily.size()
    tdf = daily.mean()
    has_samples = rdf > 0
    tdf = tdf[has_samples].copy(deep=True)
    rdf = rdf[has_samples].copy(deep=True)

    growth_rate, trend_score, distribution_confidence = None, None, None
    if start_date < end_date:
        growth_rate, trend_score = compute_growth_rate(tdf, rdf, start_date, end_date)
        distribution_confidence = compute_distribution_confidence(df, start_date, end_date, date)
    smart_average = compute_smart_average(df, tdf, rdf, date,
                                          distribution_confidence, growth_rate,
                                          trend_score, window_size_d)
    metadata = {
        'trend_score': trend_score,
        'distribution_confidence': distribution_confidence
    }

    return raw_avg_weight, raw_sample_size, smart_average, metadata

def compute_metrics(date, records_json, window_size_d=7):
    """Compute weight metrics for one day from JSON-encoded daily records.

    Args:
        date: 'YYYY-MM-DD' string of the day to evaluate.
        records_json: JSON object mapping a date string to a list of
            estimated weights in grams.
        window_size_d: window size in days, forwarded unchanged.

    Returns:
        The same (raw_avg_weight, raw_sample_size, smart_average, metadata)
        tuple as ``compute_metrics_new``.
    """
    records = json.loads(records_json)

    # Flatten {date: [weights]} into one row per weight.
    dts = [iter_date for iter_date, day_vals in records.items() for _ in day_vals]
    vals = [val for day_vals in records.values() for val in day_vals]

    df = pd.DataFrame(vals, index=pd.to_datetime(dts), columns=['estimated_weight_g'])
    # Date-string slicing needs a monotonic index; a stable sort keeps the
    # within-day order of the records.
    df = df.sort_index(kind='mergesort')

    # The records carry no depth information, so the depth-weighted average
    # cannot be computed here (it failed on the missing 'depth' column); the
    # shared implementation uses the plain daily mean instead.
    return compute_metrics_new(date, df, window_size_d)

In [None]:
# # df = pd.concat([
# #     pd.read_csv('/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-05,2019-06-12).csv'),
# #     pd.read_csv('/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-12,2019-06-19).csv'),
# #     pd.read_csv('/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-19,2019-06-26).csv'),
# #     pd.read_csv('/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-06-26,2019-07-03).csv'),
# #     pd.read_csv('/root/data/alok/biomass_estimation/playground/output-pen=5/biomass_output,pen=5,range=(2019-07-03,2019-07-04).csv')
# # ])
# rds_access_utils = RDSAccessUtils(json.load(open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS'])))
# query = """
#     SELECT * FROM (
#       (SELECT * FROM prod.crop_annotation cas
#       INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
#       WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
#       AND cas.annotation_state_id = 3
#       AND cas.pen_id=5) a
#     RIGHT JOIN 
#       (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
#       WHERE prod.biomass_computations.captured_at between '2019-06-05' and '2019-07-05'
#       AND prod.biomass_computations.akpd_score > 0.9) bc 
#     ON 
#       (a.left_crop_url=bc.left_crop_url)
#     ) x
#     WHERE x.captured_at between '2019-06-05' and '2019-07-05'
#     AND x.pen_id = 5
#     AND x.group_id = '5';
# """
# df = rds_access_utils.extract_from_database(query)

# df = df.sort_values('captured_at')
# # df['estimated_weight_g'] = df.weight
# df = df[df.akpd_score > 0.9].copy(deep=True)
# df.index = pd.to_datetime(df.captured_at)
# df['hour'] = df.index.hour

# depths = []
# for idx, row in df.iterrows():
#     ann, cm = row.annotation, row.camera_metadata
#     wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
#     depth = np.median([wkp[1] for wkp in wkps.values()])
#     depths.append(depth)
# df['depth'] = depths

# hour_mask = (df.hour > 7) & (df.hour < 16)
# df = df[hour_mask].copy(deep=True)

# # get daily averages and sample sizes

# # records = defaultdict(list)
# # for date in sorted(list(set(df.index.date.astype(str)))):
# #     records[date].extend(df[date].weight.values.tolist())
    
# # records_json = json.dumps(records)




In [None]:
# Read the warehouse credentials from the JSON file named by the environment
# variable; the context manager makes sure the file handle is closed.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# Annotated BATI crops for pen 66 joined with their biomass computations,
# restricted to 2020-06-01 .. 2020-06-11 and akpd_score > 0.99.
query = """
    SELECT * FROM (
      (SELECT * FROM prod.crop_annotation cas
      INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
      WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
      AND cas.annotation_state_id = 3
      AND cas.pen_id=66) a
    RIGHT JOIN 
      (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
      WHERE prod.biomass_computations.captured_at between '2020-06-01' and '2020-06-11'
      AND prod.biomass_computations.akpd_score > 0.99) bc 
    ON 
      (a.left_crop_url=bc.left_crop_url)
    ) x
    WHERE x.captured_at between '2020-06-01' and '2020-06-11'
    AND x.pen_id = 66
    AND x.group_id = '66';
"""
df = rds_access_utils.extract_from_database(query)

# Order chronologically and index by capture time so the date-string slicing
# used by the metric functions works.
df = df.sort_values('captured_at')
df = df[df.akpd_score > 0.99].copy(deep=True)
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour

# Per-fish depth: median of component 1 of the world keypoints.
# NOTE(review): assumes index 1 of each world keypoint is the depth axis - confirm against pixel2world.
depths = []
for idx, row in df.iterrows():
    ann, cm = row.annotation, row.camera_metadata
    wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
    depth = np.median([wkp[1] for wkp in wkps.values()])
    depths.append(depth)
df['depth'] = depths

# hour_mask = (df.hour > 7) & (df.hour < 16)
# df = df[hour_mask].copy(deep=True)

In [None]:
# Scratch check of the DataFrame constructor: each inner list becomes one
# row, so this shows rows [1, 2] and [2, 3] under the columns 'a' and 'b'.
a  = [ 1, 2]
b = [2, 3]

pd.DataFrame([a, b], columns= ['a', 'b'])

In [None]:
# Run the metric computation once per day present in df and collect the
# per-day results in parallel lists.
dates = sorted(list(set(df.index.date.astype(str))))
raw_avg_weights, raw_sample_sizes, growth_rates, trend_scores, smart_averages, distribution_confidences = [], [], [], [], [], []
for date in dates:
    raw_avg_weight, raw_sample_size, smart_average, metadata = compute_metrics_new(date, df)
    growth_rates.append(smart_average['dailyGrowthRate'])
    trend_scores.append(metadata['trend_score'])
    raw_avg_weights.append(raw_avg_weight)
    raw_sample_sizes.append(raw_sample_size)
    smart_averages.append(smart_average['weightMovingAvg'])
    distribution_confidences.append(metadata['distribution_confidence'])

fig, axes = plt.subplots(5, 1, figsize=(10, 20))
# Take the x axis from the same `dates` list the loop iterated over, so x and
# y always have the same length (a separately resampled index can drop days).
x_values = pd.to_datetime(dates)
axes[0].plot(x_values, raw_avg_weights, label='Raw Avg.')
axes[0].plot(x_values, smart_averages, label='Smart Avg.')
axes[0].plot(x_values, 1.02 * np.array(smart_averages), color='red', linestyle='--', label='Smart Avg. +/-2%')
axes[0].plot(x_values, 0.98 * np.array(smart_averages), color='red', linestyle='--')
axes[1].plot(x_values, raw_sample_sizes, label='Raw Daily Sample Size')
# Every panel gets a labelled line so legend() below has something to show.
axes[2].plot(x_values, growth_rates, label='Daily growth rate')
axes[3].plot(x_values, trend_scores, label='Trend score')
axes[4].plot(x_values, distribution_confidences, label='Distribution instability')
for i, title in zip([0, 1, 2, 3, 4], ['Avg. weight', 'Raw Sample Size', 'Growth rate', 'Local trend score', 'Distribution Instability']):
    axes[i].set_title(title)
    axes[i].grid()
    axes[i].legend()
plt.show()

In [None]:
# Display the per-date smart averages collected by the metrics loop.
smart_averages

In [None]:
%%javascript
// Replace the output area's scroll check with one that always answers false.
// NOTE(review): presumably this keeps tall outputs (the multi-panel figures) fully expanded instead of boxed - confirm against the notebook frontend in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# extract dataframe
# s3_access_utils = S3AccessUtils('/root/data')
# rds_access_utils = RDSAccessUtils(json.load(open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS'])))
# pen_id, group_id = 5, '5'
# query = """
#     SELECT * FROM
#     prod.biomass_computations bc
#     WHERE bc.pen_id={0}
#     AND bc.group_id='{1}';
# """.format(pen_id, group_id)
# query = """
#     SELECT * FROM (
#       (SELECT * FROM prod.crop_annotation cas
#       INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
#       WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
#       AND cas.annotation_state_id = 3
#       AND cas.pen_id=5) a
#     RIGHT JOIN 
#       (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
#       WHERE prod.biomass_computations.captured_at between '2019-07-02 00:00' and '2019-07-03 00:00'
#       AND prod.biomass_computations.akpd_score > 0.9) bc 
#     ON 
#       (a.left_crop_url=bc.left_crop_url)
#     ) x
#     WHERE x.captured_at between '2019-07-02 00:00' and '2019-07-03 00:00'
#     AND x.pen_id = 5
#     AND x.group_id = '5';
# """

# Parameters of the extract: pen, capture-time range and minimum AKPD score.
pen_id = 83
start_date = '2020-05-01 00:00'
end_date = '2020-05-02 00:00'
akpd_filter = 0.99

# Same join as the fixed-pen query, with the parameters substituted in.
# The group id is filled with pen_id as well.
query = """
    SELECT * FROM (
      (SELECT * FROM prod.crop_annotation cas
      INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
      WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
      AND cas.annotation_state_id = 3
      AND cas.pen_id=%i) a
    RIGHT JOIN 
      (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
      WHERE prod.biomass_computations.captured_at >= '%s'
      AND prod.biomass_computations.captured_at <= '%s'
      AND prod.biomass_computations.akpd_score > %0.2f) bc 
    ON 
      (a.left_crop_url=bc.left_crop_url)
    ) x
    WHERE x.captured_at >= '%s'
    AND x.captured_at <= '%s'
    AND x.pen_id = %i
    AND x.group_id = '%i';
""" % (pen_id, start_date, end_date, akpd_filter, start_date, end_date, pen_id, pen_id)

df = rds_access_utils.extract_from_database(query)
df = df.sort_values('captured_at')
# Apply the same threshold as the query instead of a separate hardcoded one.
df = df[df.akpd_score > akpd_filter].copy(deep=True)
df.index = pd.to_datetime(df.captured_at)
df['hour'] = df.index.hour

# Per-fish depth: median of component 1 of the world keypoints.
# NOTE(review): assumes index 1 of each world keypoint is the depth axis - confirm against pixel2world.
depths = []
for idx, row in df.iterrows():
    ann, cm = row.annotation, row.camera_metadata
    wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
    depth = np.median([wkp[1] for wkp in wkps.values()])
    depths.append(depth)
df['depth'] = depths

In [None]:
# Bin the weights without drawing: only the counts and edges are needed, and
# this avoids creating and then clearing a throwaway figure.
N, bins = np.histogram(df['estimated_weight_g'], bins = 50)

binWidth = bins[1] - bins[0]

fig, axes = plt.subplots(nrows = 3, ncols = 1, figsize = (10, 5 * 3))

# Depth grid on which the per-bin kernel sums are evaluated.
x_d = np.linspace(0, 2, 100)

average_weights = []
max_densities = []

for index in range(len(N)):
    lowerBin = bins[index]
    upperBin = bins[index + 1]
    if index == len(N) - 1:
        # The last histogram bin is closed on the right, so the heaviest
        # fish belongs to it.
        subset = (df['estimated_weight_g'] >= lowerBin) & (df['estimated_weight_g'] <= upperBin)
    else:
        subset = (df['estimated_weight_g'] >= lowerBin) & (df['estimated_weight_g'] < upperBin)
    depths = df[subset]['depth'].values
    # Sum of Gaussian kernels (scale 0.1) over the depths of this bin's fish;
    # an empty bin yields 0.
    density = sum(norm.pdf((x_d - xi) / 0.1) for xi in depths)

    average_weights.append(np.mean(df[subset]['estimated_weight_g']))
    max_densities.append(np.max(density))

max_densities = np.array(max_densities)
# Empty bins have a NaN mean; they also have zero density, so 0 is harmless.
average_weights = np.nan_to_num(np.array(average_weights))

# Bars start at each bin's left edge so they cover exactly the bin's span.
# NOTE(review): the titles name a fixed pen and date range - confirm they match the data loaded into df.
axes[0].bar(bins[:-1], N / np.max(N), width = binWidth, align = 'edge', color = 'blue', alpha = 0.5)
axes[0].set_title('Vikane Pen 1 (6/1/20 - 6/10/20): Original Histogram')
axes[0].set_xlabel('Weight')
axes[0].set_ylabel('Density')
axes[1].bar(bins[:-1], max_densities / np.max(max_densities), width = binWidth, align = 'edge', color = 'red', alpha = 0.5)
axes[1].set_title('Vikane Pen 1 (6/1/20 - 6/10/20): Adjusted Histogram')
axes[1].set_xlabel('Weight')
axes[1].set_ylabel('Density')
axes[2].bar(bins[:-1], N / np.max(N), width = binWidth, align = 'edge', color = 'blue', alpha = 0.5)
axes[2].bar(bins[:-1], max_densities / np.max(max_densities), width = binWidth, align = 'edge', color = 'red', alpha = 0.5)
axes[2].set_title('Vikane Pen 1 (6/1/20 - 6/10/20): Original vs Adjusted Histogram')
axes[2].set_xlabel('Weight')
axes[2].set_ylabel('Density')

# Plain mean vs. density-weighted mean, and the relative change between them.
prior = np.mean(df['estimated_weight_g'])
normalized = np.sum(max_densities * average_weights) / np.sum(max_densities)
print(prior, normalized, (normalized - prior) / prior)


In [None]:
#from sklearn.neighbors import KernelDensity

# raw_weights = df['estimated_weight_g'].values
# raw_weights.sort()
# weights = raw_weights[:,None]

# kde = KernelDensity(bandwidth = 200, kernel='gaussian')
# kde.fit(weights)

# logprob = kde.score_samples(weights)

# plt.fill_between(raw_weights, np.exp(logprob), alpha = 0.5)

# Weight grid (0 to 8000 in 1000 steps) on which the curve is evaluated.
x_d = np.linspace(0, 8000, 1000)

weights = df['estimated_weight_g'].values

# One Gaussian bump of scale 200 per fish, added up on the grid
# (the sum is left unnormalized).
density = sum(map(lambda xi: norm.pdf((x_d - xi) / 200), weights))

plt.fill_between(x_d, density, alpha=0.5)

# plt.hist(df['estimated_weight_g'], bins = 100)

In [None]:
# Depth distribution per weight bucket: eight consecutive buckets starting
# at base_weight, each as wide as one bin of the `bins` array computed in
# the histogram cell.
base_weight = 2000
bucket_size = bins[1] - bins[0]
buckets = [bucket_size * x for x in list(range(8))]

# One panel per bucket, stacked vertically.
fig, axes = plt.subplots(nrows = len(buckets), ncols = 1, figsize = (10, 5 * len(buckets)))

# Depth grid on which each kernel sum is evaluated.
x_d = np.linspace(0, 2, 100)

for index, bucket in enumerate(buckets):
    startingWeight = base_weight + bucket
    endingWeight = base_weight + bucket + bucket_size
    # Both bounds are strict, so a weight exactly on a bucket edge is in no bucket.
    subset = (df['estimated_weight_g'] > startingWeight) & (df['estimated_weight_g'] < endingWeight) 
    
    depths = df[subset]['depth'].values
#     raw_depths.sort()
#     depths = raw_depths[:,None]

#     kde = KernelDensity(bandwidth = 0.05, kernel='gaussian')
#     kde.fit(depths)

#     logprob = kde.score_samples(depths)

#     axes[index].fill_between(raw_depths, np.exp(logprob), alpha=0.5)
    # Unnormalized sum of Gaussian kernels (scale 0.15) over this bucket's depths.
    density = sum(norm.pdf((x_d - xi) / 0.15) for xi in depths)
    axes[index].fill_between(x_d, density, alpha=0.5)
    # NOTE(review): the title names a fixed pen and date range - confirm it matches the data loaded into df.
    axes[index].set_title('Vikane Pen 1 (6/1/20 - 6/10/20): Kernel Density by Depth [%0.0f -> %0.0f]' % (startingWeight, endingWeight))
    axes[index].set_xlabel('Depth')
    axes[index].set_ylabel('Density')
    


#     axes[index].hist(df[subset]['depth'], bins = 20)