In [None]:
from collections import defaultdict
import json
import datetime as dt
import os
import numpy as np
import pandas as pd
from research.utils.data_access_utils import RDSAccessUtils
from research.utils.datetime_utils import add_days, get_dates_in_range
from research.weight_estimation.population_metrics import PopulationMetricsEstimator
from research.weight_estimation.keypoint_utils.keypoint_transformations import get_raw_3d_coordinates

from matplotlib import pyplot as plt

<h1> Establish Useful Functions for smart average calculation </h1>

In [None]:
class DataGenerator(object):
    """Loads biomass computations for one pen and derives the columns used for analysis.

    Intended call order: ``query_from_db`` -> ``preprocess_df`` -> ``get_df``.
    """

    def __init__(self):
        """Create the database client from the credentials file named by the
        DATA_WAREHOUSE_SQL_CREDENTIALS environment variable."""
        # Use a context manager so the credentials file handle is closed right away.
        with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
            credentials = json.load(credentials_file)
        self.rds_access_utils = RDSAccessUtils(credentials)
        self.df = None

    def query_from_db(self, pen_id, start_date=None, end_date=None, min_akpd_score=0.99):
        """Fetch rows of prod.biomass_computations for a pen and store them on ``self.df``.

        Args:
            pen_id: pen identifier; coerced to ``int`` before being placed in the query.
            start_date: lower bound for ``captured_at``; defaults to 180 days before
                ``end_date`` (computed with ``add_days``).
            end_date: upper bound for ``captured_at``; defaults to today's date
                formatted as YYYY-MM-DD.
            min_akpd_score: minimum ``akpd_score`` (inclusive); coerced to ``float``.

        Side effects:
            Prints the query, then replaces ``self.df`` with the result, with duplicated
            column names dropped and ``estimated_weight_g`` renamed to
            ``estimated_weight_g_0``.
        """
        if not end_date:
            end_date = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')
        if not start_date:
            start_date = add_days(end_date, -30 * 6)
        # The statement is built by string formatting, so the numeric fields are coerced
        # to numbers to keep arbitrary text out of the SQL.
        # NOTE(review): the date strings are still interpolated verbatim; switch to bound
        # parameters if extract_from_database supports them.
        query = """
            SELECT * FROM
            prod.biomass_computations bc
            WHERE bc.pen_id={}
            AND bc.akpd_score >= {}
            AND bc.captured_at between '{}' and '{}'
            AND bc.estimated_weight_g > 0.0
        """.format(int(pen_id), float(min_akpd_score), start_date, end_date)

        print('Executing query...')
        print(query)
        raw_df = self.rds_access_utils.extract_from_database(query)
        print('Query complete!')
        # Keep only the first occurrence of each column name.
        unique_columns_df = raw_df.loc[:, ~raw_df.columns.duplicated()]
        self.df = unique_columns_df.rename(columns={'estimated_weight_g': 'estimated_weight_g_0'})

    def preprocess_df(self):
        """Sort by capture time, index by it, and add ``date``, ``estimated_k_factor``, ``hour``.

        ``estimated_k_factor`` is 1e5 * estimated_weight_g_0 / estimated_length_mm**3.
        ``date`` holds the index's calendar date as a string; ``hour`` is the index hour.
        """
        ordered = self.df.sort_values('captured_at').copy(deep=True)
        ordered.index = pd.to_datetime(ordered.captured_at)
        ordered['date'] = ordered.index.date.astype(str)
        ordered['estimated_k_factor'] = 1e5 * ordered['estimated_weight_g_0'] / (ordered['estimated_length_mm']**3)
        ordered['hour'] = ordered.index.hour
        self.df = ordered

    # generate default data-frame to use on start-up
    def get_df(self):
        """Return the current frame (``None`` until ``query_from_db`` has run)."""
        return self.df

    
def generate_pme(df, start_date, end_date, start_hour, end_hour, low_kf, high_kf=float('inf')):
    """Build a PopulationMetricsEstimator from the rows of ``df`` that pass all filters.

    Args:
        df: frame with ``date``, ``hour``, ``estimated_k_factor``, ``akpd_score`` and
            ``estimated_weight_g_0`` columns.
        start_date, end_date: inclusive bounds compared against ``df.date``.
        start_hour, end_hour: inclusive hour bounds. When ``start_hour < end_hour`` the
            window is start..end; otherwise it wraps around midnight
            (hour >= start_hour OR hour <= end_hour).
        low_kf: inclusive lower bound on ``estimated_k_factor``.
        high_kf: inclusive upper bound on ``estimated_k_factor``. Defaults to no upper
            bound so callers that only supply ``low_kf`` keep working.

    Returns:
        A PopulationMetricsEstimator built from (date, weight, k-factor) tuples, or
        ``None`` when no row passes the filters.
    """
    date_mask = (df.date >= start_date) & (df.date <= end_date)
    if start_hour < end_hour:
        hour_mask = (df.hour >= start_hour) & (df.hour <= end_hour)
    else:
        hour_mask = (df.hour >= start_hour) | (df.hour <= end_hour)
    kf_mask = (df.estimated_k_factor >= low_kf) & (df.estimated_k_factor <= high_kf)

    # Rows must also strictly exceed an akpd_score of 0.99.
    mask = date_mask & hour_mask & kf_mask & (df.akpd_score > 0.99)

    # Filter once and read every column from the same selection.
    selected = df[mask]
    biomass_computations = list(zip(selected.date.values,
                                    selected['estimated_weight_g_0'].values,
                                    selected.estimated_k_factor.values))
    if biomass_computations:
        return PopulationMetricsEstimator(biomass_computations)
    return None


<h1> Load data </h1>

In [None]:
# KPI = log(sample_size * dist_consistency^20) / log(500 * 0.9^20)

In [None]:


def generate_analysis_df(df, pen_id, start_date, end_date, high_kf=3.0):
    """Grid-search capture-hour windows and lower K-factor cutoffs, scoring each combination.

    For every forward window ``start_hour < end_hour`` (hours 0-23) and every lower
    K-factor bound in ``np.arange(0.9, 1.3, 0.01)``, an estimator is built with
    ``generate_pme`` and queried for each date in the range.

    Args:
        df: frame handed straight to ``generate_pme``.
        pen_id: not read by this function; kept so the signature stays unchanged.
        start_date, end_date: bounds passed to ``get_dates_in_range`` and ``generate_pme``.
        high_kf: upper K-factor bound forwarded to ``generate_pme``. Defaults to 3.0,
            the upper bound used by this notebook's inline grid search.

    Returns:
        DataFrame with one row per evaluated combination and the columns ``mean_kpi``,
        ``mean_dc``, ``smart_avg`` (from the last date), ``start_hour``, ``end_hour``
        and ``low_kf``. Empty when no combination yields an estimator.
    """
    hours = np.arange(0, 24, 1)
    low_kfs = np.arange(0.9, 1.3, 0.01)
    # Constant denominator of the KPI: the numerator's value at sample size 500, consistency 0.9.
    kpi_reference = np.log(500 * 0.9**20)

    analysis_data = defaultdict(list)
    dates = get_dates_in_range(start_date, end_date)
    if len(dates) == 0:
        # Without dates there is no "last date" to read the smart average from.
        return pd.DataFrame(analysis_data)
    final_date = dates[-1]

    for start_hour in hours:
        print(start_hour)
        for end_hour in hours:
            # Only forward windows are evaluated; decide this before the K-factor loop.
            if start_hour >= end_hour:
                continue
            for low_kf in low_kfs:
                pme = generate_pme(df, start_date, end_date, start_hour, end_hour, low_kf, high_kf)
                if not pme:
                    continue
                kpis, dcs = [], []
                smart_avg = None
                for date in dates:
                    metrics = pme.generate_smart_metrics_on_date(date)
                    sample_size = metrics.get('raw_sample_size')
                    consistency = metrics.get('distribution_consistency')
                    if sample_size and consistency:
                        kpis.append(np.log(sample_size * consistency**20) / kpi_reference)
                        dcs.append(consistency)
                    if date == final_date:
                        smart_avg = metrics['smart_average_weight']

                # Mean over the dates that produced metrics; NaN when none did.
                analysis_data['mean_kpi'].append(np.mean(kpis) if kpis else np.nan)
                analysis_data['mean_dc'].append(np.mean(dcs) if dcs else np.nan)
                analysis_data['smart_avg'].append(smart_avg)
                analysis_data['start_hour'].append(start_hour)
                analysis_data['end_hour'].append(end_hour)
                analysis_data['low_kf'].append(low_kf)

    return pd.DataFrame(analysis_data)









In [None]:
# Load and preprocess the biomass computations for pen 88 over the chosen window.
pen_id = 88
start_date = '2020-02-25'
end_date = '2020-03-06'

dg = DataGenerator()
dg.query_from_db(pen_id, start_date=start_date, end_date=end_date)
dg.preprocess_df()
df = dg.get_df()


In [None]:
# Grid search over capture-hour windows and K-factor bounds on the loaded frame.
start_hours = np.arange(0, 24, 1)
end_hours = np.arange(0, 24, 1)
# start_hours = [0]
# end_hours = [24]
low_kfs = np.arange(0.8, 1.5, 0.05)
high_kfs = [3.0]

# Constant denominator of the KPI: the numerator's value at sample size 500, consistency 0.9.
kpi_reference = np.log(500 * 0.9**20)

analysis_data = defaultdict(list)
dates = get_dates_in_range(start_date, end_date)
for start_hour in start_hours:
    print(start_hour)
    for end_hour in end_hours:
        # Only forward windows are evaluated; decide this before the K-factor loops.
        if start_hour >= end_hour:
            continue
        for low_kf in low_kfs:
            for high_kf in high_kfs:
                pme = generate_pme(df, start_date, end_date, start_hour, end_hour, low_kf, high_kf)
                if not pme:
                    continue
                kpis, dcs, raw_sample_sizes = [], [], []
                last_metrics = None
                for date in dates:
                    metrics = pme.generate_smart_metrics_on_date(date)
                    sample_size = metrics.get('raw_sample_size')
                    consistency = metrics.get('distribution_consistency')
                    if sample_size and consistency:
                        kpis.append(np.log(sample_size * consistency**20) / kpi_reference)
                        dcs.append(consistency)
                        raw_sample_sizes.append(sample_size)
                    if date == dates[-1]:
                        last_metrics = metrics

                if last_metrics is None:
                    # No dates in range, so there is no final-day smart average to record.
                    continue

                # Mean over the dates that produced metrics; NaN when none did.
                mean_kpi = np.mean(kpis) if kpis else np.nan
                mean_dc = np.mean(dcs) if dcs else np.nan
                total_sample_size = np.sum(raw_sample_sizes)

                # add to data
                analysis_data['mean_kpi'].append(mean_kpi)
                analysis_data['mean_dc'].append(mean_dc)
                analysis_data['smart_avg'].append(last_metrics['smart_average_weight'])
                analysis_data['smart_kf'].append(np.mean(last_metrics['kfs']))
                analysis_data['total_sample_size'].append(total_sample_size)
                analysis_data['start_hour'].append(start_hour)
                analysis_data['end_hour'].append(end_hour)
                analysis_data['low_kf'].append(low_kf)
                analysis_data['high_kf'].append(high_kf)





In [None]:
# Display the date range currently bound at notebook level.
start_date, end_date

In [None]:
# Build an estimator for 2020-06-11..2020-06-22, hours 0-24, K-factor in [1.065, 3.0].
# NOTE(review): the frame loaded earlier in this notebook covers 2020-02-25..2020-03-06;
# if `df` is still that frame no row matches and generate_pme returns None — confirm which
# data this cell was meant to run against.
pme = generate_pme(df, '2020-06-11', '2020-06-22', 0, 24, 1.065, 3.0)

In [None]:
# Smart metrics for 2020-06-21 with explicit estimator options.
metrics = pme.generate_smart_metrics_on_date(
    '2020-06-21',
    max_day_difference=3,
    apply_growth_rate=True,
    incorporate_future=True,
    bucket_size=1000,
)




In [None]:
# Display the metrics dictionary returned by generate_smart_metrics_on_date.
metrics

In [None]:
# Turn the smart distribution into per-bucket percentages and per-bucket average K-factors.
w_dist = {}
kf_breakdown = {}
count = 0
for k, bucket in list(metrics['smart_distribution'].items()):
    # Bucket label is "<key>-<key + 1>".
    key = '{}-{}'.format(str(k), str(float(k)+1))
    w_dist[key] = bucket['count']
    kf_breakdown[key] = bucket['avgKFactor']
    count += bucket['count']

# Convert raw bucket counts into percentages of the total count.
w_dist = {label: 100 * float(bucket_count) / count for label, bucket_count in w_dist.items()}
w_dist

In [None]:
# Display the average K-factor recorded for each distribution bucket.
kf_breakdown

In [None]:
# Mean of the 'kfs' values reported in the metrics.
np.mean(metrics['kfs'])

In [None]:
# Materialise the grid-search results and list them best KPI first.
analysis_df = pd.DataFrame(analysis_data)
analysis_df.sort_values(by='mean_kpi', ascending=False)

In [None]:
# Rows evaluated at the smallest K-factor cutoff, best KPI first.
at_lowest_cutoff = analysis_df['low_kf'] == analysis_df['low_kf'].min()
analysis_df.loc[at_lowest_cutoff].sort_values(by='mean_kpi', ascending=False)

In [None]:
# Rows for the 0-23 hour window at the smallest K-factor cutoff, best KPI first.
full_day_window = (analysis_df['start_hour'] == 0) & (analysis_df['end_hour'] == 23)
at_lowest_cutoff = analysis_df['low_kf'] == analysis_df['low_kf'].min()
analysis_df.loc[full_day_window & at_lowest_cutoff].sort_values(by='mean_kpi', ascending=False)

In [None]:
# All grid-search rows, best KPI first.
analysis_df.sort_values(by='mean_kpi', ascending=False)

In [None]:
# KPI, smart average and total sample size, each plotted against the lower K-factor cutoff.
figs, axes = plt.subplots(3, 1, figsize=(12, 12))
kf_cutoffs = analysis_df.low_kf

axes[0].plot(kf_cutoffs, analysis_df.mean_kpi)
axes[0].set_xlabel('K-factor cutoff')
axes[0].set_ylabel('Biomass KPI')
axes[1].plot(kf_cutoffs, analysis_df.smart_avg)
axes[2].plot(kf_cutoffs, analysis_df.total_sample_size)

for ax in axes:
    ax.grid()
plt.show()

In [None]:
# generate_analysis_df takes the measurement frame as its first argument; the pen 88
# frame loaded earlier in this notebook (2020-02-25..2020-03-06) covers this window.
analysis_df = generate_analysis_df(df, 88, '2020-02-26', '2020-03-06')

In [None]:
# Mean distribution consistency and smart average against the lower K-factor cutoff.
figs, axes = plt.subplots(2, 1, figsize=(15, 15))
kf_cutoffs = analysis_df.low_kf

axes[0].plot(kf_cutoffs, analysis_df.mean_dc)
axes[1].plot(kf_cutoffs, analysis_df.smart_avg)

for ax in axes:
    ax.grid()
plt.show()

In [None]:
# generate_analysis_df takes the measurement frame as its first argument, so load the
# pen 66 data for this window before running the grid search.
pen_66_dg = DataGenerator()
pen_66_dg.query_from_db(66, start_date='2020-06-05', end_date='2020-06-12')
pen_66_dg.preprocess_df()
analysis_df = generate_analysis_df(pen_66_dg.get_df(), 66, '2020-06-05', '2020-06-12')

In [None]:
# Mean KPI and smart average against the lower K-factor cutoff.
figs, axes = plt.subplots(2, 1, figsize=(15, 15))
kf_cutoffs = analysis_df.low_kf

axes[0].plot(kf_cutoffs, analysis_df.mean_kpi)
axes[1].plot(kf_cutoffs, analysis_df.smart_avg)

for ax in axes:
    ax.grid()
plt.show()

In [None]:
# generate_analysis_df takes the measurement frame as its first argument, so load the
# pen 83 data for this window before running the grid search.
pen_83_dg = DataGenerator()
pen_83_dg.query_from_db(83, start_date='2020-05-25', end_date='2020-06-21')
pen_83_dg.preprocess_df()
analysis_df = generate_analysis_df(pen_83_dg.get_df(), 83, '2020-05-25', '2020-06-21')

In [None]:
# Mean KPI and smart average against the lower K-factor cutoff.
figs, axes = plt.subplots(2, 1, figsize=(15, 15))
kf_cutoffs = analysis_df.low_kf

axes[0].plot(kf_cutoffs, analysis_df.mean_kpi)
axes[1].plot(kf_cutoffs, analysis_df.smart_avg)

for ax in axes:
    ax.grid()
plt.show()

In [None]:
# Rebind `df` to a CSV export read from a fixed local path.
df = pd.read_csv(
    '/root/data/alok/biomass_estimation/playground/imr_pen_id_61_2019-11-15_2019-12-15_20200520_model_keras_reduced_jitter.csv'
)


In [None]:
# Histogram of log(1 - akpd_score).
# Scores of exactly 1.0 give -inf and missing scores give NaN; either makes plt.hist raise
# because the bin range is not finite, so only finite values are plotted.
with np.errstate(divide='ignore', invalid='ignore'):
    log_score_gap = np.log(1 - df.akpd_score)
finite_log_score_gap = log_score_gap[np.isfinite(log_score_gap)]

plt.figure(figsize=(20, 10))
plt.hist(finite_log_score_gap)
plt.xlabel('log(1 - akpd_score)')
plt.ylabel('Count')
plt.grid()
plt.show()

In [None]:
# Mean model weight estimate over rows whose akpd_score exceeds 0.99.
high_score_rows = df['akpd_score'] > 0.99
df.loc[high_score_rows, '20200520_model_keras_reduced_jitter_estimated_weight_g'].mean()

In [None]:
# Rebind `df` to the pen 66 CSV export (the commented line is an alternative input file).
df = pd.read_csv(
    '/root/data/alok/biomass_estimation/playground/pen_66_2020-06-05_2020-06-12_0_1759_nn_epoch_798.csv'
)
# df = pd.read_csv('/root/data/alok/biomass_estimation/playground/pen_88_2020-02-28_2020-03-06_combined_nn_epoch_798.csv')



In [None]:
# Shapes of the rows passing the 0.99 threshold before vs. after refinement.
pre_refinement_pass = df.loc[df['akpd_score'] > 0.99]
post_refinement_pass = df.loc[df['post_refinement_akpd_score'] > 0.99]
pre_refinement_pass.shape, post_refinement_pass.shape

In [None]:
# Shape of the rows where either score exceeds 0.99.
either_pass = (df['akpd_score'] > 0.99) | (df['post_refinement_akpd_score'] > 0.99)
df.loc[either_pass].shape

In [None]:
# Shape of the rows where both scores exceed 0.99.
both_pass = (df['akpd_score'] > 0.99) & (df['post_refinement_akpd_score'] > 0.99)
df.loc[both_pass].shape

In [None]:
# Mean estimated_weight_g over rows whose akpd_score exceeds 0.99.
df.loc[df['akpd_score'] > 0.99, 'estimated_weight_g'].mean()

In [None]:
# Mean nn_epoch_798 weight over rows with akpd_score >= 0.9 and post-refinement score >= 0.99.
refined_rows = (df['akpd_score'] >= 0.9) & (df['post_refinement_akpd_score'] >= 0.99)
df.loc[refined_rows, 'nn_epoch_798_estimated_weight_g'].mean()

In [None]:
# Mean estimated_weight_g over rows whose post-refinement score is at least 0.99.
df.loc[df['post_refinement_akpd_score'] >= 0.99, 'estimated_weight_g'].mean()

In [None]:


# Baseline: weights of rows whose original akpd_score exceeds 0.99.
original_weights = df.loc[df['akpd_score'] > 0.99, 'estimated_weight_g'].values
original_weight = np.mean(original_weights)

# Combined set: nn_epoch_798 weights where the post-refinement score is >= 0.99, plus the
# original weights of rows that pass on akpd_score but fall below 0.99 after refinement.
passes_after_refinement = df['post_refinement_akpd_score'] >= 0.99
passes_before_only = (df['akpd_score'] >= 0.99) & (df['post_refinement_akpd_score'] < 0.99)
adj_set = df.loc[passes_after_refinement, 'nn_epoch_798_estimated_weight_g'].values
non_adj_set = df.loc[passes_before_only, 'estimated_weight_g'].values
new_weights = np.array([*adj_set, *non_adj_set])
print(len(new_weights))
new_weight = np.mean(new_weights)

print('Average weight without AKPR: {}'.format(original_weight))
print('Average weight with AKPR: {}'.format(new_weight))


In [None]:
# Overlaid 5-bin histograms: original weights in blue, combined weights in red.
plt.figure(figsize=(20, 10))
for weights, colour in ((original_weights, 'blue'), (new_weights, 'red')):
    plt.hist(weights, bins=5, color=colour, alpha=0.7)
plt.grid()
plt.show()

In [None]:
# Relative frequency of each weight set in 1000-wide bins from 0 to 9000 (upper edge exclusive).
original_hist, new_hist = {}, {}
bin_labels, original_freqs, new_freqs = [], [], []
bin_edges = np.arange(0, 10000, 1000)
for low_edge, high_edge in zip(bin_edges[:-1], bin_edges[1:]):
    bin_labels.append('{}-{}'.format(low_edge, high_edge))

    in_bin_original = (original_weights >= low_edge) & (original_weights < high_edge)
    original_freqs.append(np.sum(in_bin_original))

    in_bin_new = (new_weights >= low_edge) & (new_weights < high_edge)
    new_freqs.append(np.sum(in_bin_new))

# Normalise the counts so each series sums to one.
original_freqs = np.array(original_freqs) / np.sum(original_freqs)
new_freqs = np.array(new_freqs) / np.sum(new_freqs)
    
    


In [None]:
# Overlaid bar charts of the binned relative frequencies for both weight sets.
fig, ax = plt.subplots(figsize=(20, 10))
bar_positions = list(range(len(bin_labels)))
for freqs, colour, series_label in (
    (original_freqs, 'blue', 'without AKPR'),
    (new_freqs, 'red', 'with AKPR'),
):
    ax.bar(bar_positions, freqs, tick_label=bin_labels, alpha=0.5, color=colour, label=series_label)
plt.grid()
plt.legend()
plt.show()

In [None]:
# Display the current frame.
df

In [None]:
from research.weight_estimation.keypoint_utils.keypoint_transformations import get_raw_3d_coordinates

# For every row, compute the median of column 1 of the 3D keypoint array for the original
# annotation and for the post-refinement annotation.
# NOTE(review): column 1 is presumably the depth axis, going by the variable names — confirm.
pre_depths = []
post_depths = []
for idx, row in df.iterrows():
    # The CSV cells hold single-quoted dict text; swap the quotes so json can parse them.
    # NOTE(review): this breaks on values containing apostrophes or Python-only literals.
    pre_ann = json.loads(row.annotation.replace("'", '"'))
    post_ann = json.loads(row.post_refinement_akpd.replace("'", '"'))
    cm = json.loads(row.camera_metadata.replace("'", '"'))

    pre_kp_arr = get_raw_3d_coordinates(pre_ann, cm)
    pre_depths.append(np.median(pre_kp_arr[:, 1]))

    post_kp_arr = get_raw_3d_coordinates(post_ann, cm)
    post_depths.append(np.median(post_kp_arr[:, 1]))

# Attach the results before plotting: the histograms below read df.pre_depth, which does
# not exist until these columns are assigned.
df['pre_depth'] = pre_depths
df['post_depth'] = post_depths

# Pre-refinement depth of rows passing the 0.99 akpd_score threshold, split by whether the
# post-refinement score is below (blue) or above (red) 0.99.
passed_before = df.akpd_score > 0.99
plt.figure(figsize=(20, 10))
plt.hist(df[passed_before & (df.post_refinement_akpd_score < 0.99)].pre_depth, bins=20, color='blue', alpha=0.8)
plt.hist(df[passed_before & (df.post_refinement_akpd_score > 0.99)].pre_depth, bins=20, color='red', alpha=0.8)
plt.grid()
plt.show()
    

In [None]:
# Store the per-row median depths on the frame.
df['pre_depth'], df['post_depth'] = pre_depths, post_depths

In [None]:
# Mean pre-refinement depth over rows whose akpd_score exceeds 0.99.
df.loc[df['akpd_score'] > 0.99, 'pre_depth'].mean()

In [None]:
# Mean pre-refinement depth of rows that pass on akpd_score but score below 0.99 after refinement.
dropped_after_refinement = (df['akpd_score'] > 0.99) & (df['post_refinement_akpd_score'] < 0.99)
df.loc[dropped_after_refinement, 'pre_depth'].mean()

In [None]:
# Mean pre-refinement depth of rows that exceed 0.99 on both scores.
kept_after_refinement = (df['akpd_score'] > 0.99) & (df['post_refinement_akpd_score'] > 0.99)
df.loc[kept_after_refinement, 'pre_depth'].mean()


In [None]:
# Pre-refinement depth of rows passing the 0.99 akpd_score threshold, split by whether the
# post-refinement score is below (blue) or above (red) 0.99.
passed_before = df['akpd_score'] > 0.99
dropped_depths = df.loc[passed_before & (df['post_refinement_akpd_score'] < 0.99), 'pre_depth']
kept_depths = df.loc[passed_before & (df['post_refinement_akpd_score'] > 0.99), 'pre_depth']

plt.figure(figsize=(20, 10))
plt.hist(dropped_depths, bins=20, color='blue', alpha=0.8)
plt.hist(kept_depths, bins=20, color='red', alpha=0.8)
plt.grid()
plt.show()

In [None]:
# Project helpers used by the cells below to load and display a frame from S3.
from research.utils.image_utils import Picture
from research.utils.data_access_utils import S3AccessUtils

# URL of the frame to inspect (a resized left-camera frame, going by the path).
image_url = 'https://aquabyte-frames-resized-inbound.s3-eu-west-1.amazonaws.com/environment=production/site-id=59/pen-id=95/date=2020-06-25/hour=10/at=2020-06-25T10:48:55.901597000Z/left_frame.resize_512_512.jpg'



In [None]:
# Read the AWS credentials through a context manager so the file handle is closed.
with open(os.environ['AWS_CREDENTIALS']) as aws_credentials_file:
    aws_credentials = json.load(aws_credentials_file)

# Load the frame at image_url, enhance it, and display the result.
picture = Picture(s3_access_utils=S3AccessUtils('/root/data', aws_credentials), image_url=image_url)
picture.enhance()
picture.get_image()

In [None]:
# Display the image currently held by `picture`.
picture.get_image()

In [None]:
# Second frame to inspect, captured a few minutes after the previous one (per the URL timestamp).
image_url = 'https://aquabyte-frames-resized-inbound.s3-eu-west-1.amazonaws.com/environment=production/site-id=59/pen-id=95/date=2020-06-25/hour=10/at=2020-06-25T10:56:38.558207000Z/left_frame.resize_512_512.jpg'

# Read the AWS credentials through a context manager so the file handle is closed.
with open(os.environ['AWS_CREDENTIALS']) as aws_credentials_file:
    aws_credentials = json.load(aws_credentials_file)

# Display the frame as loaded, before any enhancement.
picture = Picture(s3_access_utils=S3AccessUtils('/root/data', aws_credentials), image_url=image_url)
picture.get_image()


In [None]:
# Enhance the loaded frame and display it again.
# NOTE(review): enhance() presumably updates the image held by `picture` in place — confirm.
picture.enhance()
picture.get_image()
