In [None]:
import json
import os
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import torch

from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from research.weight_estimation.weight_estimator import WeightEstimator
from research.weight_estimation.population_metrics import PopulationMetricsEstimator
from research.weight_estimation.keypoint_utils.optics import pixel2world

In [None]:
# extract dataframe
# S3 helper rooted at the local data directory.
s3_access_utils = S3AccessUtils('/root/data')
# Load the warehouse credentials inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))


# query = """
#     SELECT * FROM (
#       (SELECT * FROM prod.crop_annotation cas
#       INNER JOIN prod.annotation_state pas on pas.id=cas.annotation_state_id
#       WHERE cas.service_id = (SELECT ID FROM prod.service where name='BATI')
#       AND cas.annotation_state_id = 3
#       AND cas.pen_id=88) a
#     RIGHT JOIN 
#       (SELECT left_crop_url, estimated_weight_g, akpd_score FROM prod.biomass_computations
#       WHERE prod.biomass_computations.captured_at between '2020-02-10' and '2020-03-10'
#       AND prod.biomass_computations.akpd_score > 0.9) bc 
#     ON 
#       (a.left_crop_url=bc.left_crop_url)
#     ) x
#     WHERE x.captured_at between '2020-02-10' and '2020-03-10'
#     AND x.pen_id = 88
#     AND x.group_id = '88';
# """

# Every biomass computation for pen 88 captured between 2020-02-10 and 2020-03-10.
query = """
    select * from prod.biomass_computations
    where pen_id = 88
    and captured_at between '2020-02-10' and '2020-03-10';
"""

# Order chronologically, then index by capture time so the hour of day can be
# read straight off the index.
df = rds_access_utils.extract_from_database(query).sort_values('captured_at')
df.index = pd.to_datetime(df['captured_at'])
df['hour'] = df.index.hour


In [None]:
# k-factor = 1e5 * weight / length^3, from the estimated weight and length columns.
# NOTE(review): presumably Fulton's condition factor (g and mm inputs) - confirm.
df['estimated_k_factor'] = (df['estimated_weight_g'] / (df['estimated_length_mm'] ** 3)) * 1e5

In [None]:
# Columns written to the k-factor CSV. They are defined here, ahead of use: on a
# fresh kernel the cell that assigns `columns` only runs after this one, so this
# export previously raised NameError under Restart & Run All.
columns = ['akpd_score', 'estimated_k_factor', 'estimated_weight_g', 'captured_at']
df[columns].to_csv('/root/data/alok/biomass_estimation/playground/bolaks_data_2.csv')

In [None]:
# Subset of biomass-computation fields kept for the k-factor export.
columns = [
    'akpd_score',
    'estimated_k_factor',
    'estimated_weight_g',
    'captured_at',
]

In [None]:
# Reload the exported k-factor subset from disk into its own frame.
kdf = pd.read_csv(
    filepath_or_buffer='/root/data/alok/biomass_estimation/playground/bolaks_data_2.csv',
)

In [None]:
# Display the k-factor column computed above.
df['estimated_k_factor']


In [None]:
# 70th percentile of the reloaded k-factors, ignoring NaN entries.
np.nanpercentile(kdf['estimated_k_factor'].to_numpy(), q=70)

In [None]:
# Clamp the k-factor into the closed interval [0, 3]; NaN entries stay NaN.
df['estimated_k_factor'] = np.clip(df['estimated_k_factor'].values, 0, 3)

In [None]:
# Download the trained biomass model file and wrap it in a WeightEstimator.
model_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/trained_models/2020-04-01T00-00-00/nn_epoch_253.pb'
s3_access_utils = S3AccessUtils('/root/data')
# download_from_url returns a 3-tuple here; only the first element is used.
model_f, _, _ = s3_access_utils.download_from_url(model_url)
weight_estimator = WeightEstimator(model_f)

# Expose the annotation column under the name `keypoints` before predicting.
# NOTE(review): presumably generate_predictions reads `keypoints` - confirm.
df['keypoints'] = df['annotation']
preds = weight_estimator.generate_predictions(df)
df['pred'] = preds

In [None]:
# Keep rows with 7 < hour < 15, akpd_score above 0.9 and a strictly positive
# model prediction; work on an independent copy from here on.
mask = (
    df['hour'].gt(7)
    & df['hour'].lt(15)
    & df['akpd_score'].gt(0.9)
    & df['pred'].gt(0)
)
tdf = df.loc[mask].copy()

In [None]:
# (YYYY-MM-DD, weight) pairs built from the model predictions; the date is the
# first ten characters of the stringified capture time.
biomass_computations = [
    (str(captured_at)[:10], weight)
    for captured_at, weight in zip(tdf['captured_at'], tdf['pred'])
]



In [None]:
# Smart metric per distinct date (ascending) for the prediction-based pairs.
pme = PopulationMetricsEstimator(biomass_computations)
dates = sorted({day for day, _ in biomass_computations})
smart_average_weights = [pme.generate_smart_metrics_on_date(day) for day in dates]

In [None]:
# Rebuild the (YYYY-MM-DD, weight) pairs, this time from the stored
# estimated_weight_g column instead of the model predictions.
biomass_computations = [
    (str(captured_at)[:10], weight)
    for captured_at, weight in zip(tdf['captured_at'], tdf['estimated_weight_g'])
]


In [None]:
# Smart metric per distinct date (ascending) for the estimated_weight_g pairs.
pme = PopulationMetricsEstimator(biomass_computations)
dates = sorted({day for day, _ in biomass_computations})
smart_average_weights_2 = [pme.generate_smart_metrics_on_date(day) for day in dates]

In [None]:
# Overlay the two daily smart-metric series on one date axis. Each curve is
# labelled with the column that fed it (smart_average_weights_2 was built from
# estimated_weight_g, smart_average_weights from pred) so the figure can be
# read without consulting the code.
plot_dates = pd.to_datetime(dates)
plt.figure(figsize=(20, 10))
plt.plot(plot_dates, smart_average_weights_2, color='blue', label='estimated_weight_g')
plt.plot(plot_dates, smart_average_weights, color='red', label='pred')
plt.xlabel('Date')
plt.ylabel('Smart average weight')
plt.title('Daily smart average weight: estimated_weight_g vs. pred')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Run pixel2world on each row's left/right crop keypoints with that row's
# camera metadata.
tdf['world_keypoints'] = tdf.apply(
    lambda r: pixel2world(
        r['keypoints']['leftCrop'],
        r['keypoints']['rightCrop'],
        r['camera_metadata'],
    ),
    axis=1,
)

# Per-row depth: mean of component 1 over every world keypoint.
# NOTE(review): assumes index 1 is the camera-distance axis - confirm.
tdf['depth'] = tdf['world_keypoints'].apply(
    lambda kps: np.mean([coords[1] for coords in kps.values()])
)

In [None]:
def generate_error_breakdown(df, vals, field, pred_field, gt_field):
    """Print the relative error of mean prediction vs. mean ground truth per bin.

    Consecutive entries of ``vals`` define half-open bins ``[vals[i], vals[i + 1])``
    over ``df[field]``. For each bin the function prints
    ``(mean(pred_field) - mean(gt_field)) / mean(gt_field)`` as a percentage
    rounded to two decimals.

    Args:
        df: DataFrame containing ``field``, ``pred_field`` and ``gt_field``.
        vals: Ascending sequence of bin edges (list or 1-D array).
        field: Name of the column used for binning.
        pred_field: Name of the column holding predicted values.
        gt_field: Name of the column holding reference values.

    Returns:
        None. Results are printed, one line per bin.
    """
    for lower, upper in zip(vals[:-1], vals[1:]):
        # Half-open interval: a row sitting exactly on an edge lands in exactly
        # one bin instead of being dropped by two strict comparisons.
        in_bin = (df[field] >= lower) & (df[field] < upper)

        # An empty bin has no defined mean; say so rather than printing 'nan'.
        if not in_bin.any():
            print('No samples for {} in range {} <-> {}'.format(
                field,
                round(lower, 2),
                round(upper, 2))
            )
            continue

        gt_mean = df.loc[in_bin, gt_field].mean()
        pred_mean = df.loc[in_bin, pred_field].mean()
        error_pct = (pred_mean - gt_mean) / gt_mean
        # The percent sign belongs to the error value, not to the range bound.
        print('Error percentage for {} in range {} <-> {}: {}%'.format(
            field,
            round(lower, 2),
            round(upper, 2),
            round(100 * error_pct, 2))
        )





In [None]:
# Error of `pred` relative to `estimated_weight_g`, broken down by depth in
# steps of 0.1 starting at 0.5.
generate_error_breakdown(
    df=tdf,
    vals=np.arange(0.5, 2.5, 0.1),
    field='depth',
    pred_field='pred',
    gt_field='estimated_weight_g',
)

In [None]:
# First ten characters of each stringified capture time (the YYYY-MM-DD part).
tdf['captured_at'].astype(str).str[:10].values

In [None]:
# Mean stored weight estimate for rows with depth above 1.9.
tdf.loc[tdf['depth'] > 1.9, 'estimated_weight_g'].mean()

In [None]:
# Mean stored weight estimate over every filtered row.
tdf['estimated_weight_g'].mean()

In [None]:
# Mean model prediction for rows with depth above 1.9.
tdf.loc[tdf['depth'] > 1.9, 'pred'].mean()

In [None]:
# Relative difference between mean prediction and mean stored estimate for
# rows with 1.0 < depth < 1.5.
mask = tdf['depth'].gt(1.0) & tdf['depth'].lt(1.5)
band = tdf.loc[mask]
(band['pred'].mean() - band['estimated_weight_g'].mean()) / band['estimated_weight_g'].mean()

In [None]:
# Persist the full frame, including the columns added in this notebook.
df.to_csv(
    path_or_buf='/root/data/alok/biomass_estimation/playground/bolaks_data.csv',
)