In [None]:
import json
import os
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

<h1> Load Duplicate Dataset </h1>

In [None]:
# Build the S3 helper from the credentials file named by $AWS_CREDENTIALS.
# The file is read inside a `with` block so its handle is closed as soon as
# the JSON has been parsed, instead of being left open for the kernel's lifetime.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)
s3_access_utils = S3AccessUtils('/root/data', aws_credentials)

# Fetch the duplicate-pairs CSV; only the first element of the 3-tuple returned
# by download_from_url is used, and it is passed straight to pd.read_csv.
url = 'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/jane/fish_id/bolaks_data_pairs.csv'
f, _, _ = s3_access_utils.download_from_url(url)

# Keep only the first 1000 rows for the rest of the notebook.
duplicate_df = pd.read_csv(f).head(1000)

<h1> Load Raw Bolaks dataset from production </h1>

In [None]:
# Download the Bolaks CSV referenced by bolaks_url and load it into `df`.
# Only the first element of the 3-tuple returned by download_from_url is used;
# it is handed to pd.read_csv.
bolaks_url = 'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets/bolaks_pen_id_88_2020-02-10_2020-03-10.csv'
f, _, _ = s3_access_utils.download_from_url(bolaks_url)
df = pd.read_csv(f)


In [None]:
# Number of distinct left_crop_url values in the production dataset.
len(df['left_crop_url'].unique())

In [None]:
# Pool the left-crop URLs from both sides of every duplicate pair, then
# de-duplicate and sort them.
p1_left_crop_urls = duplicate_df['p1_left_crop_url'].tolist()
p2_left_crop_urls = duplicate_df['p2_left_crop_url'].tolist()
unique_left_crop_urls = sorted(set(p1_left_crop_urls).union(p2_left_crop_urls))


In [None]:
# Keep only the production rows whose left crop appears in a duplicate pair.
tdf = df.loc[df['left_crop_url'].isin(unique_left_crop_urls)]

In [None]:
# Write the filtered rows to a local CSV, then upload that same file to the
# ad-hoc S3 bucket under alok/playground/.
f_name = 'bolaks_pen_id_88_2020-02-10_2020-03-10_filtered_1752.csv'
f = os.path.join('/root/data/alok/biomass_estimation/playground', f_name)
tdf.to_csv(f)

bucket = 'aquabyte-images-adhoc'
key = 'alok/playground/{}'.format(f_name)
s3_access_utils.s3_client.upload_file(f, bucket, key)

In [None]:
# Load the locally stored CSV whose name extends the filtered export with
# "_nn_epoch_798"; later cells read its nn_epoch_798_estimated_weight_g and
# post_refinement_akpd_score columns.
# NOTE(review): nothing in this notebook writes this file — presumably it is
# produced by a separate inference step; confirm.
rdf = pd.read_csv('/root/data/alok/biomass_estimation/playground/bolaks_pen_id_88_2020-02-10_2020-03-10_filtered_1752_nn_epoch_798.csv')

In [None]:
def get_prediction(rdf, left_crop_url):
    """Return the prediction and AKPD score for one left-crop URL.

    Args:
        rdf: DataFrame with ``left_crop_url``,
            ``nn_epoch_798_estimated_weight_g`` and
            ``post_refinement_akpd_score`` columns.
        left_crop_url: URL to look up in ``rdf.left_crop_url``.

    Returns:
        Tuple ``(pred, akpd_score)`` taken from the first row of ``rdf``
        whose ``left_crop_url`` equals the given URL.

    Raises:
        IndexError: if no row of ``rdf`` matches ``left_crop_url``.
    """
    matching_rows = rdf[rdf.left_crop_url == left_crop_url]
    # When several rows share the URL, only the first one is used.
    first_weight = matching_rows.nn_epoch_798_estimated_weight_g.iloc[0]
    first_score = matching_rows.post_refinement_akpd_score.iloc[0]
    return first_weight, first_score

In [None]:

# For every duplicate pair, look up the prediction and AKPD score of both
# images in rdf. The four lists stay aligned with duplicate_df's row order:
# if either lookup of a pair fails, that pair contributes None to all four.
p1_preds, p2_preds = [], []
p1_akpd_scores, p2_akpd_scores = [], []
for idx, row in duplicate_df.iterrows():
    try:
        p1_pred, p1_akpd_score = get_prediction(rdf, row.p1_left_crop_url)
        p2_pred, p2_akpd_score = get_prediction(rdf, row.p2_left_crop_url)
    except IndexError:
        # get_prediction raises IndexError (from .iloc[0]) when a URL has no
        # row in rdf. Only that case is treated as "missing"; anything else
        # (a renamed column, a keyboard interrupt, ...) now propagates instead
        # of silently turning the whole result into None.
        p1_pred = p2_pred = None
        p1_akpd_score = p2_akpd_score = None
    p1_preds.append(p1_pred)
    p2_preds.append(p2_pred)
    p1_akpd_scores.append(p1_akpd_score)
    p2_akpd_scores.append(p2_akpd_score)
        

    

In [None]:
# Relative gap between the mean estimated_weight_g and the mean
# nn_epoch_798_estimated_weight_g, over rows with akpd_score > 0.9.
mask = rdf['akpd_score'] > 0.9
mean_estimated_weight = rdf.loc[mask, 'estimated_weight_g'].mean()
mean_nn_weight = rdf.loc[mask, 'nn_epoch_798_estimated_weight_g'].mean()
(mean_estimated_weight - mean_nn_weight) / mean_estimated_weight

In [None]:
# Mean estimated_weight_g over the rows selected by the current `mask`.
rdf.loc[mask, 'estimated_weight_g'].mean()

In [None]:
# Mean nn_epoch_798_estimated_weight_g over the rows selected by the current `mask`.
rdf.loc[mask, 'nn_epoch_798_estimated_weight_g'].mean()

In [None]:
# Attach the per-pair lookups to duplicate_df (the lists are in row order).
duplicate_df['p1_pred'] = p1_preds
duplicate_df['p2_pred'] = p2_preds

# Row-wise smaller and larger of the two new predictions.
pair_preds = duplicate_df[['p1_pred', 'p2_pred']]
duplicate_df['pred_min'] = pair_preds.min(axis=1)
duplicate_df['pred_max'] = pair_preds.max(axis=1)

duplicate_df['p1_akpd_score'] = p1_akpd_scores
duplicate_df['p2_akpd_score'] = p2_akpd_scores


In [None]:
# Root-mean-square of |max_w - min_w| / min_w over all duplicate pairs.
original_rel_spread = (duplicate_df['max_w'] - duplicate_df['min_w']).abs() / duplicate_df['min_w']
(original_rel_spread ** 2).mean() ** 0.5

In [None]:
# Root-mean-square of |pred_max - pred_min| / pred_min over duplicate pairs
# where both images have an AKPD score above 0.99.
#
# The mask is built here from duplicate_df itself. This cell used to reuse the
# global `mask`, which on a top-to-bottom run is still `rdf.akpd_score > 0.9`:
# a boolean Series indexed by rdf's rows, so it selected duplicate pairs by
# the scores of unrelated rdf rows.
# NOTE(review): the 0.99 threshold mirrors the mask defined in the histogram
# cell further down; confirm it is the intended cut-off here.
pair_mask = (duplicate_df.p1_akpd_score > 0.99) & (duplicate_df.p2_akpd_score > 0.99)
confident_pairs = duplicate_df[pair_mask]
(((confident_pairs.pred_max - confident_pairs.pred_min).abs() / confident_pairs.pred_min)**2).mean() ** 0.5

In [None]:
plt.figure(figsize=(20, 10))

# Signed percentage difference between the two weights of each duplicate
# pair, relative to the pair's average: once for the original weights
# (p1_w / p2_w) and once for the new predictions (p1_pred / p2_pred).
duplicate_df['original_pred_avg'] = 0.5*(duplicate_df.p1_w + duplicate_df.p2_w)
duplicate_df['original_pct_diff'] = 100*(duplicate_df.p1_w - duplicate_df.p2_w) / duplicate_df.original_pred_avg
duplicate_df['new_pred_avg'] = 0.5 * (duplicate_df.p1_pred + duplicate_df.p2_pred)
duplicate_df['new_pct_diff'] = 100*(duplicate_df.p1_pred - duplicate_df.p2_pred) / duplicate_df.new_pred_avg

# Shared bin edges: 5-point-wide bins from -50 to +50 inclusive. np.arange
# excludes its stop value, so the stop is 51 to keep the +50 edge; a stop of
# 50 ends the edges at +45 and drops differences in [45, 50) while still
# showing [-50, -45).
pct_diff_bins = np.arange(-50, 51, 5)

# plot original deviations between duplicates
# Each row gets weight 1/N so bar heights are fractions of all plotted rows.
plt.hist(duplicate_df.original_pct_diff,
         weights=np.ones(duplicate_df.shape[0])/duplicate_df.shape[0],
         bins=pct_diff_bins,
         color='blue',
         label='Before AKPR (Automatic Key-point Refinement)',
         alpha=0.8)

# plot deviations between duplicates after AKPR
# Restricted to pairs where both images have an AKPD score above 0.99.
mask = (duplicate_df.p1_akpd_score > 0.99) & (duplicate_df.p2_akpd_score > 0.99)
plt.hist(duplicate_df[mask].new_pct_diff,
         weights=np.ones(duplicate_df[mask].shape[0])/duplicate_df[mask].shape[0],
         bins=pct_diff_bins,
         color='red',
         label='After AKPR',
         alpha=0.8)

plt.xlabel('Adjusted Percentage Difference Between Duplicates (%)')
plt.ylabel('Frequency')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Mean of the min_w column over the duplicate pairs.
duplicate_df['min_w'].mean()

In [None]:
# Mean of the max_w column over the duplicate pairs.
duplicate_df['max_w'].mean()

In [None]:
# Mean new prediction for the first image of each pair.
duplicate_df['p1_pred'].mean()

In [None]:
# Mean new prediction for the second image of each pair.
duplicate_df['p2_pred'].mean()

In [None]:
# Mean of the per-pair smaller new prediction.
duplicate_df['pred_min'].mean()

In [None]:
# Mean of the per-pair larger new prediction.
duplicate_df['pred_max'].mean()

In [None]:
# Mean of (estimated_weight_g - nn_epoch_798_estimated_weight_g) over rows
# where both AKPD scores (pre- and post-refinement columns) are at least 0.99.
mask = (rdf['post_refinement_akpd_score'] >= 0.99) & (rdf['akpd_score'] >= 0.99)
high_score_rows = rdf[mask]
(high_score_rows['estimated_weight_g'] - high_score_rows['nn_epoch_798_estimated_weight_g']).mean()

In [None]:
# Display duplicate_df, which by this point also carries the prediction,
# AKPD-score and percentage-difference columns added by earlier cells.
duplicate_df