In [None]:
import json
import os
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils
from report_generation.report_generator import generate_ts_data, SamplingFilter
from research.utils.datetime_utils import add_days
from report_generation.report_generator import gen_pm_base
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, ValidationError
from filter_optimization.filter_optimization_task import _add_date_hour_columns
from research.weight_estimation.keypoint_utils.optics import pixel2world
import numpy as np

# Let pandas render up to 500 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 500)

In [None]:
# Read the AWS credentials from the JSON file named by $AWS_CREDENTIALS.
# The context manager closes the file handle instead of leaking it.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)

s3 = S3AccessUtils('/root/data', aws_credentials)

In [None]:
# Cohorts to evaluate. Each name is used as the sub-directory of the
# production_datasets S3 prefix to download from, and as the key into
# `dfs` / `gt_metadatas`.
# NOTE(review): names look like <site>_pen_id_<id>_<start date>_<end date> - confirm.
cohort_names = [
    'seglberget_pen_id_66_2020-05-13_2020-06-13',
    'bolaks_pen_id_88_2020-02-10_2020-03-10',
    'langoy_pen_id_108_2020-05-07_2020-05-17',
    'tittelsnes_pen_id_37_2020-05-23_2020-06-24',
    'aplavika_pen_id_95_2020-06-26_2020-07-26',
    'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02',
    'silda_pen_id_86_2020-06-19_2020-07-19',
    'vikane_pen_id_60_2020-08-05_2020-08-30',
    'eldviktaren_pen_id_164_2020-09-06_2020-10-06',
    'habranden_pen_id_100_2020-08-10_2020-08-31'
]

In [None]:
# Download, for every cohort, its ground-truth metadata (JSON) and its
# annotation dataset (CSV), and keep both in dicts keyed by cohort name.
ROOT_DIR = '/root/data/alok/biomass_estimation/playground'
batch_name = 'mirror'
dfs, gt_metadatas = {}, {}
for cohort_name in cohort_names:
    s3_dir = os.path.join(
        'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets',
        cohort_name
    )

    # ground-truth metadata: downloaded to an explicit location under ROOT_DIR
    ground_truth_metadata_url = os.path.join(s3_dir, 'ground_truth_metadata.json')
    ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata.json')
    ground_truth_f = os.path.join(ROOT_DIR, ground_truth_key_base)
    s3.download_from_url(ground_truth_metadata_url, custom_location=ground_truth_f)
    # close the file handle deterministically instead of leaking it
    with open(ground_truth_f) as metadata_file:
        gt_metadata = json.load(metadata_file)
    gt_metadatas[cohort_name] = gt_metadata

    # annotation dataset: downloaded to the default location chosen by the S3 helper
    data_url = os.path.join(s3_dir, 'annotation_dataset.csv')
    data_f, _, _ = s3.download_from_url(data_url)
    df = pd.read_csv(data_f)
    df = _add_date_hour_columns(df)
    dfs[cohort_name] = df
    
    

<h1> Generate average weight accuracy </h1>

In [None]:
from population_metrics.population_metrics_base import PopulationMetricsBase, ValidationError
from population_metrics.raw_metrics import get_raw_sample_size, get_raw_weight_values, get_raw_kf_values
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.confidence_metrics import generate_trend_stability, get_raw_and_historical_weights

In [None]:
from typing import Dict, List, Tuple, Union
import numpy as np
from population_metrics.population_metrics_base import PopulationMetricsBase, ValidationError
from population_metrics.raw_metrics import get_raw_sample_size, get_raw_weight_values, get_raw_kf_values
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.confidence_metrics import generate_trend_stability, get_raw_and_historical_weights
from research.utils.datetime_utils import add_days, day_difference, get_dates_in_range

"""
This module contains functions for computing daily level smart features - for example, smart growth rate,
smart distribution, smart average, smart k-factor, smart sample-size, and smart standard deviation. 
Ask Alok for more context behind mathematical formulation.
"""


def get_included_dates(pm_base: PopulationMetricsBase, date: str,
                       max_day_difference: int, incorporate_future: bool) -> List:
    """
    Return the sorted dates inside the window around `date` that are present in
    pm_base.unique_dates.

    The window starts max_day_difference days before `date`. It ends
    max_day_difference days after `date` when incorporate_future is True, and on
    `date` itself otherwise.

    Raises:
        ValidationError: if none of the dates in the window is in pm_base.unique_dates.
    """

    forward_days = max_day_difference if incorporate_future else 0
    window_start = add_days(date, -max_day_difference)
    window_end = add_days(date, forward_days)

    included_dates = []
    for candidate in get_dates_in_range(window_start, window_end):
        if candidate in pm_base.unique_dates:
            included_dates.append(candidate)
    included_dates.sort()

    if len(included_dates) == 0:
        raise ValidationError('No raw biomass data found in window!')
    return included_dates


def get_smart_growth_rate(pm_base: PopulationMetricsBase, date: str,
                          incorporate_future: bool = True, apply_growth_rate: bool = True,
                          trend_stability_threshold: float = 0.9) -> float:
    """
    Pick the growth rate used to adjust weights for the smart average.

    Returns the local growth rate only when apply_growth_rate is set, there is a
    raw sample on `date`, and the trend stability exceeds
    trend_stability_threshold. In every other case, including a ValidationError
    from the underlying computations (which is printed), returns 0.0.
    """

    raw_sample_size = get_raw_sample_size(pm_base, date)
    if not apply_growth_rate:
        return 0.0

    try:
        growth_rate = compute_local_growth_rate(pm_base, date, incorporate_future=incorporate_future)
        trend_stability = generate_trend_stability(pm_base, date, incorporate_future=incorporate_future)
    except ValidationError as err:
        print(str(err))
        return 0.0

    # only trust the growth rate when there is data and the trend is stable enough
    if raw_sample_size and trend_stability and trend_stability > trend_stability_threshold:
        return growth_rate
    return 0.0


def generate_smart_individual_values(pm_base: PopulationMetricsBase, date: str, max_day_difference: int,
                                     incorporate_future: bool, apply_growth_rate: bool,
                                     trend_stability_threshold: float,
                                     reflection_point: Union[float, None] = None) -> np.ndarray:
    """
    Generate growth-rate-adjusted individual weights for the window around a date.

    NOTE: this notebook-local definition shadows the function of the same name
    imported from population_metrics.smart_metrics, and it returns a single
    array of weights, not a (weights, k-factors) tuple.

    Args:
        pm_base: PopulationMetricsBase instance
        date: the date to compute smart individual values for
        max_day_difference: what is the maximum day difference of dates in the window?
        incorporate_future: should future data be incorporated?
        apply_growth_rate: should we apply a growth rate adjustment?
        trend_stability_threshold: if apply_growth_rate is True, what minimum trend_stability_threshold
                                   should we mandate for growth rate adjustment?
        reflection_point: currently unused. It was the input of an experimental
                          step that mirrored the weights below a reflection
                          point; that step is disabled, and the parameter is kept
                          (now optional) so existing callers keep working.
    Returns:
        numpy array holding, for every included date d, the raw weights of d scaled by
        exp(-day_difference(d, date) * growth_rate)
    Raises:
        ValidationError: if no date in the window has raw data (from get_included_dates)
    """

    # validate data
    included_dates = get_included_dates(pm_base, date, max_day_difference, incorporate_future)

    # compute local growth rate to use for smart average
    growth_rate_for_smart_metrics = get_smart_growth_rate(pm_base, date, incorporate_future=incorporate_future,
                                                          apply_growth_rate=apply_growth_rate,
                                                          trend_stability_threshold=trend_stability_threshold)

    # project every date's raw weights onto `date` using the growth rate
    adj_weights = []
    for d in included_dates:
        weights_for_date = get_raw_weight_values(pm_base, d)
        day_diff = day_difference(d, date)
        adj_weights_for_date = np.array(weights_for_date) * np.exp(-day_diff * growth_rate_for_smart_metrics)
        adj_weights.extend(adj_weights_for_date)

    return np.array(adj_weights)




In [None]:
def generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate,
                                   max_day_diff, days_post_feeding, final_days_post_feeding,
                                   reflection_point):
    """
    Compute adjusted individual weights for the date that lies days_post_feeding
    days after the cohort's last feeding date.

    start_hour, end_hour and final_days_post_feeding are accepted but not used
    here. Future data is always incorporated and the trend stability threshold
    is fixed at 0.9.
    """
    target_date = add_days(gt_metadata['last_feeding_date'], days_post_feeding)
    return generate_smart_individual_values(
        pm_base, target_date, max_day_diff, True, apply_growth_rate, 0.9, reflection_point
    )


def generate_average_weight_accuracy(weights, gt_metadata, loss_factor):
    """
    Signed relative error of the predicted gutted average weight.

    The prediction is mean(weights) * (1 - loss_factor); the reference is
    gt_metadata['gutted_average_weight']. The result is
    (prediction - reference) / reference, so a positive value is an over-estimate.
    """
    predicted_gutted = np.mean(weights) * (1.0 - loss_factor)
    reference = gt_metadata['gutted_average_weight']
    return (predicted_gutted - reference) / reference




In [None]:
def generate_depths(df):
    """
    Return one depth value per row of df, in row order.

    For each row, the `annotation` and `camera_metadata` strings are parsed as
    JSON after swapping single quotes for double quotes. The left and right crop
    keypoints are converted to world coordinates with pixel2world, and the depth
    is the median of the index-1 coordinate over all keypoints.
    NOTE(review): index 1 is presumably the distance-from-camera axis - confirm.
    """
    def _parse(raw):
        return json.loads(raw.replace("'", '"'))

    depths = []
    for _, row in df.iterrows():
        annotation = _parse(row.annotation)
        camera_metadata = _parse(row.camera_metadata)
        world_keypoints = pixel2world(annotation['leftCrop'], annotation['rightCrop'], camera_metadata)
        depths.append(np.median([coords[1] for coords in world_keypoints.values()]))
    return depths

def get_reflection_point(tdf, sampling_filter):
    """
    Median estimated weight of the rows with the largest depth values.

    Rows are first restricted to the sampling filter's hour range (inclusive on
    both ends). A depth is computed per row, and the median of
    estimated_weight_g is returned over the rows whose depth lies strictly above
    the 75th percentile.
    """
    late_enough = tdf.hour >= sampling_filter.start_hour
    early_enough = tdf.hour <= sampling_filter.end_hour
    kdf = tdf[late_enough & early_enough].copy(deep=True)

    kdf['depth'] = generate_depths(kdf)
    depth_cutoff = np.percentile(kdf.depth.values, 75)
    farthest = kdf[kdf.depth > depth_cutoff]
    return farthest.estimated_weight_g.median()
    
    
    

In [None]:
# Loss factor per cohort: the gutted weight prediction is
# average weight * (1 - loss_factor), see generate_average_weight_accuracy.
# The trailing comment on each entry records how reliable the value is.
loss_factors = {
    'seglberget_pen_id_66_2020-05-13_2020-06-13': 0.16, # unconfirmed
    'bolaks_pen_id_88_2020-02-10_2020-03-10': 0.17, # confirmed
    'langoy_pen_id_108_2020-05-07_2020-05-17': 0.16, # unknown
    'tittelsnes_pen_id_37_2020-05-23_2020-06-24': 0.16, # confirmed
    'aplavika_pen_id_95_2020-06-26_2020-07-26': 0.1753, # confirmed
    'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02': 0.17, # confirmed
    'silda_pen_id_86_2020-06-19_2020-07-19': .20, # confirmed
    'vikane_pen_id_60_2020-08-05_2020-08-30': .17, # confirmed
    'eldviktaren_pen_id_164_2020-09-06_2020-10-06': .16, # confirmed
    'habranden_pen_id_100_2020-08-10_2020-08-31': .14 # unconfirmed
}

In [None]:
# Starvation time per cohort (units are not stated here - presumably days; TODO confirm).
# The grid search skips every cohort that has no entry in this dict, so the
# commented-out cohorts are excluded from the study.
starvation_times = {
    'seglberget_pen_id_66_2020-05-13_2020-06-13': 7, # unconfirmed
    'bolaks_pen_id_88_2020-02-10_2020-03-10': 9, # 7, 8, 9, 12 - avg is 9. Could do sample size weighted
    'langoy_pen_id_108_2020-05-07_2020-05-17': 12, # confirmed
    'tittelsnes_pen_id_37_2020-05-23_2020-06-24': 7, # 6, 7, 8 - avg is 7
    'aplavika_pen_id_95_2020-06-26_2020-07-26': 6, # confirmed
#     'kjeppevikholmen_pen_id_5_2019-06-05_2019-07-02': 0, # exclude from the study
#     'silda_pen_id_86_2020-06-19_2020-07-19': 0, # unknown, exclude from the study
    'vikane_pen_id_60_2020-08-05_2020-08-30': 9, # confirmed
    'eldviktaren_pen_id_164_2020-09-06_2020-10-06': 13.5, # 12, 13, 14, 15 - avg is 13.5
    'habranden_pen_id_100_2020-08-10_2020-08-31': 5 # unconfirmed
}

In [None]:
# Parameter grid for the smart-average evaluation. Every list currently holds
# a single value, so each cohort is evaluated once.
start_hours = [5]
end_hours = [15]
apply_growth_rate_list = [True]
max_day_diff_list = [3]
days_post_feeding_list = [1]
max_final_days_post_feeding = 1

# One list per result column; every successful evaluation appends one entry to each.
cohort_name_col = []
start_hour_col = []
end_hour_col = []
apply_growth_rate_col = []
max_day_diff_col = []
days_post_feeding_col = []
final_days_post_feeding_col = []
loss_factor_col = []
avg_weight_error_col = []
starvation_time_col = []

# Adjusted weights of the Vikane cohort, kept for the distribution cells below.
vikaneData = None

for cohort_name in sorted(list(dfs.keys())):
    if cohort_name not in starvation_times:
        # cohorts without a starvation time are excluded from the study
        continue
    starvation_time = starvation_times[cohort_name]
    print(cohort_name)
    gt_metadata = gt_metadatas[cohort_name]
    # the loss factor is fixed per cohort rather than searched over
    loss_factor = loss_factors[cohort_name]
    for start_hour in start_hours:
        for end_hour in end_hours:
            for final_days_post_feeding in days_post_feeding_list:
                sampling_filter = SamplingFilter(
                    start_hour=start_hour,
                    end_hour=end_hour,
                    kf_cutoff=0.0,
                    akpd_score_cutoff=0.01
                )
                df = dfs[cohort_name]
                # keep the 14 days leading up to (and including) the final date after feeding stopped
                final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
                tdf = df[(df.date <= final_date_post_feeding) & (df.date >= add_days(final_date_post_feeding, -14))]
                pm_base = gen_pm_base(tdf, sampling_filter)

                reflection_point = get_reflection_point(tdf, sampling_filter)

                for apply_growth_rate in apply_growth_rate_list:
                    for max_day_diff in max_day_diff_list:
                        for days_post_feeding in [1]:
                            try:
                                weights = generate_raw_individual_values(
                                    pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate,
                                    max_day_diff, days_post_feeding, final_days_post_feeding,
                                    reflection_point)
                                if cohort_name == 'vikane_pen_id_60_2020-08-05_2020-08-30':
                                    vikaneData = weights
                            except ValidationError as err:
                                # report the skipped combination instead of dropping it silently
                                print('Skipping %s: %s' % (cohort_name, err))
                                continue
                            avg_weight_err = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)

                            cohort_name_col.append(cohort_name)
                            start_hour_col.append(start_hour)
                            end_hour_col.append(end_hour)
                            apply_growth_rate_col.append(apply_growth_rate)
                            max_day_diff_col.append(max_day_diff)
                            days_post_feeding_col.append(days_post_feeding)
                            final_days_post_feeding_col.append(final_days_post_feeding)
                            loss_factor_col.append(loss_factor)
                            avg_weight_error_col.append(avg_weight_err)
                            starvation_time_col.append(starvation_time)



In [None]:
# Edges of the 1000-wide weight buckets and their lower edges.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[:-1])

# Print, per bucket (lower, upper], the fraction of Vikane weights that fall into it.
for lower, upper in zip(buckets[:-1], buckets[1:]):
    mask1 = (vikaneData > lower) & (vikaneData <= upper)
    print(np.sum(mask1) / len(mask1), ',')

In [None]:
# Mean and standard deviation of the adjusted Vikane weights.
np.mean(vikaneData), np.std(vikaneData)

In [None]:
# Assemble the grid-search results: one row per evaluated (cohort, parameters) combination.
grid_search_columns = {
    'cohort_name': cohort_name_col,
    'start_hour_col': start_hour_col,
    'end_hour_col': end_hour_col,
    'apply_growth_rate': apply_growth_rate_col,
    'max_day_diff': max_day_diff_col,
    'days_post_feeding': days_post_feeding_col,
    'final_days_post_feeding': final_days_post_feeding_col,
    'loss_factor': loss_factor_col,
    'avg_weight_error': avg_weight_error_col,
    'starvation_time_col': starvation_time_col,
}
tdf = pd.DataFrame(grid_search_columns)

# magnitude of the signed relative error, used for ranking
tdf['avg_weight_error_abs'] = tdf['avg_weight_error'].abs()

In [None]:
# Show the full grid-search results table.
tdf

In [None]:
import matplotlib.pyplot as plt

# Signed average weight error against starvation time, one point per result row.
# The horizontal line at 0 marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(tdf.starvation_time_col, tdf.avg_weight_error)
ax.axhline(0)
ax.set(
    xlabel='starvation time',
    ylabel='relative average weight error',
    title='Average weight error vs. starvation time',
);

In [None]:
# Per cohort, print the result rows with days_post_feeding == 1, smallest absolute error first.
report_columns = ['cohort_name', 'avg_weight_error', 'loss_factor']
for cohort_name in cohort_names:
    mask = (tdf.cohort_name == cohort_name) & (tdf.days_post_feeding == 1)
    cohort_rows = tdf[mask].sort_values('avg_weight_error_abs', ascending=True)
    print(cohort_rows[report_columns])

In [None]:
# Inspect the ground-truth metadata downloaded for the Vikane cohort.
gt_metadatas['vikane_pen_id_60_2020-08-05_2020-08-30']

In [None]:
# Cohorts that produced at least one row in the results table.
tdf.cohort_name.unique()

In [None]:
# Results for the Tittelsnes cohort at the default parameters.
# The loss factor is fixed per cohort in `loss_factors`, so it is looked up
# there; the previously hard-coded 0.17 does not match the configured value
# for this cohort and selected no rows.
tittelsnes_cohort = 'tittelsnes_pen_id_37_2020-05-23_2020-06-24'
mask = (
    (tdf.cohort_name == tittelsnes_cohort)
    & (tdf.days_post_feeding == 1)
    & (tdf.final_days_post_feeding == 1)
    & (tdf.max_day_diff == 3)
    & (tdf.loss_factor == loss_factors[tittelsnes_cohort])
)
tdf[mask].sort_values('avg_weight_error_abs')



In [None]:
# Median absolute error over the rows evaluated with start hour 6.
# NOTE(review): the grid above sets start_hours = [5], so this mask selects no
# rows unless the grid is widened to include 6.
mask = (
    (tdf.start_hour_col == 6)
    & (tdf.days_post_feeding == 1)
    & (tdf.final_days_post_feeding == 1)
    & (tdf.max_day_diff == 3)
)
tdf.loc[mask, 'avg_weight_error_abs'].median()



In [None]:
# Median absolute error over the rows evaluated with start hour 7.
# NOTE(review): the grid above sets start_hours = [5], so this mask selects no
# rows unless the grid is widened to include 7.
mask = (
    (tdf.start_hour_col == 7)
    & (tdf.days_post_feeding == 1)
    & (tdf.final_days_post_feeding == 1)
    & (tdf.max_day_diff == 3)
)
tdf.loc[mask, 'avg_weight_error_abs'].median()



In [None]:
# Aggregate the per-cohort errors in `tdf` for every parameter combination:
# standard deviation, mean absolute value and mean of the signed error.
cohort_name_col = []
start_hour_col = []
end_hour_col = []
apply_growth_rate_col = []
max_day_diff_col = []
days_post_feeding_col = []
final_days_post_feeding_col = []
loss_factor_col = []
std_avg_weight_error_col = []
abs_avg_weight_error_col = []
mean_avg_weight_error_col = []

# Take these two axes from the values actually present/configured:
# - no `final_days_post_feeding_list` is defined anywhere in the notebook, so
#   use the values recorded in the results table;
# - `loss_factors` is a dict keyed by cohort name, so iterate over its distinct
#   values (iterating the dict itself yields cohort names, which never match
#   the float loss_factor column).
final_days_post_feeding_values = sorted(tdf.final_days_post_feeding.unique().tolist())
loss_factor_values = sorted(set(loss_factors.values()))

for start_hour in start_hours:
    for end_hour in end_hours:
        for apply_growth_rate in apply_growth_rate_list:
            for max_day_diff in max_day_diff_list:
                for days_post_feeding in days_post_feeding_list:
                    for final_days_post_feeding in final_days_post_feeding_values:
                        for loss_factor in loss_factor_values:
                            mask = (tdf.start_hour_col == start_hour) & \
                            (tdf.end_hour_col == end_hour) & \
                            (tdf.apply_growth_rate == apply_growth_rate) & \
                            (tdf.max_day_diff == max_day_diff) & \
                            (tdf.days_post_feeding == days_post_feeding) & \
                            (tdf.final_days_post_feeding == final_days_post_feeding) & \
                            (tdf.loss_factor == loss_factor)
                            matching_rows = tdf[mask]

                            start_hour_col.append(start_hour)
                            end_hour_col.append(end_hour)
                            apply_growth_rate_col.append(apply_growth_rate)
                            max_day_diff_col.append(max_day_diff)
                            days_post_feeding_col.append(days_post_feeding)
                            final_days_post_feeding_col.append(final_days_post_feeding)
                            loss_factor_col.append(loss_factor)
                            std_avg_weight_error_col.append(matching_rows.avg_weight_error.std())
                            abs_avg_weight_error_col.append(matching_rows.avg_weight_error_abs.mean())
                            mean_avg_weight_error_col.append(matching_rows.avg_weight_error.mean())

In [None]:
# Assemble the aggregated results: one row per parameter combination.
aggregate_columns = {
    'start_hour_col': start_hour_col,
    'end_hour_col': end_hour_col,
    'apply_growth_rate': apply_growth_rate_col,
    'max_day_diff': max_day_diff_col,
    'days_post_feeding': days_post_feeding_col,
    'final_days_post_feeding': final_days_post_feeding_col,
    'loss_factor': loss_factor_col,
    'abs_avg_weight_error': abs_avg_weight_error_col,
    'std_avg_weight_error': std_avg_weight_error_col,
    'mean_avg_weight_error': mean_avg_weight_error_col,
}
rdf = pd.DataFrame(aggregate_columns)



In [None]:
# Show the aggregated results table.
rdf

In [None]:
# Aggregated rows for loss factor 0.16, smallest mean absolute error first.
mask = rdf.loss_factor.eq(0.16)
rdf.loc[mask].sort_values(by='abs_avg_weight_error')

In [None]:
# Persist the grid-search results table next to the other playground artifacts under ROOT_DIR.
grid_search_csv = os.path.join(ROOT_DIR, 'smart_average_param_grid_search.csv')
tdf.to_csv(grid_search_csv)

In [None]:
# Result rows of the Bolaks cohort, smallest absolute error first.
tdf[(tdf.cohort_name == 'bolaks_pen_id_88_2020-02-10_2020-03-10')].sort_values('avg_weight_error_abs')



In [None]:
# generate Vikane average weight and distribution error - explore basic parameters

# Select the Vikane cohort explicitly. Relying on `ground_truth_f` and `df`
# left over from earlier loops pairs the metadata of the last downloaded
# cohort with the data frame of the last cohort evaluated in the grid search.
vikane_cohort = 'vikane_pen_id_60_2020-08-05_2020-08-30'
ground_truth_metadata = gt_metadatas[vikane_cohort]
vikane_df = dfs[vikane_cohort]

# restrict the data to the two days before the day after feeding stopped
day_after_feeding_stop = add_days(ground_truth_metadata['last_feeding_date'], 1)
start_date, end_date = add_days(day_after_feeding_stop, -2), add_days(day_after_feeding_stop, -1)
vikane_window_df = vikane_df[(vikane_df.date >= start_date) & (vikane_df.date <= end_date)].copy(deep=True)

sampling_filter = SamplingFilter(
    start_hour=7,
    end_hour=15,
    akpd_score_cutoff=0.95,
    kf_cutoff=0.0
)
pm_base = gen_pm_base(vikane_window_df, sampling_filter)

# The notebook-local generate_smart_individual_values returns a single array of
# weights (not a tuple) and takes a reflection_point argument that it does not
# read, so pass None for it.
weights = generate_smart_individual_values(pm_base, day_after_feeding_stop, 3, True, True, 0.9,
                                           reflection_point=None)


In [None]:
# Average of the adjusted individual weights computed in the previous cell.
np.mean(weights)