In [None]:
import json
import os
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils
from report_generation.report_generator import generate_ts_data, SamplingFilter
from research.utils.datetime_utils import add_days
from report_generation.report_generator import gen_pm_base
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, ValidationError
from filter_optimization.filter_optimization_task import _add_date_hour_columns
from research.weight_estimation.keypoint_utils.optics import pixel2world
import numpy as np

# Raise the pandas display limit so that frames of up to 500 rows are shown without truncation.
pd.set_option('display.max_rows', 500)

In [None]:
# Build the S3 helper from the credentials JSON whose path is given by the
# AWS_CREDENTIALS environment variable. The file is read inside a `with` block so
# the handle is closed as soon as the JSON has been parsed.
# NOTE(review): '/root/data' is presumably the local download root -- confirm against S3AccessUtils.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)
s3 = S3AccessUtils('/root/data', aws_credentials)

In [None]:
# Cohorts to evaluate. Each name is used below both as the sub-directory of the
# production-dataset URL and as the key into `dfs` / `gt_metadatas`.
# The names appear to follow <site>_pen_id_<pen id>_<start date>_<end date> -- confirm.
# The two entries left commented out at the end are excluded from the run.
cohort_names = [
    'aplavika_pen_id_95_2020-07-10_2020-07-26',
    'bolaks_pen_id_88_2020-02-28_2020-03-10',
    'dale_pen_id_143_2020-10-07_2020-10-21',
    'djubawik_pen_id_153_2020-11-10_2020-11-26',
    'eldviktaren_pen_id_164_2020-09-21_2020-10-08',
    'langoy_pen_id_108_2020-05-07_2020-05-17',
    'leivsethamran_pen_id_165_2020-10-18_2020-11-13',
    'movikodden_pen_id_114_2020-11-03_2020-11-25',
    'movikodden_pen_id_167_2020-10-13_2020-10-30',
    'seglberget_pen_id_66_2020-05-13_2020-06-13',
    'silda_pen_id_86_2020-07-02_2020-07-19',
    'slapoya_pen_id_116_2020-10-18_2020-11-08',
    'tittelsnes_pen_id_37_2020-06-10_2020-06-24',
    'varholmen_pen_id_131_2020-08-15_2020-08-30',
    'varholmen_pen_id_151_2020-10-02_2020-10-17',
    'varholmen_pen_id_186_2020-10-18_2020-11-02',
    'vikane_pen_id_60_2020-08-10_2020-08-30',
#     'kjeppevikholmen_pen_id_5_2019-06-18_2019-07-02',
#     'habranden_pen_id_100_2020-08-10_2020-08-31'
]

In [None]:
batch_name = 'test'

ROOT_DIR = '/root/data/alok/biomass_estimation/playground'

# Per-cohort inputs, both keyed by cohort name:
#   dfs          -> annotation dataset after _add_date_hour_columns
#   gt_metadatas -> parsed ground_truth_metadata.json
dfs, gt_metadatas = {}, {}
for cohort_name in cohort_names:
    s3_dir = os.path.join(
        'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets',
        cohort_name
    )

    # Ground-truth metadata is downloaded to an explicit location under
    # ROOT_DIR/<batch_name>/<cohort_name>/ and then parsed.
    ground_truth_metadata_url = os.path.join(s3_dir, 'ground_truth_metadata.json')
    ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata.json')
#     ground_truth_metadata_url = os.path.join(s3_dir, 'ground_truth_metadata_validated.json')
#     ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata_validated.json')
    ground_truth_f = os.path.join(ROOT_DIR, ground_truth_key_base)
    print(ground_truth_metadata_url)
    s3.download_from_url(ground_truth_metadata_url, custom_location=ground_truth_f)
    # Close the metadata file as soon as it is parsed instead of leaking one
    # open handle per cohort.
    with open(ground_truth_f) as ground_truth_file:
        gt_metadata = json.load(ground_truth_file)
    gt_metadatas[cohort_name] = gt_metadata

    # The annotation dataset is downloaded to whatever location the helper
    # picks; only the first element of the returned triple (the local path) is used.
    data_url = os.path.join(s3_dir, 'annotation_dataset.csv')
    data_f, _, _ = s3.download_from_url(data_url)
    df = pd.read_csv(data_f)
    df = _add_date_hour_columns(df)
    dfs[cohort_name] = df
    
    

In [None]:
# gt_metadatas

<h1> Generate average weight accuracy </h1>

In [None]:
def generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding):
    """Return the individual weight values for the date a fixed number of days after last feeding.

    The target date is ``gt_metadata['last_feeding_date']`` shifted by
    ``days_post_feeding`` via ``add_days``. That date, ``max_day_diff`` and
    ``apply_growth_rate`` are forwarded to ``generate_smart_individual_values``
    together with the fixed positional arguments ``True`` and ``0.9``
    (NOTE(review): their meaning is defined by that helper -- confirm there).

    ``start_hour``, ``end_hour`` and ``final_days_post_feeding`` are accepted
    but not used in this function body.

    Returns:
        The first element of the pair returned by
        ``generate_smart_individual_values``; the second element is discarded.
    """
    target_date = add_days(gt_metadata['last_feeding_date'], days_post_feeding)
    individual_values = generate_smart_individual_values(
        pm_base,
        target_date,
        max_day_diff,
        True,
        apply_growth_rate,
        0.9,
    )
    weights, _ = individual_values
    return weights


def generate_average_weight_accuracy(weights, gt_metadata, loss_factor):
    """Relative error of the predicted gutted average weight.

    The mean of ``weights`` is scaled by ``(1.0 - loss_factor)`` and compared
    with ``gt_metadata['gutted_average_weight']``.

    Returns:
        ``(prediction - ground_truth) / ground_truth``; positive when the
        prediction exceeds the ground truth.
    """
    retained_fraction = 1.0 - loss_factor
    predicted_gutted_weight = np.mean(weights) * retained_fraction
    reference_weight = gt_metadata['gutted_average_weight']
    return (predicted_gutted_weight - reference_weight) / reference_weight

def generate_distribution_accuracy(weights, gt_metadata, loss_factor):
    """Per-bucket error between predicted and ground-truth gutted-weight distributions.

    Args:
        weights: individual weight predictions; any sequence or array accepted
            by ``np.asarray`` (a plain Python list works too).
        gt_metadata: mapping whose 'gutted_weight_distribution' entry is either
            ``None`` or a dict from '<lower>-<upper>' bucket labels to the
            ground-truth share of that bucket in percent.
        loss_factor: fraction removed from every weight before bucketing; each
            weight is scaled by ``(1.0 - loss_factor)``.

    Returns:
        A list with one entry per bucket, in the dict's iteration order:
        predicted fraction minus ground-truth fraction (both on a 0-1 scale).
        An empty list when no ground-truth distribution is available.
    """
    gutted_weight_distribution = gt_metadata['gutted_weight_distribution']
    if gutted_weight_distribution is None:
        return []

    # Coerce to an array so the element-wise scaling and comparisons below also
    # work when the caller passes a plain list.
    gutted_weights = np.asarray(weights, dtype=float) * (1.0 - loss_factor)

    count_distribution_errors = []
    for bucket, gt_pct_points in gutted_weight_distribution.items():
        lower_bound, upper_bound = bucket.split('-')
        # Bucket bounds are multiplied by 1000 to match the scale of the weights
        # (presumably kg bounds vs. gram weights -- confirm). Lower bound is
        # inclusive, upper bound exclusive.
        mask = (gutted_weights >= float(lower_bound) * 1000) & (gutted_weights < float(upper_bound) * 1000)

        pct = np.sum(mask) / len(mask)
        gt_pct = gt_pct_points / 100

        count_distribution_errors.append(pct - gt_pct)

    return count_distribution_errors



In [None]:
# Sweep configuration.
start_hours = [7]
end_hours = [15]
apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 3
loss_factors = [0.14, 0.16] # need to determine the right values here

# Row accumulators: one entry per (cohort, start_hour, end_hour) combination
# that was evaluated successfully.
cohort_name_col = []
start_hour_col = []
end_hour_col = []
# Kept defined for backward compatibility but intentionally not populated here:
# the loss factor is a column dimension of the error accumulators below, not a
# per-row value, and the previous per-row append recorded a stale loop variable.
loss_factor_col = []
# Error accumulators indexed as [position in loss_factors][row].
avg_weight_error_col = [[] for _ in loss_factors]
count_distribution_error_col = [[] for _ in loss_factors]

for cohort_name in sorted(dfs):
    print(cohort_name)
    gt_metadata = gt_metadatas[cohort_name]
    df = dfs[cohort_name]
    # Only rows dated up to `final_days_post_feeding` days after the last
    # feeding date are passed on.
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)

    for start_hour in start_hours:
        for end_hour in end_hours:
            sampling_filter = SamplingFilter(
                start_hour=start_hour,
                end_hour=end_hour,
                kf_cutoff=0.0,
                akpd_score_cutoff=0.95
            )
            tdf = df[df.date <= final_date_post_feeding]
            pm_base = gen_pm_base(tdf, sampling_filter)

            try:
                weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
            except ValidationError as err:
                # Still skip the combination, but say so: a silently missing
                # cohort would otherwise only show up as a shorter results table.
                print('Skipping %s (hours %s-%s): %s' % (cohort_name, start_hour, end_hour, err))
                continue

            cohort_name_col.append(cohort_name)
            start_hour_col.append(start_hour)
            end_hour_col.append(end_hour)

            for index, loss_factor in enumerate(loss_factors):
                avg_weight_err = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)
                avg_weight_error_col[index].append(avg_weight_err)

                count_distribution_errors = generate_distribution_accuracy(weights, gt_metadata, loss_factor)
                count_distribution_error_col[index].append(count_distribution_errors)


In [None]:
# Assemble the results table: one row per evaluated combination, plus three
# error columns for every loss factor (signed error, absolute error, and the
# mean absolute count-distribution error across buckets).
columns = dict(
    cohort_name=cohort_name_col,
    start_hour_col=start_hour_col,
    end_hour_col=end_hour_col,
)

for index, loss_factor in enumerate(loss_factors):
    signed_errors = avg_weight_error_col[index]
    bucket_errors_per_row = count_distribution_error_col[index]

    columns['avg_weight_error_%0.2f' % (loss_factor,)] = signed_errors
    columns['avg_weight_error_abs_%0.2f' % (loss_factor,)] = np.abs(signed_errors)
    columns['avg_count_dist_error_abs_%0.2f' % (loss_factor,)] = [
        np.mean(np.abs(bucket_errors)) for bucket_errors in bucket_errors_per_row
    ]

tdf = pd.DataFrame(columns)

In [None]:
# Display the results table.
tdf

In [None]:
# Headline statistics for a single loss factor, selected by its position in
# `loss_factors`. All values are printed as percentages.
index = 0
loss_factor = loss_factors[index]
col_abs_name = 'avg_weight_error_abs_%0.2f' % (loss_factor,)
error = tdf[col_abs_name]

# (format string, statistic) pairs, evaluated and printed one at a time.
summary_stats = (
    ('Avg %0.1f', np.mean),
    ('90th Pct %0.1f', lambda values: np.percentile(values, 90)),
    ('Max %0.1f', np.max),
)

print('Loss factor', loss_factor)

print()

print('Average Weight Error')
for template, statistic in summary_stats:
    print(template % (statistic(error) * 100, ))

print()

# Flatten the per-row lists of bucket errors into a single list.
dist_errors = []
for sublist in count_distribution_error_col[index]:
    dist_errors.extend(sublist)

print('Count Distribution Error')
for template, statistic in summary_stats:
    print(template % (statistic(np.abs(dist_errors)) * 100, ))

In [None]:
# for cohort_name in cohort_names:
#     mask = tdf.cohort_name == cohort_name
#     print(tdf[mask].sort_values('avg_weight_error_abs', ascending=True).head(10))

In [None]:
# Inspect the ground-truth metadata of the Vikane cohort. The key must match an
# entry of `cohort_names`; the previous '..._2020-08-05_2020-08-30' key is not
# loaded by this notebook and raised KeyError.
gt_metadatas['vikane_pen_id_60_2020-08-10_2020-08-30']

In [None]:
# List the distinct cohort names present in the results table.
tdf.cohort_name.unique()

In [None]:
# NOTE(review): this cell looks stale relative to the `tdf` assembled from `columns`
# in this notebook. That frame only holds cohort_name, start_hour_col, end_hour_col and
# the per-loss-factor error columns, so days_post_feeding, final_days_post_feeding,
# max_day_diff, loss_factor and 'avg_weight_error_abs' are not present. The cohort
# string is also not in `cohort_names`, and 0.17 is not in `loss_factors`.
# Confirm whether this should be updated or removed.
mask = (tdf.cohort_name == 'tittelsnes_pen_id_37_2020-05-23_2020-06-24') & (tdf.days_post_feeding == 1) & (tdf.final_days_post_feeding == 1) & (tdf.max_day_diff == 3) & (tdf.loss_factor == 0.17)
tdf[mask].sort_values('avg_weight_error_abs')



In [None]:
# Median absolute average-weight error for rows with start hour 6.
# NOTE(review): the `tdf` assembled from `columns` in this notebook has no
# days_post_feeding, final_days_post_feeding, max_day_diff or avg_weight_error_abs
# columns, and `start_hours` only contains 7 -- this cell looks stale; confirm.
mask = (tdf.start_hour_col == 6) & (tdf.days_post_feeding == 1) & (tdf.final_days_post_feeding == 1) & (tdf.max_day_diff == 3)
tdf[mask].avg_weight_error_abs.median()



In [None]:
# Median absolute average-weight error for rows with start hour 7.
# NOTE(review): the `tdf` assembled from `columns` in this notebook has no
# days_post_feeding, final_days_post_feeding, max_day_diff or avg_weight_error_abs
# columns -- this cell looks stale; confirm.
mask = (tdf.start_hour_col == 7) & (tdf.days_post_feeding == 1) & (tdf.final_days_post_feeding == 1) & (tdf.max_day_diff == 3)
tdf[mask].avg_weight_error_abs.median()



In [None]:
# Aggregate the results table over cohorts: for every parameter combination,
# record the std, mean absolute value and mean of the average-weight error.
# The accumulator lists defined earlier in the notebook are re-bound to fresh
# lists here (cohort_name_col is reset but never appended to in this cell).
# NOTE(review): apply_growth_rate_list, max_day_diff_list, days_post_feeding_list and
# final_days_post_feeding_list are not defined in any cell visible here, and the `tdf`
# assembled from `columns` has no apply_growth_rate, max_day_diff, days_post_feeding,
# final_days_post_feeding, loss_factor, avg_weight_error or avg_weight_error_abs
# columns -- this cell appears to target an earlier layout of `tdf`; confirm.
cohort_name_col = []
start_hour_col = []
end_hour_col = []
apply_growth_rate_col = []
max_day_diff_col = []
days_post_feeding_col = []
final_days_post_feeding_col = []
loss_factor_col = []
std_avg_weight_error_col = []
abs_avg_weight_error_col = []
mean_avg_weight_error_col = []

for start_hour in start_hours:
    for end_hour in end_hours:
        for apply_growth_rate in apply_growth_rate_list:
            for max_day_diff in max_day_diff_list:
                for days_post_feeding in days_post_feeding_list:
                    for final_days_post_feeding in final_days_post_feeding_list:
                        for loss_factor in loss_factors:
                            # Select the rows matching this exact parameter combination.
                            mask = (tdf.start_hour_col == start_hour) & \
                            (tdf.end_hour_col == end_hour) & \
                            (tdf.apply_growth_rate == apply_growth_rate) & \
                            (tdf.max_day_diff == max_day_diff) & \
                            (tdf.days_post_feeding == days_post_feeding) & \
                            (tdf.final_days_post_feeding == final_days_post_feeding) & \
                            (tdf.loss_factor == loss_factor)
                            
                            start_hour_col.append(start_hour)
                            end_hour_col.append(end_hour)
                            apply_growth_rate_col.append(apply_growth_rate)
                            max_day_diff_col.append(max_day_diff)
                            days_post_feeding_col.append(days_post_feeding)
                            final_days_post_feeding_col.append(final_days_post_feeding)
                            loss_factor_col.append(loss_factor)
                            std_avg_weight_error_col.append(tdf[mask].avg_weight_error.std())
                            abs_avg_weight_error_col.append(tdf[mask].avg_weight_error_abs.mean())
                            mean_avg_weight_error_col.append(tdf[mask].avg_weight_error.mean())

In [None]:
# Collect the per-combination aggregates into a summary frame; the column order
# follows the order of the pairs below.
summary_columns = [
    ('start_hour_col', start_hour_col),
    ('end_hour_col', end_hour_col),
    ('apply_growth_rate', apply_growth_rate_col),
    ('max_day_diff', max_day_diff_col),
    ('days_post_feeding', days_post_feeding_col),
    ('final_days_post_feeding', final_days_post_feeding_col),
    ('loss_factor', loss_factor_col),
    ('abs_avg_weight_error', abs_avg_weight_error_col),
    ('std_avg_weight_error', std_avg_weight_error_col),
    ('mean_avg_weight_error', mean_avg_weight_error_col),
]
rdf = pd.DataFrame(dict(summary_columns))



In [None]:
# Display the aggregated summary frame.
rdf

In [None]:
# Parameter combinations for loss factor 0.16, best (lowest) mean absolute error first.
mask = rdf['loss_factor'] == 0.16
rdf.loc[mask].sort_values(by='abs_avg_weight_error')

In [None]:
# Persist the current `tdf` to CSV; the target directory is the same path that is
# stored in ROOT_DIR.
tdf.to_csv('/root/data/alok/biomass_estimation/playground/smart_average_param_grid_search.csv')

In [None]:
# NOTE(review): 'bolaks_pen_id_88_2020-02-10_2020-03-10' is not in `cohort_names`
# (the listed Bolaks cohort starts 2020-02-28), and the `tdf` assembled from `columns`
# has no 'avg_weight_error_abs' column (its columns carry a loss-factor suffix) --
# this cell looks stale; confirm.
tdf[(tdf.cohort_name == 'bolaks_pen_id_88_2020-02-10_2020-03-10')].sort_values('avg_weight_error_abs')



In [None]:
# generate Vikane average weight and distribution error - explore basic parameters

# Look the cohort up explicitly instead of relying on `ground_truth_f` and `df`
# left over from the last iteration of earlier loops: those only point at Vikane
# as long as it is the last cohort processed. Using the metadata already parsed
# into `gt_metadatas` also avoids re-opening the JSON file without closing it.
vikane_cohort_name = 'vikane_pen_id_60_2020-08-10_2020-08-30'
ground_truth_metadata = gt_metadatas[vikane_cohort_name]
df = dfs[vikane_cohort_name]

# Use the two days immediately before the day after feeding stopped.
day_after_feeding_stop = add_days(ground_truth_metadata['last_feeding_date'], 1)
start_date, end_date = add_days(day_after_feeding_stop, -2), add_days(day_after_feeding_stop, -1)
# NOTE: this rebinds `tdf`, which earlier cells use for the results table.
tdf = df[(df.date >= start_date) & (df.date <= end_date)].copy(deep=True)

sampling_filter = SamplingFilter(
    start_hour=7,
    end_hour=15,
    akpd_score_cutoff=0.95,
    kf_cutoff=0.0
)
pm_base = gen_pm_base(tdf, sampling_filter)
# Only the first element of the returned pair (the individual weights) is kept.
weights, _ = generate_smart_individual_values(pm_base, day_after_feeding_stop, 3, True, True, 0.9)


In [None]:
# Mean of the individual weights computed in the previous cell.
np.mean(weights)