In [None]:
import json
import os
import pandas as pd
from research.utils.data_access_utils import S3AccessUtils
from report_generation.report_generator import generate_ts_data, SamplingFilter
from research.utils.datetime_utils import add_days
from report_generation.report_generator import gen_pm_base
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, ValidationError
from filter_optimization.filter_optimization_task import _add_date_hour_columns
from research.weight_estimation.keypoint_utils.optics import pixel2world
import numpy as np

pd.set_option('display.max_rows', 500)

In [None]:
# Build the S3 helper from the credentials JSON file named by the AWS_CREDENTIALS
# environment variable. The context manager closes the credentials file as soon as it
# has been parsed instead of leaving the handle open for the lifetime of the kernel.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)
s3 = S3AccessUtils('/root/data', aws_credentials)

In [None]:
# Cohorts whose ground-truth metadata is downloaded by the loading cell below. Each name
# appears exactly once so that no cohort is fetched twice. Commented-out cohorts are
# currently disabled.
cohort_names = [
    'seglberget_pen_id_66_2020-05-13_2020-06-13',
    'bolaks_pen_id_88_2020-02-28_2020-03-10',
    'langoy_pen_id_108_2020-05-07_2020-05-17',
    'tittelsnes_pen_id_37_2020-06-10_2020-06-24',
    'aplavika_pen_id_95_2020-07-10_2020-07-26',
#     'kjeppevikholmen_pen_id_5_2019-06-18_2019-07-02',
    'silda_pen_id_86_2020-07-02_2020-07-19',
    'vikane_pen_id_60_2020-08-10_2020-08-30',
    'eldviktaren_pen_id_164_2020-09-21_2020-10-08',
#     'habranden_pen_id_100_2020-08-10_2020-08-31',
    'varholmen_pen_id_131_2020-08-15_2020-08-30',
    'dale_pen_id_143_2020-10-07_2020-10-21',
    'djubawik_pen_id_153_2020-11-10_2020-11-26',
    'leivsethamran_pen_id_165_2020-10-18_2020-11-13',
    'movikodden_pen_id_114_2020-11-03_2020-11-25',
    'movikodden_pen_id_167_2020-10-13_2020-10-30',
    'slapoya_pen_id_116_2020-10-18_2020-11-08',
    'varholmen_pen_id_151_2020-10-02_2020-10-17',
    'varholmen_pen_id_186_2020-10-18_2020-11-02'
]

# Cohort whose metadata and data are built by hand in a later cell rather than downloaded.
cohort_names2 = [
    'dale_pen_id_144_2020-12-20_2021-01-11'
]

In [None]:
# Camera model label for each cohort, keyed by cohort name. Every key appears once; a
# repeated key in a dict literal silently keeps only the last value, which hides typos.
# Commented-out cohorts are currently disabled.
camera_type = {
    'seglberget_pen_id_66_2020-05-13_2020-06-13': 'sexton',
    'bolaks_pen_id_88_2020-02-28_2020-03-10': 'sexton',
    'langoy_pen_id_108_2020-05-07_2020-05-17': 'sexton',
    'tittelsnes_pen_id_37_2020-06-10_2020-06-24': 'sexton',
    'aplavika_pen_id_95_2020-07-10_2020-07-26': 'sexton',
#     'kjeppevikholmen_pen_id_5_2019-06-18_2019-07-02': 'sexton',
    'silda_pen_id_86_2020-07-02_2020-07-19': 'sexton',
    'vikane_pen_id_60_2020-08-10_2020-08-30': 'atlas',
    'eldviktaren_pen_id_164_2020-09-21_2020-10-08': 'atlas',
#     'habranden_pen_id_100_2020-08-10_2020-08-31': 'imenco',
    'varholmen_pen_id_131_2020-08-15_2020-08-30': 'imenco',
    'dale_pen_id_143_2020-10-07_2020-10-21': 'atlas',
    'djubawik_pen_id_153_2020-11-10_2020-11-26': 'atlas',
    'leivsethamran_pen_id_165_2020-10-18_2020-11-13': 'atlas',
    'movikodden_pen_id_114_2020-11-03_2020-11-25': 'imenco',
    'movikodden_pen_id_167_2020-10-13_2020-10-30': 'imenco',
    'slapoya_pen_id_116_2020-10-18_2020-11-08': 'imenco',
    'varholmen_pen_id_151_2020-10-02_2020-10-17': 'imenco',
    'varholmen_pen_id_186_2020-10-18_2020-11-02': 'atlas',

    # Cohort listed in `cohort_names2`.
    'dale_pen_id_144_2020-12-20_2021-01-11': 'atlas'
}

In [None]:
# Display the ground-truth metadata of one cohort.
# NOTE(review): `gt_metadatas` is first assigned by the loading cell further down, so this
# lookup only works when that cell has already run in the current kernel - confirm the
# intended cell order.
gt_metadatas['varholmen_pen_id_186_2020-10-18_2020-11-02']

In [None]:
batch_name = 'test'

ROOT_DIR = '/root/data/alok/biomass_estimation/playground'

# Download the ground-truth metadata JSON of every cohort and index it by cohort name.
dfs, gt_metadatas = {}, {}
for cohort_name in cohort_names:
    # These are URLs, not filesystem paths, so join the segments with '/' explicitly.
    s3_dir = '/'.join([
        'https://aquabyte-images-adhoc.s3-eu-west-1.amazonaws.com/alok/production_datasets',
        cohort_name
    ])

    ground_truth_metadata_url = '/'.join([s3_dir, 'ground_truth_metadata.json'])
    ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata.json')
#     ground_truth_metadata_url = '/'.join([s3_dir, 'ground_truth_metadata_validated.json'])
#     ground_truth_key_base = os.path.join(batch_name, cohort_name, 'ground_truth_metadata_validated.json')
    ground_truth_f = os.path.join(ROOT_DIR, ground_truth_key_base)
    print(ground_truth_metadata_url)
    s3.download_from_url(ground_truth_metadata_url, custom_location=ground_truth_f)
    # Close the downloaded file as soon as it has been parsed.
    with open(ground_truth_f) as ground_truth_file:
        gt_metadata = json.load(ground_truth_file)
    gt_metadatas[cohort_name] = gt_metadata

    # NOTE(review): loading of the per-cohort annotation dataset is disabled below, so
    # `dfs` is still empty after this cell - confirm whether that is intended.
#     data_url = os.path.join(s3_dir, 'annotation_dataset.csv')
#     data_f, _, _= s3.download_from_url(data_url)
#     df = pd.read_csv(data_f)
#     df = _add_date_hour_columns(df)
#     dfs[cohort_name] = df
    
    

In [None]:
# Display the ground-truth metadata that was loaded for this cohort.
gt_metadatas['varholmen_pen_id_186_2020-10-18_2020-11-02']

In [None]:
# Build the data and ground truth for one extra cohort by hand instead of downloading them.
from filter_optimization.filter_optimization_task import extract_biomass_data

# Same layout as `dfs` / `gt_metadatas`, kept separately: both are keyed by cohort name.
dfs2, gt_metadatas2 = {}, {}

cohort_name = 'dale_pen_id_144_2020-12-20_2021-01-11'

# Hand-entered ground truth for this cohort; it uses the keys that the accuracy functions
# read ('gutted_average_weight', 'gutted_weight_distribution', 'last_feeding_date', ...).
gt_metadata = {'pen_id': 144,
 'gutted_average_weight': 8000,
 'gutted_weight_distribution': None,
 'expected_loss_factor': 0.16,
 'last_feeding_date': '2021-01-11',
 'harvest_date': '2021-01-15',
 'slaughter_date': '2021-01-15'}

gt_metadatas2[cohort_name] = gt_metadata

# NOTE(review): the arguments are presumably (pen id, start date, end date, score cutoff)
# - confirm against extract_biomass_data.
df = extract_biomass_data(gt_metadata['pen_id'], '2021-01-01', '2021-01-12', 0.01)
df = _add_date_hour_columns(df)
dfs2[cohort_name] = df



<h1> Generate old / new model weights </h1>

In [None]:

    
"""This module contains utility helper functions for the WeightEstimator class."""

from collections import namedtuple
from typing import Dict, List, Tuple
import numpy as np
import torch
from research.weight_estimation.keypoint_utils import body_parts


# Immutable record of the camera parameters read by convert_to_world_point_arr.
CameraMetadata = namedtuple(
    'CameraMetadata',
    [
        'focal_length',
        'focal_length_pixel',
        'baseline_m',
        'pixel_count_width',
        'pixel_count_height',
        'image_sensor_width',
        'image_sensor_height',
    ],
)


def get_left_right_keypoint_arrs(annotation: Dict[str, List[Dict]]) -> Tuple:
    """Builds the left and right full-frame keypoint arrays from a keypoint annotation.

    Args:
        annotation: dict with keys 'leftCrop' and 'rightCrop'. Each value is a list of dicts;
        this function reads 'keypointType', 'xFrame' and 'yFrame' from every element.
    Returns:
        X_left: numpy array with one (xFrame, yFrame) row per entry of
        body_parts.core_body_parts, in that order, taken from the left crop.
        X_right: same as above, but for the right crop.
    Raises:
        KeyError: if a crop is missing or a core body part is absent from either crop.
    """

    def _frame_coords_by_part(crop_items):
        # a later duplicate of the same keypointType overrides an earlier one
        return {kp['keypointType']: (kp['xFrame'], kp['yFrame']) for kp in crop_items}

    left_by_part = _frame_coords_by_part(annotation['leftCrop'])
    right_by_part = _frame_coords_by_part(annotation['rightCrop'])

    # look up left then right for each body part so rows of both arrays stay aligned
    paired = [(left_by_part[part], right_by_part[part])
              for part in body_parts.core_body_parts]

    X_left = np.array([left_xy for left_xy, _ in paired])
    X_right = np.array([right_xy for _, right_xy in paired])
    return X_left, X_right


def normalize_left_right_keypoint_arrs(X_left: np.ndarray, X_right: np.ndarray) -> Tuple:
    """Centers and rotates the input left and right key-point arrays.

    The normalization involves (1) translating both arrays by the bounding-box center of one
    of them, swapping and mirroring left/right when the upper lip is not to the right of the
    tail notch in the left array, and (2) rotating the left coordinates so that the UPPER_LIP
    key-point lies on the horizontal axis. The right array is rebuilt from the rotated left
    array and the pre-rotation left/right difference.

    Returns:
        Tuple (X_left_rot, X_right_rot) of normalized arrays.
    """

    core_parts = body_parts.core_body_parts
    lip = core_parts.index(body_parts.UPPER_LIP)
    notch = core_parts.index(body_parts.TAIL_NOTCH)

    def _bbox_center(points):
        # midpoint of the axis-aligned bounding box of the key-points
        return 0.5 * (np.max(points, axis=0) + np.min(points, axis=0))

    if X_left[lip, 0] > X_left[notch, 0]:
        center = _bbox_center(X_left)
        left_centered = X_left - center
        right_centered = X_right - center
    else:
        # swap the roles of the two arrays and mirror the x-axis
        center = _bbox_center(X_right)
        left_centered = X_right - center
        right_centered = X_left - center
        left_centered[:, 0] *= -1
        right_centered[:, 0] *= -1

    # angle of the upper lip about the center; rotating by it zeroes the lip's y-coordinate
    lip_x, lip_y = left_centered[lip]
    theta = np.arctan(lip_y / lip_x)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([
        [cos_t, -sin_t],
        [sin_t, cos_t]
    ])

    # the left/right offset is taken before rotation and re-applied unrotated
    offset = left_centered - right_centered
    left_rotated = np.dot(left_centered, rotation)
    return left_rotated, left_rotated - offset


def convert_to_world_point_arr(X_left: np.ndarray, X_right: np.ndarray,
                               camera_metadata: CameraMetadata) -> np.ndarray:
    """Turns normalized left and right keypoint arrays into an array of (x, y, z) world rows.

    y is computed as focal_length_pixel * baseline_m divided by the left/right x difference;
    x and z scale the left keypoint coordinates by y.
    """

    cm = camera_metadata
    disparity = X_left[:, 0] - X_right[:, 0]
    y_world = cm.focal_length_pixel * cm.baseline_m / disparity

    # Note: dividing by focal_length_pixel directly is technically the correct conversion:
    # x_world = X_left[:, 0] * y_world / camera_metadata.focal_length_pixel
    # z_world = -X_left[:, 1] * y_world / camera_metadata.focal_length_pixel
    # The form below goes through sensor size / pixel count and focal_length instead.
    x_sensor = X_left[:, 0] * cm.image_sensor_width / cm.pixel_count_width
    z_sensor = X_left[:, 1] * cm.image_sensor_height / cm.pixel_count_height
    x_world = (x_sensor * y_world) / (cm.focal_length)
    z_world = (-z_sensor * y_world) / (cm.focal_length)

    return np.vstack([x_world, y_world, z_world]).T


def stabilize_keypoints(X: np.ndarray) -> np.ndarray:
    """Rescales a world coordinate array by its second column before it is fed to the network.

    Returns a new float array of the same shape as X whose first three columns are
    0.5 * X[:, 0] / X[:, 1], 0.5 * X[:, 2] / X[:, 1] and 0.05 / X[:, 1]; X is not modified.
    """
    first, divisor, third = X[:, 0], X[:, 1], X[:, 2]
    stabilized = np.zeros(X.shape)
    stabilized[:, 0] = 0.5 * first / divisor
    stabilized[:, 1] = 0.5 * third / divisor
    stabilized[:, 2] = 0.05 / divisor
    return stabilized


def convert_to_nn_input(annotation: Dict[str, List[Dict]], camera_metadata: CameraMetadata) \
        -> torch.Tensor:
    """Converts input keypoint annotation and camera metadata into neural network tensor input.

    Chains keypoint extraction, normalization, world-coordinate conversion and stabilization,
    then wraps the result in a float tensor with a leading dimension of size one.
    """
    left_px, right_px = get_left_right_keypoint_arrs(annotation)
    left_norm, right_norm = normalize_left_right_keypoint_arrs(left_px, right_px)
    world_points = convert_to_world_point_arr(left_norm, right_norm, camera_metadata)
    stabilized = stabilize_keypoints(world_points)
    # np.array([...]) adds the leading dimension; .float() casts to float32
    return torch.from_numpy(np.array([stabilized])).float()


"""
This module contains the WeightEstimator class for estimating fish weight (g), length (mm), and
k-factor given input keypoint coordinates and camera metadata.
"""

from typing import Dict, Tuple
import torch
from torch import nn


class Network(nn.Module):
    """Fully connected network shared by the weight and k-factor estimators.

    Layout: 24 -> 256 -> 128 -> 64 -> 1 with a ReLU after each of the three hidden layers.
    Attribute names (fc1, fc2, fc3, output) are the state-dict keys of saved models.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(24, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run inference on input keypoint tensor; returns one value per leading-dim entry."""
        # flatten everything after the first dimension
        hidden = x.view(x.shape[0], -1)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.output(hidden)


class WeightEstimator:
    """WeightEstimator class is used to predict fish weight, k-factor, and length
    given input keypoint annotations and camera metadata."""

    def __init__(self, weight_model_f: str, kf_model_f: str) -> None:
        """Initializes class with input weight and k-factor neural-networks.

        Args:
            weight_model_f: path to the saved state dict of the weight network.
            kf_model_f: path to the saved state dict of the k-factor network.
        """
        self.weight_model = self._load_network(weight_model_f)
        self.kf_model = self._load_network(kf_model_f)

    @staticmethod
    def _load_network(model_f: str) -> 'Network':
        """Builds a Network, loads the state dict stored at model_f and puts it in eval mode."""
        network = Network()
        # map_location='cpu' lets a state dict saved from a GPU load on a CPU-only machine;
        # the model input is built on the CPU (torch.from_numpy), so the model stays there.
        network.load_state_dict(torch.load(model_f, map_location='cpu'))
        network.eval()
        return network

    @staticmethod
    def _get_model_input(annotation: Dict, camera_metadata: CameraMetadata) -> torch.Tensor:
        """Generates neural-network input tensor given annotation and camera_metadata."""
        X = convert_to_nn_input(annotation, camera_metadata)
        return X

    @staticmethod
    def _infer(model, X: torch.Tensor) -> float:
        """Runs model on X without tracking gradients and returns the scalar output."""
        with torch.no_grad():
            return model(X).item()

    def predict_weight(self, annotation: Dict, camera_metadata: CameraMetadata) -> float:
        """Generates weight prediction given input annotation and camera metadata."""
        X = self._get_model_input(annotation, camera_metadata)
        # the weight network output is multiplied by 1e4 to obtain the reported weight
        return 1e4 * self._infer(self.weight_model, X)

    def predict_kf(self, annotation: Dict, camera_metadata: CameraMetadata) -> float:
        """Generates k-factor prediction given input annotation and camera metadata."""
        X = self._get_model_input(annotation, camera_metadata)
        return self._infer(self.kf_model, X)

    def predict(self, annotation: Dict, camera_metadata: CameraMetadata) -> Tuple:
        """Generates weight, k-factor, and length predictions given input annotation and camera
        metadata.

        Returns:
            Tuple (weight, length, kf). length is 0 when weight * kf is not positive.
        """
        # build the input tensor once and feed it to both networks
        X = self._get_model_input(annotation, camera_metadata)
        weight = 1e4 * self._infer(self.weight_model, X)
        kf = self._infer(self.kf_model, X)
        if weight * kf > 0:
            # inverts kf = 1e5 * weight / length ** 3
            length = (1e5 * weight / kf) ** (1.0 / 3)
        else:
            length = 0
        return weight, length, kf

# Start here

In [None]:
# Weight models to evaluate. Each entry is unpacked by the cells below as
# (key, tag, model_url, is_url):
#   key       - name of the DataFrame column the predictions are stored in
#   tag       - short label printed next to the regression summary
#   model_url - URL to download when is_url is True, otherwise a local file path
#   is_url    - whether model_url has to be downloaded first
models = [
    ('weight_v1', 'curr-nonsynthetic', 'https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/playground/nn_epoch_798_v2.pb', True),
    ('weight_v2', 'curr-synthetic', 'https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/trained_models/2020-11-27T00-00-00/weight_model_synthetic_data.pb', True),
    ('weight_v3', 'nojitter-ols', '/root/data/alok/biomass_estimation/playground/output_model_bryton.pb', False),
    ('weight_v4', 'jitter-ols', '/root/data/alok/biomass_estimation/playground/output_model_bryton2.pb', False),
    ('weight_v5', 'jitter-nools', '/root/data/alok/biomass_estimation/playground/output_model_bryton3.pb', False),
    ('weight_v6', 'augV1-ols', '/root/data/alok/biomass_estimation/playground/output_model_bryton4.pb', False),
    ('weight_v7', 'augV1-nools', '/root/data/alok/biomass_estimation/playground/output_model_bryton5.pb', False),
    ('weight_v8', 'augV2-ols', '/root/data/alok/biomass_estimation/playground/output_model_bryton6.pb', False),
    ('weight_v9', 'augV3-ols', '/root/data/alok/biomass_estimation/playground/output_model_bryton7.pb', False),
    ('weight_v10', 'augV1-bidir-ols', '/root/data/alok/biomass_estimation/playground/output_model_bryton8.pb', False),
    ('weight_v11', 'augV2-ols-akpd', '/root/data/alok/biomass_estimation/playground/output_model_bryton9.pb', False),
    ('weight_v12', 'augV1-ols-akpd-halfinfl', '/root/data/alok/biomass_estimation/playground/output_model_bryton10.pb', False),
    ('weight_v13', 'augV2-ols-akpd-halfinfl', '/root/data/alok/biomass_estimation/playground/output_model_bryton11.pb', False),
    ('weight_v14', 'augV3-o-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton12.pb', False),
    ('weight_v15', 'augV4-o-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton13.pb', False),
    ('weight_v16', 'augV4-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton14.pb', False),
    ('weight_v17', 'augV4-o-a-h2-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton15.pb', False),
    ('weight_v18', 'augV4-o-a-h3-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton16.pb', False),
    ('weight_v19', 'augV4-o-a-h4-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton17.pb', False),
    ('weight_v20', 'augV4-o-a-h5-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton18.pb', False),
    ('weight_v21', 'augV4-j2-o-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton19.pb', False),
    ('weight_v22', 'augV4-j3-o-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton20.pb', False),
    ('weight_v23', 'augV4-j4-o-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton21.pb', False),
    ('weight_v24', 'augV4-o-a-h-99#2', '/root/data/alok/biomass_estimation/playground/output_model_bryton22.pb', False),
    ('weight_v25', 'noaugV4-o-a-h-99', '/root/data/alok/biomass_estimation/playground/output_model_bryton23.pb', False),
    ('weight_v26', 'augV4-j2-o-a-h-99#2', '/root/data/alok/biomass_estimation/playground/output_model_bryton24.pb', False),
    ('weight_v27', 'augV1-alokj-o-a-h-99#2', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a1.pb', False),
    ('weight_v28', 'augV1-alokj-o-a-h-99-t', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a2.pb', False),
    ('weight_v29', 'augV1-alokj-o-a-h-90', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a3.pb', False),
    ('weight_v30', 'augV1-o-o-a-h-90', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a4.pb', False),
    ('weight_v31', 'augV4-alokj-o-a-h-90', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a5.pb', False),
    ('weight_v32', 'augV4-o-a-h-90', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a6.pb', False)
]

# Same tuple layout as `models`; these two entries repeat the last two entries of `models`.
additional_models = [
    ('weight_v31', 'augV4-alokj-o-a-h-90', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a5.pb', False),
    ('weight_v32', 'augV4-o-a-h-90', '/root/data/alok/biomass_estimation/playground/output_model_bryton_a6.pb', False)
]

In [None]:
# Load the gtsf table; the cells below read its `keypoints`, `camera_metadata` and `weight`
# columns and add one prediction column per model key.
gtsf = pd.read_csv('/root/data/alok/biomass_estimation/playground/gtsf_akpr2.csv')


In [None]:
# Score every row of `gtsf` with each model in `additional_models` and store the predicted
# weights in a column named after the model key.

# The k-factor model is identical for every weight model, so download it once instead of on
# every iteration.
kf_model_f, _, _ = s3.download_from_url('https://aquabyte-models.s3-us-west-1.amazonaws.com/k-factor/playground/kf_predictor_v2.pb')

for key, tag, model_url, is_url in additional_models:
    # `model_url` is either a URL to fetch or an already-local file path, depending on `is_url`.
    if is_url:
        weight_model_f, _, _ = s3.download_from_url(model_url)
    else:
        weight_model_f = model_url

    weight_estimator = WeightEstimator(weight_model_f, kf_model_f)

    weights = []

    for idx, row in gtsf.iterrows():
        # The columns hold single-quoted dict strings; swapping the quotes makes them JSON.
        annotation = json.loads(row.keypoints.replace("'", '"'))
        if not annotation:
            # keep the output aligned with the rows of gtsf
            weights.append(None)
            continue
        camera_metadata = json.loads(row.camera_metadata.replace("'", '"'))
        if not camera_metadata:
            # Fall back to the first row of this same table (`gtsf`) when a row has no camera
            # metadata; no other frame is in scope in this cell.
            camera_metadata = json.loads(gtsf.camera_metadata.iloc[0].replace("'", '"'))

        camera_metadata_obj = CameraMetadata(
            focal_length=camera_metadata['focalLength'],
            focal_length_pixel=camera_metadata['focalLengthPixel'],
            baseline_m=camera_metadata['baseline'],
            pixel_count_width=camera_metadata['pixelCountWidth'],
            pixel_count_height=camera_metadata['pixelCountHeight'],
            image_sensor_width=camera_metadata['imageSensorWidth'],
            image_sensor_height=camera_metadata['imageSensorHeight']
        )

        weight, length, kf = weight_estimator.predict(annotation, camera_metadata_obj)
        weights.append(weight)

    gtsf[key] = weights

In [None]:
import statsmodels.api as sm

# For every model, regress the ground-truth weight on the predicted weight (with intercept)
# and print "<tag>: <100 * R^2>, <100 * slope>".
# NOTE(review): this iterates `models`, so `gtsf` must already contain a column for every
# model key; the scoring cell above only fills the `additional_models` keys - confirm the
# CSV provides the rest.
for key, tag, model_url, is_url in models:
    X = gtsf[key]
    X = sm.add_constant(X)
    # missing='drop' excludes rows without a prediction (stored as None/NaN by the scoring
    # cell) instead of letting them poison the fit.
    model = sm.OLS(gtsf.weight, X, missing='drop')
    results = model.fit()

    # params is a labelled Series (const, <key>); use .iloc for positional access to the slope.
    print('%s: %0.2f, %0.2f' % (tag, 100 * results.rsquared, 100 * results.params.iloc[1]))

In [None]:
# Scratch check: build the network input for the first row of `rdf` and show its size.
# NOTE(review): `rdf` and `weight_estimator` are not defined in this cell; they are whatever
# an earlier-run scoring cell left in the kernel - confirm the intended cell order.
row = rdf.iloc[0]

# The columns hold single-quoted dict strings; swapping the quotes makes them JSON.
annotation = json.loads(row.annotation.replace("'", '"'))
camera_metadata = json.loads(row.camera_metadata.replace("'", '"'))
if not camera_metadata:
    # fall back to the first row's camera metadata
    camera_metadata = json.loads(rdf.camera_metadata.iloc[0].replace("'", '"'))

camera_metadata_obj = CameraMetadata(
    focal_length=camera_metadata['focalLength'],
    focal_length_pixel=camera_metadata['focalLengthPixel'],
    baseline_m=camera_metadata['baseline'],
    pixel_count_width=camera_metadata['pixelCountWidth'],
    pixel_count_height=camera_metadata['pixelCountHeight'],
    image_sensor_width=camera_metadata['imageSensorWidth'],
    image_sensor_height=camera_metadata['imageSensorHeight']
)

# Despite its name, `weight` holds the model input tensor here, not a predicted weight.
weight = weight_estimator._get_model_input(annotation, camera_metadata_obj)
weight.size()

In [None]:
# Score every cohort DataFrame in `dfs` with each model in `additional_models` and store the
# predicted weights in a column named after the model key.

# The k-factor model is identical for every weight model, so download it once instead of on
# every iteration.
kf_model_f, _, _ = s3.download_from_url('https://aquabyte-models.s3-us-west-1.amazonaws.com/k-factor/playground/kf_predictor_v2.pb')

for key, tag, model_url, is_url in additional_models:
    # `model_url` is either a URL to fetch or an already-local file path, depending on `is_url`.
    if is_url:
        weight_model_f, _, _ = s3.download_from_url(model_url)
    else:
        weight_model_f = model_url

    weight_estimator = WeightEstimator(weight_model_f, kf_model_f)

    for k, rdf in dfs.items():
        print(k)
        weights = []
        for count, (idx, row) in enumerate(rdf.iterrows()):
            # progress report every 100 rows
            if count % 100 == 0:
                print('Percentage completion: {}%'.format(round(100 * count / rdf.shape[0], 2)))
                print(count)
            # The columns hold single-quoted dict strings; swapping the quotes makes them JSON.
            annotation = json.loads(row.annotation.replace("'", '"'))
            if not annotation:
                # keep the output aligned with the rows of rdf
                weights.append(None)
                continue
            camera_metadata = json.loads(row.camera_metadata.replace("'", '"'))
            if not camera_metadata:
                # fall back to the first row's camera metadata
                camera_metadata = json.loads(rdf.camera_metadata.iloc[0].replace("'", '"'))

            camera_metadata_obj = CameraMetadata(
                focal_length=camera_metadata['focalLength'],
                focal_length_pixel=camera_metadata['focalLengthPixel'],
                baseline_m=camera_metadata['baseline'],
                pixel_count_width=camera_metadata['pixelCountWidth'],
                pixel_count_height=camera_metadata['pixelCountHeight'],
                image_sensor_width=camera_metadata['imageSensorWidth'],
                image_sensor_height=camera_metadata['imageSensorHeight']
            )

            weight, length, kf = weight_estimator.predict(annotation, camera_metadata_obj)
            weights.append(weight)
        rdf[key] = weights


In [None]:
# Score every cohort DataFrame in `dfs2` with each model in `models` and store the predicted
# weights in a column named after the model key. Unlike the `dfs` frames, the `annotation`
# and `camera_metadata` values are used as-is here (indexed directly as mappings).

# The k-factor model is identical for every weight model, so download it once instead of on
# every iteration.
kf_model_f, _, _ = s3.download_from_url('https://aquabyte-models.s3-us-west-1.amazonaws.com/k-factor/playground/kf_predictor_v2.pb')

for key, tag, model_url, is_url in models:
    # `model_url` is either a URL to fetch or an already-local file path, depending on `is_url`.
    if is_url:
        weight_model_f, _, _ = s3.download_from_url(model_url)
    else:
        weight_model_f = model_url

    weight_estimator = WeightEstimator(weight_model_f, kf_model_f)

    for k, rdf in dfs2.items():
        print(k)
        weights = []
        for count, (idx, row) in enumerate(rdf.iterrows()):
            # progress report every 100 rows
            if count % 100 == 0:
                print('Percentage completion: {}%'.format(round(100 * count / rdf.shape[0], 2)))
                print(count)
            annotation = row.annotation
            if not annotation:
                # keep the output aligned with the rows of rdf
                weights.append(None)
                continue
            camera_metadata = row.camera_metadata
            if not camera_metadata:
                # Fall back to the first row's camera metadata. Taking the whole column would
                # yield a Series, which has no 'focalLength' etc. entries to look up below.
                camera_metadata = rdf.camera_metadata.iloc[0]

            camera_metadata_obj = CameraMetadata(
                focal_length=camera_metadata['focalLength'],
                focal_length_pixel=camera_metadata['focalLengthPixel'],
                baseline_m=camera_metadata['baseline'],
                pixel_count_width=camera_metadata['pixelCountWidth'],
                pixel_count_height=camera_metadata['pixelCountHeight'],
                image_sensor_width=camera_metadata['imageSensorWidth'],
                image_sensor_height=camera_metadata['imageSensorHeight']
            )

            weight, length, kf = weight_estimator.predict(annotation, camera_metadata_obj)
            weights.append(weight)
        rdf[key] = weights


<h1> Generate average weight accuracy with old model </h1>

In [None]:
def generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding):
    """Returns the individual weights for the date `days_post_feeding` days after last feeding.

    The date is derived from gt_metadata['last_feeding_date'] and passed, together with
    pm_base, max_day_diff and apply_growth_rate, to generate_smart_individual_values; the
    first element of its result is returned.

    Note: start_hour, end_hour and final_days_post_feeding are accepted but not used here.
    """
    sampling_date = add_days(gt_metadata['last_feeding_date'], days_post_feeding)
    weights, _ = generate_smart_individual_values(
        pm_base, sampling_date, max_day_diff, True, apply_growth_rate, 0.9)
    return weights


def generate_average_weight_accuracy(weights, gt_metadata, loss_factor):
    """Compares the predicted gutted average weight with the ground truth.

    Args:
        weights: individual weight predictions; their mean is the average weight prediction.
        gt_metadata: dict providing 'gutted_average_weight'.
        loss_factor: fraction removed from the average weight to obtain the gutted weight.
    Returns:
        Tuple (relative error versus the ground truth, gutted weight prediction).
    """
    gutted_weight_prediction = np.mean(weights) * (1.0 - loss_factor)
    reference_weight = gt_metadata['gutted_average_weight']
    relative_error = (gutted_weight_prediction - reference_weight) / reference_weight
    return relative_error, gutted_weight_prediction

def generate_distribution_accuracy(weights, gt_metadata, loss_factor):
    """Computes, per weight bucket, predicted share minus ground-truth share.

    Args:
        weights: individual weight predictions (list, numpy array or pandas Series).
        gt_metadata: dict providing 'gutted_weight_distribution', either None or a dict that
            maps '<lower>-<upper>' bucket labels to a value that is divided by 100 here.
            Bucket bounds are multiplied by 1000 before being compared with the weights.
        loss_factor: fraction removed from each weight to obtain the gutted weight.
    Returns:
        List with one (predicted fraction - ground-truth fraction) entry per bucket, in the
        iteration order of the distribution dict; [] when no distribution is available.
    """
    gutted_weight_distribution = gt_metadata['gutted_weight_distribution']
    if gutted_weight_distribution is None:
        return []

    # np.asarray makes plain lists work; list * float would raise a TypeError
    gutted_weights = np.asarray(weights, dtype=float) * (1.0 - loss_factor)

    count_distribution_errors = []
    for bucket, gt_value in gutted_weight_distribution.items():
        lower_bound, upper_bound = bucket.split('-')
        # half-open bucket: lower bound included, upper bound excluded
        mask = (gutted_weights >= float(lower_bound) * 1000) & (gutted_weights < float(upper_bound) * 1000)

        pct = np.sum(mask) / len(mask)
        gt_pct = gt_value / 100

        count_distribution_errors.append(pct - gt_pct)

    return count_distribution_errors



In [None]:
# This cell plots, so it needs pyplot; import it here because the only other import of
# matplotlib in this notebook sits in a later cell.
import matplotlib.pyplot as plt

# Find the contiguous run of hours around hour 10 whose share of rows is above 1/18.
# NOTE(review): `df` is whichever DataFrame was last assigned to that name in the kernel -
# confirm which cohort this is meant to inspect.
df2 = df[(df.hour >= 3) & (df.hour <= 20)]

# density histogram with (max hour - min hour) bins
count, bins, _ = plt.hist(df2.hour, density = True, bins = (np.max(df2.hour) - np.min(df2.hour)))

# indices of the bins whose density exceeds 1/18
idx_values = np.where(count > 1.0 / 18)[0]

# position of the bin edge equal to 10, first in `bins`, then within the selected indices;
# both lookups raise IndexError when there is no match
start_index = np.where(bins == 10)[0][0]
start_array = np.where(idx_values == start_index)[0][0]

lower_index = start_array
upper_index = start_array

# grow the window in both directions while the selected bin indices stay consecutive
while lower_index > 0 and (idx_values[lower_index] - idx_values[lower_index - 1] == 1):
    lower_index = lower_index - 1
while upper_index < len(idx_values) - 1 and (idx_values[upper_index + 1] - idx_values[upper_index] == 1):
    upper_index = upper_index + 1

print(bins[idx_values[lower_index]], bins[idx_values[upper_index]])

In [None]:
# This cell plots, so it needs pyplot; import it here because the only other import of
# matplotlib in this notebook sits in a later cell.
import matplotlib.pyplot as plt

# Grow an hour window around hour 10 for as long as the hourly mean weights at its ends stay
# within `eps` standard deviations of each other.
# NOTE(review): `df` is whichever DataFrame was last assigned to that name in the kernel -
# confirm which cohort this is meant to inspect.
df2 = df[(df.hour >= 3) & (df.hour <= 20)]

start_hour = np.min(df2.hour)
end_hour = np.max(df2.hour)

# one entry per hour from start_hour to end_hour inclusive
bins = np.arange(start_hour, end_hour + 1)

# mean estimated weight per hour, aligned with `bins`
weights = []

for hour in np.arange(start_hour, end_hour + 1):
    avg_weight = np.mean(df2[df2.hour == hour].estimated_weight_g)
    weights.append(avg_weight)

# raises IndexError when hour 10 is not in `bins`
start_index = np.where(bins == 10)[0][0]

lower_index = start_index
upper_index = start_index

is_iterating = True
eps = 3

# Try to extend on both sides first, then on the lower side only, then on the upper side only.
# NOTE(review): the slices passed to np.std exclude their end index, so the first window omits
# weights[upper_index + 1] and the one-sided windows hold a single value (std 0) while
# lower_index == upper_index - confirm whether inclusive windows were intended.
while is_iterating:
    if lower_index > 0 and upper_index < len(weights) - 1 and np.abs(weights[upper_index + 1] - weights[lower_index - 1]) < eps * np.std(weights[lower_index - 1:upper_index + 1]):
        lower_index = lower_index - 1
        upper_index = upper_index + 1
    elif lower_index > 0 and np.abs(weights[upper_index] - weights[lower_index - 1]) < eps * np.std(weights[lower_index - 1:upper_index]):
        lower_index = lower_index - 1
    elif upper_index < len(weights) - 1 and np.abs(weights[upper_index + 1] - weights[lower_index]) < eps * np.std(weights[lower_index:upper_index + 1]):
        upper_index = upper_index + 1
    else:
        is_iterating = False

start_hour, end_hour = bins[lower_index], bins[upper_index]

plt.plot(bins, weights)

print(start_hour, end_hour)

In [None]:
# Display the ground-truth metadata that was loaded for this cohort.
gt_metadatas['leivsethamran_pen_id_165_2020-10-18_2020-11-13']

In [None]:
# Number of days between last feeding and slaughter for one cohort.
from datetime import datetime

last_feeding_date = gt_metadatas['dale_pen_id_143_2020-10-07_2020-10-21']['last_feeding_date']
slaughter_date = gt_metadatas['dale_pen_id_143_2020-10-07_2020-10-21']['slaughter_date']

# both dates are parsed as 'YYYY-MM-DD' strings
date_diff = datetime.strptime(slaughter_date, '%Y-%m-%d') - datetime.strptime(last_feeding_date, '%Y-%m-%d')
date_diff.days
# gt_metadatas['dale_pen_id_143_2020-10-07_2020-10-21']['expected_loss_factor']

In [None]:
# Empty accumulator lists.
# NOTE(review): nothing in the visible cells appends to them - confirm where they are
# filled and whether all six are still needed.
all_dfs1 = []
all_dfs2 = []
all_dfs3 = []
all_dfs4 = []
all_dfs5 = []
all_dfs6 = []

In [None]:
import matplotlib.pyplot as plt

# Evaluate every model in `additional_models` on every cohort in `dfs`, once per
# (AKPD cutoff, hour window) combination, then append one summary frame per
# model to each of all_dfs1..all_dfs6.

# --- Settings shared by every model (loop-invariant, so defined once) --------
start_hours = [7]
end_hours = [15]
apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 1
loss_factors = [0.16, 'expected_loss_factor'] # need to determine the right values here
akpd_cutoffs = [0.01, 0.95]

hour_filter_methods = ['manual', 'hour_hist', 'u-shape']

min_hour, max_hour = 3, 20       # samples outside these hours are ignored when deriving a window
anchor_hour = 10                 # both automatic methods grow their window outwards from this hour
eps = 3                          # 'u-shape': tolerated weight difference, in standard deviations
hist_density_cutoff = 1.0 / 18   # 'hour_hist': an hour is kept when its density exceeds this

# Columns kept in the per-model summary frames.
summary_columns = ['cohort_name', 'avg_weight_col', 'gt_avg_weight_col', 'avg_weight_error_0.16', 'avg_weight_error_exp']

# (akpd cutoff, hour-filter method) behind df1..df6 / all_dfs1..all_dfs6, in that order.
selections = [
    (0.01, 'manual'), (0.01, 'hour_hist'), (0.01, 'u-shape'),
    (0.95, 'manual'), (0.95, 'hour_hist'), (0.95, 'u-shape'),
]

for key, tag, _, _ in additional_models:
    # Result columns for this model: one entry per evaluated
    # (cohort, akpd cutoff, hour window) combination.
    cohort_name_col = []
    akpd_cutoff_col = []
    hour_filter_method_col = []
    start_hour_col = []
    end_hour_col = []
    loss_factor_col = []
    starvation_days_col = []
    avg_weight_col = []
    gt_avg_weight_col = []
    camera_col = []

    # Exactly one error list per loss factor, in the same order as loss_factors.
    avg_weight_error_col = [[] for _ in loss_factors]
    count_distribution_error_col = [[] for _ in loss_factors]

    for cohort_name in sorted(list(dfs.keys())):
        print(cohort_name)

        gt_metadata = gt_metadatas[cohort_name]

        last_feeding_date = gt_metadata['last_feeding_date']
        slaughter_date = gt_metadata['slaughter_date']

        # Days between last feeding and slaughter, when both dates are known.
        if slaughter_date is not None and last_feeding_date is not None:
            date_diff = datetime.strptime(slaughter_date, '%Y-%m-%d') - datetime.strptime(last_feeding_date, '%Y-%m-%d')
            starvation_days = date_diff.days
        else:
            starvation_days = None

        # NOTE: this writes the model's prediction column into the shared cohort
        # frame under the name 'estimated_weight_g'.
        df = dfs[cohort_name]
        df['estimated_weight_g'] = df[key]
        final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
        tdf = df[df.date <= final_date_post_feeding]

        # Candidate (method, start_hour, end_hour) windows for this cohort.
        start_end_hours = []

        for method in hour_filter_methods:
            if method == 'manual':
                for start_hour in start_hours:
                    for end_hour in end_hours:
                        start_end_hours.append((method, start_hour, end_hour))
            elif method == 'u-shape':
                df2 = df[(df.hour >= min_hour) & (df.hour <= max_hour)]
                if df2.empty:
                    print('  %s skipped: no samples between hours %d and %d' % (method, min_hour, max_hour))
                    continue

                bins = np.arange(np.min(df2.hour), np.max(df2.hour) + 1)
                anchor_matches = np.where(bins == anchor_hour)[0]
                if len(anchor_matches) == 0:
                    print('  %s skipped: hour %d is outside the sampled hours' % (method, anchor_hour))
                    continue

                # Mean predicted weight for each hour in `bins` (NaN for hours without samples).
                hourly_weights = [np.mean(df2[df2.hour == hour].estimated_weight_g) for hour in bins]

                lower_index = upper_index = anchor_matches[0]
                last_index = len(hourly_weights) - 1

                # Grow the window (both sides first, then lower only, then upper only)
                # while the compared hourly means differ by less than eps standard deviations.
                # NOTE(review): slice ends are exclusive, so the upper hour being compared
                # is never part of the std window - confirm this is intended.
                while True:
                    if lower_index > 0 and upper_index < last_index and np.abs(hourly_weights[upper_index + 1] - hourly_weights[lower_index - 1]) < eps * np.std(hourly_weights[lower_index - 1:upper_index + 1]):
                        lower_index = lower_index - 1
                        upper_index = upper_index + 1
                    elif lower_index > 0 and np.abs(hourly_weights[upper_index] - hourly_weights[lower_index - 1]) < eps * np.std(hourly_weights[lower_index - 1:upper_index]):
                        lower_index = lower_index - 1
                    elif upper_index < last_index and np.abs(hourly_weights[upper_index + 1] - hourly_weights[lower_index]) < eps * np.std(hourly_weights[lower_index:upper_index + 1]):
                        upper_index = upper_index + 1
                    else:
                        break

                start_end_hours.append((method, bins[lower_index], bins[upper_index]))
            elif method == 'hour_hist':
                df2 = df[(df.hour >= min_hour) & (df.hour <= max_hour)]
                if df2.empty or np.max(df2.hour) == np.min(df2.hour):
                    print('  %s skipped: samples cover fewer than two distinct hours' % (method,))
                    continue

                # Density histogram of sample hours (this also draws onto the current figure).
                count, bins, _ = plt.hist(df2.hour, density = True, bins = (np.max(df2.hour) - np.min(df2.hour)))

                # Indices of the histogram bins that are dense enough to keep.
                idx_values = np.where(count > hist_density_cutoff)[0]

                anchor_bin = np.where(bins == anchor_hour)[0]
                anchor_pos = np.where(idx_values == anchor_bin[0])[0] if len(anchor_bin) else anchor_bin
                if len(anchor_pos) == 0:
                    print('  %s skipped: hour %d is missing or below the density cutoff' % (method, anchor_hour))
                    continue

                lower_index = upper_index = anchor_pos[0]

                # Extend over consecutive dense bins on either side of the anchor hour.
                while lower_index > 0 and (idx_values[lower_index] - idx_values[lower_index - 1] == 1):
                    lower_index = lower_index - 1
                while upper_index < len(idx_values) - 1 and (idx_values[upper_index + 1] - idx_values[upper_index] == 1):
                    upper_index = upper_index + 1

                # bins holds the histogram edges, so these hours are floats.
                start_end_hours.append((method, bins[idx_values[lower_index]], bins[idx_values[upper_index]]))

        for akpd_cutoff in akpd_cutoffs:
            for method, start_hour, end_hour in start_end_hours:
                sampling_filter = SamplingFilter(
                    start_hour=start_hour,
                    end_hour=end_hour,
                    kf_cutoff=0.0,
                    akpd_score_cutoff=akpd_cutoff
                )

                pm_base = gen_pm_base(tdf, sampling_filter)

                try:
                    weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
                except ValidationError as err:
                    # Report the dropped combination instead of skipping it silently.
                    print('  skipped %s %s-%s (akpd_cutoff=%s): %s' % (method, start_hour, end_hour, akpd_cutoff, err))
                    continue

                akpd_cutoff_col.append(akpd_cutoff)
                cohort_name_col.append(cohort_name)
                hour_filter_method_col.append(method)
                start_hour_col.append(start_hour)
                end_hour_col.append(end_hour)
                loss_factor_col.append(gt_metadata['expected_loss_factor'])
                starvation_days_col.append(starvation_days)
                avg_weight_col.append(np.mean(weights))
                gt_avg_weight_col.append(gt_metadata['gutted_average_weight'])
                camera_col.append(camera_type[cohort_name])

                for index, loss_factor in enumerate(loss_factors):
                    if loss_factor == 'expected_loss_factor':
                        # Fall back to 0.165 when the metadata has no (or a zero) value.
                        loss_factor = gt_metadata['expected_loss_factor'] or 0.165

                        # Values above 10 are rescaled by 1/100.
                        # NOTE(review): presumably these are percentages - confirm.
                        if loss_factor > 10:
                            loss_factor = loss_factor / 100.0

                    avg_weight_err, gutted_weight_prediction = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)
                    avg_weight_error_col[index].append(avg_weight_err)

                    count_distribution_errors = generate_distribution_accuracy(weights, gt_metadata, loss_factor)
                    count_distribution_error_col[index].append(count_distribution_errors)

    columns = {
        'cohort_name': cohort_name_col,
        'hour_filter_method_col': hour_filter_method_col,
        'akpd_cutoff_col': akpd_cutoff_col,
        'start_hour_col': start_hour_col,
        'end_hour_col': end_hour_col,
        'loss_factor_col': loss_factor_col,
        'starvation_days_col': starvation_days_col,
        'avg_weight_col': avg_weight_col,
        'gt_avg_weight_col': gt_avg_weight_col,
        'camera_col': camera_col
    }

    # Signed, absolute and mean-absolute-distribution error columns per loss factor.
    for index, loss_factor in enumerate(loss_factors):
        if loss_factor == 'expected_loss_factor':
            col_name = 'avg_weight_error_exp'
            col_abs_name = 'avg_weight_error_abs_exp'
            col_abs_dist_name = 'avg_count_dist_error_abs_exp'
        else:
            col_name = 'avg_weight_error_%0.2f' % (loss_factor,)
            col_abs_name = 'avg_weight_error_abs_%0.2f' % (loss_factor,)
            col_abs_dist_name = 'avg_count_dist_error_abs_%0.2f' % (loss_factor,)

        columns[col_name] = avg_weight_error_col[index]
        columns[col_abs_name] = np.abs(avg_weight_error_col[index])
        columns[col_abs_dist_name] = [np.mean(np.abs(l)) for l in count_distribution_error_col[index]]

    # From here on `tdf` is the per-model results table (later cells read it).
    tdf = pd.DataFrame(columns)

    df1, df2, df3, df4, df5, df6 = [
        tdf[(tdf.akpd_cutoff_col == cutoff) & (tdf.hour_filter_method_col == method)][summary_columns]
        for cutoff, method in selections
    ]

    for sink, frame in zip((all_dfs1, all_dfs2, all_dfs3, all_dfs4, all_dfs5, all_dfs6), (df1, df2, df3, df4, df5, df6)):
        sink.append(frame)

In [None]:
# Six independent accumulators for the second cohort set, one per
# (AKPD cutoff, hour-filter method) selection.
all_dfs21, all_dfs22, all_dfs23, all_dfs24, all_dfs25, all_dfs26 = ([] for _ in range(6))

In [None]:
import matplotlib.pyplot as plt

# Evaluate every model in `additional_models` on every cohort in `dfs2`, once per
# (AKPD cutoff, hour window) combination, then append one summary frame per
# model to each of all_dfs21..all_dfs26.

# --- Settings shared by every model (loop-invariant, so defined once) --------
start_hours = [7]
end_hours = [15]
apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 1
loss_factors = [0.16, 'expected_loss_factor'] # need to determine the right values here
akpd_cutoffs = [0.01, 0.95]

hour_filter_methods = ['manual', 'hour_hist', 'u-shape']

min_hour, max_hour = 3, 20       # samples outside these hours are ignored when deriving a window
anchor_hour = 10                 # both automatic methods grow their window outwards from this hour
eps = 3                          # 'u-shape': tolerated weight difference, in standard deviations
hist_density_cutoff = 1.0 / 18   # 'hour_hist': an hour is kept when its density exceeds this

# Columns kept in the per-model summary frames (includes the chosen hour window).
summary_columns = ['cohort_name', 'avg_weight_col', 'gt_avg_weight_col', 'avg_weight_error_0.16', 'avg_weight_error_exp', 'start_hour_col', 'end_hour_col']

# (akpd cutoff, hour-filter method) behind df1..df6 / all_dfs21..all_dfs26, in that order.
selections = [
    (0.01, 'manual'), (0.01, 'hour_hist'), (0.01, 'u-shape'),
    (0.95, 'manual'), (0.95, 'hour_hist'), (0.95, 'u-shape'),
]

for key, tag, _, _ in additional_models:
    # Result columns for this model: one entry per evaluated
    # (cohort, akpd cutoff, hour window) combination.
    cohort_name_col = []
    akpd_cutoff_col = []
    hour_filter_method_col = []
    start_hour_col = []
    end_hour_col = []
    loss_factor_col = []
    starvation_days_col = []
    avg_weight_col = []
    gt_avg_weight_col = []
    camera_col = []

    # Exactly one error list per loss factor, in the same order as loss_factors.
    avg_weight_error_col = [[] for _ in loss_factors]
    count_distribution_error_col = [[] for _ in loss_factors]

    for cohort_name in sorted(list(dfs2.keys())):
        print(cohort_name)

        gt_metadata = gt_metadatas2[cohort_name]

        last_feeding_date = gt_metadata['last_feeding_date']
        slaughter_date = gt_metadata['slaughter_date']

        # Days between last feeding and slaughter, when both dates are known.
        if slaughter_date is not None and last_feeding_date is not None:
            date_diff = datetime.strptime(slaughter_date, '%Y-%m-%d') - datetime.strptime(last_feeding_date, '%Y-%m-%d')
            starvation_days = date_diff.days
        else:
            starvation_days = None

        # NOTE: this writes the model's prediction column into the shared cohort
        # frame under the name 'estimated_weight_g'.
        df = dfs2[cohort_name]
        df['estimated_weight_g'] = df[key]
        final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
        tdf = df[df.date <= final_date_post_feeding]

        # Candidate (method, start_hour, end_hour) windows for this cohort.
        start_end_hours = []

        for method in hour_filter_methods:
            if method == 'manual':
                for start_hour in start_hours:
                    for end_hour in end_hours:
                        start_end_hours.append((method, start_hour, end_hour))
            elif method == 'u-shape':
                df2 = df[(df.hour >= min_hour) & (df.hour <= max_hour)]
                if df2.empty:
                    print('  %s skipped: no samples between hours %d and %d' % (method, min_hour, max_hour))
                    continue

                bins = np.arange(np.min(df2.hour), np.max(df2.hour) + 1)
                anchor_matches = np.where(bins == anchor_hour)[0]
                if len(anchor_matches) == 0:
                    print('  %s skipped: hour %d is outside the sampled hours' % (method, anchor_hour))
                    continue

                # Mean predicted weight for each hour in `bins` (NaN for hours without samples).
                hourly_weights = [np.mean(df2[df2.hour == hour].estimated_weight_g) for hour in bins]

                lower_index = upper_index = anchor_matches[0]
                last_index = len(hourly_weights) - 1

                # Grow the window (both sides first, then lower only, then upper only)
                # while the compared hourly means differ by less than eps standard deviations.
                # NOTE(review): slice ends are exclusive, so the upper hour being compared
                # is never part of the std window - confirm this is intended.
                while True:
                    if lower_index > 0 and upper_index < last_index and np.abs(hourly_weights[upper_index + 1] - hourly_weights[lower_index - 1]) < eps * np.std(hourly_weights[lower_index - 1:upper_index + 1]):
                        lower_index = lower_index - 1
                        upper_index = upper_index + 1
                    elif lower_index > 0 and np.abs(hourly_weights[upper_index] - hourly_weights[lower_index - 1]) < eps * np.std(hourly_weights[lower_index - 1:upper_index]):
                        lower_index = lower_index - 1
                    elif upper_index < last_index and np.abs(hourly_weights[upper_index + 1] - hourly_weights[lower_index]) < eps * np.std(hourly_weights[lower_index:upper_index + 1]):
                        upper_index = upper_index + 1
                    else:
                        break

                start_end_hours.append((method, bins[lower_index], bins[upper_index]))
            elif method == 'hour_hist':
                df2 = df[(df.hour >= min_hour) & (df.hour <= max_hour)]
                if df2.empty or np.max(df2.hour) == np.min(df2.hour):
                    print('  %s skipped: samples cover fewer than two distinct hours' % (method,))
                    continue

                # Density histogram of sample hours (this also draws onto the current figure).
                count, bins, _ = plt.hist(df2.hour, density = True, bins = (np.max(df2.hour) - np.min(df2.hour)))

                # Indices of the histogram bins that are dense enough to keep.
                idx_values = np.where(count > hist_density_cutoff)[0]

                anchor_bin = np.where(bins == anchor_hour)[0]
                anchor_pos = np.where(idx_values == anchor_bin[0])[0] if len(anchor_bin) else anchor_bin
                if len(anchor_pos) == 0:
                    print('  %s skipped: hour %d is missing or below the density cutoff' % (method, anchor_hour))
                    continue

                lower_index = upper_index = anchor_pos[0]

                # Extend over consecutive dense bins on either side of the anchor hour.
                while lower_index > 0 and (idx_values[lower_index] - idx_values[lower_index - 1] == 1):
                    lower_index = lower_index - 1
                while upper_index < len(idx_values) - 1 and (idx_values[upper_index + 1] - idx_values[upper_index] == 1):
                    upper_index = upper_index + 1

                # bins holds the histogram edges, so these hours are floats.
                start_end_hours.append((method, bins[idx_values[lower_index]], bins[idx_values[upper_index]]))

        for akpd_cutoff in akpd_cutoffs:
            for method, start_hour, end_hour in start_end_hours:
                sampling_filter = SamplingFilter(
                    start_hour=start_hour,
                    end_hour=end_hour,
                    kf_cutoff=0.0,
                    akpd_score_cutoff=akpd_cutoff
                )

                pm_base = gen_pm_base(tdf, sampling_filter)

                try:
                    weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
                except ValidationError as err:
                    # Report the dropped combination instead of skipping it silently.
                    print('  skipped %s %s-%s (akpd_cutoff=%s): %s' % (method, start_hour, end_hour, akpd_cutoff, err))
                    continue

                akpd_cutoff_col.append(akpd_cutoff)
                cohort_name_col.append(cohort_name)
                hour_filter_method_col.append(method)
                start_hour_col.append(start_hour)
                end_hour_col.append(end_hour)
                loss_factor_col.append(gt_metadata['expected_loss_factor'])
                starvation_days_col.append(starvation_days)
                avg_weight_col.append(np.mean(weights))
                gt_avg_weight_col.append(gt_metadata['gutted_average_weight'])
                camera_col.append(camera_type[cohort_name])

                for index, loss_factor in enumerate(loss_factors):
                    if loss_factor == 'expected_loss_factor':
                        # Fall back to 0.165 when the metadata has no (or a zero) value.
                        loss_factor = gt_metadata['expected_loss_factor'] or 0.165

                        # Values above 10 are rescaled by 1/100.
                        # NOTE(review): presumably these are percentages - confirm.
                        if loss_factor > 10:
                            loss_factor = loss_factor / 100.0

                    avg_weight_err, gutted_weight_prediction = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)
                    avg_weight_error_col[index].append(avg_weight_err)

                    count_distribution_errors = generate_distribution_accuracy(weights, gt_metadata, loss_factor)
                    count_distribution_error_col[index].append(count_distribution_errors)

    columns = {
        'cohort_name': cohort_name_col,
        'hour_filter_method_col': hour_filter_method_col,
        'akpd_cutoff_col': akpd_cutoff_col,
        'start_hour_col': start_hour_col,
        'end_hour_col': end_hour_col,
        'loss_factor_col': loss_factor_col,
        'starvation_days_col': starvation_days_col,
        'avg_weight_col': avg_weight_col,
        'gt_avg_weight_col': gt_avg_weight_col,
        'camera_col': camera_col
    }

    # Signed, absolute and mean-absolute-distribution error columns per loss factor.
    for index, loss_factor in enumerate(loss_factors):
        if loss_factor == 'expected_loss_factor':
            col_name = 'avg_weight_error_exp'
            col_abs_name = 'avg_weight_error_abs_exp'
            col_abs_dist_name = 'avg_count_dist_error_abs_exp'
        else:
            col_name = 'avg_weight_error_%0.2f' % (loss_factor,)
            col_abs_name = 'avg_weight_error_abs_%0.2f' % (loss_factor,)
            col_abs_dist_name = 'avg_count_dist_error_abs_%0.2f' % (loss_factor,)

        columns[col_name] = avg_weight_error_col[index]
        columns[col_abs_name] = np.abs(avg_weight_error_col[index])
        columns[col_abs_dist_name] = [np.mean(np.abs(l)) for l in count_distribution_error_col[index]]

    # From here on `tdf` is the per-model results table (later cells read it).
    tdf = pd.DataFrame(columns)

    df1, df2, df3, df4, df5, df6 = [
        tdf[(tdf.akpd_cutoff_col == cutoff) & (tdf.hour_filter_method_col == method)][summary_columns]
        for cutoff, method in selections
    ]

    for sink, frame in zip((all_dfs21, all_dfs22, all_dfs23, all_dfs24, all_dfs25, all_dfs26), (df1, df2, df3, df4, df5, df6)):
        sink.append(frame)

In [None]:
# For each model's frame in all_dfs21, print the average weight from its first row.
for model_df in all_dfs21:
    first_avg_weight = model_df['avg_weight_col'].values[0]
    print(first_avg_weight)


In [None]:
# Per model: take the first row of its all_dfs22 frame and rescale the average
# weight by (1 - avg_under[tag]). `avg_under` must already exist in the kernel.
avg = []
adj_avg = []

for index, model in enumerate(models):
    tag = model[1]
    row = all_dfs22[index]
    avg_weight = row['avg_weight_col'].values[0]
    start_hour = row['start_hour_col'].values[0]
    end_hour = row['end_hour_col'].values[0]

    under = avg_under[tag]
    scale = 1 - under
    adjusted = scale * avg_weight

    avg.append(avg_weight)
    adj_avg.append(adjusted)

    # The last value printed is the alternative adjustment avg_weight / (1 + under).
    print(tag, start_hour, end_hour, scale, avg_weight, adjusted, avg_weight / (1 + under))

print(np.mean(avg))
print(np.mean(adj_avg))

In [None]:
# Per model: take the first row of its all_dfs25 frame and rescale the average
# weight by (1 - avg_under[tag]). `avg_under` must already exist in the kernel.
avg = []
adj_avg = []

for index, model in enumerate(models):
    tag = model[1]
    row = all_dfs25[index]
    avg_weight = row['avg_weight_col'].values[0]
    start_hour = row['start_hour_col'].values[0]
    end_hour = row['end_hour_col'].values[0]

    under = avg_under[tag]
    scale = 1 - under
    adjusted = scale * avg_weight

    avg.append(avg_weight)
    adj_avg.append(adjusted)

    # The last value printed is the alternative adjustment avg_weight / (1 + under).
    print(tag, start_hour, end_hour, scale, avg_weight, adjusted, avg_weight / (1 + under))

print(np.mean(avg))
print(np.mean(adj_avg))

In [None]:
# Display the per-model-tag mean error mapping.
# NOTE(review): avg_under is assigned in a scatter-plot cell further down this
# notebook, so this cell fails on a fresh top-to-bottom run.
avg_under

In [None]:
# Hand calculation: each weight below is multiplied by roughly (1 - value) for the
# three values noted here.
# NOTE(review): presumably 7895 / 8243 / 8233 are average weights copied from
# earlier output and the values are avg_under entries - confirm.
# curr-synthetic -0.033101807184804044
# augV4-o-a-h-99 -0.020492502857156494
# augV4-o-a-h-99#2 -0.026963895675136098

7895 * 1.033, 8243 * 1.02, 8233 * 1.027

In [None]:
# For each model's frame in all_dfs23, print the average weight from its first row.
for model_df in all_dfs23:
    first_avg_weight = model_df['avg_weight_col'].values[0]
    print(first_avg_weight)


In [None]:
# For each model's frame in all_dfs24, print the average weight from its first row.
for model_df in all_dfs24:
    first_avg_weight = model_df['avg_weight_col'].values[0]
    print(first_avg_weight)


In [None]:
# For each model's frame in all_dfs25, print the average weight from its first row.
for model_df in all_dfs25:
    first_avg_weight = model_df['avg_weight_col'].values[0]
    print(first_avg_weight)


In [None]:
# For each model's frame in all_dfs26, print the average weight from its first row.
for model_df in all_dfs26:
    first_avg_weight = model_df['avg_weight_col'].values[0]
    print(first_avg_weight)


In [None]:
# Display the first row of this cohort's frame to inspect which columns it holds.
dfs['leivsethamran_pen_id_165_2020-10-18_2020-11-13'].iloc[0]

In [None]:
# for model in additional_models:
#     models.append(model)

In [None]:
from matplotlib.pyplot import cm

plt.figure(figsize=(20, 10))

# Frames plotted here; later cells also read `all_dfs` and `avg_under`.
all_dfs = all_dfs5
avg_under = {}

# One rainbow colour per model.
palette = cm.rainbow(np.linspace(0, 1, len(models)))

for index, (model, c) in enumerate(zip(models, palette)):
    tag = model[1]
    model_df = all_dfs[index]
    plt.scatter(model_df.gt_avg_weight_col, model_df['avg_weight_error_0.16'], color = c, label=tag)

    # Mean 'expected loss factor' error over cohorts whose ground-truth weight exceeds 5000.
    mask = model_df.gt_avg_weight_col > 5000
    avg_under[tag] = np.mean(model_df['avg_weight_error_exp'][mask])

# plt.legend()

In [None]:
# Scatter of ground-truth average weight vs. 'expected loss factor' error for a
# hand-picked set of model tags, using the frames in all_dfs2.
plt.figure(figsize=(20, 10))

my_models = ['curr-synthetic', 'augV4-o-a-h-99', 'augV4-o-a-h-99#2']

# plt.cm is used so this cell does not depend on an import made in another cell.
color=iter(plt.cm.rainbow(np.linspace(0,1,len(my_models))))

for index, model in enumerate(models):
    _, tag, _, _ = model

    if tag not in my_models: # 'augV1-ols-akpd-halfinfl',
        continue
    c = next(color)
    # Take x and y from the same frame: mixing in whatever `all_dfs` was last bound
    # to can pair rows from different selections (or fail on unequal lengths).
    model_df = all_dfs2[index]
    plt.scatter(model_df.gt_avg_weight_col, model_df['avg_weight_error_exp'], color = c, label=tag)

plt.xlabel('gt_avg_weight_col')
plt.ylabel('avg_weight_error_exp')
plt.legend()



In [None]:
# Scatter of ground-truth average weight vs. 'expected loss factor' error for a
# hand-picked set of model tags, using the frames in all_dfs5.
plt.figure(figsize=(20, 10))

my_models = ['curr-synthetic', 'augV1-ols', 'jitter-ols'] # , 'augV4-o-a-h-99', 'augV4-o-a-h-99#2'

# plt.cm is used so this cell does not depend on an import made in another cell.
color=iter(plt.cm.rainbow(np.linspace(0,1,len(my_models))))

for index, model in enumerate(models):
    _, tag, _, _ = model

    if tag not in my_models: # 'augV1-ols-akpd-halfinfl',
        continue
    c = next(color)
    # Take x and y from the same frame: mixing in whatever `all_dfs` was last bound
    # to can pair rows from different selections (or fail on unequal lengths).
    model_df = all_dfs5[index]
    plt.scatter(model_df.gt_avg_weight_col, model_df['avg_weight_error_exp'], color = c, label=tag)

plt.xlabel('gt_avg_weight_col')
plt.ylabel('avg_weight_error_exp')
plt.legend()



In [None]:
# Per-model error summary for the frames in all_dfs21. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs21

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Per-model error summary for the frames in all_dfs1. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs1

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Per-model error summary for the frames in all_dfs2. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs2

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Per-model error summary for the frames in all_dfs3. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs3

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Per-model error summary for the frames in all_dfs4. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs4

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Per-model error summary for the frames in all_dfs5. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs5

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Per-model error summary for the frames in all_dfs6. Printed columns, in order:
# p90 |exp| %, median |exp| %, RMS of 50*|exp|, mean 0.16 %, mean |0.16| %,
# mean |exp| %, std 0.16 %.
all_dfs = all_dfs6

metric = []
row_format = '%-*s: %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f, %0.2f'

for index, model in enumerate(models):
    tag = model[1]
    err_fixed = all_dfs[index]['avg_weight_error_0.16']
    abs_err_exp = np.abs(all_dfs[index]['avg_weight_error_exp'])

    # NOTE(review): this RMS is scaled by 50 while the other columns use 100 - confirm.
    rms_scaled = np.sqrt(np.mean((50 * abs_err_exp) ** 2))

    print(row_format % (
        25, tag,
        100 * np.percentile(abs_err_exp, 90),
        100 * np.percentile(abs_err_exp, 50),
        rms_scaled,
        100 * np.mean(err_fixed),
        100 * np.mean(np.abs(err_fixed)),
        100 * np.mean(abs_err_exp),
        100 * np.std(err_fixed),
    ))
    metric.append(rms_scaled)

print(np.mean(metric))

In [None]:
# Print one model tuple and display one per-model frame from whichever list
# `all_dfs` currently points to.
# NOTE(review): this prints models[0] but displays all_dfs[4]; the indices differ -
# confirm which model is meant.
print(models[0])
all_dfs[4]

# dfs['aplavika_pen_id_95_2020-07-10_2020-07-26'].iloc[0]

In [None]:
# Print the second model tuple and display its frame from whichever list
# `all_dfs` currently points to.
print(models[1])
all_dfs[1]

In [None]:
# # print(np.mean(np.abs(df4['avg_weight_error_0.16'])), np.mean(np.abs(df3['avg_weight_error_0.16'])), np.mean(np.abs(df5['avg_weight_error_0.16'])), np.mean(np.abs(df1['avg_weight_error_0.16'])), np.mean(np.abs(df6['avg_weight_error_0.16'])), np.mean(np.abs(df7['avg_weight_error_0.16'])))
# # print(np.std((df4['avg_weight_error_0.16'])), np.std((df3['avg_weight_error_0.16'])), np.std((df5['avg_weight_error_0.16'])), np.std((df1['avg_weight_error_0.16'])), np.std((df6['avg_weight_error_0.16'])), np.std((df7['avg_weight_error_0.16'])))

# print(np.mean(np.abs(df5['avg_weight_error_0.16'])), np.mean(np.abs(df1['avg_weight_error_0.16'])), np.mean(np.abs(df6['avg_weight_error_0.16'])), np.mean(np.abs(df7['avg_weight_error_0.16'])), np.mean(np.abs(df8['avg_weight_error_0.16'])))
# print(np.std((df5['avg_weight_error_0.16'])), np.std((df1['avg_weight_error_0.16'])), np.std((df6['avg_weight_error_0.16'])), np.std((df7['avg_weight_error_0.16'])), np.std((df8['avg_weight_error_0.16'])))


In [None]:
# # print(np.mean(np.abs(df4['avg_weight_error_exp'])), np.mean(np.abs(df3['avg_weight_error_exp'])), np.mean(np.abs(df5['avg_weight_error_exp'])), np.mean(np.abs(df1['avg_weight_error_exp'])), np.mean(np.abs(df6['avg_weight_error_exp'])), np.mean(np.abs(df7['avg_weight_error_exp'])))
# # print(np.std((df4['avg_weight_error_exp'])), np.std((df3['avg_weight_error_exp'])), np.std((df5['avg_weight_error_exp'])), np.std((df1['avg_weight_error_exp'])), np.std((df6['avg_weight_error_exp'])), np.std((df7['avg_weight_error_exp'])))

# print(np.mean(np.abs(df5['avg_weight_error_exp'])), np.mean(np.abs(df1['avg_weight_error_exp'])), np.mean(np.abs(df6['avg_weight_error_exp'])), np.mean(np.abs(df7['avg_weight_error_exp'])), np.mean(np.abs(df8['avg_weight_error_exp'])))
# print(np.std((df5['avg_weight_error_exp'])), np.std((df1['avg_weight_error_exp'])), np.std((df6['avg_weight_error_exp'])), np.std((df7['avg_weight_error_exp'])), np.std((df8['avg_weight_error_exp'])))



In [None]:
# np.mean(tdf[tdf.akpd_cutoff_col == 0.01])
# np.std(tdf[(tdf.akpd_cutoff_col == 0.01) & (tdf.hour_filter_method_col == 'manual')])
# np.std(tdf[(tdf.akpd_cutoff_col == 0.01) & (tdf.hour_filter_method_col == 'hour_hist')])
# np.mean(tdf[tdf.akpd_cutoff_col == 0.95])
# np.std(tdf[tdf.akpd_cutoff_col == 0.95])

In [None]:
import statsmodels.api as sm

# Regress the 0.14-loss-factor weight error on starvation days. Only rows with
# a positive starvation_days_col are kept; rows where it is missing fail the
# comparison and are dropped as well.
tdf1 = tdf.loc[tdf['starvation_days_col'] > 0]

# Scatter first, to eyeball the relationship the fit is summarising.
plt.scatter(x=tdf1['starvation_days_col'], y=tdf1['avg_weight_error_0.14'])

# OLS: avg_weight_error_0.14 ~ const + starvation_days_col
X = sm.add_constant(tdf1['starvation_days_col'])
model = sm.OLS(tdf1['avg_weight_error_0.14'], X)
results = model.fit()
results.summary()

In [None]:
# Same diagnostic against the estimated average weight, over every row of tdf:
# scatter plot followed by OLS of avg_weight_error_0.14 ~ const + avg_weight_col.
plt.scatter(x=tdf['avg_weight_col'], y=tdf['avg_weight_error_0.14'])

X = sm.add_constant(tdf['avg_weight_col'])
model = sm.OLS(tdf['avg_weight_error_0.14'], X)
results = model.fit()
results.summary()

In [None]:
# Mean absolute error before and after removing a linear starvation-day term.
# NOTE(review): 0.0292 and 0.0028 are hard-coded here; presumably they were
# read off the starvation-days OLS summary -- confirm, and re-derive them
# whenever tdf changes.
(
    tdf1['avg_weight_error_0.14'].abs().mean(),
    (tdf1['avg_weight_error_0.14'] - 0.0292 + 0.0028 * tdf1['starvation_days_col']).abs().mean(),
)

In [None]:
# Compare the other loss-factor variants against the 0.14 error. The blue line
# plots the 0.14 error against itself (the diagonal), so green/red points off
# that line show where a variant disagrees with it.
plt.plot(tdf['avg_weight_error_0.14'], tdf['avg_weight_error_0.14'], color='blue')
plt.scatter(x=tdf['avg_weight_error_0.14'], y=tdf['avg_weight_error_0.16'], color='green')
plt.scatter(x=tdf['avg_weight_error_0.14'], y=tdf['avg_weight_error_exp'], color='red')

In [None]:
# Overlay the error distributions for the per-cohort expected loss factor and
# the fixed 0.16 loss factor on the same axes.
plt.hist(tdf['avg_weight_error_exp'])
plt.hist(tdf['avg_weight_error_0.16'])

In [None]:
# Manual hour window, AKPD cutoff 0.95: overlay the 0.14 error (cyan) and the
# expected-loss-factor error (red). The second histogram reuses the bin edges
# of the first so the bars line up.
tdf1 = tdf.loc[tdf['hour_filter_method_col'] == 'manual']
mask1 = tdf1['akpd_cutoff_col'] == 0.95
counts, bins, _ = plt.hist(tdf1.loc[mask1, 'avg_weight_error_0.14'], alpha=0.5, color='cyan')
plt.hist(tdf1.loc[mask1, 'avg_weight_error_exp'], alpha=0.5, color='red', bins=bins)

<h1> Generate average weight accuracy with new model </h1>

In [None]:
# Per-column summary statistics for each AKPD cutoff, followed by the raw rows
# for each hour-filter method.
#
# A notebook only renders the final bare expression of a cell, so every earlier
# result is passed to display() explicitly; otherwise it is computed and
# silently thrown away. numeric_only=True keeps string columns such as
# cohort_name out of the reductions, and ddof=0 keeps the population standard
# deviation that np.std computes.
display(tdf[tdf.akpd_cutoff_col == 0.01].mean(numeric_only=True))
display(tdf[tdf.akpd_cutoff_col == 0.01].std(ddof=0, numeric_only=True))
display(tdf[tdf.akpd_cutoff_col == 0.95].mean(numeric_only=True))
display(tdf[tdf.akpd_cutoff_col == 0.95].std(ddof=0, numeric_only=True))
display(tdf[tdf.hour_filter_method_col == 'hour_hist'])
tdf[tdf.hour_filter_method_col == 'manual']

In [None]:
# Histogram-derived hour window: compare the 0.14 error for AKPD cutoff 0.01
# (cyan) against every other cutoff (red), sharing bin edges between the two.
tdf1 = tdf.loc[tdf['hour_filter_method_col'] == 'hour_hist']
mask1 = tdf1['akpd_cutoff_col'] == 0.01
counts, bins, _ = plt.hist(tdf1.loc[mask1, 'avg_weight_error_0.14'], alpha=0.5, color='cyan')
plt.hist(tdf1.loc[~mask1, 'avg_weight_error_0.14'], alpha=0.5, color='red', bins=bins)

In [None]:
# --- Evaluation settings ------------------------------------------------------
start_hours, end_hours = [7], [15]
apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 1
loss_factors = [0.16, 'expected_loss_factor'] # need to determine the right values here
akpd_cutoffs = [0.01, 0.95]

hour_filter_methods = ['manual', 'hour_hist'] #  'u-shape',

# --- Result accumulators: one entry per evaluated combination -----------------
cohort_name_col, akpd_cutoff_col, hour_filter_method_col = [], [], []
start_hour_col, end_hour_col = [], []
loss_factor_col, starvation_days_col = [], []
avg_weight_col, gt_avg_weight_col = [], []

# These two hold one inner list per entry of loss_factors (same order).
avg_weight_error_col = [[] for _ in loss_factors]
count_distribution_error_col = [[] for _ in loss_factors]

# Evaluate every cohort under each (AKPD cutoff, hour-filter method) combination
# and append one row per successful combination to the *_col accumulator lists.
# To debug a single cohort, iterate over a one-element list instead, e.g.
# for cohort_name in ['dale_pen_id_143_2020-10-07_2020-10-21']:
for cohort_name in sorted(list(dfs.keys())):
    print(cohort_name)
    gt_metadata = gt_metadatas[cohort_name]

    last_feeding_date = gt_metadata['last_feeding_date']
    slaughter_date = gt_metadata['slaughter_date']

    # Whole days between last feeding and slaughter; None when either is missing.
    if slaughter_date is not None and last_feeding_date is not None:
        date_diff = datetime.strptime(slaughter_date, '%Y-%m-%d') - datetime.strptime(last_feeding_date, '%Y-%m-%d')
        starvation_days = date_diff.days
    else:
        starvation_days = None

    df = dfs[cohort_name]

    # Blend two weight estimates into the column used downstream: weight_v6
    # where weight_v1 is positive, weight_v1 itself everywhere else.
    # This is one whole-column assignment on purpose. Chained indexing of the
    # form df[col][mask] = ... may write into a temporary copy: pandas emits
    # SettingWithCopyWarning for it, and with copy-on-write enabled the write
    # is lost entirely. Note that df is dfs[cohort_name], so the stored frame
    # is updated in place.
    mask = df['weight_v1'] > 0
    df['estimated_weight_g'] = df['weight_v6'].where(mask, df['weight_v1'])

    # Keep rows dated on or before the cutoff returned by add_days
    # (presumably last feeding date + final_days_post_feeding days -- confirm).
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
    tdf = df[df.date <= final_date_post_feeding]

    # (method, start_hour, end_hour) windows to evaluate for this cohort.
    start_end_hours = []

    for method in hour_filter_methods:
        if method == 'manual':
            # Every combination of the configured start and end hours.
            for start_hour in start_hours:
                for end_hour in end_hours:
                    start_end_hours.append((method, start_hour, end_hour))
        elif method == 'u-shape':
            pass
        elif method == 'hour_hist':
            # Derive the window from the hourly sample histogram. This uses the
            # full cohort frame df (not the date-filtered tdf), limited to
            # hours 3..20 inclusive.
            df2 = df[(df.hour >= 3) & (df.hour <= 20)]

            # bins = max - min gives unit-width bins over [min, max], so the
            # bin edges fall on whole hours (assuming integer hour values).
            # plt.hist also draws onto the current figure as a side effect.
            count, bins, _ = plt.hist(df2.hour, density = True, bins = (np.max(df2.hour) - np.min(df2.hour)))

            # Bins whose density exceeds 1/18, i.e. a uniform spread over the
            # 18 hours from 3 to 20.
            idx_values = np.where(count > 1.0 / 18)[0]

            # Seed the window at the bin whose left edge is hour 10. Both
            # lookups raise IndexError if that edge does not exist or if the
            # hour-10 bin is not above the threshold.
            start_index = np.where(bins == 10)[0][0]
            start_array = np.where(idx_values == start_index)[0][0]

            lower_index = start_array
            upper_index = start_array

            # Grow the window outwards while the neighbouring above-threshold
            # bins are contiguous.
            while lower_index > 0 and (idx_values[lower_index] - idx_values[lower_index - 1] == 1):
                lower_index = lower_index - 1
            while upper_index < len(idx_values) - 1 and (idx_values[upper_index + 1] - idx_values[upper_index] == 1):
                upper_index = upper_index + 1

            # Left edges of the first and last bins of the contiguous run.
            start_hour, end_hour = bins[idx_values[lower_index]], bins[idx_values[upper_index]]

            start_end_hours.append((method, start_hour, end_hour))

    for akpd_cutoff in akpd_cutoffs:
        for method, start_hour, end_hour in start_end_hours:
            sampling_filter = SamplingFilter(
                start_hour=start_hour,
                end_hour=end_hour,
                kf_cutoff=0.0,
                akpd_score_cutoff=akpd_cutoff
            )

            pm_base = gen_pm_base(tdf, sampling_filter)

            # A combination that fails validation contributes no row at all.
            try:
                weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
            except ValidationError:
                continue

            akpd_cutoff_col.append(akpd_cutoff)
            cohort_name_col.append(cohort_name)
            hour_filter_method_col.append(method)
            start_hour_col.append(start_hour)
            end_hour_col.append(end_hour)
            loss_factor_col.append(gt_metadata['expected_loss_factor'])
            starvation_days_col.append(starvation_days)
            avg_weight_col.append(np.mean(weights))
            gt_avg_weight_col.append(gt_metadata['gutted_average_weight'])

            for index, loss_factor in enumerate(loss_factors):
                if loss_factor == 'expected_loss_factor':
                    # Cohort-specific loss factor, falling back to 0.165 when it
                    # is missing or falsy.
                    loss_factor = gt_metadata['expected_loss_factor'] or 0.165

                    # Values above 10 are treated as percentages and rescaled.
                    if loss_factor > 10:
                        loss_factor = loss_factor / 100.0

                avg_weight_err = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)
                avg_weight_error_col[index].append(avg_weight_err)

                count_distribution_errors = generate_distribution_accuracy(weights, gt_metadata, loss_factor)
                count_distribution_error_col[index].append(count_distribution_errors)


In [None]:
# Assemble the accumulator lists into one results table, one row per evaluated
# (cohort, AKPD cutoff, hour window) combination.
columns = dict([
    ('cohort_name', cohort_name_col),
    ('hour_filter_method_col', hour_filter_method_col),
    ('akpd_cutoff_col', akpd_cutoff_col),
    ('start_hour_col', start_hour_col),
    ('end_hour_col', end_hour_col),
    ('loss_factor_col', loss_factor_col),
    ('starvation_days_col', starvation_days_col),
    ('avg_weight_col', avg_weight_col),
    ('gt_avg_weight_col', gt_avg_weight_col),
])

# Three columns per loss factor: signed error, absolute error, and the mean
# absolute count-distribution error. The column suffix is 'exp' for the
# per-cohort expected loss factor, otherwise the factor with two decimals.
for index, loss_factor in enumerate(loss_factors):
    suffix = 'exp' if loss_factor == 'expected_loss_factor' else '%0.2f' % (loss_factor,)
    col_name = 'avg_weight_error_' + suffix
    col_abs_name = 'avg_weight_error_abs_' + suffix
    col_abs_dist_name = 'avg_count_dist_error_abs_' + suffix

    signed_errors = avg_weight_error_col[index]
    columns[col_name] = signed_errors
    columns[col_abs_name] = np.abs(signed_errors)
    columns[col_abs_dist_name] = [np.mean(np.abs(errs)) for errs in count_distribution_error_col[index]]

tdf = pd.DataFrame(columns)

In [None]:
# Snapshot of this run for later comparison: AKPD cutoff 0.01 with the
# histogram-derived hour window, reduced to the identifying and error columns.
df6 = tdf.loc[
    (tdf['akpd_cutoff_col'] == 0.01) & (tdf['hour_filter_method_col'] == 'hour_hist'),
    ['cohort_name', 'gt_avg_weight_col', 'avg_weight_error_0.16', 'avg_weight_error_exp'],
]


In [None]:
# --- Evaluation settings ------------------------------------------------------
start_hours, end_hours = [7], [15]
apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 1
loss_factors = [0.16, 'expected_loss_factor'] # need to determine the right values here
akpd_cutoffs = [0.01, 0.95]

hour_filter_methods = ['manual', 'hour_hist'] #  'u-shape',

# --- Result accumulators: one entry per evaluated combination -----------------
cohort_name_col, akpd_cutoff_col, hour_filter_method_col = [], [], []
start_hour_col, end_hour_col = [], []
loss_factor_col, starvation_days_col = [], []
avg_weight_col, gt_avg_weight_col = [], []

# These two hold one inner list per entry of loss_factors (same order).
avg_weight_error_col = [[] for _ in loss_factors]
count_distribution_error_col = [[] for _ in loss_factors]

# Evaluate every cohort under each (AKPD cutoff, hour-filter method) combination
# and append one row per successful combination to the *_col accumulator lists.
# To debug a single cohort, iterate over a one-element list instead, e.g.
# for cohort_name in ['dale_pen_id_143_2020-10-07_2020-10-21']:
for cohort_name in sorted(list(dfs.keys())):
    print(cohort_name)
    gt_metadata = gt_metadatas[cohort_name]

    last_feeding_date = gt_metadata['last_feeding_date']
    slaughter_date = gt_metadata['slaughter_date']

    # Whole days between last feeding and slaughter; None when either is missing.
    if slaughter_date is not None and last_feeding_date is not None:
        date_diff = datetime.strptime(slaughter_date, '%Y-%m-%d') - datetime.strptime(last_feeding_date, '%Y-%m-%d')
        starvation_days = date_diff.days
    else:
        starvation_days = None

    df = dfs[cohort_name]

    # Blend two weight estimates into the column used downstream: weight_v7
    # where weight_v5 exceeds 8000, weight_v5 itself everywhere else.
    # This is one whole-column assignment on purpose. Chained indexing of the
    # form df[col][mask] = ... may write into a temporary copy: pandas emits
    # SettingWithCopyWarning for it, and with copy-on-write enabled the write
    # is lost entirely. Note that df is dfs[cohort_name], so the stored frame
    # is updated in place.
    mask = df['weight_v5'] > 8000
    df['estimated_weight_g'] = df['weight_v7'].where(mask, df['weight_v5'])

    # Keep rows dated on or before the cutoff returned by add_days
    # (presumably last feeding date + final_days_post_feeding days -- confirm).
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
    tdf = df[df.date <= final_date_post_feeding]

    # (method, start_hour, end_hour) windows to evaluate for this cohort.
    start_end_hours = []

    for method in hour_filter_methods:
        if method == 'manual':
            # Every combination of the configured start and end hours.
            for start_hour in start_hours:
                for end_hour in end_hours:
                    start_end_hours.append((method, start_hour, end_hour))
        elif method == 'u-shape':
            pass
        elif method == 'hour_hist':
            # Derive the window from the hourly sample histogram. This uses the
            # full cohort frame df (not the date-filtered tdf), limited to
            # hours 3..20 inclusive.
            df2 = df[(df.hour >= 3) & (df.hour <= 20)]

            # bins = max - min gives unit-width bins over [min, max], so the
            # bin edges fall on whole hours (assuming integer hour values).
            # plt.hist also draws onto the current figure as a side effect.
            count, bins, _ = plt.hist(df2.hour, density = True, bins = (np.max(df2.hour) - np.min(df2.hour)))

            # Bins whose density exceeds 1/18, i.e. a uniform spread over the
            # 18 hours from 3 to 20.
            idx_values = np.where(count > 1.0 / 18)[0]

            # Seed the window at the bin whose left edge is hour 10. Both
            # lookups raise IndexError if that edge does not exist or if the
            # hour-10 bin is not above the threshold.
            start_index = np.where(bins == 10)[0][0]
            start_array = np.where(idx_values == start_index)[0][0]

            lower_index = start_array
            upper_index = start_array

            # Grow the window outwards while the neighbouring above-threshold
            # bins are contiguous.
            while lower_index > 0 and (idx_values[lower_index] - idx_values[lower_index - 1] == 1):
                lower_index = lower_index - 1
            while upper_index < len(idx_values) - 1 and (idx_values[upper_index + 1] - idx_values[upper_index] == 1):
                upper_index = upper_index + 1

            # Left edges of the first and last bins of the contiguous run.
            start_hour, end_hour = bins[idx_values[lower_index]], bins[idx_values[upper_index]]

            start_end_hours.append((method, start_hour, end_hour))

    for akpd_cutoff in akpd_cutoffs:
        for method, start_hour, end_hour in start_end_hours:
            sampling_filter = SamplingFilter(
                start_hour=start_hour,
                end_hour=end_hour,
                kf_cutoff=0.0,
                akpd_score_cutoff=akpd_cutoff
            )

            pm_base = gen_pm_base(tdf, sampling_filter)

            # A combination that fails validation contributes no row at all.
            try:
                weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
            except ValidationError:
                continue

            akpd_cutoff_col.append(akpd_cutoff)
            cohort_name_col.append(cohort_name)
            hour_filter_method_col.append(method)
            start_hour_col.append(start_hour)
            end_hour_col.append(end_hour)
            loss_factor_col.append(gt_metadata['expected_loss_factor'])
            starvation_days_col.append(starvation_days)
            avg_weight_col.append(np.mean(weights))
            gt_avg_weight_col.append(gt_metadata['gutted_average_weight'])

            for index, loss_factor in enumerate(loss_factors):
                if loss_factor == 'expected_loss_factor':
                    # Cohort-specific loss factor, falling back to 0.165 when it
                    # is missing or falsy.
                    loss_factor = gt_metadata['expected_loss_factor'] or 0.165

                    # Values above 10 are treated as percentages and rescaled.
                    if loss_factor > 10:
                        loss_factor = loss_factor / 100.0

                avg_weight_err = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)
                avg_weight_error_col[index].append(avg_weight_err)

                count_distribution_errors = generate_distribution_accuracy(weights, gt_metadata, loss_factor)
                count_distribution_error_col[index].append(count_distribution_errors)


In [None]:
# Assemble the accumulator lists into one results table, one row per evaluated
# (cohort, AKPD cutoff, hour window) combination.
columns = dict([
    ('cohort_name', cohort_name_col),
    ('hour_filter_method_col', hour_filter_method_col),
    ('akpd_cutoff_col', akpd_cutoff_col),
    ('start_hour_col', start_hour_col),
    ('end_hour_col', end_hour_col),
    ('loss_factor_col', loss_factor_col),
    ('starvation_days_col', starvation_days_col),
    ('avg_weight_col', avg_weight_col),
    ('gt_avg_weight_col', gt_avg_weight_col),
])

# Three columns per loss factor: signed error, absolute error, and the mean
# absolute count-distribution error. The column suffix is 'exp' for the
# per-cohort expected loss factor, otherwise the factor with two decimals.
for index, loss_factor in enumerate(loss_factors):
    suffix = 'exp' if loss_factor == 'expected_loss_factor' else '%0.2f' % (loss_factor,)
    col_name = 'avg_weight_error_' + suffix
    col_abs_name = 'avg_weight_error_abs_' + suffix
    col_abs_dist_name = 'avg_count_dist_error_abs_' + suffix

    signed_errors = avg_weight_error_col[index]
    columns[col_name] = signed_errors
    columns[col_abs_name] = np.abs(signed_errors)
    columns[col_abs_dist_name] = [np.mean(np.abs(errs)) for errs in count_distribution_error_col[index]]

tdf = pd.DataFrame(columns)

In [None]:
# Snapshot of this run for later comparison: AKPD cutoff 0.01 with the
# histogram-derived hour window, reduced to the identifying and error columns.
df7 = tdf.loc[
    (tdf['akpd_cutoff_col'] == 0.01) & (tdf['hour_filter_method_col'] == 'hour_hist'),
    ['cohort_name', 'gt_avg_weight_col', 'avg_weight_error_0.16', 'avg_weight_error_exp'],
]


In [None]:
# --- Evaluation settings ------------------------------------------------------
start_hours, end_hours = [7], [15]
apply_growth_rate = True
max_day_diff = 3
days_post_feeding = 1
final_days_post_feeding = 1
loss_factors = [0.16, 'expected_loss_factor'] # need to determine the right values here
akpd_cutoffs = [0.01, 0.95]

hour_filter_methods = ['manual', 'hour_hist'] #  'u-shape',

# --- Result accumulators: one entry per evaluated combination -----------------
cohort_name_col, akpd_cutoff_col, hour_filter_method_col = [], [], []
start_hour_col, end_hour_col = [], []
loss_factor_col, starvation_days_col = [], []
avg_weight_col, gt_avg_weight_col = [], []

# These two hold one inner list per entry of loss_factors (same order).
avg_weight_error_col = [[] for _ in loss_factors]
count_distribution_error_col = [[] for _ in loss_factors]

# Evaluate every cohort under each (AKPD cutoff, hour-filter method) combination
# and append one row per successful combination to the *_col accumulator lists.
# To debug a single cohort, iterate over a one-element list instead, e.g.
# for cohort_name in ['dale_pen_id_143_2020-10-07_2020-10-21']:
for cohort_name in sorted(list(dfs.keys())):
    print(cohort_name)
    gt_metadata = gt_metadatas[cohort_name]

    last_feeding_date = gt_metadata['last_feeding_date']
    slaughter_date = gt_metadata['slaughter_date']

    # Whole days between last feeding and slaughter; None when either is missing.
    if slaughter_date is not None and last_feeding_date is not None:
        date_diff = datetime.strptime(slaughter_date, '%Y-%m-%d') - datetime.strptime(last_feeding_date, '%Y-%m-%d')
        starvation_days = date_diff.days
    else:
        starvation_days = None

    df = dfs[cohort_name]

    # Blend two weight estimates into the column used downstream: weight_v8
    # where weight_v5 exceeds 6000, weight_v5 itself everywhere else.
    # This is one whole-column assignment on purpose. Chained indexing of the
    # form df[col][mask] = ... may write into a temporary copy: pandas emits
    # SettingWithCopyWarning for it, and with copy-on-write enabled the write
    # is lost entirely. Note that df is dfs[cohort_name], so the stored frame
    # is updated in place.
    mask = df['weight_v5'] > 6000
    df['estimated_weight_g'] = df['weight_v8'].where(mask, df['weight_v5'])

    # Keep rows dated on or before the cutoff returned by add_days
    # (presumably last feeding date + final_days_post_feeding days -- confirm).
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
    tdf = df[df.date <= final_date_post_feeding]

    # (method, start_hour, end_hour) windows to evaluate for this cohort.
    start_end_hours = []

    for method in hour_filter_methods:
        if method == 'manual':
            # Every combination of the configured start and end hours.
            for start_hour in start_hours:
                for end_hour in end_hours:
                    start_end_hours.append((method, start_hour, end_hour))
        elif method == 'u-shape':
            pass
        elif method == 'hour_hist':
            # Derive the window from the hourly sample histogram. This uses the
            # full cohort frame df (not the date-filtered tdf), limited to
            # hours 3..20 inclusive.
            df2 = df[(df.hour >= 3) & (df.hour <= 20)]

            # bins = max - min gives unit-width bins over [min, max], so the
            # bin edges fall on whole hours (assuming integer hour values).
            # plt.hist also draws onto the current figure as a side effect.
            count, bins, _ = plt.hist(df2.hour, density = True, bins = (np.max(df2.hour) - np.min(df2.hour)))

            # Bins whose density exceeds 1/18, i.e. a uniform spread over the
            # 18 hours from 3 to 20.
            idx_values = np.where(count > 1.0 / 18)[0]

            # Seed the window at the bin whose left edge is hour 10. Both
            # lookups raise IndexError if that edge does not exist or if the
            # hour-10 bin is not above the threshold.
            start_index = np.where(bins == 10)[0][0]
            start_array = np.where(idx_values == start_index)[0][0]

            lower_index = start_array
            upper_index = start_array

            # Grow the window outwards while the neighbouring above-threshold
            # bins are contiguous.
            while lower_index > 0 and (idx_values[lower_index] - idx_values[lower_index - 1] == 1):
                lower_index = lower_index - 1
            while upper_index < len(idx_values) - 1 and (idx_values[upper_index + 1] - idx_values[upper_index] == 1):
                upper_index = upper_index + 1

            # Left edges of the first and last bins of the contiguous run.
            start_hour, end_hour = bins[idx_values[lower_index]], bins[idx_values[upper_index]]

            start_end_hours.append((method, start_hour, end_hour))

    for akpd_cutoff in akpd_cutoffs:
        for method, start_hour, end_hour in start_end_hours:
            sampling_filter = SamplingFilter(
                start_hour=start_hour,
                end_hour=end_hour,
                kf_cutoff=0.0,
                akpd_score_cutoff=akpd_cutoff
            )

            pm_base = gen_pm_base(tdf, sampling_filter)

            # A combination that fails validation contributes no row at all.
            try:
                weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
            except ValidationError:
                continue

            akpd_cutoff_col.append(akpd_cutoff)
            cohort_name_col.append(cohort_name)
            hour_filter_method_col.append(method)
            start_hour_col.append(start_hour)
            end_hour_col.append(end_hour)
            loss_factor_col.append(gt_metadata['expected_loss_factor'])
            starvation_days_col.append(starvation_days)
            avg_weight_col.append(np.mean(weights))
            gt_avg_weight_col.append(gt_metadata['gutted_average_weight'])

            for index, loss_factor in enumerate(loss_factors):
                if loss_factor == 'expected_loss_factor':
                    # Cohort-specific loss factor, falling back to 0.165 when it
                    # is missing or falsy.
                    loss_factor = gt_metadata['expected_loss_factor'] or 0.165

                    # Values above 10 are treated as percentages and rescaled.
                    if loss_factor > 10:
                        loss_factor = loss_factor / 100.0

                avg_weight_err = generate_average_weight_accuracy(weights, gt_metadata, loss_factor)
                avg_weight_error_col[index].append(avg_weight_err)

                count_distribution_errors = generate_distribution_accuracy(weights, gt_metadata, loss_factor)
                count_distribution_error_col[index].append(count_distribution_errors)


In [None]:
# Assemble the accumulator lists into one results table, one row per evaluated
# (cohort, AKPD cutoff, hour window) combination.
columns = dict([
    ('cohort_name', cohort_name_col),
    ('hour_filter_method_col', hour_filter_method_col),
    ('akpd_cutoff_col', akpd_cutoff_col),
    ('start_hour_col', start_hour_col),
    ('end_hour_col', end_hour_col),
    ('loss_factor_col', loss_factor_col),
    ('starvation_days_col', starvation_days_col),
    ('avg_weight_col', avg_weight_col),
    ('gt_avg_weight_col', gt_avg_weight_col),
])

# Three columns per loss factor: signed error, absolute error, and the mean
# absolute count-distribution error. The column suffix is 'exp' for the
# per-cohort expected loss factor, otherwise the factor with two decimals.
for index, loss_factor in enumerate(loss_factors):
    suffix = 'exp' if loss_factor == 'expected_loss_factor' else '%0.2f' % (loss_factor,)
    col_name = 'avg_weight_error_' + suffix
    col_abs_name = 'avg_weight_error_abs_' + suffix
    col_abs_dist_name = 'avg_count_dist_error_abs_' + suffix

    signed_errors = avg_weight_error_col[index]
    columns[col_name] = signed_errors
    columns[col_abs_name] = np.abs(signed_errors)
    columns[col_abs_dist_name] = [np.mean(np.abs(errs)) for errs in count_distribution_error_col[index]]

tdf = pd.DataFrame(columns)

In [None]:
# Snapshot of this run for later comparison: AKPD cutoff 0.01 with the
# histogram-derived hour window, reduced to the identifying and error columns.
df8 = tdf.loc[
    (tdf['akpd_cutoff_col'] == 0.01) & (tdf['hour_filter_method_col'] == 'hour_hist'),
    ['cohort_name', 'gt_avg_weight_col', 'avg_weight_error_0.16', 'avg_weight_error_exp'],
]


In [None]:
# Per-fish ground-truth weights for the two cohorts that have them. The CSVs
# are read from the current working directory.
# NOTE(review): the `* 1000 / 0.86` scaling presumably converts gutted kg to
# live-weight grams with a 14% loss factor -- confirm the units and the factor
# (other cells in this notebook use 0.16).
gt = pd.read_csv('eide_langoy_singleweights.csv')
pen5 = pd.read_csv('blom_vikane_singleweights.csv')

langoy_gt = gt['weight'] * 1000 / 0.86
vikane_gt = pen5['weight'] * 1000 / 0.86

# imr_gt = pd.read_csv('imr.csv').weight

# (cohort name, ground-truth weight series) pairs consumed by the comparison cells.
single_weights = list(zip(
    ['langoy_pen_id_108_2020-05-07_2020-05-17', 'vikane_pen_id_60_2020-08-10_2020-08-30'],
    [langoy_gt, vikane_gt],
))

In [None]:
# Distribution of the scaled Langoy single-fish ground-truth weights.
plt.hist(langoy_gt)

In [None]:
# Distribution of the scaled Vikane single-fish ground-truth weights.
plt.hist(vikane_gt)

In [None]:
# Overlay the weight_v2 estimates on the per-fish ground truth for each cohort
# that has single weights.
#
# The hour window is set once here so the sampling filter and
# generate_raw_individual_values use the same 7-15 window, instead of the
# latter reading whatever start_hour/end_hour an earlier cell left behind.
start_hour, end_hour = 7, 15

for cohort_name, gt_weights in single_weights:
    gt_metadata = gt_metadatas[cohort_name]

    df = dfs[cohort_name]
    df['estimated_weight_g'] = df['weight_v2']
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
    # Deliberately not named tdf: that name holds the grid-search results table
    # that later cells read.
    cohort_df = df[df.date <= final_date_post_feeding]

    sampling_filter = SamplingFilter(
        start_hour=start_hour,
        end_hour=end_hour,
        kf_cutoff=0.0,
        akpd_score_cutoff=0.01
    )

    pm_base = gen_pm_base(cohort_df, sampling_filter)

    try:
        weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
    except ValidationError as err:
        # Skip this cohort; falling through would plot `weights` left over from
        # a previous iteration or cell.
        print('Skipping %s: %s' % (cohort_name, err))
        continue

    # Ground truth in blue, estimates in red on the same bin edges.
    plt.figure(figsize=(20, 10))
    count, bins, _ = plt.hist(gt_weights, density = True, color = 'blue', bins = 50)
    plt.hist(weights, density = True, alpha = 0.5, color = 'red', bins = bins)
    print(np.mean(gt_weights), np.mean(weights))

In [None]:
# Overlay the weight_v5 estimates on the per-fish ground truth for each cohort
# that has single weights.
#
# The hour window is set once here so the sampling filter and
# generate_raw_individual_values use the same 7-15 window, instead of the
# latter reading whatever start_hour/end_hour an earlier cell left behind.
start_hour, end_hour = 7, 15

for cohort_name, gt_weights in single_weights:
    gt_metadata = gt_metadatas[cohort_name]

    df = dfs[cohort_name]
    df['estimated_weight_g'] = df['weight_v5']
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
    # Deliberately not named tdf: that name holds the grid-search results table
    # that later cells read.
    cohort_df = df[df.date <= final_date_post_feeding]

    sampling_filter = SamplingFilter(
        start_hour=start_hour,
        end_hour=end_hour,
        kf_cutoff=0.0,
        akpd_score_cutoff=0.01
    )

    pm_base = gen_pm_base(cohort_df, sampling_filter)

    try:
        weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
    except ValidationError as err:
        # Skip this cohort; falling through would plot `weights` left over from
        # a previous iteration or cell.
        print('Skipping %s: %s' % (cohort_name, err))
        continue

    # Ground truth in blue, estimates in red on the same bin edges.
    plt.figure(figsize=(20, 10))
    count, bins, _ = plt.hist(gt_weights, density = True, color = 'blue', bins = 50)
    plt.hist(weights, density = True, alpha = 0.5, color = 'red', bins = bins)
    print(np.mean(gt_weights), np.mean(weights))

In [None]:
# Overlay the weight_v4 estimates on the per-fish ground truth for each cohort
# that has single weights.
#
# The hour window is set once here so the sampling filter and
# generate_raw_individual_values use the same 7-15 window, instead of the
# latter reading whatever start_hour/end_hour an earlier cell left behind.
start_hour, end_hour = 7, 15

for cohort_name, gt_weights in single_weights:
    gt_metadata = gt_metadatas[cohort_name]

    df = dfs[cohort_name]
    df['estimated_weight_g'] = df['weight_v4']
    final_date_post_feeding = add_days(gt_metadata['last_feeding_date'], final_days_post_feeding)
    # Deliberately not named tdf: that name holds the grid-search results table
    # that later cells read.
    cohort_df = df[df.date <= final_date_post_feeding]

    sampling_filter = SamplingFilter(
        start_hour=start_hour,
        end_hour=end_hour,
        kf_cutoff=0.0,
        akpd_score_cutoff=0.01
    )

    pm_base = gen_pm_base(cohort_df, sampling_filter)

    try:
        weights = generate_raw_individual_values(pm_base, gt_metadata, start_hour, end_hour, apply_growth_rate, max_day_diff, days_post_feeding, final_days_post_feeding)
    except ValidationError as err:
        # Skip this cohort; falling through would plot `weights` left over from
        # a previous iteration or cell.
        print('Skipping %s: %s' % (cohort_name, err))
        continue

    # Ground truth in blue, estimates in red on the same bin edges.
    plt.figure(figsize=(20, 10))
    count, bins, _ = plt.hist(gt_weights, density = True, color = 'blue', bins = 50)
    plt.hist(weights, density = True, alpha = 0.5, color = 'red', bins = bins)
    print(np.mean(gt_weights), np.mean(weights))

In [None]:
# Pull out the Tittelsnes cohort frame for a closer look at its weight columns.
kdf = dfs['tittelsnes_pen_id_37_2020-06-10_2020-06-24']

In [None]:
# Per-row difference between the weight_v1 and weight_v2 estimates for the
# selected cohort, as a 200-bin histogram.
plt.figure(figsize=(20, 10))
plt.hist(kdf['weight_v1'] - kdf['weight_v2'], bins=200)
plt.grid()
plt.show()

In [None]:
# Headline accuracy numbers for one loss-factor variant. `index` selects the
# entry of loss_factors; the column name is built with '%0.2f', so this only
# works for the numeric entries, not for 'expected_loss_factor'.
index = 0
loss_factor = loss_factors[index]
col_abs_name = 'avg_weight_error_abs_%0.2f' % (loss_factor,)
error = tdf[col_abs_name]

print('Loss factor', loss_factor)
print()

# Absolute average-weight error across all evaluated rows, as percentages.
print('Average Weight Error')
print('Avg %0.1f' % (100 * np.mean(error),))
print('90th Pct %0.1f' % (100 * np.percentile(error, 90),))
print('Max %0.1f' % (100 * np.max(error),))
print()

# Flatten the per-row lists of count-distribution errors into one list.
dist_errors = [
    item
    for sublist in count_distribution_error_col[index]
    for item in sublist
]

print('Count Distribution Error')
print('Avg %0.1f' % (100 * np.mean(np.abs(dist_errors)),))
print('90th Pct %0.1f' % (100 * np.percentile(np.abs(dist_errors), 90),))
print('Max %0.1f' % (100 * np.max(np.abs(dist_errors)),))

In [None]:
# for cohort_name in cohort_names:
#     mask = tdf.cohort_name == cohort_name
#     print(tdf[mask].sort_values('avg_weight_error_abs', ascending=True).head(10))

In [None]:
# Inspect the ground-truth metadata for the Vikane cohort.
# NOTE(review): this key starts on 2020-08-05, whereas the cohort list at the
# top of the notebook uses 'vikane_pen_id_60_2020-08-10_2020-08-30'. If
# gt_metadatas is keyed by that list this lookup raises KeyError -- confirm.
gt_metadatas['vikane_pen_id_60_2020-08-05_2020-08-30']

In [None]:
# Distinct cohort names present in tdf.
tdf.cohort_name.unique()

In [None]:
# Rows for one Tittelsnes cohort under a fixed parameter combination, best
# (smallest absolute error) first.
# NOTE(review): days_post_feeding, final_days_post_feeding, max_day_diff,
# loss_factor and avg_weight_error_abs are not columns of the results table
# assembled in the cells above; this presumably targets an earlier grid-search
# table -- confirm before relying on it.
mask = (
    (tdf['cohort_name'] == 'tittelsnes_pen_id_37_2020-05-23_2020-06-24')
    & (tdf['days_post_feeding'] == 1)
    & (tdf['final_days_post_feeding'] == 1)
    & (tdf['max_day_diff'] == 3)
    & (tdf['loss_factor'] == 0.17)
)
tdf.loc[mask].sort_values(by='avg_weight_error_abs')



In [None]:
# Median absolute average-weight error for start hour 6 under a fixed
# parameter combination.
# NOTE(review): days_post_feeding, final_days_post_feeding, max_day_diff and
# avg_weight_error_abs are not columns of the results table assembled in the
# cells above; presumably an earlier grid-search table -- confirm.
mask = (
    (tdf['start_hour_col'] == 6)
    & (tdf['days_post_feeding'] == 1)
    & (tdf['final_days_post_feeding'] == 1)
    & (tdf['max_day_diff'] == 3)
)
tdf.loc[mask, 'avg_weight_error_abs'].median()



In [None]:
# Median absolute average-weight error for start hour 7 under a fixed
# parameter combination.
# NOTE(review): days_post_feeding, final_days_post_feeding, max_day_diff and
# avg_weight_error_abs are not columns of the results table assembled in the
# cells above; presumably an earlier grid-search table -- confirm.
mask = (
    (tdf['start_hour_col'] == 7)
    & (tdf['days_post_feeding'] == 1)
    & (tdf['final_days_post_feeding'] == 1)
    & (tdf['max_day_diff'] == 3)
)
tdf.loc[mask, 'avg_weight_error_abs'].median()



In [None]:
# Aggregate the grid-search table across cohorts: for every parameter
# combination, record the std, mean absolute value and mean of the
# average-weight error.
# NOTE(review): apply_growth_rate_list, max_day_diff_list,
# days_post_feeding_list and final_days_post_feeding_list are not defined in
# the surrounding cells, and the tdf columns read here are not part of the
# results table assembled above; presumably this belongs to an earlier
# grid-search run -- confirm.
cohort_name_col = []
start_hour_col, end_hour_col = [], []
apply_growth_rate_col = []
max_day_diff_col = []
days_post_feeding_col, final_days_post_feeding_col = [], []
loss_factor_col = []
std_avg_weight_error_col = []
abs_avg_weight_error_col = []
mean_avg_weight_error_col = []

for start_hour in start_hours:
    for end_hour in end_hours:
        for apply_growth_rate in apply_growth_rate_list:
            for max_day_diff in max_day_diff_list:
                for days_post_feeding in days_post_feeding_list:
                    for final_days_post_feeding in final_days_post_feeding_list:
                        for loss_factor in loss_factors:
                            # Rows matching this exact parameter combination.
                            mask = (
                                (tdf.start_hour_col == start_hour)
                                & (tdf.end_hour_col == end_hour)
                                & (tdf.apply_growth_rate == apply_growth_rate)
                                & (tdf.max_day_diff == max_day_diff)
                                & (tdf.days_post_feeding == days_post_feeding)
                                & (tdf.final_days_post_feeding == final_days_post_feeding)
                                & (tdf.loss_factor == loss_factor)
                            )
                            matching_rows = tdf[mask]

                            start_hour_col.append(start_hour)
                            end_hour_col.append(end_hour)
                            apply_growth_rate_col.append(apply_growth_rate)
                            max_day_diff_col.append(max_day_diff)
                            days_post_feeding_col.append(days_post_feeding)
                            final_days_post_feeding_col.append(final_days_post_feeding)
                            loss_factor_col.append(loss_factor)
                            std_avg_weight_error_col.append(matching_rows.avg_weight_error.std())
                            abs_avg_weight_error_col.append(matching_rows.avg_weight_error_abs.mean())
                            mean_avg_weight_error_col.append(matching_rows.avg_weight_error.mean())

In [None]:
# One row per parameter combination with its aggregated error statistics.
rdf = pd.DataFrame(dict(
    start_hour_col=start_hour_col,
    end_hour_col=end_hour_col,
    apply_growth_rate=apply_growth_rate_col,
    max_day_diff=max_day_diff_col,
    days_post_feeding=days_post_feeding_col,
    final_days_post_feeding=final_days_post_feeding_col,
    loss_factor=loss_factor_col,
    abs_avg_weight_error=abs_avg_weight_error_col,
    std_avg_weight_error=std_avg_weight_error_col,
    mean_avg_weight_error=mean_avg_weight_error_col,
))



In [None]:
# Display the aggregated per-combination table.
rdf

In [None]:
# Parameter combinations evaluated with loss factor 0.16, ranked from the
# smallest to the largest mean absolute average-weight error.
mask = rdf['loss_factor'] == 0.16
rdf.loc[mask].sort_values(by='abs_avg_weight_error')

In [None]:
# Persist the grid-search table (index included) to a fixed absolute path.
tdf.to_csv(path_or_buf='/root/data/alok/biomass_estimation/playground/smart_average_param_grid_search.csv')

In [None]:
# Rows for one Bolaks cohort, smallest absolute average-weight error first.
# NOTE(review): the cohort list at the top of the notebook names this cohort
# 'bolaks_pen_id_88_2020-02-28_2020-03-10', and avg_weight_error_abs is not a
# column of the results table assembled above; presumably this targets an
# earlier grid-search table -- confirm.
tdf.loc[tdf['cohort_name'] == 'bolaks_pen_id_88_2020-02-10_2020-03-10'].sort_values(by='avg_weight_error_abs')



In [None]:
# generate Vikane average weight and distribution error - explore basic parameters
# NOTE(review): `df` and `ground_truth_f` are whatever earlier cells left in the
# kernel; confirm they refer to the Vikane cohort before trusting the output.

# Read the metadata inside a with-block so the file handle is closed promptly.
with open(ground_truth_f) as ground_truth_file:
    ground_truth_metadata = json.load(ground_truth_file)

# Sample window: the two dates add_days returns for offsets -2 and -1 from the
# day after feeding stopped, both bounds inclusive.
day_after_feeding_stop = add_days(ground_truth_metadata['last_feeding_date'], 1)
start_date, end_date = add_days(day_after_feeding_stop, -2), add_days(day_after_feeding_stop, -1)
tdf = df[(df.date >= start_date) & (df.date <= end_date)].copy(deep=True)

sampling_filter = SamplingFilter(
    start_hour=7,
    end_hour=15,
    akpd_score_cutoff=0.95,
    kf_cutoff=0.0
)
pm_base = gen_pm_base(tdf, sampling_filter)
# NOTE(review): the positional arguments (3, True, True, 0.9) are passed
# unnamed; check the generate_smart_individual_values signature for their meaning.
weights, _ = generate_smart_individual_values(pm_base, day_after_feeding_stop, 3, True, True, 0.9)


In [None]:
# Mean of the individual weight values produced by the previous cell.
np.mean(weights)