In [None]:
"""
This module contains constants representing core & auxiliary fish body parts.
"""

# Core keypoints.
UPPER_LIP = 'UPPER_LIP'
EYE = 'EYE'
PECTORAL_FIN = 'PECTORAL_FIN'
DORSAL_FIN = 'DORSAL_FIN'
PELVIC_FIN = 'PELVIC_FIN'
ADIPOSE_FIN = 'ADIPOSE_FIN'
ANAL_FIN = 'ANAL_FIN'
TAIL_NOTCH = 'TAIL_NOTCH'

# Auxiliary keypoints.
UPPER_PRECAUDAL_PIT = 'UPPER_PRECAUDAL_PIT'
LOWER_PRECAUDAL_PIT = 'LOWER_PRECAUDAL_PIT'
HYPURAL_PLATE = 'HYPURAL_PLATE'

# Each list is kept in alphabetical order; code elsewhere in this file looks up keypoint
# positions with core_body_parts.index(...), so the ordering is part of the contract.
core_body_parts = sorted((
    UPPER_LIP, EYE, PECTORAL_FIN, DORSAL_FIN,
    PELVIC_FIN, ADIPOSE_FIN, ANAL_FIN, TAIL_NOTCH,
))

auxiliary_body_parts = sorted((
    UPPER_PRECAUDAL_PIT, LOWER_PRECAUDAL_PIT, HYPURAL_PLATE,
))

all_body_parts = sorted([*core_body_parts, *auxiliary_body_parts])

In [None]:
"""This module contains utility helper functions for the WeightEstimator class."""

from collections import namedtuple
from typing import Dict, List, Tuple
import numpy as np
import torch


# Immutable record of the camera parameters consumed by the keypoint utilities below.
CameraMetadata = namedtuple(
    'CameraMetadata',
    'focal_length focal_length_pixel baseline_m '
    'pixel_count_width pixel_count_height '
    'image_sensor_width image_sensor_height',
)


def get_left_right_keypoint_arrs(annotation: Dict[str, List[Dict]]) -> Tuple:
    """Builds left / right keypoint coordinate arrays from a keypoint annotation.

    Args:
        annotation: dict with keys 'leftCrop' and 'rightCrop'. Each value is a list of dicts;
        this function reads the 'keypointType', 'xFrame' and 'yFrame' entries of every element.
    Returns:
        X_left: numpy array with one (xFrame, yFrame) row per entry of core_body_parts, in
        core_body_parts order, taken from the left crop.
        X_right: same as above, but for the right crop.
    Raises:
        KeyError: if a core body part is absent from either crop.
    """

    def _frame_coords_by_part(items):
        # When a keypoint type appears more than once, the last occurrence wins.
        return {item['keypointType']: (item['xFrame'], item['yFrame']) for item in items}

    left_lookup = _frame_coords_by_part(annotation['leftCrop'])
    right_lookup = _frame_coords_by_part(annotation['rightCrop'])

    # Look up left and right together, one body part at a time.
    paired = [(left_lookup[part], right_lookup[part]) for part in core_body_parts]

    X_left = np.array([left_xy for left_xy, _ in paired])
    X_right = np.array([right_xy for _, right_xy in paired])
    return X_left, X_right


def normalize_left_right_keypoint_arrs(X_left: np.ndarray, X_right: np.ndarray) -> Tuple:
    """Normalizes input left and right key-point arrays. The normalization involves (1) 2D
    translation of all keypoints such that they are centered, (2) rotation of the 2D coordinates
    about the center such that the line passing through UPPER_LIP and fish center is horizontal.

    Args:
        X_left: left-image keypoint array, one (x, y) row per entry of core_body_parts.
        X_right: right-image keypoint array with the same row ordering.
    Returns:
        X_left_rot: centered and rotated left keypoints.
        X_right_rot: X_left_rot minus the pre-rotation left/right offset, so the per-keypoint
        left-minus-right difference is the same as before the rotation.
    """

    # translate key-points, perform reflection if necessary
    upper_lip_idx = core_body_parts.index(UPPER_LIP)
    tail_notch_idx = core_body_parts.index(TAIL_NOTCH)
    if X_left[upper_lip_idx, 0] > X_left[tail_notch_idx, 0]:
        # Upper lip is to the right of the tail notch in the left image: center both arrays on
        # the midpoint of the left keypoints' bounding box.
        X_center = 0.5 * (np.max(X_left, axis=0) + np.min(X_left, axis=0))
        X_left_centered = X_left - X_center
        X_right_centered = X_right - X_center
    else:
        # Otherwise center on the right keypoints' bounding box, swap the left/right roles and
        # negate the x-coordinates (horizontal reflection).
        X_center = 0.5 * (np.max(X_right, axis=0) + np.min(X_right, axis=0))
        X_left_centered = X_right - X_center
        X_right_centered = X_left - X_center
        X_left_centered[:, 0] = -X_left_centered[:, 0]
        X_right_centered[:, 0] = -X_right_centered[:, 0]

    # rotate key-points
    upper_lip_x, upper_lip_y = tuple(X_left_centered[upper_lip_idx])
    # NOTE(review): divides by upper_lip_x; a centered upper lip with x == 0 yields inf/nan here.
    theta = np.arctan(upper_lip_y / upper_lip_x)
    R = np.array([
        [np.cos(theta), -np.sin(theta)],
        [np.sin(theta), np.cos(theta)]
    ])

    # Only the left array is rotated; the right array is rebuilt from the rotated left array and
    # the offset D measured before rotation.
    D = X_left_centered - X_right_centered
    X_left_rot = np.dot(X_left_centered, R)
    X_right_rot = X_left_rot - D
    return X_left_rot, X_right_rot


def convert_to_world_point_arr(X_left: np.ndarray, X_right: np.ndarray,
                               camera_metadata: CameraMetadata) -> np.ndarray:
    """Converts input left and right normalized keypoint arrays into world coordinate array.

    Returns an array with one (x_world, y_world, z_world) row per keypoint, where y_world is
    computed from the horizontal left/right disparity.
    """

    disparity = X_left[:, 0] - X_right[:, 0]
    y_world = camera_metadata.focal_length_pixel * camera_metadata.baseline_m / disparity

    # Note: the lines commented out below are technically the correct formula for conversion
    # x_world = X_left[:, 0] * y_world / camera_metadata.focal_length_pixel
    # z_world = -X_left[:, 1] * y_world / camera_metadata.focal_length_pixel
    # Pixel coordinates rescaled by sensor size / pixel count before projecting.
    sensor_x = X_left[:, 0] * camera_metadata.image_sensor_width / \
        camera_metadata.pixel_count_width
    sensor_y = X_left[:, 1] * camera_metadata.image_sensor_height / \
        camera_metadata.pixel_count_height

    x_world = (sensor_x * y_world) / camera_metadata.focal_length
    z_world = (-sensor_y * y_world) / camera_metadata.focal_length
    return np.vstack((x_world, y_world, z_world)).T


def stabilize_keypoints(X: np.ndarray) -> np.ndarray:
    """Transforms world coordinate array so that neural network inputs are stabilized.

    Every output column is divided by column 1 of the input: output columns 0 and 1 are half of
    input columns 0 and 2 over column 1, and output column 2 is 0.05 over column 1.
    """
    depth = X[:, 1]
    stabilized = np.zeros(X.shape)
    for dst_col, src_col in ((0, 0), (1, 2)):
        stabilized[:, dst_col] = 0.5 * X[:, src_col] / depth
    stabilized[:, 2] = 0.05 / depth
    return stabilized


def convert_to_nn_input(annotation: Dict[str, List[Dict]], camera_metadata: CameraMetadata) \
        -> torch.Tensor:
    """Converts input keypoint annotation and camera metadata into neural network tensor input.

    Pipeline: pixel keypoint arrays -> normalized arrays -> world coordinates -> stabilized
    coordinates, wrapped in a leading batch dimension of size one and cast to float32.
    """
    left_px, right_px = get_left_right_keypoint_arrs(annotation)
    left_norm, right_norm = normalize_left_right_keypoint_arrs(left_px, right_px)
    world_points = convert_to_world_point_arr(left_norm, right_norm, camera_metadata)
    batch = np.array([stabilize_keypoints(world_points)])
    return torch.from_numpy(batch).float()

In [None]:
"""
This module contains the WeightEstimator class for estimating fish weight (g), length (mm), and
k-factor given input keypoint coordinates and camera metadata.
"""

from typing import Dict, Tuple
import torch
from torch import nn


class Network(nn.Module):
    """Fully connected regressor (24 -> 256 -> 128 -> 64 -> 1 with ReLU activations) used for
    both weight and k-factor estimation (currently both networks share this architecture)."""

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay as they are for saved
        # weights to load.
        self.fc1 = nn.Linear(24, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run inference on input keypoint tensor; returns a tensor of shape (batch, 1)."""
        # Flatten everything after the batch dimension.
        hidden = x.view(x.shape[0], -1)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        return self.output(hidden)


class WeightEstimator:
    """WeightEstimator class is used to predict fish weight, k-factor, and length
    given input keypoint annotations and camera metadata."""

    def __init__(self, weight_model_f: str, kf_model_f: str) -> None:
        """Initializes class with input weight and k-factor neural-networks.

        Args:
            weight_model_f: path to the saved state dict of the weight network.
            kf_model_f: path to the saved state dict of the k-factor network.
        """
        self.weight_model = self._load_network(weight_model_f)
        self.kf_model = self._load_network(kf_model_f)

    @staticmethod
    def _load_network(model_f: str) -> 'Network':
        """Builds a Network, loads the state dict stored at model_f and switches to eval mode."""
        network = Network()
        # Inputs are CPU tensors, so map the weights to CPU explicitly; this also lets
        # checkpoints saved on a GPU load on hosts without one.
        network.load_state_dict(torch.load(model_f, map_location='cpu'))
        network.eval()
        return network

    @staticmethod
    def _get_model_input(annotation: Dict, camera_metadata: CameraMetadata) -> torch.Tensor:
        """Generates neural-network input tensor given annotation and camera_metadata."""
        X = convert_to_nn_input(annotation, camera_metadata)
        return X

    def predict_weight(self, annotation: Dict, camera_metadata: CameraMetadata) -> float:
        """Generates weight prediction given input annotation and camera metadata.

        The raw network output is multiplied by 1e4.
        """
        X = self._get_model_input(annotation, camera_metadata)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            weight = 1e4 * self.weight_model(X).item()
        return weight

    def predict_kf(self, annotation: Dict, camera_metadata: CameraMetadata) -> float:
        """Generates k-factor prediction given input annotation and camera metadata."""
        X = self._get_model_input(annotation, camera_metadata)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            kf = self.kf_model(X).item()
        return kf

    def predict(self, annotation: Dict, camera_metadata: CameraMetadata) -> Tuple:
        """Generates weight, k-factor, and length predictions given input annotation and camera
        metadata.

        Returns:
            (weight, length, kf), where length = (1e5 * weight / kf) ** (1/3) when
            weight * kf is positive and 0 otherwise.
        """
        weight = self.predict_weight(annotation, camera_metadata)
        kf = self.predict_kf(annotation, camera_metadata)
        if weight * kf > 0:
            length = (1e5 * weight / kf) ** (1.0 / 3)
        else:
            # A non-positive product would make the cube root meaningless.
            length = 0
        return weight, length, kf

In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
import numpy as np

from collections import defaultdict
# from weight_estimation.train import train, augment, normalize, get_data_split, train_model
# from weight_estimation.utils import body_parts, normalize_left_right_keypoint_arrs
from research.utils.data_access_utils import S3AccessUtils
# from weight_estimation.dataset import prepare_gtsf_data
# from weight_estimation.weight_estimator import WeightEstimator
# from weight_estimation.utils import CameraMetadata
import pandas as pd
from research.weight_estimation.keypoint_utils.optics import pixel2world
# from weight_estimation.body_parts import core_body_parts
from keras.models import load_model

from matplotlib import pyplot as plt
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
import uuid

In [None]:
from collections import namedtuple

# Camera parameter record used when calling the weight estimator.
CameraMetadata = namedtuple(
    'CameraMetadata',
    (
        'focal_length',
        'focal_length_pixel',
        'baseline_m',
        'pixel_count_width',
        'pixel_count_height',
        'image_sensor_width',
        'image_sensor_height',
    ),
)

In [None]:
# Read the AWS credentials from the JSON file named by $AWS_CREDENTIALS; the context manager
# closes the file handle instead of leaving it open.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    s3 = S3AccessUtils('/tmp/data', json.load(credentials_file))

<h1> Generate base data </h1>

In [None]:
akpd_scorer_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/keypoint-detection-scorer/akpd_scorer_model_TF.h5'
# Read the AWS credentials with a context manager so the file handle is closed.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    s3 = S3AccessUtils('/tmp/data', json.load(credentials_file))
akpd_scorer_f, _, _ = s3.download_from_url(akpd_scorer_url)
# NOTE(review): prepare_gtsf_data is defined in a later cell of this notebook, so this cell
# fails on a fresh kernel unless that cell has been run first.
# Positional arguments 0.5 and 1.0 are the AKPD-score cutoff and the depth cutoff.
df1 = prepare_gtsf_data('2019-03-01', '2019-09-20', akpd_scorer_f, 0.5, 1.0)

In [None]:
# Second extraction window, using the same AKPD-score cutoff (0.5) and depth cutoff (1.0).
df2 = prepare_gtsf_data('2020-06-01', '2020-08-20', akpd_scorer_f, 0.5, 1.0)

In [None]:
# Stack both extraction windows, then keep only rows whose k-factor is below 3.0.
df = pd.concat([df1, df2])
mask = df['k_factor'] < 3.0
df = df.loc[mask].copy(deep=True)

In [None]:
# NOTE(review): this replaces the `df` assembled in the cells above with a cached CSV read from
# an absolute, machine-specific path.
df = pd.read_csv('/root/alok/repos/research-exploration/bryton/biomass/gtsf_data.csv')

lengths = []

for idx, row in df.iterrows():
    # The keypoints / camera_metadata columns are evaluated as Python expressions.
    # NOTE(review): eval() executes arbitrary code -- only acceptable if the CSV is trusted.
    ann, cm = eval(row.keypoints), eval(row.camera_metadata)
    wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
    # Length proxy: straight-line distance between the UPPER_LIP and TAIL_NOTCH world points.
    lengths.append(np.linalg.norm(wkps['UPPER_LIP'] - wkps['TAIL_NOTCH']))
    
df['length'] = lengths

<h1> Get length -> weight allometric model </h1>

In [None]:
# df['length'] = df.data.apply(lambda x: x['lengthMms'])
# Scatter of the keypoint-derived length column against the weight column.
plt.scatter(df.length, df.weight)
plt.xlabel('length')
plt.ylabel('weight')
plt.grid()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit log(weight) as a linear function of log(length), i.e. weight = exp(intercept) * length**coef.
lr = LinearRegression()
res = lr.fit(np.log(df.length.values).reshape(-1, 1), np.log(df.weight.values))

In [None]:
# Display the fitted slope of the log-log regression (the exponent applied to length).
res.coef_

In [None]:
def predict_weight_from_length(length):
    """Predicts weight from length with the fitted log-log regression `res`.

    Computes exp(res.intercept_) * length ** res.coef_[0].
    """
    scale = np.exp(res.intercept_)
    exponent = res.coef_[0]
    return scale * length**exponent

In [None]:
# Apply the length -> weight model to every row.
df['pred_weight'] = df.length.apply(predict_weight_from_length)

<h1> Plot relationship between weight and predicted length </h1>

In [None]:
# Convert every row's stereo keypoint annotation to world keypoints via pixel2world and keep
# the result in a new column.
world_keypoints = []
for idx, row in df.iterrows():
    # NOTE(review): eval() executes arbitrary code -- only acceptable if the CSV is trusted.
    ann, cm = eval(row.keypoints), eval(row.camera_metadata)
    wkps = pixel2world(ann['leftCrop'], ann['rightCrop'], cm)
    world_keypoints.append(wkps)
    
    
df['world_keypoints'] = world_keypoints

In [None]:
# Distance between the UPPER_LIP and TAIL_NOTCH world keypoints -- the same formula used to
# fill the 'length' column earlier in this notebook.
df['pred_length'] = df.world_keypoints.apply(lambda x: np.linalg.norm(x['UPPER_LIP'] - x['TAIL_NOTCH']))

In [None]:
# Scatter of the keypoint-derived length against the weight column, with labelled axes.
plt.scatter(df.pred_length.values, df.weight.values)
plt.xlabel('pred_length')
plt.ylabel('weight')
plt.show()

In [None]:
# Scatter of the weight column against the allometric-model prediction, with labelled axes.
plt.scatter(df['weight'], df['pred_weight'])
plt.xlabel('weight')
plt.ylabel('pred_weight')
plt.show()

<h1> Using fish in 3-4 kg, simulate larger fish and assess representativeness </h1>

In [None]:
def simulate_larger_fish(world_keypoints, weight, pct_inflation, allometric_exponent=3.08):
    """Scales a fish's world keypoints uniformly and adjusts its weight allometrically.

    Args:
        world_keypoints: mapping from body part to world coordinates; every entry of
            core_body_parts must be present and support multiplication by a scalar.
        weight: weight of the original fish.
        pct_inflation: fractional size increase (0.1 scales every coordinate by 1.1).
        allometric_exponent: exponent applied to the scale factor when scaling the weight.
            Defaults to 3.08, the value previously hard-coded here.
    Returns:
        (modified_world_keypoints, modified_weight): scaled keypoints for the core body parts
        and weight * (1 + pct_inflation) ** allometric_exponent.
    """
    scale = 1.0 + pct_inflation
    modified_world_keypoints = {
        body_part: scale * world_keypoints[body_part] for body_part in core_body_parts
    }
    modified_weight = scale**allometric_exponent * weight
    return modified_world_keypoints, modified_weight
    

def get_ann_from_world_keypoints(world_keypoints, cm):
    """Projects world keypoints back into a left/right pixel annotation.

    Args:
        world_keypoints: mapping from body part to an (x, y, z) triple; y is used as the divisor
            in every projection.
        cm: camera metadata dict; reads 'focalLengthPixel', 'pixelCountWidth',
            'pixelCountHeight' and 'baseline'.
    Returns:
        dict with 'leftCrop' and 'rightCrop' lists holding one
        {'keypointType', 'xFrame', 'yFrame'} item per core body part. The right x-coordinate is
        the left one minus the rounded disparity; both share the same y-coordinate.
    """
    left_crop, right_crop = [], []
    for body_part in core_body_parts:
        x, y, z = world_keypoints[body_part]
        px_x = round(x * cm['focalLengthPixel'] / y + cm['pixelCountWidth'] / 2.0)
        px_y = round(cm['pixelCountHeight'] / 2.0 - z * cm['focalLengthPixel'] / y)
        disparity = round(cm['focalLengthPixel'] * cm['baseline'] / y)

        left_crop.append(dict(keypointType=body_part, xFrame=px_x, yFrame=px_y))
        right_crop.append(dict(keypointType=body_part, xFrame=px_x - disparity, yFrame=px_y))

    return {'leftCrop': left_crop, 'rightCrop': right_crop}
    

In [None]:
# Scratch cell: draws and displays one uniform sample (also advances NumPy's global RNG state).
np.random.uniform(0, 1)

In [None]:
# Build synthetic larger fish from every row weighing at most 5000: inflate each fish by a random
# fraction in [0, max_pct_inflation) and re-project the scaled keypoints to pixel annotations.
# NOTE(review): no random seed is set in this cell, so the synthetic set differs between runs.
mask = (df.weight <= 5000) 
max_pct_inflation = 0.15


modified_ann_list = []
modified_weight_list = []
cm_list = []
for idx, row in df[mask].iterrows():
    pct_inflation = np.random.uniform(0, max_pct_inflation)
    world_keypoints = row.world_keypoints
    # NOTE(review): eval() executes arbitrary code -- only acceptable if the CSV is trusted.
    cm = eval(row.camera_metadata)
    weight = row.weight
    modified_world_keypoints, modified_weight = simulate_larger_fish(world_keypoints, weight, pct_inflation)
    modified_ann = get_ann_from_world_keypoints(modified_world_keypoints, cm)
    
    modified_ann_list.append(modified_ann)
    modified_weight_list.append(modified_weight)
    cm_list.append(cm)
    
    
    



In [None]:
# weight_model_f = '/Users/aloksaxena/Documents/repos/production_algo/weight_estimation/src/weight_estimation/weight_model.h5'
# kf_model_f = '/Users/aloksaxena/Documents/repos/production_algo/weight_estimation/src/weight_estimation/kf_model.h5'
# NOTE(review): absolute, machine-specific model paths.
weight_model_f = '/root/alok/repos/research-exploration/bryton/biomass/weight_model_synthetic_data.pb'
kf_model_f = '/root/alok/repos/research-exploration/bryton/biomass/kf_model.pb'
    
    
# Load both networks and reset the result accumulators / progress counter.
weight_estimator = WeightEstimator(weight_model_f, kf_model_f)
weights, lengths, kfs = [], [], []
count = 0

In [None]:
# Run the loaded weight estimator on every synthetic annotation and collect the predicted weight.
pred_weights = []
for ann, camera_metadata in zip(modified_ann_list, cm_list):
    # Translate the camelCase camera-metadata dict into the CameraMetadata record.
    camera_metadata_obj = CameraMetadata(
        focal_length=camera_metadata['focalLength'],
        focal_length_pixel=camera_metadata['focalLengthPixel'],
        baseline_m=camera_metadata['baseline'],
        pixel_count_width=camera_metadata['pixelCountWidth'],
        pixel_count_height=camera_metadata['pixelCountHeight'],
        image_sensor_width=camera_metadata['imageSensorWidth'],
        image_sensor_height=camera_metadata['imageSensorHeight']
    )
    
    # predict() returns (weight, length, kf); only the weight is kept.
    weight, _, _ = weight_estimator.predict(ann, camera_metadata_obj)
    pred_weights.append(weight)

In [None]:
# Simulated weight vs. model prediction; the red line is the identity (perfect prediction).
plt.scatter(np.array(modified_weight_list), np.array(pred_weights))
plt.plot([min(modified_weight_list), max(modified_weight_list)], 
         [min(modified_weight_list), max(modified_weight_list)], color='red')
plt.xlabel('Simulated weight of synthetic fish')
plt.ylabel('Production model prediction on synthetic fish')
plt.show()

In [None]:
# Relative error of the mean: (mean prediction - mean simulated weight) / mean simulated weight.
(np.mean(pred_weights) - np.mean(modified_weight_list)) / np.mean(modified_weight_list)

In [None]:
# Mean absolute error relative to the simulated weight, averaged over all synthetic fish.
np.mean((np.abs(np.array(pred_weights) - np.array(modified_weight_list))) / np.array(modified_weight_list))

In [None]:
# Inspect the available columns before assembling the training frame.
df.columns

<h1> Modify the training dataset with synthetic data and apply augmentation </h1>

In [None]:
import ast


def _parse_if_serialized(value):
    """Parses a string holding a Python literal; returns any non-string value unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


mask = df.weight < 10000
tdf_original = df.loc[mask, ['keypoints', 'fish_id', 'weight', 'k_factor', 'camera_metadata']].copy(deep=True)

# Other cells in this notebook eval() these two columns, i.e. they hold strings when `df` comes
# from the CSV, whereas the synthetic rows below hold dicts. Parse the strings so both halves of
# the training frame carry dicts, which augment() indexes directly. Values that are already
# dicts pass through untouched.
for column in ('keypoints', 'camera_metadata'):
    tdf_original[column] = tdf_original[column].apply(_parse_if_serialized)

# One synthetic row per simulated fish: a fresh UUID as fish id and a constant k-factor of 1.0.
synthetic_rows = list(zip(modified_ann_list, modified_weight_list, cm_list))
annotation_list = [ann for ann, _, _ in synthetic_rows]
fish_id_list = [uuid.uuid1() for _ in synthetic_rows]
weight_list = [weight for _, weight, _ in synthetic_rows]
kf_list = [1.0] * len(synthetic_rows)
camera_metadata_list = [camera_metadata for _, _, camera_metadata in synthetic_rows]

tdf_synthetic = pd.DataFrame({
    'keypoints': annotation_list,
    'fish_id': fish_id_list,
    'weight': weight_list,
    'k_factor': kf_list,
    'camera_metadata': camera_metadata_list
})

tdf = pd.concat([tdf_original, tdf_synthetic])
    


In [None]:
# Weight distribution of the combined (original + synthetic) training frame.
plt.hist(tdf.weight, bins=20)

In [None]:
from typing import Dict, List
import json, os
import numpy as np
import pandas as pd
from keras.models import load_model
from research.weight_estimation.akpd_utils.akpd_scorer import generate_confidence_score
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils
from weight_estimation.utils import get_left_right_keypoint_arrs, convert_to_world_point_arr, \
    CameraMetadata


# generate raw GTSF dataframe from database
def generate_raw_df(start_date, end_date):
    """Loads fish metadata joined with keypoint annotations captured in a date range.

    Args:
        start_date: lower bound of the captured_at range, interpolated into the SQL text.
        end_date: upper bound of the captured_at range, interpolated into the SQL text.
    Returns:
        Whatever rds.extract_from_database returns for the query (used as a DataFrame by the
        callers in this file).
    """
    # Close the credentials file deterministically instead of leaking the handle.
    with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
        rds = RDSAccessUtils(json.load(credentials_file))
    # NOTE(review): the dates are formatted straight into the SQL string; pass trusted values
    # only, or switch to bound parameters if extract_from_database supports them.
    query = """
        select * from research.fish_metadata a left join keypoint_annotations b
        on a.left_url = b.left_image_url
        where b.keypoints -> 'leftCrop' is not null
        and b.keypoints -> 'rightCrop' is not null
        and b.captured_at between '{0}' and '{1}';
    """.format(start_date, end_date)
    df = rds.extract_from_database(query)
    return df


def process(df: pd.DataFrame) -> pd.DataFrame:
    """Keeps salmon rows and de-duplicates annotations in favor of QA rows.

    A non-QA row is dropped when a QA row shares its left_image_url. QA rows come first in the
    result, followed by the remaining non-QA rows.
    """
    is_salmon = df.data.apply(lambda x: x['species'].lower()) == 'salmon'
    salmon_df = df[is_salmon].copy(deep=True)

    qa_flag = salmon_df.is_qa == True
    qa_df = salmon_df[qa_flag]
    covered_by_qa = salmon_df.left_image_url.isin(qa_df.left_image_url)
    cogito_df = salmon_df[~qa_flag & ~covered_by_qa]

    return pd.concat([qa_df, cogito_df], axis=0)


def compute_akpd_score(akpd_scorer_network, keypoints: Dict, camera_metadata: Dict) -> float:
    """Scores one keypoint annotation with the AKPD scorer network.

    Wraps the annotation and camera metadata in the sample dict passed to
    generate_confidence_score and returns that function's result.
    """
    return generate_confidence_score(
        {
            'keypoints': keypoints,
            'cm': camera_metadata,
            'stereo_pair_id': 0,
            'single_point_inference': True
        },
        akpd_scorer_network,
    )


def generate_akpd_scores(df: pd.DataFrame, akpd_scorer_f: str) -> List[float]:
    """Computes an AKPD score for every row of df.

    Loads the scorer model from akpd_scorer_f, scores each row's `keypoints` and
    `camera_metadata`, and prints progress every 1000 rows.
    """
    akpd_scorer_network = load_model(akpd_scorer_f)
    total_rows = df.shape[0]
    akpd_scores = []
    for position, (_, row) in enumerate(df.iterrows()):
        if position % 1000 == 0:
            print('Percentage complete: {}%'.format(round(100 * position / total_rows, 2)))
        akpd_scores.append(
            compute_akpd_score(akpd_scorer_network, row.keypoints, row.camera_metadata))
    return akpd_scores


def generate_depths(df: pd.DataFrame):
    """Computes the mean keypoint depth for every row of df.

    Args:
        df: frame with a `keypoints` annotation dict and a `camera_metadata` dict per row.
    Returns:
        List with one value per row: the mean over keypoints of the depth coordinate.
    """
    depths = []
    for _, row in df.iterrows():
        annotation = row.keypoints
        camera_metadata = row.camera_metadata
        cm = CameraMetadata(
            focal_length=camera_metadata['focalLength'],
            focal_length_pixel=camera_metadata['focalLengthPixel'],
            baseline_m=camera_metadata['baseline'],
            pixel_count_width=camera_metadata['pixelCountWidth'],
            pixel_count_height=camera_metadata['pixelCountHeight'],
            image_sensor_width=camera_metadata['imageSensorWidth'],
            image_sensor_height=camera_metadata['imageSensorHeight']
        )
        X_left, X_right = get_left_right_keypoint_arrs(annotation)
        X_world = convert_to_world_point_arr(X_left, X_right, cm)
        # Column 1 is y_world, the disparity-derived distance from the camera. The previous
        # `X_world[:, ]` selected the whole array and averaged x, y and z together.
        depths.append(np.mean(X_world[:, 1]))
    return depths


def prepare_gtsf_data(start_date: str, end_date: str, akpd_scorer_f: str,
                      akpd_score_cutoff: float, depth_cutoff: float) -> pd.DataFrame:
    """Loads, preprocesses and filters the GTSF data for a date range.

    Adds `k_factor` (1e5 * weight / lengthMms**3), `akpd_score` and `depth` columns, then keeps
    rows with akpd_score above akpd_score_cutoff and depth below depth_cutoff.
    """
    raw_df = generate_raw_df(start_date, end_date)
    print('Raw data loaded!')
    df = process(raw_df)
    print('Data preprocessed!')

    length_cubed = df.data.apply(lambda x: x['lengthMms']**3).astype(float)
    df['k_factor'] = 1e5 * df.weight / length_cubed
    df['akpd_score'] = generate_akpd_scores(df, akpd_scorer_f)
    df['depth'] = generate_depths(df)

    keep = (df.akpd_score > akpd_score_cutoff) & (df.depth < depth_cutoff)
    return df[keep].copy(deep=True)



In [None]:
# Augmentation settings read by augment(): `trials` augmented copies per row, a jitter standard
# deviation drawn from [0, max_jitter_std), and a scaling factor drawn from
# [min_scaling_factor, max_scaling_factor).
augmentation_config = dict(
    trials=10,
    max_jitter_std=10,
    min_scaling_factor=0.3,
    max_scaling_factor=2.0
)

# NOTE(review): augment is defined in a later cell of this notebook, so this cell fails on a
# fresh kernel unless that cell has been run first.
augmented_df = augment(tdf, augmentation_config)

In [None]:
# Score every augmented annotation with the AKPD scorer, printing progress every 1000 rows.
# Same loop as generate_akpd_scores, except the annotation is read from the `annotation` column.
count = 0
akpd_scores = []
akpd_scorer_network = load_model(akpd_scorer_f)
for idx, row in augmented_df.iterrows():
    if count % 1000 == 0:
        print('Percentage complete: {}%'.format(round(100 * count / augmented_df.shape[0], 2)))
    count += 1
    akpd_score = compute_akpd_score(akpd_scorer_network, row.annotation, row.camera_metadata)
    akpd_scores.append(akpd_score)


augmented_df['akpd_score'] = akpd_scores

<h1> Train the synthetic-data based model </h1>

In [None]:
from collections import defaultdict
import json
import os
import random
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy.interpolate import interpn
from weight_estimation.utils import get_left_right_keypoint_arrs, get_ann_from_keypoint_arrs,\
    convert_to_nn_input, CameraMetadata
from weight_estimation.dataset import prepare_gtsf_data
from keras.layers import Input, Dense, Flatten
from keras.models import Model
import keras
from research.utils.data_access_utils import S3AccessUtils


def augment(df: pd.DataFrame, augmentation_config: Dict) -> pd.DataFrame:
    """Creates randomly rescaled and jittered copies of every annotation in df.

    Args:
        df: frame with `keypoints`, `fish_id`, `weight`, `k_factor` and `camera_metadata`.
        augmentation_config: dict with 'trials', 'max_jitter_std', 'min_scaling_factor' and
            'max_scaling_factor'.
    Returns:
        Frame with `trials` rows per input row and columns `annotation`, `fish_id`, `weight`,
        `kf` and `camera_metadata`. Only the annotation changes; the other values are copied.
    """
    trials = augmentation_config['trials']
    max_jitter_std = augmentation_config['max_jitter_std']
    min_scaling_factor = augmentation_config['min_scaling_factor']
    max_scaling_factor = augmentation_config['max_scaling_factor']

    augmented_data = defaultdict(list)
    for _, row in df.iterrows():
        for _trial in range(trials):
            # Draw order (scale, jitter std, left noise, right noise) fixes the random stream.
            scaling_factor = np.random.uniform(min_scaling_factor, max_scaling_factor)
            jitter_std = np.random.uniform(0, max_jitter_std)

            left_px, right_px = get_left_right_keypoint_arrs(row.keypoints)

            # rescale both views by the same factor
            left_px = left_px * scaling_factor
            right_px = right_px * scaling_factor

            # add independent jitter to the x-coordinates only
            left_px[:, 0] += np.random.normal(0, jitter_std, left_px.shape[0])
            right_px[:, 0] += np.random.normal(0, jitter_std, right_px.shape[0])

            # reconstruct annotation and record it with the untouched row metadata
            new_values = (
                ('annotation', get_ann_from_keypoint_arrs(left_px, right_px)),
                ('fish_id', row.fish_id),
                ('weight', row.weight),
                ('kf', row.k_factor),
                ('camera_metadata', row.camera_metadata),
            )
            for column, value in new_values:
                augmented_data[column].append(value)

    return pd.DataFrame(augmented_data)


def normalize(anns: List, camera_metadatas: List) -> np.ndarray:
    """Converts paired annotations and camera-metadata dicts into stacked network inputs.

    Each camera-metadata dict is translated into a CameraMetadata record and passed with its
    annotation to convert_to_nn_input; the results are combined with np.array.
    """

    def _as_camera_metadata(raw: Dict) -> CameraMetadata:
        # Map the camelCase dict keys onto the record's snake_case fields.
        return CameraMetadata(
            focal_length=raw['focalLength'],
            focal_length_pixel=raw['focalLengthPixel'],
            baseline_m=raw['baseline'],
            pixel_count_width=raw['pixelCountWidth'],
            pixel_count_height=raw['pixelCountHeight'],
            image_sensor_width=raw['imageSensorWidth'],
            image_sensor_height=raw['imageSensorHeight']
        )

    return np.array([
        convert_to_nn_input(ann, _as_camera_metadata(camera_metadata))
        for ann, camera_metadata in zip(anns, camera_metadatas)
    ])


def get_data_split(X: np.ndarray, y: np.ndarray, fish_ids: np.ndarray, train_pct: float,
                   val_pct: float) -> Tuple:
    """Randomly splits samples into train / validation / test sets with no shared fish IDs.

    Args:
        X: sample array, indexed along its first axis.
        y: target array aligned with X.
        fish_ids: fish identifier per sample, aligned with X; all samples of one fish land in
            the same split.
        train_pct: expected fraction of unique fish assigned to the training set.
        val_pct: expected fraction of unique fish assigned to the validation set; the test set
            gets the remainder.
    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test
    """
    # Floating-point subtraction can leave a tiny negative remainder (1.0 - 0.9 - 0.1 < 0), and
    # recent NumPy versions reject negative probabilities; clamp the remainder at zero.
    test_pct = max(0.0, 1.0 - train_pct - val_pct)

    # De-duplicate in first-seen order. Unlike set(), this ordering does not depend on string
    # hashing, so a seeded run reproduces the same split.
    unique_fish_ids = np.array(list(dict.fromkeys(fish_ids)))
    train_cnt, val_cnt, test_cnt = np.random.multinomial(len(unique_fish_ids),
                                                         [train_pct, val_pct, test_pct])

    # Shuffle the split labels (0 = train, 1 = val, 2 = test) across the unique fish.
    assignments = np.array([0] * train_cnt + [1] * val_cnt + [2] * test_cnt)
    np.random.shuffle(assignments)
    train_fish_ids = unique_fish_ids[np.where(assignments == 0)]
    val_fish_ids = unique_fish_ids[np.where(assignments == 1)]
    test_fish_ids = unique_fish_ids[np.where(assignments == 2)]

    train_mask = np.isin(fish_ids, train_fish_ids)
    val_mask = np.isin(fish_ids, val_fish_ids)
    test_mask = np.isin(fish_ids, test_fish_ids)

    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    return X_train, y_train, X_val, y_val, X_test, y_test


def train_model(X_train, y_train, X_val, y_val, train_config):
    """Builds and fits the dense regression network (24 -> 256 -> 128 -> 64 -> 1).

    Args:
        X_train, y_train: training inputs and targets.
        X_val, y_val: validation inputs and targets, monitored for early stopping.
        train_config: dict with 'epochs', 'batch_size', 'learning_rate' and 'patience'.
    Returns:
        The fitted Keras model.
    """
    inputs = Input(shape=(24,))
    x = Dense(256, activation='relu')(inputs)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    pred = Dense(1)(x)
    model = Model(inputs, pred)

    epochs = train_config['epochs']
    batch_size = train_config['batch_size']
    learning_rate = train_config['learning_rate']
    patience = train_config['patience']

    # Stop once the validation loss has not improved for `patience` consecutive epochs.
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                               min_delta=0,
                                               patience=patience,
                                               verbose=0,
                                               mode='auto')]

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    # This is a regression model, so report mean absolute error; classification accuracy is
    # not meaningful for continuous targets.
    model.compile(optimizer=optimizer,
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error'])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), callbacks=callbacks,
              batch_size=batch_size, epochs=epochs)

    return model


def density_scatter(x, y, bins=20, **kwargs):
    """Draws a scatter plot whose points are colored by local 2D histogram density.

    Args:
        x, y: arrays of equal length (they are indexed with an integer index array).
        bins: bin specification forwarded to np.histogram2d.
        **kwargs: forwarded to ax.scatter.
    Returns:
        The matplotlib axes holding the plot (x labelled 'Prediction', y 'Ground Truth').
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    hist, x_edges, y_edges = np.histogram2d(x, y, bins=bins, density=True)
    x_centers = 0.5 * (x_edges[1:] + x_edges[:-1])
    y_centers = 0.5 * (y_edges[1:] + y_edges[:-1])
    sample_points = np.vstack([x, y]).T

    # Interpolate the binned density at every sample; NaN results are treated as zero density.
    density = interpn((x_centers, y_centers), hist, sample_points,
                      method="splinef2d", bounds_error=False)
    density[np.isnan(density)] = 0.0

    # Sort the points by density, so that the densest points are plotted last
    order = density.argsort()
    x, y, density = x[order], y[order], density[order]

    ax.scatter(x, y, c=density, **kwargs)

    color_norm = Normalize(vmin=np.min(density), vmax=np.max(density))
    cbar = fig.colorbar(cm.ScalarMappable(norm=color_norm), ax=ax)
    cbar.ax.set_ylabel('Density')

    ax.set_xlabel('Prediction')
    ax.set_ylabel('Ground Truth')
    ax.grid()

    return ax


def generate_accuracy_details(model, X_train, y_train, X_test, y_test):
    """Plots prediction-vs-truth densities and computes error statistics for both splits.

    Args:
        model: fitted model exposing predict().
        X_train, y_train: training inputs and targets.
        X_test, y_test: test inputs and targets.
    Returns:
        ax_train, ax_test: density scatter axes for the training and test sets.
        train_stats, test_stats: dicts with 'mean_absolute_error_pct' (mean of per-sample
        absolute relative errors) and 'mean_error_pct' (mean error over mean target), in %.
    """
    y_train_pred = model.predict(X_train).squeeze().astype(float)
    y_test_pred = model.predict(X_test).squeeze().astype(float)

    # density_scatter labels its x-axis 'Prediction' and its y-axis 'Ground Truth', so pass the
    # predictions first; the previous argument order produced mislabelled plots.
    # Both series are scaled by 1e4 for plotting.
    ax_train = density_scatter(1e4 * y_train_pred, 1e4 * y_train)
    ax_test = density_scatter(1e4 * y_test_pred, 1e4 * y_test)

    train_stats = {
        'mean_absolute_error_pct': 100 * np.mean(np.abs((y_train_pred - y_train) / y_train)),
        'mean_error_pct': 100 * np.mean(y_train_pred - y_train) / np.mean(y_train)
    }
    test_stats = {
        'mean_absolute_error_pct': 100 * np.mean(np.abs((y_test_pred - y_test) / y_test)),
        'mean_error_pct': 100 * np.mean(y_test_pred - y_test) / np.mean(y_test)
    }

    return ax_train, ax_test, train_stats, test_stats


def train(augmented_df, train_config, weight):
    """Trains a weight or k-factor model on the augmented dataset and evaluates it.

    Args:
        augmented_df: frame with `annotation`, `camera_metadata`, `fish_id`, `weight` and `kf`.
        train_config: dict with 'train_pct', 'val_pct', 'epochs', 'batch_size', 'learning_rate'
            and 'patience'.
        weight: truthy to train on weight, falsy to train on k-factor.
    Returns:
        model, ax_train, ax_test, train_stats, test_stats
    """
    # Seed both RNGs so the data split is repeatable.
    random.seed(0)
    np.random.seed(0)

    anns = augmented_df.annotation.values.tolist()
    cms = augmented_df.camera_metadata.values.tolist()
    X = normalize(anns, cms)

    if weight:
        # Weight targets are scaled by 1e-4.
        y = 1e-4 * augmented_df.weight.values
    else:
        # K-factor targets are shifted by 1.2 and divided by 0.3.
        y = (augmented_df.kf.values - 1.2) / 0.3

    fish_ids = augmented_df.fish_id.values
    X_train, y_train, X_val, y_val, X_test, y_test = get_data_split(X, y, fish_ids,
                                                                    train_config['train_pct'],
                                                                    train_config['val_pct'])
    model = train_model(X_train, y_train, X_val, y_val, train_config)
    ax_train, ax_test, train_stats, test_stats = \
        generate_accuracy_details(model, X_train, y_train, X_test, y_test)
    return model, ax_train, ax_test, train_stats, test_stats




In [None]:
# Split fractions and optimizer / early-stopping settings consumed by train().
train_config = {
    'train_pct': 0.8,
    'val_pct': 0.1,
    'epochs': 1000,
    'batch_size': 64,
    'learning_rate': 2e-5,
    'patience': 30,
}

key = 'weight'
# Train only on augmented rows with an AKPD score of at least 0.9 and weight above 7500.
mask = (augmented_df.akpd_score >= 0.9) & (augmented_df.weight > 7500)
model, ax_train, ax_test, train_stats, test_stats = train(
    augmented_df[mask], train_config, weight=(key == 'weight'))


In [None]:
# Path of the saved synthetic-data model; saving is currently disabled.
f = '/tmp/synthetic_data_model_large-bryton.h5'
# model.save(f)

In [None]:
# NOTE(review): this replaces the model trained above with whatever is stored at `f`; the save
# call in the previous cell is commented out, so the file must already exist.
model = load_model(f)

<h1> Test out on Leivesthamran pen </h1>

In [None]:
import argparse
from collections import defaultdict
import json
import os
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.dates as mdates
import numpy as np
import pandas as pd

from filter_optimization.filter_optimization_task import NoDataException, SamplingFilter, generate_filter_mask, \
     extract_biomass_data
from population_metrics.population_metrics_base import generate_pm_base, PopulationMetricsBase
from population_metrics.growth_rate import compute_local_growth_rate
from population_metrics.raw_metrics import get_raw_kf_values, generate_raw_average_weight, get_raw_sample_size
from population_metrics.smart_metrics import generate_smart_avg_weight, generate_smart_individual_values, \
     generate_smart_distribution, generate_smart_avg_kf, get_smart_sample_size, get_smart_growth_rate, \
     generate_smart_standard_deviation
from population_metrics.confidence_metrics import generate_trend_stability, generate_distribution_consistency, \
     compute_biomass_kpi, get_raw_and_historical_weights
from research.utils.datetime_utils import get_dates_in_range
from research.utils.data_access_utils import S3AccessUtils, RDSAccessUtils

pd.set_option('display.max_colwidth', 500)

In [None]:
# Pull biomass computations for the pen and date range under test. The AKPD score
# cutoff configured on the sampling filter is also handed to the extraction call.
pen_id, start_date, end_date = 204, '2020-11-25', '2020-12-04'
sampling_filter = SamplingFilter(start_hour=7, end_hour=15, kf_cutoff=0.0, akpd_score_cutoff=0.95)
rdf = extract_biomass_data(pen_id, start_date, end_date, sampling_filter.akpd_score_cutoff)

In [None]:
# s3.s3_client.upload_file('/tmp/large_weight_pytorch_model.pb', 'aquabyte-models', 'biomass/trained_models/2020-11-27T00-00-00/weight_model_synthetic_data.pb')



In [None]:
# Download the weight and k-factor model files and build the estimator used by the
# prediction loop in the next cell.
# NOTE(review): `s3` and `WeightEstimator` are not created/imported in this part of the
# notebook - they must already exist in the kernel for this cell to run.
# weight_model_f, _, _ = s3.download_from_url('https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/trained_models/2020-11-27T00-00-00/weight_model_synthetic_data.pb')
weight_model_f, _, _ = s3.download_from_url('https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/playground/nn_epoch_798_v2.pb')
kf_model_f, _, _ = s3.download_from_url('https://aquabyte-models.s3-us-west-1.amazonaws.com/k-factor/playground/kf_predictor_v2.pb')

weight_estimator = WeightEstimator(weight_model_f, kf_model_f)
# Accumulators filled by the prediction loop; `count` is its progress counter.
weights, lengths, kfs = [], [], []
count = 0

In [None]:
# Run the weight estimator over every row of rdf. `weights`, `lengths` and `kfs` are kept
# index-aligned with rdf: exactly one entry per row, None where the row has no annotation.
# Relies on `weights`, `lengths`, `kfs` and `count` being freshly initialised by the
# previous cell.
for idx, row in rdf.iterrows():
    if count % 100 == 0:
        print('Percentage completion: {}%'.format(round(100 * count / rdf.shape[0], 2)))
        print(count)
    count += 1
    annotation = row.annotation
    if not annotation:
        # Pad all three lists (not just weights) so they stay the same length as rdf.
        weights.append(None)
        lengths.append(None)
        kfs.append(None)
        continue
    camera_metadata = row.camera_metadata
    if not camera_metadata:
        # Fall back to the first row's camera metadata when this row has none.
        camera_metadata = rdf.camera_metadata.iloc[0]

    camera_metadata_obj = CameraMetadata(
        focal_length=camera_metadata['focalLength'],
        focal_length_pixel=camera_metadata['focalLengthPixel'],
        baseline_m=camera_metadata['baseline'],
        pixel_count_width=camera_metadata['pixelCountWidth'],
        pixel_count_height=camera_metadata['pixelCountHeight'],
        image_sensor_width=camera_metadata['imageSensorWidth'],
        image_sensor_height=camera_metadata['imageSensorHeight']
    )

    weight, length, kf = weight_estimator.predict(annotation, camera_metadata_obj)
    weights.append(weight)
    lengths.append(length)
    kfs.append(kf)

In [None]:
rdf.estimated_weight_g.mean()

In [None]:
rdf['estimated_weight_g'] = weights

In [None]:
pm_base = gen_pm_base(rdf, sampling_filter)

<h1> Large fish synthetic model Result </h1>

In [None]:
generate_smart_avg_weight(pm_base, '2020-12-03')

<h1> Original Production Model Result </h1>

In [None]:
generate_smart_avg_weight(pm_base, '2020-12-03')

In [None]:
# Overlay the two weight histograms. Each sample is weighted by 1 / n_rows, so bar heights
# are fractions of rdf rather than raw counts.
# NOTE(review): the `new_weight` column is not assigned in the visible cells - it must
# already exist on rdf for this cell to run.
plt.figure(figsize=(20, 10))
plt.hist(rdf.new_weight.values, bins=20, alpha=0.5, weights=np.ones(rdf.shape[0]) / rdf.shape[0], label='synthetic model weights')
plt.hist(rdf.estimated_weight_g.values, bins=20, weights=np.ones(rdf.shape[0]) / rdf.shape[0], alpha=0.5, label='original model weights')
plt.legend()
plt.grid()
plt.show()

In [None]:
rdf.estimated_weight_g.max()

In [None]:
rdf.new_weight.max()

In [None]:
# Same overlay as the earlier histogram cell: both series are weighted by 1 / n_rows so
# the bars show fractions of rdf.
# NOTE(review): `new_weight` is not assigned in the visible cells - confirm where it is set.
plt.figure(figsize=(20, 10))
plt.hist(rdf.new_weight.values, bins=20, alpha=0.5, weights=np.ones(rdf.shape[0]) / rdf.shape[0], label='synthetic model weights')
plt.hist(rdf.estimated_weight_g.values, bins=20, weights=np.ones(rdf.shape[0]) / rdf.shape[0], alpha=0.5, label='original model weights')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Read the SQL credentials from the JSON file named by DATA_WAREHOUSE_SQL_CREDENTIALS.
# The context manager closes the file handle, which the bare open() call left dangling.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds = RDSAccessUtils(json.load(credentials_file))


def gen_pm_base(df: pd.DataFrame, sampling_filter: SamplingFilter) -> PopulationMetricsBase:
    """
    Builds a PopulationMetricsBase from the rows of the biomass computations
    data-frame (see README for more details) that pass the given SamplingFilter.

    Args:
        df: biomass computations; the columns read here are date,
            estimated_weight_g and estimated_k_factor.
        sampling_filter: filter handed to generate_filter_mask to select rows.
    Returns:
        The result of generate_pm_base over the filtered (date, weight, k-factor) tuples.
    Raises:
        NoDataException: if no row survives the filter.
    """

    # restrict to the rows selected by the sampling filter
    filtered_df = df[generate_filter_mask(df, sampling_filter)]

    # one (date, weight, k-factor) tuple per surviving row
    biomass_computations = list(zip(filtered_df.date.values,
                                    filtered_df['estimated_weight_g'].values,
                                    filtered_df.estimated_k_factor.values))

    if len(biomass_computations) == 0:
        raise NoDataException('No data found for given filter!')

    # generate population metrics estimator
    return generate_pm_base(biomass_computations)


def generate_ts_data(df: pd.DataFrame, sampling_filter: SamplingFilter) -> defaultdict:
    """
    Generates per-date time-series of raw, growth-rate, confidence and smart metrics
    for the biomass computations in df that pass the given SamplingFilter.

    Returns:
        defaultdict(list) with a 'date' entry plus one list per metric name; every
        metric list has one value per date between the first and last date of the
        population metrics base.
    """

    pm_base = gen_pm_base(df, sampling_filter)
    first_date, last_date = pm_base.unique_dates[0], pm_base.unique_dates[-1]
    dates = get_dates_in_range(first_date, last_date)

    # (output key, metric function) pairs, evaluated in this order for every date
    metric_fns = (
        # raw metrics
        ('raw_average_weight', generate_raw_average_weight),
        ('raw_sample_size', get_raw_sample_size),
        # growth rate metrics
        ('growth_rate', compute_local_growth_rate),
        # confidence metrics
        ('distribution_consistency', generate_distribution_consistency),
        ('kpi', compute_biomass_kpi),
        # smart metrics
        ('smart_average_weight', generate_smart_avg_weight),
        ('smart_average_kf', generate_smart_avg_kf),
        ('smart_sample_size', get_smart_sample_size),
        ('smart_growth_rate', get_smart_growth_rate),
    )

    ts_data = defaultdict(list)
    ts_data['date'].extend(dates)
    for date in dates:
        for metric_name, metric_fn in metric_fns:
            ts_data[metric_name].append(metric_fn(pm_base, date))

    return ts_data




In [None]:
pm_base = gen_pm_base(rdf, sampling_filter)

In [None]:
generate_smart_avg_weight(pm_base, '2020-11-20')

In [None]:
generate_smart_avg_weight(pm_base, '2020-11-27')

In [None]:
generate_smart_avg_weight(pm_base, '2020-11-21')

In [None]:

def get_distribution(weights, bucket_cutoffs):
    """Returns the percentage of weights falling in each half-open bucket [low, high).

    Args:
        weights: array-like supporting boolean-mask indexing (e.g. numpy array).
        bucket_cutoffs: increasing sequence of bucket edges; consecutive pairs form
            the buckets.
    Returns:
        dict mapping a '<low>-<high>' label (edges multiplied by 1e-3) to the
        percentage of bucketed weights in that bucket, rounded to one decimal.
        Percentages are relative to the weights that landed in some bucket, not to
        len(weights). If no weight lands in any bucket every percentage is 0.0.
    """
    bucket_counts = {}
    for low, high in zip(bucket_cutoffs, bucket_cutoffs[1:]):
        bucket = f'{1e-3 * low}-{1e-3 * high}'
        bucket_counts[bucket] = weights[(weights >= low) & (weights < high)].shape[0]

    total = sum(bucket_counts.values())
    if total == 0:
        # nothing to normalise by; avoid dividing by zero
        return {bucket: 0.0 for bucket in bucket_counts}

    return {bucket: round(100 * n / total, 1) for bucket, n in bucket_counts.items()}


def get_kf_breakdown(weights, kfs, bucket_cutoffs):
    """Returns the mean k-factor of the fish whose weight falls in each bucket.

    Args:
        weights: array-like of weights, index-aligned with kfs.
        kfs: array-like of k-factors supporting boolean-mask indexing.
        bucket_cutoffs: increasing sequence of bucket edges; consecutive pairs form
            half-open buckets [low, high).
    Returns:
        dict mapping a '<low>-<high>' label (edges multiplied by 1e-3) to the mean
        k-factor in that bucket rounded to two decimals. A bucket containing no
        fish yields NaN (the mean of an empty selection).
    """
    dist = {}
    for low, high in zip(bucket_cutoffs, bucket_cutoffs[1:]):
        bucket = f'{1e-3 * low}-{1e-3 * high}'
        in_bucket = (weights >= low) & (weights < high)
        dist[bucket] = round(kfs[in_bucket].mean(), 2)

    return dist
        
def pretty(d, indent=0):
    """Prints a (possibly nested) dict as a tab-indented tree.

    Each key is printed at `indent` tabs. A dict value is printed recursively one
    level deeper; any other value is printed on its own line at `indent + 1` tabs.
    """
    key_prefix = '\t' * indent
    value_prefix = '\t' * (indent + 1)
    for key, value in d.items():
        print(key_prefix + str(key))
        if not isinstance(value, dict):
            print(value_prefix + str(value))
            continue
        pretty(value, indent + 1)
    

def generate_info(pm_base, date, loss_factor, adjustment_pct):
    """Prints and returns summary statistics of the smart individual values for a date.

    The individual weights from generate_smart_individual_values are scaled by
    (1 + adjustment_pct / 100) and by (1 - loss_factor) before any statistic is taken.

    Args:
        pm_base: population metrics base passed through to the smart-metric helpers.
        date: date to report on.
        loss_factor: fraction (e.g. 0.15) removed from every weight.
        adjustment_pct: percentage adjustment applied to every weight.
    Returns:
        dict with the loss factor (as a rounded percentage), smart average weight,
        k-factor, sample size, standard deviation (unrounded), coefficient of
        variation (percent), weight distribution and k-factor breakdown.
    """
    weights, kfs = generate_smart_individual_values(pm_base, date, 3, True, True, 0.9)
    adjusted = (1.0 + 0.01 * adjustment_pct) * weights * (1.0 - loss_factor)

    avg_weight = np.mean(adjusted)
    avg_kf = np.mean(kfs)
    sample_size = get_smart_sample_size(pm_base, date)
    std_weight = np.std(adjusted)
    cov = std_weight / avg_weight

    # both breakdowns use the same 1000-wide buckets from 0 up to 14000
    cutoffs = np.arange(0, 15000, 1000)
    weight_dist = get_distribution(adjusted, cutoffs)
    kf_breakdown = get_kf_breakdown(adjusted, kfs, cutoffs)

    report = {
        'loss_factor': round(100 * loss_factor),
        'smart_average_weight': round(avg_weight),
        'smart_k_factor': round(avg_kf, 2),
        'smart_sample_size': sample_size,
        'smart_standard_deviation': std_weight,
        'coefficient_of_variation': round(100 * cov, 1),
        'weight_distribution': weight_dist,
        'kf_breakdown': kf_breakdown
    }

    print(f"Loss Factor: {report['loss_factor']}%")
    print('-----------')
    print(f"Smart Avg Weight: {report['smart_average_weight']}g")
    print(f"Smart K Factor: {report['smart_k_factor']}")
    print(f"Smart Sample Size: {report['smart_sample_size']}")
    # the standard deviation is rounded for display only; the report keeps the raw value
    print(f"Smart Standard Deviation: {round(std_weight)}g")
    print(f"Coefficient of Variation: {report['coefficient_of_variation']}%")
    print('Weight Distribution:')
    print(json.dumps(weight_dist, indent=4))
    print('KF Breakdown:')
    print(json.dumps(kf_breakdown, indent=4))

    return report

In [None]:
# Report smart metrics for one date across a sweep of loss factors: 0 plus the values
# produced by np.arange(0.13, 0.19, 0.01).
# NOTE(review): np.arange with a float step can include or drop the end point because of
# rounding - confirm the sweep covers the intended loss factors.
date = '2020-12-03'
adjustment_pct = -1.5

output = []
for loss_factor in [0] + list(np.arange(0.13, 0.19, 0.01)):
    
    output.append(generate_info(pm_base, date, loss_factor, adjustment_pct))
    print(' ')

In [None]:
print(json.dumps(output, indent=4).replace('NaN', 'null'))

<h1> Generate weight trend line over larger period </h1>

In [None]:
# Pull biomass computations for a longer date range on another pen. This rebinds
# pen_id, start_date, end_date, sampling_filter and rdf used by the earlier cells.
pen_id, start_date, end_date = 153, '2020-08-01', '2020-11-22'
sampling_filter = SamplingFilter(start_hour=7, end_hour=15, kf_cutoff=0.0, akpd_score_cutoff=0.95)
rdf = extract_biomass_data(pen_id, start_date, end_date, sampling_filter.akpd_score_cutoff)

In [None]:
pm_base = gen_pm_base(rdf, sampling_filter)

In [None]:
# Smart average weight for every distinct date in rdf, in ascending date order, stored
# as the "original" series for the trend plot.
# NOTE(review): the next cell computes the "new" series with identical code from the same
# pm_base; the two only differ if pm_base is rebuilt between running them - confirm.
dates = sorted(rdf.date.unique())
original_weights = []
for date in dates:
    weight = generate_smart_avg_weight(pm_base, date)
    original_weights.append(weight)

In [None]:
# Smart average weight for every distinct date in rdf, in ascending date order, stored
# as the "new" series for the trend plot.
# NOTE(review): this is the same computation as the previous cell; it yields a different
# series only if pm_base was rebuilt from re-estimated weights in between - confirm.
dates = sorted(rdf.date.unique())
new_weights = []
for date in dates:
    weight = generate_smart_avg_weight(pm_base, date)
    new_weights.append(weight)

In [None]:
# Plot both smart-average-weight series against date on one axis, with the y-range
# fixed to 2000-7000.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(dates, original_weights, label='Original Weight')
ax.plot(dates, new_weights, label='Corrected Weight')
ax.set_ylim([2000, 7000])
ax.set_xlabel('Date')
ax.set_ylabel('Average Weight (g)')
ax.grid()
ax.legend()
# format_xdata is set to a date formatter; autofmt_xdate adjusts the date tick labels
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
fig.autofmt_xdate()
plt.show()

In [None]:
import matplotlib.dates as mdates
import matplotlib.cbook as cbook

In [None]:
years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
years_fmt = mdates.DateFormatter('%Y')

In [None]:
# matplotlib itself (not just pyplot) must be in scope before matplotlib.rc is called;
# the notebook only imports it in a later cell.
import matplotlib

# Global font settings for subsequent figures. 'sans-serif' is a valid generic family;
# the previous value 'normal' is not a font family and only triggers font-lookup warnings
# before falling back to the default font.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 10}

matplotlib.rc('font', **font)

In [None]:
del matplotlib
import matplotlib

In [None]:
xdf = pd.DataFrame({
    'date': dates,
    'original_weight': original_weights,
    'new_weight': new_weights
})

In [None]:
xdf.to_csv('/Users/aloksaxena/Desktop/original_new_weights.csv')

In [None]:
x, _ = generate_smart_individual_values(pm_base, '2020-05-12', 3, True, True, 0.9)

In [None]:
plt.figure(figsize=(20, 10))
plt.hist(x, bins=100)
plt.grid()
plt.show()

In [None]:
y, _ = generate_smart_individual_values(pm_base, '2020-05-12', 3, True, True, 0.9)

In [None]:
# Compare the predicted individual weights (x) with ground-truth weights. Both histograms
# are weighted by 1 / n so the bars are fractions. The ground-truth column is multiplied
# by 1000 * 1.17 before plotting.
# NOTE(review): gt_df is only loaded in the following cell, so this cell fails on a fresh
# top-to-bottom run. The meaning of the 1.17 factor is not visible here - confirm.
plt.figure(figsize=(20, 10))
plt.hist(x, color='blue', alpha=0.5, weights=np.ones(len(x)) / len(x), bins=20, label='prediction')
plt.hist(gt_df.weight * 1000 * 1.17, color='red', alpha=0.5, weights=np.ones(gt_df.shape[0]) / gt_df.shape[0], bins=20, label='ground truth')
plt.legend()
plt.grid()
plt.show()

In [None]:
gt_df = pd.read_csv('/Users/aloksaxena/Desktop/eide_langoy_singleweights.csv')

<h1> Convert to PyTorch </h1>

In [None]:
from weight_estimation.weight_estimator import Network

In [None]:
pytorch_model = Network()

In [None]:
weights = model.get_weights()

In [None]:
# Copy the Keras parameters into the PyTorch network layer by layer. `weights` holds
# (kernel, bias) pairs in layer order, so layer i reads entries 2*i and 2*i + 1; each
# array is transposed before being wrapped as a tensor.
for layer_idx, layer_name in enumerate(['fc1', 'fc2', 'fc3', 'output']):
    layer = getattr(pytorch_model, layer_name)
    layer.weight.data = torch.from_numpy(np.transpose(weights[2 * layer_idx]))
    layer.bias.data = torch.from_numpy(np.transpose(weights[2 * layer_idx + 1]))
                                                            

In [None]:
torch.save(pytorch_model.state_dict(), '/tmp/large_weight_pytorch_model.pb')

<h1> Evaluate model on standard GTSF dataset </h1>

In [None]:
# Build the estimator from local model artifacts and reset the accumulators used by the
# prediction loop in the next cell.
# NOTE(review): these are absolute paths on one developer's machine; `WeightEstimator` is
# not imported in the visible cells.
# weight_model_f = f
# kf_model_f = '/Users/aloksaxena/Documents/repos/production_algo/weight_estimation/src/weight_estimation/kf_model.h5'

weight_model_f = '/Users/aloksaxena/Documents/repos/production_algo/weight_estimation/tests/artifacts/weight_model.pb'
kf_model_f = '/Users/aloksaxena/Documents/repos/production_algo/weight_estimation/tests/artifacts/kf_model.pb'
    
    
weight_estimator = WeightEstimator(weight_model_f, kf_model_f)
weights, lengths, kfs = [], [], []
count = 0

In [None]:
# Run the weight estimator on every row of df, appending one weight / length / k-factor
# per row. Relies on `weights`, `lengths`, `kfs` and `count` having been reset by the
# previous cell; re-running this cell alone appends a second copy of the results.
for idx, row in df.iterrows():
    # progress report every 100 rows
    if count % 100 == 0:
        print('Percentage completion: {}%'.format(round(100 * count / df.shape[0], 2)))
        print(count)
    count += 1
    annotation = row.keypoints
    camera_metadata = row.camera_metadata

    # translate the camelCase metadata keys into the CameraMetadata fields
    camera_metadata_obj = CameraMetadata(
        focal_length=camera_metadata['focalLength'],
        focal_length_pixel=camera_metadata['focalLengthPixel'],
        baseline_m=camera_metadata['baseline'],
        pixel_count_width=camera_metadata['pixelCountWidth'],
        pixel_count_height=camera_metadata['pixelCountHeight'],
        image_sensor_width=camera_metadata['imageSensorWidth'],
        image_sensor_height=camera_metadata['imageSensorHeight']
    )

    weight, length, kf = weight_estimator.predict(annotation, camera_metadata_obj)
    weights.append(weight)
    lengths.append(length)
    kfs.append(kf)

In [None]:
df['estimated_weight_g'] = weights

In [None]:
# Per-weight-bucket accuracy of estimated_weight_g on rows with akpd_score > 0.9.
# bias: relative difference between mean prediction and mean ground truth in the bucket.
# mae: mean absolute relative error over the individual fish in the bucket.
bucket_cutoffs = np.arange(0, 13000, 1000)
buckets, biases, maes = [], [], []
for low, high in zip(bucket_cutoffs[:-1], bucket_cutoffs[1:]):
    bucket = f'{low}-{high}'
    mask = (df.weight > low) & (df.weight <= high) & (df.akpd_score > 0.9)
    bucket_df = df[mask]

    mean_true_weight = bucket_df.weight.mean()
    bias = (bucket_df.estimated_weight_g.mean() - mean_true_weight) / mean_true_weight

    true_weights = bucket_df.weight.values
    relative_errors = (bucket_df.estimated_weight_g.values - true_weights) / true_weights
    mae = np.mean(np.abs(relative_errors))

    buckets.append(bucket)
    biases.append(bias)
    maes.append(mae)
    print(bucket, bias, mae)
    


In [None]:
kdf = pd.DataFrame({
    'bucket': buckets,
    'bias': biases,
    'mae': maes
})

In [None]:
plt.figure(figsize=(20, 10))
plt.bar(kdf.bucket, kdf.bias * 100)
plt.xlabel('Weight Bucket (g)')
plt.ylabel('Bias (%)')
plt.title('Single Fish Accuracy vs. Weight Bucket')
plt.grid()
plt.show()

In [None]:
# Same per-bucket bias / mean-absolute-relative-error computation as the earlier cell,
# but on the estimated_weight_g_2 column and with buckets only up to 9000.
# NOTE(review): estimated_weight_g_2 is not assigned in the visible cells - confirm where
# it is created. This cell also overwrites buckets / biases / maes from the earlier run.
bucket_cutoffs = np.arange(0, 10000, 1000)
buckets, biases, maes = [], [], []
for low, high in zip(bucket_cutoffs, bucket_cutoffs[1:]):
    bucket = '{}-{}'.format(low, high)
    # rows with ground-truth weight in (low, high] and akpd_score above 0.9
    mask = (df.weight > low) & (df.weight <= high) & (df.akpd_score > 0.9)
    bias = (df[mask].estimated_weight_g_2.mean() - df[mask].weight.mean()) / (df[mask].weight.mean())
    mae = np.mean(np.abs((df[mask].estimated_weight_g_2.values - df[mask].weight.values) / (df[mask].weight.values)))
    
    buckets.append(bucket)
    biases.append(bias)
    maes.append(mae)
    print(bucket, bias, mae)
    


In [None]:
pd.DataFrame({
    'bucket': buckets,
    'bias': biases,
    'mae': maes
})

<h1> Train the model </h1>

In [None]:
"""This module contains utility helper functions for the WeightEstimator class."""

from collections import namedtuple
from typing import Dict, List, Tuple
import numpy as np
from weight_estimation import body_parts


# Camera parameters consumed by the keypoint-to-world-coordinate helpers. The field list
# includes focal_length so that it matches the definition at the top of this notebook and
# every CameraMetadata(...) call in it, all of which pass focal_length=...
CameraMetadata = namedtuple('CameraMetadata',
                            ['focal_length', 'focal_length_pixel', 'baseline_m',
                             'pixel_count_width', 'pixel_count_height', 'image_sensor_width',
                             'image_sensor_height'])


def get_left_right_keypoint_arrs(annotation: Dict[str, List[Dict]]) -> Tuple:
    """Extracts left and right key-point coordinate arrays from a key-point annotation.
    Args:
        annotation: dict with keys 'leftCrop' and 'rightCrop'. Each value is a list of
        dicts; the keys read here are 'keypointType', 'xFrame' and 'yFrame'.
    Returns:
        X_left: numpy array with one (xFrame, yFrame) row per core body part of the
        left crop, ordered as in body_parts.core_body_parts.
        X_right: same as above, but for the right crop.
    """

    # index each crop's key-points by body part name
    left_keypoints = {item['keypointType']: (item['xFrame'], item['yFrame'])
                      for item in annotation['leftCrop']}
    right_keypoints = {item['keypointType']: (item['xFrame'], item['yFrame'])
                       for item in annotation['rightCrop']}

    # look up the core body parts in their canonical order (left, then right, per part)
    ordered_pairs = [(left_keypoints[body_part], right_keypoints[body_part])
                     for body_part in body_parts.core_body_parts]

    X_left = np.array([left_xy for left_xy, _ in ordered_pairs])
    X_right = np.array([right_xy for _, right_xy in ordered_pairs])
    return X_left, X_right


def get_ann_from_keypoint_arrs(X_left: np.ndarray, X_right: np.ndarray) -> Dict:
    """Rebuilds a key-point annotation from left and right key-point arrays (the inverse
    of get_left_right_keypoint_arrs).

    Row idx of each array is labelled with body_parts.core_body_parts[idx]. The
    resulting items carry only 'keypointType', 'xFrame' and 'yFrame'."""

    left_items, right_items = [], []
    for idx in range(X_left.shape[0]):
        x_left, y_left = X_left[idx, :]
        x_right, y_right = X_right[idx, :]
        body_part = body_parts.core_body_parts[idx]
        left_items.append({'keypointType': body_part, 'xFrame': x_left, 'yFrame': y_left})
        right_items.append({'keypointType': body_part, 'xFrame': x_right, 'yFrame': y_right})

    return {'leftCrop': left_items, 'rightCrop': right_items}


def normalize_left_right_keypoint_arrs(X_left: np.ndarray, X_right: np.ndarray) -> Tuple:
    """Normalizes input left and right key-point arrays. The normalization involves (1) 2D
    translation of all keypoints such that they are centered, (2) rotation of the 2D coordinates
    about the center such that the line passing through UPPER_LIP and fish center is horizontal.

    Args:
        X_left: left key-point array with (x, y) rows ordered as body_parts.core_body_parts.
        X_right: right key-point array with the same layout.
    Returns:
        X_left_rot: centered (and, if needed, mirrored) and rotated left key-points.
        X_right_rot: X_left_rot minus the pre-rotation left/right offsets.
    """

    # translate key-points, perform reflection if necessary
    upper_lip_idx = body_parts.core_body_parts.index(body_parts.UPPER_LIP)
    tail_notch_idx = body_parts.core_body_parts.index(body_parts.TAIL_NOTCH)
    if X_left[upper_lip_idx, 0] > X_left[tail_notch_idx, 0]:
        # upper lip is to the right of the tail notch: center both arrays on the midpoint
        # of the left array's per-axis min / max
        X_center = 0.5 * (np.max(X_left, axis=0) + np.min(X_left, axis=0))
        X_left_centered = X_left - X_center
        X_right_centered = X_right - X_center
    else:
        # otherwise: center on the right array's midpoint, swap the roles of the two
        # arrays and negate x so the result has the same orientation as the first branch
        X_center = 0.5 * (np.max(X_right, axis=0) + np.min(X_right, axis=0))
        X_left_centered = X_right - X_center
        X_right_centered = X_left - X_center
        X_left_centered[:, 0] = -X_left_centered[:, 0]
        X_right_centered[:, 0] = -X_right_centered[:, 0]

    # rotate key-points
    upper_lip_x, upper_lip_y = tuple(X_left_centered[upper_lip_idx])
    # NOTE(review): this divides by upper_lip_x; a centered upper lip with x == 0 would
    # divide by zero - confirm that cannot happen for real annotations.
    theta = np.arctan(upper_lip_y / upper_lip_x)
    R = np.array([
        [np.cos(theta), -np.sin(theta)],
        [np.sin(theta), np.cos(theta)]
    ])

    # D holds the per-key-point left-minus-right offsets before rotation; only the left
    # array is rotated and the right array is rebuilt from it using those offsets
    D = X_left_centered - X_right_centered
    X_left_rot = np.dot(X_left_centered, R)
    X_right_rot = X_left_rot - D
    return X_left_rot, X_right_rot


def convert_to_world_point_arr(X_left: np.ndarray, X_right: np.ndarray,
                               camera_metadata: CameraMetadata) -> np.ndarray:
    """Converts normalized left and right key-point arrays into a world coordinate array.

    Depth (the y world axis) is focal_length_pixel * baseline_m divided by the
    left-minus-right x disparity; x and z are the left key-point coordinates scaled by
    depth / focal_length_pixel, with z negated.

    Returns:
        Array with one (x_world, y_world, z_world) row per key-point.
    """

    focal_length_pixel = camera_metadata.focal_length_pixel
    disparity = X_left[:, 0] - X_right[:, 0]
    depth = focal_length_pixel * camera_metadata.baseline_m / disparity

    x_world = X_left[:, 0] * depth / focal_length_pixel
    z_world = -X_left[:, 1] * depth / focal_length_pixel
    return np.vstack([x_world, depth, z_world]).T


def stabilize_keypoints(X: np.ndarray) -> np.ndarray:
    """Re-expresses world coordinates relative to depth to stabilize the network inputs.

    For each row (x, y, z) with y the depth column, the output row is
    (0.5 * x / y, 0.5 * z / y, 0.05 / y). The output has the same shape as X.
    """
    depth = X[:, 1]
    stabilized = np.zeros(X.shape)
    stabilized[:, 0] = 0.5 * X[:, 0] / depth
    stabilized[:, 1] = 0.5 * X[:, 2] / depth
    stabilized[:, 2] = 0.05 / depth
    return stabilized


def convert_to_nn_input(annotation: Dict[str, List[Dict]], camera_metadata: CameraMetadata) \
        -> np.ndarray:
    """Converts a key-point annotation and camera metadata into the neural network input.

    Pipeline: annotation -> left/right key-point arrays -> normalized arrays -> world
    coordinates -> stabilized coordinates, flattened into a single row.
    """
    keypoints_left, keypoints_right = get_left_right_keypoint_arrs(annotation)
    norm_left, norm_right = normalize_left_right_keypoint_arrs(keypoints_left, keypoints_right)
    world_points = convert_to_world_point_arr(norm_left, norm_right, camera_metadata)
    return stabilize_keypoints(world_points).reshape(1, -1)

In [None]:
from collections import defaultdict
import json
import os
import random
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize
from scipy.interpolate import interpn
from weight_estimation.utils import get_left_right_keypoint_arrs, CameraMetadata
from weight_estimation.dataset import prepare_gtsf_data
from keras.layers import Input, Dense, Flatten
from keras.models import Model
import keras
from research.utils.data_access_utils import S3AccessUtils

def convert_to_nn_input(annotation: Dict[str, List[Dict]], camera_metadata: CameraMetadata) \
        -> np.ndarray:
    """Turns a key-point annotation plus camera metadata into a single-row network input.

    Steps: extract the left/right key-point arrays, normalize them, convert to world
    coordinates, stabilize, then flatten to shape (1, n_features).
    """
    left_arr, right_arr = get_left_right_keypoint_arrs(annotation)
    left_norm, right_norm = normalize_left_right_keypoint_arrs(left_arr, right_arr)
    world_arr = convert_to_world_point_arr(left_norm, right_norm, camera_metadata)
    stabilized = stabilize_keypoints(world_arr)
    return stabilized.reshape(1, -1)


def augment(df: pd.DataFrame, augmentation_config: Dict) -> pd.DataFrame:
    """Creates randomly rescaled and jittered copies of every row's key-points.

    Args:
        df: data-frame whose rows provide keypoints, fish_id, weight, k_factor and
            camera_metadata.
        augmentation_config: dict with keys 'trials' (copies per row),
            'max_jitter_std', 'min_scaling_factor' and 'max_scaling_factor'.
    Returns:
        Data-frame with columns annotation, fish_id, weight, kf and camera_metadata,
        holding `trials` augmented rows per input row. Weight, k-factor and camera
        metadata are copied unchanged from the source row.
    """
    n_trials = augmentation_config['trials']
    jitter_std_cap = augmentation_config['max_jitter_std']
    scale_low = augmentation_config['min_scaling_factor']
    scale_high = augmentation_config['max_scaling_factor']

    columns = defaultdict(list)
    for _, row in df.iterrows():
        for _trial in range(n_trials):
            # draw this trial's scale and jitter level (in this order)
            scale = np.random.uniform(scale_low, scale_high)
            jitter_std = np.random.uniform(0, jitter_std_cap)

            X_left, X_right = get_left_right_keypoint_arrs(row.keypoints)

            # rescale both crops by the same factor
            X_left, X_right = X_left * scale, X_right * scale

            # jitter is added to the x coordinates only, left crop first
            X_left[:, 0] += np.random.normal(0, jitter_std, X_left.shape[0])
            X_right[:, 0] += np.random.normal(0, jitter_std, X_right.shape[0])

            # reconstruct annotation and record the augmented sample
            columns['annotation'].append(get_ann_from_keypoint_arrs(X_left, X_right))
            columns['fish_id'].append(row.fish_id)
            columns['weight'].append(row.weight)
            columns['kf'].append(row.k_factor)
            columns['camera_metadata'].append(row.camera_metadata)

    return pd.DataFrame(columns)


def normalize(anns: List, camera_metadatas: List) -> np.ndarray:
    """Converts annotations and their raw camera metadata dicts into network inputs.

    Args:
        anns: key-point annotations.
        camera_metadatas: camera metadata dicts with camelCase keys, paired
            positionally with anns.
    Returns:
        Array stacking the convert_to_nn_input result of every (annotation, metadata)
        pair.
    """

    def _to_camera_metadata(raw: Dict) -> CameraMetadata:
        # map the camelCase metadata keys onto the CameraMetadata fields
        return CameraMetadata(
            focal_length=raw['focalLength'],
            focal_length_pixel=raw['focalLengthPixel'],
            baseline_m=raw['baseline'],
            pixel_count_width=raw['pixelCountWidth'],
            pixel_count_height=raw['pixelCountHeight'],
            image_sensor_width=raw['imageSensorWidth'],
            image_sensor_height=raw['imageSensorHeight']
        )

    return np.array([convert_to_nn_input(ann, _to_camera_metadata(raw_metadata))
                     for ann, raw_metadata in zip(anns, camera_metadatas)])


def get_data_split(X: np.ndarray, y: np.ndarray, fish_ids: np.ndarray, train_pct: float,
                   val_pct: float) -> Tuple:
    """Splits X / y into train, validation and test sets with no fish ID shared between sets.

    Whole fish (all rows with the same ID) are assigned to one set. The number of fish
    per set is drawn from a multinomial with probabilities
    (train_pct, val_pct, 1 - train_pct - val_pct), so set sizes vary around the
    requested fractions. Uses the global numpy random state; seed it for
    reproducibility.

    Args:
        X: feature array, first axis aligned with fish_ids.
        y: target array, aligned with fish_ids.
        fish_ids: fish ID of every row.
        train_pct: expected fraction of fish in the training set.
        val_pct: expected fraction of fish in the validation set.
    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test
    """
    # 1.0 - train_pct - val_pct can round to a tiny negative number (e.g. 0.9 and 0.1),
    # which multinomial rejects; clamp it to zero.
    test_pct = max(0.0, 1.0 - train_pct - val_pct)

    # np.unique returns the IDs sorted, so a seeded run gives the same split on every
    # interpreter start (set() ordering of string IDs varies between runs).
    unique_fish_ids = np.unique(fish_ids)
    train_cnt, val_cnt, test_cnt = np.random.multinomial(len(unique_fish_ids),
                                                         [train_pct, val_pct, test_pct])

    # 0 = train, 1 = validation, 2 = test; shuffled so assignment is random per fish
    assignments = np.repeat([0, 1, 2], [train_cnt, val_cnt, test_cnt])
    np.random.shuffle(assignments)

    train_mask = np.isin(fish_ids, unique_fish_ids[assignments == 0])
    val_mask = np.isin(fish_ids, unique_fish_ids[assignments == 1])
    test_mask = np.isin(fish_ids, unique_fish_ids[assignments == 2])

    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[val_mask], y[val_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    return X_train, y_train, X_val, y_val, X_test, y_test


def train_model(X_train, y_train, X_val, y_val, train_config):
    """Builds and fits the fully connected regression network.

    Architecture: 24 inputs -> Dense(256, relu) -> Dense(128, relu) -> Dense(64, relu)
    -> Dense(1). Trained with Adam on mean squared error, with early stopping on the
    validation loss.

    Args:
        X_train, y_train: training inputs and targets.
        X_val, y_val: validation inputs and targets used for early stopping.
        train_config: dict with keys 'epochs', 'batch_size', 'learning_rate' and
            'patience' (early-stopping patience in epochs).
    Returns:
        The fitted keras Model.
    """
    inputs = Input(shape=(24,))
    x = Dense(256, activation='relu')(inputs)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    pred = Dense(1)(x)
    model = Model(inputs, pred)

    epochs = train_config['epochs']
    batch_size = train_config['batch_size']
    lr = train_config['learning_rate']
    patience = train_config['patience']

    # stop once val_loss has not improved for `patience` consecutive epochs
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                               min_delta=0,
                                               patience=patience,
                                               verbose=0,
                                               mode='auto')]

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    # Report mean absolute error alongside the loss: classification accuracy (exact
    # match between prediction and label) carries no information for a continuous
    # regression target.
    model.compile(optimizer=optimizer,
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error'])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), callbacks=callbacks,
              batch_size=batch_size, epochs=epochs)

    return model


def density_scatter(x, y, bins=20, **kwargs):
    """Draws a scatter plot of y against x with points coloured by local point density.

    Density comes from a normalized 2D histogram of (x, y) interpolated at every point;
    points where the interpolation yields NaN get density 0.

    Args:
        x, y: numpy arrays of equal length.
        bins: bin specification forwarded to np.histogram2d.
        **kwargs: forwarded to Axes.scatter.
    Returns:
        The matplotlib Axes holding the plot. The x axis is labelled 'Prediction' and
        the y axis 'Ground Truth'.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    hist, x_edges, y_edges = np.histogram2d(x, y, bins=bins, density=True)
    x_centers = 0.5 * (x_edges[1:] + x_edges[:-1])
    y_centers = 0.5 * (y_edges[1:] + y_edges[:-1])
    points = np.vstack([x, y]).T
    z = interpn((x_centers, y_centers), hist, points,
                method="splinef2d", bounds_error=False)

    z[np.isnan(z)] = 0.0

    # plot in order of increasing density so the densest points end up on top
    order = z.argsort()
    x, y, z = x[order], y[order], z[order]

    ax.scatter(x, y, c=z, **kwargs)

    density_norm = Normalize(vmin=np.min(z), vmax=np.max(z))
    cbar = fig.colorbar(cm.ScalarMappable(norm=density_norm), ax=ax)
    cbar.ax.set_ylabel('Density')

    ax.set_xlabel('Prediction')
    ax.set_ylabel('Ground Truth')
    ax.grid()

    return ax


def _relative_error_stats(y_pred, y_true):
    """Returns mean absolute percentage error and mean percentage error of y_pred."""
    return {
        'mean_absolute_error_pct': 100 * np.mean(np.abs((y_pred - y_true) / y_true)),
        'mean_error_pct': 100 * np.mean(y_pred - y_true) / np.mean(y_true)
    }


def generate_accuracy_details(model, X_train, y_train, X_test, y_test):
    """Plots prediction vs. ground truth and computes error statistics for both sets.

    Args:
        model: fitted model exposing predict().
        X_train, y_train: training inputs and (scaled) targets.
        X_test, y_test: test inputs and (scaled) targets.
    Returns:
        ax_train, ax_test: density scatter plots for the train and test sets, drawn
        with values multiplied by 1e4.
        train_stats, test_stats: dicts with 'mean_absolute_error_pct' and
        'mean_error_pct'.
    """
    y_train_pred = model.predict(X_train).squeeze().astype(float)
    y_test_pred = model.predict(X_test).squeeze().astype(float)

    # density_scatter labels its x axis 'Prediction' and its y axis 'Ground Truth', so
    # predictions go first and ground truth second.
    ax_train = density_scatter(1e4 * y_train_pred, 1e4 * y_train)
    ax_test = density_scatter(1e4 * y_test_pred, 1e4 * y_test)

    train_stats = _relative_error_stats(y_train_pred, y_train)
    test_stats = _relative_error_stats(y_test_pred, y_test)

    return ax_train, ax_test, train_stats, test_stats


def train(augmented_df, train_config, weight):
    """Trains a weight or k-factor model on the augmented data and evaluates it.

    Args:
        augmented_df: data-frame with columns annotation, camera_metadata, fish_id and
            weight / kf.
        train_config: dict with 'train_pct' and 'val_pct' for the data split plus the
            keys read by train_model.
        weight: if truthy the target is 1e-4 * weight, otherwise (kf - 1.2) / 0.3.
    Returns:
        model, ax_train, ax_test, train_stats, test_stats - the fitted model followed
        by the outputs of generate_accuracy_details on the train and test sets.
    """
    # seed Python's and numpy's global generators before splitting the data
    random.seed(0)
    np.random.seed(0)
    anns = augmented_df.annotation.values.tolist()
    cms = augmented_df.camera_metadata.values.tolist()
    X = normalize(anns, cms)

    # rescale the regression target
    if weight:
        y = 1e-4 * augmented_df.weight.values
    else:
        y = (augmented_df.kf.values - 1.2) / 0.3
    fish_ids = augmented_df.fish_id.values
    X_train, y_train, X_val, y_val, X_test, y_test = get_data_split(X, y, fish_ids,
                                                                    train_config['train_pct'],
                                                                    train_config['val_pct'])
    model = train_model(X_train, y_train, X_val, y_val, train_config)
    ax_train, ax_test, train_stats, test_stats = \
        generate_accuracy_details(model, X_train, y_train, X_test, y_test)
    return model, ax_train, ax_test, train_stats, test_stats

def get_ann_from_keypoint_arrs(X_left: np.ndarray, X_right: np.ndarray) -> Dict:
    """Builds an annotation dict from left and right key-point arrays (i.e. the inverse of
    the get_left_right_keypoint_arrs method).

    Args:
        X_left: array whose row i holds (xFrame, yFrame) of the i-th core body part in the
        left crop.
        X_right: array laid out like X_left, for the right crop.
    Returns:
        ann: dict with keys 'leftCrop' and 'rightCrop'; each value is a list with one dict per
        row of X_left, carrying 'keypointType', 'xFrame' and 'yFrame'.
    """

    left_items, right_items = [], []
    for row_idx in range(X_left.shape[0]):
        left_x, left_y = tuple(X_left[row_idx, :])
        right_x, right_y = tuple(X_right[row_idx, :])
        # Row order follows body_parts.core_body_parts.
        keypoint_type = body_parts.core_body_parts[row_idx]
        left_items.append({'keypointType': keypoint_type, 'xFrame': left_x, 'yFrame': left_y})
        right_items.append({'keypointType': keypoint_type, 'xFrame': right_x, 'yFrame': right_y})

    return {'leftCrop': left_items, 'rightCrop': right_items}


In [None]:
# Write `model` to a `.h5` file under the author's home directory.
# NOTE(review): in the visible part of this notebook `model` is first assigned by the
# Experiment #1 training cell further down, so this cell appears to depend on out-of-order
# execution -- confirm. The absolute path is machine-specific and should be made configurable.
model.save('/Users/aloksaxena/Documents/general_model.h5')

In [None]:
# Distribution of the `weight` column over the whole of augmented_df.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(augmented_df.weight)
ax.set_xlabel('Weight')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

<h3> Experiment #1: Filter on AKPD > 0.9; Incorporate all data </h3>

In [None]:
# Hyper-parameters for this run; the dict is handed to train() unchanged.
train_config = {
    'train_pct': 0.8,
    'val_pct': 0.1,
    'epochs': 1000,
    'batch_size': 64,
    'learning_rate': 2e-5,
    'patience': 30,
}

# `key` selects the target: 'weight' trains on weight, anything else on k-factor.
key = 'weight'

# Train only on rows with akpd_score >= 0.9 and weight < 5000.
mask = (augmented_df.akpd_score >= 0.9) & (augmented_df.weight < 5000)
model, ax_train, ax_test, train_stats, test_stats = train(
    augmented_df[mask], train_config, weight=(key == 'weight')
)


In [None]:
# Score every row with akpd_score > 0.9 using `model`, on an independent copy of the data.
mask = augmented_df.akpd_score > 0.9
tdf = augmented_df.loc[mask].copy()  # DataFrame.copy() is deep by default

anns = tdf['annotation'].values.tolist()
cms = tdf['camera_metadata'].values.tolist()
X = normalize(anns, cms)

# The model output is multiplied by 1e4 (train() scales the weight target by 1e-4).
pred = model(X).numpy().squeeze() * 1e4
tdf['pred1'] = pred


In [None]:
# Relative error of pred1 against weight, over rows of tdf with weight >= 0.
mask2 = tdf.weight >= 0
eval_rows = tdf[mask2]
error_pcts = ((eval_rows.pred1 - eval_rows.weight) / eval_rows.weight).values

abs_error_pcts = np.abs(error_pcts)
mean_abs_pct_err = np.mean(abs_error_pcts)
mean_pct_err = np.mean(error_pcts)

for template, value in (('Mean absolute percentage error: {}', mean_abs_pct_err),
                        ('Mean percentage error: {}', mean_pct_err)):
    print(template.format(value))

In [None]:
# Relative error of pred1 against weight, restricted to rows of tdf with weight > 1500.
mask2 = tdf.weight > 1500
heavy_rows = tdf[mask2]
error_pcts = ((heavy_rows.pred1 - heavy_rows.weight) / heavy_rows.weight).values

# The absolute-error summary here is a MEDIAN (the signed error below is a mean).
# The variable name is kept so any cell that reads `mean_abs_pct_err` still works; the
# printed label now says what is actually computed.
mean_abs_pct_err = np.median(np.abs(error_pcts))
mean_pct_err = np.mean(error_pcts)
print('Median absolute percentage error: {}'.format(mean_abs_pct_err))
print('Mean percentage error: {}'.format(mean_pct_err))

In [None]:
# Signed mean relative error of pred1 per weight bucket (lo, hi]; edges run 0, 1000, ..., 9000.
weights = list(np.arange(0, 10000, 1000))
for lo, hi in zip(weights[:-1], weights[1:]):
    mask2 = (tdf.weight > lo) & (tdf.weight <= hi)
    bucket_rows = tdf[mask2]
    error_pcts = ((bucket_rows.pred1 - bucket_rows.weight) / bucket_rows.weight).values

    # Computed but not printed; note this is a median despite the variable name.
    mean_abs_pct_err = np.median(np.abs(error_pcts))
    mean_pct_err = np.mean(error_pcts)

    weight_bucket = '{}-{}'.format(lo, hi)
    print(weight_bucket, mean_pct_err)

In [None]:
# Display the test-split error statistics returned by the most recent train() call.
test_stats

In [None]:
# Build the data-warehouse client from the JSON credentials file named by the
# DATA_WAREHOUSE_SQL_CREDENTIALS environment variable. The `with` block closes the file
# handle as soon as the credentials have been parsed.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds = RDSAccessUtils(json.load(credentials_file))

In [None]:
# Pull up to 10000 rows of prod.biomass_computations for pen 193 with akpd_score > 0.9
# captured after 2020-10-01, and load them into `kdf`.
# NOTE(review): the query has LIMIT without ORDER BY, so which rows come back is presumably
# left to the database -- confirm whether a deterministic sample is needed.
query = """
select * from prod.biomass_computations
where akpd_score > 0.9
and pen_id = 193
and captured_at > '2020-10-01'
limit 10000;
"""

kdf = rds.extract_from_database(query)

In [None]:
# Re-estimate weight / length / k-factor for every row of kdf: rows whose stored
# estimated_weight_g is below 1000 are re-predicted with the local WeightEstimator, all other
# rows keep their stored estimates. Results accumulate in `weights`, `lengths` and `kfs`.
# NOTE(review): both model paths are absolute and machine-specific -- make them configurable.
weight_model_f = '/Users/aloksaxena/Documents/model.h5'
kf_model_f = '/Users/aloksaxena/Documents/repos/production_algo/weight_estimation/src/weight_estimation/kf_model.h5'

weight_estimator = WeightEstimator(weight_model_f, kf_model_f)
weights, lengths, kfs = [], [], []
n_rows = kdf.shape[0]  # progress is measured against the frame actually being iterated
count = 0

for idx, row in kdf.iterrows():
    if count % 100 == 0:
        print('Percentage completion: {}%'.format(round(100 * count / n_rows, 2)))
        print(count)
    count += 1
    annotation = row.annotation
    camera_metadata = row.camera_metadata

    # NOTE(review): the CameraMetadata namedtuple defined near the top of this notebook also
    # declares a `focal_length` field; if that is the definition in scope here, this call must
    # supply it as well -- confirm which CameraMetadata is in use.
    camera_metadata_obj = CameraMetadata(
        focal_length_pixel=camera_metadata['focalLengthPixel'],
        baseline_m=camera_metadata['baseline'],
        pixel_count_width=camera_metadata['pixelCountWidth'],
        pixel_count_height=camera_metadata['pixelCountHeight'],
        image_sensor_width=camera_metadata['imageSensorWidth'],
        image_sensor_height=camera_metadata['imageSensorHeight']
    )

    if row.estimated_weight_g < 1000:
        weight, length, kf = weight_estimator.predict(annotation, camera_metadata_obj)
        weights.append(weight)
        lengths.append(length)
        kfs.append(kf)
    else:
        weights.append(row.estimated_weight_g)
        lengths.append(row.estimated_length_mm)
        kfs.append(row.estimated_k_factor)

In [None]:
# Distribution of the `weights` list (50 bins).
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(weights, bins=50)
ax.set_xlabel('Weight')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

In [None]:
# Distribution of the stored estimated_weight_g values in kdf (50 bins).
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(kdf.estimated_weight_g, bins=50)
ax.set_xlabel('Weight')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

In [None]:
# Display the mean of the stored estimated_weight_g values in kdf.
kdf.estimated_weight_g.mean()

In [None]:
# Histogram of `error_pcts` (100 bins), with the x-axis limited to [-0.2, 0.2].
# NOTE(review): `error_pcts` is whatever the most recently executed error cell left behind;
# this cell does not compute it itself -- confirm which subset is meant to be plotted.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(error_pcts, bins=100)
ax.set_xlim([-0.2, 0.2])
ax.grid()
plt.show()

In [None]:
# Relative error of the `pred` column against weight, for rows with weight >= 1500.
mask = augmented_df.weight >= 1500
selected = augmented_df[mask]
error_pcts = ((selected.pred - selected.weight) / selected.weight).values

mean_abs_pct_err = np.mean(np.abs(error_pcts))
mean_pct_err = np.mean(error_pcts)

for template, value in (('Mean absolute percentage error: {}', mean_abs_pct_err),
                        ('Mean percentage error: {}', mean_pct_err)):
    print(template.format(value))

In [None]:
# Relative error of the `pred2` column against weight, for rows with weight < 1500.
mask = augmented_df.weight < 1500
selected = augmented_df[mask]
error_pcts = ((selected.pred2 - selected.weight) / selected.weight).values

mean_abs_pct_err = np.mean(np.abs(error_pcts))
mean_pct_err = np.mean(error_pcts)

for template, value in (('Mean absolute percentage error: {}', mean_abs_pct_err),
                        ('Mean percentage error: {}', mean_pct_err)):
    print(template.format(value))

In [None]:
# Relative error of the `pred2` column against weight, for rows with weight >= 1500.
mask = augmented_df.weight >= 1500
selected = augmented_df[mask]
error_pcts = ((selected.pred2 - selected.weight) / selected.weight).values

mean_abs_pct_err = np.mean(np.abs(error_pcts))
mean_pct_err = np.mean(error_pcts)

for template, value in (('Mean absolute percentage error: {}', mean_abs_pct_err),
                        ('Mean percentage error: {}', mean_pct_err)):
    print(template.format(value))

In [None]:
# Hyper-parameters for the small-fish model; the dict is handed to train() unchanged.
train_config = {
    'train_pct': 0.8,
    'val_pct': 0.1,
    'epochs': 500,
    'batch_size': 64,
    'learning_rate': 1e-4,
    'patience': 10,
}

# `key` selects the target: 'weight' trains on weight, anything else on k-factor.
key = 'weight'

# Train a separate model on rows with weight < 1500 only.
small_fish_df = augmented_df[augmented_df.weight < 1500]
model_small, ax_train, ax_test, train_stats, test_stats = train(
    small_fish_df, train_config, weight=(key == 'weight')
)

# Disabled artifact export, kept for reference.
# NOTE(review): the first line would save `model`, not `model_small` -- confirm before enabling.
# model.save(f'{key}_model.h5')
# ax_train.figure.savefig(f'{key}_train_plot.png')
# ax_test.figure.savefig(f'{key}_test_plot.png')
# json.dump(train_stats, open(f'{key}_train_stats.json', 'w'))
# json.dump(test_stats, open(f'{key}_test_stats.json', 'w'))

In [None]:
# Predict with the small-fish model for EVERY row of augmented_df.
# The features are rebuilt here rather than taken from the global `X`: the last visible
# assignment of `X` is built from the akpd-filtered copy `tdf`, which would give predictions
# that do not line up row-for-row with augmented_df, where pred3 is stored afterwards.
X_all = normalize(augmented_df.annotation.values.tolist(),
                  augmented_df.camera_metadata.values.tolist())
# The model output is multiplied by 1e4 (train() scales the weight target by 1e-4).
pred3 = 1e4 * model_small(X_all).numpy().squeeze()

In [None]:
# Store the small-fish model's predictions as a new `pred3` column (mutates augmented_df).
# NOTE(review): this assumes pred3 has one entry per row of augmented_df, in the same row
# order -- confirm against the features pred3 was computed from.
augmented_df['pred3'] = pred3

In [None]:
# Relative error of the `pred3` column against weight, for rows with weight <= 1500.
mask = augmented_df.weight <= 1500
selected = augmented_df[mask]
error_pcts = ((selected.pred3 - selected.weight) / selected.weight).values

mean_abs_pct_err = np.mean(np.abs(error_pcts))
mean_pct_err = np.mean(error_pcts)

for template, value in (('Mean absolute percentage error: {}', mean_abs_pct_err),
                        ('Mean percentage error: {}', mean_pct_err)):
    print(template.format(value))