In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from wpca import WPCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer, _normalize_world_keypoints
import random
import pickle
from scipy.stats import norm
from copy import copy

import matplotlib.cm as cm
# Let pandas render up to 500 rows before truncating displayed frames.
pd.set_option('display.max_rows', 500)

<h1> Extract base data from database </h1>

In [None]:
# The environment variable PROD_RESEARCH_SQL_CREDENTIALS holds the path to a JSON
# credentials file. Read it inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# Join fish metadata to keypoint annotations on the left image URL and keep only
# annotated, non-QA rows. Because the WHERE clause filters on columns of `b`,
# rows of `a` without a matching annotation are dropped despite the LEFT JOIN.
query = """
    select * from research.fish_metadata a left join keypoint_annotations b
    on a.left_url = b.left_image_url 
    where b.keypoints is not null and b.is_qa = false;
"""
df = rds_access_utils.extract_from_database(query)

<h1> Append world keypoints to the data </h1>
<h3> Ideally, this data should already live directly in the database </h3>

In [None]:
def get_world_keypoints(row):
    """Return the world keypoints for one annotated row, or None.

    Args:
        row: object exposing ``keypoints`` (a container that may hold the
            'leftCrop' and 'rightCrop' entries) and ``camera_metadata``.

    Returns:
        The result of ``pixel2world`` applied to the left crop, right crop and
        camera metadata, or ``None`` when either crop is absent.
    """
    annotation = row.keypoints
    has_both_crops = all(side in annotation for side in ('leftCrop', 'rightCrop'))
    if not has_both_crops:
        # Without both crops there is nothing to pass to pixel2world.
        return None
    return pixel2world(annotation['leftCrop'], annotation['rightCrop'], row.camera_metadata)
    
# Compute world keypoints row by row; rows missing either crop get None.
df['world_keypoints'] = df.apply(get_world_keypoints, axis=1)

<h1> Get the features dataframe from the base data with all pairwise distances </h1>

In [None]:
def coord2biomass(world_keypoints, model):
    """Predict biomass from a set of world-space keypoints.

    Steps, in order:
      1. distances between every unordered pair of body parts, following the
         ordering of ``model['body_parts']``;
      2. all degree-2 and degree-3 products of those distances (repeats allowed);
      3. standardization with ``model['mean']`` / ``model['std']``;
      4. projection onto ``model['PCA_components']``;
      5. linear regression with ``model['reg_coef']`` and ``model['reg_intercept']``.

    Args:
        world_keypoints: mapping from body-part name to a point accepted by
            ``euclidean_distance``.
        model: dict with the keys 'mean', 'std', 'PCA_components', 'reg_coef',
            'reg_intercept' and 'body_parts'. The feature order produced here
            (distances, then quadratic terms, then cubic terms) is presumably
            the order the model was fitted with -- verify against the training code.

    Returns:
        The regression output ``np.dot(projected, reg_coef) + reg_intercept``.
    """
    feature_mean = model['mean']
    feature_std = model['std']
    components = model['PCA_components']
    coef = model['reg_coef']
    intercept = model['reg_intercept']
    part_names = model['body_parts']

    # Pairwise distances for (i, j) with i < j, in body_parts order.
    distances = [
        euclidean_distance(world_keypoints[part_a], world_keypoints[part_b])
        for part_a, part_b in combinations(part_names, 2)
    ]

    # Degree-2 interaction terms: d_i * d_j for i <= j.
    quadratic_terms = [
        d_i * d_j
        for i, d_i in enumerate(distances)
        for d_j in distances[i:]
    ]

    # Degree-3 interaction terms: d_i * d_j * d_k for i <= j <= k.
    cubic_terms = [
        d_i * d_j * d_k
        for i, d_i in enumerate(distances)
        for j, d_j in enumerate(distances[i:], start=i)
        for d_k in distances[j:]
    ]

    features = np.array(distances + quadratic_terms + cubic_terms)
    standardized = (features - feature_mean) / feature_std
    projected = np.dot(standardized, components.T)
    return np.dot(projected, coef) + intercept

In [None]:
# Load the fitted biomass models, keyed by a short name.
# NOTE: pickle.load executes arbitrary code embedded in the file -- only point
# these paths at trusted artifacts.
model_paths = {
    '163_eigs': '/root/data/alok/biomass_estimation/playground/model_163_eigs.pkl',
    '20_eigs': '/root/data/alok/biomass_estimation/playground/model_20_eigs.pkl',
}
models = {}
for model_name, model_path in model_paths.items():
    # `with` closes each file handle as soon as the model has been read.
    with open(model_path, 'rb') as model_file:
        models[model_name] = pickle.load(model_file)

In [None]:
# Sensitivity analysis: perturb the annotated pixel keypoints with Gaussian noise
# of increasing standard deviation and record each model's weight estimate.
# Both RNGs are seeded so the run is repeatable.
np.random.seed(1)
random.seed(1)
analysis_dict = defaultdict(list)
trials = 20
n_samples = 1000
row_count = 0
for idx, row in df.sample(n_samples).iterrows():

    gt_weight = row.weight
    # random keypoint jitters
    keypoints = row.keypoints
    for jitter in [0, 5, 10, 20, 50]:
        # With zero jitter every trial is identical, so one trial is enough.
        T = trials if jitter > 0 else 1
        for t in range(T):
            for model_name, model in models.items():
                # Start from empty lists for every (trial, model) evaluation.
                # Creating this dict once per jitter level made the lists
                # accumulate the jittered points of all earlier trials and
                # models before being handed to pixel2world.
                jittered_keypoints = {'leftCrop': [], 'rightCrop': []}
                for key in ['leftCrop', 'rightCrop']:
                    for item in keypoints[key]:
                        # Shallow copy so the original annotation is untouched
                        # (assumes each item is a flat dict -- TODO confirm).
                        jittered_item = copy(item)
                        # Only the xFrame coordinate is perturbed.
                        j = np.random.normal(0, jitter)
                        jittered_item['xFrame'] += j
                        jittered_keypoints[key].append(jittered_item)

                jittered_world_keypoints = pixel2world(jittered_keypoints['leftCrop'],
                                                       jittered_keypoints['rightCrop'],
                                                       row.camera_metadata)
                estimated_weight = coord2biomass(jittered_world_keypoints, model)
                analysis_dict['jitter'].append(jitter)
                analysis_dict['estimated_weight'].append(estimated_weight)
                analysis_dict['gt_weight'].append(gt_weight)
                analysis_dict['trial'].append(t)
                analysis_dict['model_name'].append(model_name)

    if row_count % 10 == 0:
        print('Row count: {}'.format(row_count))
        # Report an actual percentage (0-100) rather than a 0-1 fraction.
        print('Percentage complete: {}'.format(100.0 * row_count / n_samples))
        print('-'*20)
    row_count += 1



In [None]:
# One row per (sampled fish, jitter level, trial, model) evaluation, with columns
# jitter, estimated_weight, gt_weight, trial and model_name.
analysis_df = pd.DataFrame(analysis_dict)

In [None]:
# Report the mean estimated weight for every (jitter, model) combination,
# ignoring estimates outside the open interval (-100000, 100000).
estimates = analysis_df.estimated_weight
within_bounds = (estimates > -100000) & (estimates < 100000)
for jitter in (0, 5, 10, 20, 50):
    at_jitter = analysis_df.jitter == jitter
    for model_name in models:
        for_model = analysis_df.model_name == model_name
        selected = for_model & at_jitter & within_bounds
        mean_weight = analysis_df.loc[selected, 'estimated_weight'].mean()
        print('Mean weight for model = {}, jitter = {}: {}'.format(model_name, jitter, mean_weight))

# Ground-truth reference, taken from the zero-jitter rows.
zero_jitter_rows = analysis_df[analysis_df.jitter == 0]
print('Mean GT weight: {}'.format(zero_jitter_rows.gt_weight.mean()))

<h1> Visualize Accuracy Plots </h1>

In [None]:
# One scatter plot of estimated vs. ground-truth weight per (jitter, model),
# using only the first trial of each jitter level.
for jitter in [0, 5, 10, 20, 50]:
    for model_name in models:
        model_mask = analysis_df.model_name == model_name
        jitter_mask = analysis_df.jitter == jitter
        trial_mask = analysis_df.trial == 0
        # Take x and y from the SAME rows so every point pairs an estimate with
        # its own ground truth. Pulling x from the zero-jitter rows only lined up
        # through implicit row ordering and equal row counts.
        plotted = analysis_df[model_mask & jitter_mask & trial_mask]
        plt.figure(figsize=(10, 5))
        plt.scatter(plotted.gt_weight, plotted.estimated_weight)
        # Identity line: points on it are perfect predictions.
        plt.plot([0, 10000], [0, 10000], color='red')
        plt.xlim([0, 10000])
        plt.ylim([0, 10000])
        plt.xlabel('Ground truth weight')
        plt.ylabel('Estimated weight')
        plt.title('Model: {}, Jitter: {}'.format(model_name, jitter))
        plt.show()
        