In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import cv2

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from scipy.stats import norm
import tqdm
import pickle
from itertools import combinations
from aquabyte.optics import convert_to_world_point, depth_from_disp, pixel2world, euclidean_distance
from aquabyte.data_access_utils import DataAccessUtils

from PIL import Image, ImageDraw
from multiprocessing import Pool, Manager
import copy
import uuid
from sklearn.preprocessing import StandardScaler

# Allow up to 500 rows to be rendered when a DataFrame is displayed.
pd.options.display.max_rows = 500



In [None]:
# NOTE(review): the S3 client is disabled here, but `load_image` further down
# references `s3_client` and raises NameError unless these lines are re-enabled.
# aws_credentials = json.load(open(os.environ["AWS_CREDENTIALS"]))
# s3_client = boto3.client('s3', aws_access_key_id=aws_credentials["aws_access_key_id"],
#                          aws_secret_access_key=aws_credentials["aws_secret_access_key"],
#                          region_name="eu-west-1")


# Database credentials come from the JSON file named by $SQL_CREDENTIALS.
# The context manager closes the file handle as soon as it has been parsed.
with open(os.environ["SQL_CREDENTIALS"]) as credentials_file:
    sql_credentials = json.load(credentials_file)

sql_engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(sql_credentials["user"], sql_credentials["password"],
                           sql_credentials["host"], sql_credentials["port"],
                           sql_credentials["database"]))

Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing schema and expose the tables this notebook uses as
# mapped classes.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs

# NOTE(review): `plot_coordinates` further down calls `data_access_utils`;
# it raises NameError unless the line below is re-enabled.
# data_access_utils = DataAccessUtils()


In [None]:
def coord2biomass_linear(world_keypoints, model):
    """Predict biomass from 3D keypoints with the PCA + linear-regression model.

    Features are every pairwise keypoint distance (body parts taken in the
    order given by ``model['body_parts']``) followed by every product of two
    such distances, including each distance with itself. The feature vector
    is standardised, projected onto the PCA components and fed to the
    regression.

    Parameters
    ----------
    world_keypoints : mapping
        Body-part name -> coordinate, in whatever form ``euclidean_distance``
        accepts.
    model : mapping
        Provides ``mean``, ``std``, ``PCA_components``, ``reg_coef``,
        ``reg_intercept`` and ``body_parts``.

    Returns
    -------
    The regression output ``projected_features . reg_coef + reg_intercept``.
    """
    feature_mean = model['mean']
    feature_std = model['std']
    components = model['PCA_components']
    coef = model['reg_coef']
    intercept = model['reg_intercept']
    parts = model['body_parts']

    # First-order features: distance between every unordered pair of body
    # parts. The ordering must match the one the model was fitted with.
    n_parts = len(parts)
    distances = [
        euclidean_distance(world_keypoints[parts[a]], world_keypoints[parts[b]])
        for a in range(n_parts - 1)
        for b in range(a + 1, n_parts)
    ]

    # Second-order features: product of every pair of distances (i <= j).
    products = [
        first * second
        for position, first in enumerate(distances)
        for second in distances[position:]
    ]

    features = np.array(distances + products)
    standardized = (features - feature_mean) / feature_std
    projected = np.dot(standardized, components.T)
    return np.dot(projected, coef) + intercept

def coord2biomass_blender(world_keypoints, blender):
    """Predict biomass by nearest-neighbour lookup in the blender table.

    The pairwise keypoint distances of the fish are compared with every row of
    ``blender['distances']``; the volume of the row with the smallest mean
    absolute difference (NaNs ignored) is mapped to a weight with a linear fit.

    Parameters
    ----------
    world_keypoints : mapping
        Body-part name -> coordinate, in whatever form ``euclidean_distance``
        accepts.
    blender : mapping
        Provides ``reverse_mapping`` (str(index) -> body-part name),
        ``distances`` (one row of pairwise distances per reference entry),
        ``volume`` (one value per reference entry) and ``coeff``
        (``[slope, intercept]``).

    Returns
    -------
    ``volume[closest] * coeff[0] + coeff[1]``.
    """
    index_to_part = blender["reverse_mapping"]
    reference_distances = np.array(blender["distances"])
    reference_volumes = blender["volume"]
    coeff = blender["coeff"]

    # The reverse mapping fixes the keypoint order so the pairwise distances
    # line up with the columns of the reference table.
    n_parts = len(world_keypoints)
    points = [world_keypoints[index_to_part[str(i)]] for i in range(n_parts)]

    # Scaled by 1000 -- presumably metres -> millimetres to match the
    # reference table (TODO confirm units).
    observed = np.array([
        euclidean_distance(points[a], points[b]) * 1000
        for a in range(n_parts)
        for b in range(a + 1, n_parts)
    ])

    # Mean absolute difference to every reference row, ignoring NaN entries.
    mean_abs_diff = np.nanmean(np.abs(reference_distances - observed), axis=1)
    best = np.argmin(mean_abs_diff)

    # Linear correction from matched volume to predicted weight.
    return reference_volumes[best] * coeff[0] + coeff[1]

In [None]:
# Load the fitted linear model (pickle) and the blender lookup table (JSON).
# SECURITY: unpickling executes arbitrary code contained in the file -- only
# load a model.pkl that comes from a trusted source.
with open('./model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('./volumes.json') as blender_file:
    blender = json.load(blender_file)

In [None]:
# Build the evaluation dataset: one record per usable stereo frame pair with
# the ground-truth measurements, every pairwise keypoint distance and the
# predictions of both biomass models.
session.rollback()  # reset the session in case an earlier query left it in a failed state
sfps_all = session.query(StereoFramePair).all()

# Largest accepted left/right difference of the value at index 1 of a keypoint
# (presumably the vertical pixel coordinate of rectified images -- TODO confirm).
threshold = 10

# Records are collected in a list and converted once at the end:
# DataFrame.append copied the whole frame on every call (quadratic) and has
# been removed in pandas 2.0.
records = []

for row in sfps_all:
    ground_truth_metadata = json.loads(row.ground_truth_metadata)

    # skip bad cases
    if row.gtsf_fish_identifier == '190321010002':
        print('Skipping fish ID {}'.format(row.gtsf_fish_identifier))
        continue
    ground_truth = ground_truth_metadata['data']
    if ground_truth.get('species') != 'salmon':
        print('Skipping non-samlon fish: {}'.format(row.gtsf_fish_identifier))
        continue

    # biomass / length predictions from the 3D keypoints
    world_keypoint_coordinates = json.loads(row.world_keypoint_coordinates)
    predicted_weight_linear = coord2biomass_linear(world_keypoint_coordinates, model)
    predicted_weight_blender = coord2biomass_blender(world_keypoint_coordinates, blender)
    predicted_length = euclidean_distance(world_keypoint_coordinates['UPPER_LIP'],
                                          world_keypoint_coordinates['TAIL_NOTCH'])

    # One '<A><-><B>' column per unordered pair of body parts.
    # `body_parts` stays a notebook-level name because later cells read it.
    body_parts = sorted(world_keypoint_coordinates.keys())
    df_row = {
        '{}<->{}'.format(bp1, bp2): euclidean_distance(world_keypoint_coordinates[bp1],
                                                       world_keypoint_coordinates[bp2])
        for bp1, bp2 in combinations(body_parts, 2)
    }

    # Parse the 2D keypoints once; the parsed values are stored in the record
    # unchanged, while the validity check may use axis-swapped copies.
    left_image_keypoint_coordinates = json.loads(row.left_image_keypoint_coordinates)
    right_image_keypoint_coordinates = json.loads(row.right_image_keypoint_coordinates)
    left_check, right_check = left_image_keypoint_coordinates, right_image_keypoint_coordinates
    if row.annotations_project_name == 'Automated keypoints detection':
        # this project stores the two coordinates in the opposite order
        left_check = {bp: (x[1], x[0]) for bp, x in left_image_keypoint_coordinates.items()}
        right_check = {bp: (x[1], x[0]) for bp, x in right_image_keypoint_coordinates.items()}

    # Reject the pair if any keypoint differs too much between the two images.
    keypoints_valid = not any(
        abs(left_check[bp][1] - right_check[bp][1]) > threshold
        for bp in body_parts
    )
    if not keypoints_valid:
        continue

    weight = ground_truth['weight']
    length = ground_truth['length']

    # append to dataset
    df_row.update({
        'weight': weight,
        'length': length,
        'width': ground_truth['width'],
        'breadth': ground_truth['breath'],  # the metadata key read here is spelled 'breath'
        'world_keypoint_coordinates': world_keypoint_coordinates,
        'left_image_keypoint_coordinates': left_image_keypoint_coordinates,
        'right_image_keypoint_coordinates': right_image_keypoint_coordinates,
        'kfactor': 1e5 * weight / length**3,
        'date': row.date,
        'left_image_s3_key': row.left_image_s3_key,
        'right_image_s3_key': row.right_image_s3_key,
        'image_s3_bucket': row.image_s3_bucket,
        'predicted_weight_linear': predicted_weight_linear,
        'predicted_weight_blender': predicted_weight_blender,
        'predicted_length': predicted_length,
        # signed relative errors (fractions of the ground-truth weight)
        'error_pct_linear': (predicted_weight_linear - weight) / weight,
        'error_pct_blender': (predicted_weight_blender - weight) / weight,
        'project_name': row.annotations_project_name,
        'gtsf_fish_identifier': row.gtsf_fish_identifier
    })
    records.append(df_row)

df = pd.DataFrame(records)

# Keep the unfiltered frame around; `df` itself only retains complete rows.
df_cache = df.copy()
df = df.dropna()


<h1> Compute biomass estimates for all GTSF data with production model </h1>

In [None]:
# Predicted vs. ground-truth weight for the linear model. Fish with at least
# one pairwise keypoint distance above MAX_PLAUSIBLE_DISTANCE are drawn in red.
MAX_PLAUSIBLE_DISTANCE = 0.95  # same units as the world keypoints (presumably metres -- TODO confirm)
WEIGHT_AXIS_LIMIT = 8000

# Pairwise-distance columns are recognised by the '<->' separator used when
# the dataset was built, so this cell does not rely on loop variables leaking
# from an earlier cell.
features = [column for column in df.columns if '<->' in column]

mask = (df[features] > MAX_PLAUSIBLE_DISTANCE).any(axis=1)

fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(df.loc[mask, 'weight'], df.loc[mask, 'predicted_weight_linear'], color='r',
           label='Some keypoint distance > {}'.format(MAX_PLAUSIBLE_DISTANCE))
ax.scatter(df.loc[~mask, 'weight'], df.loc[~mask, 'predicted_weight_linear'], color='b',
           label='All keypoint distances <= {}'.format(MAX_PLAUSIBLE_DISTANCE))

# Identity line across the whole visible range (perfect predictions lie on it).
ax.plot([0, WEIGHT_AXIS_LIMIT], [0, WEIGHT_AXIS_LIMIT], label='Prediction = ground truth')

ax.set_xlabel('Ground Truth Weight')
ax.set_ylabel('Prediction')
ax.set_xlim([0, WEIGHT_AXIS_LIMIT])
ax.set_ylim([0, WEIGHT_AXIS_LIMIT])
ax.set_title('Linear model: predicted vs. ground-truth weight')
ax.legend()
plt.show()

In [None]:
# Show a random sample of rectified GTSF images as a visual sanity check.
RECTIFIED_IMAGE_GLOB = '/root/data/gtsf_phase_I/2019-05-10/*/rectified/*'
N_PREVIEW_IMAGES = 20

rectified_paths = glob.glob(RECTIFIED_IMAGE_GLOB)
if not rectified_paths:
    # np.random.choice raises on an empty sequence, so report and move on.
    print('No images found for pattern {}'.format(RECTIFIED_IMAGE_GLOB))
else:
    # Sample without replacement so the same image is never shown twice.
    n_preview = min(N_PREVIEW_IMAGES, len(rectified_paths))
    for path in np.random.choice(rectified_paths, n_preview, replace=False):
        image_bgr = cv2.imread(path)
        if image_bgr is None:
            # cv2.imread signals an unreadable file by returning None
            print('Could not read image {}'.format(path))
            continue
        # OpenCV loads images as BGR while matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
        plt.show()

<h1> Error Analysis </h1>

In [None]:
def load_image(image_key, bucket):
    """Return the image stored under ``image_key``, using a local file cache.

    The local path is ``/root/data/gtsf_phase_I/`` followed by the key with
    the ``phase_I/small-pen-test-site/1/`` prefix removed. If that file does
    not exist yet it is downloaded from ``bucket`` first.

    Parameters
    ----------
    image_key : str
        S3 object key of the image.
    bucket : str
        S3 bucket that holds the image.

    Returns
    -------
    The image as returned by ``plt.imread``.

    Notes
    -----
    Relies on a notebook-level ``s3_client``; a NameError is raised on a cache
    miss if that client has not been created.
    """
    truncated_key = image_key.replace('phase_I/small-pen-test-site/1/', '')
    f = '/root/data/gtsf_phase_I/{}'.format(truncated_key)
    if not os.path.exists(f):
        # download_file does not create missing parent directories
        os.makedirs(os.path.dirname(f), exist_ok=True)
        print('Downloading s3://{}/{} to {}'.format(bucket, image_key, f))
        s3_client.download_file(bucket, image_key, f)

    return plt.imread(f)

def plot_coordinates(image_key, bucket, side, keypoints, ax):
    """Draw an image crop around the keypoints, with the keypoints overlaid.

    The image is fetched through the notebook-level ``data_access_utils`` and
    cropped to the bounding box of the keypoints plus 100 pixels of padding.
    The crop is clamped at the top/left image border so that keypoints close
    to the border still give a valid (non-wrapping) slice.

    Parameters
    ----------
    image_key : str
        S3 object key of the image.
    bucket : str
        S3 bucket that holds the image.
    side : str
        Which camera the image belongs to; currently unused, kept for callers.
    keypoints : mapping
        Body-part name -> coordinate, read as ``(row, column)``: index 0 is
        used as y and index 1 as x.
    ax : matplotlib axes
        Axes to draw on.
    """
    im = plt.imread(data_access_utils.download_from_s3(bucket, image_key))

    if not keypoints:
        # nothing to crop around -- show the full image
        ax.imshow(im)
        return

    padding = 100
    x_values = [coord[1] for coord in keypoints.values()]
    y_values = [coord[0] for coord in keypoints.values()]

    # Slice bounds must be integers, and a negative start would be interpreted
    # relative to the end of the array, so the lower bounds are clamped at 0.
    # Upper bounds beyond the image are truncated by the slice itself.
    x_min = max(int(min(x_values)) - padding, 0)
    y_min = max(int(min(y_values)) - padding, 0)
    x_max = int(max(x_values)) + padding
    y_max = int(max(y_values)) + padding

    for body_part, coordinates in keypoints.items():
        # shift into the coordinate system of the crop
        x, y = coordinates[1] - x_min, coordinates[0] - y_min
        ax.scatter([x], [y], c='red')
        ax.annotate(body_part, (x, y), color='red')

    ax.imshow(im[y_min:y_max, x_min:x_max])
    
def plot_gtsf(analysis_df, idx):
    """Show the left and right image of one dataset row with keypoints overlaid.

    Parameters
    ----------
    analysis_df : pandas.DataFrame
        Frame with the columns ``image_s3_bucket``, ``left_image_s3_key``,
        ``right_image_s3_key``, ``left_image_keypoint_coordinates`` and
        ``right_image_keypoint_coordinates``.
    idx
        Index label of the row to display; the first matching row is used.
    """
    record = analysis_df[analysis_df.index == idx].iloc[0]
    bucket = record['image_s3_bucket']

    # (side, image key, keypoints) for each of the two panels, left to right
    panels = (
        ('left', record['left_image_s3_key'], record['left_image_keypoint_coordinates']),
        ('right', record['right_image_s3_key'], record['right_image_keypoint_coordinates']),
    )

    _, axes = plt.subplots(1, 2, figsize=(20, 10))
    for axis, (side, s3_key, keypoints) in zip(axes, panels):
        plot_coordinates(s3_key, bucket, side, keypoints, axis)
    plt.show()

In [None]:
# Magnitude of the linear model's signed relative error, used below to rank
# fish from worst to best prediction.
df['abs_error_pct_linear'] = df['error_pct_linear'].abs()

In [None]:
# Inspect the automatically annotated fish, largest absolute linear-model
# error first.
mask = df['project_name'] == 'Automated keypoints detection'
automated_worst_first = df[mask].sort_values('abs_error_pct_linear', ascending=False)
for idx, row in automated_worst_first.iterrows():
    print(row.gtsf_fish_identifier)
    plot_gtsf(df, idx)

In [None]:
# (rows, columns) of the subset selected by `mask`
df.loc[mask].shape

In [None]:
# Walk through every fish from the largest to the smallest signed linear-model
# error. The dataset has no plain `error_pct` column and no `analysis_df`
# frame, so the linear-model error column of `df` is used.
# Note: the printed value is a fraction of the ground-truth weight, not a
# percentage.
for idx, row in df.sort_values('error_pct_linear', ascending=False).iterrows():
    plot_gtsf(df, idx)
    print('Error percentage: {}'.format(row.error_pct_linear))

