# GTSF phase I: biomass prediction

In this notebook, we are forecasting the weights by finding the closest blender model

### Look at the volumes created with blender

Load blender data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import cv2

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from scipy.stats import norm
import tqdm
import pickle
from itertools import combinations
from aquabyte.data_access_utils import DataAccessUtils
from aquabyte.optics import euclidean_distance

from PIL import Image, ImageDraw
from multiprocessing import Pool, Manager
import copy
import uuid
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)





In [None]:
# Project helper used later to download images from S3 (see visualize_stereo_frame_pair).
# '/root/data/' is its constructor argument — presumably the local data root; TODO confirm.
data_access_utils = DataAccessUtils('/root/data/')

<h1> Get world keypoint coordinates from GTSF data </h1>

In [None]:
# AWS credentials are read from the JSON file named by the AWS_CREDENTIALS env var.
# The file is opened in a context manager so the handle is closed deterministically.
with open(os.environ["AWS_CREDENTIALS"]) as credentials_file:
    aws_credentials = json.load(credentials_file)
s3_client = boto3.client('s3', aws_access_key_id=aws_credentials["aws_access_key_id"],
                         aws_secret_access_key=aws_credentials["aws_secret_access_key"],
                         region_name="eu-west-1")


# SQL credentials are read the same way from the file named by SQL_CREDENTIALS.
with open(os.environ["SQL_CREDENTIALS"]) as credentials_file:
    sql_credentials = json.load(credentials_file)
# NOTE(review): the values are interpolated verbatim into the URL; a password containing
# URL-reserved characters (e.g. '@', '/') would need escaping — confirm this cannot happen.
sql_engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(sql_credentials["user"], sql_credentials["password"],
                           sql_credentials["host"], sql_credentials["port"],
                           sql_credentials["database"]))

Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing database schema and expose the tables used below as mapped classes.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs


<h1> Utility functions for world keypoint normalization </h1>

In [None]:
def generate_rotation_matrix(u_base, v):
    """Return the 3x3 rotation matrix that rotates the direction of ``v`` onto ``u_base``.

    The matrix is built with the axis-angle (Rodrigues) formula: the axis is the
    normalised cross product ``u_base x v`` and the angle is the negated angle
    between the two directions, so that ``R @ (v / |v|) == u_base``.

    Args:
        u_base: length-3 target direction. It is used as given (not normalised),
            so it is expected to be a unit vector.
        v: length-3 vector whose direction should be mapped onto ``u_base``.

    Returns:
        numpy array of shape (3, 3).

    Notes:
        * When ``v`` is parallel to ``u_base`` the rotation axis is undefined;
          the identity is returned instead of a NaN-filled matrix.
        * When ``v`` is anti-parallel to ``u_base`` a rotation by pi about an
          arbitrary axis perpendicular to ``u_base`` is returned.
        * A zero-length ``v`` still yields NaNs (there is no direction to align).
    """
    u = v / np.linalg.norm(v)
    n = np.cross(u_base, u)
    n_norm = np.linalg.norm(n)
    # Round-off can push the cosine marginally outside [-1, 1], which would make arccos return NaN.
    cos_angle = np.clip(np.dot(u, u_base), -1.0, 1.0)

    if n_norm < 1e-12:
        # (Anti-)parallel vectors: the cross product vanishes, so normalising it would give NaNs.
        if cos_angle > 0:
            return np.eye(3)
        # Anti-parallel: pick the coordinate axis least aligned with u_base to build a perpendicular axis.
        helper = np.eye(3)[np.argmin(np.abs(u_base))]
        n = np.cross(u_base, helper)
        n = n / np.linalg.norm(n)
        # Rotation by pi about unit axis n is 2 * n n^T - I.
        return 2.0 * np.outer(n, n) - np.eye(3)

    n = n / n_norm
    theta = -np.arccos(cos_angle)
    c = np.cos(theta)
    s = np.sin(theta)

    R = np.array([[
        c + n[0]**2*(1-c),
        n[0]*n[1]*(1-c) - n[2]*s,
        n[0]*n[2]*(1-c) + n[1]*s
    ], [
        n[1]*n[0]*(1-c) + n[2]*s,
        c + n[1]**2*(1-c),
        n[1]*n[2]*(1-c) - n[0]*s,
    ], [
        n[2]*n[0]*(1-c) - n[1]*s,
        n[2]*n[1]*(1-c) + n[0]*s,
        c + n[2]**2*(1-c)
    ]])

    return R

def normalize_world_keypoints(world_keypoint_coordinates):
    """Express world keypoints in a fish-centred frame.

    The keypoints are translated so that 'TAIL_NOTCH' sits at the origin, rotated so
    that the 'UPPER_LIP' direction is mapped onto [1, 0, 0], and then rotated again so
    that the y/z part of 'DORSAL_FIN' is mapped onto [0, 0, 1].

    Args:
        world_keypoint_coordinates: mapping of body part name to a length-3 coordinate.
            Must contain 'TAIL_NOTCH', 'UPPER_LIP' and 'DORSAL_FIN'.

    Returns:
        dict mapping each body part (in sorted order) to its normalised numpy coordinate.
    """
    body_parts = sorted(world_keypoint_coordinates.keys())

    # Step 1: move the tail notch to the origin.
    tail_notch = np.array(world_keypoint_coordinates['TAIL_NOTCH'])
    shifted = {}
    for bp in body_parts:
        shifted[bp] = np.array(world_keypoint_coordinates[bp]) - tail_notch

    # Step 2: rotate the upper-lip direction onto the x axis.
    x_axis = np.array([1, 0, 0])
    to_x = generate_rotation_matrix(x_axis, shifted['UPPER_LIP'])
    aligned = {}
    for bp in body_parts:
        aligned[bp] = np.dot(to_x, shifted[bp])

    # Step 3: rotate the dorsal fin (with its x component removed) onto the z axis.
    z_axis = np.array([0, 0, 1])
    dorsal = aligned['DORSAL_FIN']
    dorsal_yz = dorsal - np.array([dorsal[0], 0, 0])
    to_z = generate_rotation_matrix(z_axis, dorsal_yz)

    norm_wkps = {}
    for bp in body_parts:
        norm_wkps[bp] = np.dot(to_z, aligned[bp])
    return norm_wkps
    


<h1> Utility Method: World Keypoint Calculation </h1>

In [None]:
# Roll back the session's current transaction (presumably to clear a failed/aborted
# transaction left over from earlier interactive queries — TODO confirm it is still needed).
session.rollback()

<h1> Train linear model with PCA + interaction features </h1>

In [None]:
# Load every stereo frame pair and derive one feature row per usable salmon sample.
sfps_all = session.query(StereoFramePair).all()

body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE'
])

session.rollback()

# Rows are collected in a list and converted to a DataFrame once at the end.
# Growing a frame with DataFrame.append inside the loop copies it on every iteration
# (quadratic cost), and DataFrame.append no longer exists in pandas >= 2.0.
records = []
for idx, row in enumerate(sfps_all):
    if idx % 10 == 0:
        print(idx)

    # get fish_id and ground truth metadata
    if row.gtsf_fish_identifier == '190321010002':
        continue
    ground_truth_metadata = json.loads(row.ground_truth_metadata)
    if ground_truth_metadata['data'].get('species') != 'salmon':
        continue

    # the weight is stored under one of two keys; skip the sample before doing any
    # feature work when neither is present
    if 'weight' in ground_truth_metadata['data']:
        weight_key = 'weight'
    elif 'weightKgs' in ground_truth_metadata['data']:
        weight_key = 'weightKgs'
    else:
        print('No weight recorded for fish ID: {}'.format(row.gtsf_fish_identifier))
        continue

    left_keypoints = json.loads(row.left_image_keypoint_coordinates)
    right_keypoints = json.loads(row.right_image_keypoint_coordinates)
    wkps = json.loads(row.world_keypoint_coordinates)

    # pairwise distances between all body parts; column 'i-j' refers to positions
    # i and j in the sorted body_parts list
    df_row = {'0': idx}
    for i in range(len(body_parts)-1):
        for j in range(i+1, len(body_parts)):
            d = euclidean_distance(wkps[body_parts[i]], wkps[body_parts[j]])
            df_row['{0}-{1}'.format(i, j)] = d

    # calculate curvature: angle (in degrees) between the normal of the plane spanned by
    # upper lip / dorsal fin / pelvic fin and the normal of the plane spanned by
    # pelvic fin / dorsal fin / tail notch
    wkp = {bp: [wkps[bp][2], wkps[bp][1], wkps[bp][0]] for bp in body_parts}
    fv1 = np.array(wkp['UPPER_LIP']) - np.array(wkp['DORSAL_FIN'])
    fv2 = np.array(wkp['UPPER_LIP']) - np.array(wkp['PELVIC_FIN'])
    n1 = np.cross(fv1, fv2)

    bv1 = np.array(wkp['PELVIC_FIN']) - np.array(wkp['TAIL_NOTCH'])
    bv2 = np.array(wkp['DORSAL_FIN']) - np.array(wkp['TAIL_NOTCH'])
    n2 = np.cross(bv1, bv2)
    # clip so that round-off cannot push the cosine outside arccos' domain
    cos_curvature = np.clip(np.dot(n1, n2) / (np.linalg.norm(n1) * np.linalg.norm(n2)), -1.0, 1.0)
    curvature_theta = (180 / np.pi) * np.arccos(cos_curvature)

    df_row['weight'] = ground_truth_metadata['data'][weight_key]
    df_row['date'] = row.date
    df_row['project_name'] = row.annotations_project_name
    df_row['left_keypoints'] = left_keypoints
    df_row['right_keypoints'] = right_keypoints
    df_row['world_keypoints'] = wkps
    df_row['gtsf_fish_identifier'] = row.gtsf_fish_identifier
    df_row['epoch'] = row.epoch
    df_row['stereo_frame_pair_id'] = row.id
    df_row['curvature_theta'] = curvature_theta

    records.append(df_row)

df = pd.DataFrame(records)
            
    



<h1> Apply filters </h1>

In [None]:
# Keep a copy of the complete feature table before any rows are removed.
df_cache = df.copy()
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Roll back the session's current transaction (presumably housekeeping after the bulk
# query above — TODO confirm it is still needed).
session.rollback()

In [None]:
# Persist the cached copy (taken before dropna) to HDF5 under the key 'key', so the
# database pass does not have to be repeated.
df_cache.to_hdf('/root/data/df_cache.h5', 'key')

In [None]:
# Reload the feature table from the HDF5 cache.
# NOTE(review): the cache was written from df_cache, i.e. the frame *before* dropna(), so
# this assignment brings back any rows with missing values — confirm that is intended.
df = pd.read_hdf('/root/data/df_cache.h5')

In [None]:
def coord2biomass_linear(world_keypoints, model):
    """Predict biomass from world keypoint coordinates.

    The feature vector is built exactly like the training features: all pairwise
    distances between body parts, followed by all quadratic products (i <= j) and
    all cubic products (i <= j <= k) of those distances. It is then standardised,
    projected onto the stored PCA components and fed to the linear regression.

    Args:
        world_keypoints: mapping of body part name to world coordinate; must contain
            every name listed in ``model['body_parts']``.
        model: dict with keys 'mean', 'std', 'PCA_components', 'reg_coef',
            'reg_intercept' and 'body_parts'.

    Returns:
        The regression output for this sample (presumably a weight in the same unit
        as the training labels — TODO confirm).
    """
    mean = model['mean']
    std = model['std']
    pca_components = model['PCA_components']
    reg_coef = model['reg_coef']
    reg_intercept = model['reg_intercept']
    body_parts = model['body_parts']

    # pairwise distances, in the exact ordering implied by body_parts
    num_parts = len(body_parts)
    pairwise_distances = [
        euclidean_distance(world_keypoints[body_parts[i]], world_keypoints[body_parts[j]])
        for i in range(num_parts - 1)
        for j in range(i + 1, num_parts)
    ]

    # interaction terms; the index ordering must match the column ordering used in training
    num_distances = len(pairwise_distances)
    interaction_values_quadratic = [
        pairwise_distances[i] * pairwise_distances[j]
        for i in range(num_distances)
        for j in range(i, num_distances)
    ]
    interaction_values_cubic = [
        pairwise_distances[i] * pairwise_distances[j] * pairwise_distances[k]
        for i in range(num_distances)
        for j in range(i, num_distances)
        for k in range(j, num_distances)
    ]

    X = np.array(pairwise_distances + interaction_values_quadratic + interaction_values_cubic)

    X_normalized = (X - mean) / std
    X_transformed = np.dot(X_normalized, pca_components.T)
    prediction = np.dot(X_transformed, reg_coef) + reg_intercept
    return prediction

In [None]:
def apply_filters(left_keypoints, right_keypoints, world_keypoints, baseline_biomass_model,
                  max_y_pixel_deviation=25, max_y_world_deviation=0.25,
                  min_baseline_weight=0, max_baseline_weight=15000):
    """Decide whether a stereo sample should be discarded as implausible.

    Three checks are applied in order; if several fail, ``reason`` describes the
    last one that failed.

    Args:
        left_keypoints: mapping of body part to left-image (x, y) coordinate.
        right_keypoints: mapping of body part to right-image (x, y) coordinate;
            must contain every body part of ``left_keypoints``.
        world_keypoints: mapping of body part to world coordinate.
        baseline_biomass_model: model dict accepted by ``coord2biomass_linear``.
        max_y_pixel_deviation: largest tolerated |y_left - y_right| over all body parts.
        max_y_world_deviation: largest tolerated spread of the normalised world
            y coordinate over all body parts.
        min_baseline_weight: baseline predictions below this value are rejected.
        max_baseline_weight: baseline predictions above this value are rejected.

    Returns:
        Tuple ``(filter_out, reason)``: ``filter_out`` is True when at least one check
        failed, ``reason`` is a short description or None.
    """
    filter_out, reason = False, None

    # apply image y-coordinate deviation filter
    body_parts = sorted(list(left_keypoints.keys()))
    max_y_coordinate_deviation = max([abs(left_keypoints[bp][1] - right_keypoints[bp][1]) for bp in body_parts])
    if max_y_coordinate_deviation > max_y_pixel_deviation:
        filter_out = True
        reason = 'Y-coordinate deviation too high'

    # apply world y-coordinate deviation filter
    norm_wkps = normalize_world_keypoints(world_keypoints)
    y_world_coordinates = [norm_wkps[bp][1] for bp in body_parts]
    max_y_world_coordinate_deviation = max(y_world_coordinates) - min(y_world_coordinates)
    if max_y_world_coordinate_deviation > max_y_world_deviation:
        filter_out = True
        reason = 'World y-coordinate deviation too high'

    # apply baseline biomass model
    baseline_weight_prediction = coord2biomass_linear(world_keypoints, baseline_biomass_model)
    if (baseline_weight_prediction < min_baseline_weight) or (baseline_weight_prediction > max_baseline_weight):
        filter_out = True
        reason = 'Baseline prediction way too off'

    return filter_out, reason


In [None]:
# Load the baseline model used by the plausibility filter. The file is opened in a
# context manager so the handle is closed once the model has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted model artifacts.
with open('/root/data/alok/biomass_estimation/models/model_v2.pkl', 'rb') as model_file:
    baseline_biomass_model = pickle.load(model_file)

# Flag (but do not drop) every row that fails one of the filters.
df['filter_out'] = False
df['reason'] = None
for idx, row in df.iterrows():
    filter_out, reason = \
        apply_filters(row.left_keypoints, row.right_keypoints, row.world_keypoints, baseline_biomass_model)
    if filter_out:
        df.at[idx, 'filter_out'] = True
        df.at[idx, 'reason'] = reason

    

In [None]:
# Drop rows annotated under the 'Automated keypoints detection' project and rows whose
# recorded weight is exactly 5057.0.
# NOTE(review): the filter_out / reason columns are not used in this selection — confirm
# whether flagged rows are meant to stay in the training data.
df = df[(df.project_name != 'Automated keypoints detection') & (df.weight != 5057.0)]

In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Build a boolean train/test mask with one entry per row of ``df``.

    Args:
        df: DataFrame whose number of rows determines the mask length.
        train_frac: fraction of rows to mark as training rows; the count is
            ``int(train_frac * len(df))``.
        randomize: when True (default) the True entries are shuffled in place using
            NumPy's global RNG (seed it beforehand for reproducibility); when False
            the first ``int(train_frac * len(df))`` rows are the training rows.

    Returns:
        1-D numpy bool array (positional, not index-aligned) that is True for
        training rows.
    """
    num_rows = df.shape[0]
    mask = np.zeros(num_rows, dtype=bool)
    mask[:int(train_frac * num_rows)] = True
    if randomize:
        np.random.shuffle(mask)
    return mask
    


In [None]:
# define all features

body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE'
])

# column 'i-j' holds the distance between body_parts[i] and body_parts[j]
pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in list(combinations(list(range(len(body_parts))), 2))]
interaction_columns_quadratic = []
interaction_columns_cubic = []

# The interaction features are collected in a dict and attached with a single concat.
# Inserting several thousand columns one at a time fragments the frame (pandas emits a
# PerformanceWarning) and writes into what may be a filtered slice of an earlier frame.
interaction_features = {}

# quadratic terms: products of two distances, i <= j
for i in range(len(pairwise_distance_columns)):
    for j in range(i, len(pairwise_distance_columns)):
        col1 = pairwise_distance_columns[i]
        col2 = pairwise_distance_columns[j]
        interaction_column = '{},{}'.format(col1, col2)
        interaction_features[interaction_column] = df[col1] * df[col2]
        interaction_columns_quadratic.append(interaction_column)

# cubic terms: products of three distances, i <= j <= k
for i in range(len(pairwise_distance_columns)):
    for j in range(i, len(pairwise_distance_columns)):
        col1 = pairwise_distance_columns[i]
        col2 = pairwise_distance_columns[j]
        pair_product = df[col1] * df[col2]
        for k in range(j, len(pairwise_distance_columns)):
            col3 = pairwise_distance_columns[k]
            interaction_column = '{},{},{}'.format(col1, col2, col3)
            interaction_features[interaction_column] = pair_product * df[col3]
            interaction_columns_cubic.append(interaction_column)

# Existing interaction columns are dropped first so that re-running the cell replaces
# them instead of producing duplicate column labels.
df = pd.concat(
    [
        df.drop(columns=list(interaction_features), errors='ignore'),
        pd.DataFrame(interaction_features, index=df.index),
    ],
    axis=1,
)

In [None]:
# Seed NumPy's global RNG so the shuffled train/test split below is reproducible.
np.random.seed(0)

# Random 80/20 split; index label 830 and every fish whose identifier contains '190620'
# are additionally excluded from training. Because the test set is taken as ~mask,
# those excluded rows end up in the test set.
mask = generate_train_mask(df, train_frac=0.8)
mask = mask & (df.index != 830) & (~df.gtsf_fish_identifier.str.contains('190620'))
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

X_train = df.loc[mask, columns].values
y_train = df.loc[mask, 'weight'].values
X_test = df.loc[~mask, columns].values
y_test = df.loc[~mask, 'weight'].values

# Standardise the features using statistics of the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)

# Full-rank PCA; its cumulative explained variance is only needed by the commented-out
# threshold rule below, since idx is currently hard-coded.
pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
pca.fit(X_train_normalized)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
# idx = np.where(explained_variance_ratio > 0.999)[0][0]
idx = 4
print(idx)

# Refit PCA keeping idx + 1 components and project both splits.
pca = PCA(n_components=idx+1)
pca.fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

# Linear regression on the principal components; the printed score is computed on the test rows.
reg = LinearRegression().fit(X_train_transformed, y_train)
print(reg.score(X_test_transformed, y_test))

# Predict for every row (train and test) and store the error columns used by later cells.
y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
df['prediction'] = y_pred
df['error'] = df.prediction - df.weight
df['error_pct'] = df.error / df.weight
df['abs_error_pct'] = df.error_pct.abs()

# Parameters needed to reproduce the prediction without sklearn; the keys are the ones
# read by coord2biomass_linear.
model = {
    'mean': scaler.mean_,
    'std': scaler.scale_,
    'PCA_components': pca.components_,
    'reg_coef': reg.coef_,
    'reg_intercept': reg.intercept_,
    'body_parts': body_parts   
}



In [None]:
# Persist the fitted model parameters. The context manager guarantees the file is
# flushed and closed, which a bare open(...) passed to pickle.dump does not.
with open('/root/data/alok/biomass_estimation/models/20190715_model_4_eig.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# m = (df.gtsf_fish_identifier.str.contains('viking')) 
# (df[m].prediction.mean() - df[m].weight.mean())/(df[m].weight.mean())

In [None]:
%matplotlib inline
plt.figure(figsize=(20, 10))


plt.scatter(df[mask]['weight'], df[mask]['prediction'])
plt.scatter(df[~mask]['weight'], df[~mask]['prediction'], color='r')
plt.xlabel('Ground Truth Weight')
plt.ylabel('Prediction')
plt.plot(range(10000), range(10000))


In [None]:
# Distribution of predictions for one fish identifier, with a vertical marker at 1500
# labelled as its ground-truth weight.
plt.figure(figsize=(15, 10))
plt.title('Predicted biomass histogram for underwater fish')
plt.hist(
    df.loc[df.gtsf_fish_identifier == '190620-4e4e0640-d4eb-405d-8fcf-57fda11d7660', 'prediction'],
    bins=20,
)
plt.axvline(1500, color='red', label='Ground Truth Weight')
plt.legend()
plt.grid()

<h1> Investigate Individual Cases </h1>

In [None]:
# Re-creates the same helper that is already instantiated near the top of the notebook,
# so this section can be run on its own once the imports have been executed.
data_access_utils = DataAccessUtils('/root/data/')

In [None]:
def visualize_stereo_frame_pair(stereo_frame_pair_id):
    """Show the left and right image of a stereo frame pair with their keypoints.

    Looks up the pair by id, downloads both images through ``data_access_utils``,
    draws them side by side and overlays every annotated keypoint as a red dot.
    The left image S3 key and both keypoint dicts are printed along the way.

    Args:
        stereo_frame_pair_id: value of ``StereoFramePair.id`` to look up. An
            IndexError is raised when no such row exists.
    """
    matches = session.query(StereoFramePair).filter(StereoFramePair.id == stereo_frame_pair_id).all()
    sfp = matches[0]

    left_key = sfp.left_image_s3_key
    right_key = sfp.right_image_s3_key
    bucket = sfp.image_s3_bucket
    keypoints_per_side = [
        json.loads(sfp.left_image_keypoint_coordinates),
        json.loads(sfp.right_image_keypoint_coordinates),
    ]

    # download_from_s3 returns something plt.imread accepts (presumably a local file path).
    left_file = data_access_utils.download_from_s3(bucket, left_key)
    right_file = data_access_utils.download_from_s3(bucket, right_key)
    print(left_key)
    images = [plt.imread(left_file), plt.imread(right_file)]

    fig, axes = plt.subplots(1, 2, figsize=(30, 20))
    # Left image goes on the first axis, right image on the second.
    for ax, image, keypoints in zip(axes, images, keypoints_per_side):
        ax.imshow(image)
        print(keypoints)
        for bp in keypoints:
            point = keypoints[bp]
            ax.scatter(point[0], point[1], s=2, label=bp, color='red')
    
    
    


<h2> Prediction = 2500 grams, Error = +68% </h2>

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3603.
visualize_stereo_frame_pair(3603)

<h2> Prediction = 2017.9 grams, Error = 34.5% </h2>

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3632.
visualize_stereo_frame_pair(3632)

<h2> Prediction = 1504 grams, Error = 0.2% </h2>

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3617.
visualize_stereo_frame_pair(3617)

<h2> Prediction = 1463 grams, Error = -2.4% </h2>

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3635.
visualize_stereo_frame_pair(3635)

<h2> Prediction = 1452 grams, Error = -3.1% </h2>

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3621.
visualize_stereo_frame_pair(3621)

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3624.
visualize_stereo_frame_pair(3624)

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3619.
visualize_stereo_frame_pair(3619)

In [None]:
# Show both images and the annotated keypoints of stereo frame pair 3756.
visualize_stereo_frame_pair(3756)

In [None]:
# Samples whose ground-truth weight is exactly 1500, ordered from smallest to largest
# absolute percentage error.
# .loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0; with a
# boolean row mask and column labels the selection is the same.
df.loc[df.weight == 1500,
       [
           'gtsf_fish_identifier',
           'epoch',
           'prediction',
           'stereo_frame_pair_id',
           'abs_error_pct',
           '6-7',
           'curvature_theta'
       ]
      ].sort_values('abs_error_pct')




In [None]:
# Same selection as above, narrowed to the row with index label 1855.
# .loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0.
df.loc[df.weight == 1500,
       [
           'gtsf_fish_identifier',
           'epoch',
           'prediction',
           'stereo_frame_pair_id',
           'abs_error_pct',
           '6-7',
           'curvature_theta'
       ]
      ].sort_values('abs_error_pct').loc[1855]



In [None]:
# Fetch the database row for stereo frame pair 3756 (raises IndexError if it does not exist).
sfp = session.query(StereoFramePair).filter(StereoFramePair.id == 3756).all()[0]

In [None]:
# Display the stored (JSON-encoded) left-image keypoint coordinates of this pair.
sfp.left_image_keypoint_coordinates

In [None]:
# Display the stored (JSON-encoded) right-image keypoint coordinates of this pair.
sfp.right_image_keypoint_coordinates

In [None]:
# Independent copy of the rows whose ground-truth weight is exactly 1500.
# .loc replaces .ix, which was deprecated in pandas 0.20 and removed in pandas 1.0; with a
# boolean row mask and column labels the selection is the same.
tdf = df.loc[df.weight == 1500,
             [
                 'gtsf_fish_identifier',
                 'epoch',
                 'prediction',
                 'stereo_frame_pair_id',
                 'abs_error_pct',
                 '6-7',
                 'curvature_theta'
             ]
            ].copy(deep=True)

In [None]:
# Relative deviation of the mean prediction from 1500, the ground-truth weight shared by
# every row in tdf (positive means over-prediction on average).
(tdf.prediction.mean() - 1500)/1500.