# GTSF phase I: biomass prediction

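In this notebook, we forecast fish weights by finding the closest Blender model. Note: the code cells below fit a PCA + linear-regression model on pairwise distances between GTSF world keypoints, and then study how perturbing those distances changes the predicted weight.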

### Look at the volumes created with blender

Load blender data

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import cv2

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from scipy.stats import norm
import tqdm
import pickle
from itertools import combinations
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.optics import euclidean_distance

from PIL import Image, ImageDraw
from multiprocessing import Pool, Manager
import copy
import uuid
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)





<h1> Get world keypoint coordinates from GTSF data </h1>

In [None]:
# Load the research database credentials from the JSON file named by the
# SQL_CREDENTIALS environment variable. The context manager closes the file
# handle, which the previous bare `open(...)` call never did.
with open(os.environ["SQL_CREDENTIALS"]) as credentials_file:
    research_sql_credentials = json.load(credentials_file)
research_rds_access_utils = RDSAccessUtils(research_sql_credentials)
sql_engine = research_rds_access_utils.sql_engine
Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing schema and expose the tables used below as mapped classes.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs

In [None]:
# S3 helper constructed with '/root/data' (presumably a local download/cache directory — TODO confirm).
s3_access_utils = S3AccessUtils('/root/data')

<h1> Create training dataset </h1>

In [None]:
# Build the training table `df`: one row per salmon stereo frame pair that has
# a recorded ground-truth weight. Each row holds all pairwise world-keypoint
# distances (columns named '<i>-<j>', indices into `body_parts`), the ground
# truth measurements, the raw keypoints and a curvature angle.
session.rollback()
sfps_all = session.query(StereoFramePair).all()

body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
    'UPPER_PRECAUDAL_PIT', 
    'LOWER_PRECAUDAL_PIT',
    'HYPURAL_PLATE'
])

session.rollback()

# Rows are accumulated in a list and converted to a DataFrame once at the end:
# growing a DataFrame with `DataFrame.append` inside the loop is quadratic and
# the method no longer exists in pandas >= 2.0.
records = []
for idx, row in enumerate(sfps_all):
    if idx % 100 == 0:
        print(idx)

    # This fish identifier is explicitly excluded from the dataset.
    if row.gtsf_fish_identifier == '190321010002':
        continue

    # Only salmon rows are kept. A missing 'data' section is treated as
    # "species unknown" and skipped instead of raising KeyError.
    ground_truth_metadata = json.loads(row.ground_truth_metadata)
    ground_truth_data = ground_truth_metadata.get('data') or {}
    if ground_truth_data.get('species') != 'salmon':
        continue

    # Weight is stored under 'weight' or 'weightKgs', length under 'length' or
    # 'lengthMms'. The weight is validated before it is used in the k-factor.
    weight = ground_truth_data.get('weight', ground_truth_data.get('weightKgs'))
    if not weight:
        print('No weight recorded for GTSF fish identifier: {}'.format(row.gtsf_fish_identifier))
        continue
    length = ground_truth_data.get('length', ground_truth_data.get('lengthMms'))
    kfactor = (weight / length**3) * 1e5 if length else None

    left_keypoints = json.loads(row.left_image_keypoint_coordinates)
    right_keypoints = json.loads(row.right_image_keypoint_coordinates)
    wkps = json.loads(row.world_keypoint_coordinates)

    # Pairwise distances between every pair of world keypoints.
    df_row = {'0': idx}
    for i in range(len(body_parts)-1):
        for j in range(i+1, len(body_parts)):
            d = euclidean_distance(wkps[body_parts[i]], wkps[body_parts[j]])
            df_row['{0}-{1}'.format(i, j)] = d

    # Curvature: angle (in degrees) between the normal of the plane through
    # UPPER_LIP / DORSAL_FIN / PELVIC_FIN and the normal of the plane through
    # TAIL_NOTCH / PELVIC_FIN / DORSAL_FIN. Coordinates are re-ordered as
    # [2], [1], [0] before the cross products.
    wkp = {bp: [wkps[bp][2], wkps[bp][1], wkps[bp][0]] for bp in body_parts}
    fv1 = np.array(wkp['UPPER_LIP']) - np.array(wkp['DORSAL_FIN'])
    fv2 = np.array(wkp['UPPER_LIP']) - np.array(wkp['PELVIC_FIN'])
    n1 = np.cross(fv1, fv2)

    bv1 = np.array(wkp['PELVIC_FIN']) - np.array(wkp['TAIL_NOTCH'])
    bv2 = np.array(wkp['DORSAL_FIN']) - np.array(wkp['TAIL_NOTCH'])
    n2 = np.cross(bv1, bv2)
    # Clip guards against rounding pushing the cosine marginally outside
    # [-1, 1], which would make arccos return NaN.
    cos_theta = np.dot(n1, n2) / (np.linalg.norm(n1) * np.linalg.norm(n2))
    curvature_theta = (180 / np.pi) * np.arccos(np.clip(cos_theta, -1.0, 1.0))

    df_row['weight'] = weight
    df_row['length'] = length
    df_row['kfactor'] = kfactor
    df_row['date'] = row.date
    df_row['project_name'] = row.annotations_project_name
    df_row['left_keypoints'] = left_keypoints
    df_row['right_keypoints'] = right_keypoints
    df_row['world_keypoints'] = wkps
    df_row['gtsf_fish_identifier'] = row.gtsf_fish_identifier
    df_row['epoch'] = row.epoch
    df_row['stereo_frame_pair_id'] = row.id
    df_row['curvature_theta'] = curvature_theta

    records.append(df_row)

df = pd.DataFrame(records)
            
    



<h1> Train Model with Old Calibration </h1>

In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Return a boolean NumPy array selecting the training rows of ``df``.

    Exactly ``int(train_frac * len(df))`` entries are True.

    Args:
        df: DataFrame whose row count determines the mask length.
        train_frac: fraction of rows to mark as training rows.
        randomize: when True (default) the True entries are placed at random
            positions using NumPy's global RNG (seed it beforehand for
            reproducibility); when False the first rows are the training rows.

    Returns:
        1-D ``numpy.ndarray`` of dtype bool with one entry per row of ``df``.
    """
    n_rows = df.shape[0]
    mask = np.zeros(n_rows, dtype=bool)
    mask[:int(train_frac * n_rows)] = True
    if randomize:
        np.random.shuffle(mask)
    return mask
    


In [None]:
# Define all model features: pairwise distances between the keypoints in
# `body_parts_subset`, plus every quadratic and cubic product of those
# distances (with repetition).

body_parts_subset = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
])

# Pairwise-distance columns in `df` are named by indices into `body_parts`.
body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]

pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in list(combinations(body_part_indices, 2))]
interaction_columns_quadratic = []
interaction_columns_cubic = []

# Interaction features are collected in a dict and attached to `df` with a
# single concat. Inserting thousands of columns one at a time fragments the
# frame and triggers pandas' PerformanceWarning.
interaction_features = {}
n_pairwise = len(pairwise_distance_columns)
for i in range(n_pairwise):
    for j in range(i, n_pairwise):
        col1 = pairwise_distance_columns[i]
        col2 = pairwise_distance_columns[j]
        interaction_column = '{},{}'.format(col1, col2)
        interaction_features[interaction_column] = df[col1] * df[col2]
        interaction_columns_quadratic.append(interaction_column)

for i in range(n_pairwise):
    for j in range(i, n_pairwise):
        for k in range(j, n_pairwise):
            col1 = pairwise_distance_columns[i]
            col2 = pairwise_distance_columns[j]
            col3 = pairwise_distance_columns[k]
            interaction_column = '{},{},{}'.format(col1, col2, col3)
            interaction_features[interaction_column] = df[col1] * df[col2] * df[col3]
            interaction_columns_cubic.append(interaction_column)

# Dropping any previously-added interaction columns first keeps this cell safe
# to re-run without creating duplicate column names.
df = pd.concat(
    [
        df.drop(columns=list(interaction_features), errors='ignore'),
        pd.DataFrame(interaction_features),
    ],
    axis=1,
)
            

In [None]:
# Fit the weight model: standardize features -> PCA -> linear regression.
np.random.seed(1)

# Random 80% of rows, minus every fish whose identifier contains '190620';
# everything not in the mask ends up in the test split.
mask = generate_train_mask(df, train_frac=0.8)
mask = mask & (~df.gtsf_fish_identifier.str.contains('190620'))
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

X_train, y_train = df.loc[mask, columns].values, df.loc[mask, 'weight'].values
X_test, y_test = df.loc[~mask, columns].values, df.loc[~mask, 'weight'].values

# Standardization statistics come from the training rows only.
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)

# First pass: full PCA, used only to find the smallest number of components
# whose cumulative explained variance ratio exceeds 0.999.
pca = PCA(n_components=min(X_train_normalized.shape))
pca.fit(X_train_normalized)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
idx = np.flatnonzero(explained_variance_ratio > 0.999)[0]
print(idx)

# Second pass: refit PCA with exactly that many components.
pca = PCA(n_components=idx + 1).fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

reg = LinearRegression().fit(X_train_transformed, y_train)

# Predictions and error columns are written for every row of df (train and test alike).
y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
df['prediction'] = y_pred
df['error'] = df['prediction'] - df['weight']
df['error_pct'] = df['error'] / df['weight']
df['abs_error_pct'] = df['error_pct'].abs()

# Fitted parameters gathered into a plain dict.
model = dict(
    mean=scaler.mean_,
    std=scaler.scale_,
    PCA_components=pca.components_,
    reg_coef=reg.coef_,
    reg_intercept=reg.intercept_,
    body_parts=body_parts_subset,
)



In [None]:
# Prints a marker once execution reaches this cell.
print('Done')

<h1> Perturb pairwise distances - biased perturbation </h1>

In [None]:
def get_biomass_error(mean_pct_err, std_pct_err):
    """Predict weights from randomly perturbed pairwise keypoint distances.

    For every row of the notebook-level ``df``, each pairwise world-keypoint
    distance is multiplied by ``1 + e`` with ``e`` drawn independently from
    ``Normal(mean_pct_err, std_pct_err)`` (NumPy global RNG). Quadratic and
    cubic interaction features are rebuilt from the perturbed distances and
    fed through the already-fitted notebook-level ``scaler``, ``pca`` and
    ``reg``, using the notebook-level feature list ``columns``.

    Args:
        mean_pct_err: mean of the relative distance error (0.05 == +5% bias).
        std_pct_err: standard deviation of the relative distance error.

    Returns:
        DataFrame with one row per row of ``df`` containing the perturbed
        pairwise distances, the interaction features, ``weight`` (ground
        truth), ``prediction`` and ``error`` (prediction - weight).
    """
    body_parts = sorted([
        'TAIL_NOTCH',
        'ADIPOSE_FIN',
        'ANAL_FIN',
        'PECTORAL_FIN',
        'PELVIC_FIN',
        'DORSAL_FIN',
        'UPPER_LIP',
        'EYE',
        'UPPER_PRECAUDAL_PIT', 
        'LOWER_PRECAUDAL_PIT',
        'HYPURAL_PLATE'
    ])

    body_parts_subset = sorted([
        'TAIL_NOTCH',
        'ADIPOSE_FIN',
        'ANAL_FIN',
        'PECTORAL_FIN',
        'PELVIC_FIN',
        'DORSAL_FIN',
        'UPPER_LIP',
        'EYE',
    ])

    # Rows are collected in a list and converted once: `DataFrame.append` in a
    # loop is quadratic and was removed in pandas 2.0. Iterating only the two
    # needed columns avoids materializing every (very wide) row of `df`.
    records = []
    for wkps, weight in zip(df.world_keypoints, df.weight):
        record = {}
        for i in range(len(body_parts)-1):
            for j in range(i+1, len(body_parts)):
                d = euclidean_distance(wkps[body_parts[i]], wkps[body_parts[j]])
                record['{0}-{1}'.format(i, j)] = d*(1+np.random.normal(mean_pct_err, std_pct_err))
        record['weight'] = weight
        records.append(record)
    perturbed_df = pd.DataFrame(records)

    # Rebuild the interaction features from the perturbed distances.
    body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]
    pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in list(combinations(body_part_indices, 2))]
    n_pairwise = len(pairwise_distance_columns)

    interaction_features = {}
    for i in range(n_pairwise):
        for j in range(i, n_pairwise):
            col1 = pairwise_distance_columns[i]
            col2 = pairwise_distance_columns[j]
            interaction_features['{},{}'.format(col1, col2)] = perturbed_df[col1] * perturbed_df[col2]

    for i in range(n_pairwise):
        for j in range(i, n_pairwise):
            for k in range(j, n_pairwise):
                col1 = pairwise_distance_columns[i]
                col2 = pairwise_distance_columns[j]
                col3 = pairwise_distance_columns[k]
                interaction_features['{},{},{}'.format(col1, col2, col3)] = (
                    perturbed_df[col1] * perturbed_df[col2] * perturbed_df[col3]
                )

    # Attach all interaction columns at once to avoid fragmenting the frame.
    perturbed_df = pd.concat([perturbed_df, pd.DataFrame(interaction_features)], axis=1)

    # `columns` is the notebook-level feature list the model was trained on.
    y_pred_perturbed = reg.predict(pca.transform(scaler.transform(perturbed_df[columns].values)))
    errs = y_pred_perturbed - perturbed_df.weight.values
    perturbed_df['prediction'] = y_pred_perturbed
    perturbed_df['error'] = errs

    return perturbed_df
    
    
    
    

In [None]:
# Sweep a pure bias (std = 0) in the pairwise distances and plot the resulting
# relative biomass error.
errors = list(np.arange(0, 0.2, 0.05))
biomass_errors = []
for e in errors:
    # get_biomass_error returns a per-fish DataFrame, so it is reduced to one
    # scalar per setting: median prediction error relative to the mean
    # ground-truth weight.
    perturbed = get_biomass_error(e, 0)
    biomass_error = np.median(perturbed['error'].values) / perturbed['weight'].values.mean()
    biomass_errors.append(biomass_error)
    
plt.figure(figsize=(20, 10))
plt.plot([100 * x for x in errors], [100 * x for x in biomass_errors])
plt.xlabel('Length error percentage (%)')
plt.ylabel('Biomass error percentage (%)')
plt.grid()
plt.show()

In [None]:
# Sweep zero-mean noise of increasing standard deviation in the pairwise
# distances and plot the resulting relative biomass error.
std_pct_errors = list(np.arange(0, 0.2, 0.05))
biomass_errors = []
for s in std_pct_errors:
    # get_biomass_error returns a per-fish DataFrame, so it is reduced to one
    # scalar per setting: median prediction error relative to the mean
    # ground-truth weight.
    perturbed = get_biomass_error(0, s)
    biomass_error = np.median(perturbed['error'].values) / perturbed['weight'].values.mean()
    biomass_errors.append(biomass_error)

plt.figure(figsize=(20, 10))
plt.plot([100 * x for x in std_pct_errors], [100 * x for x in biomass_errors])
# The x axis is the standard deviation of the relative length error, not a bias.
plt.xlabel('Length error standard deviation (%)')
plt.ylabel('Biomass error percentage (%)')
plt.grid()
plt.show()

In [None]:
# Per-fish predictions with zero-mean distance noise of standard deviation 0.05.
perturbed_df = get_biomass_error(0.0, 0.05)

In [None]:
# Ground-truth weight vs. prediction from perturbed distances, on equal axis ranges.
plt.scatter(perturbed_df.weight, perturbed_df.prediction)
plt.xlim([0, 10000])
plt.ylim([0, 10000])
# Axis labels so the figure can be read on its own.
plt.xlabel('Ground truth weight')
plt.ylabel('Predicted weight (perturbed distances)')
plt.show()

In [None]:
# Mean of (weight - prediction) / prediction over all perturbed rows.
# NOTE(review): this is normalized by the prediction, whereas `error_pct` in the
# training cell is (prediction - weight) / weight — confirm which convention is intended.
((perturbed_df.weight - perturbed_df.prediction)/perturbed_df.prediction).mean()

In [None]:
# NOTE(review): `s` is not defined in this cell; it is whatever value the loop
# variable of the standard-deviation sweep cell was left at, so this cell fails
# if that cell has not been run in the current kernel.
get_biomass_error(0, s)

In [None]:
# Monte Carlo check: if a length has a relative error x ~ Normal(0, 0.05), the
# relative error of its cube is (1 + x)**3 - 1. Print the mean and standard
# deviation of that quantity over N draws.
N = 100000
# One vectorized draw replaces the N-iteration Python loop.
errs = (1 + np.random.normal(0.0, .05, size=N))**3 - 1
print(np.mean(errs))
print(np.std(errs))

In [None]:
# Perturb the pairwise distances of the first row of `df` only and print the
# relative change in predicted weight compared with the unperturbed row.
mean_pct_err, std_pct_err = 0, 0.01

body_parts = sorted([
        'TAIL_NOTCH',
        'ADIPOSE_FIN',
        'ANAL_FIN',
        'PECTORAL_FIN',
        'PELVIC_FIN',
        'DORSAL_FIN',
        'UPPER_LIP',
        'EYE',
        'UPPER_PRECAUDAL_PIT', 
        'LOWER_PRECAUDAL_PIT',
        'HYPURAL_PLATE'
    ])

reference_row = df.head(1)

# Records are collected and converted once; `DataFrame.append` was removed in
# pandas 2.0.
records = []
for wkps, weight in zip(reference_row.world_keypoints, reference_row.weight):
    df_row = {}
    for i in range(len(body_parts)-1):
        for j in range(i+1, len(body_parts)):
            d = euclidean_distance(wkps[body_parts[i]], wkps[body_parts[j]])
            df_row['{0}-{1}'.format(i, j)] = d*(1+np.random.normal(mean_pct_err, std_pct_err))
    df_row['weight'] = weight
    records.append(df_row)
perturbed_df = pd.DataFrame(records)


# define all features

body_parts_subset = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
])

body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]

pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in list(combinations(body_part_indices, 2))]
interaction_columns_quadratic = []
interaction_columns_cubic = []

# Interaction columns are built in a dict and attached with a single concat to
# avoid thousands of single-column inserts.
interaction_features = {}
n_pairwise = len(pairwise_distance_columns)
for i in range(n_pairwise):
    for j in range(i, n_pairwise):
        col1 = pairwise_distance_columns[i]
        col2 = pairwise_distance_columns[j]
        interaction_column = '{},{}'.format(col1, col2)
        interaction_features[interaction_column] = perturbed_df[col1] * perturbed_df[col2]
        interaction_columns_quadratic.append(interaction_column)

for i in range(n_pairwise):
    for j in range(i, n_pairwise):
        for k in range(j, n_pairwise):
            col1 = pairwise_distance_columns[i]
            col2 = pairwise_distance_columns[j]
            col3 = pairwise_distance_columns[k]
            interaction_column = '{},{},{}'.format(col1, col2, col3)
            interaction_features[interaction_column] = perturbed_df[col1] * perturbed_df[col2] * perturbed_df[col3]
            interaction_columns_cubic.append(interaction_column)

perturbed_df = pd.concat([perturbed_df, pd.DataFrame(interaction_features)], axis=1)

# The unperturbed prediction is computed once and reused for both the
# deviation and its normalization.
baseline_prediction = reg.predict(pca.transform(scaler.transform(reference_row[columns].values)))
dev = reg.predict(pca.transform(scaler.transform(perturbed_df[columns].values))) - baseline_prediction
print(dev/baseline_prediction)




In [None]:
# PCA-space representation of the perturbed features (a) and of the
# unperturbed first row of df (b).
a = pca.transform(scaler.transform(perturbed_df[columns].values))
b = pca.transform(scaler.transform(df.head(1)[columns].values))






In [None]:
# Shape of the fitted PCA component matrix.
pca.components_.shape

In [None]:
# Shape of the standardized feature matrix built from perturbed_df.
scaler.transform(perturbed_df[columns].values).shape

In [None]:
# Regression output for the PCA-projected perturbed features `a`.
reg.predict(a)

In [None]:
# Manual projection of the standardized perturbed features onto the PCA components.
# NOTE(review): unlike pca.transform, this does not subtract pca.mean_; presumably
# that is negligible because PCA was fitted on standardized data — confirm.
np.dot(pca.components_, scaler.transform(perturbed_df[columns].values).T)

In [None]:
# Same manual projection for the unperturbed first row of df.
# NOTE(review): pca.mean_ is not subtracted here either — confirm this is intended.
np.dot(pca.components_, scaler.transform(df.head(1)[columns].values).T)

In [None]:
# Projection of the standardized perturbed features onto the PCA component at index 6.
# NOTE(review): the index is hard-coded; this raises IndexError if fewer than 7
# components were kept by the explained-variance threshold.
np.dot(pca.components_[6, :], scaler.transform(perturbed_df[columns].values).T)

In [None]:
# Projection of the standardized first row of df onto the PCA component at index 6.
# NOTE(review): the index is hard-coded; this raises IndexError if fewer than 7
# components were kept by the explained-variance threshold.
np.dot(pca.components_[6, :], scaler.transform(df.head(1)[columns].values).T)

In [None]:
# Standardized value of the feature at column index 7 for the first perturbed row.
scaler.transform(perturbed_df[columns].values)[0][7]

In [None]:
# Standardized value of the feature at column index 7 for the first row of df
# (note: this standardizes every row of df just to read one value).
scaler.transform(df[columns].values)[0][7]

In [None]:
# Per-feature scale of the fitted StandardScaler (the divisor applied by scaler.transform).
scaler.scale_

In [None]:
# Display the PCA-projected perturbed features computed earlier.
a

In [None]:
# Raw (unstandardized) feature values of perturbed_df, in model column order.
perturbed_df[columns].values

In [None]:
# Standardization done by hand from the scaler's fitted mean_ and scale_;
# presumably a cross-check against scaler.transform on the same input.
(perturbed_df[columns].values - scaler.mean_)/scaler.scale_

In [None]:
# Relative change of each standardized feature between the perturbed row and
# the unperturbed first row of df.
# NOTE(review): the denominator is a standardized value, which can be close to
# zero and make individual ratios very large — interpret with care.
x = (scaler.transform(perturbed_df[columns]) - scaler.transform(df.head(1)[columns]))/scaler.transform(df.head(1)[columns])




In [None]:
# Absolute change of each raw feature between the perturbed row and the first row of df.
(perturbed_df[columns].values - df.head(1)[columns].values)

In [None]:
# Linear fit of biomass error against distance error.
# NOTE(review): `errors` is defined in the bias-sweep cell, while `biomass_errors`
# is reassigned by the later standard-deviation sweep cell; when the notebook is
# run top to bottom the two lists come from different sweeps — confirm which
# sweep this regression is meant to use. The fit also requires `biomass_errors`
# to hold one number per entry of `errors`.
lr = LinearRegression().fit(np.array(errors)[:, np.newaxis], np.array(biomass_errors))

In [None]:
# Slope of the fitted line: change in biomass error per unit of distance error.
lr.coef_

In [None]:
# Ground-truth weight vs. prediction for the rows currently held in
# perturbed_df, with the identity line drawn for reference.
plt.figure(figsize=(20, 10))
plt.scatter(perturbed_df.weight.values, reg.predict(pca.transform(scaler.transform(perturbed_df[columns].values))))
plt.plot([0, 10000], [0, 10000])
plt.xlim([0, 10000])
plt.ylim([0, 10000])
# Axis labels so the figure can be read on its own.
plt.xlabel('Ground truth weight')
plt.ylabel('Predicted weight (perturbed distances)')

plt.show()