# GTSF phase: biomass prediction

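In this notebook, we predict fish weight (biomass) by scaling the volume of a canonical Blender model to the measured world keypoints and then regressing the ground-truth weight on that volume.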

### Look at the volumes created with blender

Load the Blender data.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from scipy.stats import norm


%matplotlib qt

In [None]:
# Read the cached Blender dump (volumes, keypoint coordinates, ...) into `data`.
with open("/root/data/alok/blender_data/volumes_all.json", mode="r") as volumes_file:
    data = json.loads(volumes_file.read())

Plot the distribution of the Blender model volumes.

In [None]:
# plt.scatter(np.array(data["dimensions"])[:, 1], data["volume"])
# plt.ylabel("Volume (cm^3)")
# plt.xlabel("Length (mm)")
# plt.show()

In [None]:
# Histogram of the volumes stored in the Blender dump.
blender_volumes = data["volume"]
plt.hist(blender_volumes)
plt.title(label="Blender volume histogram")
plt.show()

<h1> Get world keypoint coordinates from GTSF data </h1>

In [None]:
# AWS credentials are read from the JSON file named by the AWS_CREDENTIALS
# environment variable. The context manager closes the file handle right away
# instead of leaving it to the garbage collector.
with open(os.environ["AWS_CREDENTIALS"]) as aws_credentials_file:
    aws_credentials = json.load(aws_credentials_file)
s3_client = boto3.client('s3', aws_access_key_id=aws_credentials["aws_access_key_id"],
                         aws_secret_access_key=aws_credentials["aws_secret_access_key"],
                         region_name="eu-west-1")


# Same pattern for the PostgreSQL credentials (SQL_CREDENTIALS environment variable).
with open(os.environ["SQL_CREDENTIALS"]) as sql_credentials_file:
    sql_credentials = json.load(sql_credentials_file)
sql_engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(sql_credentials["user"], sql_credentials["password"],
                           sql_credentials["host"], sql_credentials["port"],
                           sql_credentials["database"]))

Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing database schema and expose the tables used below as
# ORM classes.
# NOTE(review): `reflect=True` is deprecated in newer SQLAlchemy releases in
# favour of `autoload_with=` - confirm the installed version before changing.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs


<h1> Utility functions for world keypoint normalization </h1>

In [None]:
def generate_rotation_matrix(u_base, v):
    """Return the 3x3 rotation matrix that rotates the direction of ``v`` onto ``u_base``.

    The matrix is built with Rodrigues' rotation formula: the rotation axis is
    the normalised cross product ``u_base x v`` and the angle is minus the
    angle between the two directions, so that ``R @ (v / |v|)`` equals
    ``u_base / |u_base|``.

    Parameters
    ----------
    u_base : array-like of 3 numbers
        Target direction. It is normalised internally, so it does not need to
        be a unit vector.
    v : array-like of 3 numbers
        Direction to be rotated onto ``u_base``. Only its direction matters.

    Returns
    -------
    numpy.ndarray
        A (3, 3) rotation matrix. If ``v`` is already aligned with ``u_base``
        the identity is returned; if it points the opposite way a rotation by
        pi about an axis perpendicular to ``u_base`` is returned. A zero-length
        ``v`` yields a matrix of NaNs.
    """
    u_base = np.asarray(u_base, dtype=float)
    u_base = u_base / np.linalg.norm(u_base)
    v = np.asarray(v, dtype=float)
    u = v / np.linalg.norm(v)

    # Rounding can push the dot product of two unit vectors marginally outside
    # [-1, 1], which would make arccos return NaN.
    cos_angle = np.clip(np.dot(u, u_base), -1.0, 1.0)

    n = np.cross(u_base, u)
    n_norm = np.linalg.norm(n)

    # Parallel / anti-parallel directions have a vanishing cross product, so
    # the rotation axis cannot be obtained by normalising it.
    if n_norm < np.finfo(float).eps:
        if cos_angle > 0:
            return np.eye(3)
        # Opposite directions: rotate by pi about any axis perpendicular to
        # u_base. The coordinate axis least aligned with u_base gives a
        # well-conditioned cross product.
        helper = np.zeros(3)
        helper[np.argmin(np.abs(u_base))] = 1.0
        n = np.cross(u_base, helper)
        n = n / np.linalg.norm(n)
        return 2.0 * np.outer(n, n) - np.eye(3)

    n = n / n_norm
    theta = -np.arccos(cos_angle)
    cos_t, sin_t = np.cos(theta), np.sin(theta)

    # Skew-symmetric cross-product matrix of the rotation axis.
    K = np.array([
        [0.0, -n[2], n[1]],
        [n[2], 0.0, -n[0]],
        [-n[1], n[0], 0.0],
    ])

    # Rodrigues' formula: R = cos(t) I + sin(t) K + (1 - cos(t)) n n^T
    R = cos_t * np.eye(3) + sin_t * K + (1.0 - cos_t) * np.outer(n, n)

    return R

In [None]:
def normalize_world_keypoints(wkps):
    """Express world keypoints in a canonical, fish-aligned frame.

    ``wkps`` maps body-part names to 3-D numpy coordinates and must contain
    the 'TAIL_NOTCH', 'UPPER_LIP' and 'DORSAL_FIN' entries. The keypoints are
    translated so that the tail notch sits at the origin, rotated so that the
    upper lip direction maps onto the x axis, and rotated again so that the
    y/z part of the dorsal fin maps onto the z axis.

    Returns a new dict with the same keys and the transformed coordinates.
    """
    # Step 1: move the tail notch to the origin.
    origin = wkps['TAIL_NOTCH']
    centered = {}
    for name in wkps:
        centered[name] = wkps[name] - origin

    # Step 2: rotate the tail-notch -> upper-lip direction onto the x axis.
    x_axis = np.array([1, 0, 0])
    first_rotation = generate_rotation_matrix(x_axis, centered['UPPER_LIP'])
    aligned = {name: np.dot(first_rotation, point) for name, point in centered.items()}

    # Step 3: drop the x component of the dorsal fin and rotate what is left
    # onto the z axis.
    z_axis = np.array([0, 0, 1])
    dorsal = aligned['DORSAL_FIN']
    dorsal_off_axis = dorsal - np.array([dorsal[0], 0, 0])
    second_rotation = generate_rotation_matrix(z_axis, dorsal_off_axis)

    return {name: np.dot(second_rotation, point) for name, point in aligned.items()}
    


<h1> Get normalized world keypoints of all cached Blender models </h1>

In [None]:
def euclidean_distance(p1, p2):
    """Return the norm of the difference between the numpy arrays ``p1`` and ``p2``."""
    offset = p1 - p2
    return np.linalg.norm(offset)

In [None]:
# Per-body-part weights; every keypoint currently carries the same weight.
weight_bp = dict.fromkeys(
    [
        'UPPER_LIP',
        'PECTORAL_FIN',
        'TAIL_NOTCH',
        'DORSAL_FIN',
        'ANAL_FIN',
        'ADIPOSE_FIN',
        'EYE',
        'PELVIC_FIN',
    ],
    1.0,
)

In [None]:
# Load every stereo frame pair row through the ORM session.
stereo_frame_pair_query = session.query(StereoFramePair)
sfps = stereo_frame_pair_query.all()

In [None]:
body_part_names = list(data['mapping'].keys())

# The first Blender model is used as the canonical fish. Its coordinates are
# multiplied by 1e-3 before normalisation.
# NOTE(review): presumably a mm -> m conversion to match the GTSF keypoints - confirm.
canonical_wkps = {bp: 1e-3*np.array(data['coordinates'][0][bp]) for bp in body_part_names}

norm_canonical_wkps = normalize_world_keypoints(canonical_wkps)
canonical_volume = data['volume'][0]

# The denominators of the scale factors depend only on the canonical model,
# so they are computed once per axis instead of once per row.
canonical_sq_sums = [
    sum(norm_canonical_wkps[bp][axis]**2 * weight_bp[bp] for bp in body_part_names)
    for axis in range(3)
]


def _axis_scale_factor(norm_wkps, axis):
    """Absolute weighted least-squares scale along ``axis`` (0=x, 1=y, 2=z).

    This is the factor ``s`` minimising ``sum_bp w_bp * (observed - s * canonical)**2``
    over the body parts, with its sign dropped.
    """
    cross_sum = sum(
        norm_canonical_wkps[bp][axis] * norm_wkps[bp][axis] * weight_bp[bp]
        for bp in body_part_names
    )
    return abs(cross_sum / canonical_sq_sums[axis])


analysis_df = pd.DataFrame()
predicted_volumes = []
gt_biomass = []
gt_kfactor = []
y_factors = []
ys = []
for row in sfps:
    # Filter on species first so that non-salmon rows are skipped before the
    # keypoints are parsed and normalised.
    ground_truth_metadata = json.loads(row.ground_truth_metadata)
    species = ground_truth_metadata['data'].get('species')
    if species != 'salmon':
        continue
    ground_truth_biomass = ground_truth_metadata['data']['weight']
    ground_truth_length = ground_truth_metadata['data']['length']

    # extract and normalize the predicted 3D keypoints
    wkps = json.loads(row.world_keypoint_coordinates)
    wkps = {bp: np.array(coordinates) for bp, coordinates in wkps.items()}
    norm_wkps = normalize_world_keypoints(wkps)

    x_factor = _axis_scale_factor(norm_wkps, 0)
    y_factor = _axis_scale_factor(norm_wkps, 1)
    z_factor = _axis_scale_factor(norm_wkps, 2)

    # The canonical volume is scaled by the x and z factors; the y factor only
    # enters through a small linear correction around 0.9.
    # NOTE(review): 0.9 and 0.12 are undocumented empirical constants - confirm their origin.
    volume = canonical_volume * x_factor * z_factor * (1 + (y_factor - 0.9) * 0.12)
    ys.append(y_factor)

    # Only samples with a ground-truth weight above 1000 enter the regression.
    if ground_truth_biomass > 1000:
        gt_biomass.append(ground_truth_biomass)
        predicted_volumes.append(volume)
        gt_kfactor.append(1e5 * ground_truth_biomass / ground_truth_length**3)
    
    


In [None]:
# Regress the ground-truth biomass on the predicted volume; scikit-learn wants
# a 2-D feature matrix, so the volumes become a single column for the fit.
predictions = np.expand_dims(np.array(predicted_volumes), axis=1)
reg = LinearRegression()
reg.fit(predictions, gt_biomass)
print(reg.coef_, reg.intercept_)
r_squared = reg.score(predictions, gt_biomass)
print("R2 : {}".format(r_squared))
# Back to a flat vector for the cells below.
predictions = predictions.squeeze()

In [None]:
# Predicted vs. ground-truth weight, coloured by K factor; the dashed red
# line marks a perfect prediction.
fitted_weights = predictions*reg.coef_ + reg.intercept_
plt.figure(figsize=(10, 10))
plt.plot([0, 5000], [0, 5000], "--", c="r", linewidth=2)
plt.scatter(gt_biomass, fitted_weights, c=gt_kfactor)
plt.colorbar()
plt.ylabel("Predicted weight")
plt.xlabel("Ground truth weight")
plt.axis("scaled")
plt.show()

In [None]:
# Pair each biomass with its K factor: column 0 = biomass, column 1 = K factor.
ground_truth = np.array([[biomass, kfactor] for biomass, kfactor in zip(gt_biomass, gt_kfactor)])

In [None]:
# Signed and absolute errors of the fitted weights against the ground truth.
fitted_predictions = predictions*reg.coef_ + reg.intercept_
error = fitted_predictions-gt_biomass
mean_absolute_error = np.nanmean(np.abs(error))
mean_error = np.nanmean(error)
print("Average absolute error: {}".format(mean_absolute_error))
print("Average error: {}".format(mean_error))
# error5 = predictions_average-ground_truth
#print("Average absolute error5: {}".format(np.nanmean(np.abs(error5))))
# Signed relative error in percent of the ground-truth weight.
relative_error = (error / gt_biomass)*100
mean_relative_error = np.nanmean(relative_error)
print("Average relative error: {} %".format(mean_relative_error))

In [None]:
# Import from the public namespace; `scipy.stats.kde` is a deprecated private module path.
from scipy.stats import gaussian_kde

In [None]:
# Density histogram of the signed error with a Gaussian KDE drawn on top.
error_low, error_high = min(error), max(error)
dist_space = np.linspace(error_low, error_high, 100)
kde = gaussian_kde(error)
kde_values = kde(dist_space)
plt.hist(error, bins=20, density=True)
plt.plot(dist_space, kde_values)
plt.title("Error")
plt.show()

In [None]:
# Density histogram of the relative error (%) with a Gaussian KDE drawn on top.
relative_low, relative_high = min(relative_error), max(relative_error)
dist_space = np.linspace(relative_low, relative_high, 100)
kde = gaussian_kde(relative_error)
kde_values = kde(dist_space)
plt.hist(relative_error, bins=20, density=True)
plt.plot(dist_space, kde_values)
plt.title("Relative Error (%)")
plt.show()

In [None]:
# Percentiles (0, 5, ..., 100) of the absolute relative error.
values = np.arange(start=0, stop=101, step=5)
absolute_relative_error = np.abs(relative_error)
percentiles = np.percentile(absolute_relative_error, values)

In [None]:
# Cumulative view: share of samples (y) whose absolute relative error is at
# most the value on the x axis.
percentage_ticks = np.arange(0,101,5)
plt.figure(figsize=(10,10))
plt.plot(percentiles, values)
plt.yticks(percentage_ticks)
plt.xlabel("Absolute relative error (%)")
plt.ylabel("Percentage")
plt.grid()
plt.show()

In [None]:
from scipy.stats import kstest
from scipy.optimize import curve_fit
from scipy.stats import norm

In [None]:
# Fit a normal distribution to the fitted predictions and draw its PDF over
# the density-normalised histogram.
mean, std = norm.fit(fitted_predictions)
print("Mean: {}, Standard deviation: {}".format(mean, std))
# `density=True` replaces the `normed` keyword, which current Matplotlib
# releases no longer accept; it also matches the other histograms in this notebook.
plt.hist(fitted_predictions, bins=20, density=True)
# Evaluate the PDF over the x range the histogram ended up with.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
y = norm.pdf(x, mean, std)
plt.plot(x, y)
plt.show()

In [None]:
# Kolmogorov-Smirnov test of the fitted predictions against the normal
# distribution parameterised by `mean` and `std`.
fitted_normal = norm(loc=mean, scale=std)
kstest(fitted_predictions, fitted_normal.cdf)

<h1> Cross validation </h1>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Repeated random 80/20 hold-out: refit the linear model on each training
# split and record the mean signed / relative error on the matching test split.
predictions = np.squeeze(predictions)
# scikit-learn needs a 2-D feature matrix. It is built once here, and
# `predictions` itself is never reshaped inside the loop, so an interrupted
# run cannot leave it in the wrong shape for a re-run.
features = np.reshape(predictions, (-1, 1))
all_errors = []
all_relative_errors = []
for _ in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(features, gt_biomass, test_size=0.2)
    X_test = np.squeeze(X_test)

    reg = LinearRegression().fit(X_train, y_train)
    # print(reg.coef_, reg.intercept_)
    # print("R2 : {}".format(reg.score(X_train, y_train)))

    fitted_X_test = X_test*reg.coef_ + reg.intercept_
    error = fitted_X_test-y_test
    relative_error = (error / y_test)*100
    all_errors.append(np.nanmean(error))
    all_relative_errors.append(np.nanmean(relative_error))


In [None]:
# One histogram per cross-validation metric, each shown in its own figure.
for metric_values, axis_label in (
    (all_errors, "Average error distribution"),
    (all_relative_errors, "Average relative error distribution"),
):
    plt.hist(metric_values)
    plt.xlabel(axis_label)
    plt.show()

<h1> Extra Metrics </h1>

In [None]:
# Repeated random 80/20 hold-out on (biomass, K factor) pairs. For every split
# this keeps the per-sample errors, the error on the mean weight and the
# K factors of the test samples.
errors = []
errors_means = []
kfactors = []
predictions = np.squeeze(predictions)
# 2-D feature matrix built once; `predictions` itself stays 1-D, so an
# interrupted run cannot leave it in the wrong shape for a re-run.
features = np.reshape(predictions, (-1, 1))

for _ in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(features, ground_truth, test_size=0.2)
    X_test = np.squeeze(X_test)

    # Column 0 of the targets is the biomass, column 1 the K factor.
    reg = LinearRegression().fit(X_train, y_train[:, 0])

    fitted_X_test = X_test*reg.coef_ + reg.intercept_
    error_mean = np.mean(fitted_X_test) - np.mean(y_test[:, 0])
    error = fitted_X_test - y_test[:, 0]
    errors_means.append(error_mean)
    errors.append(error)
    kfactors.append(y_test[:, 1])

In [None]:
# Distribution, over all splits, of the error on the mean test-set weight.
plt.hist(errors_means)
plt.title(label="Error on mean")
plt.show()

In [None]:
# Pick one cross-validation split at random and plot the absolute error of its
# test samples against their K factor.
# randint's upper bound is exclusive, so len(errors) keeps idx a valid index.
idx = np.random.randint(0, len(errors))
abs_error = np.abs(errors[idx])
# Plot the absolute error so that the data matches the axis label.
plt.scatter(kfactors[idx], abs_error)
plt.xlabel("K factor")
plt.ylabel("absolute error")
plt.show()

In [None]:
errors = []

# Hold out half of the samples: fit the linear model on one half and apply it
# to the other half.
X_train, X_test, y_train, y_test = train_test_split(predictions, gt_biomass, test_size=0.5)
X_train = X_train[..., np.newaxis]
X_test = np.squeeze(X_test)

reg = LinearRegression()
reg.fit(X_train, y_train)

fitted_X_test = X_test*reg.coef_ + reg.intercept_

In [None]:
# For each sample size, draw 100 random subsets of the held-out half and record
# the absolute relative error (%) between the mean predicted weight and the
# mean ground-truth weight of the subset.
# Reset the accumulator so that re-running this cell does not append to the
# results of a previous run.
errors = []
for i in range(50, 260, 50):
    sample_errors = []
    for _ in range(100):
        # Sampling without replacement raises ValueError when the held-out
        # half holds fewer than `i` samples.
        random_idx = np.random.choice(len(X_test), size=i, replace=False)
        fitted_X_test_subset = fitted_X_test[random_idx]
        y_test_subset = np.array(y_test)[random_idx]

        # Compare the means of the same subset: the prediction mean must be
        # taken over the subset, not over the whole held-out half.
        err = (np.mean(fitted_X_test_subset) - np.mean(y_test_subset))*100 / np.mean(y_test_subset)
        err = np.abs(err)
        sample_errors.append(err)
    errors.append(sample_errors)

In [None]:
# One percentile curve per sample size; errors[c] holds the errors recorded
# for the c-th sample size.
plt.figure(figsize=(10,10))
values = np.arange(0, 101, 5)
for c, i in enumerate(range(50, 260, 50)):
    percentiles = np.percentile(errors[c], values)
    plt.plot(values, percentiles, label="Sample size {}".format(i))

plt.xticks(np.arange(0,101,5))
plt.yticks(np.arange(0, 9, 1))
plt.xlabel("Percentage")
# plt.ylabel("Mean Absolute relative error (%)")
plt.grid()
plt.legend(loc='upper left')
plt.show()