# GTSF phase: biomass prediction

In this notebook, we predict fish weight by finding the closest Blender model.

### Look at the volumes created with Blender

Load the Blender data

In [None]:
import pandas as pd
import numpy as np
import json

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from scipy.stats import norm
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 50)
pd.set_option('display.float_format', lambda x: str(x))

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from itertools import combinations

from sklearn.preprocessing import quantile_transform

%matplotlib notebook


In [None]:
# Parse the Blender export file into `data`; the handle is closed as soon as
# the text has been read.
BLENDER_VOLUMES_PATH = "/root/data/alok/blender_data/volumes_all.json"
with open(BLENDER_VOLUMES_PATH, "r") as f:
    data = json.loads(f.read())

Plot the distribution of the Blender model volumes

In [None]:
# plt.scatter(np.array(data["dimensions"])[:, 1], data["volume"])
# plt.ylabel("Volume (cm^3)")
# plt.xlabel("Length (mm)")
# plt.show()

In [None]:
# Histogram of the `volume` entries of the loaded Blender data.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(data["volume"])
hist_ax.set_title("Blender volume histogram")
plt.show()

<h1> Get world keypoint coordinates from GTSF data </h1>

In [None]:
# --- AWS -------------------------------------------------------------------
# Credentials are read from the JSON file whose path is given by the
# AWS_CREDENTIALS environment variable. The file is opened in a `with` block
# so the handle is closed immediately instead of being left to the garbage
# collector.
with open(os.environ["AWS_CREDENTIALS"]) as aws_credentials_file:
    aws_credentials = json.load(aws_credentials_file)
s3_client = boto3.client('s3', aws_access_key_id=aws_credentials["aws_access_key_id"],
                         aws_secret_access_key=aws_credentials["aws_secret_access_key"],
                         region_name="eu-west-1")


# --- PostgreSQL --------------------------------------------------------------
# Same pattern for the database credentials (path taken from SQL_CREDENTIALS).
with open(os.environ["SQL_CREDENTIALS"]) as sql_credentials_file:
    sql_credentials = json.load(sql_credentials_file)
# NOTE(review): user/password are interpolated into the URL verbatim; a password
# containing characters such as '@' or '/' would need URL-escaping - confirm.
sql_engine = create_engine("postgresql://{}:{}@{}:{}/{}".format(sql_credentials["user"], sql_credentials["password"],
                           sql_credentials["host"], sql_credentials["port"],
                           sql_credentials["database"]))

Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing schema and expose the tables used in this notebook as
# mapped classes.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs


    


<h1> Utility functions for world keypoint normalization </h1>

In [None]:
def euclidean_distance(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    p1, p2 : array-like
        Coordinates of the two points (list, tuple or ``numpy`` array) with
        broadcast-compatible shapes.

    Returns
    -------
    numpy.float64
        ``||p1 - p2||`` as computed by ``numpy.linalg.norm``.
    """
    # np.asarray accepts any sequence (the previous `type(x) == list` check let
    # tuples through unconverted, which then failed on subtraction). Casting to
    # float also prevents wrap-around when unsigned integer arrays are subtracted.
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)
    return np.linalg.norm(p1 - p2)


def generate_rotation_matrix(u_base, v):
    """Return the 3x3 rotation matrix that rotates the direction of ``v`` onto ``u_base``.

    The matrix is built with Rodrigues' rotation formula: the axis is
    ``u_base x v`` (normalised) and the angle is minus the angle between the
    two vectors, so ``R.dot(v / |v|)`` equals ``u_base / |u_base|``.

    Parameters
    ----------
    u_base : array-like, shape (3,)
        Target direction. It is normalised internally.
    v : array-like, shape (3,)
        Vector whose direction should be mapped onto ``u_base``.

    Returns
    -------
    numpy.ndarray, shape (3, 3)
        Proper rotation matrix.

    Raises
    ------
    ValueError
        If either vector has zero length (no direction is defined).
    """
    u_base = np.asarray(u_base, dtype=float)
    v = np.asarray(v, dtype=float)
    base_norm = np.linalg.norm(u_base)
    v_norm = np.linalg.norm(v)
    if base_norm == 0 or v_norm == 0:
        raise ValueError("generate_rotation_matrix requires non-zero vectors")
    u_base = u_base / base_norm
    u = v / v_norm

    n = np.cross(u_base, u)
    n_norm = np.linalg.norm(n)  # equals sin(angle between u_base and u)
    cos_angle = np.dot(u, u_base)

    # Degenerate case: the vectors are (anti-)parallel, so the cross product
    # vanishes and no rotation axis can be derived from it. Normalising the
    # zero vector would fill the whole matrix with NaN.
    if n_norm < 1e-12:
        if cos_angle > 0:
            return np.eye(3)
        # Anti-parallel: rotate by pi about any axis perpendicular to u_base.
        # Crossing with the coordinate axis least aligned with u_base gives one.
        helper = np.zeros(3)
        helper[np.argmin(np.abs(u_base))] = 1.0
        n = np.cross(u_base, helper)
        n = n / np.linalg.norm(n)
        return 2.0 * np.outer(n, n) - np.eye(3)

    n = n / n_norm
    # arctan2 is well-conditioned for all angles and cannot return NaN the way
    # arccos does when rounding pushes the dot product slightly outside [-1, 1].
    theta = -np.arctan2(n_norm, cos_angle)
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)

    # Cross-product (skew-symmetric) matrix of the rotation axis.
    K = np.array([
        [0.0, -n[2], n[1]],
        [n[2], 0.0, -n[0]],
        [-n[1], n[0], 0.0],
    ])

    # Rodrigues: R = cos(t) I + sin(t) K + (1 - cos(t)) n n^T
    R = cos_t * np.eye(3) + sin_t * K + (1.0 - cos_t) * np.outer(n, n)

    return R

In [None]:
def normalize_world_keypoints(wkps):
    """Re-express a set of world keypoints in a canonical frame.

    Steps, in order:
      1. translate every keypoint so that 'TAIL_NOTCH' sits at the origin;
      2. apply the rotation returned by ``generate_rotation_matrix`` for base
         axis [1, 0, 0] and the translated 'UPPER_LIP' vector;
      3. apply the rotation returned for base axis [0, 0, 1] and the
         'DORSAL_FIN' vector with its x component removed;
      4. negate all y coordinates if 'PECTORAL_FIN' ends up with y > 0.

    ``wkps`` maps body-part name -> numpy array of 3 coordinates and must
    contain the four body parts named above. A new dict with the same keys is
    returned; the input is not modified.
    """
    part_names = wkps.keys()

    def _rotate_all(rotation, points):
        # apply one rotation matrix to every keypoint
        return {name: np.dot(rotation, points[name]) for name in part_names}

    # 1. tail notch -> origin
    origin = wkps['TAIL_NOTCH']
    shifted = {}
    for name in part_names:
        shifted[name] = wkps[name] - origin

    # 2. first rotation, driven by the upper lip and the x axis
    x_axis = np.array([1, 0, 0])
    lip_rotation = generate_rotation_matrix(x_axis, shifted['UPPER_LIP'])
    lip_aligned = _rotate_all(lip_rotation, shifted)

    # 3. second rotation, driven by the dorsal fin's off-x-axis component and the z axis
    z_axis = np.array([0, 0, 1])
    dorsal = lip_aligned['DORSAL_FIN']
    dorsal_off_axis = dorsal - np.array([dorsal[0], 0, 0])
    dorsal_rotation = generate_rotation_matrix(z_axis, dorsal_off_axis)
    norm_wkps = _rotate_all(dorsal_rotation, lip_aligned)

    # 4. reflection, only when the pectoral fin has positive y
    if not norm_wkps['PECTORAL_FIN'][1] > 0:
        return norm_wkps
    return {
        name: np.array([point[0], -point[1], point[2]])
        for name, point in norm_wkps.items()
    }
    


<h1> Get normalized world keypoints of all cached Blender models </h1>

<h1> Linear Model </h1>

In [None]:
# Clear any failed/pending transaction before querying, then pull every
# stereo frame pair.
session.rollback()
sfps = session.query(StereoFramePair).all()

# One record per retained fish. Records are accumulated in a list and turned
# into a DataFrame once at the end: growing a frame with DataFrame.append inside
# the loop copies the whole frame on every iteration, and the method was
# removed in pandas 2.0.
records = []
for idx, sfp in enumerate(sfps):

    # this fish identifier is explicitly excluded from the dataset
    if sfp.gtsf_fish_identifier == '190321010002':
        continue

    ground_truth_metadata = json.loads(sfp.ground_truth_metadata)
    if ground_truth_metadata['data'].get('species') != 'salmon':
        continue

    wkps = json.loads(sfp.world_keypoint_coordinates)
    body_parts = list(wkps.keys())
    wkps = {bp: np.array(wkps[bp]) for bp in body_parts}

    # Features: distance between every unordered pair of keypoints, keyed by
    # the pair's positions in `body_parts`.
    # NOTE(review): column 'i-j' only means the same body-part pair on every row
    # if the keys of the stored JSON come in the same order for every record - confirm.
    row = {'0': idx}
    for i, j in combinations(range(len(body_parts)), 2):
        row['{0}-{1}'.format(i, j)] = euclidean_distance(wkps[body_parts[i]], wkps[body_parts[j]])

    weight = ground_truth_metadata['data']['weight']
    length = ground_truth_metadata['data']['length']
    row['weight'] = weight
    row['length'] = length
    row['kfactor'] = 1e5 * weight / length**3
    records.append(row)

df = pd.DataFrame(records)
            
    
                                                       

In [None]:
def generate_train_mask(df, train_frac, randomize=True, seed=None):
    """Build a boolean train/test mask with one entry per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df.shape[0]`` is used.
    train_frac : float
        Fraction of rows to mark as training rows; ``int(train_frac * n_rows)``
        entries are set to True.
    randomize : bool, default True
        If True the True entries are shuffled to random positions. If False the
        first ``int(train_frac * n_rows)`` rows are the training rows.
        (Previously this flag was accepted but ignored.)
    seed : int or None, default None
        Optional seed for a reproducible shuffle. With ``None`` the global
        ``numpy.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray of bool, shape (n_rows,)
        True marks a training row.
    """
    n_rows = df.shape[0]
    train_mask = np.zeros(n_rows, dtype=bool)
    train_mask[:int(train_frac * n_rows)] = True
    if randomize:
        if seed is None:
            np.random.shuffle(train_mask)
        else:
            np.random.RandomState(seed).shuffle(train_mask)
    return train_mask
    
    

In [None]:
# Random split: True entries of `mask` are training rows, the rest are held out.
mask = generate_train_mask(df, train_frac=0.615)

# Feature columns are the pairwise-distance columns 'i-j' for 0 <= i < j < 8.
columns = ['{0}-{1}'.format(*pair) for pair in combinations(range(8), 2)]

train_rows = df.loc[mask]
test_rows = df.loc[~mask]
X_train, y_train = train_rows[columns].values, train_rows['weight'].values
X_test, y_test = test_rows[columns].values, test_rows['weight'].values

# Ordinary least squares on the training rows; the cell displays the score
# (R^2) on the held-out rows.
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)



In [None]:
# For every retained GTSF fish: compute its pairwise keypoint distances, look up
# the closest rows of the reference table `df`, and record predictions next to
# the ground truth.
#
# NOTE(review): `number_of_parts` and `reverse_mapping` are not defined in the
# visible part of this notebook - confirm where they come from.
# NOTE(review): the positional indexing below (`[:, :-1]` as measurements, `[:, -1]`
# as volume, columns "2-3" / "4-6" as length / height) looks like it expects `df`
# to be a Blender measurement table with the volume in the last column, and
# `data['coordinates']` to be row-aligned with it - confirm against the `df`
# that is actually in scope when this cell runs.

# Rows are collected in a list and converted once at the end (DataFrame.append
# in a loop is quadratic and was removed in pandas 2.0).
analysis_records = []

predictions = []
predictions_average = []
ground_truth = []

# `df` is not modified inside the loop, so convert it to an array only once.
reference_table = np.array(df)
n_closest = 10

for sfp in sfps:

    if sfp.gtsf_fish_identifier == '190321010002':
        continue
    ground_truth_metadata = json.loads(sfp.ground_truth_metadata)
    if ground_truth_metadata['data'].get('species') != 'salmon':
        continue

    # load keypoints
    world_keypoints_from_db = json.loads(sfp.world_keypoint_coordinates)
    world_keypoints = {bp: np.array(coords) for bp, coords in world_keypoints_from_db.items()}

    # Pairwise distances, multiplied by 1000.
    # NOTE(review): presumably a metres -> millimetres conversion to match the
    # reference table (the previous comment said "mm to m", which is the reverse
    # of what a factor of 1000 does) - confirm the units.
    measurements = []
    for k in range(number_of_parts):
        v = world_keypoints[reverse_mapping[str(k)]]
        for k0 in range(k+1, number_of_parts):
            v0 = world_keypoints[reverse_mapping[str(k0)]]
            measurements.append(euclidean_distance(v, v0) * 1000)

    # find closest reference rows: mean absolute (L1) difference, NaNs ignored
    diff = np.nanmean(np.abs(reference_table[:, :-1] - measurements), axis=1)
    closest = np.argsort(diff)
    closest_volumes = reference_table[closest[:n_closest], -1]
    print("closest volumes", closest_volumes)
    print("standard dev:", np.std(closest_volumes))
    print("estimated length", measurements[13])
    closest_length = np.array(list(df["2-3"].iloc[closest[:n_closest]]))
    kfactor = 10**5*closest_volumes / closest_length**3
    print("closest length", closest_length)
    print("closest kfactor", kfactor)
    print("closest height", list(df["4-6"].iloc[closest[:n_closest]]))
    print("#"*50)

    # prediction from the single nearest row, and from the mean of the n_closest nearest
    pred_volume = reference_table[closest[0], -1]
    predictions.append(pred_volume)
    pred_avg = np.mean(closest_volumes)
    predictions_average.append(pred_avg)

    # keypoints of the nearest Blender entry, divided by 1000 (inverse of the scaling above)
    closest_wkps = data['coordinates'][closest[0]]
    closest_wkps = {bp: [x / 1e3 for x in coords] for bp, coords in closest_wkps.items()}

    # ground truth (metadata was already parsed at the top of the iteration)
    ground_truth_data = ground_truth_metadata['data']
    ground_truth_weight = ground_truth_data['weight']
    ground_truth_width = ground_truth_data['width']
    ground_truth_breadth = ground_truth_data['breath']
    ground_truth_length = ground_truth_data['length']
    ground_truth_kfactor = 1e5 * (ground_truth_weight / ground_truth_length**3)

    ground_truth.append([ground_truth_weight, ground_truth_kfactor])

    analysis_records.append({
        'gtsf_data_collection_id': sfp.gtsf_data_collection_id,
        'gtsf_fish_identifier': int(sfp.gtsf_fish_identifier),
        'ground_truth_weight': ground_truth_weight,
        'ground_truth_length': ground_truth_length,
        'ground_truth_width': ground_truth_width,
        'ground_truth_breadth': ground_truth_breadth,
        'ground_truth_kfactor': ground_truth_kfactor,
        'pred_volume': pred_volume,
        'pred_avg_volume': pred_avg,
        'pred_length': closest_length[0],
        'wkps': sfp.world_keypoint_coordinates,
        'closest_wkps': json.dumps(closest_wkps),
        'closest_idx': closest[0]
    })

analysis_df = pd.DataFrame(analysis_records)


In [None]:
# Turn the per-fish accumulators into arrays and split the ground truth into
# its two columns (weight, k-factor).
predictions_average = np.array(predictions_average)
ground_truth = np.array(ground_truth)
gt_weight, gt_kfactor = ground_truth[:, 0], ground_truth[:, 1]

# Single-feature linear fit from predicted volume to ground-truth weight.
# scikit-learn expects a 2-D design matrix, hence the extra axis.
volume_column = np.array(predictions)[:, np.newaxis]
reg = LinearRegression()
reg.fit(volume_column, gt_weight)
print(reg.coef_, reg.intercept_)
print("R2 : {}".format(reg.score(volume_column, gt_weight)))
predictions = np.squeeze(volume_column)

# Calibrated weight prediction and its error relative to the ground truth.
analysis_df['prediction'] = predictions * reg.coef_ + reg.intercept_
analysis_df['error'] = analysis_df['prediction'] - analysis_df['ground_truth_weight']
analysis_df['error_pct'] = analysis_df['error'] / analysis_df['ground_truth_weight']
analysis_df['abs_error_pct'] = analysis_df['error_pct'].abs()

In [None]:
# When the notebook is run top to bottom, `reg` has by now been re-bound to a
# model fitted on a single feature (predicted volume), so calling it on the
# pairwise-distance columns fails on a feature-count mismatch. Fit a dedicated
# model on the keypoint-distance training split instead. `reg` itself is left
# untouched because a later plotting cell still reads its coefficients.
keypoint_reg = LinearRegression().fit(X_train, y_train)
y_pred = keypoint_reg.predict(df[columns].values)
df['prediction'] = y_pred
df['error'] = df.prediction - df.weight
df['error_pct'] = df.error / df.weight
df['abs_error_pct'] = df.error_pct.abs()

In [None]:
%matplotlib inline
plt.figure(figsize=(15, 10))
plt.grid()
plt.xlabel('K Factor')
plt.ylabel('Error')
plt.scatter(df.loc[~mask, 'weight'], df.loc[~mask, 'error'])


In [None]:
# Calibrated prediction vs. ground-truth weight, coloured by ground-truth
# k-factor; the dashed red diagonal marks perfect agreement.
calibrated_weight = predictions * reg.coef_ + reg.intercept_

plt.figure(figsize=(10, 10))
plt.plot([0, 5000], [0, 5000], linestyle="--", color="r", linewidth=2)
weight_scatter = plt.scatter(gt_weight, calibrated_weight, c=gt_kfactor)
plt.colorbar(weight_scatter)
plt.xlabel("Ground truth weight")
plt.ylabel("Predicted weight")
plt.axis("scaled")
plt.show()

In [None]:
# Display the per-fish analysis table, smallest absolute percentage error first.
analysis_df.sort_values(by='abs_error_pct')

In [None]:
N = 1000

# Repeated random sub-sampling validation: N independent random train/test
# splits, each with a fresh fit. (This is not N-fold cross validation - the
# held-out sets of different iterations overlap.)

# The feature column list does not depend on the iteration, so build it once.
columns = ['{0}-{1}'.format(x, y) for x, y in combinations(range(8), 2)]

abs_err_pcts = []
for i in range(N):
    if i % 10 == 0:
        print(i)
    mask = generate_train_mask(df, train_frac=0.615)

    X_train = df.loc[mask, columns].values
    y_train = df.loc[mask, 'weight'].values
    X_test = df.loc[~mask, columns].values
    y_test = df.loc[~mask, 'weight'].values

    reg = LinearRegression().fit(X_train, y_train)

    # Predictions are written back for every row (train and test); after the
    # loop `df`, `mask` and `reg` therefore reflect the last split only.
    y_pred = reg.predict(df[columns].values)
    df['prediction'] = y_pred
    df['error'] = df.prediction - df.weight
    df['error_pct'] = df.error / df.weight
    df['abs_error_pct'] = df.error_pct.abs()

    # Error of the *average* predicted weight over the held-out rows, relative
    # to their average true weight.
    avg_biomass_err = df.loc[~mask, 'prediction'].mean() - df.loc[~mask, 'weight'].mean()
    abs_err_pct = abs(avg_biomass_err) / df.loc[~mask, 'weight'].mean()
    abs_err_pcts.append(abs_err_pct)
    
    




In [None]:
# Sorted out-of-sample errors against their empirical cumulative probability
# (the i-th smallest of n values gets p = i / (n - 1)).
data_sorted = sorted(abs_err_pcts)
n_runs = len(data_sorted)
p = 1.0 * np.arange(n_runs) / (n_runs - 1)

fig = plt.figure(figsize=(30, 7))
ax1 = fig.add_subplot(121)
ax1.plot(p, data_sorted)
ax1.set_xlabel('p')
ax1.set_ylabel('OOS error percentage')
ax1.axvline(x=0.95, linestyle='--', color='red', label='p = 0.95')
ax1.set_title('CDF of OOS errors (sample size = 250)')
ax1.legend()
ax1.grid()



In [None]:
%matplotlib inline
plt.figure(figsize=(20, 10))
plt.scatter(df[~mask].weight, df[~mask].prediction)
plt.xlabel('Ground Truth Weight')
plt.ylabel('Prediction')
plt.plot(range(5000), range(5000))


<h1> Conduct Error Analysis </h1>

<h2> Plot 3D coordinates of keypoints </h2>

In [None]:
%matplotlib notebook

def plot_3D_keypoints(norm_wkps, norm_closest_wkps):
    """Draw two keypoint sets in one labelled 3D scatter plot.

    ``norm_wkps`` is drawn in blue and ``norm_closest_wkps`` in red. Both map
    body-part name -> 3 coordinates; the body parts plotted (and their order)
    are the keys of ``norm_wkps``, so ``norm_closest_wkps`` must contain them
    all. The x axis spans 0 .. max x of ``norm_wkps``; the y and z axes are
    fixed to [-0.3, 0.3]. Nothing is returned.
    """
    body_parts = norm_wkps.keys()

    def _coordinates(points):
        # (xs, ys, zs) lists ordered like body_parts
        return tuple([points[bp][axis] for bp in body_parts] for axis in range(3))

    first_xyz = _coordinates(norm_wkps)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_xlim3d(0, max(first_xyz[0]))
    ax.set_ylim3d(-0.3, 0.3)
    ax.set_zlim3d(-0.3, 0.3)

    def _draw(xyz, color):
        # scatter the points, then annotate each one with its body-part name
        xs, ys, zs = xyz
        ax.scatter(xs, ys, zs, color=color)
        for label, x, y, z in zip(body_parts, xs, ys, zs):
            ax.text(x, y, z, label, size=5, color=color)

    _draw(first_xyz, 'blue')
    _draw(_coordinates(norm_closest_wkps), 'red')

    # equal axis scaling is intentionally left disabled:
    # plt.axis('scaled')
        
    

def plot_fish_id(gtsf_fish_identifier):
    """Plot one fish's normalized keypoints together with its closest match.

    Takes the first row of ``analysis_df`` whose ``gtsf_fish_identifier``
    equals the argument, decodes the JSON in its ``wkps`` and ``closest_wkps``
    columns, normalizes both keypoint sets with ``normalize_world_keypoints``
    and passes them to ``plot_3D_keypoints``. ``.iloc[0]`` raises IndexError
    when no row matches.
    """
    matches = analysis_df[analysis_df.gtsf_fish_identifier == gtsf_fish_identifier]
    record = matches.iloc[0]

    raw_wkps = json.loads(record.wkps)
    body_parts = raw_wkps.keys()

    def _as_arrays(points):
        # list coordinates -> numpy arrays, restricted to the fish's body parts
        return {bp: np.array(points[bp]) for bp in body_parts}

    # GTSF keypoints of the fish itself
    norm_wkps = normalize_world_keypoints(_as_arrays(raw_wkps))

    # keypoints stored for its closest Blender model
    raw_closest_wkps = json.loads(record.closest_wkps)
    norm_closest_wkps = normalize_world_keypoints(_as_arrays(raw_closest_wkps))

    plot_3D_keypoints(norm_wkps, norm_closest_wkps)

In [None]:
%matplotlib notebook
# Example: plot the normalized keypoints of one fish (looked up by its GTSF
# identifier in analysis_df) together with those of its closest match.
plot_fish_id(190325010034)