# GTSF phase: biomass prediction

In this notebook, we forecast fish weights by finding the closest Blender model for each set of keypoint measurements.

In [None]:
# Canonical keypoint ordering: a keypoint's position in this list is its integer id.
kps = [
    "TAIL_NOTCH",
    "ADIPOSE_FIN",
    "UPPER_LIP",
    "ANAL_FIN",
    "PELVIC_FIN",
    "EYE",
    "PECTORAL_FIN",
    "DORSAL_FIN",
]
# Keypoint name -> integer id, derived from the list position.
keypoints_order = {name: idx for (idx, name) in enumerate(kps)}
# Stringified id -> keypoint name (keys are strings such as "0").
rkps = {str(idx): name for (name, idx) in keypoints_order.items()}

In [None]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Look at the volumes created with blender

Load blender data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

In [None]:
# Load the Blender export into `data`. Later cells read its "volume",
# "coordinates", "mapping" and "reverse_mapping" entries.
with open("/root/thomas/blender/volumes_all.json", "r") as f:
    data = json.load(f)

Some plot

In [None]:
# plt.scatter(np.array(data["dimensions"])[:, 1], data["volume"])
# plt.ylabel("Volume (cm^3)")
# plt.xlabel("Length (mm)")
# plt.show()

In [None]:
# Distribution of the volumes stored in the Blender export.
plt.hist(data["volume"])
plt.title("Blender volume histogram")
plt.show()

Calculate pairwise distances from blender data

In [None]:
# Part mapping stored with the Blender export; its values are integer part ids
# (the next cell takes their maximum to count the parts).
mapping = data["mapping"]
# Id -> keypoint-name lookup. The notebook's own `rkps` table is used here; the
# original cell first read data["reverse_mapping"] and immediately overwrote it,
# so that dead read has been removed.
reverse_mapping = rkps

In [None]:
# Part ids start at 0, so the number of parts is the largest id plus one.
number_of_parts = max(mapping.values()) + 1

In [None]:
# Build the Blender reference table: for every model, the distance between each
# unordered pair of keypoints (column "k-k0" with k < k0) plus the model volume.
#
# The "volume" column is inserted LAST on purpose. Downstream cells address the
# table positionally (all-but-last columns = distances, last column = volume),
# and pandas keeps dict insertion order on current Python/pandas versions, so
# starting the dict with "volume" would put it first and break that indexing.
dataset = {}
volumes = []
dataset_np = []  # the same distances as plain per-model rows
kfactors = []    # never filled in this notebook; kept so the name still exists
for (coord, vol) in zip(data["coordinates"], data["volume"]):
    row = []
    for k in range(number_of_parts):
        v = coord[reverse_mapping[str(k)]]
        for k0 in range(k + 1, number_of_parts):
            v0 = coord[reverse_mapping[str(k0)]]
            # Only components 1 and 2 of each coordinate are used here.
            # NOTE(review): the forecasting cell takes np.linalg.norm over the
            # whole world-coordinate vector instead -- confirm that comparing a
            # 2-component distance with a full-vector distance is intended.
            dist = np.sqrt((v[2] - v0[2]) ** 2 + (v[1] - v0[1]) ** 2)
            cname = "{}-{}".format(k, k0)
            row.append(dist)
            dataset.setdefault(cname, []).append(dist)
    dataset_np.append(row)
    volumes.append(vol)
dataset["volume"] = volumes

Create a pandas DataFrame

In [None]:
# One row per Blender model; columns are the keys of `dataset`
# (the pairwise-distance columns and "volume").
df = pd.DataFrame(data=dataset)
df.head()

In [None]:
# Plot the "2-3" pairwise distance of every Blender model against its row index.
plt.plot(df["2-3"], "o")

## Look at the gtsf data

Loading the gtsf data points and creating the pairwise distances

In [None]:
import json
import os

In [None]:
# Annotation files to load; each one is read with json.load in the next cell.
jsonfiles = ['/root/thomas/biomass_kp_predictions_val.json']

In [None]:
# Concatenate the contents of every annotation file into one flat list.
annotations = []
for jsonpath in jsonfiles:
    with open(jsonpath, "r") as handle:
        annotations.extend(json.load(handle))
print("Number of annotations: {}".format(len(annotations)))

Add the local path for ease and rename the body parts

In [None]:
# For every annotation: derive the local image path and normalise the label
# names (drop ":", join words with "_", upper-case).
for ann in annotations:
    # Keep everything from the 8th "/"-separated component of the
    # "Labeled Data" value onward and re-root it under the local data folder.
    local_path = os.path.join("/root/data/gtsf_phase_I/",
                              "/".join(ann["Labeled Data"].split("/")[7:]))
    ann["local_path"] = local_path
    if not os.path.isfile(local_path):
        print(local_path)
        print("missing image!!")
    # Iterate over a snapshot of the keys: the loop pops and re-inserts entries,
    # and mutating a dict while iterating its live key view can raise
    # RuntimeError or visit/skip keys unpredictably.
    for body_part in list(ann["Label"].keys()):
        new_body_part = "_".join(body_part.replace(":", "").split()).upper()
        ann["Label"][new_body_part] = ann["Label"].pop(body_part)

In [None]:
from sqlalchemy import create_engine
from sqlalchemy import MetaData
from sqlalchemy import Table, select, func, and_, insert, delete, update, or_

from tqdm import tqdm

In [None]:
# Read the database credentials from a local JSON file. The context manager
# closes the file deterministically; the original left the handle open.
with open("/root/thomas/sql_research_credentials.json") as credentials_file:
    sql_credentials = json.load(credentials_file)

# NOTE(review): the credentials are interpolated into the URL verbatim; a
# password containing characters such as "@" or "/" would need URL-escaping.
sql_engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(sql_credentials["user"], sql_credentials["password"],
                                         sql_credentials["host"], sql_credentials["port"],
                                         sql_credentials["database"]))

# Reflect the table definition from the live database.
metadata = MetaData()
gtsf = Table('gtsf_data_collections', metadata, autoload=True, autoload_with=sql_engine)

Get all the timestamps

In [None]:
# The third-from-last component of each local image path is used as the
# annotation's timestamp; it is stored on the annotation and collected in order.
for ann in annotations:
    ann["timestamp"] = ann["local_path"].split("/")[-3]
timestamps = [ann["timestamp"] for ann in annotations]

Query

In [None]:
# Fetch the ground-truth metadata of every fish whose identifier matches one of
# the annotation timestamps.
query = select([gtsf.c.ground_truth_metadata,
                gtsf.c.gtsf_fish_identifier]).select_from(gtsf).where(gtsf.c.gtsf_fish_identifier.in_(timestamps))
# Use the connection as a context manager so it is returned to the pool even if
# the query or the parsing fails; the original never closed it.
with sql_engine.connect() as connection:
    q = connection.execute(query)
    # WARNING(security): eval() executes whatever text is stored in
    # ground_truth_metadata. Only acceptable while the database content is fully
    # trusted; consider ast.literal_eval or json.loads if the stored format allows.
    results = [(eval(r[0]), r[1]) for r in q]

In [None]:
# Attach the ground-truth measurements to each annotation.
# Index the query results by fish identifier once instead of rescanning the
# whole result list per annotation. setdefault keeps the FIRST row for an
# identifier, matching the original loop's `break` on the first hit.
metadata_by_identifier = {}
for (fish_metadata, fish_identifier) in results:
    metadata_by_identifier.setdefault(fish_identifier, fish_metadata)

unmatched_timestamps = []
for ann in annotations:
    if ann["timestamp"] not in metadata_by_identifier:
        # No database row: the annotation gets no weight/kfactor fields.
        unmatched_timestamps.append(ann["timestamp"])
        continue
    measured = metadata_by_identifier[ann["timestamp"]]["data"]
    ann["weight"] = measured["weight"]
    ann["breath"] = measured["breath"]
    ann["length"] = measured["length"]
    ann["width"] = measured["width"]
    # K factor computed as 10^5 * weight / length^3.
    ann["kfactor"] = 10**5*ann["weight"] / ann["length"]**3
    # Records without a species entry default to "salmon".
    ann["species"] = measured.get("species", "salmon")

if unmatched_timestamps:
    # Surface the gap here; cells further down read ann["weight"] / ann["kfactor"]
    # and would otherwise fail with a bare KeyError.
    print("No ground truth found for {} annotation(s)".format(len(unmatched_timestamps)))

In [None]:
# Histogram of the K factor attached to the annotations.
# Reads ann["kfactor"] from every annotation, so each one must have been matched
# to a ground-truth record in the previous cell.
kfactor = np.array([ann["kfactor"] for ann in annotations])
plt.hist(kfactor)
plt.title("K factor distribution of GTSF data")
plt.xlabel("K factor")
plt.show()

### 2D to 3D 

Move from 2d pixel coordinates to 3d world coordinates. First, need to create pairs

Creating pairs below

In [None]:
from aquabyte.biomass import BiomassAnnotation

In [None]:
# Project helper built from the annotations and the keypoint order; later cells
# read `bio.full_pairs` and `bio.pairs[ts]['left'|'right']['predictions']`.
bio = BiomassAnnotation(annotations, kps)

Some plotting

In [None]:
# Visual sanity check; presumably draws one left/right pair -- see aquabyte.biomass.
pair = bio.plot_pair()

Match the keypoints and create world coordinates

In [None]:
from aquabyte.optics import depth_from_disp, convert_to_world_point

In [None]:
# Camera parameters handed to depth_from_disp / convert_to_world_point.
# The focal length in pixels is the focal length divided by the pixel size, so
# both are named once and reused instead of repeating the literals.
focal_length = 0.0084366
pixel_size_m = 3.45 * 1e-6

params = dict(
    FOCAL_LENGTH=focal_length,
    BASELINE=0.128096,
    PIXEL_SIZE_M=pixel_size_m,
    FOCAL_LENGTH_PIXEL=focal_length / pixel_size_m,
    IMAGE_SENSOR_WIDTH=0.01412,
    IMAGE_SENSOR_HEIGHT=0.01035,
    PIXEL_COUNT_WIDTH=4096,
    PIXEL_COUNT_HEIGHT=3000,
)

In [None]:
# Jitter settings. In the visible notebook they are only referenced by the
# commented-out bio.load_keypoints(...) calls in the next cell.
jitter = {"jitter": False, "delta": 50}

In [None]:
# Triangulate every complete stereo pair: world[ts] maps keypoint name -> world
# coordinate as returned by convert_to_world_point.
# (bio.load_keypoints(ts, side, jitter) is the alternative keypoint source; the
# stored predictions are used here.)
world = {}
for ts in bio.full_pairs:
    stereo_pair = bio.pairs[ts]
    left_keypoints = np.array(stereo_pair['left']['predictions'])
    right_keypoints = np.array(stereo_pair['right']['predictions'])

    # Disparity per keypoint: difference of column 0 between the left and the
    # right prediction (presumably the horizontal pixel coordinate -- confirm).
    disparities = left_keypoints[:, 0] - right_keypoints[:, 0]

    # Depth from disparity, then back-projection of the LEFT keypoint.
    world_keypoints = {
        kps[i]: convert_to_world_point(left_keypoints[i, 0],
                                       left_keypoints[i, 1],
                                       depth_from_disp(d, params),
                                       params)
        for (i, d) in enumerate(disparities)
    }
    world[ts] = world_keypoints

Plot the keypoints: first the left-image keypoints of the last pair processed, then the world coordinates of every pair

In [None]:
# NOTE(review): `left_keypoints` is whatever the last iteration of the
# world-coordinate loop left behind, so this shows the left-image keypoints of
# one pair only (the last one processed), labelled with their names.
plt.scatter(left_keypoints[:, 0], left_keypoints[:, 1])
for i in range(number_of_parts):
    plt.text(left_keypoints[i, 0], left_keypoints[i, 1], kps[i])
plt.axis("scaled")
plt.show()

In [None]:
# One figure per pair: components 0 and 2 of every world keypoint, labelled
# with the keypoint name.
for (ts, keypoints_3d) in world.items():
    plt.figure(figsize=(15, 10))
    for (k, v) in keypoints_3d.items():
        plt.scatter(v[0], v[2])
        # Small offset so the label does not sit on top of the marker.
        plt.text(v[0]+0.003, v[2]+0.003, k)
    # Equal aspect ratio is set once per figure, after all points are drawn;
    # the original re-applied it after every single point.
    plt.axis("scaled")
    # Title each figure with its pair key so the plots can be told apart.
    plt.title(str(ts))
    plt.show()

### Forecasting

First, let's calculate the pairwise distances for the GTSF data. Second, let's find the closest Blender model.

In [None]:
# Nearest-neighbour forecast: for every stereo pair, measure all pairwise
# keypoint distances and look up the Blender models whose distances are closest
# (mean absolute difference). The closest model's volume is the prediction.

# Column names of the pairwise distances, generated in exactly the nested order
# in which `measurements` is filled below. Selecting the Blender columns BY NAME
# keeps features and measurements aligned whatever the column order of `df` is;
# the original sliced positionally and assumed "volume" was the last column.
pair_columns = ["{}-{}".format(k, k0)
                for k in range(number_of_parts)
                for k0 in range(k + 1, number_of_parts)]
blender_distances = np.asarray(df[pair_columns])  # hoisted: invariant across pairs
blender_volumes = np.asarray(df["volume"])
# Position of the "2-3" distance, reported as the estimated length
# (index 13 when there are 8 parts).
length_position = pair_columns.index("2-3")
# Number of nearest Blender models inspected per pair.
idx = 10

# First annotation seen for each timestamp (the original took element [0] of a
# per-iteration scan over all annotations).
annotation_by_timestamp = {}
for ann in annotations:
    annotation_by_timestamp.setdefault(ann["timestamp"], ann)

predictions = []
predictions_average = []
ground_truth = []

for ts in world:
    # load keypoints
    world_keypoints = world[ts]
    # calculate distances; the factor 1000 rescales the world-coordinate norm
    # (presumably metres -> millimetres to match the Blender table -- confirm)
    measurements = []
    for k in range(number_of_parts):
        v = world_keypoints[rkps[str(k)]]
        for k0 in range(k+1, number_of_parts):
            v0 = world_keypoints[rkps[str(k0)]]
            dist = np.linalg.norm(v - v0)*1000
            measurements.append(dist)
    print(measurements)
    # find closest blender volume: mean absolute difference, NaNs ignored
    diff = np.nanmean(np.abs(blender_distances - measurements), axis=1)
    closest = np.argsort(diff)
    nearest = closest[:idx]
    closest_volumes = blender_volumes[nearest]
    print("closest volumes", closest_volumes)
    print("standard dev:", np.std(closest_volumes))
    print("estimated length", measurements[length_position])
    closest_length = np.asarray(df["2-3"].iloc[nearest])
    # Named closest_kfactor so it no longer overwrites the `kfactor` array of
    # the K-factor histogram cell.
    closest_kfactor = 10**5*closest_volumes / closest_length**3
    print("closest length", closest_length)
    print("closest kfactor", closest_kfactor)
    print("closest height", list(df["4-6"].iloc[nearest]))
    print("#"*50)
    predictions.append(blender_volumes[closest[0]])
    predictions_average.append(np.mean(closest_volumes))

    # ground truth
    gt_annotation = annotation_by_timestamp[ts]
    ground_truth.append([gt_annotation["weight"], gt_annotation["kfactor"]])


In [None]:
# Convert the accumulated lists to arrays and split the ground truth, whose rows
# are [weight, kfactor], into its two columns.
predictions, predictions_average, ground_truth = (
    np.array(predictions),
    np.array(predictions_average),
    np.array(ground_truth),
)
gt_weight, gt_kfactor = ground_truth[:, 0], ground_truth[:, 1]

In [None]:
# Distribution of the ground-truth weights.
plt.hist(gt_weight)

In [None]:
# Distribution of the predicted values (volume of the closest Blender model).
plt.hist(predictions)

**OLD CODE**

Quick OLS. 

$\hat{\beta} = (X^{T}X)^{-1}X^{T}Y$

(just for Alok)

In [None]:
# ground_truth = ground_truth[:, np.newaxis]
# ground_truth.shape
# A = np.linalg.inv(np.matmul(ground_truth.transpose(), ground_truth))
# B = np.matmul(ground_truth.transpose(), predictions)
# coeff = 1 / (A*B)
# print("Reg coeff: {}".format(coeff))
# plt.figure(figsize=(10, 10))
# plt.plot([0, 5000], [0, 5000], "--", c="r", linewidth=2)
# plt.scatter(ground_truth, predictions*coeff)
# #plt.scatter(ground_truth, predictions)
# plt.xlabel("Ground truth weight")
# plt.ylabel("Predicted weight")
# plt.axis("scaled")
# plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ground-truth weight as a linear function of the predicted value.
# scikit-learn expects a 2-D (n_samples, n_features) matrix, so a column view is
# built under its own name. The original reshaped `predictions` in place and
# squeezed it back afterwards, which collapses a single prediction to a 0-d
# array and leaves `predictions` 2-D if the cell fails midway.
predictions_2d = predictions.reshape(-1, 1)
reg = LinearRegression().fit(predictions_2d, gt_weight)
print(reg.coef_, reg.intercept_)
print("R2 : {}".format(reg.score(predictions_2d, gt_weight)))

In [None]:
# Raw predictions (before applying the fitted line) against ground-truth weight.
plt.scatter(gt_weight, predictions)
plt.show()

In [None]:
# Predictions after applying the fitted line, against ground-truth weight,
# coloured by ground-truth K factor. The dashed red line from (0, 0) to
# (5000, 5000) marks where prediction equals ground truth.
plt.figure(figsize=(10, 10))
plt.plot([0, 5000], [0, 5000], "--", c="r", linewidth=2)
plt.scatter(gt_weight, predictions*reg.coef_ + reg.intercept_, c=gt_kfactor)
#plt.scatter(ground_truth, predictions)
plt.xlabel("Ground truth weight")
plt.ylabel("Predicted weight")
plt.colorbar()
# Fix the colour scale so K factors are shown on the range 0.8 to 1.6.
plt.clim([0.8, 1.6])
plt.axis("scaled")
plt.show()

**Linear reg New code**

In [None]:
from sklearn.linear_model import LinearRegression
from aquabyte.biomass import BiomassAccuracy

In [None]:
# Free-text description handed to BiomassAccuracy.
# NOTE(review): the split value 0.3 is repeated as `split_size=0.3` in the next
# cell; the two literals must be kept in sync by hand.
description = 'Biomass prediction from keypoint predictions. Total population {} pairs, split {}'.format(len(bio.full_pairs), 0.3)

In [None]:
# Project helper for the accuracy analysis. `ground_truth` is the 2-column
# [weight, kfactor] array and `predictions` the closest-model volumes.
# NOTE(review): split_size is presumably a train/test fraction -- confirm in aquabyte.biomass.
bioacc = BiomassAccuracy(ground_truth, predictions, description, split_size=0.3)

In [None]:
# Project plotting helper; presumably K-factor related -- see aquabyte.biomass.
bioacc.plot_kf()

In [None]:
# The result is indexed with "error" and "relative_error" in the next cells.
errors = bioacc.calculate_errors()

In [None]:
# Plot the "error" entry of the computed errors.
bioacc.plot_with_density(errors["error"])

In [None]:
# Plot the "relative_error" entry of the computed errors.
bioacc.plot_with_density(errors["relative_error"])

**Cross validation**

In [None]:
# Project plotting helper shown under the "Cross validation" heading.
# NOTE(review): what is cross-validated is not visible from this notebook.
bioacc.plot_errors()

In [None]:
# bioacc.plot_sample_curve()