# GTSF phase: biomass prediction

In this notebook, we forecast the weights by finding the closest Blender model.

In [None]:
%load_ext autoreload
%autoreload 2

### Look at the volumes created with blender

Load blender data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

In [None]:
# Read the Blender-generated JSON export into `data`; later cells index it by the
# keys "volume", "coordinates", "mapping", "reverse_mapping" and "coeff".
with open("/root/thomas/blender/volumes_all.json", "r") as blender_file:
    data = json.load(blender_file)

Some plot

In [None]:
# plt.scatter(np.array(data["dimensions"])[:, 1], data["volume"])
# plt.ylabel("Volume (cm^3)")
# plt.xlabel("Length (mm)")
# plt.show()

In [None]:
plt.hist(data["volume"])
plt.title("Blender volume histogram")
plt.show()

Calculate pairwise distances from blender data

In [None]:
mapping = data["mapping"]
reverse_mapping = data["reverse_mapping"]

In [None]:
number_of_parts = max(list(mapping.values()))+1

In [None]:
# Build, for every Blender model, the distance between each pair of body parts
# plus the model volume.
#
# The pair columns are created up front, in (k, k0) nesting order, and "volume"
# is added LAST.  The forecasting cell slices the resulting frame by position
# (features = all but the last column, target = last column), which only works
# if "volume" really ends up last; inserting it first relied on pandas sorting
# the dict keys alphabetically, which newer pandas versions no longer do.
pair_names = ["{}-{}".format(k, k0)
              for k in range(number_of_parts)
              for k0 in range(k + 1, number_of_parts)]
dataset = {cname: [] for cname in pair_names}
dataset["volume"] = []
dataset_np = []   # same distances as plain rows (one list per Blender model)
kfactors = []     # kept for backward compatibility; nothing in this cell fills it
for (coord, vol) in zip(data["coordinates"], data["volume"]):
    row = []
    for k in range(number_of_parts):
        v = coord[reverse_mapping[str(k)]]
        for k0 in range(k + 1, number_of_parts):
            v0 = coord[reverse_mapping[str(k0)]]
            # NOTE(review): only components 1 and 2 of each coordinate are used
            # here, whereas the GTSF measurements use the full vector norm --
            # confirm the Blender models are planar along component 0.
            dist = np.sqrt((v[2] - v0[2])**2 + (v[1] - v0[1])**2)
            row.append(dist)
            dataset["{}-{}".format(k, k0)].append(dist)
    dataset_np.append(row)
    dataset["volume"].append(vol)

Create a pandas DataFrame

In [None]:
df = pd.DataFrame(data=dataset)
df.head()

In [None]:
plt.plot(df["2-3"], "o")

## Look at the gtsf data

Loading the gtsf data points and creating the pairwise distances

In [None]:
import json
import os

In [None]:
jsonfiles = ['/root/data/rds/formatted.json']

In [None]:
# Concatenate the annotation records of every listed JSON file into one flat list.
annotations = []
for jsonpath in jsonfiles:
    with open(jsonpath, "r") as f:
        annotations.extend(json.load(f))
print("Number of annotations: {}".format(len(annotations)))

In [None]:
annotations[0]

In [None]:
# create pairs per timestamp
# Group the retained annotations by timestamp and camera side.
#   pairs[timestamp][side] -> annotation dict
#   new_annotations        -> every retained annotation, in input order
pairs = {}
new_annotations = []
for ann in annotations:
    # Retain only pen 4 / site 23 salmon whose k-factor is not below 0.3.
    wanted = (ann['pen_id'] == 4
              and ann['site_id'] == 23
              and ann["species"] == "salmon"
              and not ann["kfactor"] < 0.3)
    if not wanted:
        continue
    timestamp = ann["timestamp"]
    # The side is the part of the image file name before the first underscore.
    side = os.path.basename(ann["local_path"]).split("_")[0]
    ann["side"] = side
    pairs.setdefault(timestamp, {})[side] = ann
    new_annotations.append(ann)
# A pair is "full" when both a left and a right annotation exist for the timestamp.
full_pairs = [ts for ts, sides in pairs.items() if "left" in sides and "right" in sides]
print("Number of full pairs: {}".format(len(full_pairs)))

### 2D to 3D 

Move from 2D pixel coordinates to 3D world coordinates. First, we need to create pairs.

Creating pairs below

In [None]:
from aquabyte.biomass import BiomassAnnotation

In [None]:
bio = BiomassAnnotation(new_annotations, mapping)

Some plotting

In [None]:
# pair = bio.plot_pair()

Match the keypoints and create world coordinates

In [None]:
from aquabyte.optics import depth_from_disp, convert_to_world_point

In [None]:
# Camera parameters handed to depth_from_disp / convert_to_world_point.
#
# Earlier parameter set (disabled), kept for reference:
#   FOCAL_LENGTH        0.0084366
#   BASELINE            0.128096
#   PIXEL_SIZE_M        3.45 * 1e-6
#   FOCAL_LENGTH_PIXEL  0.0084366 / (3.45 * 1e-6)
#   IMAGE_SENSOR_WIDTH  0.01412
#   IMAGE_SENSOR_HEIGHT 0.01035
#   PIXEL_COUNT_WIDTH   4096
#   PIXEL_COUNT_HEIGHT  3000
params = dict(
    BASELINE=0.10019751688037272,
    FOCAL_LENGTH=0.013658357173918818,
    FOCAL_LENGTH_PIXEL=3958.944108382266,
    IMAGE_SENSOR_HEIGHT=0.01035,
    IMAGE_SENSOR_WIDTH=0.01412,
    PIXEL_COUNT_HEIGHT=3000,
    PIXEL_COUNT_WIDTH=4096,
)

In [None]:
jitter = {"jitter": False, "delta": 50}

In [None]:
# For every full stereo pair, turn the matched 2-D keypoints into world coordinates.
#   world[ts][body_part_name] -> value returned by convert_to_world_point
world = {}
# Keypoint row i is labelled with the i-th key of `mapping`.
# NOTE(review): this presumes load_keypoints returns rows in `mapping` key order -- confirm.
keypoint_names = list(mapping.keys())
for ts in bio.full_pairs:
    left_keypoints = bio.load_keypoints(ts, 'left', jitter)
    right_keypoints = bio.load_keypoints(ts, 'right', jitter)

    # Disparity per keypoint: column 0 of the left image minus column 0 of the right.
    disparities = left_keypoints[:, 0] - right_keypoints[:, 0]

    # Disparity -> depth -> world point, using the left-image pixel position.
    world[ts] = {
        keypoint_names[i]: convert_to_world_point(
            left_keypoints[i, 0], left_keypoints[i, 1], depth_from_disp(d, params), params)
        for (i, d) in enumerate(disparities)
    }

Plot world coordinates

In [None]:
# plt.scatter(left_keypoints[:, 0], left_keypoints[:, 1])
# for i in range(number_of_parts):
#     plt.text(left_keypoints[i, 0], left_keypoints[i, 1], list(mapping.keys())[i])
# plt.show()

In [None]:
# plt.figure(figsize=(15, 10))
# for (k, v) in world['190226010005'].items():
#     plt.scatter(v[0], v[2])
#     plt.text(v[0]+0.003, v[2]+0.003, k)
#     plt.axis("scaled")
# plt.show()

### Forecasting

First, let's calculate the pairwise distances for the GTSF data. Second, let's find the closest Blender model.

In [None]:
# Predict a volume for every stereo pair by nearest-neighbour lookup in the
# Blender table `df`.
#
# Features and target are selected from `df` BY NAME.  Slicing the frame by
# position (all-but-last column / last column) silently uses the wrong columns
# whenever "volume" is not the last column or the pair columns are not in
# (k, k0) nesting order, which depends on the pandas version and column count.
pair_columns = ["{}-{}".format(k, k0)
                for k in range(number_of_parts)
                for k0 in range(k + 1, number_of_parts)]
# Converted once, outside the loop, instead of rebuilding the array per pair.
blender_distances = df[pair_columns].values
blender_volumes = df["volume"].values
n_closest = 10  # number of nearest Blender models averaged for predictions_average

predictions = []          # volume of the single closest Blender model
predictions_average = []  # mean volume of the n_closest closest Blender models
ground_truth = []         # stays empty while the ground-truth lines below are disabled
ids = []                  # keypoint_annotation_id of the left image of each pair
for ts in world:
    # load keypoints
    world_keypoints = world[ts]
    # Pairwise distances, generated in the same (k, k0) order as pair_columns.
    measurements = []
    for k in range(number_of_parts):
        v = world_keypoints[reverse_mapping[str(k)]]
        for k0 in range(k + 1, number_of_parts):
            v0 = world_keypoints[reverse_mapping[str(k0)]]
            # presumably metres -> millimetres to match the Blender distances; confirm units
            dist = np.linalg.norm(v - v0) * 1000
            measurements.append(dist)

    # Mean absolute difference to every Blender model, ignoring NaN entries.
    diff = np.nanmean(np.abs(blender_distances - measurements), axis=1)
    closest = np.argsort(diff)
    closest_volumes = blender_volumes[closest[:n_closest]]

    predictions.append(blender_volumes[closest[0]])
    predictions_average.append(np.mean(closest_volumes))
    ids.append(bio.pairs[ts]['left']['keypoint_annotation_id'])
    # ground truth
#     ground_truth_weight = [ann["weight"] for ann in annotations if ann["timestamp"] == ts][0]
#     ground_truth_kfactor = [ann["kfactor"] for ann in annotations if ann["timestamp"] == ts][0]
#     ground_truth.append([ground_truth_weight, ground_truth_kfactor])


In [None]:
predictions = np.array(predictions)
predictions_average = np.array(predictions_average)
ground_truth = np.array(ground_truth)
# gt_weight = ground_truth[:, 0]
# gt_kfactor = ground_truth[:, 1]

In [None]:
slope = data['coeff'][0]
intercept = data['coeff'][1]

In [None]:
# predictions = predictions*slope + intercept

In [None]:
plt.hist(predictions)

In [None]:
# Group the predictions by capture date: dates[date] -> list of predicted volumes.
#
# `predictions` was filled while iterating over `world`, so it is walked in
# lock-step with `world` here.  Indexing it with a counter while looping over
# the separately built `full_pairs` list mislabels predictions (or raises
# IndexError) as soon as the two collections differ in order or length.
dates = {}
for ts, prediction in zip(world, predictions):
    # The date is the path component following "date=" in the left image path.
    # NOTE(review): assumes every timestamp in `world` is also a key of `pairs`.
    date = pairs[ts]['left']['local_path'].split('date=')[1].split('/')[0]
    dates.setdefault(date, []).append(prediction)

In [None]:
# Per-day statistics of the predictions, in chronological (sorted-key) order.
days = sorted(dates)
# Alternative tried earlier: average of median and mean,
#   (np.median(dates[d]) + np.mean(dates[d])) / 2.0
daily_mean = [np.mean(dates[day]) for day in days]
daily_count = [len(dates[day]) for day in days]

# One figure per series: number of predictions per day, then their daily mean.
for series, title in ((daily_count, 'Daily count'),
                      (daily_mean, 'Daily average biomass (g)')):
    plt.plot(days, series)
    plt.title(title)
    plt.show()

In [None]:
data['coeff']

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
Y = np.array(daily_mean)
X = np.arange(1, len(daily_mean)+1)[:, np.newaxis]
logY = np.log(Y)
logX = np.log(X)

In [None]:
reg = LinearRegression().fit(X, logY)

In [None]:
plt.plot(X, logY)
plt.plot(X, X*reg.coef_ + reg.intercept_)
plt.show()

In [None]:
intercept = np.exp(reg.intercept_)
coef = reg.coef_

In [None]:
# Evaluate the fitted exponential intercept * exp(coef * t) on the day index X
# and overlay it on the observed daily means.
preds = intercept*np.exp(coef*X)
fit_title = '{} exp^{}t'.format(intercept, coef[0])

plt.plot(X, Y, 'b')
plt.plot(X, preds, 'r')
plt.legend(['data', 'fit'])
plt.title(fit_title)
plt.show()

In [None]:
# Residuals of the exponential fit against the daily means: absolute, then in percent.
fitted = preds.squeeze()
errors = fitted - Y
relative_error = (fitted - Y) / Y * 100

In [None]:
values = np.arange(0, 101, 5)
percentiles = np.percentile(np.abs(relative_error), values)

In [None]:
plt.figure(figsize=(10,10))
plt.plot(percentiles, values)
plt.yticks(np.arange(0,101,5))
plt.ylabel("Percentage")
plt.xlabel("Absolute relative error (%)")
plt.grid()
plt.show()

# REPOPULATE TABLE

In [None]:
import json

# Load the database connection settings.  The file is opened in a `with` block so
# the handle is closed deterministically instead of being left to the garbage collector.
with open('/root/thomas/sqlcredentials.json') as credentials_file:
    sql_credentials = json.load(credentials_file)

In [None]:
# Override the account used for the table update.
# The user name and password are NOT stored in the notebook: they are taken from
# the PGUSER / PGPASSWORD environment variables, or asked for interactively when
# those are unset.  The password that used to be hard-coded here should be rotated.
import getpass
import os

sql_credentials['user'] = os.environ.get('PGUSER') or input('PostgreSQL user: ')
sql_credentials['password'] = (
    os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: ')
)

In [None]:
# Show the connection settings with the password masked, so that the secret is
# not written into the saved notebook output.
{key: ('***' if key == 'password' else value) for key, value in sql_credentials.items()}

In [None]:
from sqlalchemy import create_engine, MetaData, Table
from tqdm import tqdm

In [None]:
# Assemble the PostgreSQL connection URL from the credential fields.
# NOTE(review): the fields are interpolated verbatim; a password containing
# characters such as '@' or '/' would need URL-escaping.
connection_url = "postgresql://{}:{}@{}:{}/{}".format(
    sql_credentials["user"],
    sql_credentials["password"],
    sql_credentials["host"],
    sql_credentials["port"],
    sql_credentials["database"],
)
sql_engine = create_engine(connection_url)

# Table object for 'biomass_computations'; autoload asks SQLAlchemy to read the
# column definitions from the database through sql_engine.
metadata = MetaData()
biomass_table = Table('biomass_computations', metadata, autoload=True, autoload_with=sql_engine)


In [None]:
ids[0]

In [None]:
predictions[0]

In [None]:
# Write each prediction into biomass_computations.estimated_biomass_g, matching
# rows on keypoint_annotation_id.
#
# `exc` is imported here because the except clause needs it and no other cell
# imports it; without it an integrity error turned into a NameError.
from sqlalchemy import exc

for annotation_id, prediction in tqdm(zip(ids, predictions), total=len(ids)):
    query = (
        biomass_table.update()
        .where(biomass_table.c.keypoint_annotation_id == annotation_id)
        .values(estimated_biomass_g=prediction)
    )
    # The connection is handed back when the `with` block exits, so the loop no
    # longer leaves one open connection behind per row.
    with sql_engine.connect() as connection:
        try:
            connection.execute(query)
        except exc.IntegrityError as e:
            # Best effort: report the failing statement and carry on with the next row.
            print("ERROR: failed query, {}".format(query))
            print(e)

**OLD CODE**

Quick OLS. 

$\hat{\beta} = (X^{T}X)^{-1}X^{T}Y$

(just for Alok)

In [None]:
# ground_truth = ground_truth[:, np.newaxis]
# ground_truth.shape
# A = np.linalg.inv(np.matmul(ground_truth.transpose(), ground_truth))
# B = np.matmul(ground_truth.transpose(), predictions)
# coeff = 1 / (A*B)
# print("Reg coeff: {}".format(coeff))
# plt.figure(figsize=(10, 10))
# plt.plot([0, 5000], [0, 5000], "--", c="r", linewidth=2)
# plt.scatter(ground_truth, predictions*coeff)
# #plt.scatter(ground_truth, predictions)
# plt.xlabel("Ground truth weight")
# plt.ylabel("Predicted weight")
# plt.axis("scaled")
# plt.show()

**Linear reg New code**

In [None]:
from sklearn.linear_model import LinearRegression
from aquabyte.biomass import BiomassAccuracy

In [None]:
# NOTE(review): `ground_truth` is an empty array at this point -- it is initialised
# to [] in the forecasting cell and the lines that append to it are commented out.
# Re-enable them before relying on anything computed from `bioacc`.
bioacc = BiomassAccuracy(ground_truth, predictions, 'test', split_size=0.3)

In [None]:
bioacc.reg.coef_

In [None]:
bioacc.reg.intercept_

In [None]:
bioacc.plot_kf()

In [None]:
errors = bioacc.calculate_errors()

In [None]:
bioacc.plot_with_density(errors["error"])

In [None]:
bioacc.plot_with_density(errors["relative_error"])

**Cross validation**

In [None]:
bioacc.plot_errors()

In [None]:
bioacc.plot_sample_curve()