# GTSF phase I: biomass prediction

In this notebook, we are forecasting the weights by finding the closest blender model

### Look at the volumes created with blender

Load blender data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import cv2

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from scipy.stats import norm
import tqdm
import pickle
from itertools import combinations
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.optics import euclidean_distance

from PIL import Image, ImageDraw
from multiprocessing import Pool, Manager
import copy
import uuid
from sklearn.preprocessing import StandardScaler

# Let pandas display up to 500 rows before truncating DataFrame output.
pd.set_option('display.max_rows', 500)





<h1> Get world keypoint coordinates from GTSF data </h1>

In [None]:
# Load the research database credentials from the JSON file whose path is
# stored in the SQL_CREDENTIALS environment variable. The context manager
# closes the file handle as soon as the credentials have been parsed.
with open(os.environ["SQL_CREDENTIALS"]) as credentials_file:
    research_sql_credentials = json.load(credentials_file)

# Open a SQLAlchemy session on the engine exposed by RDSAccessUtils.
research_rds_access_utils = RDSAccessUtils(research_sql_credentials)
sql_engine = research_rds_access_utils.sql_engine
Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing schema and bind the tables used in this notebook to
# automapped ORM classes.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs

<h1> Create training dataset </h1>

In [None]:
# Build the training table: one row per salmon stereo frame pair that has a
# recorded weight. Each row holds the pairwise distances between world
# keypoints, the ground-truth weight / length / k-factor, a curvature angle,
# and identifying metadata.
session.rollback()
sfps_all = session.query(StereoFramePair).all()

body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
    'UPPER_PRECAUDAL_PIT',
    'LOWER_PRECAUDAL_PIT',
    'HYPURAL_PLATE'
])

session.rollback()

# Rows are collected in a list and turned into a DataFrame once at the end.
# Calling DataFrame.append() inside the loop copies the whole frame on every
# iteration, and the method no longer exists in pandas >= 2.0.
df_rows = []
for idx, row in enumerate(sfps_all):
    if idx % 10 == 0:
        print(idx)

    # get fish_id and ground truth metadata
    if row.gtsf_fish_identifier == '190321010002':
        continue
    ground_truth_metadata = json.loads(row.ground_truth_metadata)
    # Tolerate records without a 'data' section instead of raising KeyError.
    ground_truth_data = ground_truth_metadata.get('data') or {}
    if ground_truth_data.get('species') != 'salmon':
        continue

    # Ground-truth measurements are resolved before any geometry so that rows
    # without a weight are skipped without computing features for them.
    # NOTE(review): 'weight'/'weightKgs' and 'length'/'lengthMms' are read
    # interchangeably with no unit conversion -- confirm both spellings share units.
    weight, length, kfactor = None, None, None
    if 'weight' in ground_truth_data or 'weightKgs' in ground_truth_data:
        weight_key = 'weight' if 'weight' in ground_truth_data else 'weightKgs'
        length_key = 'length' if 'length' in ground_truth_data else 'lengthMms'
        weight = ground_truth_data[weight_key]
        length = ground_truth_data[length_key]
        kfactor = (weight / length**3) * 1e5
    if not weight:
        print('No weight recorded for GTSF fish identifier: {}'.format(row.gtsf_fish_identifier))
        continue

    left_keypoints = json.loads(row.left_image_keypoint_coordinates)
    right_keypoints = json.loads(row.right_image_keypoint_coordinates)
    wkps = json.loads(row.world_keypoint_coordinates)

    # Pairwise distance features; column 'i-j' refers to positions i and j in
    # the sorted body_parts list.
    df_row = {'0': idx}
    for i in range(len(body_parts)-1):
        for j in range(i+1, len(body_parts)):
            d = euclidean_distance(wkps[body_parts[i]], wkps[body_parts[j]])
            df_row['{0}-{1}'.format(i, j)] = d

    # calculate curvature: the angle (in degrees) between the normal of the
    # plane through UPPER_LIP / DORSAL_FIN / PELVIC_FIN and the normal of the
    # plane through TAIL_NOTCH / PELVIC_FIN / DORSAL_FIN. Coordinates are read
    # in reversed axis order (index 2, 1, 0).
    wkp = {bp: [wkps[bp][2], wkps[bp][1], wkps[bp][0]] for bp in body_parts}
    fv1 = np.array(wkp['UPPER_LIP']) - np.array(wkp['DORSAL_FIN'])
    fv2 = np.array(wkp['UPPER_LIP']) - np.array(wkp['PELVIC_FIN'])
    n1 = np.cross(fv1, fv2)

    bv1 = np.array(wkp['PELVIC_FIN']) - np.array(wkp['TAIL_NOTCH'])
    bv2 = np.array(wkp['DORSAL_FIN']) - np.array(wkp['TAIL_NOTCH'])
    n2 = np.cross(bv1, bv2)
    curvature_theta = (180 / np.pi) * np.arccos(np.dot(n1, n2) / (np.linalg.norm(n1) * np.linalg.norm(n2)))

    df_row['weight'] = weight
    df_row['length'] = length
    df_row['kfactor'] = kfactor
    df_row['date'] = row.date
    df_row['project_name'] = row.annotations_project_name
    df_row['left_keypoints'] = left_keypoints
    df_row['right_keypoints'] = right_keypoints
    df_row['world_keypoints'] = wkps
    df_row['gtsf_fish_identifier'] = row.gtsf_fish_identifier
    df_row['epoch'] = row.epoch
    df_row['stereo_frame_pair_id'] = row.id
    df_row['curvature_theta'] = curvature_theta

    df_rows.append(df_row)

df = pd.DataFrame(df_rows)
            
    



<h1> Apply filters </h1>

In [None]:
# Keep an in-memory snapshot of the freshly built dataset so later cells can
# restore `df` without querying the database again.
df_cache = df.copy()
# df = df.dropna()

In [None]:
# Persist the snapshot to an HDF5 file under the key 'key'.
df_cache.to_hdf('/root/data/df_cache.h5', 'key')

In [None]:
# read from cached location to avoid having to regenerate data
# (this is the same path the `df_cache.to_hdf(...)` cell writes to)

df = pd.read_hdf('/root/data/df_cache.h5')

In [None]:
# Reset `df` to the in-memory snapshot. This needs `df_cache` to exist in the
# running kernel; it is only assigned by the cell that copies `df`.
df = df_cache.copy()

In [None]:
# Drop the row(s) belonging to stereo frame pair 6137.
# NOTE(review): the reason for excluding this id is not recorded here -- confirm.
df = df[(df.stereo_frame_pair_id != 6137)] 

In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Build a boolean train-membership mask with one entry per row of ``df``.

    Args:
        df: DataFrame (only ``df.shape[0]`` is used).
        train_frac: fraction of rows to mark as training rows; the number of
            True entries is ``int(train_frac * df.shape[0])``.
        randomize: when True (default) the True entries are scattered with
            ``np.random.shuffle``, so the result depends on NumPy's global RNG
            state. When False the leading rows are the training rows.

    Returns:
        A 1-D NumPy boolean array of length ``df.shape[0]``. A positional array
        (rather than a Series with a fresh RangeIndex) is returned on purpose:
        callers combine it with Series taken from a filtered ``df``, and a
        RangeIndex-ed Series would be aligned by label instead of by position.
    """
    n_rows = df.shape[0]
    train_mask = np.zeros(n_rows, dtype=bool)
    train_mask[:int(train_frac * n_rows)] = True
    if randomize:
        np.random.shuffle(train_mask)
    return train_mask
    


<h1> Labelbox + new calibration + 11 keypoints </h1>

In [None]:
# Choose the keypoints for this model and materialise every feature column:
# pairwise distances plus their degree-2 and degree-3 products.
# (UPPER_PRECAUDAL_PIT, LOWER_PRECAUDAL_PIT and HYPURAL_PLATE are left out.)

body_parts_subset = sorted([
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'DORSAL_FIN',
    'EYE',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'TAIL_NOTCH',
    'UPPER_LIP',
])

# Positions of the chosen keypoints inside the full, sorted body_parts list;
# these positions are what the 'i-j' distance column names are made of.
body_part_indices = [body_parts.index(name) for name in body_parts_subset]

pairwise_distance_columns = [
    '{0}-{1}'.format(first, second)
    for first, second in combinations(body_part_indices, 2)
]
interaction_columns_quadratic = []
interaction_columns_cubic = []

# Degree-2 products over every unordered pair of distance columns
# (a column paired with itself is included).
for i, col1 in enumerate(pairwise_distance_columns):
    for col2 in pairwise_distance_columns[i:]:
        interaction_column = '{},{}'.format(col1, col2)
        df[interaction_column] = df[col1] * df[col2]
        interaction_columns_quadratic.append(interaction_column)

# Degree-3 products over every unordered triple, repeats allowed.
for i, col1 in enumerate(pairwise_distance_columns):
    for j, col2 in enumerate(pairwise_distance_columns[i:], start=i):
        for col3 in pairwise_distance_columns[j:]:
            interaction_column = '{},{},{}'.format(col1, col2, col3)
            df[interaction_column] = df[col1] * df[col2] * df[col3]
            interaction_columns_cubic.append(interaction_column)
            

In [None]:
# Fix NumPy's global RNG so the shuffled train/test split is repeatable.
np.random.seed(0)

# Random 80% train mask; fish whose identifier contains '190620' are then
# removed from the training rows, which leaves them in the `~mask` rows.
mask = generate_train_mask(df, train_frac=0.8)
mask = mask & (~df.gtsf_fish_identifier.str.contains('190620'))
# Features for this fit: distances plus quadratic products (cubic disabled).
columns = pairwise_distance_columns + interaction_columns_quadratic #+ interaction_columns_cubic

X_train = df.loc[mask, columns].values
print(X_train.sum())
y_train = df.loc[mask, 'weight'].values
X_test = df.loc[~mask, columns].values
y_test = df.loc[~mask, 'weight'].values

# Standardise features using statistics of the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)

# First PCA pass with the maximum number of components; it is only used to
# find the first index at which the cumulative explained-variance ratio
# exceeds 0.999999.
pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
pca.fit(X_train_normalized)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
idx = np.where(explained_variance_ratio > 0.999999)[0][0]
print(idx)

# Refit PCA with idx+1 components and project both splits.
pca = PCA(n_components=idx+1)
pca.fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

# Linear regression on the projected features; the printed value is
# `reg.score` on the `~mask` rows (R^2 for scikit-learn's LinearRegression).
reg = LinearRegression().fit(X_train_transformed, y_train)
print(reg.score(X_test_transformed, y_test))

# Predict for every row of df (train and test alike) and store the signed,
# relative and absolute-relative errors next to the prediction.
y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
df['prediction'] = y_pred
df['error'] = df.prediction - df.weight
df['error_pct'] = df.error / df.weight
df['abs_error_pct'] = df.error_pct.abs()

# Fitted parameters gathered in a plain dict (not used further in this cell).
model = {
    'mean': scaler.mean_,
    'std': scaler.scale_,
    'PCA_components': pca.components_,
    'reg_coef': reg.coef_,
    'reg_intercept': reg.intercept_,
    'body_parts': body_parts   
}



In [None]:
# Build the accuracy-metrics helper from the train mask, the predictions and
# the ground-truth weights of every row in df.
amg = AccuracyMetricsGenerator(mask.values, df.prediction.values, df.weight.values)

In [None]:
# Plot predictions against ground truth for the model fitted above.
amg.plot_predictions_vs_ground_truth()

In [None]:
# Show the train/test accuracy metrics for the model fitted above.
amg.display_train_test_accuracy_metrics()

In [None]:
# Same feature construction as for the previous model, but with HYPURAL_PLATE
# in the keypoint subset instead of TAIL_NOTCH.

body_parts_subset = sorted([
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'DORSAL_FIN',
    'EYE',
    'HYPURAL_PLATE',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'UPPER_LIP',
])

# Positions of the chosen keypoints inside the full, sorted body_parts list;
# these positions are what the 'i-j' distance column names are made of.
body_part_indices = [body_parts.index(name) for name in body_parts_subset]

pairwise_distance_columns = [
    '{0}-{1}'.format(first, second)
    for first, second in combinations(body_part_indices, 2)
]
interaction_columns_quadratic = []
interaction_columns_cubic = []

# Degree-2 products over every unordered pair of distance columns
# (a column paired with itself is included).
for i, col1 in enumerate(pairwise_distance_columns):
    for col2 in pairwise_distance_columns[i:]:
        interaction_column = '{},{}'.format(col1, col2)
        df[interaction_column] = df[col1] * df[col2]
        interaction_columns_quadratic.append(interaction_column)

# Degree-3 products over every unordered triple, repeats allowed.
for i, col1 in enumerate(pairwise_distance_columns):
    for j, col2 in enumerate(pairwise_distance_columns[i:], start=i):
        for col3 in pairwise_distance_columns[j:]:
            interaction_column = '{},{},{}'.format(col1, col2, col3)
            df[interaction_column] = df[col1] * df[col2] * df[col3]
            interaction_columns_cubic.append(interaction_column)

# Alias for this subset's distance columns.
cs2 = pairwise_distance_columns

In [None]:
# Fix NumPy's global RNG so the shuffled train/test split is repeatable.
np.random.seed(0)

# Random 80% train mask; fish whose identifier contains '190620' are then
# removed from the training rows, which leaves them in the `~mask` rows.
mask = generate_train_mask(df, train_frac=0.8)
mask = mask & (~df.gtsf_fish_identifier.str.contains('190620'))
# Features for this fit: distances plus quadratic and cubic products.
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

X_train = df.loc[mask, columns].values
print(X_train.sum())
y_train = df.loc[mask, 'weight'].values
X_test = df.loc[~mask, columns].values
y_test = df.loc[~mask, 'weight'].values

# Standardise features using statistics of the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)

# First PCA pass with the maximum number of components; it is only used to
# find the first index at which the cumulative explained-variance ratio
# exceeds 0.999999.
pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
pca.fit(X_train_normalized)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
idx = np.where(explained_variance_ratio > 0.999999)[0][0]
print(idx)

# Refit PCA with idx+1 components and project both splits.
pca = PCA(n_components=idx+1)
pca.fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

# Linear regression on the projected features; the printed value is
# `reg.score` on the `~mask` rows (R^2 for scikit-learn's LinearRegression).
reg = LinearRegression().fit(X_train_transformed, y_train)
print(reg.score(X_test_transformed, y_test))

# Predict for every row of df (train and test alike); this overwrites the
# prediction/error columns written by the earlier fit.
y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
df['prediction'] = y_pred
df['error'] = df.prediction - df.weight
df['error_pct'] = df.error / df.weight
df['abs_error_pct'] = df.error_pct.abs()

# Fitted parameters gathered in a plain dict (not used further in this cell).
model = {
    'mean': scaler.mean_,
    'std': scaler.scale_,
    'PCA_components': pca.components_,
    'reg_coef': reg.coef_,
    'reg_intercept': reg.intercept_,
    'body_parts': body_parts   
}



In [None]:
# Accuracy-metrics helper for the second fit, followed by its
# prediction-vs-ground-truth plot.
amg2 = AccuracyMetricsGenerator(mask.values, df.prediction.values, df.weight.values)
amg2.plot_predictions_vs_ground_truth()

In [None]:
# Show the train/test accuracy metrics for the second fit.
amg2.display_train_test_accuracy_metrics()

In [None]:
# Epoch of the first row whose prediction is below 6000 while its recorded
# weight is above 8000 (`.iloc[0]` raises IndexError when no row matches).
df[(df.prediction < 6000) & (df.weight > 8000)].iloc[0].epoch

In [None]:
# read from cached location to avoid having to regenerate data
# (loaded into df2 so it can be compared with the working `df`)

df2 = pd.read_hdf('/root/data/df_cache.h5')

In [None]:
# Number of rows per date in the cached dataset.
df2.groupby('date').agg(len)['gtsf_fish_identifier']

In [None]:
# Epochs present in the cached dataset but absent from the working `df`.
set(df2.epoch).difference(set(df.epoch))

In [None]:
# Number of rows per date in the working `df`.
df.groupby('date').agg(len)['gtsf_fish_identifier']

<h1> Ensure apples-to-apples comparison </h1>

In [None]:
# Restrict the comparison to epochs that were annotated in every project.
projects = [
    'Underwater Live GTSF - Axiom Calibration Full',
    'Underwater Live GTSF - Axiom Calibration - Filtered - Team 1',
    'Underwater Live GTSF - Axiom Calibration - Filtered - Team 2'
]

common_epochs = None

for project_name in projects:
    project_mask = df.project_name == project_name
    epochs = set(df[project_mask].epoch.unique())
    # Compare against None, not truthiness: an intersection that has already
    # become empty must stay empty instead of being restarted from the next
    # project's epochs.
    if common_epochs is None:
        common_epochs = epochs
    else:
        common_epochs = common_epochs.intersection(epochs)

common_epochs = sorted(list(common_epochs))
# Independent copy of the rows that belong to the shared epochs.
tdf = df[df.epoch.isin(common_epochs)].copy(deep=True)

In [None]:
# Repeat the split / fit / evaluate pipeline N times and collect the test-set
# biomass error percentage of every run. The RNG is not re-seeded inside the
# loop, so each iteration shuffles a different train mask.
N = 100
biomass_error_pcts = []
for i in range(N):
    print(i)
    # Random 80% train mask, with '190620' fish removed from the training rows.
    mask = generate_train_mask(df, train_frac=0.8)
    mask = mask & (~df.gtsf_fish_identifier.str.contains('190620'))
    columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

    X_train = df.loc[mask, columns].values
    y_train = df.loc[mask, 'weight'].values
    X_test = df.loc[~mask, columns].values
    y_test = df.loc[~mask, 'weight'].values

    # Standardise using training-row statistics only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_normalized = scaler.transform(X_train)

    # Full-rank PCA, used to find how many components are needed for the
    # cumulative explained-variance ratio to exceed 0.999999.
    pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
    pca.fit(X_train_normalized)
    explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
    idx = np.where(explained_variance_ratio > 0.999999)[0][0]

    # Refit with idx+1 components and project both splits.
    pca = PCA(n_components=idx+1)
    pca.fit(X_train_normalized)
    X_train_transformed = pca.transform(X_train_normalized)
    X_test_normalized = scaler.transform(X_test)
    X_test_transformed = pca.transform(X_test_normalized)

    reg = LinearRegression().fit(X_train_transformed, y_train)

    # The prediction/error columns of df are overwritten on every iteration;
    # after the loop they hold the values of the last run.
    y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
    df['prediction'] = y_pred
    df['error'] = df.prediction - df.weight
    df['error_pct'] = df.error / df.weight
    df['abs_error_pct'] = df.error_pct.abs()

    # Rebuilt on every iteration and not read anywhere in this loop.
    model = {
        'mean': scaler.mean_,
        'std': scaler.scale_,
        'PCA_components': pca.components_,
        'reg_coef': reg.coef_,
        'reg_intercept': reg.intercept_,
        'body_parts': body_parts   
    }
    
    # Keep only the test-set biomass error percentage of this run.
    amg = AccuracyMetricsGenerator(mask.values, df.prediction.values, df.weight.values)
    accuracy_metrics = amg.generate_train_test_accuracy_metrics()
    biomass_error_pct = accuracy_metrics['test']['biomass_error_pct']
    biomass_error_pcts.append(biomass_error_pct)
    



In [None]:
# Empirical CDF of the absolute test-set biomass error across the resampled
# runs: p runs evenly from 0 to 1 over the sorted errors.
# NOTE(review): the title hard-codes "sample size = 250" -- confirm what that
# number refers to; it is not derived from any value in this cell.
data_sorted = sorted(abs(error_pct) for error_pct in biomass_error_pcts)
p = 1.0 * np.arange(len(data_sorted)) / (len(data_sorted) - 1)

fig = plt.figure(figsize=(30, 7))
ax1 = fig.add_subplot(121)
ax1.plot(p, data_sorted)
ax1.set_xlabel('p')
ax1.set_ylabel('OOS error percentage')
ax1.axvline(x=0.95, linestyle='--', color='red', label='p = 0.95')
ax1.set_title('CDF of OOS errors (sample size = 250)')
ax1.legend()
ax1.grid()



<h1> Overall prediction distribution comparison between Team 1 and Team 2 </h1>

In [None]:
# For each annotation project, report the mean prediction error relative to
# the 1500 reference weight and plot the distribution of predictions.
projects = [
    'Underwater Live GTSF - Axiom Calibration Full',
    'Underwater Live GTSF - Axiom Calibration - Filtered - Team 1',
    'Underwater Live GTSF - Axiom Calibration - Filtered - Team 2'
]
for project_name in projects:
    project_mask = tdf.project_name == project_name
    project_predictions = tdf[project_mask].prediction

    average_error = (project_predictions.mean() - 1500.0) / 1500.0
    print('Average prediction error: {}%'.format(round(average_error * 100.0, 2)))

    plt.figure(figsize=(15, 10))
    plt.hist(project_predictions, bins=20)
    plt.axvline(1500, color='red', label='Ground Truth Weight')
    plt.title('Predicted biomass histogram for project: {}'.format(project_name))
    plt.xlabel('Biomass prediction for single live fish (grams)')
    plt.legend()
    plt.grid()
    plt.show()


<h1> Per-stereo-frame level prediction comparison between Team 1 and Team 2 </h1>

In [None]:
# One row per shared epoch with the Team 1 and Team 2 predictions side by side.
#
# `projects` is ordered [Full, Team 1, Team 2], so the two teams live at
# indices 1 and 2. Reading indices 0 and 1 would compare the "Full" project
# against Team 1 instead of Team 1 against Team 2.
team_1_project, team_2_project = projects[1], projects[2]

# Rows are gathered in a list and converted once; DataFrame.append() in a loop
# copies the frame on every call and was removed in pandas 2.0.
fish_analysis_rows = []
for epoch in common_epochs:
    epoch_mask = tdf.epoch == epoch
    team_1_rows = tdf[(tdf.project_name == team_1_project) & epoch_mask]
    team_2_rows = tdf[(tdf.project_name == team_2_project) & epoch_mask]

    # The first matching row of each project is used for the epoch.
    fish_analysis_rows.append({
        'epoch': epoch,
        'team_1_prediction': team_1_rows.prediction.iloc[0],
        'team_2_prediction': team_2_rows.prediction.iloc[0],
        'weight': team_1_rows.weight.iloc[0],
    })

fish_analysis_df = pd.DataFrame(fish_analysis_rows)

    
    

In [None]:
# Per-epoch difference between the two teams' predictions: summary statistics
# followed by a histogram.
differences = fish_analysis_df.team_1_prediction - fish_analysis_df.team_2_prediction
difference_mean = differences.mean()
difference_std = differences.std()
print('Mean difference: {} grams'.format(round(difference_mean, 2)))
print('Standard deviation of difference: {} grams'.format(round(difference_std, 2)))

plt.figure(figsize=(20, 10))
plt.hist(differences, bins=10)
plt.title('Distribution of per-fish prediction differences between Team 1 and Team 2')
plt.xlabel('Prediction difference (grams)')
plt.grid()
plt.show()

<h1> Per-point level comparison between Team 1 and Team 2 </h1>

In [None]:
# One row per (epoch, image side, body part) holding the pixel offset between
# the Team 1 and Team 2 annotations of the same keypoint.
#
# `projects` is ordered [Full, Team 1, Team 2], so the two teams live at
# indices 1 and 2. Reading indices 0 and 1 would compare the "Full" project
# against Team 1 instead of Team 1 against Team 2.
team_1_project, team_2_project = projects[1], projects[2]

# Rows are gathered in a list and converted once; DataFrame.append() in a loop
# copies the frame on every call and was removed in pandas 2.0.
keypoint_analysis_rows = []
for epoch in common_epochs:
    epoch_mask = tdf.epoch == epoch
    # The first matching row of each project is used for the epoch.
    team_1_row = tdf[epoch_mask & (tdf.project_name == team_1_project)].iloc[0]
    team_2_row = tdf[epoch_mask & (tdf.project_name == team_2_project)].iloc[0]

    # Left image first, then right, so the row order matches a per-side pass.
    for side in ('left', 'right'):
        keypoints_column = '{}_keypoints'.format(side)
        team_1_keypoints = team_1_row[keypoints_column]
        team_2_keypoints = team_2_row[keypoints_column]

        for body_part, team_1_keypoint in team_1_keypoints.items():
            team_2_keypoint = team_2_keypoints[body_part]
            keypoint_analysis_rows.append({
                'epoch': epoch,
                'body_part': body_part,
                'side': side,
                'x_diff': team_1_keypoint[0] - team_2_keypoint[0],
                'y_diff': team_1_keypoint[1] - team_2_keypoint[1],
            })

keypoint_analysis_df = pd.DataFrame(keypoint_analysis_rows)
        
    
    

In [None]:
# Histogram of predictions for the single fish with the identifier below,
# with a reference line drawn at 1500.
plt.figure(figsize=(15, 10))
plt.title('Predicted biomass histogram for underwater live fish')
plt.hist(df[df.gtsf_fish_identifier == '190620-4e4e0640-d4eb-405d-8fcf-57fda11d7660'].prediction, bins=20)
plt.axvline(1500, color='red', label='Ground Truth Weight')
plt.xlabel('Biomass prediction for single live fish (grams)')
plt.legend()
plt.grid()

In [None]:
# Mean prediction over all rows of that same fish identifier.
df[df.gtsf_fish_identifier == '190620-4e4e0640-d4eb-405d-8fcf-57fda11d7660'].prediction.mean()

<h1> Investigate Individual Cases </h1>

In [None]:
# Helper used by visualize_stereo_frame_pair to fetch images from S3 into
# /root/data/. `DataAccessUtils` is never imported in this notebook, so using
# it raises NameError on a fresh kernel; `S3AccessUtils` is the S3 helper the
# import cell brings in from aquabyte.data_access_utils.
# NOTE(review): assumes S3AccessUtils takes the local root directory and
# exposes download_from_s3(bucket, key) -- confirm against the aquabyte package.
data_access_utils = S3AccessUtils('/root/data/')

In [None]:
def visualize_stereo_frame_pair(stereo_frame_pair_id):
    """Show the left and right images of a stereo frame pair with keypoints.

    Looks up the StereoFramePair row with the given id, downloads both of its
    images from S3 through ``data_access_utils`` and draws them side by side,
    marking every annotated keypoint with a small red dot.

    Args:
        stereo_frame_pair_id: value matched against ``StereoFramePair.id``.

    Raises:
        IndexError: if no row with that id exists.
    """
    sfp = session.query(StereoFramePair).filter(StereoFramePair.id == stereo_frame_pair_id).all()[0]
    image_s3_bucket = sfp.image_s3_bucket

    # Everything below is ordered (left, right).
    keypoints_per_side = [
        json.loads(sfp.left_image_keypoint_coordinates),
        json.loads(sfp.right_image_keypoint_coordinates),
    ]
    image_files = [
        data_access_utils.download_from_s3(image_s3_bucket, s3_key)
        for s3_key in (sfp.left_image_s3_key, sfp.right_image_s3_key)
    ]
    images = [plt.imread(image_f) for image_f in image_files]

    fig, axes = plt.subplots(1, 2, figsize=(30, 20))
    for ax, image, keypoints in zip(axes, images, keypoints_per_side):
        ax.imshow(image)
        for bp, coords in keypoints.items():
            ax.scatter(coords[0], coords[1], s=2, label=bp, color='red')

    plt.show()
    
    


In [None]:
# Walk through every row whose recorded weight is 1500, ordered from the most
# negative to the most positive error percentage, and show its images.
features = [
  'gtsf_fish_identifier', 
  'epoch', 
  'prediction', 
  'stereo_frame_pair_id', 
  'abs_error_pct',
  'error_pct',
  '6-7',
  'curvature_theta'
]
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0; for a
# boolean row mask plus a list of column labels the two select the same data.
# Note that this rebinds `tdf`, replacing the common-epochs frame.
tdf = df.loc[df.weight == 1500, features].sort_values('error_pct')

for idx, row in tdf.iterrows():
    stereo_frame_pair_id = row.stereo_frame_pair_id
    prediction = row.prediction
    error_pct = row.error_pct
    print('Prediction: {0}, Error Percentage: {1}'.format(prediction, error_pct))
    visualize_stereo_frame_pair(stereo_frame_pair_id)


In [None]:
# Show the images and keypoints of stereo frame pair 3805.
visualize_stereo_frame_pair(3805)

In [None]:
# Table of the rows whose recorded weight is 1500, sorted by error percentage.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
df.loc[df.weight == 1500, 
       [
           'gtsf_fish_identifier', 
           'epoch', 
           'prediction', 
           'stereo_frame_pair_id', 
           'error_pct',
           'abs_error_pct', 
           '6-7',
           'curvature_theta'
       ]
      ].sort_values('error_pct')



In [None]:
# Fetch stereo frame pair 3756 (`[0]` raises IndexError if it does not exist).
sfp = session.query(StereoFramePair).filter(StereoFramePair.id == 3756).all()[0]

In [None]:
# Stored left-image keypoint coordinates of that pair.
sfp.left_image_keypoint_coordinates

In [None]:
# Stored right-image keypoint coordinates of that pair.
sfp.right_image_keypoint_coordinates

In [None]:
# Independent copy of the rows whose recorded weight is 1500, limited to the
# columns listed below. `.loc` replaces the `.ix` indexer, which was removed
# in pandas 1.0. Note that this rebinds `tdf`.
tdf = df.loc[df.weight == 1500, 
       [
           'gtsf_fish_identifier', 
           'epoch', 
           'prediction', 
           'stereo_frame_pair_id', 
           'abs_error_pct', 
           '6-7',
           'curvature_theta'
       ]
      ].copy(deep=True)

In [None]:
# Relative error of the mean prediction against the 1500 reference weight.
(tdf.prediction.mean() - 1500)/1500.