In [1]:
%load_ext autoreload
%autoreload 2

import random
from research.utils.data_access_utils import S3AccessUtils
from research.weight_estimation.gtsf_data.gtsf_dataset import GTSFDataset
from research.weight_estimation.gtsf_data.gtsf_augmentation import GTSFAugmentation
from research.weight_estimation.keypoint_utils.body_parts import BodyParts

import os
import math
import numpy as np
from keras.layers import Input, Dense, Flatten
from keras.models import Model
from keras.optimizers import RMSprop
from keras.models import load_model
import keras

from matplotlib import pyplot as plt

Using TensorFlow backend.


<h1> Load GTSF Data </h1>

In [2]:
# URL of the AKPD scorer model file that is handed to the dataset loader.
akpd_scorer_url = (
    'https://aquabyte-models.s3-us-west-1.amazonaws.com/'
    'keypoint-detection-scorer/akpd_scorer_model_TF.h5'
)

# The two date strings are passed positionally ahead of the scorer URL.
gtsf_dataset = GTSFDataset('2019-03-01', '2020-02-10', akpd_scorer_url)

# Prepared dataframe used by every later cell.
df = gtsf_dataset.get_prepared_dataset()



Raw dataframe loaded!
Dataset preparation beginning...
3D spatial information added!
Adding AKPD scores...
Converting world keypoints to matrix form...


<h1> Perform Augmentation </h1>

In [3]:
# Keep rows captured before 2019-09-20 with median_depth < 1.0 and akpd_score > 0.5.
captured_before_cutoff = df.captured_at < '2019-09-20'
median_depth_ok = df.median_depth < 1.0
akpd_score_ok = df.akpd_score > 0.5
df = df[captured_before_cutoff & median_depth_ok & akpd_score_ok]

gtsf_augmentation = GTSFAugmentation(df)

# Augmentation settings, passed positionally below.
y_bounds = (0.5, 3.0)
max_jitter_std = 10
trials = 10
augmented_df = gtsf_augmentation.generate_augmented_dataset(
    y_bounds, max_jitter_std, trials, random_seed=0
)
print(augmented_df.shape)

here
Percentage completed: 0.0
Percentage completed: 0.68
Percentage completed: 1.35
Percentage completed: 2.03
Percentage completed: 2.7
Percentage completed: 3.38
Percentage completed: 4.06
Percentage completed: 4.73
Percentage completed: 5.41
Percentage completed: 6.08
Percentage completed: 6.76
Percentage completed: 7.44
Percentage completed: 8.11
Percentage completed: 8.79
Percentage completed: 9.46
Percentage completed: 10.14
Percentage completed: 10.82
Percentage completed: 11.49
Percentage completed: 12.17
Percentage completed: 12.84
Percentage completed: 13.52
Percentage completed: 14.2
Percentage completed: 14.87
Percentage completed: 15.55
Percentage completed: 16.22
Percentage completed: 16.9
Percentage completed: 17.58
Percentage completed: 18.25
Percentage completed: 18.93
Percentage completed: 19.61
Percentage completed: 20.28
Percentage completed: 20.96
Percentage completed: 21.63
Percentage completed: 22.31
Percentage completed: 22.99
Percentage completed: 23.66
Percen

In [None]:
# Expose the `keypoints` column under the additional name `annotation`.
# This is an in-place column assignment on `df`.
# NOTE(review): `df` was also passed to GTSFAugmentation earlier; whether that object
# sees this new column depends on whether it keeps a reference to `df` — confirm.
df['annotation'] = df.keypoints

In [None]:
# Write the current `df` to a CSV file at a fixed location.
gtsf_csv_path = '/root/data/alok/biomass_estimation/playground/20200520_gtsf_dataset.csv'
df.to_csv(gtsf_csv_path)

In [None]:
# Display the left image URL of the first row (by position) as a quick sanity check.
df['left_image_url'].iloc[0]

<h1> Create Train / Test Split </h1>

In [None]:
def generate_stabilized_input(augmented_df, mask=None):
    """Build the network inputs and scaled targets from an augmented dataframe.

    Args:
        augmented_df: dataframe with a `wkps` column (one array per row that can be
            reshaped to 8 keypoints x 3 coordinates) and a `weight` column.
        mask: optional boolean row selector applied to `augmented_df`; when None
            every row is used.

    Returns:
        (X_new, y) where X_new has shape (n_rows, 24) and y is `weight` scaled by 1e-4.
        For each keypoint the three features are 0.5 * c0 / c1, 0.5 * c2 / c1 and
        0.05 / c1, with c0, c1, c2 the stored coordinates in column order.
    """
    selected = augmented_df if mask is None else augmented_df[mask]
    wkps_per_row = selected.wkps.values
    y = 1e-4 * selected.weight.values

    # Stack the per-row keypoint arrays into a single (n_rows, 8, 3) tensor.
    X = np.concatenate(wkps_per_row).reshape(wkps_per_row.shape[0], 8, 3)

    # Every feature is normalised by the middle coordinate (index 1).
    divisor = X[:, :, 1]
    channels = [
        0.5 * X[:, :, 0] / divisor,
        0.5 * X[:, :, 2] / divisor,
        0.05 / divisor,
    ]
    # astype(float) keeps the float64 result the zero-initialised buffer used to give.
    X_new = np.stack(channels, axis=-1).astype(float)
    return X_new.reshape(-1, 24), y

In [None]:
# Select train / val / test sets such that no fish ID appears in more than one set.
# A dedicated, seeded RNG keeps the split identical across kernel restarts; the
# global numpy RNG used previously produced a different split on every run.
split_rng = np.random.RandomState(0)

fish_ids = augmented_df.fish_id.unique()
train_pct, val_pct, test_pct = 0.8, 0.1, 0.1

# The number of fish per split is drawn from a multinomial over the target proportions.
train_cnt, val_cnt, test_cnt = split_rng.multinomial(len(fish_ids), [train_pct, val_pct, test_pct])

# 0 = train, 1 = validation, 2 = test; shuffling decouples the label from fish_ids order.
assignments = np.array([0] * train_cnt + [1] * val_cnt + [2] * test_cnt)
split_rng.shuffle(assignments)
train_fish_ids = fish_ids[np.where(assignments == 0)]
val_fish_ids = fish_ids[np.where(assignments == 1)]
test_fish_ids = fish_ids[np.where(assignments == 2)]

# Row masks: every augmented row follows the split of the fish it belongs to.
train_mask = augmented_df.fish_id.isin(train_fish_ids)
val_mask = augmented_df.fish_id.isin(val_fish_ids)
test_mask = augmented_df.fish_id.isin(test_fish_ids)

X_train, y_train = generate_stabilized_input(augmented_df, train_mask)
X_val, y_val = generate_stabilized_input(augmented_df, val_mask)
X_test, y_test = generate_stabilized_input(augmented_df, test_mask)

<h1> Train Neural Network in Keras </h1>

In [None]:
# Fully connected regressor: 24 input features -> 256 -> 128 -> 64 -> 1 output.
inputs = Input(shape=(24,))

# a layer instance is callable on a tensor, and returns a tensor
x = Dense(256, activation='relu')(inputs)
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
pred = Dense(1)(x)

# Keras 2 names these keyword arguments `inputs` / `outputs`; the singular
# `input` / `output` spellings are the deprecated Keras 1 names.
model = Model(inputs=inputs, outputs=pred)


In [None]:
# Stop training once val_loss has failed to improve for 10 consecutive epochs.
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                           min_delta=0,
                                           patience=10,
                                           verbose=0,
                                           mode='auto')]

optimizer = keras.optimizers.Adam(learning_rate=1e-4)
# This is a regression problem (MSE loss on a continuous target), so classification
# accuracy carries no information; mean absolute error is reported instead.
model.compile(optimizer=optimizer,
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])

# Keep the History object so the loss curves can be inspected after training.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), callbacks=callbacks, batch_size=64, epochs=500)

In [None]:
def generate_error_breakdown(df, vals, field, pred_field='y_pred'):
    """Print prediction error statistics for consecutive bins of `field`.

    Args:
        df: dataframe holding `field`, a `weight` column (ground truth) and the
            prediction column named by `pred_field`.
        vals: ordered sequence of bin edges; bin i spans vals[i] (inclusive) to
            vals[i + 1] (exclusive).
        field: name of the column used for binning.
        pred_field: name of the prediction column; defaults to 'y_pred'.

    For each bin one line is printed with two percentages: the relative error of
    the mean prediction against the mean weight, and the mean absolute relative
    error per row. Bins without rows print `nan` for both.
    """
    for lower, upper in zip(vals[:-1], vals[1:]):
        # Half-open bins: a row sitting exactly on an edge is counted once
        # instead of being excluded from both neighbouring bins.
        in_bin = df[(df[field] >= lower) & (df[field] < upper)]
        actual = in_bin['weight']
        predicted = in_bin[pred_field]

        error_pct = (predicted.mean() - actual.mean()) / actual.mean()
        abs_error_pct = np.mean(np.abs((predicted - actual) / actual))
        print('Errors for {} in range {} <-> {}: {}, {}'.format(
            field,
            round(lower, 2),
            round(upper, 2),
            round(100*error_pct, 2),
            round(100*abs_error_pct, 2)
        ))



<h1> Get error breakdown by depth on current augmented dataset </h1>

In [None]:
# Score every row of the augmented dataset with the trained Keras model.
X, y = generate_stabilized_input(augmented_df)
raw_predictions = model.predict(X).squeeze().astype(float)

# The 1e4 factor undoes the 1e-4 target scaling applied in generate_stabilized_input.
augmented_df['y_pred'] = 1e4 * raw_predictions

# Report errors in 0.1-wide bins of `mean_y`.
mean_y_bin_edges = np.arange(0, 3.1, 0.1)
generate_error_breakdown(augmented_df, mean_y_bin_edges, 'mean_y')

In [None]:
# Mean absolute relative error of `y_pred` against `weight` over the whole augmented set.
((augmented_df.y_pred - augmented_df.weight) / augmented_df.weight).abs().mean()

In [None]:
# Display (rows, columns) of the augmented dataframe.
augmented_df.shape

In [None]:
# Generate a second augmented dataset with different jitter / trial settings and
# score it with the model that was trained above.
y_bounds = (0.5, 3.0)
jitter = 20
trials = 5
augmented_df_2 = gtsf_augmentation.generate_augmented_dataset(
    y_bounds, jitter, trials, random_seed=0
)

X_oos, y_oos = generate_stabilized_input(augmented_df_2)
oos_raw_predictions = model.predict(X_oos).squeeze().astype(float)
# The 1e4 factor undoes the 1e-4 target scaling applied in generate_stabilized_input.
augmented_df_2['y_pred'] = 1e4 * oos_raw_predictions

# NOTE(review): these bin edges end below the upper y bound of 3.0 used above —
# confirm the narrower range is intended.
oos_bin_edges = np.arange(0, 2.3, 0.1)
generate_error_breakdown(augmented_df_2, oos_bin_edges, 'mean_y')

In [None]:
# Signed relative error per row: (prediction - ground truth) / ground truth.
errs = augmented_df.y_pred.sub(augmented_df.weight).div(augmented_df.weight)

In [None]:
# Histogram of the signed relative errors; title and axis labels make the figure
# readable on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(errs.values, bins=100)
ax.set_title('Distribution of relative prediction error')
ax.set_xlabel('(y_pred - weight) / weight')
ax.set_ylabel('count')
plt.show()

In [None]:
# Mean absolute relative error computed from the per-row signed errors.
errs.abs().mean()

In [None]:
# Save the trained Keras model to a local .h5 file; `f` is reused by the upload cell below.
f = '/root/data/alok/biomass_estimation/playground/model_keras_reduced_jitter.h5'
model.save(f)

In [None]:
# Upload the saved model file `f` to S3 under a fixed bucket and key.
s3_access_utils = S3AccessUtils('/root/data')
s3_bucket = 'aquabyte-models'
s3_key = 'playground/20200520_model_keras_reduced_jitter.h5'
s3_access_utils.s3_client.upload_file(f, s3_bucket, s3_key)

<h1> Apply old model </h1>

In [None]:
# Keep rows captured before 2019-09-20 with median_depth < 1.0 and akpd_score > 0.5.
captured_before_cutoff = df.captured_at < '2019-09-20'
median_depth_ok = df.median_depth < 1.0
akpd_score_ok = df.akpd_score > 0.5
df = df[captured_before_cutoff & median_depth_ok & akpd_score_ok]

gtsf_augmentation = GTSFAugmentation(df)

# Narrow y bounds, zero jitter and a single trial for the old-model comparison.
# Note that this replaces `augmented_df`.
y_bounds = (0.7, 1.0)
jitter = 0
trials = 1
augmented_df = gtsf_augmentation.generate_augmented_dataset(
    y_bounds, jitter, trials, random_seed=0
)
print(augmented_df.shape)

In [None]:
from research.weight_estimation.old.weight_estimator_old import NormalizedStabilityTransform, Network
from research.weight_estimation.old.data_loader import KeypointsDataset, NormalizeCentered2D, ToTensor, BODY_PARTS
from research.weight_estimation.keypoint_utils.optics import pixel2world
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


In [None]:
# Pre-processing pipeline applied to each row before it is fed to the old network.
normalize_centered_2D_transform = NormalizeCentered2D()
normalized_stability_transform = NormalizedStabilityTransform()
to_tensor_transform = ToTensor()

s3_access_utils = S3AccessUtils('/root/data')
model_url = 'https://aquabyte-models.s3-us-west-1.amazonaws.com/biomass/trained_models/2019-11-08T00-13-09/nn_epoch_798.pb'
model_f, _, _ = s3_access_utils.download_from_url(model_url)
# NOTE(review): torch.load unpickles the downloaded file, which can execute arbitrary
# code; only load model files from a trusted bucket.
network = torch.load(model_f)

weight_predictions = []
# Inference only: no_grad avoids recording autograd history for every row.
with torch.no_grad():
    for count, (idx, row) in enumerate(augmented_df.iterrows()):
        # Lightweight progress indicator every 1000 rows.
        if count % 1000 == 0:
            print(count)

        input_sample = {
            'keypoints': row.ann,
            'cm': row.cm,
            'stereo_pair_id': 0,
            'single_point_inference': True
        }
        # Transforms are callables; invoke them directly rather than via __call__.
        normalized_centered_2D_kps = normalize_centered_2D_transform(input_sample)
        normalized_stability_kps = normalized_stability_transform(normalized_centered_2D_kps)
        tensorized_kps = to_tensor_transform(normalized_stability_kps)

        # The network output is multiplied by 1e4 before being stored as the predicted weight.
        weight_prediction = network(tensorized_kps['kp_input']).item() * 1e4
        weight_predictions.append(weight_prediction)
    



In [None]:
# Attach the old model's predictions (one per row, in iteration order) as `old_y_pred`.
augmented_df['old_y_pred'] = weight_predictions

In [None]:
# Mean absolute relative error of the old model's predictions against `weight`.
((augmented_df.old_y_pred - augmented_df.weight) / augmented_df.weight).abs().mean()

In [None]:
# generate_error_breakdown reads the `y_pred` column, but this section stores the old
# model's predictions in `old_y_pred`. Pass a copy with `y_pred` aliased to those
# predictions so the breakdown reports the old model's errors.
generate_error_breakdown(
    augmented_df.assign(y_pred=augmented_df.old_y_pred),
    np.arange(0, 1.0, 0.05),
    'mean_y'
)

In [None]:
# Scatter the first against the third coordinate column of one row's original keypoints.
X = augmented_df.original_wkps.iloc[2]
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(X[:, 0], X[:, 2])
ax.grid()
plt.show()

In [None]:
# Overlay original (blue) and centered (red) keypoints for one row, plotting the
# first against the third coordinate column.
idx = 3
X_o = augmented_df.original_wkps.iloc[idx]
X = augmented_df.centered_wkps.iloc[idx]

fig, ax = plt.subplots(figsize=(20, 10))
for points, colour in ((X_o, 'blue'), (X, 'red')):
    ax.scatter(points[:, 0], points[:, 2], color=colour)
ax.grid()
plt.show()

In [None]:
# Scatter the first against the third coordinate column of the current `X`
# (set by an earlier cell).
plt.scatter(X[:, 0], X[:, 2])