In [None]:
import json, os
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from wpca import WPCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world, convert_to_world_point
from aquabyte.visualize import Visualizer, _normalize_world_keypoints
import random
from scipy.stats import norm
from copy import copy


import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

# Let pandas print up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

<h1> Extract base data from database </h1>

In [None]:
# Read the credentials inside a context manager so the file handle is closed
# deterministically instead of being left open for the garbage collector.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# Join fish metadata to keypoint annotations on the left image URL, keeping
# only rows that have keypoints and are not flagged as QA annotations.
query = """
    select * from research.fish_metadata a left join keypoint_annotations b
    on a.left_url = b.left_image_url 
    where b.keypoints is not null and b.is_qa = false;
"""
df = rds_access_utils.extract_from_database(query)

<h1> Append world keypoints to the data </h1>
<h3> Ideally, this data should already live directly in the database </h3>

In [None]:
def get_world_keypoints(row):
    """Convert one row's left/right crop keypoints to world keypoints.

    Returns whatever ``pixel2world`` produces for the row's ``leftCrop`` and
    ``rightCrop`` keypoints together with its camera metadata, or ``None``
    when either crop is absent from ``row.keypoints``.
    """
    annotation = row.keypoints
    has_both_crops = 'leftCrop' in annotation and 'rightCrop' in annotation
    if not has_both_crops:
        return None
    return pixel2world(annotation['leftCrop'], annotation['rightCrop'], row.camera_metadata)
    
# Compute world keypoints row by row; rows missing either crop come back as None.
df['world_keypoints'] = df.apply(get_world_keypoints, axis=1)

# Keep only the rows for which world keypoints are available.
df = df[df.world_keypoints.notnull()]

<h1> Add weight prediction to data </h1>

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files from a trusted source.
# The context manager closes the file handle as soon as the model has been read.
with open('/root/alok/repos/cv_algorithms/biomass-production/src/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# helper function from cv_algorithms
def coord2biomass(world_keypoints, model):
    """Predict biomass from world keypoints.

    The feature vector is the concatenation of
      1. every pairwise distance between the body parts listed in
         ``model['body_parts']`` (pairs ``i < j``, in that list's order),
      2. every product of two of those distances (``i <= j``), and
      3. every product of three of those distances (``i <= j <= k``).
    It is standardised with the model's mean/std, projected onto the model's
    PCA components and fed through the linear regression.

    Args:
        world_keypoints: mapping from body-part name to a point accepted by
            ``euclidean_distance``.
        model: dict with the keys ``mean``, ``std``, ``PCA_components``,
            ``reg_coef``, ``reg_intercept`` and ``body_parts``.

    Returns:
        ``dot(X_transformed, reg_coef) + reg_intercept``.

    Raises:
        KeyError: if ``model`` lacks one of the keys above, or
            ``world_keypoints`` lacks one of the model's body parts.
    """
    mean = model['mean']
    std = model['std']
    pca_components = model['PCA_components']
    reg_coef = model['reg_coef']
    reg_intercept = model['reg_intercept']
    body_parts = model['body_parts']

    # Feature order must match the order the model was fitted with, so the
    # pairs are enumerated exactly in body_parts order (i < j).
    pairwise_distances = [
        euclidean_distance(world_keypoints[part_a], world_keypoints[part_b])
        for part_a, part_b in combinations(body_parts, 2)
    ]
    num_distances = len(pairwise_distances)

    # Degree-2 interaction terms: d_i * d_j for i <= j.
    interaction_values_quadratic = [
        pairwise_distances[i] * pairwise_distances[j]
        for i in range(num_distances)
        for j in range(i, num_distances)
    ]

    # Degree-3 interaction terms: d_i * d_j * d_k for i <= j <= k.
    interaction_values_cubic = [
        pairwise_distances[i] * pairwise_distances[j] * pairwise_distances[k]
        for i in range(num_distances)
        for j in range(i, num_distances)
        for k in range(j, num_distances)
    ]

    X = np.array(pairwise_distances + interaction_values_quadratic + interaction_values_cubic)

    X_normalized = (X - mean) / std
    X_transformed = np.dot(X_normalized, pca_components.T)
    return np.dot(X_transformed, reg_coef) + reg_intercept



<h1> Create "good" class of data </h1>

In [None]:
# Split by fish id (not by row) so that all rows of a given fish land in the
# same partition.
train_pct = 0.5
fish_ids = list(df.fish_id.unique())
# Shuffle with a dedicated, seeded RNG: the split is then reproducible across
# kernel restarts and the global `random` state is left untouched.
random.Random(0).shuffle(fish_ids)
N = len(fish_ids)
num_train_fish = int(train_pct * N)
train_fish_ids = fish_ids[:num_train_fish]
test_fish_ids = fish_ids[num_train_fish:]

In [None]:
# Fix one sorted body-part order, taken from the keys of the first row's world keypoints.
body_parts = sorted(df.world_keypoints.iloc[0].keys())
def generate_X_y(df):
    """Build the "good" (label 1) feature and label arrays from ``df``.

    Each row with truthy ``world_keypoints`` is normalised via
    ``_normalize_world_keypoints`` and laid out in ``body_parts`` order.
    Rows whose resulting coordinates contain any NaN are dropped.

    Returns:
        ``(X, y)`` as numpy arrays; ``y`` holds a 1 for every kept row.
    """
    samples = []
    for world_keypoints in df['world_keypoints']:
        if not world_keypoints:
            continue
        normalized = _normalize_world_keypoints(world_keypoints)
        coords = [normalized[part] for part in body_parts]
        if np.isnan(np.array(coords)).any():
            # skip samples with undefined coordinates
            continue
        samples.append(coords)

    X = np.array(samples)
    y = np.array([1] * len(samples))
    return X, y
    

# Rows that belong to training fish; every other row goes to the test set.
train_mask = df['fish_id'].isin(train_fish_ids)

# "Good"-label features/labels for the train and test partitions.
X_train_good, y_train_good = generate_X_y(df.loc[train_mask])
X_test_good, y_test_good = generate_X_y(df.loc[~train_mask])



<h1> Create "bad" class of data </h1>

In [None]:
def generate_bad_X_y(df, pct_error_threshold_gt=0.4, pct_error_threshold_original=0.2,
                     jitters=(10, 20, 50, 100), bad_location_counts=(1, 2, 3), seed=0):
    """Synthesise "bad" (label 0) samples by jittering annotated keypoints.

    For every row and every (jitter, number-of-bad-locations) combination, a
    perturbed copy of the row's keypoints is built: the ``xFrame`` of each
    selected body part is shifted by zero-mean Gaussian noise with standard
    deviation ``jitter``, drawn independently for each keypoint in both the
    left and the right crop. The perturbed keypoints are converted with
    ``pixel2world`` and weighed with ``coord2biomass``.

    A perturbed sample is kept when either
      * its absolute percentage error against the ground-truth weight exceeds
        ``pct_error_threshold_gt``, or
      * its absolute percentage error exceeds that of the un-jittered
        estimate by more than ``pct_error_threshold_original``,
    and its normalised coordinates contain no NaN.

    Args:
        df: frame with ``weight``, ``keypoints``, ``world_keypoints`` and
            ``camera_metadata`` columns.
        pct_error_threshold_gt: threshold for the first criterion. The default
            0.4 is the value the comparison previously hard-coded.
        pct_error_threshold_original: threshold for the second criterion
            (default 0.2, likewise the previously hard-coded value).
        jitters: Gaussian standard deviations to try.
        bad_location_counts: numbers of distinct body parts to corrupt.
        seed: seed passed to ``np.random.seed`` so each call is reproducible.

    Returns:
        ``(X, y)`` as numpy arrays; ``y`` holds a 0 for every kept sample.
    """
    np.random.seed(seed)
    X, y = [], []
    for row_count, (_, row) in enumerate(df.iterrows()):
        gt_weight = row.weight
        keypoints = row.keypoints
        original_weight = coord2biomass(row.world_keypoints, model)
        # Does not depend on the jitter, so compute it once per row.
        pct_error_original = (original_weight - gt_weight) / gt_weight

        for jitter in jitters:
            for num_bad_locations in bad_location_counts:
                # replace=False guarantees exactly num_bad_locations distinct
                # body parts; the default (with replacement) could pick the
                # same part twice and corrupt fewer locations than requested.
                jittered_locations = set(
                    np.random.choice(body_parts, num_bad_locations, replace=False).tolist()
                )

                jittered_keypoints = {}
                for key in ('leftCrop', 'rightCrop'):
                    crop_items = []
                    for item in keypoints[key]:
                        # Shallow copy so the annotation stored in df is never mutated.
                        jittered_item = copy(item)
                        if jittered_item['keypointType'] in jittered_locations:
                            jittered_item['xFrame'] += np.random.normal(0, jitter)
                        crop_items.append(jittered_item)
                    jittered_keypoints[key] = crop_items

                jittered_world_keypoints = pixel2world(jittered_keypoints['leftCrop'],
                                                       jittered_keypoints['rightCrop'],
                                                       row.camera_metadata)
                estimated_weight = coord2biomass(jittered_world_keypoints, model)
                pct_error_estimated = (estimated_weight - gt_weight) / gt_weight

                far_from_gt = abs(pct_error_estimated) > pct_error_threshold_gt
                much_worse_than_original = (
                    abs(pct_error_estimated) - abs(pct_error_original) > pct_error_threshold_original
                )
                if not (far_from_gt or much_worse_than_original):
                    continue

                norm_wkps = _normalize_world_keypoints(jittered_world_keypoints)
                keypoints_list = [norm_wkps[bp] for bp in body_parts]
                if not np.isnan(np.array(keypoints_list)).any():
                    X.append(keypoints_list)
                    y.append(0)

        # Lightweight progress indicator: one line every 100 rows.
        if row_count % 100 == 0:
            print(row_count)

    X, y = np.array(X), np.array(y)
    return X, y










In [None]:
# Jittered "bad" samples generated from the training fish only.
X_train_bad, y_train_bad = generate_bad_X_y(df.loc[train_mask])

In [None]:
# Jittered "bad" samples generated from the held-out (non-training) fish.
X_test_bad, y_test_bad = generate_bad_X_y(df.loc[~train_mask])

<h1> Create PyTorch DataLoader from balanced training set </h1>

In [None]:
class KeypointsDataset(Dataset):
    """In-memory dataset pairing keypoint feature arrays with labels.

    Args:
        X: array of samples, indexed along its first axis.
        labels: per-sample labels, same length as ``X``.
        transform: optional callable applied to a sample's raw array before
            it is converted to a tensor. Must return something
            ``np.asarray`` can turn into a numeric array.

    Raises:
        ValueError: if ``X`` and ``labels`` differ in length.
    """

    def __init__(self, X, labels, transform=None):
        if len(X) != len(labels):
            raise ValueError(
                'X and labels must have the same length, got {} and {}'.format(len(X), len(labels))
            )
        self.X = X
        self.labels = labels
        # Stored so it can be applied per sample; it used to be silently dropped.
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as float32 tensors; the label has shape ``(1,)``."""
        x = self.X[idx]
        if self.transform is not None:
            x = self.transform(x)
        y = self.labels[idx]

        return torch.from_numpy(np.asarray(x)).float(), torch.from_numpy(np.array([y])).float()


In [None]:
# Balance the classes: draw (with replacement, numpy's default) as many "bad"
# training samples as there are "good" ones, then stack good + bad together.
np.random.seed(0)
train_size_good = len(X_train_good)
random_choices = np.random.choice(X_train_bad.shape[0], size=train_size_good)
X_train = np.vstack((X_train_good, X_train_bad[random_choices]))
y_train = np.hstack((y_train_good, y_train_bad[random_choices]))

In [None]:
# Wrap the balanced training arrays in a torch Dataset.
dataset = KeypointsDataset(X=X_train, labels=y_train)

In [None]:
# Same balancing for the held-out data: as many "bad" test samples (drawn with
# replacement, numpy's default) as there are "good" ones.
np.random.seed(0)
test_size_good = len(X_test_good)
random_choices = np.random.choice(X_test_bad.shape[0], size=test_size_good)
X_test = np.vstack((X_test_good, X_test_bad[random_choices]))
y_test = np.hstack((y_test_good, y_test_bad[random_choices]))

In [None]:
# Validation data: the balanced test arrays, served in shuffled batches of 25.
val_dataset = KeypointsDataset(X=X_test, labels=y_test)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=25, num_workers=20, shuffle=True)

In [None]:
# TODO: Define your network architecture here
import torch
from torch import nn

class Network(nn.Module):
    """Small fully connected binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: number of features per sample after flattening
            (default 33, the size that was previously hard-coded).
        hidden_dim: width of the hidden layer (default 16).

    The attribute names ``fc1``/``fc2`` are unchanged, so state dicts saved
    from the default configuration still load.
    """

    def __init__(self, input_dim=33, hidden_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch ``x`` (first axis = batch) to per-sample outputs in [0, 1], shape ``(batch, 1)``."""
        # Flatten everything but the batch axis. reshape (unlike view) also
        # accepts non-contiguous inputs.
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x
        



In [None]:
# Training data: the balanced training arrays, served in shuffled batches of 25.
dataset = KeypointsDataset(X=X_train, labels=y_train)
dataloader = DataLoader(dataset=dataset, batch_size=25, num_workers=20, shuffle=True)


In [None]:
# Seed PyTorch so that weight initialisation and batch shuffling are
# reproducible from a fresh kernel.
torch.manual_seed(0)

network = Network()
epochs = 500
optimizer = torch.optim.SGD(network.parameters(), lr=0.01)
criterion = torch.nn.BCELoss()

for epoch in range(epochs):
    # ---- training pass ----
    network.train()
    running_loss = 0.0
    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()
        y_pred = network(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # ---- validation pass (no gradients needed) ----
    network.eval()
    num_correct, num_seen = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_dataloader:
            y_pred = network(X_batch)
            # A prediction is correct when the rounded output equals the label.
            matches = y_pred.round() == y_batch.view(y_pred.shape)
            num_correct += int(matches.sum().item())
            num_seen += matches.numel()
    # Guard against an empty validation loader instead of dividing by zero.
    accuracy = num_correct / num_seen if num_seen else float('nan')

    loss_for_epoch = running_loss / len(dataloader)
    print('Loss for epoch {}: {}'.format(epoch, loss_for_epoch))
    print('Validation accuracy for epoch {}: {}'.format(epoch, accuracy))





In [None]:
# Persist the trained parameters (the state dict) to disk.
torch.save(obj=network.state_dict(), f='/root/data/alok/biomass_estimation/playground/filter_nn_model.pth')
