In [None]:
import json, os
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from wpca import WPCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world, convert_to_world_point
from aquabyte.visualize import Visualizer, _normalize_world_keypoints
import random
from scipy.stats import norm
from copy import copy


import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

# Let pandas display up to 500 rows of a DataFrame before truncating the output.
pd.set_option('display.max_rows', 500)

<h1> Extract base data from database </h1>

In [None]:
# Load the database credentials from the JSON file whose path is stored in the
# PROD_RESEARCH_SQL_CREDENTIALS environment variable. The file is opened in a
# `with` block so the handle is closed deterministically instead of leaking.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# Fish metadata joined to keypoint annotations on the left image URL. The WHERE
# clause keeps only rows that have keypoints and are not QA annotations, so the
# LEFT JOIN effectively behaves like an inner join here.
query = """
    select * from research.fish_metadata a left join keypoint_annotations b
    on a.left_url = b.left_image_url 
    where b.keypoints is not null and b.is_qa = false;
"""
df = rds_access_utils.extract_from_database(query)

<h1> Append world keypoints to the data </h1>
<h3> Ideally, this data should already live directly in the database </h3>

In [None]:
def get_world_keypoints(row):
    """Convert one row's left/right crop keypoints to world keypoints.

    Args:
        row: Object exposing ``keypoints`` (a container that may hold the
            ``leftCrop`` and ``rightCrop`` entries) and ``camera_metadata``.

    Returns:
        The value returned by ``pixel2world`` for the two crops and the camera
        metadata, or ``None`` when either crop is missing from ``row.keypoints``.
    """
    keypoints = row.keypoints
    has_both_crops = 'leftCrop' in keypoints and 'rightCrop' in keypoints
    if not has_both_crops:
        return None
    return pixel2world(keypoints['leftCrop'], keypoints['rightCrop'], row.camera_metadata)
    
# Compute the world keypoints row by row (axis=1 passes each row to the helper).
df['world_keypoints'] = df.apply(get_world_keypoints, axis=1)

# Keep only the rows for which world keypoints are available.
df = df[df.world_keypoints.notnull()]

<h1> Create base dataset for PyTorch DataLoader </h1>

In [None]:
# Split at the fish level (not the row level) so that every row belonging to a
# given fish_id ends up on the same side of the train/test split.
train_pct = 0.5
split_seed = 0  # fixed seed so Restart & Run All reproduces the same split
fish_ids = list(df.fish_id.unique())
# Shuffle with a dedicated seeded generator: the split is deterministic and the
# global `random` state is left untouched.
random.Random(split_seed).shuffle(fish_ids)
N = len(fish_ids)
n_train = int(train_pct * N)  # number of fish assigned to the training set
train_fish_ids = fish_ids[:n_train]
test_fish_ids = fish_ids[n_train:]

In [None]:
# Sorted body-part names, taken from the keys of the first row's world keypoints.
body_parts = sorted(df.world_keypoints.iloc[0].keys())
def generate_X_y(df):
    """Build feature and label arrays from a DataFrame of annotated fish.

    For every row with non-empty ``world_keypoints`` and a non-null ``weight``,
    the keypoints are passed through ``_normalize_world_keypoints`` and the
    entries for each name in the module-level ``body_parts`` list (except
    ``HYPURAL_PLATE``) are collected in ``body_parts`` order. Rows whose
    collected keypoints contain NaN are dropped.

    Args:
        df: DataFrame with ``world_keypoints`` and ``weight`` columns.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds one keypoint list per
        retained row and ``y`` the matching weights.
    """
    X, y = [], []
    for _, row in df.iterrows():
        world_keypoints = row.world_keypoints
        # A missing label would turn the label mean/std, and through them every
        # normalized target and the training loss, into NaN, so skip it early.
        if not world_keypoints or pd.isnull(row.weight):
            continue

        norm_wkps = _normalize_world_keypoints(world_keypoints)
        keypoints_list = [norm_wkps[bp] for bp in body_parts if bp != 'HYPURAL_PLATE']
        if not np.isnan(np.array(keypoints_list)).any():
            X.append(keypoints_list)
            y.append(row.weight)

    return np.array(X), np.array(y)
    

# Build the train and test feature/label arrays. Rows are assigned by fish_id,
# so the two sets never share a fish.
train_mask = df.fish_id.isin(train_fish_ids)
X_train, y_train = generate_X_y(df[train_mask])
X_test, y_test = generate_X_y(df[~train_mask])

# Standardize features and labels using statistics of the training set only,
# so nothing about the test set leaks into the preprocessing.
u_X = X_train.mean(axis=0)
sigma_X = X_train.std(axis=0)
# A feature that is constant across the training set has zero std; dividing by
# it would produce NaN/inf. Use a unit scale for such features instead (the
# same convention sklearn's StandardScaler follows).
sigma_X = np.where(sigma_X == 0, 1.0, sigma_X)
X_train = (X_train - u_X) / sigma_X
X_test = (X_test - u_X) / sigma_X

u_y = y_train.mean()
sigma_y = y_train.std()
y_train = (y_train - u_y) / sigma_y
y_test = (y_test - u_y) / sigma_y


In [None]:
# Display whether PyTorch's cuDNN backend is enabled.
torch.backends.cudnn.enabled

In [None]:
# Display whether PyTorch reports CUDA as available.
torch.cuda.is_available()

<h1> Create PyTorch DataLoader from balanced training set </h1>

In [None]:
class KeypointsDataset(Dataset):
    """Dataset pairing keypoint feature arrays with their labels.

    Args:
        X: Array of samples, indexed along its first axis.
        labels: Sequence of labels, indexed in parallel with ``X``.
        transform: Accepted for interface compatibility; currently unused.
    """

    def __init__(self, X, labels, transform=None):
        self.X, self.labels = X, labels

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx`` as float tensors.

        The label is wrapped in a one-element array so that it comes back as
        a tensor of shape ``(1,)``.
        """
        features = torch.from_numpy(self.X[idx]).float()
        target = torch.from_numpy(np.array([self.labels[idx]])).float()
        return features, target


In [None]:
# Wrap the normalized training arrays in a Dataset and serve them as shuffled
# mini-batches of five samples, loaded by a single worker process.
dataset = KeypointsDataset(X=X_train, labels=y_train)
dataloader = DataLoader(
    dataset,
    batch_size=5,
    shuffle=True,
    num_workers=1,
)
# A validation loader is not wired up yet:
# val_dataset = KeypointsDataset(X_test[:5], y_test[:5])
# val_dataloader = DataLoader(val_dataset, batch_size=5, shuffle=True, num_workers=20)

In [None]:
# Network architecture: fully connected regressor over the flattened keypoints
import torch
from torch import nn

class Network(nn.Module):
    """Fully connected regressor mapping flattened inputs to a single value.

    Three hidden layers (256, 128 and 64 units), each followed by ReLU, feed a
    linear output layer with one unit.

    Args:
        input_dim: Number of features per sample after flattening. Defaults to
            30, the width that was previously hard-coded, so ``Network()``
            builds the same architecture as before.
    """

    def __init__(self, input_dim=30):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten each sample in the batch and return a ``(batch, 1)`` tensor."""
        # reshape (unlike view) also accepts non-contiguous inputs, copying
        # only when it has to.
        x = x.reshape(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return self.output(x)
        



In [None]:
# Training setup: Adam (lr=0.001) minimizing the mean squared error between the
# network output and the normalized labels.
network = Network()
epochs = 500
optimizer = torch.optim.Adam(network.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

for epoch in range(epochs):
    # Sum the per-batch losses so the epoch mean can be reported below.
    running_loss = 0.0
    for X_batch, y_batch in dataloader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        y_pred = network(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # TODO: add a validation pass (under torch.no_grad()) once a validation
    # DataLoader exists. An earlier sketch compared rounded predictions with
    # the labels for an "accuracy"; a regression metric is needed instead.

    loss_for_epoch = running_loss / len(dataloader)
    print('Loss for epoch {}: {}'.format(epoch, loss_for_epoch))





In [None]:
# Predict on the test set and undo the label standardization so predictions and
# ground truth are both on the original weight scale.
with torch.no_grad():  # inference only: no autograd graph is needed
    predictions = (network(torch.from_numpy(X_test).float()).numpy() * sigma_y) + u_y
gt = (y_test * sigma_y) + u_y

# Predicted vs. ground-truth weight; points on the diagonal are exact predictions.
plt.figure(figsize=(20, 10))
plt.scatter(gt, predictions)
plt.xlim([0, 10000])
plt.ylim([0, 10000])
plt.xlabel('Ground truth weight')
plt.ylabel('Predicted weight')
plt.title('Test set: predicted vs. ground truth weight')
plt.grid()
plt.show()

In [None]:
# Predict on the training set and undo the label standardization so predictions
# and ground truth are both on the original weight scale.
with torch.no_grad():  # inference only: no autograd graph is needed
    predictions = (network(torch.from_numpy(X_train).float()).numpy() * sigma_y) + u_y
gt = (y_train * sigma_y) + u_y

# Predicted vs. ground-truth weight; points on the diagonal are exact predictions.
plt.figure(figsize=(20, 10))
plt.scatter(gt, predictions)
plt.xlim([0, 10000])
plt.ylim([0, 10000])
plt.xlabel('Ground truth weight')
plt.ylabel('Predicted weight')
plt.title('Training set: predicted vs. ground truth weight')
plt.grid()
plt.show()

In [None]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Print TensorFlow's local devices and return the names of the GPU ones.

    Returns:
        List with the ``name`` of every local device whose ``device_type``
        is ``'GPU'``; empty when there is none.
    """
    devices = device_lib.list_local_devices()
    print(devices)
    gpu_names = []
    for device in devices:
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Print the names of the GPU devices reported by TensorFlow.
print(get_available_gpus())

In [None]:
# Display again whether PyTorch reports CUDA as available.
torch.cuda.is_available()