In [None]:
%load_ext autoreload
%autoreload 2

import datetime as dt
import json

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from scipy.stats import norm
import tqdm
import pickle
from itertools import combinations
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.optics import euclidean_distance

from PIL import Image, ImageDraw
from multiprocessing import Pool, Manager
import copy
import uuid
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)





<h1> Instantiate data extraction tools </h1>

In [None]:

# prod research SQL credentials
# The credentials file is read inside a context manager so its handle is closed
# as soon as the JSON has been parsed instead of being left to the garbage collector.
with open(os.environ["PROD_RESEARCH_SQL_CREDENTIALS"]) as credentials_file:
    prod_research_sql_credentials = json.load(credentials_file)
rds_access_utils = RDSAccessUtils(prod_research_sql_credentials)


# Keypoint annotations for pen 48 that carry keypoints and are not QA rows.
sql_query = '''
select * from keypoint_annotations
where pen_id = 48
and keypoints is not NULL
and is_qa=false;
'''

original_df = rds_access_utils.extract_from_database(sql_query)


In [None]:
# Research database credentials; the file handle is closed once the JSON is parsed.
with open(os.environ["SQL_CREDENTIALS"]) as credentials_file:
    research_sql_credentials = json.load(credentials_file)
research_rds_access_utils = RDSAccessUtils(research_sql_credentials)
sql_engine = research_rds_access_utils.sql_engine

# ORM session bound to the research engine; used by the dataset-building loop.
Session = sessionmaker(bind=sql_engine)
session = Session()

# Reflect the existing schema and expose the tables this notebook refers to
# as automapped classes.
Base = automap_base()
Base.prepare(sql_engine, reflect=True)
Enclosure = Base.classes.enclosures
Calibration = Base.classes.calibrations
GtsfDataCollection = Base.classes.gtsf_data_collections
StereoFramePair = Base.classes.stereo_frame_pairs

In [None]:
# AWS credentials
# Read through a context manager so the credentials file is closed right after parsing.
with open(os.environ["AWS_CREDENTIALS"]) as credentials_file:
    aws_credentials = json.load(credentials_file)
s3_client = boto3.client('s3', aws_access_key_id=aws_credentials["aws_access_key_id"],
                         aws_secret_access_key=aws_credentials["aws_secret_access_key"],
                         region_name="eu-west-1")

# Project helper used below to download objects from S3; constructed with '/root/data'.
s3_access_utils = S3AccessUtils('/root/data')

<h1> Helper functions </h1>

In [None]:
import numpy as np
import json
import datetime
from sqlalchemy import create_engine, MetaData, Table, exc, exists, select, literal
import pickle

def euclidean_distance(p1, p2):
    """Return the straight-line distance between two 3D points.

    Only components 0, 1 and 2 of ``p1`` and ``p2`` are read.
    """
    squared_deltas = [(p1[axis] - p2[axis]) ** 2 for axis in range(3)]
    return (squared_deltas[0] + squared_deltas[1] + squared_deltas[2]) ** 0.5


def convert_to_world_point(x, y, d, parameters):
    """Map a pixel location at depth ``d`` to a 3D world point.

    ``parameters`` must provide ``pixelCountHeight``, ``pixelCountWidth``,
    ``imageSensorWidth``, ``imageSensorHeight`` and ``focalLength``.

    Note the axis pairing used here: ``x`` is centred on half of
    ``pixelCountHeight`` and ``y`` on half of ``pixelCountWidth``.

    Returns ``np.array([world_x, world_y, world_z])`` where ``world_y`` is ``d``.
    """
    height_px = parameters["pixelCountHeight"]
    width_px = parameters["pixelCountWidth"]
    sensor_width = parameters["imageSensorWidth"]
    sensor_height = parameters["imageSensorHeight"]
    focal_length = parameters["focalLength"]

    # offsets from the image centre, still in pixels
    offset_x = x - height_px / 2.0
    offset_z = width_px / 2.0 - y

    # pixels -> sensor units (sensor size divided by pixel count)
    on_sensor_x = offset_x * (sensor_height / height_px)
    on_sensor_z = offset_z * (sensor_width / width_px)

    # scale the sensor offsets by depth over focal length; depth is the world y axis
    return np.array([
        (d * on_sensor_x) / focal_length,
        d,
        (d * on_sensor_z) / focal_length,
    ])


def depth_from_disp(disp, parameters):
    """Convert a disparity (scalar or sequence) into depth.

    Computes ``focalLengthPixel * baseline / disp`` using the two entries of
    ``parameters``; ``disp`` is passed through ``np.array`` first, so the
    result is a NumPy value with the same shape as ``disp``.
    """
    numerator = parameters["focalLengthPixel"] * parameters["baseline"]
    return numerator / np.array(disp)


def pixel2world(left_crop, right_crop, parameters):
    """Turn matched left/right keypoints into 3D world coordinates.

    ``left_crop`` and ``right_crop`` are iterables of dicts with the keys
    ``keypointType``, ``xFrame`` and ``yFrame``. For every keypoint in
    ``left_crop`` the disparity is the left ``xFrame`` minus the right
    ``xFrame`` of the keypoint with the same type. A ``KeyError`` is raised
    when the right crop has no keypoint of that type.

    Returns a dict mapping keypoint type to the array produced by
    ``convert_to_world_point``.
    """
    # index both crops by keypoint type -> [xFrame, yFrame]
    left_xy = {kp["keypointType"]: [kp["xFrame"], kp["yFrame"]] for kp in left_crop}
    right_xy = {kp["keypointType"]: [kp["xFrame"], kp["yFrame"]] for kp in right_crop}

    # walk the left crop keypoints and compute one world point per keypoint type
    world_coordinates = {}
    for kp in left_crop:
        name = kp["keypointType"]
        left_x, left_y = left_xy[name]
        disparity = left_x - right_xy[name][0]
        depth = depth_from_disp(disparity, parameters)
        # convert_to_world_point receives yFrame first, then xFrame
        world_coordinates[name] = convert_to_world_point(left_y, left_x, depth, parameters)
    return world_coordinates

In [None]:
df = pd.DataFrame()
capture_f = '/root/data/temp/capture.json'

# Keypoint names, sorted so that index i always refers to the same body part
# in the '<i>-<j>' pairwise-distance column names written below.
body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
    'UPPER_PRECAUDAL_PIT',
    'LOWER_PRECAUDAL_PIT',
    'HYPURAL_PLATE'
])

# Rows are accumulated in a plain list and converted to a DataFrame once after
# the loop: DataFrame.append was removed in pandas 2.0, and appending row by
# row copies the whole frame on every iteration.
df_rows = []

# NOTE(review): presumably resets any failed transaction left on the shared session — confirm.
session.rollback()
for k, (idx, row) in enumerate(original_df.iterrows()):
    # progress indicator: print every 10th row number
    if k % 10 == 0:
        print(k)

    # get gtsf_fish_identifier, ground truth metadata, and weight for this row
    left_crop_url = row.left_image_url
    right_crop_url = row.right_image_url

    # strip the bucket URL prefix to obtain the S3 keys of the crops
    left_crop_key = left_crop_url.replace('https://s3-eu-west-1.amazonaws.com/aquabyte-crops/', '')
    right_crop_key = right_crop_url.replace('https://s3-eu-west-1.amazonaws.com/aquabyte-crops/', '')

    # capture.json and the full frames are looked up under the left crop's key
    # prefix, but in the frames bucket
    crop_key_dir = os.path.dirname(left_crop_key)
    capture_key = os.path.join(crop_key_dir, 'capture.json')
    left_image_key = os.path.join(crop_key_dir, 'left_frame.jpg')
    right_image_key = os.path.join(crop_key_dir, 'right_frame.jpg')
    image_bucket = 'aquabyte-frames-resized-inbound'
    s3_access_utils.download_from_s3(image_bucket, capture_key, capture_f)
    # close the downloaded file right after parsing; it is re-read on every iteration
    with open(capture_f) as capture_file:
        capture_info = json.load(capture_file)

    gtsf_fish_identifier = capture_info['gtsf_fish_identifier']
    # [0] raises IndexError when no data-collection row matches the identifier
    gtsf_data_collection = session.query(GtsfDataCollection).filter(GtsfDataCollection.gtsf_fish_identifier == gtsf_fish_identifier).all()[0]
    ground_truth_metadata = json.loads(gtsf_data_collection.ground_truth_metadata)
    weight, length, kfactor = None, None, None
    if 'data' in ground_truth_metadata.keys():
        keys = ground_truth_metadata['data'].keys()
        # the metadata uses either 'weight'/'length' or 'weightKgs'/'lengthMms'
        if 'weight' in keys or 'weightKgs' in keys:
            weightKey = 'weight' if 'weight' in keys else 'weightKgs'
            lengthKey = 'length' if 'length' in keys else 'lengthMms'
            weight = ground_truth_metadata['data'][weightKey]
            length = ground_truth_metadata['data'][lengthKey]
            kfactor = (weight / length**3) * 1e5
    if not weight:
        # rows without a (non-zero) recorded weight cannot be used as ground truth
        print('No weight recorded for GTSF fish identifier: {}'.format(gtsf_fish_identifier))
        continue

    # get left, right, and world keypoints
    keypoints = row.keypoints
    if 'leftCrop' not in keypoints or 'rightCrop' not in keypoints:
        continue

    keypoint_world_coordinates = pixel2world(keypoints['leftCrop'], keypoints['rightCrop'], row.camera_metadata)

    # one feature per unordered pair of body parts, named by their indices in body_parts
    df_row = {}
    for i in range(len(body_parts)-1):
        for j in range(i+1, len(body_parts)):
            d = euclidean_distance(keypoint_world_coordinates[body_parts[i]],
                                   keypoint_world_coordinates[body_parts[j]])
            df_row['{0}-{1}'.format(i, j)] = d

    df_row['weight'] = weight
    df_row['length'] = length
    df_row['kfactor'] = kfactor
    df_row['captured_at'] = row.captured_at
    df_row['gtsf_fish_identifier'] = gtsf_fish_identifier
    df_row['keypoints']= row.keypoints
    df_row['keypoint_world_coordinates'] = keypoint_world_coordinates
    df_row['left_image_key'] = left_image_key
    df_row['right_image_key'] = right_image_key
    df_row['left_crop_key'] = left_crop_key
    df_row['right_crop_key'] = right_crop_key
    df_row['image_bucket'] = image_bucket
    df_row['crop_bucket'] = 'aquabyte-crops'
    df_rows.append(df_row)

# build the frame in one go; an empty list yields an empty DataFrame
df = pd.DataFrame(df_rows)
    

In [None]:
# Recorded weight of the first row belonging to fish '190301010003'.
df.loc[df.gtsf_fish_identifier == '190301010003', 'weight'].iloc[0]

In [None]:
# Display the capture timestamps collected for each row.
df['captured_at']

In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Build a boolean train/test mask with one entry per row of ``df``.

    Args:
        df: DataFrame whose row count determines the mask length.
        train_frac: fraction of rows flagged ``True``; the count is
            ``int(train_frac * n_rows)``, i.e. truncated.
        randomize: when ``True`` (default) the ``True`` entries are shuffled
            into random positions using NumPy's global random state; when
            ``False`` the first ``int(train_frac * n_rows)`` entries are
            ``True`` and the rest ``False``.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``bool``.
    """
    n_rows = df.shape[0]
    mask = np.zeros(n_rows, dtype=bool)
    mask[:int(train_frac * n_rows)] = True
    # honour the flag: previously the mask was shuffled unconditionally
    if randomize:
        np.random.shuffle(mask)
    return mask
    


In [None]:
# define all features

# Body parts the model is allowed to use.
body_parts_subset = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE'
])

# Positions of the subset inside the full body_parts list; the pairwise
# distance columns of df are named '<i>-<j>' after those positions.
body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]

pairwise_distance_columns = [
    '{0}-{1}'.format(first, second)
    for first, second in combinations(body_part_indices, 2)
]
n_pairwise = len(pairwise_distance_columns)

# Second-order terms: every product of two pairwise distances (squares included).
interaction_columns_quadratic = []
for first_pos, first_col in enumerate(pairwise_distance_columns):
    for second_col in pairwise_distance_columns[first_pos:]:
        quadratic_name = '{},{}'.format(first_col, second_col)
        df[quadratic_name] = df[first_col] * df[second_col]
        interaction_columns_quadratic.append(quadratic_name)

# Third-order terms: every product of three pairwise distances (repeats allowed).
interaction_columns_cubic = []
for first_pos, first_col in enumerate(pairwise_distance_columns):
    for second_pos in range(first_pos, n_pairwise):
        second_col = pairwise_distance_columns[second_pos]
        for third_col in pairwise_distance_columns[second_pos:]:
            cubic_name = '{},{},{}'.format(first_col, second_col, third_col)
            df[cubic_name] = df[first_col] * df[second_col] * df[third_col]
            interaction_columns_cubic.append(cubic_name)
            


In [None]:
# Fit the biomass model: standardise the features, reduce them with PCA, then
# run a linear regression of weight on the retained components.
np.random.seed(0)

# train_frac=1.0 flags every row as train; the epoch filter then moves rows
# whose epoch is absent from features_df into the test split (~mask).
# NOTE(review): features_df is only loaded in a later cell and no visible cell
# creates df.epoch, so this cell depends on out-of-order execution — confirm.
mask = generate_train_mask(df, train_frac=1.0)
mask = mask & (df.epoch.isin(features_df.epoch))
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

X_train = df.loc[mask, columns].values
print(X_train.sum())
y_train = df.loc[mask, 'weight'].values
X_test = df.loc[~mask, columns].values
y_test = df.loc[~mask, 'weight'].values

# standardise using statistics from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)

# first pass: full PCA to find how many components are needed;
# idx is the first component index whose cumulative explained variance exceeds 0.999999
pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
pca.fit(X_train_normalized)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
idx = np.where(explained_variance_ratio > 0.999999)[0][0]
# idx = 4
print(idx)

# second pass: refit PCA keeping idx+1 components and project train and test rows
pca = PCA(n_components=idx+1)
pca.fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

reg = LinearRegression().fit(X_train_transformed, y_train)
print(reg.score(X_test_transformed, y_test))

# predict for every row of df (train and test) and store the error columns
y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
df['prediction'] = y_pred
df['error'] = df.prediction - df.weight
df['error_pct'] = df.error / df.weight
df['abs_error_pct'] = df.error_pct.abs()

# everything needed to reproduce the prediction outside this notebook
model = {
    'mean': scaler.mean_,
    'std': scaler.scale_,
    'PCA_components': pca.components_,
    'reg_coef': reg.coef_,
    'reg_intercept': reg.intercept_,
    'body_parts': body_parts_subset   
}



<h1> Save model to disk </h1>

In [None]:
# Persist the fitted model parameters. The context manager guarantees the file
# is flushed and closed once the dump finishes.
with open('/root/data/models/biomass/20190722_bati_post_axiom_calibration.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Hand the train mask, predictions and recorded weights to the project's
# accuracy-metrics helper.
amg = AccuracyMetricsGenerator(mask.values, df.prediction.values, df.weight.values)

In [None]:
# NOTE(review): presumably plots predictions against recorded weights; the implementation lives in aquabyte.accuracy_metrics.
amg.plot_predictions_vs_ground_truth()

In [None]:
# NOTE(review): presumably reports accuracy metrics separately for the train and test rows; confirm against aquabyte.accuracy_metrics.
amg.display_train_test_accuracy_metrics()

<h1> Cross validation study </h1>

In [None]:
# Repeated random-split study: refit the scaler/PCA/regression pipeline on N
# random 80/20 splits and collect the out-of-sample biomass error of each.
# Note: this loop overwrites the globals mask, scaler, pca, reg, model and the
# prediction/error columns of df with the values from the last split.
N = 100
biomass_error_pcts = []
# the feature list is the same for every split, so build it once
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic
for i in range(N):
    print(i)
    # NOTE(review): no visible cell defines df.epoch, and features_df is loaded
    # further down — confirm the execution order this relies on.
    mask = generate_train_mask(df, train_frac=0.8)
    mask = mask & (df.epoch.isin(features_df.epoch))

    X_train = df.loc[mask, columns].values
    y_train = df.loc[mask, 'weight'].values
    X_test = df.loc[~mask, columns].values
    y_test = df.loc[~mask, 'weight'].values

    # standardise with training statistics only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_normalized = scaler.transform(X_train)

    # full PCA to find the first component index whose cumulative explained
    # variance exceeds 0.999999, then refit keeping idx+1 components
    pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
    pca.fit(X_train_normalized)
    explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
    idx = np.where(explained_variance_ratio > 0.999999)[0][0]

    pca = PCA(n_components=idx+1)
    pca.fit(X_train_normalized)
    X_train_transformed = pca.transform(X_train_normalized)
    X_test_normalized = scaler.transform(X_test)
    X_test_transformed = pca.transform(X_test_normalized)

    reg = LinearRegression().fit(X_train_transformed, y_train)

    # predict for every row so the metrics helper can split by mask
    y_pred = reg.predict(pca.transform(scaler.transform(df[columns].values)))
    df['prediction'] = y_pred
    df['error'] = df.prediction - df.weight
    df['error_pct'] = df.error / df.weight
    df['abs_error_pct'] = df.error_pct.abs()

    # The features are built from body_parts_subset, so that is the list the
    # model must record (the training cell does the same); the full body_parts
    # list would not match the fitted mean/std/components.
    model = {
        'mean': scaler.mean_,
        'std': scaler.scale_,
        'PCA_components': pca.components_,
        'reg_coef': reg.coef_,
        'reg_intercept': reg.intercept_,
        'body_parts': body_parts_subset
    }

    amg = AccuracyMetricsGenerator(mask.values, df.prediction.values, df.weight.values)
    accuracy_metrics = amg.generate_train_test_accuracy_metrics()
    biomass_error_pct = accuracy_metrics['test']['biomass_error_pct']
    biomass_error_pcts.append(biomass_error_pct)
    



In [None]:
# Empirical CDF of the absolute out-of-sample errors gathered by the
# cross-validation loop: sorted errors on the y axis, cumulative share on the x axis.
data_sorted = sorted(abs(x) for x in biomass_error_pcts)
p = 1.0 * np.arange(len(data_sorted)) / (len(data_sorted) - 1)

fig = plt.figure(figsize=(30, 7))
ax1 = fig.add_subplot(121)
ax1.plot(p, data_sorted)
ax1.set_xlabel('p')
ax1.set_ylabel('OOS error percentage')
ax1.axvline(x=0.95, linestyle='--', color='red', label='p = 0.95')
# NOTE(review): the title hard-codes 250 while the cross-validation cell sets N = 100 — confirm which figure is meant.
ax1.set_title('CDF of OOS errors (sample size = 250)')
ax1.legend()
ax1.grid()



In [None]:
# Feature and weight values of the df rows whose epoch also appears in features_df.
# .loc replaces .ix, which was removed in pandas 1.0; with a boolean row mask
# and a list of column labels the two select the same data.
X = df.loc[(df.epoch.isin(features_df.epoch)), columns + ['weight']].values

In [None]:
# Same columns taken from features_df, restricted to pen 48 and to epochs present in df.
# .loc replaces .ix, which was removed in pandas 1.0.
Y = features_df.loc[(features_df.epoch.isin(df.epoch)) & (features_df.pen_id == 48), columns + ['weight']].values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import cv2

import glob
import os
import boto3
import tempfile
from sqlalchemy import create_engine, MetaData, Table, select, and_, func
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.automap import automap_base
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from scipy.stats import norm
import tqdm
import pickle
from itertools import combinations
from aquabyte.optics import euclidean_distance

from PIL import Image, ImageDraw
from multiprocessing import Pool, Manager
import copy
import uuid
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', 500)





In [None]:
# Distribution of predictions below 7000 for one fish, with a red reference line at 5571.
fish_rows = (df.gtsf_fish_identifier == '190607010041_bolaks-mjanes') & (df.prediction < 7000)
plt.hist(df.loc[fish_rows, 'prediction'])
plt.axvline(5571, color='red')
plt.show()

In [None]:
# Relative deviation of this fish's mean prediction from the reference value 5571.
(df.loc[df.gtsf_fish_identifier == '190607010041_bolaks-mjanes', 'prediction'].mean() - 5571) / 5571

In [None]:
# Standard deviation of the prediction error, relative to the mean recorded weight.
(df['prediction'] - df['weight']).std() / df['weight'].mean()

<h1> Get all epochs for this dataset </h1>

In [None]:
# Convert every captured_at value of original_df into integer milliseconds
# since 1970-01-01.
# NOTE(review): `epochs` is not attached to any DataFrame in the visible cells,
# although other cells read df.epoch / original_df.epoch — confirm where the
# column is assigned.

# Loop-invariant values are built once. datetime(1970, 1, 1) is the same naive
# instant that utcfromtimestamp(0) returns, without the call deprecated in Python 3.12.
unix_epoch = dt.datetime(1970, 1, 1)
fractional_pattern = '%Y-%m-%d %H:%M:%S.%f+00:00'
whole_second_pattern = '%Y-%m-%d %H:%M:%S+00:00'

epochs = []
# only one column is needed, so iterate over it directly instead of iterrows()
for captured_at in original_df.captured_at:
    captured_at = str(captured_at)
    try:
        timestamp = dt.datetime.strptime(captured_at, fractional_pattern)
    except ValueError:
        # timestamps without fractional seconds are printed without the '.%f' part
        timestamp = dt.datetime.strptime(captured_at, whole_second_pattern)

    epoch = int((timestamp - unix_epoch).total_seconds() * 1000.0)
    epochs.append(epoch)

In [None]:
# Load the feature table stored under the key 'table' of the HDF5 file.
features_df = pd.read_hdf('/root/data/temp/features_df.h5', key='table')

In [None]:
# Display the capture timestamps of df.
df['captured_at']

In [None]:
# Store the raw captured_at values as a plain Python list in a new column.
# NOTE(review): presumably captured_at is a datetime column, in which case
# .values.tolist() yields integer nanoseconds (hence the name) — confirm the dtype.
df['nanoepoch'] = df.captured_at.values.tolist()

In [None]:
# Same conversion for features_df.
# NOTE(review): the resulting values depend on the dtype of captured_at — confirm it matches df's.
features_df['nanoepoch'] = features_df.captured_at.values.tolist()

In [None]:
# Display the raw captured_at values of features_df as a Python list.
features_df['captured_at'].values.tolist()

In [None]:
# Shape of the df rows whose epoch appears both in features_df and among the non-QA rows of original_df.
in_features = df.epoch.isin(features_df.epoch)
in_non_qa = df.epoch.isin(original_df[original_df.is_qa == False].epoch)
df[in_features & in_non_qa].shape

In [None]:
# Shape of the features_df rows for pen 48.
features_df.loc[features_df.pen_id == 48].shape

In [None]:
# For every epoch shared with features_df that occurs exactly once in df,
# print the difference between the weight recorded in df and in features_df.
m = (df.epoch.isin(features_df.epoch))
for epoch in df[m].epoch.unique():
    # epochs that map to several df rows are ambiguous, so skip them
    if df[df.epoch==epoch].shape[0] > 1:
        continue

    # .loc replaces .ix, which was removed in pandas 1.0
    a = df.loc[df.epoch==epoch, 'weight'].values.sum()
    b = features_df.loc[features_df.epoch==epoch, 'weight'].values.sum()
    print(a - b)

In [None]:
# Shape of the df rows whose epoch also appears in features_df.
df.loc[df.epoch.isin(features_df.epoch)].shape