In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.decomposition import PCA
from wpca import WPCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer, _normalize_world_keypoints
import random
import pickle
from scipy.stats import norm

import matplotlib.cm as cm
pd.set_option('display.max_rows', 500)

<h1> Extract base data from database </h1>

In [None]:
# Read the database credentials from the JSON file named by the
# PROD_RESEARCH_SQL_CREDENTIALS environment variable. The context manager
# closes the file handle as soon as the JSON has been parsed.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

# Fish metadata joined to keypoint annotations on the left image URL. The
# WHERE clause keeps only rows that have keypoints and are not QA annotations,
# so metadata rows without a matching annotation are dropped.
query = """
    select * from research.fish_metadata a left join keypoint_annotations b
    on a.left_url = b.left_image_url 
    where b.keypoints is not null and b.is_qa = false;
"""
df = rds_access_utils.extract_from_database(query)

<h1> Append world keypoints to the data </h1>
<h3> Ideally, this data should already live directly in the database </h3>

In [None]:
def get_world_keypoints(row):
    """Convert a row's crop keypoints to world keypoints via pixel2world.

    Args:
        row: object exposing `keypoints` (a container supporting `in` and
            item access) and `camera_metadata`.

    Returns:
        The value of pixel2world(leftCrop, rightCrop, camera_metadata) when
        both 'leftCrop' and 'rightCrop' are present in `row.keypoints`,
        otherwise None.
    """
    keypoints = row.keypoints
    has_both_crops = 'leftCrop' in keypoints and 'rightCrop' in keypoints
    if not has_both_crops:
        return None
    return pixel2world(keypoints['leftCrop'], keypoints['rightCrop'], row.camera_metadata)
    
# Row-wise: store the result of get_world_keypoints in a new
# 'world_keypoints' column (None for rows missing either crop).
df['world_keypoints'] = df.apply(get_world_keypoints, axis=1)

<h1> Get the features dataframe from the base data with all pairwise distances </h1>

In [None]:
# Column name -> list of per-row values, filled by the loop that follows.
features_data = defaultdict(list)

# All body parts, alphabetically sorted. A body part's position in this list
# is the integer used in the 'i-j' pairwise-distance column names.
body_parts = sorted((
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'DORSAL_FIN',
    'EYE',
    'HYPURAL_PLATE',
    'LOWER_PRECAUDAL_PIT',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'TAIL_NOTCH',
    'UPPER_LIP',
    'UPPER_PRECAUDAL_PIT',
))

# Body parts that take part in pairwise distances: everything in body_parts
# except the tail notch and the two precaudal pits (still alphabetical).
body_parts_subset = [
    bp for bp in body_parts
    if bp not in {'TAIL_NOTCH', 'UPPER_PRECAUDAL_PIT', 'LOWER_PRECAUDAL_PIT'}
]


# Index pairs (i < j, positions in body_parts) whose two body parts both
# belong to body_parts_subset, in ascending (i, j) order.
subset_index_pairs = [
    (i, j)
    for i, j in combinations(range(len(body_parts)), 2)
    if body_parts[i] in body_parts_subset and body_parts[j] in body_parts_subset
]

for idx, row in df.iterrows():
    world_keypoints = row.world_keypoints
    if not world_keypoints:
        # Rows without world keypoints contribute nothing to features_data.
        continue

    # One 'i-j' column per pair: distance between the two world keypoints.
    for i, j in subset_index_pairs:
        d = euclidean_distance(world_keypoints[body_parts[i]],
                               world_keypoints[body_parts[j]])
        features_data['{}-{}'.format(i, j)].append(d)

    # Ground truth and bookkeeping columns for the same row.
    fish_data = row['data']
    features_data['world_keypoints'].append(world_keypoints)
    features_data['weight'].append(row.weight)
    features_data['captured_at'].append(row.captured_at)
    features_data['gtsf_fish_identifier'].append(row.fish_id)
    features_data['pen_id'].append(row.pen_id)
    features_data['keypoint_annotation_id'].append(row.id)
    features_data['kf'].append(1e5 * row.weight / fish_data['lengthMms']**3)
    features_data['length'].append(fish_data['lengthMms'] * 1e-3)
    # 'widthMms' is optional in the data payload; record None when absent.
    features_data['width'].append(fish_data['widthMms'] * 1e-3 if 'widthMms' in fish_data else None)
#     features_data['breadth'].append(row['data']['breadthMms'] * 1e-3 if 'breadthMms' in row['data'] else None)

features_df = pd.DataFrame(features_data)

# get rid of bad keypoint annotation ids

blacklisted_keypoint_annotation_ids = [
    606484, 
    635806, 
    637801, 
    508773, 
    640493, 
    639409, 
    648536, 
    507003,
    706002,
    507000,
    709298,
    714073,
    719239
]

# A row is dropped when its '2-6' pairwise distance exceeds 1.0 or when its
# annotation id is in the hand-maintained blacklist above.
# blacklist_mask = features_df['8-9'] > 1.0
blacklist_mask = (
    (features_df['2-6'] > 1.0)
    | features_df.keypoint_annotation_id.isin(blacklisted_keypoint_annotation_ids)
)
# .copy() gives an independent frame, so the columns added to features_df
# later on are not written into a slice (avoids SettingWithCopyWarning).
features_df = features_df[~blacklist_mask].copy()



In [None]:
# define all features

# Positions of the subset body parts inside body_parts; every pair of them
# names one pairwise-distance column ('i-j').
body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]
pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in combinations(body_part_indices, 2)]

# Products of two and of three pairwise distances, taken over unordered
# selections with repetition (i <= j and i <= j <= k). The products are built
# in dictionaries and attached with a single concat: inserting thousands of
# columns one at a time is slow and leaves the frame heavily fragmented.
quadratic_features = {
    '{},{}'.format(col1, col2): features_df[col1] * features_df[col2]
    for i, col1 in enumerate(pairwise_distance_columns)
    for col2 in pairwise_distance_columns[i:]
}
cubic_features = {
    '{},{},{}'.format(col1, col2, col3): features_df[col1] * features_df[col2] * features_df[col3]
    for i, col1 in enumerate(pairwise_distance_columns)
    for j, col2 in enumerate(pairwise_distance_columns[i:], start=i)
    for col3 in pairwise_distance_columns[j:]
}
interaction_columns_quadratic = list(quadratic_features)
interaction_columns_cubic = list(cubic_features)

interaction_df = pd.DataFrame({**quadratic_features, **cubic_features}, index=features_df.index)
# Dropping any interaction columns left by a previous run keeps this cell
# idempotent (re-running it must not duplicate columns).
features_df = pd.concat(
    [features_df.drop(columns=list(interaction_df.columns), errors='ignore'), interaction_df],
    axis=1,
)
            

<h1> Weight each datapoint based on the number of stereo images captured for that fish </h1>

<h1> Compute best fit plane </h1>

In [None]:
# For every row, fit the plane  y = a*x + b*z + c  (x, y, z = components 0, 1, 2
# of a world keypoint) through four keypoints, then measure how far all of the
# row's keypoints lie from that plane along y.
rms_error_ms, max_error_ms, coeffs = [], [], []
horizontal_angles, vertical_angles = [], []
for idx, row in features_df.iterrows():
    try:
        wkps = row.world_keypoints

        # fit plane based on well-behaved points
        anchors = [wkps[bp] for bp in ('UPPER_LIP', 'HYPURAL_PLATE', 'ADIPOSE_FIN', 'ANAL_FIN')]
        X = np.array([[wkp[0], wkp[2]] for wkp in anchors])
        y = np.array([wkp[1] for wkp in anchors])
        reg = LinearRegression().fit(X, y)

        # test plane against every keypoint of the row
        X = np.array([[wkp[0], wkp[2]] for wkp in wkps.values()])
        y = np.array([wkp[1] for wkp in wkps.values()])
        residuals = reg.predict(X) - y

        fit = (
            reg.coef_,
            np.arctan(reg.coef_[0]) * 180.0 / np.pi,  # vertical angle: slope along x, in degrees
            np.arctan(reg.coef_[1]) * 180.0 / np.pi,  # horizontal angle: slope along z, in degrees
            # NOTE(review): this is ||residual|| / n, not a true RMS
            # (||residual|| / sqrt(n)). Left unchanged because thresholds on
            # rms_error_m elsewhere in the notebook presumably assume this scale.
            np.linalg.norm(residuals) / y.shape[0],
            np.abs(residuals).max(),
        )
    except Exception as e:
        # Best effort: report the problem and record a full row of missing
        # values. Every list must receive exactly one entry per row, otherwise
        # the column assignments below fail or silently misalign.
        print(e)
        fit = (None, np.nan, np.nan, np.nan, np.nan)

    coef, vertical_angle, horizontal_angle, rms_error_m, max_error_m = fit
    coeffs.append(coef)
    vertical_angles.append(vertical_angle)
    horizontal_angles.append(horizontal_angle)
    rms_error_ms.append(rms_error_m)
    max_error_ms.append(max_error_m)

features_df['rms_error_m'] = rms_error_ms
features_df['max_error_m'] = max_error_ms
features_df['horizontal_angle'] = horizontal_angles
features_df['vertical_angle'] = vertical_angles


In [None]:
%matplotlib inline
plt.figure(figsize=(20, 10))
plt.hist(features_df[features_df.rms_error_m < 0.1].horizontal_angle)
plt.grid()
plt.show()

<h1> Model Training </h1>

In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Build a boolean train mask with one entry per row of `df`.

    Args:
        df: object whose `shape[0]` is the number of rows (e.g. a DataFrame).
        train_frac: fraction of rows to mark as training rows; exactly
            int(train_frac * n_rows) entries are True.
        randomize: when True (default) the True entries are scattered with
            np.random.shuffle, which draws from NumPy's global RNG. When
            False the leading rows are the training rows.

    Returns:
        1-D numpy bool array of length `df.shape[0]`.
    """
    n_rows = df.shape[0]
    train_mask = np.zeros(n_rows, dtype=bool)
    train_mask[:int(train_frac * n_rows)] = True
    if randomize:
        np.random.shuffle(train_mask)
    return train_mask
    

def generate_oos_score(features_df, mask, train_size, num_eigenvectors):
    """Fit standardise -> PCA -> weighted linear regression and score it out of sample.

    The feature columns are the module-level `pairwise_distance_columns`,
    `interaction_columns_quadratic` and `interaction_columns_cubic`.

    Args:
        features_df: DataFrame holding those feature columns plus 'weight'
            (regression target) and 'w' (per-row sample weight).
        mask: boolean row selector; True rows are fitted on, False rows are
            used for the returned score.
        train_size: unused; kept so existing call sites keep working.
        num_eigenvectors: the PCA keeps `num_eigenvectors + 1` components.

    Returns:
        (mask, model, score): the mask that was passed in; a dict with the
        scaler mean/scale, PCA components, regression coefficients/intercept
        and `body_parts_subset`; and `reg.score` (unweighted) on the False rows.

    Side effects:
        Re-seeds NumPy's global RNG with 0 and writes 'prediction', 'error',
        'error_pct' and 'abs_error_pct' columns into `features_df` in place,
        for every row (train and test).
    """
    np.random.seed(0)
    columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

    X_train = features_df.loc[mask, columns].values
    y_train = features_df.loc[mask, 'weight'].values
    w_train = features_df.loc[mask, 'w'].values
    X_test = features_df.loc[~mask, columns].values
    y_test = features_df.loc[~mask, 'weight'].values

    # Standardisation statistics come from the training rows only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_normalized = scaler.transform(X_train)

    # The previous version first fitted a full-rank PCA here just to compute
    # a cumulative explained-variance array that was never read; that dead
    # (and expensive) fit has been removed.
    pca = PCA(n_components=num_eigenvectors + 1)
    pca.fit(X_train_normalized)
    X_train_transformed = pca.transform(X_train_normalized)
    X_test_transformed = pca.transform(scaler.transform(X_test))

    reg = LinearRegression().fit(X_train_transformed, y_train, sample_weight=w_train)
    score = reg.score(X_test_transformed, y_test)

    # Predict every row so the caller can inspect train and test errors.
    y_pred = reg.predict(pca.transform(scaler.transform(features_df[columns].values)))
    features_df['prediction'] = y_pred
    features_df['error'] = features_df.prediction - features_df.weight
    features_df['error_pct'] = features_df.error / features_df.weight
    features_df['abs_error_pct'] = features_df.error_pct.abs()

    model = {
        'mean': scaler.mean_,
        'std': scaler.scale_,
        'PCA_components': pca.components_,
        'reg_coef': reg.coef_,
        'reg_intercept': reg.intercept_,
        'body_parts': body_parts_subset,
    }

    return mask, model, score



<h1> Plot one instance of train / test where the training set consists of 80% of the fish </h1>

In [None]:
# Count missing values per pairwise-distance feature before training.
# NOTE(review): this cell used to read `np.isnan(fea.values)`, but `fea` is
# not defined anywhere in the notebook (NameError on a fresh kernel). It is
# presumably meant to be the feature matrix -- confirm.
features_df[pairwise_distance_columns].isna().sum()

In [None]:
# num_eigenvectors = 20 (generate_oos_score keeps num_eigenvectors + 1 components)

np.random.seed(0)
random.seed(0)
tdf = features_df.copy(deep=True)

# Per-row sample weight 1 / sqrt(number of rows sharing the row's
# gtsf_fish_identifier), so fish with many rows do not dominate the fit.
# value_counts + map replaces a per-row rescan of the frame (O(n^2)).
# Rows whose identifier is missing have no count and get weight 1.
rows_per_fish = tdf.gtsf_fish_identifier.map(tdf.gtsf_fish_identifier.value_counts())
tdf['w'] = (1.0 / rows_per_fish ** 0.5).fillna(1.0)

# Split by fish, not by row: 80% of the distinct fish go to the training set,
# so all rows of one fish fall on the same side of the split.
gtsf_fish_identifiers = list(tdf.gtsf_fish_identifier.unique())
train_size = int(0.8 * len(gtsf_fish_identifiers))
fish_ids = random.sample(gtsf_fish_identifiers, train_size)
mask = tdf.gtsf_fish_identifier.isin(fish_ids)
# The third argument (2000) is not used by generate_oos_score.
mask, model, score = generate_oos_score(tdf, mask, 2000, 20)



In [None]:
# Plot predictions against ground-truth weights for the split above. `mask` is
# True for rows whose fish was sampled into the training set; `w` carries the
# per-row sample weights.
amg = AccuracyMetricsGenerator()
amg.set_data(mask, tdf.prediction.values, tdf.weight.values, w=tdf.w.values)
amg.plot_predictions_vs_ground_truth(impose_bounds=True)

In [None]:
amg.display_train_test_accuracy_metrics()

In [None]:
amg = AccuracyMetricsGenerator()
# m: rows whose largest keypoint-to-plane residual (max_error_m) is below 0.04.
m = tdf.max_error_m < 0.04
# amg.set_data(mask, tdf.prediction.values, tdf.weight.values, w=tdf.w.values)
# Both selectors are restricted to training-fish rows (`& mask`): the first
# argument takes those with m True, test_mask those with m False
# (`~m & mask` parses as `(~m) & mask`).
amg.set_data(m & mask, tdf.prediction.values, tdf.weight.values, test_mask=~m & mask, w=tdf.w.values)
amg.plot_predictions_vs_ground_truth(impose_bounds=True)

In [None]:
amg.display_train_test_accuracy_metrics()

In [None]:
# num_eigenvectors = 50 (generate_oos_score keeps num_eigenvectors + 1 components)
np.random.seed(0)
random.seed(0)
# Keep only rows whose fitted plane is tilted by less than 20 degrees in both
# directions and whose plane-fit error is below 0.1. .copy() makes tdf an
# independent frame, so the columns written below (and by generate_oos_score)
# do not land on a slice of features_df (SettingWithCopyWarning).
tdf = features_df[(features_df.horizontal_angle.abs() < 20) & (features_df.vertical_angle.abs() < 20) & (features_df.rms_error_m < 0.1)].copy()

# Per-row sample weight 1 / sqrt(number of rows sharing the row's
# gtsf_fish_identifier); 1 / count was tried as an alternative.
# value_counts + map replaces a per-row rescan of the frame (O(n^2)).
# Rows whose identifier is missing have no count and get weight 1.
rows_per_fish = tdf.gtsf_fish_identifier.map(tdf.gtsf_fish_identifier.value_counts())
tdf['w'] = (1.0 / rows_per_fish ** 0.5).fillna(1.0)


# Split by fish, not by row: 80% of the distinct fish go to the training set.
gtsf_fish_identifiers = list(tdf.gtsf_fish_identifier.unique())
train_size = int(0.8 * len(gtsf_fish_identifiers))
fish_ids = random.sample(gtsf_fish_identifiers, train_size)
mask = tdf.gtsf_fish_identifier.isin(fish_ids)
# The third argument (2000) is not used by generate_oos_score.
mask, model, score = generate_oos_score(tdf, mask, 2000, 50)



In [None]:
# Plot predictions against ground-truth weights for the filtered split above.
# `mask` is True for rows whose fish was sampled into the training set; `w`
# carries the per-row sample weights.
amg = AccuracyMetricsGenerator()
amg.set_data(mask, tdf.prediction.values, tdf.weight.values, w=tdf.w.values)
amg.plot_predictions_vs_ground_truth(impose_bounds=True)

In [None]:
amg.display_train_test_accuracy_metrics()

In [None]:
# NOTE(review): this cell repeats the earlier filtered-split plotting cell
# verbatim (same mask, predictions, weights) -- candidate for removal.
amg = AccuracyMetricsGenerator()
amg.set_data(mask, tdf.prediction.values, tdf.weight.values, w=tdf.w.values)
amg.plot_predictions_vs_ground_truth(impose_bounds=True)

In [None]:
amg.display_train_test_accuracy_metrics()

In [None]:
# pickle.dump(model, open('/root/data/alok/biomass_estimation/playground/model_lateral_only.pkl', 'wb'))

In [None]:
# Relative gap between the weighted mean prediction and the weighted mean
# ground-truth weight, over the rows selected by `mask`.
masked_rows = tdf[mask]
mean_prediction = np.average(masked_rows.prediction.values, weights=masked_rows.w.values)
mean_weight = np.average(masked_rows.weight.values, weights=masked_rows.w.values)
(mean_prediction - mean_weight) / mean_weight



In [None]:
# Weighted standard deviation of the relative error r = error / ground_truth,
# computed as sqrt(E_w[r^2] - E_w[r]^2) over all rows of tdf.
ground_truth = tdf.weight.values
error = tdf.prediction.values - ground_truth
w = tdf.w.values
relative_error = error / ground_truth
mean_of_squares = np.average(relative_error**2, weights=w)
square_of_mean = np.average(relative_error, weights=w)**2
np.sqrt(mean_of_squares - square_of_mean)

In [None]:
df[df.id == 713939].left_image_url.iloc[0]

In [None]:
df[df.id == 714830].iloc[0].left_image_url

In [None]:
# Rows whose plane-fit error exceeds 0.02, restricted to a few id/size columns.
cols = ['keypoint_annotation_id', 'gtsf_fish_identifier', 'rms_error_m', 'weight', 'width']
# DataFrame.ix no longer exists in current pandas; .loc takes the same
# boolean row mask and column-label list.
features_df.loc[features_df.rms_error_m > 0.02, cols]

<h1> Visualize Individual Cases </h1>

In [None]:
s3_access_utils = S3AccessUtils('/root/data')
# Read the credentials JSON through a context manager so the file handle is
# closed instead of leaked.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    credentials = json.load(credentials_file)
rds_access_utils = RDSAccessUtils(credentials)
v = Visualizer(s3_access_utils, rds_access_utils)


In [None]:
%matplotlib inline
# Take 100 randomly sampled rows, order them from highest to lowest
# rms_error_m, and display the crops for each row's keypoint annotation.
# The sample is drawn without a random_state argument.
for idx, row in features_df.sample(100).sort_values('rms_error_m', ascending=False).iterrows():
    v.load_data(row.keypoint_annotation_id)
    v.display_crops()

In [None]:
%matplotlib notebook
v.display_3d_keypoints()

In [None]:
# Histogram of rms_error_m over all rows of features_df, in 100 bins.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(features_df.rms_error_m, bins=100)
ax.grid()
plt.show()

In [None]:
# Predicted vs. ground-truth weight, coloured by plane-fit error.
# generate_oos_score writes 'prediction' into the frame it is given (tdf);
# features_df itself never receives that column, so the points are drawn from
# tdf. The colormap is passed by name: binding it to `cm` would shadow the
# imported matplotlib.cm module, and plt.cm.get_cmap is deprecated.
fig, ax = plt.subplots(figsize=(20, 10))
# `mask` is kept as before because the cell after this one indexes
# features_df with it.
mask = features_df.rms_error_m < 0.5
# Sorted ascending so the highest-error points are drawn last (on top).
plot_df = tdf[tdf.rms_error_m < 0.5].sort_values('rms_error_m')
sc = ax.scatter(plot_df.weight.values,
                plot_df.prediction.values,
                c=plot_df.rms_error_m.values,
                cmap='seismic')
plt.colorbar(sc)
# Identity line: points on it are perfect predictions.
plt.plot([0, 10000], [0, 10000])
# plt.xlim([0, 10000])
# plt.ylim([0, 10000])
plt.grid()
plt.show()

In [None]:
# Annotation ids ordered from largest to smallest signed relative error.
# 'error_pct' is written by generate_oos_score into tdf only (features_df has
# no such column), so the ranking is taken from tdf, limited to rows with
# rms_error_m below 0.5.
tdf[tdf.rms_error_m < 0.5].sort_values('error_pct', ascending=False).keypoint_annotation_id

In [None]:
def centroid_depth(wkps):
    """Mean of component 1 over all world keypoints of one fish.

    Args:
        wkps: mapping of body part -> indexable coordinates, or a falsy value
            (None / empty mapping) when no world keypoints are available.

    Returns:
        The mean of `wkp[1]` over all keypoints, or None when `wkps` is falsy
        or the mean is not finite (+inf, -inf or NaN).
    """
    if not wkps:
        return None
    mean = np.mean([wkp[1] for wkp in wkps.values()])
    # Reject every non-finite mean, not just +inf: -inf and NaN are equally
    # unusable as a depth.
    if not np.isfinite(mean):
        return None
    return mean

# One centroid_depth value per row, derived from that row's world keypoints.
features_df['centroid_depth'] = features_df.world_keypoints.apply(centroid_depth)

In [None]:
%matplotlib inline
plt.figure(figsize=(20, 10))
plt.hist(features_df[(features_df.centroid_depth > 0) & (features_df.centroid_depth < 2.0)].centroid_depth, bins=100)
plt.grid()
plt.show()

In [None]:
# Plane-fit error (rms_error_m) against centroid_depth, one point per row.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(features_df.centroid_depth, features_df.rms_error_m)
ax.grid()
plt.show()

In [None]:
features_df.rms_error_m.mean()

In [None]:
df.left_image_url.iloc[0]