In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer
from scipy.stats import norm

# Let result tables show up to 500 rows before pandas truncates the display.
pd.options.display.max_rows = 500

In [None]:
# Results table stored in an HDF5 file under the 'table' key.
df_path = '/root/data/temp/results_04ee5b3adda1c0e5f96b08c6dff049831dda828b_model_more_mixed_air_gtsf.h5'
df = pd.read_hdf(df_path, 'table')

In [None]:
# Sanity check of the estimates already stored in `df`: one point per row,
# `estimated_biomass_g` plotted against `weight`.
plt.figure(figsize=(20, 10))
plt.scatter(df.weight, df.estimated_biomass_g)
plt.xlim([0, 10000])
plt.ylim([0, 10000])
# Identity line: a point lying on it has estimated_biomass_g == weight.
plt.plot([0, 10000], [0, 10000])
# Labels so the figure can be read without the code next to it.
plt.xlabel('weight')
plt.ylabel('estimated_biomass_g')
plt.title('estimated_biomass_g vs. weight')
plt.grid()
plt.show()

In [None]:
# One list per output column; turned into a DataFrame in a later cell.
features_data = defaultdict(list)

# Sorted so that the integer positions used in the pairwise column names
# ('<i>-<j>') are stable and independent of the order written here.
body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
    'UPPER_PRECAUDAL_PIT',
    'LOWER_PRECAUDAL_PIT',
    'HYPURAL_PLATE'
])

for idx, row in df.iterrows():
    world_keypoints = row.world_keypoints

    # One distance feature per unordered pair of keypoints, keyed by the pair's
    # positions in `body_parts` (always i < j).
    for i, j in combinations(range(len(body_parts)), 2):
        d = euclidean_distance(world_keypoints[body_parts[i]],
                               world_keypoints[body_parts[j]])
        features_data['{0}-{1}'.format(i, j)].append(d)

    measurements = row['data']
    length_mms = measurements['lengthMms']
    # Fix: the original tested for the misspelled key 'breadhMms', so `breadth`
    # was always None. A missing or None value still yields None.
    breadth_mms = measurements['breadthMms'] if 'breadthMms' in measurements else None

    features_data['weight'].append(row.weight)
    features_data['captured_at'].append(row.captured_at)
    features_data['gtsf_fish_identifier'].append(row.fish_id)
    features_data['pen_id'].append(row.pen_id)
    features_data['keypoint_annotation_id'].append(row.id)
    # NOTE(review): presumably a condition factor (100 * g / cm**3 when weight is in
    # grams and length in mm) - confirm the units of `weight`.
    features_data['kf'].append(1e5 * row.weight / length_mms**3)
    # The *Mms fields are scaled by 1e-3 (mm -> m if the field names are accurate).
    features_data['length'].append(length_mms * 1e-3)
    features_data['breadth'].append(breadth_mms * 1e-3 if breadth_mms is not None else None)
    
    

In [None]:
# Each key of `features_data` becomes a column; each list holds one value per row of `df`.
features_df = pd.DataFrame(data=features_data)


In [None]:
# Keypoint annotations to exclude from the feature table.
blacklisted_keypoint_annotation_ids = [606484, 635806, 637801, 508773]

# True for rows whose annotation id is blacklisted. `isin` replaces the manual
# OR-chain and also works for an empty blacklist (the original raised on `~None`).
blacklist_mask = features_df.keypoint_annotation_id.isin(blacklisted_keypoint_annotation_ids)
features_df = features_df[~blacklist_mask]

In [None]:
# Drop every row whose `captured_at`, rendered as text, contains the date 2019-05-13.
captured_at_text = features_df.captured_at.astype(str)
features_df = features_df[~captured_at_text.str.contains('2019-05-13')]

In [None]:
# define all features

# Keypoints used for the model features: `body_parts` without the precaudal pits
# and the hypural plate.
body_parts_subset = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE'
])

# Positions of the subset keypoints inside `body_parts`. Both lists are sorted, so
# the positions are ascending and every pair (x, y) below has x < y, matching the
# '<i>-<j>' distance columns created during feature extraction.
body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]

pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in combinations(body_part_indices, 2)]

n_pairwise = len(pairwise_distance_columns)
# Raw arrays of the base features, so products are computed without touching the frame.
pairwise_values = {col: features_df[col].values for col in pairwise_distance_columns}

interaction_columns_quadratic = []
interaction_columns_cubic = []
# New columns are collected here and attached in one step at the end. Inserting
# thousands of columns one at a time fragments the frame and is very slow.
interaction_data = {}

# Degree-2 terms: products of every unordered pair of distances, squares included.
for i in range(n_pairwise):
    col1 = pairwise_distance_columns[i]
    for j in range(i, n_pairwise):
        col2 = pairwise_distance_columns[j]
        interaction_column = '{},{}'.format(col1, col2)
        interaction_data[interaction_column] = pairwise_values[col1] * pairwise_values[col2]
        interaction_columns_quadratic.append(interaction_column)

# Degree-3 terms: products of every unordered triple of distances, repeats included.
for i in range(n_pairwise):
    col1 = pairwise_distance_columns[i]
    for j in range(i, n_pairwise):
        col2 = pairwise_distance_columns[j]
        # Hoisted out of the inner loop; (a * b) * c is the same evaluation order
        # as the original a * b * c.
        pair_product = pairwise_values[col1] * pairwise_values[col2]
        for k in range(j, n_pairwise):
            col3 = pairwise_distance_columns[k]
            interaction_column = '{},{},{}'.format(col1, col2, col3)
            interaction_data[interaction_column] = pair_product * pairwise_values[col3]
            interaction_columns_cubic.append(interaction_column)

# Attach all interaction columns at once. Dropping same-named columns first keeps
# the cell safe to re-run, as the original overwrite-in-place was.
features_df = pd.concat(
    [
        features_df.drop(columns=list(interaction_data), errors='ignore'),
        pd.DataFrame(interaction_data, index=features_df.index),
    ],
    axis=1,
)
            


In [None]:
# Rows from pen 48.
# NOTE(review): `m` is not used by the expression below - confirm whether the
# comparison was meant to be restricted to this pen.
m = features_df[features_df.pen_id == 48]
# '8-9' is the distance between body_parts[8] and body_parts[9] (TAIL_NOTCH and
# UPPER_LIP in the sorted list). Its difference from the `length` column is
# displayed as the cell output.
features_df['8-9'] - features_df.length

In [None]:
# Display the keypoint annotation id of the row with index label 2984.
# Raises KeyError if that row was removed by the filters above.
features_df.loc[2984].keypoint_annotation_id

In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Build a boolean train/test membership mask for the rows of ``df``.

    Args:
        df: Object with a ``shape`` attribute; only ``df.shape[0]`` (the row count) is read.
        train_frac: Fraction of rows to mark as training rows. The count is
            ``int(train_frac * n_rows)``, i.e. truncated toward zero.
        randomize: If True (default), shuffle the mask with NumPy's global RNG so
            the training rows are spread at random (seed with ``np.random.seed``
            for reproducibility). If False, the first ``int(train_frac * n_rows)``
            positions are the training rows.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype bool with one entry per row; True marks a
        training row.
    """
    n_rows = df.shape[0]
    x = np.zeros(n_rows, dtype=bool)
    x[:int(train_frac * n_rows)] = True
    # The original ignored `randomize` and always shuffled; the default keeps that behavior.
    if randomize:
        np.random.shuffle(x)
    return x
    


In [None]:
# Fixed seed so the shuffled train/test split below is reproducible.
np.random.seed(0)

# 80% of the rows go to training. Earlier experiments visible in the notebook
# history (excluding pen 48, weighted PCA, sample-weighted regression) are not
# active here.
mask = generate_train_mask(features_df, train_frac=0.8)
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

X_train, y_train = features_df.loc[mask, columns].values, features_df.loc[mask, 'weight'].values
X_test, y_test = features_df.loc[~mask, columns].values, features_df.loc[~mask, 'weight'].values

# Standardize using statistics of the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)

# First pass: PCA with the maximum number of components, used only to read off
# the cumulative explained variance.
pca = PCA(n_components=min(X_train_normalized.shape)).fit(X_train_normalized)
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# Index of the first component at which cumulative explained variance exceeds 0.999995.
idx = np.flatnonzero(explained_variance_ratio > 0.999995)[0]
print(idx)

# Second pass: refit keeping only the first idx+1 components, then project both splits.
pca = PCA(n_components=idx+1).fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

# Linear regression on the principal components; print the held-out R^2 score.
reg = LinearRegression()
reg.fit(X_train_transformed, y_train)
print(reg.score(X_test_transformed, y_test))

# Predict for every row (train and test) and record the error columns.
y_pred = reg.predict(pca.transform(scaler.transform(features_df[columns].values)))
features_df['prediction'] = y_pred
features_df['error'] = features_df['prediction'] - features_df['weight']
features_df['error_pct'] = features_df['error'] / features_df['weight']
features_df['abs_error_pct'] = features_df['error_pct'].abs()

# Everything needed to reproduce scaler -> PCA -> regression outside this notebook.
model = {
    'mean': scaler.mean_,
    'std': scaler.scale_,
    'PCA_components': pca.components_,
    'reg_coef': reg.coef_,
    'reg_intercept': reg.intercept_,
    'body_parts': body_parts_subset
}



In [None]:
# Hand the train/test mask, the predictions and the `weight` column to the
# project's accuracy-metrics helper, which is used by the next two cells.
amg = AccuracyMetricsGenerator(mask, features_df.prediction.values, features_df.weight.values)

In [None]:
# Plot predictions against ground truth. The meaning of `impose_bounds` is defined
# in aquabyte.accuracy_metrics and is not visible here.
amg.plot_predictions_vs_ground_truth(impose_bounds=True)

In [None]:
# Show the helper's accuracy metrics for the train and test splits.
amg.display_train_test_accuracy_metrics()

In [None]:
# Inverse-density weight per row: 1 / (number of rows whose weight lies within
# +/-250 of this row's weight, inclusive), or 1 when that count is not above 1.
#
# Counting is done on a sorted copy with binary search. The original filtered the
# whole wide frame once per row inside `iterrows`, which was quadratic.
weight_values = features_df.weight.values
sorted_weights = np.sort(weight_values)
# Rows with weight <= w + 250 minus rows with weight < w - 250.
window_counts = (
    np.searchsorted(sorted_weights, weight_values + 250, side='right')
    - np.searchsorted(sorted_weights, weight_values - 250, side='left')
)
# np.maximum only guards the division; counts <= 1 map to weight 1 as before.
weights = np.where(window_counts > 1, 1.0 / np.maximum(window_counts, 1), 1.0).tolist()

In [None]:
# Store the per-row weights as column `w` (assigned positionally, one value per row).
features_df['w'] = weights

In [None]:
# NOTE(review): `kf` is built as 1e5 * weight / lengthMms**3 and is binned over
# 0.75-2.0 in the following cell, so `kf > 1000` may select no rows - confirm the
# intended threshold.
m = features_df.kf > 1000

# Relative prediction error against kf for the selected rows.
plt.figure(figsize=(20, 10))
plt.scatter(features_df.loc[m, 'kf'], features_df.loc[m, 'error_pct'])
plt.ylim([-1.0, 1.0])
plt.grid()
plt.show()

In [None]:
# Bin edges for kf: 0.75 up to (but excluding) 2.0 in steps of 0.05.
kf_list = list(np.arange(0.75, 2.0, 0.05))

# Despite the name, each entry is the *median* error_pct of the rows whose kf lies
# strictly between two consecutive edges.
mean_error_pcts = []
for lower_edge, upper_edge in zip(kf_list[:-1], kf_list[1:]):
    m = (features_df.kf > lower_edge) & (features_df.kf < upper_edge)
    mean_error_pcts.append(features_df.loc[m, 'error_pct'].median())

In [None]:
# One point per kf bin, drawn at the bin's lower edge.
plt.scatter(x=kf_list[:-1], y=mean_error_pcts)
plt.ylim(-.10, .10)
plt.grid()
plt.show()

In [None]:
# Rows with 4000 < weight < 4500, highest kf first.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` performs the same
# boolean-row / label-column selection.
features_df.loc[
    (features_df.weight > 4000) & (features_df.weight < 4500),
    ['kf', 'weight', 'error_pct', 'keypoint_annotation_id']
].sort_values('kf', ascending=False)

<h1> Visualize Bad Cases </h1>

In [None]:
# Project helpers for S3 and RDS access, combined into a Visualizer.
s3_access_utils = S3AccessUtils('/root/data')
# The credentials file path comes from the environment. `with` closes the file
# handle, which the original `json.load(open(...))` left open.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    credentials = json.load(credentials_file)
rds_access_utils = RDSAccessUtils(credentials)
visualizer = Visualizer(s3_access_utils, rds_access_utils)

In [None]:
# Load record 507035 (presumably a keypoint_annotation_id - confirm against
# Visualizer.load_data) and display its crops.
visualizer.load_data(507035)
visualizer.display_crops()

In [None]:
# Imported here because `Image` is otherwise only imported by a later cell, so
# this cell raised NameError on a fresh top-to-bottom run.
from PIL import Image

# Open the visualizer's left image and display it resized to 512x512.
Image.open(visualizer.left_image_f).resize((512, 512))

In [None]:
from PIL import Image

In [None]:
# Open the visualizer's left image and display it resized to 512x512.
Image.open(visualizer.left_image_f).resize((512, 512))

In [None]:
# Load record 518656 (presumably a keypoint_annotation_id - confirm) and display its crops.
visualizer.load_data(518656)
visualizer.display_crops()

In [None]:
# Load record 637950 (presumably a keypoint_annotation_id - confirm) and display its crops.
visualizer.load_data(637950)
visualizer.display_crops()

In [None]:
# Load record 508373 (presumably a keypoint_annotation_id - confirm) and display its crops.
visualizer.load_data(508373)
visualizer.display_crops()

In [None]:
# Load record 506844 (presumably a keypoint_annotation_id - confirm); displayed in the next cell.
visualizer.load_data(506844)

In [None]:
# Display the crops for the record loaded in the previous cell (506844).
visualizer.display_crops()

In [None]:
# Load record 507546 (presumably a keypoint_annotation_id - confirm); displayed in the next cell.
visualizer.load_data(507546)

In [None]:
# Display the crops for the record loaded in the previous cell (507546).
visualizer.display_crops()

In [None]:
# Load record 506781 (presumably a keypoint_annotation_id - confirm); displayed in the next cell.
visualizer.load_data(506781)


In [None]:
# Display the crops for the record loaded in the previous cell (506781).
visualizer.display_crops()

In [None]:
# Load record 507768 (presumably a keypoint_annotation_id - confirm); displayed in the next cell.
visualizer.load_data(507768)

In [None]:
# Display the crops for the record loaded in the previous cell (507768).
visualizer.display_crops()