In [None]:
import json, os
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from wpca import WPCA
from sklearn.preprocessing import StandardScaler
from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer
from scipy.stats import norm

# Let pandas render up to 500 rows before truncating displayed frames/series.
pd.set_option('display.max_rows', 500)

In [None]:
# Load the results table from an HDF5 file (stored under key 'table').
# NOTE(review): hard-coded absolute path — the notebook only runs where this file exists.
df_path = '/root/data/temp/results_c8cc936a26a7645895842f467fe9d88764cf5579_gtsf_mixed_air_v1.h5'
df = pd.read_hdf(df_path, key='table')

In [None]:
# Scatter the existing biomass estimate against the recorded weight, with the
# identity line as a visual reference for a perfect estimate.
AXIS_MAX = 10000  # shared upper bound for both axes and the identity line

plt.figure(figsize=(20, 10))
plt.scatter(df.weight, df.estimated_biomass_g)
plt.xlim([0, AXIS_MAX])
plt.ylim([0, AXIS_MAX])
plt.plot([0, AXIS_MAX], [0, AXIS_MAX], label='y = x')
plt.xlabel('weight')
plt.ylabel('estimated_biomass_g')
plt.title('Estimated biomass vs. recorded weight')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Collect, per annotated row, every pairwise keypoint distance plus identifying metadata.
features_data = defaultdict(list)

# Alphabetically sorted so that a body part's position in this list is stable;
# distance columns are named '<i>-<j>' after those positions.
body_parts = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE',
    'UPPER_PRECAUDAL_PIT',
    'LOWER_PRECAUDAL_PIT',
    'HYPURAL_PLATE',
])

# All (i, j) with i < j, in the same order a nested index loop would visit them.
index_pairs = list(combinations(range(len(body_parts)), 2))

for _, annotation in df.iterrows():
    # world_keypoints is indexed by body-part name below.
    keypoints_3d = annotation.world_keypoints
    for first, second in index_pairs:
        separation = euclidean_distance(
            keypoints_3d[body_parts[first]],
            keypoints_3d[body_parts[second]],
        )
        features_data['{0}-{1}'.format(first, second)].append(separation)

    # Metadata carried alongside the distances.
    features_data['weight'].append(annotation.weight)
    features_data['captured_at'].append(annotation.captured_at)
    features_data['gtsf_fish_identifier'].append(annotation.fish_id)
    features_data['pen_id'].append(annotation.pen_id)
    features_data['keypoint_annotation_id'].append(annotation.id)
    
    

In [None]:
# One row per entry of `df`: pairwise-distance columns plus the metadata columns.
features_df = pd.DataFrame(features_data)


In [None]:
# Exclude keypoint annotation 606484 (reason not recorded in this notebook).
# The index is not reset, so it keeps a gap where the dropped row was.
features_df = features_df[features_df.keypoint_annotation_id != 606484]

In [None]:
# Define all model features: the pairwise distances between a subset of body
# parts, plus every degree-2 and degree-3 product of those distances
# (repeats allowed, e.g. d1*d1 and d1*d1*d2).

body_parts_subset = sorted([
    'TAIL_NOTCH',
    'ADIPOSE_FIN',
    'ANAL_FIN',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'DORSAL_FIN',
    'UPPER_LIP',
    'EYE'
])

# Distance columns are named after positions in the full `body_parts` list,
# so translate the subset into those positions.
body_part_indices = [body_parts.index(bp) for bp in body_parts_subset]

pairwise_distance_columns = ['{0}-{1}'.format(x, y) for x, y in combinations(body_part_indices, 2)]
interaction_columns_quadratic = []
interaction_columns_cubic = []

# Collect the new columns in dicts and attach them in one concat. Inserting
# thousands of columns one at a time fragments the DataFrame and is very slow.
quadratic_features = {}
cubic_features = {}
n_pairwise = len(pairwise_distance_columns)
for i in range(n_pairwise):
    col1 = pairwise_distance_columns[i]
    for j in range(i, n_pairwise):
        col2 = pairwise_distance_columns[j]
        quadratic_name = '{},{}'.format(col1, col2)
        quadratic_values = features_df[col1] * features_df[col2]
        quadratic_features[quadratic_name] = quadratic_values
        interaction_columns_quadratic.append(quadratic_name)

        # Cubic terms reuse the quadratic product: (col1 * col2) * col3.
        for k in range(j, n_pairwise):
            col3 = pairwise_distance_columns[k]
            cubic_name = '{},{},{}'.format(col1, col2, col3)
            cubic_features[cubic_name] = quadratic_values * features_df[col3]
            interaction_columns_cubic.append(cubic_name)

interaction_df = pd.DataFrame({**quadratic_features, **cubic_features}, index=features_df.index)

# Drop any interaction columns left by a previous run of this cell so that
# re-running it replaces them instead of duplicating column names.
features_df = pd.concat(
    [features_df.drop(columns=list(interaction_df.columns), errors='ignore'), interaction_df],
    axis=1,
)
            


In [None]:
def generate_train_mask(df, train_frac, randomize=True):
    """Build a boolean train/test mask with one entry per row of ``df``.

    Args:
        df: Any object exposing ``shape[0]`` as its row count (e.g. a DataFrame).
        train_frac: Fraction of rows to mark as training rows; the count is
            ``int(train_frac * n_rows)`` (rounded down).
        randomize: When True (default) the True entries are shuffled into
            random positions using NumPy's global RNG, so seed it beforehand
            for a reproducible split. When False the first rows are the
            training rows.

    Returns:
        A positional ``numpy.ndarray`` of dtype bool (no pandas index), True
        for training rows.
    """
    x = np.zeros((df.shape[0]), dtype=bool)
    x[:int(train_frac * df.shape[0])] = True
    if randomize:
        np.random.shuffle(x)
    return x
    


In [None]:
# Fit the weight model: standardize -> PCA -> linear regression, evaluate on the
# held-out rows, then score every row of features_df.
np.random.seed(0)  # fixed seed so the random train/test split is reproducible

# Keep the smallest number of leading principal components whose cumulative
# explained-variance ratio exceeds this value.
# NOTE(review): threshold restored from the previously commented-out line — confirm.
EXPLAINED_VARIANCE_THRESHOLD = 0.999999

mask = generate_train_mask(features_df, train_frac=0.9)
# Never train on this fish; its rows fall into the test split instead.
mask = mask & (features_df.gtsf_fish_identifier != '190620-4e4e0640-d4eb-405d-8fcf-57fda11d7660')
columns = pairwise_distance_columns + interaction_columns_quadratic + interaction_columns_cubic

X_train = features_df.loc[mask, columns].values
y_train = features_df.loc[mask, 'weight'].values
X_test = features_df.loc[~mask, columns].values
y_test = features_df.loc[~mask, 'weight'].values
# NOTE: a sample-weighted variant (column 'w', WPCA, sample_weight=...) was
# disabled here; column 'w' is only created by a later cell.

# Standardize using training statistics only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)

# First pass: full-rank PCA, used only to decide how many components to keep.
pca = PCA(n_components=min(X_train_normalized.shape[0], X_train_normalized.shape[1]))
pca.fit(X_train_normalized)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()
components_above_threshold = np.where(explained_variance_ratio > EXPLAINED_VARIANCE_THRESHOLD)[0]
if components_above_threshold.size:
    idx = int(components_above_threshold[0])
else:
    # Threshold never reached: keep every component.
    idx = len(explained_variance_ratio) - 1
print(idx)

# Second pass: refit PCA with idx + 1 components and project both splits.
pca = PCA(n_components=idx+1)
pca.fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)
X_test_normalized = scaler.transform(X_test)
X_test_transformed = pca.transform(X_test_normalized)

reg = LinearRegression().fit(X_train_transformed, y_train)
print(reg.score(X_test_transformed, y_test))  # R^2 on the held-out rows

# Predict for every row (train and test) and derive error columns.
y_pred = reg.predict(pca.transform(scaler.transform(features_df[columns].values)))
features_df['prediction'] = y_pred
features_df['error'] = features_df.prediction - features_df.weight
# error_pct is a fraction of the true weight (not multiplied by 100).
features_df['error_pct'] = features_df.error / features_df.weight
features_df['abs_error_pct'] = features_df.error_pct.abs()

# Snapshot of the fitted parameters needed to reproduce predictions.
model = {
    'mean': scaler.mean_,
    'std': scaler.scale_,
    'PCA_components': pca.components_,
    'reg_coef': reg.coef_,
    'reg_intercept': reg.intercept_,
    'body_parts': body_parts_subset
}



In [None]:
# Refit a full-rank PCA on the standardized training features so that the
# cells below can try any number of leading components.
# Note: this rebinds `pca` and `X_train_transformed`.
max_components = min(X_train_normalized.shape[0], X_train_normalized.shape[1])
pca = PCA(n_components=max_components)
pca.fit(X_train_normalized)
X_train_transformed = pca.transform(X_train_normalized)

<h1> Principal Component Regression </h1>

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# 10-fold cross-validation splitter; shuffled with a fixed random_state so the folds are repeatable.
kf_10 = KFold(n_splits=10, shuffle=True, random_state=1)

In [None]:
# Cross-validated MSE of a linear regression on the first i principal
# components, adding one component at a time. mse[0] corresponds to 1 component.
MAX_COMPONENTS = 199  # upper limit on the number of components to try

# Never ask for more components than the projection actually has; slicing past
# the last column would only repeat the same fit.
n_components_to_try = min(MAX_COMPONENTS, X_train_transformed.shape[1])

mse = []
regr = LinearRegression()
for i in range(1, n_components_to_try + 1):
    score = -1*cross_val_score(regr, X_train_transformed[:,:i], y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)

In [None]:
# Plot cross-validated MSE against the number of principal components used.
# mse[0] belongs to one component, so the x axis starts at 1 rather than 0.
component_counts = list(range(1, len(mse) + 1))

plt.figure(figsize=(20, 10))
plt.ylim([0, 100000])
plt.plot(component_counts, mse)
plt.xlabel('Number of principal components')
plt.ylabel('Cross-validated mean squared error')
plt.title('Principal component regression: CV error vs. components')
plt.show()

In [None]:
# 0-based position of the lowest CV MSE; mse[0] is the 1-component model,
# so the corresponding component count is this value + 1.
np.argmin(np.array(mse))

In [None]:
def convert(x):
    """Turn one keypoint annotation into world-space keypoints.

    Returns the result of ``pixel2world`` on the annotation's 'leftCrop',
    'rightCrop' and 'cameraParameters' entries, or None when any of those
    three keys is absent from ``x``.
    """
    required_keys = ('leftCrop', 'rightCrop', 'cameraParameters')
    if any(key not in x for key in required_keys):
        return None
    return pixel2world(x['leftCrop'], x['rightCrop'], x['cameraParameters'])

In [None]:
# World-space keypoints for every row (None where an annotation is incomplete).
df.keypoints.apply(convert)

In [None]:
# Build the accuracy-metrics helper from the train mask, the predictions and the true weights.
# NOTE(review): the expected argument semantics of AccuracyMetricsGenerator are not visible here — confirm.
amg = AccuracyMetricsGenerator(mask, features_df.prediction.values, features_df.weight.values)

In [None]:
# Plot predictions against ground truth.
# NOTE(review): the effect of impose_bounds is defined in the aquabyte package — confirm.
amg.plot_predictions_vs_ground_truth(impose_bounds=True)

In [None]:
# Held-out rows only, excluding pen 100, rows whose absolute relative error is
# 2.0 or more, and fish of 2000 or less.
not_in_training = ~mask
m = (
    not_in_training
    & (features_df.pen_id != 100.0)
    & (features_df.abs_error_pct < 2.0)
    & (features_df.weight > 2000)
)

# Relative bias of the mean prediction versus the mean true weight on that subset.
selected_rows = features_df[m]
(selected_rows.prediction.mean() - selected_rows.weight.mean()) / selected_rows.weight.mean()

In [None]:
# Mean absolute relative error over the rows selected by `m`.
features_df.loc[m, 'abs_error_pct'].mean()

In [None]:
# Mean signed relative error over the rows selected by `m`.
features_df.loc[m, 'error_pct'].mean()

In [None]:
# Inverse-density sample weights: each row gets 1 / (number of rows whose weight
# lies within +/- WEIGHT_WINDOW of its own, itself included), or 1 when that
# count is 0 or 1.
WEIGHT_WINDOW = 250

# Counting on a sorted copy with binary search replaces a per-row scan of the
# whole frame (O(n log n) instead of O(n^2)).
weight_values = features_df.weight.to_numpy(dtype=float)
sorted_weights = np.sort(weight_values)
# side='left' / side='right' make both window bounds inclusive.
lower_positions = np.searchsorted(sorted_weights, weight_values - WEIGHT_WINDOW, side='left')
upper_positions = np.searchsorted(sorted_weights, weight_values + WEIGHT_WINDOW, side='right')
neighbour_counts = upper_positions - lower_positions
# A NaN weight matches no row under >= / <= comparisons, so its count is 0.
neighbour_counts[np.isnan(weight_values)] = 0

weights = np.where(
    neighbour_counts > 1,
    1.0 / np.maximum(neighbour_counts, 1),  # max() only guards the unused branch against division by zero
    1.0,
).tolist()

In [None]:
# Store the per-row sample weights computed above as column 'w'.
features_df['w'] = weights

In [None]:
# Show rows with captured_at later than 2019-06-22.
# NOTE(review): relies on captured_at being comparable with a date string — confirm its dtype.
features_df[features_df.captured_at > '2019-06-22']

In [None]:
# Histogram of the signed relative error for rows captured on 2019-05-03
# (start inclusive, end exclusive).
day_start, day_end = '2019-05-03', '2019-05-04'
captured_that_day = (features_df.captured_at >= day_start) & (features_df.captured_at < day_end)
x = features_df[captured_that_day].error_pct

# NOTE(review): error_pct is a fraction, so this 100.0 cut-off only removes
# errors of 100x the true weight or more — confirm that is intended.
within_cutoff = x[x.abs() < 100.0]

plt.figure(figsize=(20, 10))
plt.hist(within_cutoff)
plt.show()

In [None]:
# Rank annotations from largest to smallest absolute relative error.
features_df[['keypoint_annotation_id', 'abs_error_pct', 'captured_at']].sort_values('abs_error_pct', ascending=False)

In [None]:
# Index the frame by capture time so it can be resampled by day below.
# NOTE(review): this replaces the original row index in place; boolean Series built
# earlier from that index (e.g. `mask`, `m`) no longer share labels with features_df.
features_df.index = pd.to_datetime(features_df.captured_at)

In [None]:
# Mean absolute relative error per calendar day (requires the datetime index).
# The `how=` keyword is no longer accepted by resample(); aggregate on the
# resampler instead.
features_df.abs_error_pct.resample('D').mean()

In [None]:
# Same ranking by absolute relative error, this time including the true weight.
features_df[['abs_error_pct', 'keypoint_annotation_id', 'captured_at', 'weight']].sort_values('abs_error_pct', ascending=False)

<h1> Visualize Bad Cases </h1>

In [None]:
# Set up data access and the visualizer used to inspect individual annotations.
s3_access_utils = S3AccessUtils('/root/data')

# The credentials file path comes from the environment; read it with a context
# manager so the file handle is closed promptly.
with open(os.environ['PROD_RESEARCH_SQL_CREDENTIALS']) as credentials_file:
    credentials = json.load(credentials_file)

rds_access_utils = RDSAccessUtils(credentials)
visualizer = Visualizer(s3_access_utils, rds_access_utils)

In [None]:
# Load one record into the visualizer.
# NOTE(review): 507806 is presumably a keypoint annotation id — confirm.
visualizer.load_data(507806)

In [None]:
# Show the crops for the record loaded by the visualizer.
visualizer.display_crops()

In [None]:
# Print every pairwise world-space distance for a single annotation (id 508195).
kp = df[df.id == 508195].keypoints.iloc[0]
wkps = pixel2world(kp['leftCrop'], kp['rightCrop'], kp['cameraParameters'])

# NOTE(review): this rebinds the notebook-level `body_parts` list that the
# feature cells use for column naming — re-running those cells afterwards
# would pick up this annotation's keys instead.
body_parts = sorted(list(wkps.keys()))

for bp1, bp2 in combinations(body_parts, 2):
    dist = euclidean_distance(wkps[bp1], wkps[bp2])
    print('{0}<->{1}: {2}'.format(bp1, bp2, dist))
        