In [None]:
import json
from collections import defaultdict
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from aquabyte.data_access_utils import S3AccessUtils
from sklearn.metrics import precision_recall_curve
from inspect import signature

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


<h1> Load Data </h1>

In [None]:
s3_access_utils = S3AccessUtils('/root/data')

In [None]:
akpd_data_f = s3_access_utils.download_from_s3('aquabyte-annotations', 'akpd/type=data/date=2019-06-01/group=VERIFIED/group=VERIFIED.parquet')

In [None]:
# Read the downloaded parquet dataset into an Arrow table, then convert it to a pandas DataFrame.
data = pq.ParquetDataset(akpd_data_f).read()
df = data.to_pandas()


<h1> Construct Features </h1>

In [None]:
def euclidean_distance(p1, p2):
    """Return the Euclidean (L2) distance between two points given as numpy arrays."""
    offset = p1 - p2
    return np.linalg.norm(offset)

def get_left_right_keypoints(keypoints):
    """Index the keypoints of each crop by keypoint type.

    Args:
        keypoints: dict with 'leftCrop' and 'rightCrop' lists; every item
            carries 'keypointType', 'xFrame' and 'yFrame'.

    Returns:
        (left, right): two dicts mapping keypoint type -> (xFrame, yFrame)
        tuple. If a type occurs more than once in a crop, the last one wins.
    """
    def _by_type(items):
        return {kp['keypointType']: (kp['xFrame'], kp['yFrame']) for kp in items}

    return _by_type(keypoints['leftCrop']), _by_type(keypoints['rightCrop'])

# Keypoint types used as features; sorted so that pair and column order is deterministic.
body_parts = sorted([
    'UPPER_LIP',
    'TAIL_NOTCH',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'ADIPOSE_FIN',
    'EYE',
    'DORSAL_FIN',
    'ANAL_FIN'
])

# Every row of `df` yields two samples (left crop, then right crop). For each
# sample we record the crop's error and, for every unordered pair of body
# parts, the predicted distance between them divided by the predicted
# UPPER_LIP-to-TAIL_NOTCH distance of the same crop.
sides, manhattan_errors = [], []
pairwise_distance_values = defaultdict(list)
for counter, (_, row) in enumerate(df.iterrows()):
    if counter % 1000 == 0:
        print(counter)

    # The stored annotation uses single quotes, so swap them before JSON-decoding.
    pred_keypoints = json.loads(row.akpd.decode('utf8').replace("'", '"'))
    pred_left_keypoints, pred_right_keypoints = get_left_right_keypoints(pred_keypoints)

    for side, error, side_keypoints in (
        ('left', row.leftError, pred_left_keypoints),
        ('right', row.rightError, pred_right_keypoints),
    ):
        sides.append(side)
        manhattan_errors.append(error)

        # Normalizing length is the same for every pair of this crop, so compute it once.
        body_length = euclidean_distance(np.array(side_keypoints['UPPER_LIP']),
                                         np.array(side_keypoints['TAIL_NOTCH']))

        # get predicted pairwise distances
        for i in range(len(body_parts)-1):
            for j in range(i+1, len(body_parts)):
                bp1, bp2 = body_parts[i], body_parts[j]
                dist = euclidean_distance(np.array(side_keypoints[bp1]),
                                          np.array(side_keypoints[bp2]))
                pairwise_distance_values['{}-{}'.format(bp1, bp2)].append(dist / body_length)


# One row per (image row, side); one column per body-part pair, in i < j order.
features_df = pd.DataFrame({
    'side': sides,
    'manhattan_errors': manhattan_errors,
    **pairwise_distance_values
})
            
        

# json.loads(df.keypoints.iloc[0].decode('utf8').replace("'", '"'))['leftCrop']

In [None]:
# Drop every sample that has a missing value in any column.
features_df = features_df.dropna()

In [None]:
# Names of the pairwise-distance columns: one '<part>-<part>' label for every
# unordered pair of body parts, first part earlier in `body_parts` than the second.
features = [
    '{}-{}'.format(first_bp, second_bp)
    for pos, first_bp in enumerate(body_parts[:-1])
    for second_bp in body_parts[pos + 1:]
]

In [None]:
# Binary target: 1 when the sample's error is below 20, else 0.
X, y = features_df[features].values, (features_df.manhattan_errors < 20).values.astype(int)
# Fixed seeds keep the split (and the probability calibration inside SVC, which
# shuffles data internally) identical across runs, so downstream numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
clf = SVC(probability=True, random_state=0)
clf.fit(X_train, y_train)




In [None]:
9

In [None]:
clf.score(X_test, y_test)

In [None]:
p_test = clf.predict_proba(X_test)

In [None]:
# Precision/recall trade-off of the SVM, ranked by its decision function on the test split.
decision_scores = clf.decision_function(X_test)
precision, recall, _ = precision_recall_curve(y_test, decision_scores)

# Only pass `step` to fill_between when the installed matplotlib accepts it.
if 'step' in signature(plt.fill_between).parameters:
    step_kwargs = {'step': 'post'}
else:
    step_kwargs = {}

plt.step(recall, precision, where='post', color='b', alpha=0.2)
plt.fill_between(recall, precision, color='b', alpha=0.2, **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve')

In [None]:
y_test.sum() / y_test.shape[0]

In [None]:
# Classify as positive when the predicted probability of class 1 exceeds the threshold.
t = 0.4
y_pred = (p_test[:, 1] > t).astype(int)
true_positives = y_pred[y_test == 1].sum()
print(true_positives / y_test.sum())  # recall: share of actual positives that were found
print(true_positives / y_pred.sum())  # precision: share of predicted positives that are correct



<h1> Two Layer Neural Network </h1>

In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import math
from keras.layers import Input, Dense, Flatten
from keras.models import Model
from keras.optimizers import RMSprop
from keras.models import load_model


In [None]:
def get_left_right_keypoints(keypoints):
    """Index the keypoints of each crop by keypoint type, as numpy vectors.

    Args:
        keypoints: dict with 'leftCrop' and 'rightCrop' lists; every item
            carries 'keypointType', 'xFrame' and 'yFrame'.

    Returns:
        (left, right): two dicts mapping keypoint type -> np.array([xFrame, yFrame]).
        If a type occurs more than once in a crop, the last one wins.
    """
    per_crop = []
    for crop in ('leftCrop', 'rightCrop'):
        per_crop.append({
            kp['keypointType']: np.array([kp['xFrame'], kp['yFrame']])
            for kp in keypoints[crop]
        })

    left_keypoints, right_keypoints = per_crop
    return left_keypoints, right_keypoints

In [None]:
def rotate(point, angle, origin=(0, 0)):
    """
    Rotate `point` counterclockwise about `origin` by `angle` radians.

    `point` and `origin` are (x, y) pairs; the rotated coordinates are
    returned as an (x, y) tuple.
    """
    origin_x, origin_y = origin
    point_x, point_y = point

    # Offset of the point from the pivot, and the rotation coefficients.
    dx = point_x - origin_x
    dy = point_y - origin_y
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)

    rotated_x = origin_x + cos_a * dx - sin_a * dy
    rotated_y = origin_y + sin_a * dx + cos_a * dy
    return rotated_x, rotated_y


def normalize_keypoints(keypoints, origin_bp = 'TAIL_NOTCH'):
    """Express one crop's keypoints relative to `origin_bp`, rotated and scaled by UPPER_LIP.

    For every part in the module-level `body_parts`: subtract the position of
    `origin_bp`, negate y, rotate by minus the angle of the translated
    UPPER_LIP (so UPPER_LIP ends up on the x axis), and divide by UPPER_LIP's
    distance from the origin.

    Args:
        keypoints: dict mapping keypoint type -> length-2 coordinate vector.
            The dict is not modified.
        origin_bp: keypoint type that becomes the origin.

    Returns:
        A new dict with the same keys as `keypoints`; entries for `body_parts`
        are normalized numpy arrays, all other entries are carried over as-is.

    Note:
        Earlier revisions subtracted `keypoints[origin_bp]` while overwriting
        it in the same loop, so parts iterated after `origin_bp` were never
        translated. Models fitted on features from that behavior need retraining.
    """
    # Snapshot the origin first so every part is translated by the same vector,
    # whatever position origin_bp has in the iteration order.
    origin = np.array(keypoints[origin_bp])
    normalized = dict(keypoints)

    # translation
    for bp in body_parts:
        shifted = np.array(keypoints[bp]) - origin
        # presumably flips image coordinates (y down) to y-up -- TODO confirm
        shifted[1] = -shifted[1]
        normalized[bp] = shifted

    # rotation & compression
    # Angle and scale are taken once from the translated UPPER_LIP. np.arctan
    # (not arctan2) is kept on purpose: it preserves which way the fish faces.
    upper_lip = normalized['UPPER_LIP']
    angle = np.arctan(upper_lip[1] / upper_lip[0])
    scale = np.linalg.norm(upper_lip)
    for bp in body_parts:
        normalized[bp] = np.array(rotate(normalized[bp], -angle)) / scale

    return normalized
    

<h1> Construct dataset to feed to network </h1>

In [None]:
# Keypoint types fed to the network; sorted so the column order is deterministic.
body_parts = sorted([
    'UPPER_LIP',
    'TAIL_NOTCH',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'ADIPOSE_FIN',
    'EYE',
    'DORSAL_FIN',
    'ANAL_FIN'
])

# Every row of `df` yields two samples (left crop, then right crop): the
# normalized predicted coordinates of each body part, paired with that crop's error.
X_values, manhattan_errors = [], []
for counter, (_, row) in enumerate(df.iterrows()):
    if counter % 1000 == 0:
        print(counter)

    # The stored annotation uses single quotes, so swap them before JSON-decoding.
    pred_keypoints = json.loads(row.akpd.decode('utf8').replace("'", '"'))
    pred_left_keypoints, pred_right_keypoints = get_left_right_keypoints(pred_keypoints)

    for side_keypoints, side_error in (
        (pred_left_keypoints, row.leftError),
        (pred_right_keypoints, row.rightError),
    ):
        norm_keypoints = normalize_keypoints(side_keypoints)
        X_values.append([norm_keypoints[bp] for bp in body_parts])
        manhattan_errors.append(side_error)
        
    



In [None]:
# X_values holds, per sample, one coordinate vector per body part; swapping
# axes 1 and 2 puts the coordinate axis before the body-part axis.
X = np.swapaxes(np.array(X_values), 1, 2)
# Alternative, stricter label threshold (disabled):
# y = (np.array(manhattan_errors) < 10).astype(int)
# Label is 1 when the sample's error is below 20, else 0.
y = (np.array(manhattan_errors) < 20).astype(int)

<h1> Train network </h1>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
# One sample is a 2 x 8 array (see the Input shape below).
inputs = Input(shape=(2, 8))

# Stack the ReLU hidden layers; each Dense layer acts on the last axis of its input.
hidden = inputs
for units in (8, 24, 24):
    hidden = Dense(units, activation='relu')(hidden)

# Flatten the per-row activations and squash them into a single sigmoid score.
flat = Flatten()(hidden)
predictions = Dense(1, activation='sigmoid')(flat)

In [None]:
# Wrap the layer graph defined above into a trainable model.
model = Model(inputs=inputs, outputs=predictions)
# NOTE(review): newer Keras releases spell this argument `learning_rate` -- confirm against the installed version.
optimizer = RMSprop(lr=0.001)
# Binary cross-entropy loss; accuracy is reported as a training metric.
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Train for 100 epochs on the training split; no validation data is passed.
model.fit(X_train, y_train, epochs=100)

In [None]:
model.save('/root/data/temp/akpd_scorer_model.h5')

In [None]:
X_train[0]

In [None]:
# Sanity check: score one training sample with the trained `model`
# (wrapped in an outer array because predict expects a batch axis).
model.predict(np.array([X_train[0],]))

In [None]:
import numpy as np
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import math
from keras.layers import Input, Dense, Flatten
from keras.models import Model
from keras.optimizers import RMSprop
from keras.models import load_model


class AKPDPredictionScorer(object):
    """Scores an AKPD keypoint prediction with a saved Keras model.

    The keypoints of the left and right crop are normalized separately
    (translated to an origin keypoint, y negated, rotated and scaled using
    UPPER_LIP) and passed to the model one crop at a time. The confidence of
    the prediction is the lower of the two crop scores.
    """

    def __init__(self, model_f, body_parts):
        """
        Args:
            model_f: path of the model file handed to `load_model`.
            body_parts: iterable of keypoint type names to score. Stored
                sorted; this order defines the model's input columns.
        """
        self.model = load_model(model_f)
        self.body_parts = sorted(body_parts)

    def _get_left_right_keypoints(self, keypoints):
        """Return (left, right) dicts mapping keypoint type -> np.array([xFrame, yFrame])."""
        left_keypoints, right_keypoints = {}, {}
        for item in keypoints['leftCrop']:
            left_keypoints[item['keypointType']] = np.array([item['xFrame'], item['yFrame']])

        for item in keypoints['rightCrop']:
            right_keypoints[item['keypointType']] = np.array([item['xFrame'], item['yFrame']])

        return left_keypoints, right_keypoints

    def _rotate(self, point, angle, origin=(0, 0)):
        """
        Rotate a point counterclockwise by a given angle around a given origin.

        The angle should be given in radians.
        """
        ox, oy = origin
        px, py = point

        qx = ox + math.cos(angle) * (px - ox) - math.sin(angle) * (py - oy)
        qy = oy + math.sin(angle) * (px - ox) + math.cos(angle) * (py - oy)
        return qx, qy

    def _normalize_keypoints(self, keypoints, origin_bp='TAIL_NOTCH'):
        """Express one crop's keypoints relative to `origin_bp`, rotated and scaled by UPPER_LIP.

        For every part in `self.body_parts`: subtract the position of
        `origin_bp`, negate y, rotate by minus the angle of the translated
        UPPER_LIP (so UPPER_LIP ends up on the x axis), and divide by
        UPPER_LIP's distance from the origin.

        Args:
            keypoints: dict mapping keypoint type -> length-2 coordinate
                vector. The dict is not modified.
            origin_bp: keypoint type that becomes the origin.

        Returns:
            A new dict with the same keys as `keypoints`; entries for
            `self.body_parts` are normalized numpy arrays, all other entries
            are carried over as-is.

        Note:
            Earlier revisions iterated a module-level `body_parts` (a
            NameError outside the notebook) and subtracted
            `keypoints[origin_bp]` while overwriting it in the same loop, so
            parts sorted after `origin_bp` were never translated. A model
            file fitted on features from that behavior needs retraining.
        """
        # Snapshot the origin first so every part is translated by the same
        # vector, whatever position origin_bp has in the sorted order.
        origin = np.array(keypoints[origin_bp])
        normalized = dict(keypoints)

        # translation
        for bp in self.body_parts:
            shifted = np.array(keypoints[bp]) - origin
            # presumably flips image coordinates (y down) to y-up -- TODO confirm
            shifted[1] = -shifted[1]
            normalized[bp] = shifted

        # rotation & compression
        # Angle and scale are taken once from the translated UPPER_LIP.
        # np.arctan (not arctan2) is kept on purpose: it preserves which way
        # the fish faces.
        upper_lip = normalized['UPPER_LIP']
        angle = np.arctan(upper_lip[1] / upper_lip[0])
        scale = np.linalg.norm(upper_lip)
        for bp in self.body_parts:
            normalized[bp] = np.array(self._rotate(normalized[bp], -angle)) / scale

        return normalized

    def _generate_one_side_score(self, coords):
        """Run the model on one crop.

        `coords` is a sequence of coordinate vectors, one per body part; it is
        wrapped into a batch of one and axes 1 and 2 are swapped before
        prediction. Returns whatever `self.model.predict` returns.
        """
        X = np.array([coords, ])
        X = np.swapaxes(X, 1, 2)
        return self.model.predict(X)

    def get_confidence_score(self, pred_keypoints):
        """Return the lower of the left-crop and right-crop model scores.

        Args:
            pred_keypoints: dict with 'leftCrop' and 'rightCrop' lists of
                keypoint items ('keypointType', 'xFrame', 'yFrame').
        """
        side_scores = []
        for crop_keypoints in self._get_left_right_keypoints(pred_keypoints):
            norm_keypoints = self._normalize_keypoints(crop_keypoints)
            coords = [norm_keypoints[bp] for bp in self.body_parts]
            # predict returns a batch; take the single score of the single sample
            side_scores.append(self._generate_one_side_score(coords)[0][0])

        return min(side_scores)

    

pred_keypoints = {"version": 2, "leftCrop": [{"xCrop": 58, "yCrop": 367, "xFrame": 382, "yFrame": 959, "keypointType": "UPPER_LIP"}, {"xCrop": 232, "yCrop": 345, "xFrame": 556, "yFrame": 937, "keypointType": "EYE"}, {"xCrop": 724, "yCrop": 70, "xFrame": 1048, "yFrame": 662, "keypointType": "DORSAL_FIN"}, {"xCrop": 1255, "yCrop": 150, "xFrame": 1579, "yFrame": 742, "keypointType": "ADIPOSE_FIN"}, {"xCrop": 1426, "yCrop": 209, "xFrame": 1750, "yFrame": 801, "keypointType": "UPPER_PRECAUDAL_PIT"}, {"xCrop": 1525, "yCrop": 275, "xFrame": 1849, "yFrame": 867, "keypointType": "HYPURAL_PLATE"}, {"xCrop": 1623, "yCrop": 283, "xFrame": 1947, "yFrame": 875, "keypointType": "TAIL_NOTCH"}, {"xCrop": 1430, "yCrop": 328, "xFrame": 1754, "yFrame": 920, "keypointType": "LOWER_PRECAUDAL_PIT"}, {"xCrop": 1187, "yCrop": 423, "xFrame": 1511, "yFrame": 1015, "keypointType": "ANAL_FIN"}, {"xCrop": 900, "yCrop": 484, "xFrame": 1224, "yFrame": 1076, "keypointType": "PELVIC_FIN"}, {"xCrop": 466, "yCrop": 462, "xFrame": 790, "yFrame": 1054, "keypointType": "PECTORAL_FIN"}], "rightCrop": [{"xCrop": 21, "yCrop": 392, "xFrame": 83, "yFrame": 961, "keypointType": "UPPER_LIP"}, {"xCrop": 185, "yCrop": 363, "xFrame": 247, "yFrame": 932, "keypointType": "EYE"}, {"xCrop": 708, "yCrop": 78, "xFrame": 770, "yFrame": 647, "keypointType": "DORSAL_FIN"}, {"xCrop": 1261, "yCrop": 171, "xFrame": 1323, "yFrame": 740, "keypointType": "ADIPOSE_FIN"}, {"xCrop": 1462, "yCrop": 228, "xFrame": 1524, "yFrame": 797, "keypointType": "UPPER_PRECAUDAL_PIT"}, {"xCrop": 1538, "yCrop": 294, "xFrame": 1600, "yFrame": 863, "keypointType": "HYPURAL_PLATE"}, {"xCrop": 1645, "yCrop": 302, "xFrame": 1707, "yFrame": 871, "keypointType": "TAIL_NOTCH"}, {"xCrop": 1445, "yCrop": 345, "xFrame": 1507, "yFrame": 914, "keypointType": "LOWER_PRECAUDAL_PIT"}, {"xCrop": 1198, "yCrop": 443, "xFrame": 1260, "yFrame": 1012, "keypointType": "ANAL_FIN"}, {"xCrop": 901, "yCrop": 523, "xFrame": 963, "yFrame": 1092, "keypointType": 
"PELVIC_FIN"}, {"xCrop": 414, "yCrop": 481, "xFrame": 476, "yFrame": 1050, "keypointType": "PECTORAL_FIN"}]}
# Keypoint types handed to the scorer (the scorer sorts them again internally).
body_parts = sorted([
    'UPPER_LIP',
    'TAIL_NOTCH',
    'PECTORAL_FIN',
    'PELVIC_FIN',
    'ADIPOSE_FIN',
    'EYE',
    'DORSAL_FIN',
    'ANAL_FIN'
])

# Same path the training section writes with model.save(...).
f = '/root/data/temp/akpd_scorer_model.h5'
aps = AKPDPredictionScorer(f, body_parts)
# Confidence for the example prediction: the lower of the left/right crop scores.
aps.get_confidence_score(pred_keypoints)



<h1> Plot Precision / Recall </h1>

In [None]:
# Sweep the decision threshold over the network's test scores and record
# precision and recall at each threshold.
p_test = model.predict(X_test).flatten()
thresholds = np.arange(0.0, 0.86, 0.01)
precisions, recalls = [], []
for t in thresholds:
    y_pred = (p_test >= t).astype(int)
    true_positives = y_pred[y_test == 1].sum()
    predicted_positives = y_pred.sum()
    recall = true_positives / y_test.sum()
    # Precision is undefined when nothing is predicted positive; store NaN
    # (not drawn by scatter) instead of triggering a 0/0 RuntimeWarning.
    precision = true_positives / predicted_positives if predicted_positives else np.nan
    precisions.append(precision)
    recalls.append(recall)


plt.figure(figsize=(20, 10))
plt.scatter(precisions, recalls)
# Label the axes explicitly: precision is on x here, unlike the usual PR-curve layout.
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.title('Precision / Recall across score thresholds')
plt.grid()
plt.show()



<h1> Plot Prioritizer Performance </h1>

<h2> Plot actual prioritizer performance </h2>

In [None]:
y_test_cache = y_test.copy()

In [None]:
# Recover the row positions of the test samples: splitting an index array with the
# same test_size and random_state as the X/y split yields the matching partition,
# and element [3] of the result is the test half of that index array.
idx = train_test_split(X, np.array(list(range(X.shape[0]))), test_size=0.33, random_state=0)[3]


In [None]:
y_test = (np.array(manhattan_errors)[idx] < 10)

In [None]:
y_test = y_test_cache.copy()

In [None]:
# Number of good predictions that must be reached; drawn as the dashed line below.
cutoff = 1500


# Pair every test sample's model score with its label, then order the pairs three ways:
# by model score (prioritizer on), by label (ideal ordering), and at random (prioritizer off).
scores = list(zip(p_test, y_test))
sorted_by_score = list(reversed(sorted(scores, key=lambda x: x[0])))
perfectly_sorted = list(reversed(sorted(scores, key=lambda x: x[1])))
# Seeded so the baseline curve is the same on every run.
randomly_shuffled = shuffle(scores, random_state=0)

# Running count of good predictions as samples are processed in each order.
cum_randomly_shuffled = np.cumsum(np.array([x[1] for x in randomly_shuffled]))
cum_sorted_by_score = np.cumsum(np.array([x[1] for x in sorted_by_score]))
cum_perfectly_sorted = np.cumsum(np.array([x[1] for x in perfectly_sorted]))


plt.figure(figsize=(20, 10))
plt.plot(cum_randomly_shuffled, color='r', label='AKPD Prioritizer Inactive')
plt.plot(cum_sorted_by_score, color='b', label='AKPD Prioritizer Active')
plt.plot(cum_perfectly_sorted, color='g', label='Perfect Theoretical Performance')

# Span the line over the plotted samples and derive the label from `cutoff`
# so the two cannot drift apart.
plt.hlines(cutoff, 0, len(scores), linestyles='dashed',
           label='Required daily good image count: {}'.format(cutoff))

plt.title('AKPD Prioritizer Performance')
plt.xlabel('Number of images analyzed')
plt.ylabel('Number of good AKPD predictions')
plt.legend()
plt.grid()
plt.show()
    

In [None]:
# Positions in the score-sorted order at which the running count of good
# predictions equals `cutoff` (the first entry is where the cutoff is first reached).
np.where(cum_sorted_by_score == cutoff)[0]