In [None]:
import glob
import json
import numpy as np
import os

import boto3
from matplotlib import pyplot as plt
import pandas as pd
from PIL import Image
from pycocotools.coco import COCO
from skimage.io import imread

%matplotlib inline

# Raise pandas' display limits so wide frames and long cell values render in full.
for option_name in ('display.width', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, 500)


<h1> Load crop dataset </h1>

In [None]:
# HDF5 dump containing the crop analysis table, and the key it is stored under.
analysis_df_path = '/root/data/alok/crop_data/data_dumps/analysis_df.h5'
key = 'df'
analysis_df = pd.read_hdf(analysis_df_path, key)

# Derived per-crop features: width * height and width / height of the crop.
analysis_df['crop_size'] = analysis_df['image_width_px'] * analysis_df['image_height_px']
analysis_df['aspect_ratio'] = analysis_df['image_width_px'] / analysis_df['image_height_px']

<h3> Visualize crop size for crops that were accepted in QA vs. rejected by Cogito </h3>

In [None]:
# Distribution of crop_size over crops accepted in QA.

# Accepted = created by 'gunnar@aquabyte.ai' AND has a non-negative adjusted
# adult-female count (a null count compares False against >= 0, so it is excluded).
created_by_qa = analysis_df['created_by'] == 'gunnar@aquabyte.ai'
has_valid_count = analysis_df['adult_female_count_adjusted'] >= 0
accepted_mask = created_by_qa & has_valid_count
plt.hist(analysis_df.loc[accepted_mask, 'crop_size'])
plt.show()


In [None]:
# Distribution of crop_size over crops rejected by Cogito.

# A missing adjusted adult-female count is used as the rejection marker.
# Alternative definition, currently disabled:
# rejected_mask = (analysis_df.is_skipped == True)
rejected_mask = analysis_df['adult_female_count_adjusted'].isna()
plt.hist(analysis_df.loc[rejected_mask, 'crop_size'])
plt.show()

<h3> Visualize aspect ratio for crops that were accepted in QA versus rejected by Cogito </h3>

In [None]:
# Distribution of aspect_ratio over crops accepted in QA.

# Accepted = created by 'gunnar@aquabyte.ai' AND has a non-negative adjusted
# adult-female count (a null count compares False against >= 0, so it is excluded).
created_by_qa = analysis_df['created_by'] == 'gunnar@aquabyte.ai'
has_valid_count = analysis_df['adult_female_count_adjusted'] >= 0
accepted_mask = created_by_qa & has_valid_count
plt.hist(analysis_df.loc[accepted_mask, 'aspect_ratio'])
plt.show()


In [None]:
# Distribution of aspect_ratio over crops rejected by Cogito.

# A missing adjusted adult-female count is used as the rejection marker.
# Alternative definition, currently disabled:
# rejected_mask = (analysis_df.is_skipped == True)
rejected_mask = analysis_df['adult_female_count_adjusted'].isna()
plt.hist(analysis_df.loc[rejected_mask, 'aspect_ratio'])
plt.show()

<h1> Create precision / recall curve for training data </h1>

<h3> Define the positive outcome as a crop being rejected due to the size threshold, and the negative outcome as a crop being accepted </h3>

In [None]:
# Ground-truth labels for the threshold classifier:
#   negative = crop created by 'gunnar@aquabyte.ai' with a non-negative adjusted adult-female count
#   positive = crop with no adjusted adult-female count recorded
true_negative_mask = (analysis_df.created_by == 'gunnar@aquabyte.ai') & (analysis_df.adult_female_count_adjusted >= 0)
true_positive_mask = analysis_df.adult_female_count_adjusted.isnull()

metric = 'crop_size'
# Candidate thresholds: the 0th..99th percentiles of the chosen metric.
thresholds = np.percentile(analysis_df[metric], list(range(100)))
actual_positive_cnt = int(true_positive_mask.sum())

# Sweep the thresholds. Exactly one precision and one recall value is appended per
# threshold so that `thresholds`, `precisions` and `recalls` stay index-aligned.
# Previously, thresholds with no positive predictions (e.g. the 0th percentile,
# since nothing is strictly below the minimum) were silently skipped, which
# shifted every later (threshold, precision, recall) triple by one position.
precisions, recalls = [], []
for t in thresholds:
    # A crop is predicted positive when its metric is strictly below the threshold.
    positive_predictions_mask = analysis_df[metric] < t
    predicted_positive_cnt = int(positive_predictions_mask.sum())
    correctly_predicted_cnt = int((positive_predictions_mask & true_positive_mask).sum())

    # Precision is undefined without any positive prediction, and recall is undefined
    # without any actual positive; record NaN in those cases (matplotlib skips NaN points).
    if predicted_positive_cnt > 0:
        precision = correctly_predicted_cnt / predicted_positive_cnt
    else:
        precision = np.nan
    if actual_positive_cnt > 0:
        recall = correctly_predicted_cnt / actual_positive_cnt
    else:
        recall = np.nan
    precisions.append(precision)
    recalls.append(recall)
    
    

In [None]:
# Precision (y) against recall (x), one point per evaluated threshold.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(recalls, precisions)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision vs. Recall for {} based classifier'.format(metric))
plt.show()

In [None]:
# Print one line per threshold: threshold value, precision, recall.
# NOTE: zip stops at the shortest input, so the rows only line up when
# precisions and recalls hold exactly one entry per threshold.
for threshold_value, precision_value, recall_value in zip(thresholds, precisions, recalls):
    print(threshold_value, precision_value, recall_value)

In [None]:
# Share of all rows that are labelled positive by true_positive_mask.
len(analysis_df[true_positive_mask]) / len(analysis_df)

<h1> Investigate bad cases </h1>

In [None]:
# Inspect the smallest false positives produced by a 5th-percentile crop-size threshold.
threshold = np.percentile(analysis_df.crop_size, 5)

# Recompute the predictions for this specific threshold. The previous version ignored
# `threshold` and reused `positive_predictions_mask` as left over from the last
# iteration of the sweep loop (the 99th-percentile threshold), which made the result
# depend on hidden state from another cell.
predicted_positive_mask = analysis_df.crop_size < threshold

# False positive: predicted positive but not labelled positive.
false_positive_mask = predicted_positive_mask & ~true_positive_mask
tdf = analysis_df[false_positive_mask].sort_values('crop_size', ascending=True).head(10)
tdf


In [None]:
# COCO helper constructed without an annotation file.
coco = COCO()
# Register a single image entry (height 3000, width 4096) at position 0 so that
# annotations with image_id 0 can look up their frame dimensions.
coco.imgs = [{'height': 3000, 'width': 4096}]

In [None]:
# Position (within tdf) of the crop to inspect.
i = 1

# Open the image stored at that row's image_path.
image_f = tdf['image_path'].iloc[i]
im = Image.open(image_f)

# The segmentation column holds a JSON string; its 'segmentation' entry is handed
# to COCO as an annotation on image 0 and rasterised into a mask.
segmentation_record = json.loads(tdf['segmentation'].iloc[i])
ann = {'image_id': 0, 'segmentation': segmentation_record['segmentation']}
m = coco.annToMask(ann)



In [None]:
# Display the opened image as the cell output.
im

In [None]:
# Convert the image to single-channel grayscale, multiply it element-wise by the
# mask `m`, and display the product as an image.
grayscale_pixels = np.array(im.convert('L'))
Image.fromarray(grayscale_pixels * m)