In [None]:
import datetime as dt
import json, os
import random
from collections import defaultdict
from itertools import combinations
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import norm
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.preprocessing import StandardScaler
from PIL import Image, ImageDraw
# NOTE: rebinds `Image`; kept after the PIL import so the name resolves to IPython's Image as before.
from IPython.display import Image

from aquabyte.accuracy_metrics import AccuracyMetricsGenerator
from aquabyte.data_access_utils import S3AccessUtils, RDSAccessUtils
from aquabyte.optics import euclidean_distance, pixel2world
from aquabyte.visualize import Visualizer

# Raise pandas' display limits so wide/long frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Load the lice-annotation rows for one pen / capture window from the production database.
s3_access_utils = S3AccessUtils('/root/data')

# Context manager so the credentials file handle is closed instead of leaked.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))
query = """
    select * from lati_fish_detections_lice_annotations where pen_id = 56 and captured_at between '2019-12-01' and '2020-01-12';
"""
df = rds_access_utils.extract_from_database(query)



In [None]:
# Row masks reused by the following cells: annotations that were not skipped, and
# annotations whose work duration is below 600 s (expressed in ms).
is_submitted = df.is_skipped.eq(False)
reasonable_duration_mask = df.work_duration_ms.lt(600*1e3)

# Median work duration in seconds: submitted rows first, then everything else.
for subset_mask in (is_submitted, ~is_submitted):
    print(df[subset_mask].work_duration_ms.median() * 1e-3)

In [None]:
# Distribution of work duration for not-submitted vs. submitted annotations
# (restricted to rows under the duration cutoff).
skipped_durations = df[~is_submitted & reasonable_duration_mask].work_duration_ms
submitted_durations = df[is_submitted & reasonable_duration_mask].work_duration_ms
# Shared bin edges make the two histograms directly comparable.
duration_bins = np.histogram_bin_edges(pd.concat([skipped_durations, submitted_durations]), bins=20)

plt.figure(figsize=(20, 10))
# Semi-transparent bars: an opaque second histogram would hide the first one.
plt.hist(skipped_durations, bins=duration_bins, color='blue', alpha=0.5, label='Not submitted')
plt.hist(submitted_durations, bins=duration_bins, color='red', alpha=0.5, label='Submitted')
plt.xlabel('Work duration (ms)')
plt.ylabel('Number of annotations')
plt.legend()
plt.grid()
plt.show()


In [None]:
# Mean work duration (ms) of rows that were not submitted, restricted to the duration cutoff.
df[~is_submitted & reasonable_duration_mask].work_duration_ms.mean()

In [None]:
def generate_center_coordinate(metadata, x_direction=True):
    """Return the centre of a crop along one axis.

    Reads `x_coord`/`width` when `x_direction` is truthy and `y_coord`/`height`
    otherwise, and returns origin + half the extent.
    """
    origin_key, extent_key = ('x_coord', 'width') if x_direction else ('y_coord', 'height')
    return metadata[origin_key] + 0.5 * metadata[extent_key]

def retrieve_depth(metadata):
    """Return `metadata['depth_m_weekly_linear_model']['value']`, or None when that key is absent."""
    depth_key = 'depth_m_weekly_linear_model'
    return metadata[depth_key]['value'] if depth_key in metadata else None


# Derive per-crop centroid coordinates and depth from each row's metadata dict.
df['centroid_x'] = df.metadata.apply(generate_center_coordinate, x_direction=True)
df['centroid_y'] = df.metadata.apply(generate_center_coordinate, x_direction=False)
df['depth'] = df.metadata.apply(retrieve_depth)

In [None]:
# Accept rate and crop count per SQUARE_SIZE x SQUARE_SIZE tile of the frame, binned by crop centroid.
MAX_WIDTH, MAX_HEIGHT = 4096, 3000
SQUARE_SIZE = 500
# Rates above this are zeroed before plotting (kept from the original analysis).
# NOTE(review): this makes high-accept tiles indistinguishable from empty ones — confirm it is intended.
MAX_DISPLAYED_ACCEPT_RATE = 0.49

# Tile edges. The frame border is appended explicitly so the partial tile at the far edge
# (x in 4000..4096) is covered rather than silently dropped.
x_values = list(np.arange(0, MAX_WIDTH, SQUARE_SIZE)) + [MAX_WIDTH]
y_values = list(np.arange(0, MAX_HEIGHT, SQUARE_SIZE)) + [MAX_HEIGHT]

good_crop_mask = (df.is_bad_crop != True)
accept_mask = (df.is_skipped == False)
good_crops = df[good_crop_mask]
accepted_crops = df[good_crop_mask & accept_mask]

# histogram2d uses half-open bins [low, high) (last bin closed), so a centroid lying exactly
# on a tile edge is counted once instead of falling between two strict inequalities.
counts = np.histogram2d(good_crops.centroid_x.astype(float),
                        good_crops.centroid_y.astype(float),
                        bins=[x_values, y_values])[0]
accepted_counts = np.histogram2d(accepted_crops.centroid_x.astype(float),
                                 accepted_crops.centroid_y.astype(float),
                                 bins=[x_values, y_values])[0]

# Tiles without any crop keep an accept rate of 0, as before.
results = np.divide(accepted_counts, counts, out=np.zeros_like(counts), where=counts > 0)
results[results > MAX_DISPLAYED_ACCEPT_RATE] = 0


In [None]:
# Heatmap of per-tile accept rate; transposed so rows follow the y index and columns the x index.
# The title is derived from the loaded frame instead of a hard-coded pen / month.
pen_label = ', '.join(str(pen) for pen in sorted(df.pen_id.unique()))
date_label = '{} to {}'.format(str(df.captured_at.min())[:10], str(df.captured_at.max())[:10])
plt.figure(figsize=(10, 5))
sns.heatmap(results.T, annot=True)
plt.title('Accept rate by Field Position, Pen ID = {}, {}'.format(pen_label, date_label))
plt.show()

In [None]:
# Heatmap of per-tile crop count; transposed so rows follow the y index and columns the x index.
# The title is derived from the loaded frame instead of a hard-coded pen / month.
pen_label = ', '.join(str(pen) for pen in sorted(df.pen_id.unique()))
date_label = '{} to {}'.format(str(df.captured_at.min())[:10], str(df.captured_at.max())[:10])
plt.figure(figsize=(10, 5))
sns.heatmap(counts.T, annot=True)
plt.title('Crop Count by Field Position, Pen ID = {}, {}'.format(pen_label, date_label))
plt.show()

In [None]:
# Accept-rate grid (top) and crop-count grid (bottom) in a single figure.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
for tile_grid, target_ax in zip((results, counts), axes):
    sns.heatmap(tile_grid.T, annot=True, ax=target_ax)

plt.show()

<h1> Generate large dataset </h1>

In [None]:
# Accept-rate (top row) and crop-count (bottom row) heatmaps, one column per date range.
# NOTE(review): ranges that fall outside the window loaded into `df` produce all-zero heatmaps.
date_ranges = [
    ('2019-10-01', '2019-11-01'),
    ('2019-11-01', '2019-12-01'),
    ('2019-12-01', '2020-01-01')
]
# Label the plots with the pen(s) actually present in the data rather than a hard-coded id.
pen_id = ', '.join(str(pen) for pen in sorted(df.pen_id.unique()))

# Grid definition is loop-invariant, so it is built once.
MAX_WIDTH, MAX_HEIGHT = 4096, 3000
SQUARE_SIZE = 500
# The frame border is appended so the partial tile at the far edge is covered.
x_values = list(np.arange(0, MAX_WIDTH, SQUARE_SIZE)) + [MAX_WIDTH]
y_values = list(np.arange(0, MAX_HEIGHT, SQUARE_SIZE)) + [MAX_HEIGHT]
accept_mask = (df.is_skipped == False)

# squeeze=False keeps `axes` two-dimensional even with a single date range.
figs, axes = plt.subplots(2, len(date_ranges), figsize=(30, 10), squeeze=False)

for date_idx, (start_date, end_date) in enumerate(date_ranges):
    date_mask = (df.captured_at > start_date) & (df.captured_at < end_date)
    crops_in_range = df[date_mask]
    accepted_in_range = df[date_mask & accept_mask]

    # Half-open bins count a centroid lying exactly on a tile edge once.
    counts = np.histogram2d(crops_in_range.centroid_x.astype(float),
                            crops_in_range.centroid_y.astype(float),
                            bins=[x_values, y_values])[0]
    accepted_counts = np.histogram2d(accepted_in_range.centroid_x.astype(float),
                                     accepted_in_range.centroid_y.astype(float),
                                     bins=[x_values, y_values])[0]
    # Empty tiles keep an accept rate of 0; rates above 0.49 are zeroed as in the original analysis.
    # NOTE(review): zeroing high rates hides them among empty tiles — confirm it is intended.
    results = np.divide(accepted_counts, counts, out=np.zeros_like(counts), where=counts > 0)
    results[results > 0.49] = 0

    sns.heatmap(results.T, annot=True, ax=axes[0, date_idx])
    sns.heatmap(counts.T, annot=True, ax=axes[1, date_idx])
    axes[0, date_idx].set_title('Accept rate, Pen ID = {}, {} - {}'.format(pen_id, start_date, end_date))
    axes[1, date_idx].set_title('Counts, Pen ID = {}, {} - {}'.format(pen_id, start_date, end_date))

plt.show()



In [None]:
# Cumulative number of submitted rows of `tdf`, in the frame's current row order.
# NOTE(review): `tdf` is first assigned in a later cell, so this fails on a fresh top-to-bottom run.
plt.plot(list(range(tdf.shape[0])), np.cumsum(tdf.is_submitted.values))

<h1> Generate out-of-sample results </h1>

In [None]:
# Load one day of annotations for pen 57 and drop crops whose URL marks them as research data.
# Context manager so the credentials file handle is closed instead of leaked.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))
query = """
    select * from lati_fish_detections_lice_annotations where pen_id = 57 and captured_at between '2020-01-10' and '2020-01-11';
"""

df = rds_access_utils.extract_from_database(query)
# na=False: a missing image_url counts as "not research" instead of breaking the `~` negation.
df = df[~df.image_url.str.contains('research', na=False)]


In [None]:
# baseline simulation

df['score'] = df.metadata.apply(lambda row: row['quality_score'])
df['simulated_completed_at'] = df.completed_at + dt.timedelta(hours=2)
df['is_submitted'] = df.is_skipped == False
queue, submits = [], []
last_ts = None
i = 0
start_date, end_date = '2020-01-10', '2020-01-11'
date_mask = (df.captured_at > start_date) & (df.captured_at < end_date)
tdf = df[date_mask].copy(deep=True)
print(tdf.captured_at.iloc[-1])
for idx, row in tdf.sort_values('simulated_completed_at').iterrows():
    if not last_ts: 
        additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
        
    else:
        additional_captures_mask = (tdf.captured_at > last_ts) & (tdf.captured_at <= row.simulated_completed_at)
    
    last_ts = row.simulated_completed_at
    additional_scores_and_submits = list(zip(tdf[additional_captures_mask].score.tolist(), 
                                    tdf[additional_captures_mask].is_submitted.tolist()))

    queue.extend(additional_scores_and_submits)
    queue.sort(key=lambda x: x[0], reverse=True)
    _, submit = queue.pop(0)
    submits.append(submit)
#     if i % 100 == 0:
#         print(i)
#     i += 1
    
fig, ax1 = plt.subplots(figsize=(20, 10))
# ax1.plot(range(tdf.shape[0]), np.cumsum((tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
ax1.plot(list(range(tdf.shape[0])), np.cumsum(tdf.is_submitted.values), color='blue', label='Depth-Based Prioritization')
ax1.plot(range(len(submits)), np.cumsum(np.array(submits)), color='red', label='2D Field Position-Based Prioritization')
ax1.set_xlabel('Num. images analyzed by Cogito')
ax1.set_ylabel('Num. images submitted to QA', color='blue')
ax1.axhline(50, linestyle='dashed', label='KPI', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.legend()

plt.title('Pen ID 65 (Hisdalen), Date={}'.format(start_date))
ax1.grid()
plt.show()

In [None]:
# Cumulative submissions when rows are taken in descending score order.
rows_by_score = df.sort_values('score', ascending=False)
plt.figure(figsize=(20, 10))
plt.plot(np.arange(rows_by_score.shape[0]), rows_by_score.is_submitted.cumsum())
plt.show()

In [None]:
# Capture times of the five highest-scoring rows.
df.sort_values('score', ascending=False).head(5).captured_at

In [None]:
# The five highest scores.
df.sort_values('score', ascending=False).head(5).score

In [None]:
# Capture times of the five rows that were completed first.
df.sort_values('completed_at').head(5).captured_at

In [None]:
# research simulation
# Load one day of annotations for pen 65 and drop crops whose URL marks them as research data.

# Context manager so the credentials file handle is closed instead of leaked.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))
query = """
    select * from lati_fish_detections_lice_annotations where pen_id = 65 and captured_at between '2020-01-10' and '2020-01-11';
"""
df = rds_access_utils.extract_from_database(query)
# na=False: a missing image_url counts as "not research" instead of breaking the `~` negation.
df = df[~df.image_url.str.contains('research', na=False)]


In [None]:
# Replays the day's annotations through a score-ordered queue with no added delay: at every
# completion time the crops captured since the previous completion join the queue, and the
# highest-scoring queued crop is the one "annotated".
df['score'] = df.metadata.apply(lambda row: row['quality_score'])
df['simulated_completed_at'] = df.completed_at
df['is_submitted'] = df.is_skipped == False
queue, submits = [], []
last_ts = None
i = 0
start_date, end_date = '2020-01-10', '2020-01-11'
date_mask = (df.captured_at > start_date) & (df.captured_at < end_date)
tdf = df[date_mask].copy(deep=True)

# Rows without a completion time cannot be replayed; the rest are walked in completion order.
completed_tdf = tdf.dropna(subset=['simulated_completed_at']).sort_values('simulated_completed_at')
for idx, row in completed_tdf.iterrows():
    additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
    if last_ts is not None:
        # Only crops captured since the previous completion are new to the queue.
        additional_captures_mask &= (tdf.captured_at > last_ts)

    last_ts = row.simulated_completed_at
    additional_captures = tdf[additional_captures_mask]
    additional_scores_and_submits = list(zip(additional_captures.score.tolist(),
                                             additional_captures.is_submitted.tolist()))

    queue.extend(additional_scores_and_submits)
    if queue:
        # Stable sort: equal scores keep their insertion order.
        queue.sort(key=lambda x: x[0], reverse=True)
        _, submit = queue.pop(0)
        submits.append(submit)
    if i % 100 == 0:
        print(i)
    i += 1

fig, ax1 = plt.subplots(figsize=(20, 10))
# Baseline curve: actual outcomes in completion order.
ax1.plot(range(completed_tdf.shape[0]), np.cumsum((completed_tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
ax1.plot(range(len(submits)), np.cumsum(np.array(submits, dtype=int)), color='red', label='2D Field Position-Based Prioritization')
ax1.set_xlabel('Num. images analyzed by Cogito')
ax1.set_ylabel('Num. images submitted to QA', color='blue')
ax1.axhline(50, linestyle='dashed', label='KPI', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.legend()

plt.title('Pen ID 65 (Hisdalen), Date={}'.format(start_date))
ax1.grid()
plt.show()

In [None]:
# Cumulative submissions when rows are taken in descending score order.
rows_by_score = df.sort_values('score', ascending=False)
plt.plot(np.arange(rows_by_score.shape[0]), rows_by_score.is_submitted.cumsum())

In [None]:
# Look up the annotation row(s) for one specific crop image.
url = 'https://aquabyte-crops.s3.eu-west-1.amazonaws.com/environment=production/site-id=43/pen-id=65/date=2020-01-10/hour=00/at=2020-01-10T00:15:49.348188000Z/left_frame_crop_362_914_4096_2255.jpg'
df.loc[df.image_url == url]
   
   

In [None]:
# Per group_id, in capture order: row number `r` and running count `c` of crops that also
# have an annotation_state_id = 3 record (the QA flag built by the inner query).
# Context manager so the credentials file handle is closed instead of leaked.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))
query = """
    select d.group_id, d.url_key, ROW_NUMBER() OVER (PARTITION BY d.group_id ORDER BY d.group_id, d.captured_at asc) as r, SUM(QA) OVER (PARTITION BY d.group_id ORDER BY d.group_id, d.captured_at asc) as c from (select a.url_key, a.captured_at, a.group_id, CASE when b.url_key is null THEN 0 ELSE 1 end as QA from prod.crop_annotation a left join prod.crop_annotation b on a.pen_id=b.pen_id and a.service_id=b.service_id and a.url_key=b.url_key and a.captured_at=b.captured_at and b.annotation_state_id=3 where a.annotation_state_id=2 and a.service_id=1 and a.pen_id=73 and a.captured_at between '2020-01-10' and '2020-01-11') d;
"""
sql_df = rds_access_utils.extract_from_database(query)



In [None]:
# Overlay the warehouse running QA count on the cumulative submissions of `tdf`.
# NOTE(review): the warehouse query filters on pen 73 while `tdf` comes from a different
# pen's query — confirm the two curves are meant to be compared.
plt.figure(figsize=(20, 10))
# Distinct colours and a legend: both curves were previously blue and the label never rendered.
plt.plot(list(range(sql_df.shape[0])), sql_df.c, color='orange', label='Data warehouse running QA count')
plt.plot(list(range(tdf.shape[0])), np.cumsum(tdf.is_submitted.values), color='blue', label='Depth-Based Prioritization')
plt.xlabel('Row index')
plt.ylabel('Cumulative count')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Load the pen 56 annotations captured between 2019-12-18 and 2019-12-31.
# Context manager so the credentials file handle is closed instead of leaked.
with open(os.environ['PROD_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))
query = """
    select * from lati_fish_detections_lice_annotations where pen_id = 56 and captured_at between '2019-12-18' and '2019-12-31';
"""
df = rds_access_utils.extract_from_database(query)



In [None]:
# Single-day replay with a delayed, score-ordered queue; also records the queue backlog
# after every simulated completion.
delay_hours = 24
score_key = 'quality_score'
df['date'] = df.captured_at.astype(str).apply(lambda x: x[:10])
df['score'] = df.metadata.apply(lambda x: x[score_key])
df['simulated_completed_at'] = df.completed_at + dt.timedelta(hours=delay_hours)
df['is_submitted'] = df.is_skipped == False


queue, submits = [], []
intermediate_backlog_count = 0
intermediate_backlog_counts = []
last_ts = None
i = 0
# The day must lie inside the window loaded into `df` (2019-12-18 .. 2019-12-31);
# the previous 2019-11-20 selected no rows at all.
start_date, end_date = '2019-12-20', '2019-12-21'
date_mask = (df.captured_at > start_date) & (df.captured_at < end_date)
tdf = df[date_mask].copy(deep=True)

# Rows without a completion time cannot be replayed; the rest are walked in completion order.
completed_tdf = tdf.dropna(subset=['simulated_completed_at']).sort_values('simulated_completed_at')
for idx, row in completed_tdf.iterrows():
    additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
    if last_ts is not None:
        # Only crops captured since the previous completion are new to the queue.
        additional_captures_mask &= (tdf.captured_at > last_ts)

    last_ts = row.simulated_completed_at
    additional_captures = tdf[additional_captures_mask]
    additional_scores_and_submits = list(zip(additional_captures.score.tolist(),
                                             additional_captures.is_submitted.tolist()))

    queue.extend(additional_scores_and_submits)
    if queue:
        # Stable sort: equal scores keep their insertion order.
        queue.sort(key=lambda x: x[0], reverse=True)
        _, submit = queue.pop(0)
        submits.append(submit)
    # Backlog = crops still queued after this completion.
    intermediate_backlog_count = len(queue)
    intermediate_backlog_counts.append(intermediate_backlog_count)
    if i % 100 == 0:
        print(i)
    i += 1

# Title reflects the pen(s) actually present in the data.
pen_label = ', '.join(str(pen) for pen in sorted(tdf.pen_id.unique()))

fig, ax1 = plt.subplots(figsize=(20, 10))
# Baseline curve follows completion order so it is comparable with the replayed curve.
ax1.plot(range(completed_tdf.shape[0]), np.cumsum((completed_tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
ax1.plot(range(len(submits)), np.cumsum(np.array(submits, dtype=int)), color='red', label='{}h delay, {}-based prioritization'.format(delay_hours, score_key))
ax1.set_xlabel('Num. images analyzed by Cogito')
ax1.set_ylabel('Num. images submitted to QA')
ax1.axhline(50, linestyle='dashed', label='KPI', color='green')
ax1.legend()

plt.title('Pen ID {}, Date={}'.format(pen_label, start_date))
ax1.grid()
plt.show()

In [None]:
# Single-day replay with a delayed queue ordered by crop area; also records the queue backlog
# after every simulated completion.
delay_hours = 24
score_key = 'crop_area'
df['date'] = df.captured_at.astype(str).apply(lambda x: x[:10])
df['score'] = df.metadata.apply(lambda x: x[score_key])
df['simulated_completed_at'] = df.completed_at + dt.timedelta(hours=delay_hours)
df['is_submitted'] = df.is_skipped == False


queue, submits = [], []
intermediate_backlog_count = 0
intermediate_backlog_counts = []
last_ts = None
i = 0
start_date, end_date = '2019-12-20', '2019-12-21'
date_mask = (df.captured_at > start_date) & (df.captured_at < end_date)
tdf = df[date_mask].copy(deep=True)

# Rows without a completion time cannot be replayed; the rest are walked in completion order.
completed_tdf = tdf.dropna(subset=['simulated_completed_at']).sort_values('simulated_completed_at')
for idx, row in completed_tdf.iterrows():
    additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
    if last_ts is not None:
        # Only crops captured since the previous completion are new to the queue.
        additional_captures_mask &= (tdf.captured_at > last_ts)

    last_ts = row.simulated_completed_at
    additional_captures = tdf[additional_captures_mask]
    additional_scores_and_submits = list(zip(additional_captures.score.tolist(),
                                             additional_captures.is_submitted.tolist()))

    queue.extend(additional_scores_and_submits)
    if queue:
        # Stable sort: equal scores keep their insertion order.
        queue.sort(key=lambda x: x[0], reverse=True)
        _, submit = queue.pop(0)
        submits.append(submit)
    # Backlog = crops still queued after this completion.
    intermediate_backlog_count = len(queue)
    intermediate_backlog_counts.append(intermediate_backlog_count)
    if i % 100 == 0:
        print(i)
    i += 1

# Title reflects the pen(s) actually present in the data.
pen_label = ', '.join(str(pen) for pen in sorted(tdf.pen_id.unique()))

fig, ax1 = plt.subplots(figsize=(20, 10))
# Baseline curve follows completion order so it is comparable with the replayed curve.
ax1.plot(range(completed_tdf.shape[0]), np.cumsum((completed_tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
ax1.plot(range(len(submits)), np.cumsum(np.array(submits, dtype=int)), color='red', label='{}h delay, {}-based prioritization'.format(delay_hours, score_key))
ax1.set_xlabel('Num. images analyzed by Cogito')
ax1.set_ylabel('Num. images submitted to QA')
ax1.axhline(50, linestyle='dashed', label='KPI', color='green')
ax1.legend()

plt.title('Pen ID {}, Date={}'.format(pen_label, start_date))
ax1.grid()
plt.show()

In [None]:
# Calendar date of each capture: first 10 characters of the timestamp's string form.
df['date'] = df.captured_at.astype(str).str[:10]

In [None]:
# One subplot per capture date: actual cumulative submissions vs. a delayed replay through a
# queue ordered by quality score.
column_count = 5
delay_hours = 2
score_key = 'quality_score'
# Sorted so the grid is chronological; unique() alone returns order of first appearance.
dates = sorted(df.date.unique())
row_count = max(1, -(-len(dates) // column_count))
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
figs, axes = plt.subplots(row_count, column_count, figsize=(30, 20), squeeze=False)

df['score'] = df.metadata.apply(lambda x: x[score_key])
df['simulated_completed_at'] = df.completed_at + dt.timedelta(hours=delay_hours)
df['is_submitted'] = df.is_skipped == False


# Every date is simulated, including the last one, by selecting on the date column directly.
for date_idx, start_date in enumerate(dates):
    date_mask = (df.date == start_date)
    tdf = df[date_mask].copy(deep=True)
    queue, submits = [], []
    intermediate_backlog_count = 0
    intermediate_backlog_counts = []
    last_ts = None

    # Rows without a completion time cannot be replayed; the rest are walked in completion order.
    completed_tdf = tdf.dropna(subset=['simulated_completed_at']).sort_values('simulated_completed_at')
    for idx, row in completed_tdf.iterrows():
        additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
        if last_ts is not None:
            # Only crops captured since the previous completion are new to the queue.
            additional_captures_mask &= (tdf.captured_at > last_ts)

        last_ts = row.simulated_completed_at
        additional_captures = tdf[additional_captures_mask]
        additional_scores_and_submits = list(zip(additional_captures.score.tolist(),
                                                 additional_captures.is_submitted.tolist()))

        queue.extend(additional_scores_and_submits)
        if queue:
            # Stable sort: equal scores keep their insertion order.
            queue.sort(key=lambda x: x[0], reverse=True)
            _, submit = queue.pop(0)
            submits.append(submit)
        # Backlog = crops still queued after this completion.
        intermediate_backlog_count = len(queue)
        intermediate_backlog_counts.append(intermediate_backlog_count)

    ax = axes[date_idx // column_count, date_idx % column_count]
    # Baseline curve follows completion order so it is comparable with the replayed curve.
    ax.plot(range(completed_tdf.shape[0]), np.cumsum((completed_tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
    ax.plot(range(len(submits)), np.cumsum(np.array(submits, dtype=int)), color='red', label='{}h delay, {}-based prioritization'.format(delay_hours, score_key))
    ax.set_xlabel('Num. images analyzed by Cogito')
    ax.set_ylabel('Num. images submitted to QA')
    ax.axhline(50, linestyle='dashed', label='KPI', color='green')
    ax.set_title(start_date)
    ax.legend()
    ax.grid()
    print('Completed date: {}'.format(start_date))

# Hide grid slots that have no date assigned.
for unused_ax in axes.ravel()[len(dates):]:
    unused_ax.axis('off')

plt.show()



In [None]:
# One subplot per capture date: actual cumulative submissions vs. a delayed replay through a
# queue ordered by crop area.
column_count = 5
delay_hours = 2
score_key = 'crop_area'
# Sorted so the grid is chronological; unique() alone returns order of first appearance.
dates = sorted(df.date.unique())
row_count = max(1, -(-len(dates) // column_count))
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
figs, axes = plt.subplots(row_count, column_count, figsize=(30, 20), squeeze=False)

df['score'] = df.metadata.apply(lambda x: x[score_key])
df['simulated_completed_at'] = df.completed_at + dt.timedelta(hours=delay_hours)
df['is_submitted'] = df.is_skipped == False


# Every date is simulated, including the last one, by selecting on the date column directly.
for date_idx, start_date in enumerate(dates):
    date_mask = (df.date == start_date)
    tdf = df[date_mask].copy(deep=True)
    queue, submits = [], []
    intermediate_backlog_count = 0
    intermediate_backlog_counts = []
    last_ts = None

    # Rows without a completion time cannot be replayed; the rest are walked in completion order.
    completed_tdf = tdf.dropna(subset=['simulated_completed_at']).sort_values('simulated_completed_at')
    for idx, row in completed_tdf.iterrows():
        additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
        if last_ts is not None:
            # Only crops captured since the previous completion are new to the queue.
            additional_captures_mask &= (tdf.captured_at > last_ts)

        last_ts = row.simulated_completed_at
        additional_captures = tdf[additional_captures_mask]
        additional_scores_and_submits = list(zip(additional_captures.score.tolist(),
                                                 additional_captures.is_submitted.tolist()))

        queue.extend(additional_scores_and_submits)
        if queue:
            # Stable sort: equal scores keep their insertion order.
            queue.sort(key=lambda x: x[0], reverse=True)
            _, submit = queue.pop(0)
            submits.append(submit)
        # Backlog = crops still queued after this completion.
        intermediate_backlog_count = len(queue)
        intermediate_backlog_counts.append(intermediate_backlog_count)

    ax = axes[date_idx // column_count, date_idx % column_count]
    # Baseline curve follows completion order so it is comparable with the replayed curve.
    ax.plot(range(completed_tdf.shape[0]), np.cumsum((completed_tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
    ax.plot(range(len(submits)), np.cumsum(np.array(submits, dtype=int)), color='red', label='{}h delay, {}-based prioritization'.format(delay_hours, score_key))
    ax.set_xlabel('Num. images analyzed by Cogito')
    ax.set_ylabel('Num. images submitted to QA')
    ax.axhline(50, linestyle='dashed', label='KPI', color='green')
    ax.set_title(start_date)
    ax.legend()
    ax.grid()
    print('Completed date: {}'.format(start_date))

# Hide grid slots that have no date assigned.
for unused_ax in axes.ravel()[len(dates):]:
    unused_ax.axis('off')

plt.show()



In [None]:
# Drop crops whose URL marks them as research data.
# na=False: a missing image_url counts as "not research" instead of breaking the `~` negation.
df = df[~df.image_url.str.contains('research', na=False)]

In [None]:
# One subplot per capture date: actual cumulative submissions vs. a delayed replay through a
# queue ordered by quality score (4 h delay).
column_count = 5
delay_hours = 4
score_key = 'quality_score'
# Sorted so the grid is chronological; unique() alone returns order of first appearance.
dates = sorted(df.date.unique())
row_count = max(1, -(-len(dates) // column_count))
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
figs, axes = plt.subplots(row_count, column_count, figsize=(30, 20), squeeze=False)

df['score'] = df.metadata.apply(lambda x: x[score_key])
df['simulated_completed_at'] = df.completed_at + dt.timedelta(hours=delay_hours)
df['is_submitted'] = df.is_skipped == False


# Every date is simulated, including the last one, by selecting on the date column directly.
for date_idx, start_date in enumerate(dates):
    date_mask = (df.date == start_date)
    tdf = df[date_mask].copy(deep=True)
    queue, submits = [], []
    intermediate_backlog_count = 0
    intermediate_backlog_counts = []
    last_ts = None

    # Rows without a completion time cannot be replayed; the rest are walked in completion order.
    completed_tdf = tdf.dropna(subset=['simulated_completed_at']).sort_values('simulated_completed_at')
    for idx, row in completed_tdf.iterrows():
        additional_captures_mask = (tdf.captured_at <= row.simulated_completed_at)
        if last_ts is not None:
            # Only crops captured since the previous completion are new to the queue.
            additional_captures_mask &= (tdf.captured_at > last_ts)

        last_ts = row.simulated_completed_at
        additional_captures = tdf[additional_captures_mask]
        additional_scores_and_submits = list(zip(additional_captures.score.tolist(),
                                                 additional_captures.is_submitted.tolist()))

        queue.extend(additional_scores_and_submits)
        if queue:
            # Stable sort: equal scores keep their insertion order.
            queue.sort(key=lambda x: x[0], reverse=True)
            _, submit = queue.pop(0)
            submits.append(submit)
        # Backlog = crops still queued after this completion.
        intermediate_backlog_count = len(queue)
        intermediate_backlog_counts.append(intermediate_backlog_count)

    ax = axes[date_idx // column_count, date_idx % column_count]
    # Baseline curve follows completion order so it is comparable with the replayed curve.
    ax.plot(range(completed_tdf.shape[0]), np.cumsum((completed_tdf.is_skipped == False).astype(int)), color='blue', label='Depth-Based Prioritization')
    ax.plot(range(len(submits)), np.cumsum(np.array(submits, dtype=int)), color='red', label='{}h delay, {}-based prioritization'.format(delay_hours, score_key))
    ax.set_xlabel('Num. images analyzed by Cogito')
    ax.set_ylabel('Num. images submitted to QA')
    ax.axhline(50, linestyle='dashed', label='KPI', color='green')
    ax.set_title(start_date)
    ax.legend()
    ax.grid()
    print('Completed date: {}'.format(start_date))

# Hide grid slots that have no date assigned.
for unused_ax in axes.ravel()[len(dates):]:
    unused_ax.axis('off')

plt.show()



In [None]:
# Correlation matrix between a position-derived score and the stored quality_score.
# NOTE(review): `get_score` is not defined anywhere in this notebook, and `centroid_x` /
# `centroid_y` are not recomputed after `df` is reloaded — this cell fails on a fresh run.
np.corrcoef(df.apply(lambda row: get_score(row.centroid_x, row.centroid_y), axis=1), 
            df.metadata.apply(lambda x: x['quality_score']))

In [None]:
# Distribution of the queue backlog recorded by the most recent simulation run.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(intermediate_backlog_counts, bins=20)
plt.show()

In [None]:
# Rows captured strictly between 2019-12-20 and 2019-12-21 (both bounds exclusive).
date_mask = (df.captured_at > '2019-12-20') & (df.captured_at < '2019-12-21')


In [None]:
# Select the crops whose centroid falls inside one grid tile (x index 3, y index 1).
# `df` has been reloaded since the centroid columns were first derived, so rebuild them
# when they are missing instead of failing on the attribute lookup.
if 'centroid_x' not in df.columns or 'centroid_y' not in df.columns:
    df['centroid_x'] = df.metadata.apply(lambda x: generate_center_coordinate(x, x_direction=True))
    df['centroid_y'] = df.metadata.apply(lambda x: generate_center_coordinate(x, x_direction=False))

x_idx, y_idx = 3, 1
mask_x = (df.centroid_x > x_values[x_idx]) & (df.centroid_x < x_values[x_idx+1])
mask_y = (df.centroid_y > y_values[y_idx]) & (df.centroid_y < y_values[y_idx+1])
tile_mask = mask_x & mask_y


In [None]:
# Up to 10 not-accepted, non-bad-crop rows from the selected tile, for manual inspection.
# The accept mask is rebuilt from the current `df`: the earlier `accept_mask` was computed on a
# previously loaded frame and no longer lines up with these rows.
current_accept_mask = (df.is_skipped == False)
rejected_in_tile = df[tile_mask & (~current_accept_mask) & (df.is_bad_crop != True)]
# sample(10) raises when fewer than 10 rows match; the fixed seed keeps the sample reproducible.
tdf = rejected_in_tile.sample(n=min(10, len(rejected_in_tile)), random_state=0)

In [None]:
# Display the current contents of `tdf`.
tdf

In [None]:
# Show only the columns whose name contains 'is_'.
flag_columns = [column for column in tdf.columns if 'is_' in column]
tdf[flag_columns]