### Download the data

In [None]:
!ls

In [None]:
!aws s3 cp s3://aquabyte-models/skip-classifier/model.pt current_production_model.pt

In [None]:
from sqlalchemy import create_engine
import pandas as pd


class RDSAccessUtils(object):
    """Small helper that runs SQL over a SQLAlchemy connection and returns DataFrames."""

    def __init__(self, sql_credentials):
        """Create the engine and open a connection.

        Args:
            sql_credentials: mapping with the keys "user", "password", "host",
                "port" and "database".
        """
        connection_url = "postgresql://{}:{}@{}:{}/{}".format(
            sql_credentials["user"],
            sql_credentials["password"],
            sql_credentials["host"],
            sql_credentials["port"],
            sql_credentials["database"],
        )
        self.sql_engine = create_engine(connection_url)
        self.db_connection = self.sql_engine.connect()

    def extract_from_database(self, sql_query):
        """Execute ``sql_query`` and return all rows as a DataFrame.

        The column names are handed to the DataFrame constructor, so a query
        that matches no rows yields an empty frame with the right columns
        instead of failing when the column labels are assigned afterwards.
        """
        results = self.db_connection.execute(sql_query)
        return pd.DataFrame(results.fetchall(), columns=list(results.keys()))
         


In [None]:
import json

# Read the warehouse credentials inside a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open('/root/sid/credentials/data_warehouse_sql_credentials.json') as credentials_file:
    client = RDSAccessUtils(json.load(credentials_file))

In [None]:
# Pull every crop annotation for service 1 captured after 2020-01-16 whose
# annotation_state_id is one of 3, 4, 6 or 7.
query = """SELECT * FROM prod.crop_annotation WHERE (service_id=1) 
 AND (annotation_state_id IN (3, 4, 6, 7)) AND captured_at>'2020-01-16'"""

# Materialise the whole result set as a DataFrame.
production_data = client.extract_from_database(query)

In [None]:
production_data

In [None]:
production_data['site_id'] = production_data['base_key'].str.split('/').apply(lambda ps: ps[1])

In [None]:
# import os

# production_eval_img_dir = '/root/data/sid/needed_data/skip_classifier_datasets/production_evaluation/images/'
# already_downloaded = set(os.listdir(production_eval_img_dir))
# len(already_downloaded)

In [None]:
# list(already_downloaded)[0]

### Filter the metadata to only include analyzed images. 

In [None]:
# Human-readable names for the annotation_state_id values used in this notebook.
id2state = {
    3:'QA',
    4:'SKIPPED_ANN',
    6:'SKIPPED_QA',
    7:'VERIFIED'
}

# Series.map looks each id up in the dict and leaves unknown ids as missing,
# which the notnull() filter below then drops.
production_data['state'] = production_data['annotation_state_id'].map(id2state)
production_data = production_data[production_data['state'].notnull()]
production_data['state'].value_counts()

In [None]:
qa_accepts = production_data[production_data['state'] == 'VERIFIED']

In [None]:
pen_counts = qa_accepts.site_id.value_counts()
pen_counts

In [None]:
import pandas as pd

all_pens = list(production_data.site_id.unique())
naccepts_per_pen = 200

# Draw up to naccepts_per_pen VERIFIED crops from every site. The per-site
# samples are collected first and concatenated once, which keeps the original
# column dtypes and avoids re-copying the growing frame on every iteration.
per_site_samples = []
for s in all_pens:
    this_pen_accepts = qa_accepts[qa_accepts['site_id'] == s]
    this_pen_sample = this_pen_accepts.sample(min(naccepts_per_pen, len(this_pen_accepts)))
    per_site_samples.append(this_pen_sample)

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
sampled_accepts = pd.concat(per_site_samples) if per_site_samples else qa_accepts.iloc[0:0]
sampled_accepts.site_id.value_counts()

In [None]:
sampled_accepts.site_id.value_counts().sum()

In [None]:
pen_counts = sampled_accepts['site_id'].value_counts()
pen_counts

In [None]:
pen_counts[137]

In [None]:
cogito_skips = production_data[production_data['state'] == 'SKIPPED_ANN']

In [None]:
import pandas as pd

all_pens = list(production_data.site_id.unique())
# NOTE(review): nskips_per_pen is computed but never used below; the number of
# skips drawn per site is the count stored in pen_counts for that site.
nskips_per_pen = int(round((len(qa_accepts)*2)/len(all_pens), 0))

# For every site draw as many SKIPPED_ANN crops as pen_counts holds for it
# (0 when the site is absent from pen_counts), capped by what is available.
# Samples are concatenated once at the end instead of growing a frame in the
# loop, which keeps dtypes intact and avoids quadratic copying.
per_site_samples = []
for p in all_pens:
    this_pen_skips = cogito_skips[cogito_skips['site_id'] == p]
    this_pen_count = 0 if p not in pen_counts else pen_counts[p]
    this_pen_sample = this_pen_skips.sample(min(this_pen_count, len(this_pen_skips)))
    per_site_samples.append(this_pen_sample)

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
sampled_skips = pd.concat(per_site_samples) if per_site_samples else cogito_skips.iloc[0:0]
# NOTE(review): sampling is keyed on site_id but this sanity check counts pen_id — confirm which is intended.
sampled_skips.pen_id.value_counts()

In [None]:
import pandas as pd

#eval_data = pd.concat([sampled_accepts, sampled_skips])
eval_data = pd.concat([sampled_accepts, sampled_skips])
eval_data

### Download the images

In [None]:
def get_url(row):
    """Return the crop URL of an annotation row, preferring the left crop.

    Args:
        row: mapping-like row with 'left_crop_url' and 'right_crop_url' entries.

    Returns:
        The left URL when it is a string, otherwise the right URL when that
        one is a string.

    Raises:
        AssertionError: when neither entry is a string. Raised explicitly so
            the check still runs under ``python -O`` and carries a message.
    """
    left_url = row['left_crop_url']
    if isinstance(left_url, str):
        return left_url
    right_url = row['right_crop_url']
    if isinstance(right_url, str):
        return right_url
    raise AssertionError('row has neither a left_crop_url nor a right_crop_url string')
    

eval_data['url'] = eval_data.apply(get_url, axis=1)
eval_data['url']

In [None]:
!mkdir /root/data/sid/needed_data/skip_classifier_datasets/production_evaluation/may15-may20_images

In [None]:
import os

# Directory in which the evaluation crops are stored locally.
production_eval_img_dir = '/root/data/sid/needed_data/skip_classifier_datasets/production_evaluation/images/'


def get_local_path(url):
    """Map a crop URL to a flat file path inside production_eval_img_dir.

    Everything after the third '/' of the URL is kept and its remaining
    slashes are replaced by '_PATHSEP_' so the result is a single file name.
    A URL with fewer than three slashes maps to the directory itself.
    """
    remainder = url.split('/', 3)[3:]
    flat_name = remainder[0].replace('/', '_PATHSEP_') if remainder else ''
    return os.path.join(production_eval_img_dir, flat_name)
eval_data['local_path'] = eval_data.url.apply(get_local_path)
eval_data.local_path.iloc[:10].tolist()

In [None]:
len(already_downloaded)

In [None]:
import os

already_downloaded = os.listdir('/root/data/sid/needed_data/skip_classifier_datasets/production_evaluation/images/')

In [None]:
already_downloaded = ['/root/data/sid/needed_data/skip_classifier_datasets/production_evaluation/images/' + url
                      for url in already_downloaded]

In [None]:
len(already_downloaded)

In [None]:
import requests
import os
import shutil

def download_image(row, exclude_images=None):
    """Download ``row['url']`` to ``row['local_path']`` unless it is excluded.

    Args:
        row: mapping-like row with 'url' and 'local_path' entries.
        exclude_images: optional container of local paths that must not be
            downloaded again. ``None`` (the default) excludes nothing.

    Returns:
        The local path, whether or not a download took place.

    Raises:
        requests.HTTPError: when the server answers with an error status.
            Raised before the file is opened so that an error page is never
            written to disk in place of an image.
    """
    url, local_path = row['url'], row['local_path']
    if exclude_images is not None and local_path in exclude_images:
        return local_path

    # The context manager releases the connection; the timeout keeps one
    # unresponsive host from stalling the whole download loop.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(local_path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    return local_path

download_image(eval_data.iloc[1])

In [None]:
production_data_images = eval_data[~eval_data.url.duplicated()]
production_data_images.info()

In [None]:
already_downloaded = []

In [None]:
need_to_download = production_data_images[~production_data_images['local_path'].isin(already_downloaded)]
len(need_to_download)

In [None]:
from tqdm.notebook import tqdm
from functools import partial

tqdm.pandas()

need_to_download.progress_apply(partial(download_image), axis=1)

In [None]:
production_data_images

In [None]:
downloaded_production_data = production_data_images#[production_data_images['local_path'].isin(already_downloaded)]
downloaded_production_data.info()

### Run classifier on these images

In [None]:
!ls /root/data/sid/needed_datasets/skip_classifier_checkpoints/

In [None]:
!ls /root/data/sid/needed_datasets/skip_classifier_checkpoints/

In [None]:
!ls /root/data/sid/needed_datasets/skip_classifier_checkpoints/testing123__2021-01-19__08-57-40/

In [None]:
NEW_MODEL_NAME = 'testing123__2021-01-19__08-57-40'
NEW_MODEL_PATH = os.path.join('/root/data/sid/needed_datasets/skip_classifier_checkpoints/', NEW_MODEL_NAME)
#SPLITS_NAME = '07-14-2020_stratify_hour_partialfish.json'
#SPLIT_PATH = os.path.join('/root/data/sid/needed_data/skip_classifier_datasets/splits', SPLITS_NAME)
BEST_EPOCH = 'epoch_1'
metric_path = os.path.join(NEW_MODEL_PATH, BEST_EPOCH, 'train', 'metrics.json')

In [None]:
NEW_MODEL_PATH

In [None]:
json.load(open(metric_path))

In [None]:
!ls /root/data/sid/needed_datasets/skip_classifier_checkpoints/

In [None]:
import sys
sys.path.append('/root/sid/repos/cv_research/sid/lice_counting/skip_classifier')

In [None]:
!ls /root/data/sid/needed_datasets/skip_classifier_checkpoints/qa_accept_cogito_skips_05-15-2020_recentsample_stratified__2020-05-17__13-15-57/epoch_14/val/model.pt

In [None]:
#!pip install 'torch==1.0'

In [None]:
# !pip install 'torch==1.3.1'
# !pip install 'torchvision==0.4.2'
# !pip install 'albumentations==0.4.5'
# !pip install 'opencv-python==4.2.0.32'
# !pip install --upgrade 'numpy==1.15.0'

In [None]:
#!pip install --upgrade scikit-image

In [None]:
!ls /root/data/sid/needed_datasets/skip_classifier_checkpoints/
#  /root/data/sid/needed_datasets/skip_classifier_checkpoints/testing123__2020-07-15__08-51-12/

In [None]:
import sys
sys.path.append('/root/sid/repos/cv_research/sid/lice_counting/skip_classifier/')

from model import MultilabelClassifier
help(MultilabelClassifier)

In [None]:
from train import ACCEPT_LABEL, SKIP_LABEL
import torch

# CUDA device index on which inference runs.
device = 0
metric_path = os.path.join(NEW_MODEL_PATH, BEST_EPOCH, 'train', 'metrics.json')
# Print the metrics stored with the checkpoint; the file is closed right after parsing.
with open(metric_path) as metrics_file:
    print(json.load(metrics_file))
path = os.path.join(NEW_MODEL_PATH, BEST_EPOCH, 'train', 'model.pt')
new_model = MultilabelClassifier(savename=None, num_labels=5)
# Load the weights onto the CPU first so the checkpoint does not depend on the
# device it was saved from, then move the model once to `device`. A further
# .cuda() call is not needed and could move the model to another device.
new_model.load_state_dict(torch.load(path, map_location='cpu'))
new_model.to(device)
new_model.eval()

In [None]:
# classes = [ACCEPT_LABEL, SKIP_LABEL]
# eval_set['paths'] = production_data['local_path']
# eval_set['labels'] = production_data['skip_reasons'].notnull().apply(int)
# samples = [(path, label) for path, label in zip(
#             eval_set['paths'], eval_set['labels'])]
# len(samples)

In [None]:
downloaded_production_data.iloc[:5]

In [None]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from loader import TRANSFORMS
import cv2
from torch.nn.functional import sigmoid

IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')

def image_to_array(file_path):
    """Read an image file, convert it to RGB and apply the 'pad' transform.

    Args:
        file_path: path of the image on disk.

    Returns:
        Whatever ``TRANSFORMS['pad']`` stores under the 'image' key.
        NOTE(review): the caller invokes ``.to(device)`` on the result, so this
        is presumably a torch tensor — confirm against loader.TRANSFORMS.

    Raises:
        OSError: when OpenCV cannot read the file. ``cv2.imread`` signals this
            by returning None rather than raising.
    """
    # Read an image with OpenCV
    image = cv2.imread(file_path)
    if image is None:
        raise OSError(f'could not read image (missing or undecodable): {file_path}')

    # By default OpenCV uses BGR color space for color images,
    # so we need to convert the image to RGB color space.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = TRANSFORMS['pad'](image=image)['image']
    return image

def multilabel_preds(cuda_inputs, model):
    """Return the per-label sigmoid scores of the first sample in the batch.

    Args:
        cuda_inputs: input batch, already on the model's device.
        model: callable that maps the batch to raw outputs.

    Returns:
        NumPy array holding the sigmoid of the model output for batch item 0.
    """
    outputs = model(cuda_inputs)
    # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
    preds = torch.sigmoid(outputs)
    return preds.detach().cpu().numpy()[0]

def regular_preds(cuda_inputs, model):
    """Return the first output value of the first sample in the batch.

    The model output is used as-is (no activation is applied here).
    """
    raw_outputs = model(cuda_inputs)
    first_sample = raw_outputs.detach().cpu().numpy()[0]
    return first_sample[0]

def get_predictions(image, model, pred_fn):
    """Run ``pred_fn`` on a single image with gradient tracking disabled.

    The image is moved to the module-level ``device`` and given a leading
    batch dimension before it is passed to ``pred_fn(batch, model)``.
    """
    batch = image.to(device).unsqueeze(0)
    with torch.no_grad():
        return pred_fn(batch, model)
    
def path2newmodelpredictions(file_path):
    """Load the image at ``file_path`` and score it with ``new_model``."""
    image = image_to_array(file_path)
    return get_predictions(image, new_model, multilabel_preds)

downloaded_production_data['local_path'].iloc[:5].apply(path2newmodelpredictions)

In [None]:
OLD_MODEL_PATH

In [None]:
OLD_MODEL_NAME = 'bodypart_model_multi__2020-09-28__00-36-53'
OLD_MODEL_PATH = os.path.join('/root/data/sid/needed_datasets/skip_classifier_checkpoints/', OLD_MODEL_NAME)
#SPLITS_NAME = '07-14-2020_stratify_hour_partialfish.json'
#SPLIT_PATH = os.path.join('/root/data/sid/needed_data/skip_classifier_datasets/splits', SPLITS_NAME)
BEST_EPOCH = 'epoch_0'
metric_path = os.path.join(OLD_MODEL_PATH, BEST_EPOCH, 'train', 'metrics.json')
open(metric_path).read()

In [None]:
from train import ACCEPT_LABEL, SKIP_LABEL
import torch
from model import ImageClassifier

# CUDA device index on which inference runs.
device = 0
metric_path = os.path.join(OLD_MODEL_PATH, BEST_EPOCH, 'train', 'metrics.json')
# Print the metrics stored with the checkpoint; the file is closed right after parsing.
with open(metric_path) as metrics_file:
    print(json.load(metrics_file))
path = os.path.join(OLD_MODEL_PATH, BEST_EPOCH, 'train', 'model.pt')
old_model = MultilabelClassifier(savename=None, num_labels=5)
# Load the weights onto the CPU first so the checkpoint does not depend on the
# device it was saved from, then move the model once to `device`. A further
# .cuda() call is not needed and could move the model to another device.
old_model.load_state_dict(torch.load(path, map_location='cpu'))
old_model.to(device)
old_model.eval()

In [None]:
def path2oldmodelpredictions(file_path):
    """Load the image at ``file_path`` and score it with ``old_model``."""
    image = image_to_array(file_path)
    return get_predictions(image, old_model, multilabel_preds)

downloaded_production_data['local_path'].iloc[:5].apply(path2oldmodelpredictions)

In [None]:
downloaded_production_data = downloaded_production_data.sort_values('captured_at', ascending=True).head(20000)

In [None]:
downloaded_production_data['production_predicted_accept_prob'] = downloaded_production_data.progress_apply(lambda row:
    row['left_crop_metadata']['quality_score'] if row['left_crop_metadata'] 
    else row['right_crop_metadata']['quality_score'], axis=1)

In [None]:
downloaded_production_data['old_model_predicted_accept_prob']

In [None]:
from tqdm.notebook import tqdm

tqdm.pandas()

downloaded_production_data['new_model_predicted_accept_prob'] = downloaded_production_data['local_path'].progress_apply(
    path2newmodelpredictions)


In [None]:
downloaded_production_data['old_model_predicted_accept_prob'] = downloaded_production_data['local_path'].progress_apply(path2oldmodelpredictions)

In [None]:
# Collapse each row's vector of model outputs into one scalar by summing its
# entries; these scalars are later used to rank crops for each model.
downloaded_production_data['new_model_cleaned'] = downloaded_production_data['new_model_predicted_accept_prob'].apply(sum)
downloaded_production_data['old_model_cleaned'] = downloaded_production_data['old_model_predicted_accept_prob'].apply(sum)

In [None]:
downloaded_production_data.annotation.apply(type).value_counts()

In [None]:
downloaded_production_data2 = downloaded_production_data[downloaded_production_data['annotation'].apply(type)!=list]


In [None]:
import json
len(downloaded_production_data2)

def get_sections(row):
    """Return the body sections considered visible for an annotation row.

    Rows whose annotation_state_id is not 7 yield an empty list. For state 7
    the full set of sections is returned unless the annotation has a truthy
    'isPartial' entry, in which case its 'visibleBodySections' value (or None
    when that key is absent) is returned.
    """
    ann = row['annotation']
    if row['annotation_state_id'] != 7:
        return []

    if ann is not None and ann.get('isPartial', None):
        return ann.get('visibleBodySections', None)

    # No annotation, or not flagged as partial: every section counts as visible.
    return [
        'VENTRAL_POSTERIOR',
        'VENTRAL_ANTERIOR',
        'DORSAL_POSTERIOR',
        'DORSAL_ANTERIOR',
        'HEAD',
    ]
            

downloaded_production_data2['visibleBodySections'] = downloaded_production_data2.apply(get_sections, axis=1)
print(downloaded_production_data2['visibleBodySections'].isnull().sum())
downloaded_production_data2 = downloaded_production_data2[downloaded_production_data2['visibleBodySections'].notnull()]

In [None]:
downloaded_production_data2

for col in BODYPART_COLS:
    downloaded_production_data2[col] = downloaded_production_data2['visibleBodySections'].apply(
        lambda l: col[4:] in l)

In [None]:
new['site_id'].value_counts()

In [None]:
import matplotlib.pyplot as plt

new = downloaded_production_data2.sort_values('new_model_cleaned', ascending=False)
new[BODYPART_COLS] = new[BODYPART_COLS].cumsum()
new['kpi'] = new[BODYPART_COLS].apply(min, axis=1)

old = downloaded_production_data2.sort_values('old_model_cleaned', ascending=False)
old[BODYPART_COLS] = old[BODYPART_COLS].cumsum()
old['kpi'] = old[BODYPART_COLS].apply(min, axis=1)

fig, ax = plt.subplots()
ax.plot(range(len(new)), new['kpi'])
ax.plot(range(len(old)), old['kpi'])

In [None]:
import matplotlib.pyplot as plt

new = downloaded_production_data2.sort_values('new_model_cleaned', ascending=False)
new[BODYPART_COLS] = new[BODYPART_COLS].cumsum()
new['kpi'] = new[BODYPART_COLS].apply(min, axis=1)

old = downloaded_production_data2.sort_values('old_model_cleaned', ascending=False)
old[BODYPART_COLS] = old[BODYPART_COLS].cumsum()
old['kpi'] = old[BODYPART_COLS].apply(min, axis=1)

fig, ax = plt.subplots()
ax.plot(range(len(new)), new['kpi'])
ax.plot(range(len(old)), old['kpi'])
ax.set_ylim((0, 500))
ax.set_xlim((0, 500))

In [None]:
# def path2oldmodelpredictions(file_path):
#     return get_predictions(image_to_array(file_path), old_model)

# downloaded_production_data['old_model_predicted_accept_prob'] = downloaded_production_data['local_path'].progress_apply(path2oldmodelpredictions)

### Get performance metrics

In [None]:
downloaded_production_data['state'].value_counts()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

def plot_roc(labels, outputs, title, pen_id=None, skip_reason=None, ax=None):
    """Draw a ROC curve, with its AUC in the legend, for binary labels and scores.

    Args:
        labels: pandas Series of ground-truth labels; must contain more than one distinct value.
        outputs: scores aligned with ``labels``.
        title: title of the axes.
        pen_id: accepted for call-site compatibility; not used.
        skip_reason: accepted for call-site compatibility; not used.
        ax: matplotlib axes to draw on; a new figure is created when None.

    Raises:
        AssertionError: when ``labels`` holds a single distinct value, because
            a ROC curve is undefined then. Raised explicitly so the check is
            not stripped under ``python -O`` and carries a message.
    """
    if len(labels.unique()) <= 1:
        raise AssertionError('plot_roc needs more than one distinct label value')

    fpr, tpr, _thresholds = roc_curve(labels, outputs)
    auc = roc_auc_score(labels, outputs)
    lw = 2
    if ax is None:
        _fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange',
            lw=lw, label=f'ROC curve (area = {auc:0.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate (skiprate)')
    ax.set_ylabel('Recall (KPI)')
    ax.set_title(title, size=20)
    ax.legend(loc="lower right")

In [None]:
from loader import BODYPART_COLS

BODYPART_COLS

In [None]:
downloaded_production_data.columns

In [None]:
downloaded_production_data['annotation'].apply(type).value_counts()

In [None]:
import json

def get_sections(row):
    """Return the body sections considered visible for an annotation row.

    Rows whose annotation_state_id is not 7 yield an empty list. For state 7:

    * no annotation, or an annotation without a truthy 'isPartial' entry
      (including one where the key is absent), yields every section;
    * a list-typed annotation is printed for inspection and yields None;
    * a partial annotation yields its 'visibleBodySections' value.
    """
    all_sections = ['VENTRAL_POSTERIOR',
                    'VENTRAL_ANTERIOR',
                    'DORSAL_POSTERIOR',
                    'DORSAL_ANTERIOR',
                    'HEAD']
    ann = row['annotation']

    if row['annotation_state_id'] != 7:
        return []
    if ann is None:
        return all_sections
    if isinstance(ann, list):
        # Unexpected annotation shape: show the row and report "unknown".
        print(row)
        return None
    # .get treats a missing 'isPartial' key as "not partial" instead of raising KeyError.
    if not ann.get('isPartial'):
        return all_sections
    return ann['visibleBodySections']
            

downloaded_production_data['visibleBodySections'] = downloaded_production_data.apply(get_sections, axis=1)

In [None]:
for bp_col in BODYPART_COLS:
    downloaded_production_data[bp_col] = downloaded_production_data.visibleBodySections.progress_apply(
        lambda sections: bp_col[4:] in sections)

In [None]:
# Weight every body-part column by the inverse of its mean, so columns that
# are True less often get a larger weight.
bodypart_weights = 1/downloaded_production_data[BODYPART_COLS].mean()
# Rescale so the weights sum to 1.
bodypart_weights /= bodypart_weights.sum()
# Drop the first four characters of each column name (presumably the 'HAS_' prefix — confirm against loader.BODYPART_COLS).
bodypart_weights.index = bodypart_weights.index.map(lambda s: s[4:])
normalize_params = {'overall': bodypart_weights.to_dict()}
normalize_params

In [None]:
# Write through a context manager so the file is flushed and closed before
# anything else reads it.
with open('oct5_bodypart_normalize_params.json', 'w') as params_file:
    json.dump(normalize_params, params_file)

In [None]:
!pwd

In [None]:
BODYPART_COLS

In [None]:
# Split the new model's per-row output vector into one PRED_<part> column per
# body part; position idx of the vector is paired with BODYPART_COLS[idx].
for idx, bp in enumerate(BODYPART_COLS):
    pred_col = bp.replace('HAS_', 'PRED_')
    # The lambda reads idx when apply() runs, which is inside this same iteration.
    downloaded_production_data[pred_col] = downloaded_production_data['new_model_predicted_accept_prob'].apply(
        lambda arr: arr[idx])

### Cogito accepts

In [None]:
def get_label(state):
    """Translate an annotation state name into a binary label.

    'VERIFIED' maps to 1, 'SKIPPED_ANN' maps to 0 and every other state maps to None.
    """
    for known_state, label in (('VERIFIED', 1), ('SKIPPED_ANN', 0)):
        if state == known_state:
            return label
    return None
    
downloaded_production_data['label'] = downloaded_production_data['state'].apply(get_label)

In [None]:
downloaded_production_data['label'].value_counts()

In [None]:
downloaded_production_data['label'].unique()

In [None]:
import matplotlib.pyplot as plt

# Only rows that received a binary label (1 or 0) can be scored. The frame is
# filtered on the 'label' column that get_label produced.
cogito_data = downloaded_production_data[downloaded_production_data['label'].notnull()]

all_pens = ['overall'] + list(cogito_data.pen_id.unique())
# Two scalar-score panels plus one panel per body part. squeeze=False keeps
# `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows=len(all_pens), ncols=2 + len(BODYPART_COLS),
                         figsize=(30, 5*len(all_pens)), squeeze=False)

# Titles of the first two panels; defined up front so every row can use them.
title1 = 'production'
title2 = 'fullbody'

for pen, ax in zip(all_pens, axes):
    if pen != 'overall':
        this_pen = cogito_data[cogito_data['pen_id'] == pen]
    else:
        this_pen = cogito_data
    ax[0].set_xlabel(f'pen:{pen}')
    plot_roc(this_pen['label'], this_pen['production_predicted_accept_prob'], ax=ax[0], title=title1)
    plot_roc(this_pen['label'], this_pen['old_model_predicted_accept_prob'], ax=ax[1], title=title2)
    for i, col in enumerate(BODYPART_COLS):
        # Only the top ('overall') row carries the body-part titles.
        title = col if pen == 'overall' else ''
        try:
            plot_roc(this_pen[col], this_pen[col.replace('HAS_', 'PRED_')], ax=ax[i+2], pen_id=pen, title=title)
        except Exception as exc:
            # Best effort: leave the panel empty but report why it was skipped.
            print(f'skipping ROC for pen {pen}, column {col}: {exc!r}')

In [None]:
cogito_data['production_predicted_accept_prob']

In [None]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

# res maps each pen (plus 'overall') to a dict of {prediction column: ROC AUC}.
res = dict()
# This assignment is immediately overwritten by the loop variable below.
pen = 'overall'
for pen in all_pens:
    res[pen] = dict()
    if pen != 'overall':
        this_pen = cogito_data[cogito_data['pen_id'] == pen]
    else:
        this_pen = cogito_data
    # The two scalar scores are evaluated against 'label'; every PRED_ column
    # is evaluated against the body-part column it was derived from.
    pred_cols = ['production_predicted_accept_prob', 'old_model_predicted_accept_prob'] + [col.replace('HAS_', 'PRED_') for col in BODYPART_COLS]
    lab_cols = ['label', 'label'] + BODYPART_COLS
    for lab_col, pred_col in zip(lab_cols, pred_cols):
        # Skip label columns with exactly one distinct value for this pen.
        if len(this_pen[lab_col].unique()) != 1:
            res[pen][pred_col] = roc_auc_score(this_pen[lab_col], this_pen[pred_col])
# Display the results as a table built from the nested dict.
pd.DataFrame(res).apply(pd.Series, axis=1).T

In [None]:
def cogito_accept_rate(states):
    """Share of state-3 entries among entries in state 3 or 4.

    Returns None unless more than 20 entries are in state 3 or 4.
    """
    n_relevant = states.isin([3, 4]).sum()
    if n_relevant > 20:
        n_state_3 = (states == 3).sum()
        return n_state_3 / n_relevant
    return None
    
def cogito_samples(states):
    """Count the entries whose state is 3 or 4."""
    in_scope = states.isin([3, 4])
    return in_scope.sum()

def qa_samples(states):
    """Count the entries whose state is 4, 6 or 7."""
    in_scope = states.isin([4, 6, 7])
    return in_scope.sum()

def qa_accept_rate(states):
    """Share of state-7 entries among entries in state 4, 6 or 7.

    Returns None unless more than 20 entries are in state 4, 6 or 7.
    """
    n_relevant = states.isin([4, 6, 7]).sum()
    if n_relevant > 20:
        n_state_7 = (states == 7).sum()
        return n_state_7 / n_relevant
    return None

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of quantile steps used to bucket the model score.
nbins = 20
# Bucket edges: 0 followed by the 0/nbins .. nbins/nbins quantiles of the score.
# NOTE(review): pd.cut requires unique edges by default, so repeated quantile values (or a minimum of 0) would fail — confirm.
# NOTE(review): this column is filled with per-label output vectors earlier in the notebook; quantile() presumably needs scalars — confirm.
bins = [0] + [cogito_data['new_model_predicted_accept_prob'].quantile(b/nbins) for b in range(nbins+1)]
# labels=False stores the integer bucket position instead of an interval label.
cogito_data['score_bucket'] = pd.cut(cogito_data['new_model_predicted_accept_prob'], bins=bins, labels=False)
expected_bins = set(range(nbins+1))

def acceptrate_to_score_hist(data, bins, binmethod='pctile'):
    """Aggregate accept rates and sample counts per score bucket.

    Args:
        data: frame with 'score_bucket' and 'annotation_state_id' columns.
        bins: bucket edges; bucket i is given the cutoff ``bins[i]``.
        binmethod: accepted but not used.

    Returns:
        Frame indexed by bucket with the columns cogito_accept_rate,
        qa_accept_rate, cogito_samples, qa_samples and cutoff. Buckets that do
        not occur in ``data`` are present with missing values.
    """
    aggregators = [cogito_accept_rate, qa_accept_rate, cogito_samples, qa_samples]
    per_bucket = data.groupby('score_bucket')['annotation_state_id'].aggregate(aggregators)

    # Add an all-missing row for every bucket without data so the table lines
    # up with the bucket edges.
    observed_buckets = set(per_bucket.index)
    for bucket in set(range(len(bins) - 1)) - observed_buckets:
        per_bucket.loc[bucket] = None

    per_bucket.sort_index(inplace=True)
    per_bucket['cutoff'] = bins[:-1]
    return per_bucket
    
fig, ax = plt.subplots()
acceptrate_to_score_hist(cogito_data, bins).plot.bar(y=['cogito_accept_rate'], ax=ax)

In [None]:
import matplotlib.pyplot as plt

# pen_scores maps each pen (plus 'overall') to its per-bucket accept-rate table.
pen_scores = dict()
pen_counts = dict()
all_pens = ['overall'] + list(cogito_data['pen_id'].unique())
# squeeze=False keeps `axes` indexable as a column of axes even with one row.
fig, axes = plt.subplots(nrows=len(all_pens), figsize=(5, len(all_pens)*5), squeeze=False)

for pen_id, ax in zip(all_pens, axes[:, 0]):
    if pen_id != 'overall':
        print(pen_id)
        this_pen = cogito_data[cogito_data['pen_id'] == pen_id]
        print(this_pen)
        rate2scores = acceptrate_to_score_hist(this_pen, bins)
        # Keep only buckets with more than 20 samples in state 3 or 4.
        rate2scores = rate2scores[rate2scores['cogito_samples'] > 20]
    else:
        rate2scores = acceptrate_to_score_hist(cogito_data, bins)
    pen_scores[pen_id] = rate2scores
    # Check the table of this pen (not the pen_scores dict, which is never
    # empty here) before plotting.
    if len(rate2scores):
        rate2scores.plot.bar(y=['cogito_accept_rate'], ax=ax)
    ax.set_xticks([])
    ax.set_title(f'Pen:{pen_id}')
    ax.set_ylim((0, 1.0))

In [None]:
pen_counts = cogito_data.groupby(['pen_id', 'score_bucket'])['score_bucket'].aggregate('count')
for pen_id in cogito_data['pen_id'].unique():
    print(pen_id)
    print(pen_counts.loc[(pen_id)])

In [None]:
pen_scores

In [None]:
import numpy as np

# transform_data maps each pen to a list of (score cutoff, accept rate) pairs,
# ordered by bucket and without buckets whose accept rate is missing.
transform_data = dict()

for pen_id in pen_scores:
    cutoffs = pen_scores[pen_id]['cutoff']
    rates = pen_scores[pen_id]['cogito_accept_rate']
    # A separate name is used here so the global `bins` (the score bucket
    # edges used by the histogram cells) is not overwritten.
    bucket_ids = sorted(int(bucket) for bucket in cutoffs.keys())
    print(rates)
    # pd.isna also recognises None, which np.isnan would reject.
    transform_data[pen_id] = [(cutoffs[b], rates[b]) for b in bucket_ids
                              if not pd.isna(rates[b])]

transform_data

In [None]:
def postprocess_normscore(data):
    """Turn per-pen (cutoff, rate) lists into non-decreasing curves.

    Args:
        data: dict mapping a pen to a list of (cutoff, rate) pairs sorted by cutoff.

    Returns:
        Dict keyed by ``str(pen)``. Pens with fewer than five points are left
        out. For the others each rate is raised to the largest rate seen so
        far (a None rate takes that value), a point at cutoff 0 is added in
        front and one at cutoff 1.0 at the end when missing, and curves whose
        largest rate is 0 are dropped.
    """
    curves = {}
    for pen in data:
        points = data[pen]
        if len(points) < 5:
            continue
        key = str(pen)
        curve = curves[key] = []

        cutoffs = [point[0] for point in points]
        assert cutoffs == sorted(cutoffs)
        rates = [point[1] for point in points]

        for cutoff, rate in zip(cutoffs, rates):
            # Largest rate already on the curve; 0.0 before the first point.
            floor = max(r for _, r in curve) if curve else 0.0
            if rate is None or rate < floor:
                rate = floor
            curve.append((cutoff, rate))

        # Extend the curve so it spans the cutoffs 0 .. 1.0.
        if curve[0][0] != 0.0:
            curve.insert(0, (0, curve[0][1]))
        if curve[-1][0] != 1.0:
            curve.append((1.0, curve[-1][1]))

        if max(r for _, r in curve) == 0:
            del curves[key]
    return curves
new_data = postprocess_normscore(transform_data) 

In [None]:
new_data['119']

In [None]:
# Write through a context manager so the file is flushed and closed before it
# is read or copied anywhere else.
with open('pen_normalization_aug3_model.json', 'w') as norm_file:
    json.dump(new_data, norm_file)

In [None]:
!aws s3 ls s3://aquabyte-research/sid/production_models/skip_classifier/08-03-2020/model.py

In [None]:
!aws s3 cp /root/data/sid/needed_datasets/skip_classifier_checkpoints/08-03-2020_stratify_hour_partialfish_justlice__2020-08-03__01-47-17/epoch_2/val/model.pt s3://aquabyte-research/sid/production_models/skip_classifier/08-03-2020/model.pt

In [None]:
!aws s3 cp pen_normalization_aug3_model.json s3://aquabyte-research/sid/production_models/skip_classifier/08-03-2020/norm_params.json