In [None]:
import os

# Display the names of the environment variables visible to this kernel
# (keys only; the values are not shown).
os.environ.keys()

In [None]:
import pandas as pd

# Location of the annotation CSV that the rest of the notebook works from.
path = '/root/data/sid/skip_classifier_datasets/raw/production_skips_accepts/annotations.csv'

# Read the whole file into memory and preview the first five rows.
annotations = pd.read_csv(filepath_or_buffer=path)
annotations.head(n=5)

In [None]:
# Keep only rows that actually have a left crop URL.
annotations = annotations.dropna(subset=['left_crop_url'])

In [None]:
# Row count per annotation state, most frequent first.
annotations.annotation_state_id.value_counts()

### There are a couple of ways we can slice and dice this skip dataset to improve its quality, depending on what we count as skips and what we count as accepts.

* Skips
  1. Use all cogito skips
  2. Use all QA skips
  3. Use confident cogito skips
  4. Use confident QA skips.
  5. Break out skips into different skip reasons.
    
* Accepts
  1. Use all cogito accepts
  2. Use all QA accepts
  3. Use confident cogito accepts.
  4. Use confident QA accepts.
    
##### To start, let's do 2 options:

### all cogito skips and all cogito accepts.

In [None]:
import pandas as pd

SAMPLE_SIZE = 100000
SAMPLE_RATIO = 0.7  # share of the sample drawn from skips; the rest are accepts

# Derive both class sizes from integers so they always add up to SAMPLE_SIZE.
# int((1 - SAMPLE_RATIO) * SAMPLE_SIZE) truncates, and floating-point error can
# leave it one row short (e.g. a 0.9 ratio yields 9,999 accepts, not 10,000).
n_skips = round(SAMPLE_RATIO * SAMPLE_SIZE)
n_accepts = SAMPLE_SIZE - n_skips

# NOTE(review): sampling below is unseeded, so every run draws different rows.

# Skips: annotation state 4, one row per left_crop_url (first occurrence kept).
cogito_skips = annotations[annotations['annotation_state_id'] == 4]
cogito_skips = cogito_skips[~cogito_skips.left_crop_url.duplicated()]
cogito_skips = cogito_skips.sample(n_skips)

# Accepts: annotation state 3, de-duplicated the same way.
cogito_accepts = annotations[annotations['annotation_state_id'] == 3]
cogito_accepts = cogito_accepts[~cogito_accepts.left_crop_url.duplicated()]
cogito_accepts = cogito_accepts.sample(n_accepts)

# Stack the two samples and summarise the combined frame.
all_cogito_data = pd.concat([cogito_skips, cogito_accepts])
all_cogito_data.info()

### confident cogito skips and confident QA accepts, just to be sure the labels have clear differences.

In [None]:
# State-4 rows, first occurrence of each left_crop_url only, randomly
# down-sampled to the skip share of the target sample size.
cogito_skips = (
    annotations.loc[annotations['annotation_state_id'] == 4]
    .drop_duplicates(subset='left_crop_url', keep='first')
    .sample(n=int(SAMPLE_RATIO * SAMPLE_SIZE))
)

In [None]:
# State-7 rows, first occurrence of each left_crop_url only, randomly
# down-sampled to the accept share of the target sample size.
qa_accepts = (
    annotations.loc[annotations['annotation_state_id'].eq(7)]
    .drop_duplicates(subset='left_crop_url', keep='first')
    .sample(n=int((1-SAMPLE_RATIO) * SAMPLE_SIZE))
)

In [None]:
# Stack the sampled skips on top of the sampled accepts (original row indices
# are kept) and print a summary of the combined frame.
skip_dataset = pd.concat((cogito_skips, qa_accepts), axis=0)
skip_dataset.info()

In [None]:
def get_label(state_id):
    """Map an annotation state id to its binary class label.

    Args:
        state_id: a value from the ``annotation_state_id`` column.

    Returns:
        ``'SKIP'`` for state 4 and ``'ACCEPT'`` for state 7.

    Raises:
        ValueError: for any other state id. An explicit exception replaces
            ``assert False`` so the check still runs under ``python -O`` and
            the error message names the offending value.
    """
    if state_id == 4:
        return 'SKIP'
    if state_id == 7:
        return 'ACCEPT'
    raise ValueError(f'unexpected annotation_state_id: {state_id!r}')

# Translate each row's state id into its class label, then show the class balance.
state_ids = skip_dataset['annotation_state_id']
skip_dataset['label'] = state_ids.map(get_label)
skip_dataset['label'].value_counts()

In [None]:
# Persist the labelled sample (index column included) for later use.
skip_dataset.to_csv(
    path_or_buf='/root/data/sid/skip_classifier_datasets/sampled_datasets/qa_accept_cogito_skips_03-04-2020_100k.csv'
)

### Break down binary datasets by skip reason

In [None]:
import json

# Decode the JSON-encoded skip reasons. Only strings are decoded: NaN (no skip
# reasons) and values that have already been decoded pass through untouched, so
# re-running this cell no longer makes json.loads fail on a non-string value.
annotations['skip_reasons'] = annotations['skip_reasons'].apply(
    lambda l: json.loads(l) if isinstance(l, str) else l
)
annotations['skip_reasons']

In [None]:
# Flatten the per-row reason lists into the distinct reason names.
# explode() emits NaN for rows with no reasons (NaN or an empty list); drop it
# so that only real labels are iterated over when building per-reason columns.
all_skip_reasons = annotations['skip_reasons'].explode().dropna().unique()
all_skip_reasons

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Work on a copy so the per-reason indicator columns added below are written to
# an independent frame rather than to a slice of `annotations`.
skips = annotations[annotations['skip_reasons'].notnull()].copy()

reason_ratios = {col: [] for col in ['label', 'ratio']}
for label in all_skip_reasons:
    if pd.isnull(label):
        # explode() produces NaN for rows without reasons; it is not a real label.
        continue
    # One boolean column per reason; later cells select rows via skips[<reason>].
    skips[f'{label}'] = skips['skip_reasons'].apply(lambda l, label=label: (label in l))
    # The mean of a boolean column is the share of True values. Unlike
    # value_counts(normalize=True)[True], it does not raise when no row matches.
    ratio = skips[f'{label}'].mean()
    reason_ratios['label'].append(label)
    reason_ratios['ratio'].append(ratio)

# A single bar chart is drawn, so one axes is enough (the previous
# one-subplot-per-reason grid was never drawn on).
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(reason_ratios).set_index('label')['ratio'].plot.bar(ax=ax)
ax.set(
    title='Share of rows carrying each skip reason',
    xlabel='skip reason',
    ylabel='fraction of rows with skip reasons',
);

In [None]:
useful_labels = [
    'BLURRY',
    'BAD_CROP',
    'BAD_ORIENTATION',
    'OBSTRUCTION',
    'TOO_DARK'
]

SAMPLE_SIZE = 10000
SAMPLE_RATIO = 0.7

# Integer class sizes that always add up to SAMPLE_SIZE; truncating
# (1 - SAMPLE_RATIO) * SAMPLE_SIZE can lose a row to floating-point error.
n_skips = round(SAMPLE_RATIO * SAMPLE_SIZE)
n_accepts = SAMPLE_SIZE - n_skips

# The accept pool (state 7, one row per left_crop_url) is identical for every
# label, so it is filtered and de-duplicated once instead of on each iteration.
qa_accept_pool = annotations[annotations['annotation_state_id'] == 7]
qa_accept_pool = qa_accept_pool[~qa_accept_pool.left_crop_url.duplicated()]

for lab in useful_labels:
    # State-4 rows flagged with this reason, one row per left_crop_url.
    label_skips = skips[skips[lab] & (skips['annotation_state_id'] == 4)]
    label_skips = label_skips[~label_skips.left_crop_url.duplicated()]
    label_skips = label_skips.sample(n_skips)

    # A fresh random draw of accepts for every label, as before.
    qa_accepts = qa_accept_pool.sample(n_accepts)

    skip_dataset = pd.concat([label_skips, qa_accepts])
    # Quick checks: how many rows mention the reason, and the state-id balance.
    print(skip_dataset.skip_reasons.apply(lambda s: (lab in str(s))).value_counts())
    print(skip_dataset['annotation_state_id'].value_counts())
    skip_dataset['label'] = skip_dataset['annotation_state_id'].apply(get_label)
    out_path = f'/root/data/sid/skip_classifier_datasets/sampled_datasets/qa_accept_{lab}_skips_03-04-2020.csv'
    skip_dataset.to_csv(out_path)

In [None]:
# NOTE(review): this cell repeats the preceding cell and rewrites the same
# output files with a new random draw -- confirm whether it is still needed.
useful_labels = [
    'BLURRY',
    'BAD_CROP',
    'BAD_ORIENTATION',
    'OBSTRUCTION',
    'TOO_DARK'
]

SAMPLE_SIZE = 10000
SAMPLE_RATIO = 0.7

# Integer class sizes that always add up to SAMPLE_SIZE; truncating
# (1 - SAMPLE_RATIO) * SAMPLE_SIZE can lose a row to floating-point error.
n_skips = round(SAMPLE_RATIO * SAMPLE_SIZE)
n_accepts = SAMPLE_SIZE - n_skips

# The accept pool (state 7, one row per left_crop_url) is identical for every
# label, so it is filtered and de-duplicated once instead of on each iteration.
qa_accept_pool = annotations[annotations['annotation_state_id'] == 7]
qa_accept_pool = qa_accept_pool[~qa_accept_pool.left_crop_url.duplicated()]

for lab in useful_labels:
    # State-4 rows flagged with this reason, one row per left_crop_url.
    label_skips = skips[skips[lab] & (skips['annotation_state_id'] == 4)]
    label_skips = label_skips[~label_skips.left_crop_url.duplicated()]
    label_skips = label_skips.sample(n_skips)

    # A fresh random draw of accepts for every label, as before.
    qa_accepts = qa_accept_pool.sample(n_accepts)

    skip_dataset = pd.concat([label_skips, qa_accepts])
    # Quick checks: how many rows mention the reason, and the state-id balance.
    print(skip_dataset.skip_reasons.apply(lambda s: (lab in str(s))).value_counts())
    print(skip_dataset['annotation_state_id'].value_counts())
    skip_dataset['label'] = skip_dataset['annotation_state_id'].apply(get_label)
    out_path = f'/root/data/sid/skip_classifier_datasets/sampled_datasets/qa_accept_{lab}_skips_03-04-2020.csv'
    skip_dataset.to_csv(out_path)

In [None]:
# Sanity check: read each per-reason CSV back from disk and print its row count.
for lab in useful_labels:
    out_path = f'/root/data/sid/skip_classifier_datasets/sampled_datasets/qa_accept_{lab}_skips_03-04-2020.csv'
    saved_rows = pd.read_csv(out_path).shape[0]
    print(saved_rows)