<h1> Dataset Construction for Current Model Validation </h1>

<h3> In this notebook, we will download a lot of images that have a crop and determine what the false positive rate for filtration is for this dataset. We want this false positive rate to be as close to zero as possible. </h3>

In [None]:
import boto3
import glob
import json
import os
import pandas as pd
from skimage.transform import resize
from skimage.io import imread, imsave
import shutil
import tqdm

from multiprocessing import Pool

<h1> Load database dump into pandas </h1>

In [None]:
# Location of the CSV dump of historical fish detections.
analysis_df_path = '/root/data/alok/crop_data/data_dumps/historical_fish_detections.csv'

# Read the whole dump into memory as a DataFrame.
analysis_df = pd.read_csv(filepath_or_buffer=analysis_df_path)

<h1> Modify the dataframe as necessary </h1>

In [None]:
full_image_url_prefix = 'https://s3-eu-west-1.amazonaws.com/aquabyte-images-raw'


def full_image_key_from_crop_key(crop_key):
    """Derive the key of a full image from the key of one of its crops.

    The crop file name is taken to be the full-image file name (without
    extension) followed by four underscore-separated fields. Those four
    trailing fields are dropped and '.jpg' is appended; the directory part of
    the key is kept as is.

    Args:
        crop_key: '/'-separated key of the crop, e.g.
            'some/prefix/left_frame_100_10_20_30_40.jpg'.

    Returns:
        The '/'-separated key of the full image, e.g.
        'some/prefix/left_frame_100.jpg'.
    """
    extension = '.jpg'
    # Strip only a trailing extension: str.replace would also delete any
    # '.jpg' that happens to occur earlier in the key.
    stem = crop_key[:-len(extension)] if crop_key.endswith(extension) else crop_key
    key_components = stem.split('/')
    # NOTE(review): the meaning of the four trailing fields is not visible
    # here (presumably crop coordinates) -- confirm against the crop writer.
    full_image_f_name = '_'.join(key_components[-1].split('_')[:-4]) + extension
    # Keys are always '/'-separated, so join explicitly instead of relying on
    # the platform-dependent os.path.join.
    return '/'.join(key_components[:-1] + [full_image_f_name])
    

In [None]:
# For every crop row, record the key of the full image it was cut from.
analysis_df['full_image_key'] = analysis_df['image_key'].apply(full_image_key_from_crop_key)

# Keep rows whose detected_at compares >= '2018-12-02' and whose is_bad_crop
# flag is not True (missing flags therefore pass the filter).
mask = analysis_df['detected_at'].ge('2018-12-02') & ~analysis_df['is_bad_crop'].eq(True)

<h1> Get unique full image paths that contain fish and download locally from S3 </h1>

In [None]:
# The AWS_CREDENTIALS environment variable holds the path of a JSON file with
# the access key pair. Use a context manager so the file handle is closed
# right away instead of being left to the garbage collector.
with open(os.environ['AWS_CREDENTIALS']) as credentials_file:
    aws_credentials = json.load(credentials_file)

# S3 client shared by the download helper below.
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_credentials['aws_access_key_id'],
    aws_secret_access_key=aws_credentials['aws_secret_access_key'],
)


In [None]:
full_images_dir = '/root/data/alok/filtration_classifier_data/fish_present/images'
full_image_bucket = 'aquabyte-images-raw'


def download_locally(key, dest_dir=None):
    """Download one object from ``full_image_bucket`` unless it is already on disk.

    Args:
        key: S3 object key. Only its base name is used for the local file
            name, so two keys with the same base name map to the same file.
        dest_dir: Directory to write into. When omitted, the module-level
            ``full_images_dir`` is looked up at call time, which keeps
            existing single-argument callers working.

    Returns:
        None. The file is written to ``dest_dir`` as a side effect.
    """
    if dest_dir is None:
        dest_dir = full_images_dir
    # Create the target directory up front so the download does not fail on a
    # fresh machine; exist_ok makes this safe when several workers race.
    os.makedirs(dest_dir, exist_ok=True)

    full_image_f = os.path.join(dest_dir, os.path.basename(key))
    if os.path.exists(full_image_f):
        # Already downloaded by an earlier run; skip the network round trip.
        return

    print('Writing image to disk: {}'.format(full_image_f))
    s3_client.download_file(full_image_bucket, key, full_image_f)

In [None]:
N = 10

# Rows that pass the date / bad-crop filter built earlier.
candidates = analysis_df[mask]
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement).
tdf = candidates.sample(min(N, len(candidates)))
# Several crops can come from the same full image; download each image once.
keys = list(set(tdf.full_image_key.tolist()))

# The context manager shuts the 20 worker processes down when the downloads
# finish, so re-running the cell does not pile up idle workers in the kernel.
with Pool(20) as p:
    p.map(download_locally, keys)


In [None]:
# Every entry in the download directory that does not end in '.jpg' is
# reported as corrupt (presumably leftovers of interrupted downloads -- verify).
corrupt_files = [
    f_name for f_name in os.listdir(full_images_dir)
    if not f_name.endswith('.jpg')
]

# Print a cleanup command for manual review; nothing is deleted here.
# Full paths are emitted so the command works from any working directory, and
# nothing is printed when there is nothing to remove.
if corrupt_files:
    corrupt_paths = [os.path.join(full_images_dir, f_name) for f_name in corrupt_files]
    print('rm -rf {}'.format(' '.join(corrupt_paths)))

<h1> Get full image paths that have no fish crops and download locally from S3 </h1>

In [None]:
# CSV dump listing full frames; its s3_key column holds the key of each frame.
frames_df_path = '/root/data/alok/filtration_classifier_data/data_dumps/full_frames.csv'
frames_df = pd.read_csv(frames_df_path)
full_image_keys = frames_df['s3_key'].tolist()

# Frames that never appear as the parent image of any crop in analysis_df.
# The list order comes from set iteration and is therefore arbitrary.
zero_crop_image_keys = list(
    set(full_image_keys) - set(analysis_df['full_image_key'].tolist())
)



In [None]:
# NOTE: download_locally reads the module-level full_images_dir, so rebinding
# it here redirects every later call -- including re-runs of earlier cells --
# to the zero-crop directory.
full_images_dir = '/root/data/alok/filtration_classifier_data/zero_crops/images'
N = 5000

# Create the pool only after the rebinding above: with the fork start method
# (assumed here -- confirm for your platform) workers copy the namespace as it
# is at pool creation. The context manager shuts the workers down afterwards
# instead of leaving them alive in the kernel.
with Pool(20) as p:
    p.map(download_locally, zero_crop_image_keys[:N])
