In [None]:
import json
import random
import os
import pandas as pd
from research.utils.data_access_utils import RDSAccessUtils

# Read the warehouse credentials from the JSON file named by the
# DATA_WAREHOUSE_SQL_CREDENTIALS environment variable. The `with` block
# closes the file handle as soon as the JSON has been parsed.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds = RDSAccessUtils(json.load(credentials_file))

In [None]:
def generate_df(pen_id, start_date, end_date):
    """Fetch crop-annotation rows for one pen within a capture-time window.

    The SQL selects every column of ``prod.crop_annotation`` plus a ``date``
    column (``captured_at`` cast to a date), keeping only rows with
    ``service_id=1``, ``annotation_state_id=1`` and a non-null
    ``left_crop_url``. The same ``pen_id`` value is substituted into both the
    ``pen_id`` and the quoted ``group_id`` filters.

    Args:
        pen_id: Pen identifier, interpolated into the query as-is.
        start_date: Lower bound for ``captured_at`` (string, interpolated as-is).
        end_date: Upper bound for ``captured_at`` (string, interpolated as-is).

    Returns:
        Whatever ``rds.extract_from_database`` returns for the query
        (presumably a pandas DataFrame; confirm against RDSAccessUtils).
    """
    query_template = """
        select cast(captured_at as date) as date, * from prod.crop_annotation
        where pen_id={} and group_id='{}' and captured_at between '{}' and '{}'
        and service_id=1
        and annotation_state_id=1
        and left_crop_url is not null;
    """
    sql = query_template.format(pen_id, pen_id, start_date, end_date)
    # Echo the final SQL so the executed query is visible in the cell output.
    print(sql)
    return rds.extract_from_database(sql)
    

In [None]:
# Pens to pull crops for; each id is queried separately via generate_df.
pen_ids = [88]
# Capture-time window substituted into the query's
# `captured_at between '<start>' and '<end>'` clause.
start_date = '2020-02-25'
end_date = '2020-03-05'
# Upper bound on the number of rows sampled for each calendar day.
max_images_per_day = 5000

In [None]:
# Accumulate the per-day samples of every pen in a single list. The list is
# created before the loop so that samples from earlier pens are kept when
# pen_ids holds more than one entry.
day_dfs = []
for pen_id in pen_ids:
    df = generate_df(pen_id, start_date, end_date)
    for date in df.date.unique():
        mask = df.date == date
        # Take at most max_images_per_day random rows for this day; when the
        # day has fewer rows, take all of them.
        day_dfs.append(df[mask].sample(min(max_images_per_day, mask.sum())))
    
    
    
    
    

In [None]:
# Stack the per-day samples into one frame. pd.concat raises ValueError when
# day_dfs is empty (i.e. no rows were sampled).
aggregate_df = pd.concat(day_dfs)

In [None]:
# Build one JSON-serialisable record per sampled row.
json_blobs = []
for _, row in aggregate_df.iterrows():
    json_blob = dict(
        penId=str(row.pen_id),
        # siteId, groupId and imageScore are fixed values for every record.
        siteId='0',
        groupId='zoom-experiment',
        imageScore=0.5,
        capturedAt=str(row.captured_at),
        # Key is the row's base_key joined with the file name of its url_key.
        key=os.path.join(row.base_key, os.path.basename(row.url_key)),
        leftCropUrl=row.left_crop_url,
        # NOTE(review): the right crop URL is always None while the right crop
        # metadata is still copied from the row; confirm this is intended.
        rightCropUrl=None,
        leftCropMetadata=row.left_crop_metadata,
        rightCropMetadata=row.right_crop_metadata,
        cameraMetadata=row.camera_metadata
    )
    json_blobs.append(json_blob)

In [None]:
# Randomise record order in place so the slices written below are not grouped
# by day. NOTE(review): no seed is set, so the order differs on every run.
random.shuffle(json_blobs)

In [None]:
# Show how many records were built (displayed as the cell's output).
len(json_blobs)

In [None]:
# Write the first five records to a small JSON Lines file (one object per line).
out_f = '/root/data/alok/biomass_estimation/playground/wound_ann_test.jsonl'
with open(out_f, 'w') as outfile:
    for blob in json_blobs[:5]:
        outfile.write(json.dumps(blob) + '\n')

In [None]:
# Write the records out as JSON Lines files of up to `chunk_size` entries each.
# The number of files is derived from len(json_blobs), so records beyond the
# first ten chunks are no longer dropped and no empty files are produced when
# there are fewer records.
chunk_size = 5000
for i, start_idx in enumerate(range(0, len(json_blobs), chunk_size)):
    end_idx = start_idx + chunk_size

    out_f = '/root/data/alok/biomass_estimation/playground/maturation_ann_{}.jsonl'.format(i)
    with open(out_f, 'w') as outfile:
        for entry in json_blobs[start_idx:end_idx]:
            json.dump(entry, outfile)
            outfile.write('\n')