In [None]:
import os
import re
import sys
import json

from tqdm import tqdm
from dotenv import load_dotenv

sys.path.append(os.path.join("..", ".."))
from utils.s3_bucket import S3Bucket

In [None]:
# Populate os.environ from the dotenv file located two directories above the
# working directory.
# NOTE(review): the file is named "env", not ".env" — confirm this is intended.
load_dotenv(dotenv_path=os.path.join("..", "..", "env"))

In [None]:
# --- Configuration ----------------------------------------------------------
# Bucket and object keys of the labeling-job output manifests to read.
input_bucket_name = "ava-cv-labels"
# Every job stores its manifest under the same relative path, so only the
# per-job prefix is spelled out here.
input_keys = [
    f"{job_prefix}/manifests/output/output.manifest"
    for job_prefix in (
        "basil-leaf-bounding-box",
        "cilantro-leaf-bounding-box",
        "Pl@ntNet/plants/plantnet-strawberry-leaf-bounding-box",
        "Pl@ntNet/plants/plantnet-tomato-leaf-bounding-box",
        "Pl@ntNet/plants/plantnet-red-pepper-leaf-bounding-box",
    )
]

# Bucket and key prefix the merged annotation file is written to.
output_bucket_name = "ava-cv-raw-photo-bucket"
output_prefix = "temp/"

In [None]:
# Handle on the bucket the manifests are read from. The region is taken from
# the REGION_NAME environment variable; a missing variable raises KeyError.
input_bucket = S3Bucket(bucket_name=input_bucket_name, region_name=os.environ["REGION_NAME"])

def load_records(bucket, key):
    """Read a JSON-lines manifest from ``bucket`` and parse it.

    Args:
        bucket: Mapping-like object; ``bucket[key]`` must return the manifest
            content as a ``str`` with one JSON document per line.
        key: Key of the manifest inside ``bucket``.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    manifest = bucket[key]
    # Skip lines that are empty *or* whitespace-only (e.g. a lone "\r" left
    # over from CRLF line endings); json.loads would reject those.
    return [json.loads(line) for line in manifest.split("\n") if line.strip()]

# Concatenate the parsed records of every configured manifest, preserving the
# order of input_keys and the line order within each manifest.
records = []
for key in input_keys:
    records += load_records(input_bucket, key)

In [None]:
# Handle on the bucket the merged annotation file is written to. Uses the same
# REGION_NAME environment variable; a missing variable raises KeyError.
output_bucket = S3Bucket(bucket_name=output_bucket_name, region_name=os.environ["REGION_NAME"])

In [None]:
# Flatten the manifest records into a single dict with two lists:
#   "images":      {"file_name", "height", "width", "id"}, one per labeled record
#   "annotations": {"image_id", "bbox", "category_id"}, one per bounding box
# "image_id" refers to the "id" of the owning image entry (the record's index
# in `records`); "category_id" holds the class *name* taken from the class map.
new_records = {
    "images": [],
    "annotations": []
}

metadata_suffix = "-metadata"

for idx, record in enumerate(tqdm(records)):
    # A job's labels live under "<job_name>" and its bookkeeping under
    # "<job_name>-metadata". Only the trailing suffix is stripped: str.replace
    # would also delete "-metadata" occurring inside the job name itself and
    # break the metadata lookup below.
    job_names = [
        key[:-len(metadata_suffix)]
        for key in record
        if key.endswith(metadata_suffix)
    ]
    if not job_names:
        continue

    # Reduce the source path to its first "<directory>/<file>.jpg" (or .jpeg)
    # fragment; that fragment becomes the image's file_name.
    source_ref = record["source-ref"]
    match = re.search(r"[^/]+/[^/]+\.jpe?g", source_ref)
    assert match, f"source-ref has no '<dir>/<file>.jp(e)g' fragment: {source_ref!r}"
    file_name = match.group(0)

    image_added = False
    for job_name in job_names:
        metadata = record[f"{job_name}-metadata"]
        if "failure-reason" in metadata:
            # Jobs whose metadata carries a failure reason are skipped.
            continue

        class_map = metadata["class-map"]
        labels = record[job_name]

        # Emit the image entry once per record. Without this guard a record
        # labeled by several jobs is listed repeatedly under the same id.
        if not image_added:
            image_size = labels["image_size"][0]
            new_records["images"].append({
                "file_name": file_name,
                "height": image_size["height"],
                "width": image_size["width"],
                "id": idx
            })
            image_added = True

        for annotation in labels["annotations"]:
            # bbox is [left, top, right, bottom]
            bbox = [
                float(annotation["left"]),
                float(annotation["top"]),
                float(annotation["left"] + annotation["width"]),
                float(annotation["top"] + annotation["height"])
            ]
            # The class map is looked up by the stringified class_id.
            class_name = class_map[str(annotation["class_id"])]
            new_records["annotations"].append({
                "image_id": idx,
                "bbox": bbox,
                "category_id": class_name
            })

# NOTE(review): os.path.join uses the host OS path rules, whereas the result is
# used as a bucket key — confirm this stays correct if output_prefix ever loses
# its trailing "/".
output_key = os.path.join(output_prefix, "annotations.json")
output_bucket[output_key] = json.dumps(new_records, indent=4)