In [None]:
import os
import re
import sys
import json
from dotenv import load_dotenv

sys.path.append(os.path.join("..", ".."))
from utils.s3_bucket import S3Bucket

In [None]:
# Load environment variables from the "env" file two directories above the
# notebook; a later cell reads REGION_NAME from os.environ.
env_file = os.path.join("..", "..", "env")
load_dotenv(env_file)

In [None]:
# Notebook parameters -- fill these in before running the cells below.

# Key under which each manifest record stores this job's labels; the
# conversion cell also reads f"{job_name}-metadata" from every record.
job_name = ""
# Bucket the manifest is read from and the converted files are written to.
bucket_name = ""
# Object key of the manifest to convert. The upload cell derives its output
# keys by replacing "output.manifest" inside this key.
key = ""
# Applied with re.search (no flags, so case-sensitive) to each record's
# "source-ref". The matched text -- one path segment, "/", then a name ending
# in .jpg or .jpeg -- becomes the image's "file_name".
image_path_regex = r"[^/]+/[^/]+\.jpe?g"

In [None]:
# Open the bucket in the region configured through the environment.
bucket = S3Bucket(bucket_name=bucket_name, region_name=os.environ["REGION_NAME"])

# The manifest holds one JSON document per line; empty lines (such as the one
# produced by a trailing newline) are skipped before parsing.
manifest_text = bucket[key]
records = [json.loads(line) for line in manifest_text.split("\n") if line]

In [None]:
# Regroup the manifest records by class:
#   new_records[class_name] == {"images": [...], "annotations": [...]}
# Image ids are the record's position in `records`, so annotations and images
# of the same record share an id across every class it appears in.
new_records = {}
metadata_key = f"{job_name}-metadata"

for idx, record in enumerate(records):
    # Records without an entry for this job carry no labels to convert.
    if job_name not in record:
        continue

    source_ref = record["source-ref"]
    match = re.search(image_path_regex, source_ref)
    if match is None:
        # Explicit error instead of `assert`: it still fires under `python -O`
        # and names the offending record.
        raise ValueError(
            f"record {idx}: source-ref {source_ref!r} does not match "
            f"image_path_regex {image_path_regex!r}"
        )
    file_name = match.group(0)

    class_map = record[metadata_key]["class-map"]

    # List the image under every class named in its class map. dict.fromkeys
    # drops repeated names while keeping their order, so two class ids that
    # share a name cannot register the same image twice.
    for class_name in dict.fromkeys(class_map.values()):
        class_records = new_records.setdefault(
            class_name, {"images": [], "annotations": []}
        )
        class_records["images"].append({
            "file_name": file_name,
            "height": record[job_name]["image_size"][0]["height"],
            "width": record[job_name]["image_size"][0]["width"],
            "id": idx
        })

    for annotation in record[job_name]["annotations"]:
        # bbox is [left, top, right, bottom]
        bbox = [
            float(annotation["left"]),
            float(annotation["top"]),
            float(annotation["left"] + annotation["width"]),
            float(annotation["top"] + annotation["height"])
        ]
        # class-map keys are strings, while class_id may not be; hence str().
        class_name = class_map[str(annotation["class_id"])]
        new_records[class_name]["annotations"].append({
            "image_id": idx,
            "bbox": bbox,
            "category_id": class_name
        })

In [None]:
# Upload one annotations.json per class, under the same prefix as the manifest.
manifest_name = "output.manifest"

# Without this check str.replace would be a no-op and every class would be
# written to `key` itself, overwriting the source manifest.
if manifest_name not in key:
    raise ValueError(
        f"key {key!r} does not contain {manifest_name!r}; "
        "refusing to overwrite the source manifest"
    )

# The loop variable is deliberately not called `records`: reusing that name
# would clobber the parsed manifest and break a re-run of the conversion cell.
for record_type, class_records in new_records.items():
    new_key = key.replace(manifest_name, f"{record_type}/annotations.json")
    # S3 URIs always use "/", so build the URI directly rather than with
    # os.path.join, which uses the local platform's separator.
    print(f"s3://{bucket_name}/{new_key}")
    bucket[new_key] = json.dumps(class_records)