In [1]:
from pathlib import Path
from datetime import datetime
import shutil

In [2]:
image_dir = Path("images")
output_dir = Path("coco")
if not output_dir.exists():
    output_dir.mkdir(parents=True)

In [None]:
xml_files = list(image_dir.glob("**/*.xml"))
for p in xml_files:
    image_path= None
    if p.with_suffix(".jpg").exists():
        image_path = p.with_suffix(".jpg")
    elif p.with_suffix(".png").exists():
        image_path = p.with_suffix(".png")

    #shutil.copy(image_path, output_dir)
    #shutil.copy(p, output_dir)
    pass

In [3]:
coco_dir = output_dir # next working directory

In [4]:
def get_concept_lookup(dir: Path, skipped_folders=[".metadata"]):
    d = {}
    for p in dir.glob("*"):
        if not p.is_dir() or p.name in skipped_folders:
            continue
        print(f"Processing {p}")
        superconcept = p.name.lower()
        for subdir in p.glob("**/"):
            concept = subdir.name
            if concept in d.keys():
                print(f"Conflict: {concept} found in both {d[concept]} and {superconcept}.")
                print(f"=> {d[concept]} overwritten.")
            d[concept] = superconcept
        d[superconcept] = superconcept
    return d

cpt_lkup = get_concept_lookup(image_dir)

Processing images/Sea_Pigs
Processing images/Sea_cucumbers
Processing images/Soft_corals
Processing images/Stony_corals
Processing images/Black_corals
Processing images/Starfish
Processing images/Sea_pens
Processing images/Sea_anemones
Processing images/Demosponges
Processing images/Glass_sponges
Processing images/Sea_urchins


In [6]:
len(cpt_lkup)

446

In [5]:
info = {
    "year": 2022,
    "version": 1,
    "description": "A dataset of images of sessile benthos, derived from Fathomnet",
    "contributor": "Fathomnet, compiled by Linus Leong",
    "url": "",
    "data_created": datetime.now().isoformat()
}

licenses = [dict(id=0, name="FathomNet", url="https://fathomnet.org/")]

In [6]:
categories = []

for index, (cpt, supercpt) in enumerate(cpt_lkup.items()):
    new_cat = dict(
        id=index,
        name=cpt,
        supercategory=supercpt,
        isthing=1,
        color=[0,0,0]
    )
    categories.append(new_cat)

categories.append(dict(
    id=-1,
    name="other",
    supercategory="other",
    isthing=1,
    color=[0,0,0]
))

In [7]:
import matplotlib.pyplot as plt

colors = plt.colormaps.get("tab20").colors
colors = [[int(cc * 255) for cc in c] for c in colors]
supercategories = []

for index, supercpt in enumerate(set(cpt_lkup.values())):
    supercategories.append(
        dict(
            id=index,
            name=supercpt,
            colors=list(colors[index])
        )
    )

In [10]:
supercategories

[{'id': 0, 'name': 'sea_pigs', 'colors': [31, 119, 180]},
 {'id': 1, 'name': 'stony_corals', 'colors': [174, 199, 232]},
 {'id': 2, 'name': 'sea_pens', 'colors': [255, 127, 14]},
 {'id': 3, 'name': 'sea_anemones', 'colors': [255, 187, 120]},
 {'id': 4, 'name': 'starfish', 'colors': [44, 160, 44]},
 {'id': 5, 'name': 'sea_cucumbers', 'colors': [152, 223, 138]},
 {'id': 6, 'name': 'black_corals', 'colors': [214, 39, 40]},
 {'id': 7, 'name': 'sea_urchins', 'colors': [255, 152, 150]},
 {'id': 8, 'name': 'demosponges', 'colors': [148, 103, 189]},
 {'id': 9, 'name': 'glass_sponges', 'colors': [197, 176, 213]},
 {'id': 10, 'name': 'soft_corals', 'colors': [140, 86, 75]}]

In [14]:
from tqdm.autonotebook import tqdm
import xmltodict

xml_files = list(coco_dir.glob("*.xml"))
xml_dicts = []
for x in tqdm(xml_files):
    try:
        xml_dicts.append(xmltodict.parse(x.read_text()))
    except Exception as e:
        print(e)
        print(x)

HBox(children=(IntProgress(value=0, max=25171), HTML(value='')))




In [99]:
unclassed_labels = set()
for d in xml_dicts:
    obj = d['annotation']["object"]
    if not isinstance(obj, list):
        obj = [obj]
    for a in obj:
        if a["name"] not in cpt_lkup.keys():
            unclassed_labels.add(a["name"])

In [114]:
import requests

def read_children(json_data: dict, depth=5, start=1):
    children = []
    layer = start
    while "children" in json_data.keys() \
    and "name" in json_data.keys() \
    and layer < depth:
        if layer >= start:
            children.append(json_data["name"])
        json_data = json_data["children"][0]
        layer += 1
    return children

In [None]:
unclassed_details = {}
for i, other_cpt in enumerate(unclassed_labels):
    print(i)
    res = requests.get(f"https://fathomnet.org/dsg/phylogeny/up/{test_label}")
    if res.status_code != 200:
        print(other_cpt)
        continue
    
    unclassed_details[other_cpt] = read_children(res.json())

In [105]:
foo = [{} for _ in range(4)]
for cpt, fours in unclassed_details.items():
    for a, b in zip(foo, fours):
        if not b in a.keys():
            a[b] = []
        a[b].append(cpt) 

In [8]:
concept_id_lkup = {x['name']: x['id'] for x in categories}
concept_id_lkup

{'Sea_Pigs': 0,
 'Elpidiidae': 1,
 'Scotoplanes': 2,
 'Scotoplanes_globosa': 3,
 'Scotoplanes_clarki': 4,
 'Scotoplanes_sp._A': 5,
 'Peniagone': 6,
 'Peniagone_gracilis': 7,
 'Peniagone_sp._A': 8,
 'Peniagone_papillata': 9,
 'Peniagone_sp._1': 10,
 'Peniagone_vitrea': 11,
 'Peniagone_sp._2': 12,
 'Peniagone_vitrea-_sp._1_complex': 13,
 'Elpidia': 14,
 'Elpidia_sp._A': 15,
 'sea_pigs': 16,
 'Sea_cucumbers': 17,
 'Holothuroidea': 18,
 'Elasipodida': 19,
 'Psychropotidae': 20,
 'Benthodytes': 21,
 'Benthodytes_sp._1': 22,
 'Psychropotes': 23,
 'Psychropotes_sp._1': 24,
 'Psychropotes_depressa': 25,
 'Laetmogonidae': 26,
 'Pannychia': 27,
 'Pannychia_moseleyi': 28,
 'Pannychia_sp._1': 29,
 'Laetmogone': 30,
 'Laetmogone_wyvillethomsoni': 31,
 'Psychronaetes': 32,
 'Pelagothuriidae': 33,
 'Enypniastes': 34,
 'Dendrochirotida': 35,
 'Cucumariidae': 36,
 'Abyssocucumis': 37,
 'Abyssocucumis_abyssorum': 38,
 'Psolidae': 39,
 'Psolus': 40,
 'Psolus_squamatus': 41,
 'Holothuroidea_sp._4': 42,
 '

In [9]:
def parse(d, image_list, annotation_list):
    details = d['annotation']
    #settle image
    fname = details['filename']
    iw = int(details['size']['width'])
    ih = int(details['size']['height'])
    id = len(image_list)
    new_image = dict(
        id=id,
        width=iw,
        height=ih,
        file_name=fname
    )
    image_list.append(new_image)

    # settle annotations
    annotations = details["object"]
    if not isinstance(annotations, list):
        annotations = [annotations]

    for a in annotations:
        cid = concept_id_lkup.get(a["name"]) or -1
        xmin = int(a['bndbox']['xmin'])
        xmax = int(a['bndbox']['xmax'])
        ymin = int(a['bndbox']['ymin'])
        ymax = int(a['bndbox']['ymax'])

        if ymax <= ymin or xmax <= xmin:
            print(f"bad bound box: {fname}")
            continue
        ah = ymax - ymin
        aw = xmax - xmin
        bbox = [xmin, ymin, aw, ah]
        segmentation = [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]]
        annotation_list.append(
            dict(
                id=len(annotation_list),
                image_id=id,
                category_id=cid,
                area=ah*aw,
                segmentation=segmentation,
                bbox=bbox,
                iscrowd=0
            )
        )

In [15]:
images = []
annotations = []
for d in xml_dicts:
    parse(d, images, annotations)

bad bound box: 7abba8d5-c7a7-4c62-bdf8-fb89f65ec7cf.png
bad bound box: d48a1532-ee0c-400e-83cf-ca9ad4653797.png
bad bound box: 8b53ad8b-4aa6-4a4e-be55-aa5cb839e9e7.png
bad bound box: 2536ba44-59a0-46e5-85ff-8ff963f1b3ae.png
bad bound box: 7c4ddd91-6c5e-48c7-9465-803c4440f705.png
bad bound box: b5925d24-d898-4305-b4bc-8d191f2350c6.png
bad bound box: 41e20e36-715b-467c-b6b7-e7770fa934be.png
bad bound box: 8fbad6f7-db16-4348-b253-4c78eca4641a.png
bad bound box: 845296fb-514d-4c61-91cf-172b6b511b3e.png
bad bound box: 845296fb-514d-4c61-91cf-172b6b511b3e.png
bad bound box: ac4a1d23-99d8-4d8a-885c-b6b9573ffa84.png
bad bound box: 677dd1d0-192e-43b4-b992-024854928624.png
bad bound box: 3e38e8c0-5f76-4071-8b5d-8edc5754ded3.png
bad bound box: 23b8a279-f53b-409c-bb0c-61ae8146ddc7.png
bad bound box: 23b8a279-f53b-409c-bb0c-61ae8146ddc7.png
bad bound box: 23b8a279-f53b-409c-bb0c-61ae8146ddc7.png
bad bound box: 82893cf6-3360-45b4-bda3-1053cef01db4.png
bad bound box: 82893cf6-3360-45b4-bda3-1053cef01

In [16]:
coco_all = dict(
    info=info,
    licenses=licenses,
    categories=categories,
    supercategories=supercategories,
    images=images,
    annotations=annotations,
)

In [18]:
import json

json.dump(coco_all, open("coco.json", "w"), indent=4)