In [1]:
import os
import json
import glob
from tqdm import tqdm

In [2]:
# Image / annotation folders for each dataset split. The trailing "/" is kept
# because later cells build file paths by plain string concatenation.
train_image, train_ann = "./old_train/img/", "./old_train/ann/"
val_image, val_ann = "./old_valid/img/", "./old_valid/ann/"
test_image, test_ann = "./old_test/img/", "./old_test/ann/"

In [3]:
# Snapshot the file names currently present in every split's image and
# annotation folder (os.listdir returns them in arbitrary order).
train_image_list, train_ann_list = os.listdir(train_image), os.listdir(train_ann)
val_image_list, val_ann_list = os.listdir(val_image), os.listdir(val_ann)
test_image_list, test_ann_list = os.listdir(test_image), os.listdir(test_ann)

# Per split: number of images followed by number of annotation files.
print("TRAIN: ", *map(len, (train_image_list, train_ann_list)))
print("VAL: ", *map(len, (val_image_list, val_ann_list)))
print("TEST: ", *map(len, (test_image_list, test_ann_list)))

TRAIN:  1991 1991
VAL:  254 254
TEST:  250 250


In [4]:
# Shorten validation image and annotation names to "<prefix>.<ext>", where
# <prefix> is everything before the first "_" in the exported name:
#   3449_jpg.rf.f0cbd3e5b418f05bfdab2e4bf556086e.jpg -> 3449.jpg
#   1_jpg.rf.a332191a2b0b318c103a508c515931c1.jpg.json -> 1.json
#
# The cell is safe to re-run: names without "_" and files that were already
# moved by an earlier run are skipped, and an existing target is never
# overwritten (os.rename would silently replace it on POSIX).
for folder, names, ext in ((val_image, val_image_list, ".jpg"),
                           (val_ann, val_ann_list, ".json")):
    for old_name in tqdm(names):
        if "_" not in old_name:
            continue  # no export suffix to strip -> already in short form
        src = os.path.join(folder, old_name)
        dst = os.path.join(folder, old_name.split("_")[0] + ext)
        dst_exists = os.path.exists(dst)
        if dst_exists and not os.path.exists(src):
            continue  # stale listing: this file was renamed by a previous run
        if dst_exists:
            raise FileExistsError(
                "refusing to overwrite {!r} with {!r}".format(dst, src)
            )
        os.rename(src, dst)

100%|██████████| 254/254 [00:00<00:00, 94925.89it/s]
100%|██████████| 254/254 [00:00<00:00, 138880.62it/s]


In [6]:
# Rename training images and annotations to "<numeric prefix + 10000>.<ext>":
#   1_jpg.rf.<hash>.jpg      -> 10001.jpg
#   1_jpg.rf.<hash>.jpg.json -> 10001.json
# NOTE(review): the 10000 offset presumably keeps training names distinct from
# the other splits -- confirm.
#
# The cell is safe to re-run: names without "_" and files that were already
# moved by an earlier run are skipped, and an existing target is never
# overwritten (os.rename would silently replace it on POSIX).
for folder, names, ext in ((train_image, train_image_list, ".jpg"),
                           (train_ann, train_ann_list, ".json")):
    for old_name in tqdm(names):
        if "_" not in old_name:
            continue  # no export suffix to strip -> already renamed
        new_name = str(int(old_name.split("_")[0]) + 10000) + ext
        src = os.path.join(folder, old_name)
        dst = os.path.join(folder, new_name)
        dst_exists = os.path.exists(dst)
        if dst_exists and not os.path.exists(src):
            continue  # stale listing: this file was renamed by a previous run
        if dst_exists:
            raise FileExistsError(
                "refusing to overwrite {!r} with {!r}".format(dst, src)
            )
        os.rename(src, dst)

100%|██████████| 1991/1991 [00:00<00:00, 92071.22it/s]
100%|██████████| 1991/1991 [00:00<00:00, 99776.09it/s]


In [11]:
# Rename test images and annotations to "<numeric prefix + 20000>.<ext>":
#   1_jpg.rf.<hash>.jpg      -> 20001.jpg
#   1_jpg.rf.<hash>.jpg.json -> 20001.json
# NOTE(review): the 20000 offset presumably keeps test names distinct from the
# other splits -- confirm.
#
# The cell is safe to re-run: names without "_" and files that were already
# moved by an earlier run are skipped, and an existing target is never
# overwritten (os.rename would silently replace it on POSIX).
for folder, names, ext in ((test_image, test_image_list, ".jpg"),
                           (test_ann, test_ann_list, ".json")):
    for old_name in tqdm(names):
        if "_" not in old_name:
            continue  # no export suffix to strip -> already renamed
        new_name = str(int(old_name.split("_")[0]) + 20000) + ext
        src = os.path.join(folder, old_name)
        dst = os.path.join(folder, new_name)
        dst_exists = os.path.exists(dst)
        if dst_exists and not os.path.exists(src):
            continue  # stale listing: this file was renamed by a previous run
        if dst_exists:
            raise FileExistsError(
                "refusing to overwrite {!r} with {!r}".format(dst, src)
            )
        os.rename(src, dst)

100%|██████████| 250/250 [00:00<00:00, 62541.81it/s]


### Check for errors

In [12]:
# List every folder again after renaming so the checks below see the new names.
new_train_image_list, new_train_ann_list = os.listdir(train_image), os.listdir(train_ann)
new_val_image_list, new_val_ann_list = os.listdir(val_image), os.listdir(val_ann)
new_test_image_list, new_test_ann_list = os.listdir(test_image), os.listdir(test_ann)

# The counts should be unchanged: a lower number means a rename overwrote a file.
print("TRAIN: ", *map(len, (new_train_image_list, new_train_ann_list)))
print("VAL: ", *map(len, (new_val_image_list, new_val_ann_list)))
print("TEST: ", *map(len, (new_test_image_list, new_test_ann_list)))

TRAIN:  1991 1991
VAL:  254 254
TEST:  250 250


In [13]:
# Print every training image whose "<stem>.json" annotation is missing;
# no output means every image has a matching annotation file.
for image_name in new_train_image_list:
    expected_ann = image_name.split(".")[0] + ".json"
    if expected_ann in new_train_ann_list:
        continue
    print(image_name)

In [14]:
# Print every test image whose "<stem>.json" annotation is missing;
# no output means every image has a matching annotation file.
for image_name in new_test_image_list:
    expected_ann = image_name.split(".")[0] + ".json"
    if expected_ann in new_test_ann_list:
        continue
    print(image_name)

In [15]:
# Print every validation image whose "<stem>.json" annotation is missing;
# no output means every image has a matching annotation file.
for image_name in new_val_image_list:
    expected_ann = image_name.split(".")[0] + ".json"
    if expected_ann in new_val_ann_list:
        continue
    print(image_name)

In [16]:
import os
import json
import glob
from tqdm import tqdm

def _polygon_area(points):
    """Return the area enclosed by the closed polygon `points` (shoelace formula)."""
    doubled_area = 0
    count = len(points)
    for k in range(count):
        x1, y1 = points[k][0], points[k][1]
        x2, y2 = points[(k + 1) % count][0], points[(k + 1) % count][1]
        doubled_area += x1 * y2 - x2 * y1
    # The sign only encodes the vertex winding order, so drop it.
    return abs(doubled_area) / 2


def convert_to_coco(data_dir, output_file):
    """Convert per-image annotation JSON files into a single COCO-style JSON file.

    Parameters
    ----------
    data_dir : str
        Folder containing an ``img`` sub-folder with ``*.jpg`` images and an
        ``ann`` sub-folder with one ``<image stem>.json`` file per image.
        Each annotation file must provide ``size.height``, ``size.width`` and a
        list ``objects`` whose items carry ``classTitle`` and
        ``points.exterior`` (a list of ``[x, y]`` points).
    output_file : str
        Path of the COCO JSON file to write (overwritten if it exists).

    Returns
    -------
    None
        The result is only written to ``output_file``.

    Notes
    -----
    * Images are processed in sorted file-name order and numbered from 1;
      an image without an annotation file is skipped but still consumes its id.
    * ``classTitle == 'Tooth'`` maps to category 0, every other title to 1.
    * ``bbox`` is the axis-aligned bounding box ``[x, y, width, height]`` of
      the exterior points.
    * ``area`` is the polygon area for exteriors with three or more points.
    * A two-point exterior is written as the four-corner polygon of its
      bounding box, with the box area.
      NOTE(review): this assumes two points are opposite rectangle corners --
      confirm against the annotation tool's format.
    * Objects with fewer than two points have no extent and are skipped; the
      number skipped is printed after the loop.
    """
    categories = [
        {"id": 0, "name": "tooth", "supercategory": "none"},
        {"id": 1, "name": "cavity", "supercategory": "none"},
    ]
    images = []
    annotations = []
    annotation_id = 1
    skipped_objects = 0

    img_dir = os.path.join(data_dir, 'img')
    ann_dir = os.path.join(data_dir, 'ann')

    img_files = sorted(glob.glob(os.path.join(img_dir, '*.jpg')))

    for img_id, img_file in enumerate(tqdm(img_files), 1):
        img_name = os.path.basename(img_file)
        ann_file = os.path.join(ann_dir, os.path.splitext(img_name)[0] + '.json')

        if not os.path.exists(ann_file):
            continue

        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(ann_file, 'r', encoding='utf-8') as f:
            ann_data = json.load(f)

        # Add image information
        images.append({
            "id": img_id,
            "file_name": img_name,
            "height": ann_data['size']['height'],
            "width": ann_data['size']['width']
        })

        # Add annotations
        for obj in ann_data['objects']:
            points = obj['points']['exterior']
            if len(points) < 2:
                # An empty exterior would make min()/max() raise and abort the
                # whole conversion; a single point has no extent.
                skipped_objects += 1
                continue

            x_coords = [p[0] for p in points]
            y_coords = [p[1] for p in points]
            min_x, min_y = min(x_coords), min(y_coords)
            max_x, max_y = max(x_coords), max(y_coords)
            width, height = max_x - min_x, max_y - min_y

            if len(points) == 2:
                # Two points cannot form a polygon: emit the box corners instead.
                segmentation = [min_x, min_y, max_x, min_y,
                                max_x, max_y, min_x, max_y]
                area = width * height
            else:
                # Flatten [[x, y], ...] into [x, y, x, y, ...] in linear time.
                segmentation = [coord for point in points for coord in point]
                area = _polygon_area(points)

            category_id = 0 if obj['classTitle'] == 'Tooth' else 1

            annotations.append({
                "id": annotation_id,
                "image_id": img_id,
                "category_id": category_id,
                "segmentation": [segmentation],
                "area": area,
                "bbox": [min_x, min_y, width, height],
                "iscrowd": 0
            })
            annotation_id += 1

    if skipped_objects:
        print("Skipped {} object(s) with fewer than 2 points".format(skipped_objects))

    coco_format = {
        "images": images,
        "annotations": annotations,
        "categories": categories
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(coco_format, f, indent=4)

In [18]:
# Write the COCO annotations for the training split.
convert_to_coco(data_dir='./old_train/', output_file='train.json')

100%|██████████| 1991/1991 [00:00<00:00, 3424.03it/s]


In [19]:
# Write the COCO annotations for the validation split.
convert_to_coco(data_dir="./old_valid/", output_file="val.json")

100%|██████████| 254/254 [00:00<00:00, 4006.73it/s]


In [20]:
# Write the COCO annotations for the test split.
convert_to_coco(data_dir="./old_test/", output_file="test.json")

100%|██████████| 250/250 [00:00<00:00, 4271.34it/s]
