In [2]:
import json
from tqdm import tqdm
import uuid
import os
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
import math
import imgviz
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy
from collections import defaultdict
from get_data_from_XML import XML_preprocessor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content (dict, list, ...).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def bbox_absolute_to_relative(absolute_bbox, image_width_height):
    """Scale a pixel-space box into fractions of the image size.

    Args:
        absolute_bbox: Indexable ``[x1, y1, x2, y2]``; only indices 0-3 are read.
        image_width_height: ``(width, height)`` pair.

    Returns:
        A new list ``[x1 / width, y1 / height, x2 / width, y2 / height]``.
    """
    img_w, img_h = image_width_height
    # x coordinates are divided by the width, y coordinates by the height.
    divisors = (img_w, img_h, img_w, img_h)
    return [absolute_bbox[i] / divisors[i] for i in range(4)]

def bbox_relative_to_absolute(relative_bbox, image_width_height):
    """Scale a fractional box back into pixel-space coordinates.

    Args:
        relative_bbox: Indexable ``[x1, y1, x2, y2]``; only indices 0-3 are read.
        image_width_height: ``(width, height)`` pair.

    Returns:
        A new list ``[x1 * width, y1 * height, x2 * width, y2 * height]``.
    """
    img_w, img_h = image_width_height
    scaled = []
    # Even indices hold x values (scaled by width), odd indices hold y values.
    for idx, factor in enumerate((img_w, img_h, img_w, img_h)):
        scaled.append(relative_bbox[idx] * factor)
    return scaled

def visualize_bbox(image, bbox_list, bbox_name_list, bbox_is_relative=True, with_id=False):
    """Draw captioned bounding boxes on an image and display it with matplotlib.

    Args:
        image: An image object exposing ``.width`` / ``.height`` (e.g. a PIL
            image), or a path string that is opened and converted to RGB.
        bbox_list: Boxes given as ``[x1, y1, x2, y2]``.
        bbox_name_list: One name per box, in the same order as ``bbox_list``.
        bbox_is_relative: When True, boxes are treated as fractions of the
            image size and converted to pixel coordinates first.
        with_id: When True, each caption becomes ``"<name>_<k>"`` where ``k``
            is the 0-based count of earlier boxes with the same name.

    Returns:
        None. The rendered image is shown via ``plt.imshow`` / ``plt.show``.

    Raises:
        AssertionError: If ``bbox_list`` and ``bbox_name_list`` differ in length.
    """
    assert len(bbox_list) == len(bbox_name_list), "bbox_list and bbox_name_list must have the same length"
    if isinstance(image, str):
        # `Image` is not imported by the visible import cell; import it here so
        # the path branch does not fail with NameError.
        from PIL import Image
        image = Image.open(image).convert("RGB")

    if bbox_is_relative:
        # Convert relative coordinates to absolute pixels using the image size.
        image_width_height = (image.width, image.height)
        bbox_list = [bbox_relative_to_absolute(bbox, image_width_height) for bbox in bbox_list]

    name_to_label_id = {}   # name -> label id, numbered by first appearance
    seen_per_name = {}      # name -> how many boxes with that name so far
    bboxes = []
    labels = []
    captions = []
    for bbox, bbox_name in zip(bbox_list, bbox_name_list):
        x1, y1, x2, y2 = bbox
        # Reorder each box to (y1, x1, y2, x2) before handing it to imgviz.
        bboxes.append([y1, x1, y2, x2])
        labels.append(name_to_label_id.setdefault(bbox_name, len(name_to_label_id)))
        occurrence = seen_per_name.get(bbox_name, 0)
        seen_per_name[bbox_name] = occurrence + 1
        captions.append(f"{bbox_name}_{occurrence}" if with_id else bbox_name)

    # Scale the caption font with the image size: size 3 at 100x100 pixels,
    # growing with the square root of the pixel count.
    base_resolution = 100 * 100
    base_font_size = 3
    image_resolution = image.width * image.height
    # Clamp to 1 so very small images never request a zero-sized font.
    font_size = max(1, int(base_font_size * (image_resolution / base_resolution) ** 0.5))

    rendered = imgviz.instances2rgb(
        np.array(image),
        bboxes=bboxes,
        labels=labels,
        font_size=font_size,
        captions=captions,
    )

    plt.imshow(rendered)
    plt.show()

In [None]:
import glob

# Map each class index (0-19) to its class name.
cat_id2name = dict(enumerate((
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
)))

split, year = "train", "2012"
# Split reference: https://panda-clip.com/data-split-stratified/
# Planned usage noted by the author:
#   train -> 2007 train and 2012 trainval
#   val   -> 2007 val
#   test  -> 2007 test
# NOTE(review): only the single "{split}{year}" label folder is globbed here.
anno_folder = f"/data_ssd/PASCAL-VOC/ultralytics/labels/{split}{year}"
anno_label_path_list = glob.glob(os.path.join(anno_folder, "*.txt"))


In [23]:
# Report how many label .txt paths are held in anno_label_path_list.
print(len(anno_label_path_list))

5717


In [None]:
# NOTE(review): `anno_folder_label_path_list` is not assigned in any cell visible
# here (only `anno_label_path_list` is). Confirm where it is defined; on a fresh
# kernel this line would raise NameError if no earlier cell creates it.
print(len(anno_folder_label_path_list))

17125
['2007_000027.jpg', '2007_000032.jpg', '2007_000033.jpg', '2007_000039.jpg', '2007_000042.jpg', '2007_000061.jpg', '2007_000063.jpg', '2007_000068.jpg', '2007_000121.jpg', '2007_000123.jpg']
[[0.6563786  0.614      0.69958848 0.652      0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  1.         0.         0.         0.         0.         0.        ]]


In [None]:
import os, json, argparse, xml.etree.ElementTree as ET
from tqdm import tqdm

# Class names; the list order determines the category ids assigned below.
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog",
    "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor",
]
# Class name -> category id, numbered from 1 (COCO ids are 1-based).
CAT2ID = {name: idx for idx, name in enumerate(VOC_CLASSES, start=1)}

def parse_args(argv=None):
    """Parse the options of the VOC -> COCO conversion.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the process command line, which is the previous
            behaviour. Passing a list allows calling this from other code
            (for example a notebook cell) without touching ``sys.argv``.

    Returns:
        argparse.Namespace with ``voc_root`` (required), ``split``
        ("train" or "val", default "train") and ``out`` (default None).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--voc_root", required=True,
                    help="e.g. VOCdevkit/VOC2012")
    ap.add_argument("--split", choices=["train", "val"], default="train",
                    help="ImageSets/Main/*.txt を利用")
    ap.add_argument("--out", default=None,
                    help="出力 json。未指定なら instances_<split>2012.json")
    return ap.parse_args(argv)

def load_image_ids(voc_root, split):
    """Read the image ids listed in ``<voc_root>/ImageSets/Main/<split>.txt``.

    Args:
        voc_root: Dataset root directory.
        split: Name of the list file without the ``.txt`` extension.

    Returns:
        List of the file's lines with surrounding whitespace stripped;
        lines that are empty after stripping are skipped.
    """
    list_file = os.path.join(voc_root, "ImageSets/Main", f"{split}.txt")
    image_ids = []
    with open(list_file) as f:
        for raw_line in f:
            entry = raw_line.strip()
            if entry:
                image_ids.append(entry)
    return image_ids

def build_coco(voc_root, img_ids):
    """Convert per-image VOC XML annotations into a COCO-style dictionary.

    Args:
        voc_root: Dataset root; XML files are read from ``<voc_root>/Annotations``.
        img_ids: Iterable of image ids (XML / JPEG file stems).

    Returns:
        Dict with three keys:
          * "images": one entry per id (id, file_name, width, height),
          * "annotations": one entry per ``<object>`` element, with ids
            numbered from 1 and bbox stored as ``[xmin, ymin, width, height]``,
          * "categories": one entry per name in ``VOC_CLASSES``, ids from 1.

    Raises:
        KeyError: If an object's ``<name>`` is not a key of ``CAT2ID``.
    """
    ann_dir = os.path.join(voc_root, "Annotations")
    corner_tags = ("xmin", "ymin", "xmax", "ymax")

    images = []
    annotations = []

    for img_id in tqdm(img_ids, desc="parsing"):
        root = ET.parse(os.path.join(ann_dir, f"{img_id}.xml")).getroot()

        width = int(root.findtext("size/width"))
        height = int(root.findtext("size/height"))

        # The id is the file stem parsed as an integer. int() accepts single
        # underscores between digits, so stems like "2007_000027" parse too.
        numeric_id = int(img_id)
        images.append({
            "id": numeric_id,
            # Bare file name, no directory: a relative path is recommended and
            # the base directory is supplied at training time.
            "file_name": f"{img_id}.jpg",
            "width": width,
            "height": height,
        })

        for obj in root.iter("object"):
            category_id = CAT2ID[obj.findtext("name")]

            bnd = obj.find("bndbox")
            # Coordinates go through float() first so values such as "12.0"
            # are accepted, then are truncated to int.
            xmin, ymin, xmax, ymax = (
                int(float(bnd.findtext(tag))) for tag in corner_tags
            )
            box_w = xmax - xmin
            box_h = ymax - ymin

            annotations.append({
                # Annotation ids run 1, 2, 3, ... across the whole dataset.
                "id": len(annotations) + 1,
                "image_id": numeric_id,
                "category_id": category_id,
                "bbox": [xmin, ymin, box_w, box_h],
                "area": box_w * box_h,
                "iscrowd": 0,
            })

    categories = [
        {"id": cat_id, "name": cat_name, "supercategory": "none"}
        for cat_id, cat_name in enumerate(VOC_CLASSES, start=1)
    ]

    return {"images": images, "annotations": annotations, "categories": categories}

def main():
    """Parse the CLI options, build the COCO dict and write it to a JSON file.

    The output path is ``--out`` when given, otherwise
    ``instances_<split>2012.json`` in the current working directory. A one-line
    summary with the image and annotation counts is printed afterwards.
    """
    cli = parse_args()

    destination = cli.out
    if not destination:
        destination = f"instances_{cli.split}2012.json"

    image_ids = load_image_ids(cli.voc_root, cli.split)
    coco = build_coco(cli.voc_root, image_ids)

    with open(destination, "w") as fh:
        json.dump(coco, fh)

    n_images = len(coco["images"])
    n_annotations = len(coco["annotations"])
    print(f"Saved {destination}  (images={n_images}, annotations={n_annotations})")

# Entry point: run the conversion when this code executes as the main module.
# NOTE(review): inside a notebook kernel `__name__` is normally "__main__" as
# well, so running this cell would call main() and parse the kernel's own
# command line — confirm this cell is meant to be used as a standalone script.
if __name__ == "__main__":
    main()

# 作成するデータセット
* 画像ごとに複数クラス、複数物体
* 物体ごとに分けて検出させる
* \<p\>…\</p\>まで入力、後は出力