# データ準備

## ダウンロード

In [None]:
# The checkout is only used to pull out the BCCD/ folder and is deleted right
# afterwards, so fetch just the latest snapshot instead of the full history.
!git clone --depth 1 https://github.com/Shenggan/BCCD_Dataset.git

In [None]:
# Keep only the BCCD/ data folder from the cloned repository: move it into
# the working directory first ...
!mv BCCD_Dataset/BCCD ./
# ... then delete the rest of the checkout, which is no longer needed.
!rm -rf BCCD_Dataset

## データセット作成

In [None]:
import os
import xml.etree.ElementTree as ET

import numpy as np
from datasets import Dataset
from PIL import Image

# Integer class index -> class name for the three annotated cell types.
id2label = dict(enumerate(["RBC", "WBC", "Platelets"]))
# Inverse lookup: class name -> integer class index.
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Collect every annotation XML file. os.listdir returns entries in arbitrary
# order, so the paths are sorted to make the record order -- and therefore the
# running object ids assigned below -- reproducible between runs.
xml_paths = sorted(
    f"BCCD/Annotations/{name}"
    for name in os.listdir("BCCD/Annotations")
    if name.endswith(".xml")
)

datas = []  # one record per annotated image
object_start_id = 0  # next unused object id; ids are unique across all images
for xml_path in xml_paths:
    # The image id is the integer between the last "_" and the following "."
    # of the annotation path; the matching JPEG uses the same id, zero-padded
    # to five digits.
    image_id = int(xml_path.split("_")[-1].split(".")[0])
    image_path = f"BCCD/JPEGImages/BloodImage_{image_id:05d}.jpg"

    tree = ET.parse(xml_path)
    root = tree.getroot()
    size = root.find("size")

    image_info = {
        "image_id": image_id,
        # NOTE(review): PIL opens images lazily, so the underlying file may
        # stay open until the pixel data is actually read -- confirm this is
        # acceptable for the number of images processed here.
        "image": Image.open(image_path),
        "width": int(size.find("width").text),
        "height": int(size.find("height").text),
    }

    # Per-image object annotations, stored as parallel lists.
    objects = {"bbox": [], "category": [], "area": []}
    for obj in root.findall("object"):
        box = obj.find("bndbox")
        xmin = float(box.find("xmin").text)
        ymin = float(box.find("ymin").text)
        box_width = float(box.find("xmax").text) - xmin
        box_height = float(box.find("ymax").text) - ymin
        # Looked up before filtering so an unknown class name still raises
        # KeyError instead of being silently dropped with a degenerate box.
        category = label2id[obj.find("name").text]

        # Skip degenerate boxes. Checking both sides (rather than area != 0)
        # also rejects inverted boxes, whose area can be non-zero -- even
        # positive when both sides are negative.
        if box_width <= 0 or box_height <= 0:
            continue

        # Boxes are stored as [xmin, ymin, width, height].
        objects["bbox"].append([xmin, ymin, box_width, box_height])
        objects["category"].append(category)
        objects["area"].append(box_width * box_height)

    # Consecutive ids continuing from the previous image's last object.
    kept = len(objects["area"])
    objects["id"] = list(range(object_start_id, object_start_id + kept))

    datas.append({**image_info, "objects": objects})
    object_start_id += kept

In [None]:
# Turn the per-image records into a Dataset and write it to the local
# "BCCD_dataset" directory.
dataset = Dataset.from_list(datas)
dataset.save_to_disk(dataset_path="BCCD_dataset")