In [3]:
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil
import random

class ImageInteractor:
    """Find annotated JPG images under a dataset folder and copy a filtered subset.

    Each image ``<name>.jpg`` is paired with a sibling annotation ``<name>.xml``.
    The annotation's root element holds ``object`` elements, each with a
    ``property/category`` text and a ``bndbox`` holding ``xmin``, ``ymin``,
    ``xmax`` and ``ymax`` integers.

    Attributes:
        dataset_path: Root directory of the dataset, as a ``Path``.
        target: Name of the sub-folder of ``dataset_path`` that is scanned.
        category: Category text to select, or ``None`` to select every JPG.
        imgs: Sorted list of image paths found when the instance was created.
    """

    def __init__(self, dataset_path: str, target: str, category: str = None):
        """Store the configuration and scan the target folder once for images."""
        self.dataset_path = Path(dataset_path)
        self.target = target
        self.category = category
        self.imgs = self.get_images()

    def _matches_category(self, obj) -> bool:
        """Return True when ``obj`` has a ``property/category`` equal to ``self.category``."""
        node = obj.find('property/category')
        # Objects without a category element are treated as non-matching
        # instead of raising AttributeError on ``None.text``.
        return node is not None and node.text == self.category

    def get_images(self):
        """Return the sorted list of candidate images under the target folder.

        Without a category every ``.jpg`` below the target folder is returned;
        with a category the result comes from ``get_images_by_category``.
        """
        target_path = self.dataset_path / self.target
        if not self.category:
            imgs = [image for image in target_path.glob("**/*") if image.suffix == ".jpg"]
        else:
            imgs = self.get_images_by_category(target_path)
        return sorted(imgs)

    def get_images_by_category(self, target_path):
        """Return images whose annotation holds at least one object of ``self.category``.

        Each image is listed at most once, however many matching objects its
        annotation contains. Annotations without a sibling ``.jpg`` are ignored.
        """
        imgs = []
        for file in target_path.glob("**/*"):
            if file.suffix != ".xml":
                continue
            img = file.with_suffix(".jpg")
            if not img.exists():
                continue
            root = ET.parse(file).getroot()
            # any() stops at the first match, so an image with several matching
            # objects is appended once rather than once per object.
            if any(self._matches_category(obj) for obj in root.findall("object")):
                imgs.append(img)
        return imgs

    def _rejection_reason(self, xml_path, bottomedge_value, min_width, min_height):
        """Return why the annotated image must be skipped, or None if it is acceptable.

        Only objects of ``self.category`` are inspected. The image is rejected when
        one of them reaches ``bottomedge_value`` or below (``ymax >= bottomedge_value``),
        is narrower than ``min_width``, is shorter than ``min_height``, or overlaps
        a previously inspected box of the same category.
        """
        root = ET.parse(xml_path).getroot()
        accepted = []  # (xmin, xmax, ymin, ymax) of the boxes inspected so far
        for obj in root.findall("object"):
            if not self._matches_category(obj):
                continue
            xmin = int(obj.find('bndbox/xmin').text)
            xmax = int(obj.find('bndbox/xmax').text)
            ymin = int(obj.find('bndbox/ymin').text)
            ymax = int(obj.find('bndbox/ymax').text)

            too_low = ymax >= bottomedge_value
            too_small = (xmax - xmin) < min_width or (ymax - ymin) < min_height
            if too_low or too_small:
                return "Bounding box outside size/position limits"

            # Two boxes overlap unless one lies entirely to the side of, above,
            # or below the other; touching edges do not count as overlap.
            overlapping = any(
                not (xmax <= box[0] or box[1] <= xmin or ymax <= box[2] or box[3] <= ymin)
                for box in accepted
            )
            if overlapping:
                return "Bounding box overlap found"
            accepted.append((xmin, xmax, ymin, ymax))
        return None

    def gather_images_by_category(self, destination_folder: str, bottomedge_value: int, min_width: int, min_height: int,
                                  max_images: int = 4000, seed: int = None):
        """Copy a random sample of images and their XML files to ``destination_folder``.

        An image/XML pair is copied only when every box of ``self.category`` in
        the annotation satisfies ``ymax < bottomedge_value``, width ``>= min_width``
        and height ``>= min_height``, and no two such boxes overlap. Pairs already
        present in the destination are left untouched.

        Args:
            destination_folder: Directory to copy into; created if missing.
            bottomedge_value: Exclusive upper limit for a box's ``ymax``.
            min_width: Minimum box width (``xmax - xmin``).
            min_height: Minimum box height (``ymax - ymin``).
            max_images: Maximum number of images sampled from ``self.imgs``.
            seed: Optional seed making the random sample reproducible; when
                ``None`` the module-level ``random`` generator is used.
        """
        # Create the destination folder if it doesn't exist
        destination_path = Path(destination_folder)
        destination_path.mkdir(parents=True, exist_ok=True)

        # Randomly select up to ``max_images`` images from self.imgs
        rng = random if seed is None else random.Random(seed)
        random_images = rng.sample(self.imgs, min(len(self.imgs), max_images))

        for img in random_images:
            xml = img.with_suffix(".xml")
            if not xml.exists():
                print(f"XML file not found for {img.name}. Skipping image.")
                continue
            if not img.exists():
                print(f"Corresponding JPG file not found for {xml.name}. Skipping XML file.")
                continue

            new_path = destination_path / img.name
            new_path1 = destination_path / xml.name
            if new_path.exists() and new_path1.exists():
                print(f"Files {img.name} and {xml.name} already exist in {destination_folder}. Skipping copy.")
                continue

            reason = self._rejection_reason(xml, bottomedge_value, min_width, min_height)
            if reason is not None:
                print(f"{reason} in {img.name}. Skipping copy.")
                continue

            shutil.copy(str(img), str(new_path))
            shutil.copy(str(xml), str(new_path1))
            print(f"Copied {img.name} to {destination_folder}.")
            print(f"Copied {xml.name} to {destination_folder}.")

        print(f"Images with category '{self.category}' and ymax < {bottomedge_value}, bbox width >= {min_width}, and bbox height >= {min_height} copied to '{destination_folder}'.")


In [5]:
# Run configuration: dataset root, target folder name, annotation category, and destination folder.
# dataset_path is the root directory; target_folder is the sub-folder of it that gets scanned.
dataset_path = "/mnt/disks/data1/aihub/Training/"

target_folder = "서해_군산항_4구역_BOX"

# Category text compared against each object's property/category in the XML annotations.
target_category = "선박"

# Copies are written here (under dataset_path, but outside target_folder).
destination_folder = "/mnt/disks/data1/aihub/Training/Modified_Data5"

# Constructing the interactor scans the target folder for matching images.
interactor = ImageInteractor(dataset_path, target_folder, target_category)

# Thresholds passed to gather_images_by_category: bottomedge_value, min_width and min_height
# (adjust these to suit the data being processed).
interactor.gather_images_by_category(destination_folder, bottomedge_value=2140, min_width=50, min_height=50)

Bounding box overlap found in 군산항_맑음_20201114_0453_050.jpg. Skipping copy.
Bounding box overlap found in 군산항_맑음_20201114_0338_059.jpg. Skipping copy.
Copied 군산항_맑음_20201124_0305_061.jpg to /mnt/disks/data1/aihub/Training/Modified_Data5.
Copied 군산항_맑음_20201124_0305_061.xml to /mnt/disks/data1/aihub/Training/Modified_Data5.
Copied 군산항_맑음_20201121_0337_014.jpg to /mnt/disks/data1/aihub/Training/Modified_Data5.
Copied 군산항_맑음_20201121_0337_014.xml to /mnt/disks/data1/aihub/Training/Modified_Data5.
Bounding box overlap found in 군산항_맑음_20201113_0035_039.jpg. Skipping copy.
Copied 군산항_맑음_20201114_0220_015.jpg to /mnt/disks/data1/aihub/Training/Modified_Data5.
Copied 군산항_맑음_20201114_0220_015.xml to /mnt/disks/data1/aihub/Training/Modified_Data5.
Bounding box overlap found in 군산항_맑음_20201113_0019_054.jpg. Skipping copy.
Copied 군산항_맑음_20201124_0097_058.jpg to /mnt/disks/data1/aihub/Training/Modified_Data5.
Copied 군산항_맑음_20201124_0097_058.xml to /mnt/disks/data1/aihub/Training/Modified_Data5.
Copi