In [8]:
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil
import random

class ImageInteractor:
    """Locate annotated images in a dataset folder and copy a filtered sample.

    Images are ``.jpg`` files; the annotation for an image is looked up as the
    same path with an ``.xml`` suffix. From each annotation the code reads the
    ``<object>`` elements, their ``property/category`` text and their
    ``bndbox/xmin|ymin|xmax|ymax`` integer values.

    Attributes:
        dataset_path: Root directory of the dataset, stored as a ``Path``.
        target: Name of the sub-folder of ``dataset_path`` that is searched.
        category: Category label to filter on, or ``None`` for no filtering.
        imgs: Sorted list of image paths collected by ``get_images``.
    """

    def __init__(self, dataset_path: str, target: str, category: str = None):
        """Store the configuration and scan the target folder for images.

        Args:
            dataset_path: Root directory of the dataset.
            target: Sub-folder of ``dataset_path`` to search recursively.
            category: Only keep images whose annotation contains an object of
                this category. ``None`` (or an empty string) keeps every image.
        """
        self.dataset_path = Path(dataset_path)
        self.target = target
        self.category = category
        # Scanned once here; gather_images_by_category samples from this list.
        self.imgs = self.get_images()

    def get_images(self):
        """Return the sorted image paths found under ``dataset_path / target``.

        Without a category every ``.jpg`` file is returned; with a category the
        selection is delegated to ``get_images_by_category``.
        """
        target_path = self.dataset_path / self.target
        if not self.category:
            imgs = [image for image in target_path.glob("**/*") if image.suffix == ".jpg"]
        else:
            imgs = self.get_images_by_category(target_path)
        return sorted(imgs)

    def _matches_category(self, obj):
        """Return True if ``obj`` has a ``property/category`` equal to ``self.category``.

        An ``<object>`` without that element simply does not match instead of
        raising ``AttributeError`` on ``None.text``.
        """
        node = obj.find("property/category")
        return node is not None and node.text == self.category

    def get_images_by_category(self, target_path):
        """Return images under ``target_path`` annotated with ``self.category``.

        Args:
            target_path: Directory that is searched recursively for ``.xml`` files.

        Returns:
            A list with one entry per image whose annotation contains at least
            one matching object and whose ``.jpg`` file exists. Each image
            appears once, no matter how many of its objects match.
        """
        imgs = []
        for file in target_path.glob("**/*"):
            if file.suffix != ".xml":
                continue
            img = file.with_suffix(".jpg")
            # No point parsing an annotation whose image is missing.
            if not img.exists():
                continue
            root = ET.parse(file).getroot()
            # any() stops at the first match, so the image is added only once.
            if any(self._matches_category(obj) for obj in root.findall("object")):
                imgs.append(img)
        return imgs

    def _category_boxes_pass(self, root, bottomedge_value, min_width, min_height):
        """Check the bounding boxes of every ``self.category`` object in ``root``.

        Returns True only when at least one object of the category exists and
        every such object passes all limits. A single failing box, or a box
        with missing or non-integer coordinates, rejects the whole annotation.
        """
        found_box = False
        for obj in root.findall("object"):
            if not self._matches_category(obj):
                continue
            try:
                xmin, ymin, xmax, ymax = (
                    int(obj.findtext(f"bndbox/{tag}"))
                    for tag in ("xmin", "ymin", "xmax", "ymax")
                )
            except (TypeError, ValueError):
                # TypeError: element missing (findtext gave None);
                # ValueError: text is not an integer.
                return False

            # Inclusive extents: a box from 0 to 9 is 10 wide.
            bbox_width = xmax - xmin + 1
            bbox_height = ymax - ymin + 1

            if ymax > bottomedge_value or bbox_width < min_width or bbox_height < min_height:
                return False
            found_box = True
        return found_box

    def gather_images_by_category(self, destination_folder: str, bottomedge_value: int, min_width: int, min_height: int, max_images: int = 2000):
        """Copy a random sample of qualifying image/annotation pairs.

        A pair is copied when its annotation contains at least one object of
        ``self.category`` and every object of that category has
        ``ymax <= bottomedge_value``, width ``>= min_width`` and height
        ``>= min_height``. Pairs already present in the destination are skipped.
        Progress and skip reasons are printed.

        Args:
            destination_folder: Directory to copy into; created if missing.
            bottomedge_value: Largest accepted ``ymax`` of a bounding box.
            min_width: Smallest accepted bounding-box width (inclusive extent).
            min_height: Smallest accepted bounding-box height (inclusive extent).
            max_images: Upper bound on how many images from ``self.imgs`` are
                sampled and examined. Defaults to 2000.
        """
        # Create the destination folder if it doesn't exist
        destination_path = Path(destination_folder)
        destination_path.mkdir(parents=True, exist_ok=True)

        # Randomly select up to max_images images; clamp at 0 so a negative
        # limit yields an empty sample instead of a ValueError.
        sample_size = max(0, min(len(self.imgs), max_images))
        random_images = random.sample(self.imgs, sample_size)

        for img in random_images:
            xml = img.with_suffix(".xml")
            if not xml.exists():
                print(f"XML file not found for {img.name}. Skipping image.")
                continue
            if not img.exists():
                print(f"Corresponding JPG file not found for {xml.name}. Skipping XML file.")
                continue

            new_path = destination_path / img.name
            new_path1 = destination_path / xml.name

            # Skip only when both files are already there; a half-copied pair
            # is processed again.
            if new_path.exists() and new_path1.exists():
                print(f"Files {img.name} and {xml.name} already exist in {destination_folder}. Skipping copy.")
                continue

            root = ET.parse(xml).getroot()
            if self._category_boxes_pass(root, bottomedge_value, min_width, min_height):
                shutil.copy(str(img), str(new_path))
                shutil.copy(str(xml), str(new_path1))
                print(f"Copied {img.name} to {destination_folder}.")
                print(f"Copied {xml.name} to {destination_folder}.")
            else:
                print(f"No valid bounding box found for {img.name}. Skipping copy.")

        print(f"Images with category '{self.category}' and ymax < {bottomedge_value}, bbox width >= {min_width}, and bbox height >= {min_height} copied to '{destination_folder}'.")

In [11]:
# Configuration: dataset root, name of the target sub-folder, category label
# to select, and the folder the selected files are copied into.
dataset_path = "/mnt/disks/data1/aihub/Training/"

target_folder = "서해_대천항_2구역_BOX"

target_category = "등대"

destination_folder = "/mnt/disks/data1/aihub/Training/Modified_Data4"

# Bounding-box limits passed to gather_images_by_category; adjust per dataset.
BOTTOM_EDGE_MAX_Y = 2140  # boxes whose ymax is greater than this are rejected
MIN_BBOX_WIDTH = 50       # boxes narrower than this are rejected
MIN_BBOX_HEIGHT = 50      # boxes shorter than this are rejected

# Seed for the random image sample, so re-running this cell on a fresh kernel
# examines the same images. Change the value to draw a different sample.
RANDOM_SEED = 42

interactor = ImageInteractor(dataset_path, target_folder, target_category)

# Seed immediately before the call that samples, so the draw does not depend
# on any earlier use of the global random generator.
random.seed(RANDOM_SEED)
interactor.gather_images_by_category(
    destination_folder,
    bottomedge_value=BOTTOM_EDGE_MAX_Y,
    min_width=MIN_BBOX_WIDTH,
    min_height=MIN_BBOX_HEIGHT,
)

No valid bounding box found for 대천항_맑음_20201124_5478_0072.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201210_5299_0017.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201121_5404_0630.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201220_5435_0274.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201210_5324_0364.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201210_5171_0157.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201206_5079_0513.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201210_5296_0589.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201211_5336_0341.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201220_5437_0524.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201210_5162_0147.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201210_5196_0607.jpg. Skipping copy.
No valid bounding box found for 대천항_맑음_20201222_5463_0655.jpg. S