## Creating a classification dataset from the YOLO dataset for bacilli

In [23]:
import math
import os
import random
import shutil
from pathlib import Path

import cv2

In [24]:
# Dataset split to convert (e.g. "train" or "val"). The label, image and output
# folders are all derived from this one value so they cannot drift apart.
SPLIT = "val"

# Paths (adjust these)
DATASET_ROOT = Path("/home/julian/indonezia/dataset/instance-segmentation-dataset/dataset/08_01_dataset_negative_v1")
labels_dir = DATASET_ROOT / SPLIT / "labels"  # directory with .txt label files
images_dir = DATASET_ROOT / SPLIT / "images"  # directory with image files
output_dir = Path("/home/julian/indonezia/dataset/classification_dataset_julian") / SPLIT

# One output folder per class; created up front so later cells can write into them.
bacil_dir = output_dir / "bacil"
debris_dir = output_dir / "debris"
bacil_dir.mkdir(parents=True, exist_ok=True)
debris_dir.mkdir(parents=True, exist_ok=True)

# Instance counters referenced by the (currently commented-out) label-counting check.
bacil_count = 0
debris_count = 0

# Per-class storage for (image_path, polygon_coords) entries.
polygons_by_class = {
    0: [],  # for bacil
    1: [],  # for debris
}
# Upper bound on whole images copied per class.
MAX_IMAGES_PER_CLASS = 4_000
# Seed the global RNG so shuffles that rely on it are repeatable.
random.seed(42)


In [7]:
# Optional sanity check (disabled): count how many bacil (class 0) and debris
# (class 1) instances appear across all label files in labels_dir.
# The commented-out code increments bacil_count / debris_count, so those
# counters must already exist when it is re-enabled.
# NOTE(review): re-running it without resetting the counters would keep adding
# to the previous totals.

# for label_file in os.listdir(labels_dir):
#     if not label_file.endswith(".txt"):
#         continue
    
#     file_path = os.path.join(labels_dir, label_file)
    
#     with open(file_path, "r") as f:
#         for line in f:
#             items = line.strip().split()
#             class_id = int(items[0])
            
#             if class_id == 0:
#                 bacil_count += 1
#             elif class_id == 1:
#                 debris_count += 1

# # Print results
# print(f"Total bacil (class 0) instances: {bacil_count}")
# print(f"Total debris (class 1) instances: {debris_count}")

# The bare string below is a no-op expression; it appears to record the counts
# printed by an earlier run of the check above (which dataset split it refers to
# is not stated here - TODO confirm).
"""
Total bacil (class 0) instances: 20269
Total debris (class 1) instances: 26532
"""

'\nTotal bacil (class 0) instances: 20269\nTotal debris (class 1) instances: 26532\n'

In [16]:
# Optional sanity check (disabled): per-image annotation statistics.
# The commented-out code counts, over all .txt label files in labels_dir:
#   - images with more than 2 annotation lines,
#   - images with more than 1 annotation line,
#   - among the latter, images whose annotations use more than one class id.

# more_than_2_instances_count = 0
# multi_class_images_count = 0

# # We also want to track how many images have more than 1 instance (regardless of classes)
# # This helps with the second question.
# multi_instance_image_count = 0

# # Iterate over all label files
# for label_file in os.listdir(labels_dir):
#     if not label_file.endswith(".txt"):
#         continue
    
#     file_path = os.path.join(labels_dir, label_file)
    
#     with open(file_path, "r") as f:
#         lines = f.read().strip().splitlines()
    
#     # Count number of instances
#     num_instances = len(lines)
    
#     # Check how many images have more than 2 annotations
#     if num_instances > 2:
#         more_than_2_instances_count += 1
    
#     # For multi-class check, parse classes and see if there's more than one unique class
#     if num_instances > 1:
#         multi_instance_image_count += 1
#         class_ids = []
#         for line in lines:
#             items = line.strip().split()
#             class_id = items[0]  # '0' or '1' in string form
#             class_ids.append(class_id)
        
#         # If there's more than one unique class, increment multi_class_images_count
#         if len(set(class_ids)) > 1:
#             multi_class_images_count += 1

# print(f"Number of images with more than 2 instance annotations: {more_than_2_instances_count}")
# print(f"Number of images with more than 1 instance annotation: {multi_instance_image_count}")
# print(f"Among those with more than 1 instance annotation, "
#       f"number of images that have different classes: {multi_class_images_count}")
# The bare string below is a no-op expression; it appears to record the output of
# an earlier run of the check above (TODO confirm which dataset split it covers).
# Its last line reports 0 mixed-class images.
"""
Number of images with more than 2 instance annotations: 2376
Number of images with more than 1 instance annotation: 4518
Among those with more than 1 instance annotation, number of images that have different classes: 0
"""

'\nNumber of images with more than 2 instance annotations: 2376\nNumber of images with more than 1 instance annotation: 4518\nAmong those with more than 1 instance annotation, number of images that have different classes: 0\n'

## Create a dataset by copying whole images into per-class folders

In [25]:

# Whole-image paths grouped by the single class annotated in each image.
images_by_class = {
    0: [],  # bacil
    1: [],  # debris
}

for label_file in labels_dir.iterdir():
    # Anything that is not a .txt label file is ignored.
    if label_file.suffix.lower() != ".txt":
        continue

    # Labels and images share a stem, e.g. "image000.txt" -> "image000.png".
    # Adjust the extension here if the images are stored in another format.
    image_file = images_dir / f"{label_file.stem}.png"
    if not image_file.exists():
        # No matching image on disk: nothing to classify.
        continue

    with open(label_file, "r") as fh:
        rows = [row for row in (raw.strip() for raw in fh) if row]

    # A label file without any annotation rows is skipped.
    if not rows:
        continue

    # The first token of every annotation row is the class id.
    classes_in_image = {int(row.split()[0]) for row in rows}

    # Keep the image only when all of its annotations agree on one class ...
    if len(classes_in_image) != 1:
        continue

    # ... and that class is one of the two known ones (0 = bacil, 1 = debris).
    (main_class,) = classes_in_image
    if main_class in (0, 1):
        images_by_class[main_class].append(image_file)


In [26]:

# Dedicated fixed-seed generator: re-running this cell always yields the same
# selection, no matter how far the global `random` state has advanced.
selection_rng = random.Random(42)

selected_images = {
    0: [],
    1: [],
}

for cls_id in [0, 1]:
    # Sort first because directory listing order is filesystem-dependent, and
    # shuffle a copy so images_by_class itself is left untouched.
    img_list = sorted(images_by_class[cls_id])
    selection_rng.shuffle(img_list)
    # Keep at most MAX_IMAGES_PER_CLASS images for this class.
    selected_images[cls_id] = img_list[:MAX_IMAGES_PER_CLASS]

print(f"Class 0 (bacil): found {len(images_by_class[0])} total, taking {len(selected_images[0])}")
print(f"Class 1 (debris): found {len(images_by_class[1])} total, taking {len(selected_images[1])}")


Class 0 (bacil): found 1304 total, taking 1304
Class 1 (debris): found 7981 total, taking 4000


In [27]:
# Destination folder for each class id (0 = bacil, 1 = debris).
class_dirs = {0: bacil_dir, 1: debris_dir}

for cls_id, target_dir in class_dirs.items():
    for img_path in selected_images[cls_id]:
        # copy2 keeps file metadata; the original filename is preserved.
        destination = target_dir / img_path.name
        shutil.copy2(img_path, destination)

# Report what is now present in each class folder.
bacil_total = sum(1 for _ in bacil_dir.iterdir())
debris_total = sum(1 for _ in debris_dir.iterdir())

print("Done! Classification dataset created at:", output_dir)
print("Bacil folder:", bacil_total, "images")
print("Debris folder:", debris_total, "images")

Done! Classification dataset created at: /home/julian/indonezia/dataset/classification_dataset_julian/val
Bacil folder: 1304 images
Debris folder: 4000 images


## Create a dataset by cropping each annotated object and saving the crops

In [8]:
# 1. Gather polygons from labels
# Start from empty lists so that re-running this cell does not append the same
# polygons a second time.
polygons_by_class = {
    0: [],  # for bacil
    1: [],  # for debris
}
malformed_line_count = 0

for label_file in labels_dir.iterdir():
    # Case-insensitive extension check, so ".TXT" label files are not missed.
    if label_file.suffix.lower() != ".txt":
        continue

    # Derive image filename from label filename,
    # e.g. label_file = "image0.txt" -> image_file = "image0.png".
    # Adjust if your images have a different extension.
    image_file = images_dir / (label_file.stem + ".png")

    # If the corresponding image does not exist, skip
    if not image_file.exists():
        continue

    # Read label lines
    with open(label_file, "r") as f:
        for line in f:
            items = line.split()

            # Blank lines (e.g. a trailing newline) carry no annotation.
            if not items:
                continue

            class_id = int(items[0])

            # We only have 2 classes (0 or 1)
            if class_id not in [0, 1]:
                continue

            # The rest of the items are polygon coordinates: x1, y1, x2, y2, ...
            coords = [float(c) for c in items[1:]]

            # A usable polygon needs complete (x, y) pairs; a line without
            # coordinates would make the bounding-box computation fail later.
            if len(coords) < 2 or len(coords) % 2 != 0:
                malformed_line_count += 1
                continue

            # Store it as (image_path, coords) under its class id
            polygons_by_class[class_id].append((image_file, coords))

if malformed_line_count:
    print(f"Skipped {malformed_line_count} label line(s) with missing or unpaired coordinates")


In [9]:
# 2. For each class, keep at most MAX_PER_CLASS polygons
MAX_PER_CLASS = 10000
selected_polygons = {0: [], 1: []}

# Dedicated fixed-seed generator: re-running this cell always yields the same
# selection, no matter how far the global `random` state has advanced.
polygon_rng = random.Random(42)

for cls_id in [0, 1]:
    # Sort first because the gathering order follows the filesystem's directory
    # listing; shuffle a copy so polygons_by_class itself is left untouched.
    candidates = sorted(polygons_by_class[cls_id])
    polygon_rng.shuffle(candidates)

    # Take up to MAX_PER_CLASS entries
    selected_polygons[cls_id] = candidates[:MAX_PER_CLASS]


In [11]:
# 3. Function to compute bounding box from polygon coords
def polygon_to_bbox(coords, img_w, img_h):
    """
    Compute the pixel bounding box that encloses a normalized polygon.

    coords: [x1, y1, x2, y2, ..., xN, yN] in normalized format (0 to 1).
    img_w, img_h: image width and height in pixels.

    Returns (xmin, ymin, xmax, ymax) as integers in absolute pixel coordinates.
    The minimum edges are rounded down and the maximum edges are rounded up, so
    a half-open slice such as img[ymin:ymax, xmin:xmax] still contains the
    pixels in which the right-most / bottom-most vertices fall.

    Raises ValueError if coords does not hold at least one (x, y) pair.
    """
    if len(coords) < 2:
        raise ValueError("coords must contain at least one (x, y) pair")

    # coords come in pairs
    xs = coords[0::2]  # x1, x2, ...
    ys = coords[1::2]  # y1, y2, ...

    # Convert normalized -> absolute pixel coordinates.
    # Truncating the maximum (as int() would) drops the last pixel column/row
    # of the object, hence floor for the min edges and ceil for the max edges.
    xmin = math.floor(min(xs) * img_w)
    xmax = math.ceil(max(xs) * img_w)
    ymin = math.floor(min(ys) * img_h)
    ymax = math.ceil(max(ys) * img_h)

    return xmin, ymin, xmax, ymax

# 4. Crop and save
def crop_and_save_polygon(image_path, coords, out_dir, index):
    """
    Cut the polygon's bounding box out of an image and write it as a PNG.

    image_path: path of the source image.
    coords: normalized polygon coordinates, passed on to polygon_to_bbox.
    out_dir: folder that receives the crop.
    index: number appended to the source image stem to build the output name
           ("<stem>_<index>.png").

    Nothing is written when the image cannot be read or the crop is empty.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # Missing or unreadable image.
        return

    height, width, _channels = img.shape
    left, top, right, bottom = polygon_to_bbox(coords, width, height)

    # Keep the box inside the image bounds.
    left, top = max(0, left), max(0, top)
    right, bottom = min(width, right), min(height, bottom)

    patch = img[top:bottom, left:right]
    if patch.size == 0:
        # Degenerate or fully out-of-image box.
        return

    destination = out_dir / f"{image_path.stem}_{index}.png"
    cv2.imwrite(str(destination), patch)

In [12]:
# Crop every selected polygon into the folder that belongs to its class
for cls_id, polygons in selected_polygons.items():
    # Class 0 is bacil; every other class id goes to the debris folder.
    class_dir = bacil_dir if cls_id == 0 else debris_dir

    print(f"Processing class {cls_id} -> {class_dir}")

    # The running index keeps crop filenames unique within a class.
    for crop_index, entry in enumerate(polygons):
        source_image, outline = entry
        crop_and_save_polygon(source_image, outline, class_dir, crop_index)

print("Done! Crops saved to:", output_dir)

Processing class 0 -> /home/julian/indonezia/dataset/classification_dataset_julian/train/bacil
Processing class 1 -> /home/julian/indonezia/dataset/classification_dataset_julian/train/debris


KeyboardInterrupt: 