In [1]:
import os
import cv2
import numpy as np
import glob
import shutil

import torch
import torchvision

from groundingdino.util.inference import Model
from segment_anything import sam_model_registry, SamPredictor

import config.config as cfg
import params
import utils

# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment before any model is
# built in the cells below.
# NOTE(review): this is normally a debugging aid (it is documented by CUDA as
# making kernel launches synchronous, which slows GPU work) — confirm it is
# still wanted for full runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
# GroundingDINO: inference wrapper built from the configured model definition
# and checkpoint.
grounding_dino_model = Model(
    model_checkpoint_path=cfg.GROUNDING_DINO_CHECKPOINT_PATH,
    model_config_path=cfg.GROUNDING_DINO_CONFIG_PATH,
)

# SAM: look up the constructor for the configured encoder version, load its
# checkpoint, place the network on the configured device, and wrap it in a
# predictor.
sam_builder = sam_model_registry[cfg.SAM_ENCODER_VERSION]
sam = sam_builder(checkpoint=cfg.SAM_CHECKPOINT_PATH)
sam.to(device=params.DEVICE)
sam_predictor = SamPredictor(sam)



final text_encoder_type: bert-base-uncased


In [None]:
# Segment a random sample of lizards per island.
#
# For every sampled lizard image this cell:
#   1. detects candidate boxes with GroundingDINO,
#   2. keeps the single most confident box,
#   3. turns that box into a mask with SAM,
#   4. writes a YOLO polygon label, a copy of the raw image, and an annotated
#      preview image (prefixed with "gs_").
# The annotated preview doubles as the "already processed" marker, so the cell
# can be re-run to resume an interrupted job.

# Create directories for storing segmented lizards, images, and labels
os.makedirs(cfg.SEGMENTED_LIZARDS_PATH, exist_ok=True)
os.makedirs(cfg.SEGMENTED_IMAGES_PATH, exist_ok=True)
os.makedirs(cfg.SEGMENTED_LABELS_PATH, exist_ok=True)

# Maximum number of distinct lizards sampled per island.
LIZARDS_PER_ISLAND = 100
# Fixed seed so that a re-run (e.g. after an interruption) samples the same
# lizards and the skip-if-processed check below actually resumes the job.
SAMPLING_SEED = 42
rng = np.random.default_rng(SAMPLING_SEED)

# Iterate over each island directory (sorted: glob order is OS-dependent and
# would otherwise make the seeded sample non-reproducible).
for island_path in sorted(glob.glob(f"{cfg.RAW_IMAGES_PATH}/*")):
    print(os.path.splitext(os.path.basename(island_path))[0])

    lizard_paths = sorted(glob.glob(f"{island_path}/*"))
    if not lizard_paths:
        # Sampling from an empty list raises; an empty island is simply skipped.
        print(f"No lizards found in {island_path}, skipping")
        continue

    # Sample WITHOUT replacement so no lizard is drawn twice, and cap the
    # sample size at the number of lizards actually available.
    sample_size = min(LIZARDS_PER_ISLAND, len(lizard_paths))
    sampled_lizard_paths = rng.choice(lizard_paths, size=sample_size, replace=False)

    for lizard_path in sampled_lizard_paths:

        # Process each image of the lizard
        for image_path in glob.glob(f"{lizard_path}/*.jpg"):
            # Get image name without extension
            image_name = os.path.splitext(os.path.basename(image_path))[0]
            gsam_image_path = f"{cfg.SEGMENTED_LIZARDS_PATH}/gs_{image_name}.jpg"

            image_output_path = f"{cfg.SEGMENTED_IMAGES_PATH}/{image_name}.jpg"
            label_output_path = f"{cfg.SEGMENTED_LABELS_PATH}/{image_name}.txt"

            # Skip processing if the image has already been processed.
            # Checked before reading the image to avoid needless disk I/O.
            if os.path.exists(gsam_image_path):
                print(f"Image {image_name} already processed")
                continue

            # cv2.imread signals failure by returning None instead of raising.
            image = cv2.imread(image_path)
            if image is None:
                print(
                    f"Error processing image: {image_name}, "
                    f"Error: could not read {image_path}"
                )
                continue

            print(f"Processing image: {image_name}")

            # Clear CUDA cache to free memory
            torch.cuda.empty_cache()

            # Detect objects using GroundingDINO
            detections = grounding_dino_model.predict_with_classes(
                image=image,
                classes=params.CLASSES,
                box_threshold=params.BOX_THRESHOLD,
                text_threshold=params.TEXT_THRESHOLD,
            )

            # Nothing detected: report it explicitly rather than letting
            # argmax fail on an empty array further down.
            if len(detections.xyxy) == 0:
                print(
                    f"Error processing image: {image_name}, "
                    f"Error: no detections found"
                )
                continue

            # Apply Non-Maximum Suppression (NMS) to filter detections
            nms_idx = (
                torchvision.ops.nms(
                    torch.from_numpy(detections.xyxy),
                    torch.from_numpy(detections.confidence),
                    params.NMS_THRESHOLD,
                )
                .numpy()
                .tolist()
            )

            # Update detections after NMS
            detections.xyxy = detections.xyxy[nms_idx]
            detections.confidence = detections.confidence[nms_idx]
            detections.class_id = detections.class_id[nms_idx]

            try:
                # Select the detection with the highest confidence; the
                # reshapes keep the leading "one detection" axis.
                max_confidence_idx = np.argmax(detections.confidence)
                detections.xyxy = detections.xyxy[max_confidence_idx].reshape(1, 4)
                detections.confidence = detections.confidence[
                    max_confidence_idx
                ].reshape(1)
                detections.class_id = detections.class_id[max_confidence_idx].reshape(1)

                # Convert detections to masks using SAM predictor
                # (image is converted from OpenCV's BGR order to RGB first).
                detections.mask = utils.segment(
                    sam_predictor=sam_predictor,
                    image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
                    xyxy=detections.xyxy,
                )

                # Annotate image with detections
                annotated_image = utils.annotateImageWithDetections(image, detections)

                # Only one detection was kept, so its mask is the first one.
                mask = detections.mask[0]

                # Convert mask to polygon
                polygon = utils.mask_to_polygon(mask)

                # Save the polygon as YOLO txt label and the raw image copy.
                utils.save_polygon_label_as_yolo_txt(label_output_path, polygon)
                # cv2.imwrite returns False on failure instead of raising.
                if not cv2.imwrite(image_output_path, image):
                    raise OSError(f"could not write {image_output_path}")

                # The annotated image is the "already processed" marker, so it
                # is written last: if anything above fails, the image will be
                # retried on the next run instead of being skipped half-done.
                if not cv2.imwrite(gsam_image_path, annotated_image):
                    raise OSError(f"could not write {gsam_image_path}")

            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(f"Error processing image: {image_name}, Error: {str(e)}")
                continue

The next step is done by hand: we manually select the images that are perfectly segmented. Only these images will be used to train the YOLOv8 instance segmentation model.

In [None]:
# Move each hand-selected image, together with its label, into the training
# folders.
#
# The files in cfg.SELECTED_IMAGES_PATH are the annotated previews written by
# the segmentation cell as "gs_<image_name>.jpg"; the prefix is stripped to
# recover the name of the raw image and its label.
# The cell is safe to re-run: pairs that were already moved (or are otherwise
# incomplete) are reported and skipped instead of raising from shutil.move.

GSAM_PREFIX = "gs_"

os.makedirs(f"{cfg.IMAGES_PATH}", exist_ok=True)
os.makedirs(f"{cfg.LABELS_PATH}", exist_ok=True)

for selected_path in glob.glob(f"{cfg.SELECTED_IMAGES_PATH}/*.jpg"):
    selected_name = os.path.splitext(os.path.basename(selected_path))[0]

    # Strip the prefix only when it is really there, rather than blindly
    # dropping the first three characters of the name.
    if selected_name.startswith(GSAM_PREFIX):
        image_name = selected_name[len(GSAM_PREFIX):]
    else:
        image_name = selected_name

    image_path = f"{cfg.SEGMENTED_IMAGES_PATH}/{image_name}.jpg"
    label_path = f"{cfg.SEGMENTED_LABELS_PATH}/{image_name}.txt"

    # Require both files up front so an image is never moved without its label.
    if not (os.path.exists(image_path) and os.path.exists(label_path)):
        print(f"Skipping {image_name}: image or label not found (already moved?)")
        continue

    # shutil.move into a directory raises if a file of that name already
    # exists there, so check first and leave both source files in place.
    image_dest = os.path.join(cfg.IMAGES_PATH, os.path.basename(image_path))
    label_dest = os.path.join(cfg.LABELS_PATH, os.path.basename(label_path))
    if os.path.exists(image_dest) or os.path.exists(label_dest):
        print(f"Skipping {image_name}: already present in the train folder")
        continue

    shutil.move(image_path, cfg.IMAGES_PATH)
    shutil.move(label_path, cfg.LABELS_PATH)

Once we have the selected images and their corresponding labels, we upload them to Roboflow so that preprocessing techniques can be applied more easily. In the next notebook, we train the segmentation model.