# DINO Grounding Object Detection for VLSP 2025 Traffic Sign Dataset

This notebook performs object detection on all training images using the DINO Grounding model to detect traffic signs.
The detected objects will be annotated and saved to a new folder for further analysis.

## Import Required Libraries

In [1]:
import os
import json
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import torch
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


## Setup Paths and Configuration

In [2]:
# Define paths
BASE_PATH = Path("/home/mhieu/git/vlsp25")
TRAIN_IMAGES_PATH = BASE_PATH / "dataset/vlsp25/private_test/private_test_images"
OUTPUT_PATH = BASE_PATH / "image" / "private_detected_objects"
ANNOTATED_IMAGES_PATH = OUTPUT_PATH / "annotated_images"
CROPPED_IMAGES_PATH = OUTPUT_PATH / "cropped_objects"
DETECTION_RESULTS_PATH = OUTPUT_PATH / "detection_results"


# Create output directories
os.makedirs(ANNOTATED_IMAGES_PATH, exist_ok=True)
os.makedirs(CROPPED_IMAGES_PATH, exist_ok=True)
os.makedirs(DETECTION_RESULTS_PATH, exist_ok=True)


In [3]:
# # Define paths
# BASE_PATH = Path("/home/mhieu/git/vlsp25")
# TRAIN_IMAGES_PATH = BASE_PATH / "dataset/VLSP 2025 - MLQA-TSR Data Release/train_data/train_images"
# OUTPUT_PATH = BASE_PATH / "detected_objects"
# ANNOTATED_IMAGES_PATH = OUTPUT_PATH / "annotated_images"
# CROPPED_IMAGES_PATH = OUTPUT_PATH / "cropped_objects"
# DETECTION_RESULTS_PATH = OUTPUT_PATH / "detection_results"

# # Create output directories
# os.makedirs(ANNOTATED_IMAGES_PATH, exist_ok=True)
# os.makedirs(CROPPED_IMAGES_PATH, exist_ok=True)
# os.makedirs(DETECTION_RESULTS_PATH, exist_ok=True)

# Model configuration
MODEL_ID = "IDEA-Research/grounding-dino-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TEXT_PROMPT = "traffic sign."
BOX_THRESHOLD = 0.2
TEXT_THRESHOLD = 0.2

print(f"Using device: {DEVICE}")
print(f"Train images path: {TRAIN_IMAGES_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print(f"Annotated images will be saved to: {ANNOTATED_IMAGES_PATH}")
print(f"Cropped objects will be saved to: {CROPPED_IMAGES_PATH}")

Using device: cuda
Train images path: /home/mhieu/git/vlsp25/dataset/vlsp25/private_test/private_test_images
Output path: /home/mhieu/git/vlsp25/image/private_detected_objects
Annotated images will be saved to: /home/mhieu/git/vlsp25/image/private_detected_objects/annotated_images
Cropped objects will be saved to: /home/mhieu/git/vlsp25/image/private_detected_objects/cropped_objects


## Load DINO Grounding Model

In [4]:
print("Loading DINO Grounding model...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_ID).to(DEVICE)
print("Model loaded successfully!")

Loading DINO Grounding model...
Model loaded successfully!
Model loaded successfully!


## Get List of Training Images

In [5]:
# Get all training images
image_files = list(TRAIN_IMAGES_PATH.glob("*.jpg")) + list(TRAIN_IMAGES_PATH.glob("*.png"))
image_files.sort()

print(f"Found {len(image_files)} training images")
print(f"First 5 images: {[img.name for img in image_files[:5]]}")
print(f"Last 5 images: {[img.name for img in image_files[-5:]]}")

Found 104 training images
First 5 images: ['private_test_1_1.jpg', 'private_test_1_2.jpg', 'private_test_1_3.jpg', 'private_test_1_4.jpg', 'private_test_1_5.jpg']
Last 5 images: ['private_test_8_5.jpg', 'private_test_8_6.jpg', 'private_test_8_7.jpg', 'private_test_8_8.jpg', 'private_test_8_9.jpg']


## Define Object Detection Function

In [6]:
def detect_objects_in_image(image_path, text_prompt=TEXT_PROMPT, box_threshold=BOX_THRESHOLD, text_threshold=TEXT_THRESHOLD):
    """
    Detect objects in a single image using DINO Grounding model
    
    Args:
        image_path: Path to the image file
        text_prompt: Text prompt for object detection
        box_threshold: Threshold for bounding box confidence
        text_threshold: Threshold for text matching confidence
    
    Returns:
        dict: Detection results with boxes, scores, labels, and image info
    """
    try:
        # Load and process image
        image = Image.open(image_path).convert("RGB")
        
        # Prepare inputs
        inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(DEVICE)
        
        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Post-process results
        results = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            target_sizes=[image.size[::-1]]
        )
        
        result = results[0]
        
        # Apply thresholds manually
        if len(result["scores"]) > 0:
            # Filter based on score threshold (box_threshold)
            score_mask = result["scores"] >= box_threshold
            filtered_boxes = result["boxes"][score_mask]
            filtered_scores = result["scores"][score_mask]
            filtered_labels = [label for i, label in enumerate(result["labels"]) if score_mask[i]]
            
            result = {
                "boxes": filtered_boxes,
                "scores": filtered_scores,
                "labels": filtered_labels
            }
        
        return {
            "image_path": str(image_path),
            "image_name": image_path.name,
            "image_size": image.size,
            "num_detections": len(result["boxes"]),
            "boxes": result["boxes"].tolist() if len(result["boxes"]) > 0 else [],
            "scores": result["scores"].tolist() if len(result["scores"]) > 0 else [],
            "labels": result["labels"] if len(result["labels"]) > 0 else [],
            "text_prompt": text_prompt,
            "box_threshold": box_threshold,
            "text_threshold": text_threshold
        }
        
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return {
            "image_path": str(image_path),
            "image_name": image_path.name,
            "error": str(e),
            "num_detections": 0,
            "boxes": [],
            "scores": [],
            "labels": []
        }

## Define Annotation Function

In [7]:
def annotate_and_save_image(image_path, detection_result, output_path):
    """
    Annotate image with detection results and save to output path
    
    Args:
        image_path: Path to original image
        detection_result: Detection results from DINO model
        output_path: Path to save annotated image
    """
    try:
        # Load original image
        image = Image.open(image_path).convert("RGB")
        draw = ImageDraw.Draw(image)
        
        # Try to load a font, fallback to default if not available
        try:
            font = ImageFont.truetype("arial.ttf", 20)
        except:
            try:
                font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20)
            except:
                font = ImageFont.load_default()
        
        # Draw bounding boxes and labels
        for box, score, label in zip(detection_result["boxes"], detection_result["scores"], detection_result["labels"]):
            # Convert box to list if it's not already
            if not isinstance(box, list):
                box = box.tolist()
            
            # Create label text with confidence score
            label_text = f"{label} ({score:.2f})"
            
            # Draw bounding box
            draw.rectangle(box, outline="red", width=3)
            
            # Draw label
            # draw.text((box[0], box[1] - 25), label_text, fill="red", font=font)
        
        # Add summary text
        # summary_text = f"Detections: {detection_result['num_detections']}"
        # draw.text((10, 10), summary_text, fill="blue", font=font)
        
        # Save annotated image
        image.save(output_path)
        
    except Exception as e:
        print(f"Error annotating {image_path}: {str(e)}")

## Define Crop Function

In [8]:
def crop_and_save_detections(image_path, detection_result, output_dir):
    """
    Crop detected objects from the image and save them as individual files
    
    Args:
        image_path: Path to original image
        detection_result: Detection results from DINO model
        output_dir: Directory to save cropped images
    """
    try:
        # Load original image
        image = Image.open(image_path).convert("RGB")
        image_name = Path(image_path).stem
        
        saved_crops = []
        
        # Crop each detected object
        for i, (box, score, label) in enumerate(zip(detection_result["boxes"], detection_result["scores"], detection_result["labels"])):
            # Convert box to list if it's not already
            if not isinstance(box, list):
                box = box.tolist()
            
            # Ensure box coordinates are within image bounds
            x1, y1, x2, y2 = box
            x1 = max(0, int(x1))
            y1 = max(0, int(y1))
            x2 = min(image.width, int(x2))
            y2 = min(image.height, int(y2))
            
            # Skip if box is too small or invalid
            if x2 <= x1 or y2 <= y1 or (x2 - x1) < 10 or (y2 - y1) < 10:
                continue
            
            # Crop the image
            cropped_image = image.crop((x1, y1, x2, y2))
            
            # Create filename for the cropped image
            clean_label = label.replace(" ", "_").replace(".", "")
            crop_filename = f"{image_name}_crop_{i+1}_{clean_label}_{score:.2f}.jpg"
            crop_path = output_dir / crop_filename
            
            # Save the cropped image
            cropped_image.save(crop_path, quality=95)
            
            saved_crops.append({
                "crop_filename": crop_filename,
                "crop_path": str(crop_path),
                "box": box,
                "score": score,
                "label": label,
                "crop_size": cropped_image.size
            })
        
        return saved_crops
        
    except Exception as e:
        print(f"Error cropping objects from {image_path}: {str(e)}")
        return []

## Process All Training Images

In [9]:
# Initialize results storage
all_detection_results = []
all_cropped_info = []
processing_stats = {
    "total_images": len(image_files),
    "processed": 0,
    "errors": 0,
    "total_detections": 0,
    "images_with_detections": 0,
    "total_cropped_objects": 0
}

print(f"Starting object detection on {len(image_files)} images...")
print(f"Results will be saved to: {OUTPUT_PATH}")

# Process images with progress bar
for i, image_path in enumerate(tqdm(image_files, desc="Processing images")):
    # Perform object detection
    detection_result = detect_objects_in_image(image_path)
    all_detection_results.append(detection_result)
    
    # Update statistics
    processing_stats["processed"] += 1
    if "error" in detection_result:
        processing_stats["errors"] += 1
    else:
        processing_stats["total_detections"] += detection_result["num_detections"]
        if detection_result["num_detections"] > 0:
            processing_stats["images_with_detections"] += 1
    
    # Process images with detections
    if detection_result["num_detections"] > 0:
        # Save annotated image
        output_image_path = ANNOTATED_IMAGES_PATH / f"annotated_{image_path.name}"
        annotate_and_save_image(image_path, detection_result, output_image_path)
        
        # Crop and save individual detected objects
        cropped_info = crop_and_save_detections(image_path, detection_result, CROPPED_IMAGES_PATH)
        if cropped_info:
            all_cropped_info.extend(cropped_info)
            processing_stats["total_cropped_objects"] += len(cropped_info)
    
    # Print progress every 50 images
    if (i + 1) % 50 == 0:
        print(f"Processed {i + 1}/{len(image_files)} images. "
              f"Found {processing_stats['total_detections']} detections, "
              f"saved {processing_stats['total_cropped_objects']} cropped objects so far.")

print("\nProcessing completed!")
print(f"Processed: {processing_stats['processed']} images")
print(f"Errors: {processing_stats['errors']} images")
print(f"Total detections: {processing_stats['total_detections']}")
print(f"Images with detections: {processing_stats['images_with_detections']}")
print(f"Total cropped objects saved: {processing_stats['total_cropped_objects']}")

Starting object detection on 104 images...
Results will be saved to: /home/mhieu/git/vlsp25/image/private_detected_objects


Processing images:  48%|████▊     | 50/104 [02:19<02:30,  2.78s/it]

Processed 50/104 images. Found 165 detections, saved 164 cropped objects so far.


Processing images:  96%|█████████▌| 100/104 [04:35<00:11,  2.83s/it]

Processed 100/104 images. Found 362 detections, saved 360 cropped objects so far.


Processing images: 100%|██████████| 104/104 [04:45<00:00,  2.75s/it]


Processing completed!
Processed: 104 images
Errors: 0 images
Total detections: 373
Images with detections: 104
Total cropped objects saved: 371





## Save Detection Results

In [10]:
# Save detailed results as JSON
results_file = DETECTION_RESULTS_PATH / "detection_results.json"
with open(results_file, 'w') as f:
    json.dump({
        "metadata": {
            "timestamp": datetime.now().isoformat(),
            "model_id": MODEL_ID,
            "text_prompt": TEXT_PROMPT,
            "box_threshold": BOX_THRESHOLD,
            "text_threshold": TEXT_THRESHOLD,
            "device": DEVICE,
            "statistics": processing_stats
        },
        "results": all_detection_results,
        "cropped_objects": all_cropped_info
    }, f, indent=2)

# Save cropped objects summary as CSV
if all_cropped_info:
    cropped_df = pd.DataFrame(all_cropped_info)
    cropped_summary_file = DETECTION_RESULTS_PATH / "cropped_objects_summary.csv"
    cropped_df.to_csv(cropped_summary_file, index=False)
    print(f"Cropped objects summary saved to: {cropped_summary_file}")

print(f"Detailed results saved to: {results_file}")

Cropped objects summary saved to: /home/mhieu/git/vlsp25/image/private_detected_objects/detection_results/cropped_objects_summary.csv
Detailed results saved to: /home/mhieu/git/vlsp25/image/private_detected_objects/detection_results/detection_results.json
