In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from models import multiscale_model as multiscale
from models import model
from models import track_model
from src.utils import visualize_detection_video, visualize_tracking_video, visualize_detection_image

In [2]:
# Detection checkpoint to load. Alternative checkpoints are kept here for quick switching.
# model_path = "checkpoints/deyolo-2.pt"
# model_path = "checkpoints/IR/yolov8n/best.pt"
model_path = "checkpoints/RGB/yolov8n2/best.pt"

# NOTE(review): machine-specific absolute path — point this at your own copy of the
# video (ideally relative to a configurable data directory) before re-running.
video_path = r"""C:\Users\phamd\Downloads\vipcup-20250711T154912Z-1-001\test_img_video\RGB\V_BIRD_00564.mp4"""

# Strip only the final extension so file names that contain extra dots are kept intact.
video_name = os.path.splitext(os.path.basename(video_path))[0]

# Annotation files for this video are written under this directory.
output_dir = f"annotations/{video_name}"


In [3]:
def _read_video_info(video_path: str) -> tuple:
    """
    Read the frame size and frame count that OpenCV reports for a video.

    Args:
        video_path: Path to the video file

    Returns:
        Tuple (frame_width, frame_height, total_frames) as ints

    Raises:
        OSError: If the video cannot be opened
        ValueError: If the reported frame width or height is not positive
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # Without this check an unreadable video reports 0 frames and the
        # caller would "succeed" without writing a single annotation file.
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Always release the capture handle, even when reading fails.
        cap.release()

    # The frame size is used as a divisor when normalizing boxes.
    if frame_width <= 0 or frame_height <= 0:
        raise ValueError(
            f"Invalid frame size {frame_width}x{frame_height} for video: {video_path}"
        )
    return frame_width, frame_height, total_frames


def _group_tracks_by_frame(tracking_frames: list) -> dict:
    """Group track dicts by their 'frame_idx' value, preserving input order per frame."""
    grouped = {}
    for track in tracking_frames:
        grouped.setdefault(track['frame_idx'], []).append(track)
    return grouped


def _format_yolo_line(
    track: dict,
    frame_width: int,
    frame_height: int,
    include_track_id: bool
) -> str:
    """
    Format one track as a YOLO annotation line (newline-terminated).

    The box in track['bbox'] is unpacked as (x1, y1, x2, y2) and divided by the
    frame size; presumably these are pixel corner coordinates — verify against
    the tracker output.
    """
    x1, y1, x2, y2 = track['bbox']

    # Convert to YOLO format (normalized center_x, center_y, width, height)
    center_x = (x1 + x2) / 2 / frame_width
    center_y = (y1 + y2) / 2 / frame_height
    width = (x2 - x1) / frame_width
    height = (y2 - y1) / frame_height

    class_id = int(track['label'])
    line = f"{class_id} {center_x:.6f} {center_y:.6f} {width:.6f} {height:.6f}"
    if include_track_id:
        line += f" {track['track_id']}"
    return line + "\n"


def _save_tracking_annotations(
    video_path: str,
    tracking_frames: list,
    output_dir: str,
    class_mapping: dict,
    include_track_id: bool
):
    """Shared implementation behind both public save functions."""
    # Read the video first so a bad path fails before anything is created on disk.
    frame_width, frame_height, total_frames = _read_video_info(video_path)

    os.makedirs(output_dir, exist_ok=True)

    frame_annotations = _group_tracks_by_frame(tracking_frames)

    # The container's frame count can be smaller than the number of frames the
    # tracker actually produced results for; extend the range so those
    # annotations are written instead of being silently dropped.
    # NOTE(review): assumes frame_idx is a 0-based integer — confirm against video_track().
    last_tracked_idx = int(max(frame_annotations, default=-1))
    num_frames = max(total_frames, last_tracked_idx + 1)

    # One file per frame; frames without tracks get an empty file.
    for frame_idx in range(num_frames):
        frame_path = os.path.join(output_dir, f"frame_{frame_idx:06d}.txt")
        with open(frame_path, 'w') as f:
            for track in frame_annotations.get(frame_idx, ()):
                f.write(_format_yolo_line(track, frame_width, frame_height, include_track_id))

    id_note = " with tracking IDs" if include_track_id else ""
    print(f"Saved {len(frame_annotations)} frames of annotations{id_note} to {output_dir}")

    # Save class mapping if provided: one class name per line, ordered by class id.
    if class_mapping:
        classes_path = os.path.join(output_dir, "classes.txt")
        with open(classes_path, 'w') as f:
            for _, class_name in sorted(class_mapping.items()):
                f.write(f"{class_name}\n")
        print(f"Saved class mapping to {classes_path}")


def save_tracking_annotations_yolo(
    video_path: str,
    tracking_frames: list,
    output_dir: str,
    class_mapping: dict = None
):
    """
    Save tracking annotations in YOLO txt format
    Format: class_id center_x center_y width height

    One "frame_XXXXXX.txt" file is written per frame; frames without any
    track get an empty file.

    Args:
        video_path: Path to the video file (used for frame size and frame count)
        tracking_frames: List of tracking results from video_track(); each item
            is a dict with 'frame_idx', 'bbox' and 'label' keys
        output_dir: Directory to save the annotation files (created if missing)
        class_mapping: Optional dict mapping class_id to class name; when given,
            the names are written to "classes.txt" ordered by class_id

    Raises:
        OSError: If the video cannot be opened
        ValueError: If the video reports a non-positive frame size
    """
    _save_tracking_annotations(
        video_path, tracking_frames, output_dir, class_mapping, include_track_id=False
    )


def save_tracking_annotations_yolo_with_tracking_id(
    video_path: str,
    tracking_frames: list,
    output_dir: str,
    class_mapping: dict = None
):
    """
    Save tracking annotations in YOLO txt format with tracking IDs
    Format: class_id center_x center_y width height track_id

    One "frame_XXXXXX.txt" file is written per frame; frames without any
    track get an empty file.

    Args:
        video_path: Path to the video file (used for frame size and frame count)
        tracking_frames: List of tracking results from video_track(); each item
            is a dict with 'frame_idx', 'bbox', 'label' and 'track_id' keys
        output_dir: Directory to save the annotation files (created if missing)
        class_mapping: Optional dict mapping class_id to class name; when given,
            the names are written to "classes.txt" ordered by class_id

    Raises:
        OSError: If the video cannot be opened
        ValueError: If the video reports a non-positive frame size
    """
    _save_tracking_annotations(
        video_path, tracking_frames, output_dir, class_mapping, include_track_id=True
    )

In [4]:

# Class names written to classes.txt, keyed by class id.
# NOTE(review): placeholder names — confirm they match the classes of the loaded checkpoint.
class_mapping = {
    0: "BIRD",
    1: "DRONE",
}

# Detector settings, named so they are easy to find and tune.
CONF_THRESHOLD = 0.2
IOU_THRESHOLD = 0.1
DEVICE = "cuda"  # presumably requires a CUDA-capable GPU — change if running on CPU

# False: standard YOLO lines (class_id cx cy w h) written to `output_dir`.
# True:  lines with a trailing track_id written to `<output_dir>_with_ids`.
SAVE_WITH_TRACK_IDS = False

# Build the detector, wrap it in the tracker, and run tracking over the video.
multiscale_model = multiscale.DetectionModel(
    model_path,
    conf_threshold=CONF_THRESHOLD,
    iou_threshold=IOU_THRESHOLD,
    device=DEVICE,
)

multiscale_track_model = track_model.TrackingModel(multiscale_model)
multiscale_track_outputs = multiscale_track_model.video_track(video_path)

# Save the tracking results as per-frame YOLO annotation files.
if SAVE_WITH_TRACK_IDS:
    save_tracking_annotations_yolo_with_tracking_id(
        video_path=video_path,
        tracking_frames=multiscale_track_outputs,
        output_dir=f"{output_dir}_with_ids",
        class_mapping=class_mapping
    )
else:
    save_tracking_annotations_yolo(
        video_path=video_path,
        tracking_frames=multiscale_track_outputs,
        output_dir=output_dir,
        class_mapping=class_mapping
    )

Frame 0: Detected 4 objects
Frame 1: Detected 4 objects
Frame 1: Active: 3, Lost: 0, Recovered: 0, Valid: 0
Frame 2: Detected 5 objects
Frame 2: Active: 3, Lost: 0, Recovered: 0, Valid: 0
Frame 3: Detected 4 objects
Frame 3: Active: 3, Lost: 0, Recovered: 0, Valid: 0
Frame 4: Detected 4 objects
Frame 4: Active: 3, Lost: 0, Recovered: 0, Valid: 0
Frame 5: Detected 5 objects
Frame 5: Active: 3, Lost: 0, Recovered: 0, Valid: 0
Frame 6: Detected 4 objects
Frame 6: Active: 3, Lost: 0, Recovered: 0, Valid: 0
Frame 7: Detected 3 objects
Frame 7: Active: 2, Lost: 1, Recovered: 1, Valid: 1
Frame 8: Detected 4 objects
Frame 8: Active: 2, Lost: 1, Recovered: 1, Valid: 1
Frame 9: Detected 4 objects
Frame 9: Active: 1, Lost: 2, Recovered: 1, Valid: 1
Frame 10: Detected 4 objects
Frame 10: Active: 2, Lost: 0, Recovered: 1, Valid: 0
Frame 11: Detected 4 objects
Frame 11: Active: 2, Lost: 0, Recovered: 0, Valid: 0
Frame 12: Detected 4 objects
Frame 12: Active: 2, Lost: 0, Recovered: 0, Valid: 0
Frame 