In [1]:
import cv2
import os
import supervision as sv

from ultralytics import YOLO
from pathlib import Path

# Configuration
CONFIG = {
    "VIDEO_PATH": "../data/videos/input/badminton_test.mp4",  # Path to the input video
    "MODEL_PATH": "../weights/badminton_best.pt",  # Path to the YOLO model weights,
    "JSON_OUTPUT_PATH": "../data/detections", # Path to the output json 
    "CONFIDENCE_THRESHOLD": 0.5,  # Confidence threshold for detections
    "DEVICE": "cuda:0",  # Device to run the model on (e.g., 'cuda:0' for GPU or 'cpu' for CPU)
    "DISPLAY_RESOLUTION": (1280, 720),  # Resolution for displaying the annotated video
    "FPS_MONITOR_ENABLED": True,  # Enable or disable FPS monitoring
}

# Load YOLO model
model = YOLO(CONFIG["MODEL_PATH"], task='detect')

# Initialize FPS monitor
fps_monitor = sv.FPSMonitor()

# Initialize JSON sink with the new file path
video_file_name = Path(CONFIG["VIDEO_PATH"]).stem
json_output_file = os.path.join(CONFIG["JSON_OUTPUT_PATH"], f"{video_file_name}.json")
json_sink = sv.JSONSink(json_output_file)

# Initialize tracker and annotators
tracker = sv.ByteTrack()
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

# Get video frame generator
frame_generator = sv.get_video_frames_generator(source_path=CONFIG["VIDEO_PATH"])

# Process and display each frame of the video
with json_sink as sink:
    for frame in frame_generator:
        # Perform prediction with YOLO model
        results = model.predict(frame, conf=CONFIG["CONFIDENCE_THRESHOLD"], device=CONFIG["DEVICE"])[0]
        detections = sv.Detections.from_ultralytics(results)

        # Update detections with tracker
        detections = tracker.update_with_detections(detections)

        # Save information of model in json 
        sink.append(detections, custom_data={"<YOUR_LABEL>":"<YOUR_DATA>"})

        # Annotate the frame with bounding boxes and labels
        annotated_image = box_annotator.annotate(scene=frame.copy(), detections=detections)

        annotated_image = label_annotator.annotate(
            scene=annotated_image,
            detections=detections
        )
        # Generate FPS and annotate the frame
        if CONFIG["FPS_MONITOR_ENABLED"]:
            fps_monitor.tick()
            fps = fps_monitor.fps
            cv2.putText(annotated_image, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Resize the annotated image for display
        annotated_image = cv2.resize(annotated_image, CONFIG["DISPLAY_RESOLUTION"])

        # Display the annotated frame in a window
        cv2.imshow("Annotated Video", annotated_image)

        # Exit the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Release resources and close windows
cv2.destroyAllWindows()


0: 384x640 2 persons, 66.6ms
Speed: 3.8ms preprocess, 66.6ms inference, 76.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 15.9ms
Speed: 4.4ms preprocess, 15.9ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 11.2ms
Speed: 2.2ms preprocess, 11.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 20.9ms
Speed: 2.5ms preprocess, 20.9ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 11.9ms
Speed: 2.0ms preprocess, 11.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 20.5ms
Speed: 2.8ms preprocess, 20.5ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 11.7ms
Speed: 2.0ms preprocess, 11.7ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 17.8ms
Speed: 2.4ms preprocess, 17.8ms inference, 2.1ms postprocess per image at shape 