## YOLO Object Detection on MP4 Videos

This Python code performs object detection on mp4 videos using the YOLO object detection vision model. The model predicts objects in each frame of the video and draws bounding boxes around them. The output is a new mp4 video with the bounding boxes embedded in the video.

This implementation takes each frame at the input video resolution and 'slices' it into smaller image squares (e.g., 640x640 pixels) on which the predictions are run.

User inputs include the video path, the output path, the model to use, the classes to detect, and the confidence threshold.

The model used is the YOLOv8 model that has been fine-tuned on the [WALDO dataset](https://huggingface.co/StephanST/WALDO30). The dataset itself is not public, but the weights of this fine-tuned model are available on Hugging Face. WALDO has been trained to identify 12 different object classes: 0 = light vehicle; 1 = person; 2 = building; 3 = utility pole; 4 = boat; 5 = bike; 6 = container; 7 = truck; 8 = gastank; 10 = digger (construction equipment); 11 = solar panels; 12 = bus.

The WALDO fine-tuned model is available on Hugging Face [here](https://huggingface.co/StephanST/WALDO30/resolve/main/WALDO30_yolov8m_640x640.pt?download=true).



In [None]:
import cv2
import sys
from sahi.auto_model import AutoDetectionModel
from sahi.predict import get_sliced_prediction
import supervision as sv
import numpy as np

In [None]:
## User defined parameters

# Where to read the source video, where to write the annotated copy,
# and which fine-tuned YOLOv8 weights file to load.
input_video_path = '/home/jgillan/Documents/yolo_drone/2_italians.mp4'
output_video_path = '/home/jgillan/Documents/yolo_drone/2_italians_predict4.mp4'
model_path = '/home/jgillan/Documents/yolo_drone/WALDO30_yolov8m_640x640.pt'

# Class ids to keep in the output (e.g., 0 = vehicle, 1 = person);
# predictions for any other class id are discarded.
TARGET_CLASSES = [0, 1]

# Confidence threshold handed to the detection model.
confidence_threshold = 0.5

# Sliced-inference tiling: square 640-pixel tiles with a 10% overlap
# between neighbouring tiles in both directions.
slice_height = slice_width = 640
overlap_height_ratio = overlap_width_ratio = 0.1

In [None]:
### Runs the prediction and outputs a new mp4 video
#
# Reads `input_video_path` frame by frame, runs SAHI sliced inference with the
# YOLOv8 model on each frame, keeps only predictions whose class id is in
# `TARGET_CLASSES`, draws corner boxes and "<class name> <confidence>" labels,
# and writes every frame (annotated or not) to `output_video_path`.
#
# Raises OSError if the input video or the output writer cannot be opened.
# The capture and writer are always released, even if inference fails midway.

# Initialize the YOLOv8 model
detection_model = AutoDetectionModel.from_pretrained(
    model_type='yolov8',
    model_path=model_path,
    confidence_threshold=confidence_threshold,
    device='cuda'  # or 'cpu'
)

# Build the class filter once so each prediction is checked with a set lookup
# instead of re-scanning the TARGET_CLASSES list.
target_class_ids = set(TARGET_CLASSES)

# Create bounding box and label annotators
# (alternative box style: sv.BoundingBoxAnnotator(thickness=1))
box_annotator = sv.BoxCornerAnnotator(thickness=2)
label_annotator = sv.LabelAnnotator(text_scale=0.5, text_thickness=2)

# Open input video
cap = cv2.VideoCapture(input_video_path)
out = None
frame_count = 0

try:
    # Fail fast: without this check the frame loop below would never run and
    # the script would still report that inference completed.
    if not cap.isOpened():
        raise OSError(f"Could not open input video: {input_video_path}")

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

    # Set up output video writer with the same size and frame rate as the input
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
    if not out.isOpened():
        raise OSError(f"Could not open output video for writing: {output_video_path}")

    # Process each frame
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No more frames (or the read failed): stop processing
            break

        # Perform sliced inference on the current frame using SAHI
        result = get_sliced_prediction(
            image=frame,
            detection_model=detection_model,
            slice_height=slice_height,
            slice_width=slice_width,
            overlap_height_ratio=overlap_height_ratio,
            overlap_width_ratio=overlap_width_ratio
        )

        # Keep only the predictions for the requested classes
        object_predictions = [
            pred for pred in result.object_prediction_list
            if pred.category.id in target_class_ids
        ]

        # With no detections the original frame is written unchanged
        annotated_frame = frame

        if object_predictions:
            # Collect boxes as [x1, y1, x2, y2], scores and class ids as arrays
            xyxy = np.array(
                [pred.bbox.to_xyxy() for pred in object_predictions],
                dtype=np.float32,
            )
            confidences = np.array(
                [pred.score.value for pred in object_predictions],
                dtype=np.float32,
            )
            class_ids = np.array(
                [pred.category.id for pred in object_predictions],
                dtype=int,
            )

            # Create sv.Detections object
            detections = sv.Detections(
                xyxy=xyxy,
                confidence=confidences,
                class_id=class_ids
            )

            # Prepare labels for label annotator
            labels = [
                f"{pred.category.name} {confidence:.2f}"
                for pred, confidence in zip(object_predictions, confidences)
            ]

            # Annotate a copy of the frame with the detection results
            annotated_frame = box_annotator.annotate(scene=frame.copy(), detections=detections)
            annotated_frame = label_annotator.annotate(
                scene=annotated_frame, detections=detections, labels=labels
            )

        # Write the (possibly annotated) frame to the output video
        out.write(annotated_frame)

        frame_count += 1
        print(f"Processed frame {frame_count}", end='\r')
finally:
    # Release resources even if inference raised partway through the video
    cap.release()
    if out is not None:
        out.release()

print("\nInference complete. Video saved at", output_video_path)