In [None]:
from ultralytics import YOLO
import torch
import cv2
import cvzone
import math
import numpy as np

# Load the YOLOv8-nano weights from the local weights directory.
model = YOLO("yolo-weights/yolov8n.pt")
# Optional: run the model on Apple's MPS backend when it is available.
# device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
# model.to(device)

# Frame source: a video file. For a webcam use cv2.VideoCapture(0) instead.
cap = cv2.VideoCapture("resources/yolo/bikes.mp4")
# Request a 640x480 frame size (property ids 3 and 4 are width and height).
# NOTE(review): confirm these requests have any effect for a file source.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

def resize_image(img, new_size):
    """Return ``img`` resized to ``new_size`` using ``cv2.resize``.

    ``new_size`` is forwarded unchanged as the target size; OpenCV reads it
    as ``(width, height)``. This function does no rounding of its own, so a
    caller that needs dimensions divisible by 32 must pass such a size.
    """
    resized = cv2.resize(img, new_size)
    return resized

def preprocess_image(img):
    """Convert an image array into a scaled, batched float tensor.

    Steps, in order: resize to 640x640 via ``resize_image``, wrap the array
    in a tensor, move the last axis to the front with ``permute(2, 0, 1)``,
    cast to float and divide by 255.0, then prepend a batch axis of size 1.

    NOTE(review): assumes ``img`` is a 3-axis, channels-last NumPy array
    (e.g. a frame from ``cv2``) -- confirm against callers.
    """
    resized = resize_image(img, (640, 640))

    # Channels-last -> channels-first.
    channels_first = torch.from_numpy(resized).permute(2, 0, 1)

    # Float conversion and division by 255.
    scaled = channels_first.float() / 255.0

    # Leading batch axis.
    return scaled.unsqueeze(0)


# Display labels, looked up by the integer class id taken from ``box.cls``.
# The order therefore matters: position N must hold the label for class id N.
# NOTE(review): presumably the 80 classes the loaded weights were trained on
# -- confirm against the model's own name table.
classNames = [
    # 0-7
    "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
    # 8-15
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    # 16-23
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    # 24-31
    "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    # 32-39
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    # 40-47
    "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    # 48-55
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    # 56-63
    "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop",
    # 64-71
    "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    # 72-79
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

# Detect objects frame by frame and show the annotated frames in a window.
# The loop stops when the capture yields no more frames or the user presses
# 'q'; the capture and the window are released on every exit path.
try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame was returned (end of the video or a read failure), so
            # ``img`` is not usable; passing it on would crash below.
            break

        # stream=True makes the model return results lazily.
        for r in model(img, stream=True):
            for box in r.boxes:
                # Corner coordinates of the box, as integer pixels.
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                w, h = x2 - x1, y2 - y1
                # cvzone takes (x, y, width, height). The plain-OpenCV
                # alternative is
                # cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3).
                cvzone.cornerRect(img, (x1, y1, w, h))

                # Confidence, rounded up to two decimal places.
                conf = math.ceil(box.conf[0] * 100) / 100

                # Class id -> display label.
                cls = int(box.cls[0])

                # Clamp the label origin so it is not placed at negative x
                # or above y=35.
                cvzone.putTextRect(img, f'{classNames[cls]} {conf}',
                                   (max(0, x1), max(35, y1)),
                                   scale=1, thickness=1)

        cv2.imshow("Image", img)
        # waitKey(1) lets the window refresh; 'q' ends the loop.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()






0: 640x640 13 persons, 4 bicycles, 158.6ms
Speed: 0.0ms preprocess, 158.6ms inference, 5.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 persons, 5 bicycles, 1 handbag, 180.5ms
Speed: 0.0ms preprocess, 180.5ms inference, 6.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 11 persons, 5 bicycles, 1 handbag, 163.3ms
Speed: 0.0ms preprocess, 163.3ms inference, 5.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 11 persons, 3 bicycles, 1 handbag, 165.0ms
Speed: 0.0ms preprocess, 165.0ms inference, 5.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 10 persons, 3 bicycles, 163.7ms
Speed: 0.0ms preprocess, 163.7ms inference, 5.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 persons, 4 bicycles, 1 handbag, 165.3ms
Speed: 0.0ms preprocess, 165.3ms inference, 5.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 13 persons, 5 bicycles, 156.5ms
Speed: 0.0ms preprocess, 156.5ms inference, 5.4ms postproces