In [None]:
# Importing CNN model
try:
    from ultralytics import YOLO
except:
    %pip install -q ultralytics

## 0. Imports and agnostic code

In [None]:
# import torch
import cv2 as cv
import numpy as np
from pathlib import Path
import time
# device = "cpu"
# device

'cpu'

## 1. Object detection using YOLO11

In [None]:
model = YOLO("yolo11n.pt")

# Train the model using the 'coco8.yaml' dataset for 3 epochs
# results = model.train(data="coco.yaml", epochs=3)

# Evaluate the model's performance on the validation set
# results = model.val()

# Export the model to ONNX format
# success = model.export(format="onnx")

img_path = Path("pics/bus.jpg")
results = model(img_path)
results[0].show()


image 1/1 c:\Users\jayat\projects\object identification with cnn\pics\bus.jpg: 640x480 4 persons, 1 bus, 563.3ms
Speed: 18.4ms preprocess, 563.3ms inference, 53.4ms postprocess per image at shape (1, 3, 640, 480)


## 2. Function to check stable objects

In [4]:
# Global trackers
seen_objects = {}      # {label: first_seen_time}
stable_objects = set() # objects confirmed for 30s

def update_object_tracking(detected_labels, seen_objects, stable_objects, threshold=30):
    """
    Update seen_objects and stable_objects based on detected_labels.
    
    Args:
        detected_labels (list): list of class labels detected in current frame
        seen_objects (dict): {label: first_seen_time}
        stable_objects (set): set of labels confirmed after threshold seconds
        threshold (int): time in seconds before confirming object as stable
    
    Returns:
        tuple: (seen_objects, stable_objects)
    """
    current_time = time.time()

    # Track newly seen or persistent objects
    for label in detected_labels:
        if label not in seen_objects:
            seen_objects[label] = current_time
        else:
            if (current_time - seen_objects[label] > threshold) and (label not in stable_objects):
                stable_objects.add(label)

    # Reset objects that disappeared
    for label in list(seen_objects.keys()):
        if label not in detected_labels:
            seen_objects.pop(label, None)
            stable_objects.discard(label)

    return seen_objects, stable_objects


## 3. TTS function

In [5]:
import pyttsx3

def speak_objects(objects):
    """
    Convert a set of object labels into speech.
    Args:
        objects (set): set of labels (strings)
    """
    if not objects:
        return
    engine = pyttsx3.init()
    for obj in objects:
        engine.say(f"I see a {obj}")
    engine.runAndWait()


## 4. Final loop

In [6]:
cap = cv.VideoCapture(0)
if not cap.isOpened():
    print("Cannot open camera")
    exit()

while True:
    ret, frame = cap.read()
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break

    # Run YOLO detection
    results = model(frame)
    boxes = results[0].boxes
    detected_labels = []

    if boxes is not None:
        for c in boxes.cls:
            label = model.names[int(c)]
            detected_labels.append(label)

    # Update trackers
    seen_objects, stable_objects = update_object_tracking(
        detected_labels, seen_objects, stable_objects, threshold=30
    )

    # Show annotated frame
    annotated_frame = results[0].plot()
    cv.imshow("YOLO Live Feed", annotated_frame)

    if cv.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv.destroyAllWindows()

# Print confirmed objects at the end
print("Stable objects (seen for 30s):", stable_objects)


0: 480x640 (no detections), 231.9ms
Speed: 7.2ms preprocess, 231.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 246.7ms
Speed: 9.3ms preprocess, 246.7ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 286.9ms
Speed: 3.7ms preprocess, 286.9ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 329.7ms
Speed: 2.8ms preprocess, 329.7ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 320.9ms
Speed: 3.6ms preprocess, 320.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 291.9ms
Speed: 3.3ms preprocess, 291.9ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 212.5ms
Speed: 4.0ms preprocess, 212.5ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 215.2ms
Speed: 2.7ms prepr

In [13]:
results

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: None
 names: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted p

In [7]:
results[0].boxes

ultralytics.engine.results.Boxes object with attributes:

cls: tensor([0., 0.])
conf: tensor([0.9008, 0.7908])
data: tensor([[ 55.0413,  74.9824, 575.7079, 480.0000,   0.9008,   0.0000],
        [548.1929, 245.7629, 639.4830, 479.2880,   0.7908,   0.0000]])
id: None
is_track: False
orig_shape: (480, 640)
shape: torch.Size([2, 6])
xywh: tensor([[315.3746, 277.4912, 520.6667, 405.0176],
        [593.8380, 362.5255,  91.2902, 233.5250]])
xywhn: tensor([[0.4928, 0.5781, 0.8135, 0.8438],
        [0.9279, 0.7553, 0.1426, 0.4865]])
xyxy: tensor([[ 55.0413,  74.9824, 575.7079, 480.0000],
        [548.1929, 245.7629, 639.4830, 479.2880]])
xyxyn: tensor([[0.0860, 0.1562, 0.8995, 1.0000],
        [0.8566, 0.5120, 0.9992, 0.9985]])

In [8]:
stable_objects, seen_objects

(set(), {'person': 1760593812.7940629})