### Import Libraries and Load Models

In [17]:
try:
    from ultralytics import YOLO
except ImportError:
    ! pip install ultralytics
    from ultralytics import YOLO

In [18]:
import cv2
import torch
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from ultralytics import YOLO
import torch.nn.functional as F
import matplotlib.pyplot as plt
import os
import pyttsx3
import queue
import threading
import speech_recognition as sr
from queue import Queue
import speech_recognition as sr
from pytesseract import image_to_string
from pytesseract import pytesseract



In [19]:
# Object detector used by detect_and_estimate_depth and display_results,
# created from the 'yolov5su.pt' weights file.
yolo_model = YOLO('yolov5su.pt')

### Load MiDaS Model for Depth Estimation

In [20]:
# Linear mapping applied to the MiDaS output in detect_and_estimate_depth:
#     estimated_distance = depth * CALIBRATION_FACTOR + OFFSET
# Calibration factor (experimentally determined)
CALIBRATION_FACTOR = 0.03 # Adjust as needed
OFFSET = 0.02  # constant term added after scaling


In [22]:
! pip install timm


Collecting timm
  Downloading timm-1.0.15-py3-none-any.whl (2.4 MB)
     ---------------------------------------- 2.4/2.4 MB 78.1 kB/s eta 0:00:00
Collecting safetensors
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
     ------------------------------------ 308.9/308.9 kB 147.0 kB/s eta 0:00:00
Collecting huggingface_hub
  Downloading huggingface_hub-0.29.2-py3-none-any.whl (468 kB)
     ------------------------------------ 468.1/468.1 kB 234.4 kB/s eta 0:00:00
Installing collected packages: safetensors, huggingface_hub, timm
Successfully installed huggingface_hub-0.29.2 safetensors-0.5.3 timm-1.0.15


In [23]:
def load_midas_model():
    """Load the MiDaS "DPT_Large" depth model and its matching input transform.

    Both objects are fetched through ``torch.hub`` from the ``intel-isl/MiDaS``
    repository.

    Returns:
        tuple: ``(midas, transform)`` - the model and the preprocessing
        callable to apply to an RGB image before feeding it to the model.
    """
    model_type = "DPT_Large"
    # Path where the checkpoint is expected once torch.hub has cached it.
    model_cache_dir = os.path.expanduser("~/.cache/torch/hub/checkpoints")
    model_filename = "dpt_large_384.pt"
    model_filepath = os.path.join(model_cache_dir, model_filename)

    # The check only selects the status message: the load call is the same
    # either way (the original passed force_reload=False explicitly in one
    # branch, which is torch.hub.load's default).
    if os.path.exists(model_filepath):
        print("Loading model from cache...")
    else:
        print("Downloading model...")
    midas = torch.hub.load("intel-isl/MiDaS", model_type)

    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    # DPT models must be paired with `dpt_transform`; `default_transform`
    # belongs to the older non-DPT large model and preprocesses differently.
    # NOTE(review): switching transforms changes the raw depth values, so
    # CALIBRATION_FACTOR / OFFSET may need to be re-tuned.
    if model_type in ("DPT_Large", "DPT_Hybrid"):
        transform = midas_transforms.dpt_transform
    else:
        transform = midas_transforms.small_transform

    return midas, transform

# Load the model/transform pair once, when this cell is executed.
midas_model, midas_transform = load_midas_model()
# Prefer CUDA when torch reports it as available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device and switch it to eval mode. As the last
# expression of the cell, its value (the model) is what the notebook displays.
midas_model.to(device).eval()

Loading model from cache...


Using cache found in C:\Users\AYUSHI/.cache\torch\hub\intel-isl_MiDaS_master
  from .autonotebook import tqdm as notebook_tqdm
Using cache found in C:\Users\AYUSHI/.cache\torch\hub\intel-isl_MiDaS_master


DPTDepthModel(
  (pretrained): Module(
    (model): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_featur

In [24]:
# Set up OCR (Tesseract configuration).
# The executable location can be overridden with the TESSERACT_CMD environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original Windows install path is used unchanged.
pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
)

## Helper Functions for Object Detection and Depth Estimation


In [26]:
def detect_and_estimate_depth(frame):
    """Run YOLO detection and MiDaS depth estimation on one frame.

    Args:
        frame: image passed to ``yolo_model`` and converted with
            ``cv2.COLOR_BGR2RGB`` before depth estimation (so it is treated
            as BGR).

    Returns:
        tuple: ``(results, estimated_distance_map)`` where ``results`` is
        whatever ``yolo_model(frame)`` returns and the map is the MiDaS
        prediction, resized to the frame's height/width, after applying
        ``depth * CALIBRATION_FACTOR + OFFSET``.
    """
    # Object detection.
    detections = yolo_model(frame)

    # Depth estimation: MiDaS is fed the RGB version of the frame.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    target_size = rgb.shape[:2]
    batch = midas_transform(rgb).to(device)

    with torch.no_grad():
        raw_prediction = midas_model(batch)
        # Resize the prediction back to the frame's height/width.
        resized = F.interpolate(
            raw_prediction.unsqueeze(1),
            size=target_size,
            mode="bicubic",
            align_corners=False,
        )
        resized = resized.squeeze()
    depth_map = resized.cpu().numpy()

    # NOTE(review): MiDaS output is usually described as relative inverse
    # depth - confirm that this linear map is the intended conversion.
    return detections, (depth_map * CALIBRATION_FACTOR) + OFFSET

 ### Define Direction and Display Helper Functions

In [27]:
# Simple function to estimate the real distance based on known width and detected width in pixels
def calibrate_depth(known_distance, known_width, depth_map, x1, y1, x2, y2):
    """Scale the mean depth inside a bounding box by a width-based factor.

    Args:
        known_distance: accepted for interface compatibility; the current
            computation does not use it.
        known_width: reference width of the object; divided by the box width
            in pixels to obtain the scale.
        depth_map: 2-D array indexed as ``depth_map[y, x]``.
        x1, y1, x2, y2: bounding-box corners in pixel coordinates.

    Returns:
        Mean of ``depth_map[y1:y2, x1:x2]`` multiplied by
        ``known_width / (x2 - x1)``.
    """
    # Average depth over the box area.
    mean_depth = np.mean(depth_map[y1:y2, x1:x2])

    # Width of the detection in pixels, and the scale derived from it.
    pixel_width = x2 - x1
    width_scale = known_width / pixel_width

    return mean_depth * width_scale

In [28]:
# Text-to-speech engine shared by the notebook; speech_worker drives it
# through engine.say() / engine.runAndWait().
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # You can adjust the speaking speed
engine.setProperty('volume', 1)  # presumably full volume - TODO confirm against pyttsx3 docs

In [30]:

# Shared state for the speech / detection threads.
engine_lock = threading.Lock()  # held by speech_worker around engine.say()/runAndWait()
speech_queue = queue.Queue()  # text waiting to be spoken; None tells speech_worker to exit
speech_thread = None  # set by start_speech_thread() to the running worker thread
stop_flag = threading.Event()  # set to make the live_detection loop exit
speaking = threading.Event()  # NOTE(review): not referenced in the visible cells - confirm it is still needed
live_detection_active = threading.Event()  # NOTE(review): not referenced in the visible cells - confirm it is still needed
voice_queue = queue.Queue()  # NOTE(review): not referenced in the visible cells - confirm it is still needed

In [31]:
def speech_worker():
    """Consume ``speech_queue`` and speak each item until a ``None`` sentinel arrives.

    Every item taken from the queue - including the ``None`` sentinel and
    items whose playback raised - is acknowledged with ``task_done()``, so a
    caller waiting on ``speech_queue.join()`` is never left blocked.
    """
    while True:
        speech_text = speech_queue.get()
        try:
            if speech_text is None:
                break  # Exit the thread if None is received
            with engine_lock:  # Ensure thread safety for the speech engine
                engine.say(speech_text)
                engine.runAndWait()  # This will block until the speech is finished
        except RuntimeError as e:
            print(f"Speech engine error: {e}")
        finally:
            # Runs on success, on error and on the sentinel `break` alike.
            speech_queue.task_done()

def speak(text):
    """Queue a text-to-speech request.

    ``None`` and blank strings are ignored: ``None`` is the sentinel that
    makes ``speech_worker`` exit, so queuing it here would silently shut the
    speech thread down, and blank text has nothing to say.

    Args:
        text: the sentence to be spoken by the speech worker.
    """
    if text is None:
        return
    if isinstance(text, str) and not text.strip():
        return
    speech_queue.put(text)  # Queue the text for speech

def start_speech_thread():
    """Launch the daemon thread running ``speech_worker`` unless one is already alive.

    The thread object is stored in the module-level ``speech_thread``.
    """
    global speech_thread

    # Nothing to do when a worker is already running.
    if speech_thread is not None and speech_thread.is_alive():
        return

    speech_thread = threading.Thread(target=speech_worker, daemon=True)
    speech_thread.start()

In [32]:
def get_direction(x_center, frame_width):
    """Map a horizontal position to "right", "center" or "left".

    The frame is split into thirds and the outer labels are mirrored: the
    camera's left third is reported as "right" and its right third as "left".

    Args:
        x_center: x coordinate of the object's centre, in pixels.
        frame_width: width of the frame, in pixels.

    Returns:
        str: "right", "left" or "center".
    """
    first_third = frame_width / 3
    second_third = 2 * frame_width / 3

    if x_center < first_third:
        return "right"  # Camera's left is the person's right
    if x_center > second_third:
        return "left"  # Camera's right is the person's left
    return "center"

In [33]:
# Function to extract text using OCR within a specific region (bounding box)
def extract_text_from_region(frame, x1, y1, x2, y2):
    """Run Tesseract OCR on the part of ``frame`` inside a bounding box.

    Args:
        frame: image array indexed as ``frame[y, x]``; converted with
            ``cv2.COLOR_BGR2GRAY``, so it is treated as BGR.
        x1, y1, x2, y2: bounding-box corners in pixel coordinates.

    Returns:
        str: the recognised text with surrounding whitespace stripped, or
        ``""`` when the box selects no pixels.
    """
    # A negative start index would wrap around to the other side of the
    # array instead of clipping at the border.
    x1, y1 = max(x1, 0), max(y1, 0)

    # Crop the region from the frame based on the bounding box
    cropped_region = frame[y1:y2, x1:x2]
    if cropped_region.size == 0:
        # cv2.cvtColor raises on an empty image; there is no text to read.
        return ""

    # Convert the cropped region to grayscale for better OCR accuracy
    gray_region = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2GRAY)

    # Use Tesseract to extract text from the region
    ocr_text = pytesseract.image_to_string(gray_region)
    return ocr_text.strip()

### Live Detection with Stop Key Functionality

In [34]:
def display_results(frame, results, depth_map):
    """Annotate ``frame`` with detections and announce them.

    For every detection with confidence >= 0.5 this draws the box, label,
    distance and direction on ``frame`` (in place), runs OCR on the box
    region, prints a summary and queues spoken announcements via ``speak``.

    Args:
        frame: image to annotate; modified in place.
        results: iterable of detection results exposing ``boxes.xyxy``,
            ``boxes.conf`` and ``boxes.cls``.
        depth_map: 2-D array indexed as ``depth_map[y, x]``; its mean inside
            each box is reported as the distance.

    Returns:
        The same ``frame`` object, annotated.
    """
    frame_width = frame.shape[1]
    # OCR must read the camera's pixels, not the rectangles and captions this
    # function draws, so it works on a pristine copy of the frame.
    clean_frame = frame.copy()
    output_lines = []  # Store detailed output for console/logging
    announcements = []  # Store announcements for audio output

    for result in results:
        boxes = result.boxes.xyxy
        confs = result.boxes.conf
        classes = result.boxes.cls

        for i in range(len(boxes)):
            conf = confs[i].item()  # Confidence score
            if conf < 0.5:  # Skip low-confidence detections
                continue

            x1, y1, x2, y2 = map(int, boxes[i])  # Bounding box coordinates
            # Negative starts would wrap around in array slicing.
            x1, y1 = max(x1, 0), max(y1, 0)

            region = depth_map[y1:y2, x1:x2]
            if region.size == 0:
                # Degenerate box: the mean would be NaN and OCR would fail.
                continue

            label = yolo_model.names[int(classes[i])]  # Object name

            # Calculate depth and direction
            distance = region.mean()  # Average depth in the region
            x_center = (x1 + x2) / 2  # Center of the bounding box
            direction = get_direction(x_center, frame_width)

            # Format output line for detected object
            output_lines.append(
                f"Object: {label} | Confidence: {conf:.2f} | "
                f"Distance: {distance:.2f} meters | Direction: {direction}"
            )

            # Format audio announcement for detected object
            announcements.append(
                f"{label} detected at {distance:.2f} meters to the {direction} with a confidence score of {conf:.2f}"
            )

            # Annotate the frame with detection info
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)  # Draw bounding box
            cv2.putText(
                frame, f"{label} {conf:.2f}", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2
            )
            cv2.putText(
                frame, f"{distance:.2f} m, {direction}", (x1, y2 + 20),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2
            )

            # Use OCR to detect any text inside the bounding box region
            ocr_text = extract_text_from_region(clean_frame, x1, y1, x2, y2)
            if ocr_text:  # If OCR detects any text
                output_lines.append(f"OCR detected in {label}: '{ocr_text}'")
                announcements.append(f"OCR detected the following text in {label}: {ocr_text}")

    # Print all collected detections for the current frame
    if output_lines:
        print("\n--- Current Frame Detections ---")
        for line in output_lines:
            print(line)
        print("--- End of Detections ---\n")

    # Speak all collected announcements
    for announcement in announcements:
        speak(announcement)

    return frame


In [35]:
def live_detection():
    """Run detection on webcam frames until ``stop_flag`` is set or 'q' is pressed.

    Each frame goes through ``detect_and_estimate_depth`` and
    ``display_results`` and is shown in an OpenCV window. The capture device
    and the window are released even when processing a frame raises; the
    exception itself still propagates to the caller.
    """
    cap = cv2.VideoCapture(0)  # Use webcam for live detection
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    print("Starting live detection. Say 'stop the live detection' to quit.")

    try:
        while not stop_flag.is_set():  # Check stop_flag
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            # Detect objects and estimate depth dynamically
            results, depth_map = detect_and_estimate_depth(frame)

            # Annotate and display detected objects with their information
            annotated_frame = display_results(frame, results, depth_map)

            # Show the frame with annotations
            cv2.imshow("Live Object Detection", annotated_frame)

            # Press 'q' to break the loop and stop
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always give the camera back, otherwise a failed run leaves it
        # locked and the next "go live" cannot open it.
        cap.release()
        cv2.destroyAllWindows()
        print("Live detection stopped.")


### Execution Prompt

In [36]:
def listen_for_commands():
    """Listen on the microphone and react to spoken commands.

    Recognised phrases (matched as substrings of the lower-cased transcript):

    * "go live" - clears ``stop_flag`` and starts ``live_detection`` in a
      daemon thread, unless a detection thread started here is still running.
    * "stop the live detection" - sets ``stop_flag`` and returns.

    Unintelligible audio and other errors are reported and listening resumes.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    # Tracks the detection thread so a repeated "go live" cannot start a
    # second loop competing for the same webcam.
    detection_thread = None

    while True:
        try:
            with mic as source:
                print("Listening for commands...")
                audio = recognizer.listen(source)
                command = recognizer.recognize_google(audio).lower().strip()

                if "go live" in command:
                    if detection_thread is not None and detection_thread.is_alive():
                        print("Live detection is already running.")
                        continue
                    stop_flag.clear()
                    print("Starting live detection...")
                    detection_thread = threading.Thread(target=live_detection, daemon=True)
                    detection_thread.start()

                elif "stop the live detection" in command:
                    stop_flag.set()
                    print("Stopping live detection...")
                    break

        except sr.UnknownValueError:
            print("Could not understand the command.")
        except Exception as e:
            print(f"Error: {e}")

In [37]:
# Check if the speech_thread is not None and is alive before calling join()
if speech_thread and speech_thread.is_alive():
    speech_thread.join()  # Wait for the thread to finish
else:
    print("Speech thread is not running.")

Speech thread is not running.


In [39]:
! pip install pyaudio


Collecting pyaudio
  Downloading PyAudio-0.2.14-cp310-cp310-win_amd64.whl (164 kB)
     ------------------------------------- 164.1/164.1 kB 33.6 kB/s eta 0:00:00
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.14


In [None]:
if __name__ == "__main__":
    # The TTS worker has to be up before anything can be announced.
    start_speech_thread()

    # Run the voice-command loop in its own daemon thread and block here
    # until it returns.
    command_thread = threading.Thread(target=listen_for_commands, daemon=True)
    command_thread.start()
    command_thread.join()

    # Shut the speech worker down: None is the exit signal it waits for.
    speech_worker_running = speech_thread is not None and speech_thread.is_alive()
    if speech_worker_running:
        speech_queue.put(None)
        speech_thread.join()

    print("Program terminated.")

Listening for commands...
Listening for commands...
Listening for commands...
Starting live detection...
Listening for commands...
Starting live detection. Say 'stop the live detection' to quit.

0: 480x640 (no detections), 284.6ms
Speed: 56.0ms preprocess, 284.6ms inference, 11.4ms postprocess per image at shape (1, 3, 480, 640)
Could not understand the command.
Listening for commands...

0: 480x640 1 person, 1 cat, 203.9ms
Speed: 7.6ms preprocess, 203.9ms inference, 36.1ms postprocess per image at shape (1, 3, 480, 640)
Could not understand the command.
Listening for commands...

--- Current Frame Detections ---
Object: cat | Confidence: 0.65 | Distance: 0.57 meters | Direction: center
--- End of Detections ---


0: 480x640 1 cat, 175.5ms
Speed: 2.1ms preprocess, 175.5ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)
Could not understand the command.
Listening for commands...

--- Current Frame Detections ---
Object: cat | Confidence: 0.69 | Distance: 0.62 meters | 