In [1]:
import os
import threading
import asyncio
from playsound import playsound
import cv2
from dotenv import load_dotenv, find_dotenv
import supervision as sv
import numpy as np
import openai
import time
from ultralytics import YOLO
from text_to_speech import text_to_speech

In [2]:
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [3]:
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
class_names = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird",
"cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", 
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
"sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
"ball", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife",
"spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
"hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
"dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
"cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
"clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "watch", "game controller", "Sword", "wand", "speaker"]
# buffer for summary data
scene_buffer = []
buffer_lock  = threading.Lock()
last_send    = time.time()
SEND_INTERVAL = 7  # seconds

In [5]:
def send_to_gpt():
    global last_send
    while True:
        now = time.time()
        if now - last_send >= SEND_INTERVAL:
            # 1) grab & clear the buffer in one quick lock
            with buffer_lock:
                if scene_buffer:
                    batch = scene_buffer.copy()
                    scene_buffer.clear()
                else:
                    batch = None
                last_send = now

            # 2) only if we actually had data, call OpenAI **outside** the lock
            if batch:
                prompt = "You are an assistant for a visually impaired user. "\
                         "Here is what was just seen:\n"
                for evt in batch:
                    prompt += (
                        f"- {evt['class']} (ID {evt['track_id']}) at "
                        f"x={evt['cx']:.0f}, y={evt['cy']:.0f} moving "
                        f"vx={evt['vx']:.1f}, vy={evt['vy']:.1f}\n"
                    )
                prompt += "\nPlease describe this briefly in natural spoken language."

                try:
                    resp = openai.chat.completions.create(
                        model="gpt-4",
                        messages=[
                            {"role": "system",
                             "content": "You describe surroundings for a blind user."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.7,
                        max_tokens=150
                    )
                    text = resp.choices[0].message.content.strip()
                    print(f"\n[Descriptive update]\n{text}\n")
                    loop  = asyncio.get_event_loop_policy().get_event_loop()
                    try:
                        output_file = "output_audio.mp3"
                        
                        # Run the text_to_speech function
                        loop.run_until_complete(text_to_speech(text, output_file))
                    except Exception as e:
                        print(f"An error occurred: {e}")
                    finally:
                        loop.close()
                        # Play the audio file after conversion
                        playsound(output_file)
                        os.remove(output_file) if os.path.exists(output_file) else None
                        print("Temporary audio file removed.")
                except Exception as e:
                    # don’t let the thread die—just log the error
                    print(f"[GPT Error] {e}")

        time.sleep(0.5)


In [6]:
def create_kalman():
    kf = cv2.KalmanFilter(4, 2)               # 4 state vars (x, y, vx, vy), 2 measurements (x, y)
    # State transition: x' = x + vx, y' = y + vy
    kf.transitionMatrix = np.array([
        [1, 0, 1, 0],
        [0, 1, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 0, 1]
    ], np.float32)
    # We directly observe x and y
    kf.measurementMatrix = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, 0]
    ], np.float32)
    # Tune noise covariances as needed
    kf.processNoiseCov = np.eye(4, dtype=np.float32) * 1e-2
    kf.measurementNoiseCov = np.eye(2, dtype=np.float32) * 1e-1
    kf.errorCovPost = np.eye(4, dtype=np.float32)
    return kf

In [7]:
model = YOLO("yolov8m-world.pt")

In [8]:
model.set_classes(class_names)

In [9]:
bounding_box_annotator = sv.BoxAnnotator(
    thickness=2,
)
label_annotator = sv.LabelAnnotator(
    text_thickness=2,
    text_scale=1)

In [10]:

kalman_filters = {} 

In [None]:
# start the background sender
threading.Thread(target=send_to_gpt, daemon=True).start()


# ─── MAIN TRACK & KALMAN LOOP ───────────────────────────────────────────────────

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # 1) Run Ultralytics tracking on THIS single frame
    results = model.track(
        frame, 
        tracker="custom_bytetrack.yaml",
        conf=0.25,
        iou=0.5,
        persist=True,
        verbose=False
    )
    result = results[0]  # model.track(frame) returns a list of 1 Result

    annotated = frame.copy()
    detections = sv.Detections.from_ultralytics(result)

    # 2) Loop through tracked boxes + IDs + Kalman as before
    for box, cls, tid in zip(result.boxes.xyxy,
                              result.boxes.cls,
                              result.boxes.id):
        x1, y1, x2, y2 = map(int, box)
        cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

        # Kalman init/predict/correct
        if tid not in kalman_filters:
            kf = create_kalman()
            kf.statePost = np.array([[cx],[cy],[0],[0]], np.float32)
            kalman_filters[tid] = kf
        else:
            kf = kalman_filters[tid]

        pred = kf.predict().flatten()
        kf.correct(np.array([[cx],[cy]], np.float32))
        px, py, pvx, pvy = pred

        # draw detection & ID
        label = f"{result.names[int(cls)]} ID:{int(tid)}"
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0,255,0), 2)
        cv2.putText(annotated, label, (x1, y1-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

        # draw Kalman prediction
        cv2.circle(annotated, (int(px), int(py)), 4, (0,0,255), -1)
        cv2.arrowedLine(annotated,
                        (int(cx), int(cy)),
                        (int(px), int(py)),
                        (0,0,255), 2, tipLength=0.3)

        # buffer for GPT…
        with buffer_lock:
            scene_buffer.append({
                "track_id": tid,
                "class":    result.names[int(cls)],
                "cx":       cx,
                "cy":       cy,
                "vx":       pvx,
                "vy":       pvy
            })

    # 3) Display
    cv2.imshow("YOLOv8 + ByteTrack + Kalman", annotated)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()


[Descriptive update]
You are in a room with a person and a bed. The person is moving slightly within the same area, while the bed remains stationary. The person seems to be pacing around the same spot and the bed is situated a short distance away from them. Overall, the environment is fairly stable with minimal activity.

[GPT Error] There is no current event loop in thread 'Thread-3 (send_to_gpt)'.

[Descriptive update]
In the room, there's a person who seems to be moving around slightly but generally staying in the same area. The bed also appears to be stationary in the room. A cell phone is also present and it is moving gradually around the room.

[GPT Error] There is no current event loop in thread 'Thread-3 (send_to_gpt)'.
[GPT Error] Error code: 429 - {'error': {'message': 'Request too large for gpt-4 in organization org-BbAGzyIh3vSGGQfnsIr8cLe4 on tokens per min (TPM): Limit 10000, Requested 18417. The input or output tokens must be reduced in order to run successfully. Visit h

[GPT Error] Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 12405 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

[Descriptive update]
A person is seen moving slowly around a room with two beds. The person's movement is primarily along the x-axis with minor ups and downs along the y-axis. The first bed is almost stationary and positioned further along the y-axis, while the second bed is closer to the person's initial position. The person is moving around a lot, potentially navigating around the room or performing various tasks. The beds do not seem to change their positions significantly, indicating that they are likely large, heavy, and non-mobile.

[GPT Error] There is no current event loop in thread 'Thread-3 (send_to_gpt)'.
