## 👁️ Real-Time Object Detection with YOLOv8

YOLO (You Only Look Once) changed the computer vision landscape by treating detection as a **single regression problem** rather than a classification task performed on thousands of window crops. We are using **YOLOv8**, the most refined version of the architecture, which utilizes a "Darknet" backbone and a C2f (Cross Stage Partial Bottleneck with two convolutions) module for superior feature fusion.

### 📓 Python Implementation (Jupyter Optimized)

In [None]:
!pip install ultralytics

In [None]:
import cv2
import numpy as np
import time
from ultralytics import YOLO
from google.colab.output import eval_js
from base64 import b64decode
from IPython.display import display, Javascript, clear_output
from google.colab.patches import cv2_imshow

# 1. Initialize YOLOv8: build the detector once, at module level, from the
#    'yolov8n.pt' weights so every captured frame reuses the same instance.
model = YOLO('yolov8n.pt')

def setup_js_webcam():
    """Inject the browser-side webcam capture helper into the notebook output.

    Displays a Javascript snippet that defines ``window.takePhoto``, an async
    function which:

    * on its first call (when the ``<video>`` has no ``srcObject`` yet)
      requests the camera through ``getUserMedia``, attaches a 640x480
      ``<video id="webcam-video">`` inside ``<div id="webcam-div">`` to the
      page body and starts playback;
    * on every call draws the current video frame onto a fresh 640x480 canvas
      and returns it as a JPEG data URL (quality 0.8).

    The script also sets ``window.webcamReady = true`` when it runs, which the
    Python side can poll through ``eval_js`` to know the helper is defined.

    Returns:
        None. The only effect is the displayed Javascript output.
    """
    js = Javascript('''
    // We define the function globally immediately
    window.takePhoto = async function() {
      const div = document.querySelector('#webcam-div') || document.createElement('div');
      div.id = 'webcam-div';

      const video = document.querySelector('#webcam-video') || document.createElement('video');
      video.id = 'webcam-video';

      if (!video.srcObject) {
        const stream = await navigator.mediaDevices.getUserMedia({video: true});
        video.srcObject = stream;
        video.style.display = 'block';
        video.width = 640;
        video.height = 480;
        document.body.appendChild(div);
        div.appendChild(video);
        await video.play();
      }

      const canvas = document.createElement('canvas');
      canvas.width = 640;
      canvas.height = 480;
      canvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
      return canvas.toDataURL('image/jpeg', 0.8);
    };

    // Signal that the script has loaded
    window.webcamReady = true;
    ''')
    # display() hands the script to the notebook front end, where it is run.
    display(js)

def js_to_image(js_reply):
    """Decode a base64 data URL returned by the browser into an OpenCV image.

    Args:
        js_reply: A ``data:<mime>;base64,<payload>`` string; everything up to
            the first comma is treated as the header and discarded.

    Returns:
        The decoded 3-channel colour image as produced by ``cv2.imdecode``.
    """
    encoded_payload = js_reply.split(',')[1]
    raw_bytes = b64decode(encoded_payload)
    pixel_buffer = np.frombuffer(raw_bytes, dtype=np.uint8)
    # IMREAD_COLOR has the value 1: always decode to a 3-channel colour image.
    return cv2.imdecode(pixel_buffer, cv2.IMREAD_COLOR)

# --- EXECUTION ---
setup_js_webcam()

print("Waiting for browser to initialize webcam...")
# 2. THE HANDSHAKE: poll until the injected script reports it has loaded.
# Every unsuccessful attempt (an exception from eval_js OR a falsy reply)
# waits one second before retrying, so the loop really gives the browser up
# to ~10 seconds instead of burning through all attempts instantly.
for attempt in range(10):
    try:
        ready = eval_js('window.webcamReady')
    except Exception:
        # The front end may not be able to evaluate the expression yet;
        # treat that the same as "not ready" and retry.
        ready = False

    if ready:
        print("Webcam is ready!")
        break

    time.sleep(1)
else:
    # Reached only when no attempt succeeded (the loop never hit `break`).
    print("Error: Handshake failed. Please check browser permissions.")

# 3. Capture loop: grab a frame from the browser, run detection, and redraw
# the annotated frame in the cell output until the browser returns an empty
# reply, an error occurs, or the kernel is interrupted.
try:
    while True:
        js_reply = eval_js('window.takePhoto()')
        if not js_reply:
            break

        frame = js_to_image(js_reply)
        if frame is None:
            # cv2.imdecode returns None when the buffer is not a decodable
            # image; skip this frame instead of handing None to the model.
            continue

        results = model.predict(frame, conf=0.5, verbose=False)
        annotated_frame = results[0].plot()

        # wait=True defers the clear until the next output arrives, which
        # avoids flicker between consecutive frames.
        clear_output(wait=True)
        cv2_imshow(annotated_frame)

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop this endless loop;
    # report it like any other stop instead of dumping a traceback.
    print("Status: stopped by user")
except Exception as e:
    print(f"Status: {e}")

In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
from google.colab.output import eval_js
from base64 import b64decode, b64encode
from IPython.display import display, Javascript, HTML
import PIL.Image
import io

# 1. Initialize YOLOv8: build the detector once, at module level, from the
#    'yolov8n.pt' weights so every captured frame reuses the same instance.
model = YOLO('yolov8n.pt')

def start_live_stream():
  """Inject the browser-side live video + overlay helpers and start the camera.

  The displayed Javascript:

  * builds (once) a relatively positioned ``<div>`` holding a 640x480
    ``<video id="video-feed">`` and an absolutely positioned 640x480
    ``<canvas id="overlay-canvas">`` on top of it, plus a separate 640x480
    capture canvas that is not attached to the page;
  * calls ``streamVideo()``, which requests the camera via ``getUserMedia``,
    starts playback and stores the stream on ``window.videoStream``;
  * defines ``window.getFrame()``, which draws the current video frame on the
    capture canvas and returns it as a JPEG data URL (quality 0.6);
  * defines ``window.drawDetections(imgData)``, which loads ``imgData`` as an
    image and, once it has loaded, clears the overlay canvas and draws the
    image onto it.

  Returns:
      None. The only effect is the displayed Javascript output.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;

    async function createDom() {
        if (div !== null) return;

        div = document.createElement('div');
        div.style.position = 'relative';

        video = document.createElement('video');
        video.style.display = 'block';
        video.width = 640;
        video.height = 480;
        video.setAttribute('playsinline', '');
        video.id = 'video-feed';

        // This canvas will sit on top of the video to show detection boxes
        const canvas = document.createElement('canvas');
        canvas.style.position = 'absolute';
        canvas.style.left = '0';
        canvas.style.top = '0';
        canvas.width = 640;
        canvas.height = 480;
        canvas.id = 'overlay-canvas';

        div.appendChild(video);
        div.appendChild(canvas);
        document.body.appendChild(div);

        captureCanvas = document.createElement('canvas');
        captureCanvas.width = 640;
        captureCanvas.height = 480;
    }

    async function streamVideo() {
        await createDom();
        stream = await navigator.mediaDevices.getUserMedia({video: true});
        video.srcObject = stream;
        await video.play();

        window.videoStream = stream;
    }

    // Function to capture a frame and send to Python
    window.getFrame = async function() {
        const context = captureCanvas.getContext('2d');
        context.drawImage(video, 0, 0, 640, 480);
        return captureCanvas.toDataURL('image/jpeg', 0.6);
    }

    // Function to draw boxes back onto the live video
    window.drawDetections = function(imgData) {
        const canvas = document.getElementById('overlay-canvas');
        const ctx = canvas.getContext('2d');
        const img = new Image();
        img.onload = function() {
            ctx.clearRect(0, 0, 640, 480);
            ctx.drawImage(img, 0, 0);
        };
        img.src = imgData;
    }

    streamVideo();
    ''')
  # display() hands the script to the notebook front end, where it is run.
  display(js)

def bytes_to_js_image(annotated_frame):
    """Encode an OpenCV image as a PNG data URL that browser JS can draw.

    The frame is PNG-encoded as given; this function does not alter pixels or
    add transparency itself, so any transparent background must already be
    present in ``annotated_frame``.

    Args:
        annotated_frame: Image array accepted by ``cv2.imencode``.

    Returns:
        A ``data:image/png;base64,...`` string.
    """
    # imencode returns (success_flag, buffer); only the buffer is used here.
    png_buffer = cv2.imencode('.png', annotated_frame)[1]
    payload = b64encode(png_buffer).decode('utf-8')
    return 'data:image/png;base64,' + payload

def js_to_image(js_reply):
    """Decode a base64 data URL returned by the browser into an OpenCV image.

    Args:
        js_reply: A ``data:<mime>;base64,<payload>`` string; everything up to
            the first comma is treated as the header and discarded.

    Returns:
        The decoded 3-channel colour image as produced by ``cv2.imdecode``.
    """
    encoded_payload = js_reply.split(',')[1]
    raw_bytes = b64decode(encoded_payload)
    pixel_buffer = np.frombuffer(raw_bytes, dtype=np.uint8)
    # IMREAD_COLOR has the value 1: always decode to a 3-channel colour image.
    return cv2.imdecode(pixel_buffer, cv2.IMREAD_COLOR)

# --- EXECUTION ---
start_live_stream()
print("Starting live real-time detection...")

# Live loop: pull a frame from the browser, detect objects, render only the
# boxes/labels on a transparent canvas, and push that overlay back so it is
# drawn on top of the live <video>. Runs until the browser returns an empty
# reply, an error occurs, or the kernel is interrupted.
try:
    while True:
        # 1. Grab frame from JS
        js_reply = eval_js('window.getFrame()')
        if not js_reply:
            break

        # 2. Process with YOLO
        frame = js_to_image(js_reply)
        if frame is None:
            # cv2.imdecode returns None when the buffer is not a decodable
            # image; skip this frame instead of handing None to the model.
            continue
        results = model.predict(frame, conf=0.4, verbose=False)

        # 3. Create a blank, fully transparent BGRA image for the boxes,
        # sized from the captured frame so overlay and frame always agree.
        frame_height, frame_width = frame.shape[:2]
        ann_frame = np.zeros((frame_height, frame_width, 4), dtype=np.uint8)

        # Draw results onto the transparent frame.
        # Boxes are extracted manually so the background stays clear.
        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls = int(box.cls[0])
                conf = float(box.conf[0])
                label = f'{model.names[cls]} {conf:.2f}'

                # Opaque green (alpha 255) rectangle on the transparent image
                cv2.rectangle(ann_frame, (x1, y1), (x2, y2), (0, 255, 0, 255), 2)

                # Put the label above the box when there is room; otherwise
                # place it just inside the top edge so it is not drawn
                # off-canvas for boxes touching the top of the frame.
                label_y = y1 - 10 if y1 - 10 > 10 else y1 + 20
                cv2.putText(ann_frame, label, (x1, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0, 255), 2)

        # 4. Send the boxes back to the JS overlay
        js_overlay = bytes_to_js_image(ann_frame)
        eval_js(f'window.drawDetections("{js_overlay}")')

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop this endless loop;
    # report it like any other stop instead of dumping a traceback.
    print("Stopped: interrupted by user")
except Exception as e:
    print(f"Stopped: {e}")
finally:
    # Release the camera: the JS side stores the MediaStream on
    # window.videoStream, and stopping its tracks turns the webcam off.
    # Deliberately best-effort: the front end may already be gone.
    try:
        eval_js(
            '(() => { if (window.videoStream) {'
            ' window.videoStream.getTracks().forEach(t => t.stop()); } })()'
        )
    except Exception:
        pass