In [1]:
# ── Standard library ────────────────────────────────────────────────────────────
from collections import deque
from typing import List, Tuple
import os
import time
import tempfile

# ── Third-party ────────────────────────────────────────────────────────────────
import cv2
import numpy as np

import torch
import torch.nn as nn
import torchvision.transforms as T
import torchvision.models as models

from ultralytics import YOLO

# ── Runtime tweaks (optional but handy) ────────────────────────────────────────
# NOTE(review): per the OpenCV docs, 0 is expected to turn off OpenCV's own
# threading so it does not compete with other thread pools — confirm intent.
cv2.setNumThreads(0)                 # reduce OpenCV/NumPy thread contention
# NOTE(review): cuDNN is a GPU library, so this presumably has no effect on
# CPU-only runs — confirm it is still wanted.
torch.backends.cudnn.benchmark = True  # speedup when input sizes are static
# Prefer CUDA when PyTorch reports it as available, otherwise use the CPU.
# NOTE(review): the detector and main() cells shown below do not read this
# variable; they run inference on the CPU by default.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# ---------- YOLO detector wrapper ----------
class YoloFaceDetector:
    """Callable wrapper that runs an Ultralytics YOLO model on a single frame.

    Calling an instance returns plain-Python detections so downstream code
    does not need to know about Ultralytics result objects.
    """

    def __init__(self, weights="yolov12n-face.pt", imgsz=640, conf=0.35, iou=0.5,
                 device="cpu"):
        """Load the model and remember the inference settings.

        Args:
            weights: Path or name of the weights file handed to ``YOLO``.
            imgsz: Inference image size forwarded to ``predict``.
            conf: Confidence threshold forwarded to ``predict``.
            iou: IoU threshold forwarded to ``predict``.
            device: Device forwarded to ``predict``. Defaults to ``"cpu"``,
                which is what this wrapper previously always used.
        """
        self.model = YOLO(weights)
        self.imgsz = imgsz
        self.conf = conf
        self.iou = iou
        self.device = device

    def __call__(self, frame_bgr):
        """Detect objects in one frame.

        Args:
            frame_bgr: Image passed as ``source`` to ``predict``
                (presumably a BGR array, as the name suggests — verify
                against callers).

        Returns:
            A list with one dict per detection:
            ``{"bbox": [x1, y1, x2, y2], "conf": float}``. The list is empty
            when the model reports no boxes.
        """
        # predict() returns one result per input image; a single frame is passed.
        result = self.model.predict(
            source=frame_bgr, imgsz=self.imgsz,
            conf=self.conf, iou=self.iou, verbose=False, device=self.device
        )[0]

        boxes = result.boxes
        if boxes is None or len(boxes) == 0:
            return []

        # Move to host memory once, then convert row by row to built-in types.
        corners = boxes.xyxy.cpu().numpy()
        scores = boxes.conf.cpu().numpy()
        return [
            {"bbox": corner.tolist(), "conf": float(score)}
            for corner, score in zip(corners, scores)
        ]



In [3]:
# Path to a sample video file.
# NOTE(review): main() below does not read this module-level variable; it
# uses its own `source`, which is the webcam (0) when main() is called bare.
source = "data/dangoon.mp4"

In [6]:
def main(source=0, weights="models/yolov12n-face.pt", tracker="botsort_custom.yaml",
         imgsz=640, conf=0.35, iou=0.5, device="cpu"):
    """Run YOLO detection with tracking and display annotated frames live.

    Frames are pulled from ``source`` through Ultralytics' streaming
    ``track`` call. Every box is drawn onto the frame together with its track
    ID and shown in an OpenCV window until the user presses ``q`` / ``Esc``
    or closes the window.

    Args:
        source: Input forwarded to ``model.track`` (``0`` selects the webcam).
        weights: Weights file handed to ``YOLO``.
        tracker: Tracker configuration file forwarded to ``model.track``.
        imgsz: Inference image size.
        conf: Detection confidence threshold.
        iou: IoU threshold.
        device: Inference device; defaults to ``"cpu"``.

    All defaults reproduce the values this function previously hard-coded,
    so a bare ``main()`` call behaves as before.
    """
    model = YOLO(weights)
    win = f"YOLO + BoT-SORT (Ultralytics, {str(device).upper()})"
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)

    try:
        # Created inside the try so the window is still torn down if the
        # tracker set-up itself fails.
        frames = model.track(
            source=source,
            tracker=tracker,
            stream=True,
            imgsz=imgsz,
            conf=conf,
            iou=iou,
            device=device,
            verbose=False,
            persist=True
        )

        for res in frames:
            frame = res.orig_img
            boxes = res.boxes
            if boxes is not None and len(boxes) > 0:
                corners = boxes.xyxy.cpu().numpy().astype(int)
                # boxes.id is None when no track IDs were assigned for this
                # frame. Such boxes are labelled "ID ?" rather than with their
                # list index, which could be mistaken for a real track ID.
                track_ids = boxes.id.cpu().numpy() if boxes.id is not None else None

                for det_idx, corner in enumerate(corners):
                    # Plain Python ints keep OpenCV's argument parsing happy.
                    x1, y1, x2, y2 = (int(v) for v in corner)
                    if track_ids is not None and track_ids[det_idx] != -1:
                        label = f"ID {int(track_ids[det_idx])}"
                    else:
                        label = "ID ?"
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    # Keep the label inside the image when the box touches the top edge.
                    y_text = max(0, y1 - 6)
                    cv2.putText(frame, label, (x1, y_text),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.imshow(win, frame)
            key = cv2.waitKey(1) & 0xFF
            window_closed = cv2.getWindowProperty(win, cv2.WND_PROP_VISIBLE) < 1
            if key in (27, ord('q')) or window_closed:
                break
    finally:
        # Best effort: close Ultralytics' internal loader, which is intended
        # to release the camera. A failure here is reported but never raised,
        # so it cannot mask an error from the loop above.
        try:
            predictor = getattr(model, "predictor", None)
            dataset = getattr(predictor, "dataset", None) if predictor is not None else None
            if dataset is not None and hasattr(dataset, "close"):
                dataset.close()
        except Exception as exc:
            print(f"Warning: could not close the video source cleanly: {exc!r}")

        cv2.destroyAllWindows()
        # A few ticks help on some OSes (esp. macOS) to actually close the window
        for _ in range(5):
            cv2.waitKey(1)
        time.sleep(0.05)

In [7]:
# Start the live tracking demo when this cell (or an exported script) is run
# directly; importing the code as a module will not open the camera.
if __name__ == "__main__":
    main()



1/1: 0... Success ✅ (inf frames of shape 1920x1080 at 30.00 FPS)

