In [1]:
import sys, site
print("Python exe:", sys.executable)
print("Version:", sys.version)
print("Site-packages:", site.getsitepackages() if hasattr(site,"getsitepackages") else site.getusersitepackages())

Python exe: /opt/miniconda3/envs/face-detection/bin/python
Version: 3.10.18 (main, Jun  5 2025, 08:37:47) [Clang 14.0.6 ]
Site-packages: ['/opt/miniconda3/envs/face-detection/lib/python3.10/site-packages']


In [4]:
import cv2, time, numpy as np, torch
from ultralytics import YOLO
import mediapipe as mp
from deep_sort_realtime.deepsort_tracker import DeepSort

In [5]:
# ---------- Simple IoU-based tracker ----------
class Track:
    def __init__(self, tid, bbox, conf):
        self.id = tid
        self.bbox = np.array(bbox, dtype=float)  # [x1,y1,x2,y2]
        self.conf = float(conf)
        self.age = 1
        self.missed = 0

class IoUTracker:
    def __init__(self, iou_thresh=0.3, max_missed=3):
        self.iou_thresh = iou_thresh
        self.max_missed = max_missed
        self.next_id = 1
        self.tracks = []

    @staticmethod
    def iou(a, b):
        # a,b: [x1,y1,x2,y2]
        inter_x1 = max(a[0], b[0]); inter_y1 = max(a[1], b[1])
        inter_x2 = min(a[2], b[2]); inter_y2 = min(a[3], b[3])
        iw = max(0.0, inter_x2 - inter_x1); ih = max(0.0, inter_y2 - inter_y1)
        inter = iw * ih
        area_a = (a[2]-a[0])*(a[3]-a[1]); area_b = (b[2]-b[0])*(b[3]-b[1])
        union = area_a + area_b - inter + 1e-6
        return inter / union

    def update(self, dets):
        """
        dets: list of dicts -> {"bbox":[x1,y1,x2,y2], "conf":float}
        returns: list of Track
        """
        # 1) Build IoU matrix (tracks x detections)
        T, D = len(self.tracks), len(dets)
        iou_mat = np.zeros((T, D), dtype=float)
        for i, tr in enumerate(self.tracks):
            for j, d in enumerate(dets):
                iou_mat[i, j] = self.iou(tr.bbox, d["bbox"])

        # 2) Greedy assignment by IoU
        assigned_tr = set(); assigned_det = set()
        pairs = []
        while True:
            if iou_mat.size == 0: break
            i, j = divmod(iou_mat.argmax(), iou_mat.shape[1])
            if iou_mat[i, j] < self.iou_thresh: break
            pairs.append((i, j))
            assigned_tr.add(i); assigned_det.add(j)
            iou_mat[i, :] = -1; iou_mat[:, j] = -1

        # 3) Update matched tracks
        for i, j in pairs:
            tr = self.tracks[i]
            tr.bbox = np.array(dets[j]["bbox"], dtype=float)
            tr.conf = float(dets[j]["conf"])
            tr.age += 1
            tr.missed = 0

        # 4) Create new tracks for unmatched detections
        for j, d in enumerate(dets):
            if j in assigned_det: continue
            self.tracks.append(Track(self.next_id, d["bbox"], d["conf"]))
            self.next_id += 1

        # 5) Age / remove unmatched tracks
        alive = []
        for idx, tr in enumerate(self.tracks):
            if idx in assigned_tr:
                alive.append(tr)
            else:
                tr.missed += 1
                if tr.missed <= self.max_missed:
                    alive.append(tr)
        self.tracks = alive
        return self.tracks



In [6]:
# ---------- YOLO detector wrapper ----------
class YoloFaceDetector:
    def __init__(self, weights="yolov12n-face.pt", imgsz=640, conf=0.35, iou=0.5):
        self.model = YOLO(weights)
        self.imgsz = imgsz
        self.conf = conf
        self.iou = iou

    def __call__(self, frame_bgr):
        results = self.model.predict(
            source=frame_bgr, imgsz=self.imgsz,
            conf=self.conf, iou=self.iou, verbose=False, device="cpu"
        )[0]
        dets = []
        if results.boxes is not None and len(results.boxes) > 0:
            xyxy = results.boxes.xyxy.cpu().numpy()
            conf = results.boxes.conf.cpu().numpy()
            for k in range(len(xyxy)):
                x1, y1, x2, y2 = xyxy[k].tolist()
                dets.append({"bbox": [x1, y1, x2, y2], "conf": float(conf[k])})
        return dets



In [7]:
# ------------------------ Utils ------------------------
def clamp_box(x1, y1, x2, y2, w, h):
    return [max(0, x1), max(0, y1), min(w-1, x2), min(h-1, y2)]

def pad_and_square(b, pad, w, h):
    x1,y1,x2,y2 = b
    cx = (x1+x2)/2; cy = (y1+y2)/2; s = max(x2-x1, y2-y1) * (1+pad*2)
    nx1 = cx - s/2; ny1 = cy - s/2; nx2 = cx + s/2; ny2 = cy + s/2
    return clamp_box(nx1, ny1, nx2, ny2, w, h)

def preprocess_face(frame_bgr, box, size=160):
    x1,y1,x2,y2 = map(int, box); crop = frame_bgr[y1:y2, x1:x2]
    crop = cv2.resize(crop, (size, size), interpolation=cv2.INTER_LINEAR)
    rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    # ImageNet-ish normalization (adapt to your training)
    mean = np.array([0.485, 0.456, 0.406]); std = np.array([0.229, 0.224, 0.225])
    rgb = (rgb - mean) / std
    chw = np.transpose(rgb, (2,0,1))
    return chw

class EMA:
    def __init__(self, alpha=0.6, init=None): self.a=alpha; self.v=init
    def __call__(self, x):
        self.v = x if self.v is None else self.a*x + (1-self.a)*self.v
        return self.v

# MediaPipe face mesh
mp_face_mesh = mp.solutions.face_mesh
mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=8,
                             refine_landmarks=True, min_detection_confidence=0.5,
                             min_tracking_confidence=0.5)

def mouth_aspect_ratio(landmarks):
    """
    Use MediaPipe FaceMesh indices:
    mouth corners ~ 78 (left), 308 (right)
    upper/lower inner lip center ~ 13 (upper), 14 (lower)
    """
    p = landmarks
    A = np.linalg.norm(p[13] - p[14])       # vertical
    B = np.linalg.norm(p[78] - p[308])      # horizontal
    return float(A/(B+1e-6))



I0000 00:00:1760424589.712677 354016555 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M3 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1760424589.716616 354019277 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760424589.731808 354019276 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [28]:
# ---------- Main real-time loop ----------
def main():
    cap = cv2.VideoCapture(0)  # webcam
    cap.set(cv2.CAP_PROP_FRAME_WIDTH,  640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    win = "YOLO + IOU+nonReID"
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
    
    # det = YoloFaceDetector(weights="face_yolo12_best.pt", imgsz=640, conf=0.4)
    # det = YoloFaceDetector(weights="best.pt", imgsz=640, conf=0.4)
    det = YoloFaceDetector(weights="yolov12n-face.pt", imgsz=640, conf=0.4)
    
    # tracker = IoUTracker(iou_thresh=0.35, max_missed=3)
    # ds_tracker = DeepSort(max_age=80, n_init=1, max_iou_distance=0.7, nn_budget=100)
    ds_tracker = DeepSort(
    max_age=80,
    n_init=1,
    max_iou_distance=0.7,
    nn_budget=100,
    embedder=None   # 👈 disables CNN embeddings (IOU-only tracking)
)


    K = 3         # detect every K frames
    fidx = 0
    last_dets = []
    try:
        while True:
            ok, frame = cap.read()
            if not ok: break
            fidx += 1
    
            # 1) DETECT every K-th frame; else reuse last detections (or keep empty and rely on tracker state)
            if fidx % K == 1:
                dets = det(frame)
                last_dets = dets
            else:
                dets = last_dets  # optional: skip to save CPU
    
            # 2) TRACK with current detections
            detections_xywh = []
            for d in dets:
                x1,y1,x2,y2 = d['bbox']
                w, h = x2 - x1, y2 - y1
                detections_xywh.append([[float(x1), float(y1), float(w), float(h)],
                            float(d['conf']),
                            "face"])  # label optional
            # tracks = tracker.update(dets)
    
            # 3) Draw
            tracks = ds_tracker.update_tracks(detections_xywh, frame=frame)  # pass frame to enable appearance
            
            for t in tracks:
                if not t.is_confirmed():
                    continue
                x1,y1,x2,y2 = map(int, t.to_tlbr())          # left, top, right, bottom
                tid = t.track_id
                conf = getattr(t, "det_conf", 1.0)            # deep-sort-realtime exposes last det conf
                cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)
                y_text = max(0, y1-6)
                cv2.putText(frame, f"ID {tid}", (x1, y_text),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)
                txt = f"Conf {conf:.2f}"
                (tw, _), _ = cv2.getTextSize(txt, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
                cv2.putText(frame, txt, (x2 - tw, y_text),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,255), 2)
    
            cv2.imshow(win, frame)

            k = cv2.waitKey(1) & 0xFF
            if k in (27, ord('q')): # ESC or q
                print(dets)
                break

            # also exit if user clicks the window's X button
            if cv2.getWindowProperty(win, cv2.WND_PROP_VISIBLE) < 1:
                break
    
        # cap.release(); cv2.destroyAllWindows()
    finally:
        cap.release()
        cv2.destroyWindow(win)                # close just this window
        # pump the event queue a few times so the OS actually closes it
        for _ in range(3):
            cv2.waitKey(1)
        # tiny sleep can help on some macOS builds
        time.sleep(0.05)




In [29]:
if __name__ == "__main__":
    main()

Exception: Embedder not created during init so embeddings must be given now!