# Face Recognition - Jesmine Tey Khai Jing

Model pack used below: buffalo_l (InsightFace model zoo): https://github.com/deepinsight/insightface/tree/master/python-package#model-zoo

In [5]:
import cv2 as cv
import pickle
import time
import numpy as np
from insightface.app import FaceAnalysis
from huggingface_hub import hf_hub_download

# Download the pre-computed face-embedding database from the Hugging Face Hub
# (dataset repository) and get the local path of the downloaded file.
file_path = hf_hub_download(
    repo_id="jesmine0820/assignment_face_recognition",   
    filename="face_embeddings.pkl",  
    repo_type="dataset"
)
# SECURITY NOTE(review): pickle.load can execute arbitrary code stored in the
# file. Only load this from a repository you control and trust.
with open(file_path, "rb") as f:
    # recognize_face() iterates this object and reads the "embedding", "id"
    # and "image_name" keys of every entry.
    embeddings_data = pickle.load(f)

# InsightFace "buffalo_l" analysis pipeline, requested on the CUDA provider.
# NOTE(review): the recorded cell output lists only CPUExecutionProvider as
# applied, so on that machine the models ran on CPU - confirm the GPU setup.
detector = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider'])
detector.prepare(ctx_id=0, det_size=(640, 640), det_thresh=0.5)

class RecognitionSmoother:
    """Smooth per-frame recognition results over a short sliding window.

    The last ``window_size`` ``(person_id, score)`` observations are kept.
    Newer observations carry a larger weight (0.5 for the oldest, 1.5 for
    the newest) when deciding which identity currently wins.
    """

    def __init__(self, window_size=5):
        """Create an empty smoother that remembers ``window_size`` results."""
        self.window_size = window_size
        self.history = []

    def add_recognition(self, person_id, score):
        """Record one observation and drop the oldest if the window is full."""
        self.history.append((person_id, score))
        if len(self.history) > self.window_size:
            self.history.pop(0)

    def get_smoothed_result(self):
        """Return ``(person_id, score)`` for the identity that currently wins.

        Returns:
            ``(None, 0)`` when nothing has been recorded yet. Otherwise the
            identity with the highest mean recency-weighted score, together
            with the weighted average of that identity's raw scores.

        The ranking rule is the mean of ``score * weight`` per identity.
        The reported score is normalised by the sum of the weights, so it
        stays on the same scale as the scores that were added (previously a
        single observation of 0.8 was reported as 0.4 and recent
        observations were inflated by up to 1.5x).
        """
        if not self.history:
            return None, 0

        weights = np.linspace(0.5, 1.5, len(self.history))

        # pid -> [sum(score * weight), sum(weight), observation count]
        totals = {}
        for (pid, score), weight in zip(self.history, weights):
            acc = totals.setdefault(pid, [0.0, 0.0, 0])
            acc[0] += score * weight
            acc[1] += weight
            acc[2] += 1

        # Rank by the mean weighted score; ties keep the first identity seen.
        best_pid = max(totals, key=lambda pid: totals[pid][0] / totals[pid][2])
        weighted_sum, weight_sum, _ = totals[best_pid]

        # Every weight is at least 0.5, so weight_sum is never zero.
        return best_pid, float(weighted_sum / weight_sum)

smoother = RecognitionSmoother(window_size=5)

def enhance_contrast(img):
    """Apply CLAHE to the lightness channel of an RGB image.

    The image is converted RGB -> LAB, the L channel is equalised with
    CLAHE (clip limit 2.0, 8x8 tiles), and the result is converted back
    to RGB. The a/b colour channels are passed through untouched.
    """
    lightness, chan_a, chan_b = cv.split(cv.cvtColor(img, cv.COLOR_RGB2LAB))
    equalizer = cv.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    lab_equalized = cv.merge((equalizer.apply(lightness), chan_a, chan_b))
    return cv.cvtColor(lab_equalized, cv.COLOR_LAB2RGB)

def align_face(image, face_obj, target_size=(112, 112)):
    """Warp ``image`` so the face's five keypoints land on a fixed template.

    A partial affine transform (rotation, uniform scale, translation) is
    estimated from ``face_obj.kps`` to the reference coordinates below and
    applied with ``target_size`` as the output size; uncovered pixels are
    filled with 0.

    NOTE(review): the reference coordinates are presumably the usual
    5-point template for a 112x112 crop - they are not rescaled when a
    different ``target_size`` is passed.
    """
    detected_landmarks = face_obj.kps.astype(np.float32)
    reference_landmarks = np.array(
        [
            [38.2946, 51.6963],
            [73.5318, 51.5014],
            [56.0252, 71.7366],
            [41.5493, 92.3655],
            [70.7299, 92.2041],
        ],
        dtype=np.float32,
    )

    # Maps detected keypoints -> reference positions; inlier mask is unused.
    transform, _inliers = cv.estimateAffinePartial2D(
        detected_landmarks, reference_landmarks, method=cv.LMEDS
    )
    return cv.warpAffine(image, transform, target_size, borderValue=0.0)

def get_face_embedding_from_obj(face_obj):
    """Return the L2-normalised embedding stored on ``face_obj``.

    Args:
        face_obj: object exposing an ``embedding`` attribute (array or None).

    Returns:
        The embedding divided by its Euclidean norm, or ``None`` when the
        embedding is missing or cannot be normalised (zero or non-finite
        norm). Returning ``None`` instead of a NaN vector lets
        ``recognize_face`` treat the frame as "no embedding".
    """
    emb = face_obj.embedding
    if emb is None:
        return None

    norm = np.linalg.norm(emb)
    if not np.isfinite(norm) or norm == 0:
        return None
    return emb / norm

def recognize_face(embedding, dataset, threshold=0.5):
    """Find the dataset entry most similar to ``embedding``.

    Each entry's ``"embedding"`` is L2-normalised and compared with
    ``embedding`` by dot product; the entry with the highest value wins
    (the first one on ties).

    Returns:
        ``(id, image_name, score)`` of the best entry, or
        ``(None, None, score)`` when the best score is below ``threshold``.
        ``(None, None, -1)`` when ``embedding`` is None.
    """
    if embedding is None:
        return None, None, -1

    top_score = -1
    winner = (None, None)

    for candidate in dataset:
        reference = candidate["embedding"]
        similarity = np.dot(embedding, reference / np.linalg.norm(reference))
        if not similarity > top_score:
            continue
        top_score = similarity
        winner = (candidate["id"], candidate["image_name"])

    if top_score < threshold:
        winner = (None, None)

    return winner[0], winner[1], top_score

def draw_result(image, name, score):
    """Outline the most central face in ``image`` and label it.

    Runs the module-level ``detector`` on ``image``, picks the face whose
    bounding-box centre is nearest to the image centre, and draws a green
    rectangle plus ``"<name> (<score>)"`` (or ``"Unknown"`` when ``name``
    is falsy) just above it. The image is modified in place and returned;
    it is returned unchanged when no face is detected.
    """
    detections = detector.get(image)
    if not detections:
        return image

    height, width, _ = image.shape
    frame_center = np.array([width // 2, height // 2])

    def distance_to_center(face):
        box = face.bbox.astype(int)
        box_center = np.array([(box[0] + box[2]) // 2, (box[1] + box[3]) // 2])
        return np.linalg.norm(box_center - frame_center)

    # min() keeps the first face on ties, like a strict "<" scan would.
    central_face = min(detections, key=distance_to_center)

    green = (0, 255, 0)
    box = central_face.bbox.astype(int)
    cv.rectangle(image, (box[0], box[1]), (box[2], box[3]), green, 2)

    if name:
        label = f"{name} ({score:.2f})"
    else:
        label = "Unknown"
    cv.putText(image, label, (box[0], box[1] - 10),
               cv.FONT_HERSHEY_SIMPLEX, 0.6, green, 2)
    return image

video = cv.VideoCapture(0)


def _pick_usable_face(faces):
    """Return the highest-confidence face if it passes the quality gates.

    A face is rejected (``None`` is returned) when its detection score is
    below 0.6 or its bounding box is narrower than 80 pixels.
    """
    if not faces:
        return None

    best_face = max(faces, key=lambda f: f.det_score)
    if best_face.det_score < 0.6:
        return None
    if (best_face.bbox[2] - best_face.bbox[0]) < 80:
        return None
    return best_face


def real_time_pipeline():
    """Run webcam face recognition until the stream ends or 'q' is pressed.

    Per frame: mirror the image, detect faces, recognise the best one
    against ``embeddings_data``, smooth the result with ``smoother``, draw
    the label and a centred 200x200 guide box, and show the frame. When
    the smoothed identity stays the same for 5 seconds it is printed once.

    Frames without a usable face are still displayed and still poll the
    keyboard; skipping them with ``continue`` froze the window and made
    'q' unresponsive. The camera and windows are released even when an
    exception interrupts the loop.
    """
    current_person = None
    start_time = None

    try:
        while True:
            ret, frame = video.read()
            if not ret:
                break

            frame = cv.flip(frame, 1)
            rgb_frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)

            # NOTE(review): the detector sees RGB here but draw_result feeds
            # it the BGR frame - confirm which channel order the stored
            # embeddings were built with.
            best_face = _pick_usable_face(detector.get(rgb_frame))

            if best_face is not None:
                # The embedding is read from the face object itself, so no
                # separately aligned / contrast-enhanced crop is needed.
                embedding = get_face_embedding_from_obj(best_face)

                # recognize
                person_id, name, score = recognize_face(embedding, embeddings_data)

                # smooth results
                smoother.add_recognition(person_id, score)
                smoothed_id, smoothed_score = smoother.get_smoothed_result()

                # draw
                frame = draw_result(frame, name, smoothed_score)

                # stable detection for 5s; start_time is cleared after the
                # report so the same identity is printed only once.
                if smoothed_id == current_person:
                    if start_time and (time.time() - start_time >= 5):
                        print(f"Detected id: {smoothed_id}, Score: {smoothed_score}")
                        start_time = None
                else:
                    current_person = smoothed_id
                    start_time = time.time()

            # draw middle guide box
            h, w, _ = frame.shape
            rect_w, rect_h = 200, 200
            center_x, center_y = w // 2, h // 2
            top_left = (center_x - rect_w // 2, center_y - rect_h // 2)
            bottom_right = (center_x + rect_w // 2, center_y + rect_h // 2)
            cv.rectangle(frame, top_left, bottom_right, (255, 0, 0), 2)

            cv.imshow("Face Recognition", frame)
            if cv.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        video.release()
        cv.destroyAllWindows()


real_time_pipeline()




Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\User/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\User/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\User/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\User/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\User/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127

In [7]:
import os

def check_accuracy(root="test_photos"):
    """Show every ``.jpg`` image found in the sub-folders of ``root``.

    Each image is displayed in a window named "image" and the function
    waits for a key press before moving on to the next one.

    Args:
        root: directory whose immediate sub-folders contain the test
            photos. Defaults to ``"test_photos"``.
    """
    for folder in os.listdir(root):
        # Sub-folders must be resolved relative to root, not the CWD.
        folder_path = os.path.join(root, folder)
        if not os.path.isdir(folder_path):
            continue

        for filename in os.listdir(folder_path):
            if not filename.lower().endswith('.jpg'):
                continue

            img_path = os.path.join(folder_path, filename)
            img = cv.imread(img_path)
            if img is None:
                # cv.imread returns None for unreadable files; imshow would crash.
                print(f"[WARNING] Failed to load image: {img_path}")
                continue

            cv.imshow("image", img)
            cv.waitKey(0)

# Face Recognition - Ethel Ng Yi Yan

In [None]:
from mtcnn import MTCNN
from keras_facenet import FaceNet
import cv2
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Initialize detector and embedder
# NOTE: this rebinds the module-level name `detector`, which the InsightFace
# cell earlier in this notebook bound to a FaceAnalysis instance. Re-run that
# cell before calling real_time_pipeline() / draw_result() again.
detector = MTCNN()
embedder = FaceNet()

def l2_normalize(x):
    """Scale ``x`` to unit Euclidean length.

    A zero vector is returned unchanged (as an array) instead of being
    divided by zero, which would produce NaNs in the similarity step.
    """
    norm = np.linalg.norm(x)
    if norm == 0:
        return np.asarray(x)
    return x / norm

In [None]:
# Folder containing known faces
photo_dir = r"C:\Users\User\Image Processing Assignment\Photo"

# Get embedding from image path
def get_face_embedding(img_path):
    """Return the L2-normalised FaceNet embedding of the first detected face.

    Args:
        img_path: path of the image file to read.

    Returns:
        The normalised embedding, or ``None`` when the image cannot be
        read, no face is detected, or the detected box lies outside the
        image.

    NOTE(review): the image is passed to the detector and embedder in
    OpenCV's BGR channel order. If this is ever switched to RGB, switch
    the webcam path too so both sides use the same order.
    """
    img = cv2.imread(img_path)
    if img is None:
        print(f"[WARNING] Failed to load image: {img_path}")
        return None

    results = detector.detect_faces(img)
    if len(results) == 0:
        print(f"[INFO] No face detected in: {img_path}")
        return None

    face = results[0]
    x, y, w, h = face['box']

    # Clamp all four edges. Clamping only x/y would keep the original
    # width/height and shift the far edge of the crop.
    x1, y1 = max(0, x), max(0, y)
    x2 = min(img.shape[1], x + w)
    y2 = min(img.shape[0], y + h)

    face_img = img[y1:y2, x1:x2]
    if face_img.size == 0:
        # cv2.resize raises on an empty crop.
        print(f"[INFO] No face detected in: {img_path}")
        return None
    face_img = cv2.resize(face_img, (160, 160))

    embedding = embedder.embeddings([face_img])[0]
    embedding = l2_normalize(embedding)
    return embedding

# Build the face database from photo directory
def build_face_database(folder):
    """Map each image's file stem in ``folder`` to its face embedding.

    Only ``.jpg``, ``.jpeg`` and ``.png`` files (case-insensitive) are
    processed. Files for which ``get_face_embedding`` returns ``None`` are
    reported and skipped. Files sharing a stem overwrite each other.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    database = {}

    for file in os.listdir(folder):
        if not file.lower().endswith(image_suffixes):
            continue

        image_path = os.path.join(folder, file)
        name, _extension = os.path.splitext(file)
        print(f"Processing: {file}")

        embedding = get_face_embedding(image_path)
        if embedding is None:
            print(f"[SKIPPED] {file}")
            continue
        database[name] = embedding

    return database

face_database = build_face_database(photo_dir)
print(f"✅ Loaded {len(face_database)} valid faces from Photo folder.")


In [None]:
# Get top N matches with cosine similarity
def get_top_matches(face_img, database, top_n=3):
    """Return the ``top_n`` database entries most similar to ``face_img``.

    Args:
        face_img: cropped face image (array).
        database: mapping of name -> reference embedding.
        top_n: maximum number of matches to return.

    Returns:
        A list of ``(name, similarity)`` tuples sorted from most to least
        similar. The list is empty when the crop is empty or the database
        has no entries.
    """
    # cv2.resize raises on an empty crop; there is nothing to match anyway.
    if face_img is None or face_img.size == 0 or not database:
        return []

    face_img = cv2.resize(face_img, (160, 160))
    embedding = embedder.embeddings([face_img])[0]
    embedding = l2_normalize(embedding)

    # One similarity call against all references instead of one per entry.
    names = list(database)
    scores = cosine_similarity([embedding], [database[name] for name in names])[0]

    similarities = sorted(zip(names, scores), key=lambda x: x[1], reverse=True)  # Higher is better
    return similarities[:top_n]

# Start webcam recognition
video = cv2.VideoCapture(0)
print("📷 Press 'q' to quit...")

threshold = 0.7  # Raised threshold
margin = 10      # pixels added around each detected box before embedding

# For every frame: detect faces, match each against face_database, and draw
# a green labelled box for matches above `threshold` or a red box otherwise.
# try/finally makes sure the camera and windows are released on any exit.
try:
    while True:
        ret, frame = video.read()
        if not ret:
            break

        results = detector.detect_faces(frame)

        for face in results:
            x, y, w, h = face['box']
            x, y = max(0, x), max(0, y)

            # Add margin to bounding box, clamped to the frame
            x1 = max(0, x - margin)
            y1 = max(0, y - margin)
            x2 = min(frame.shape[1], x + w + margin)
            y2 = min(frame.shape[0], y + h + margin)

            face_img = frame[y1:y2, x1:x2]
            if face_img.size == 0:
                # Degenerate box: nothing to embed, and resizing would raise.
                continue

            top_matches = get_top_matches(face_img, face_database)

            if top_matches and top_matches[0][1] > threshold:
                name = top_matches[0][0]
                similarity = top_matches[0][1]
                cv2.putText(frame, f"{name} ({similarity*100:.1f}%)", (x, y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

                print(f"\nTop 3 matches for face at ({x}, {y}):")
                for match_name, sim in top_matches:
                    print(f"  {match_name}: {sim * 100:.2f}% similarity")
            else:
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
                print(f"\n❌ No good match found for face at ({x}, {y}).")

        cv2.imshow('Face Recognition with MTCNN + FaceNet', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    video.release()
    cv2.destroyAllWindows()


In [4]:
from mtcnn import MTCNN
from keras_facenet import FaceNet
import cv2
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

# Initialize detector and embedder
# NOTE: this rebinds the module-level name `detector`, which the InsightFace
# cell earlier in this notebook bound to a FaceAnalysis instance. Re-run that
# cell before calling real_time_pipeline() / draw_result() again.
detector = MTCNN()
embedder = FaceNet()

def l2_normalize(x):
    """Scale ``x`` to unit Euclidean length.

    A zero vector is returned unchanged (as an array) instead of being
    divided by zero, which would produce NaNs in the similarity step.
    """
    norm = np.linalg.norm(x)
    if norm == 0:
        return np.asarray(x)
    return x / norm

# Folder containing known faces
photo_dir ="photos"

# Get embedding from image path
def get_face_embedding(img_path):
    """Return the L2-normalised FaceNet embedding of the first detected face.

    Args:
        img_path: path of the image file to read.

    Returns:
        The normalised embedding, or ``None`` when the image cannot be
        read, no face is detected, or the detected box lies outside the
        image.

    NOTE(review): the image is passed to the detector and embedder in
    OpenCV's BGR channel order. If this is ever switched to RGB, switch
    the webcam path too so both sides use the same order.
    """
    img = cv2.imread(img_path)
    if img is None:
        print(f"[WARNING] Failed to load image: {img_path}")
        return None

    results = detector.detect_faces(img)
    if len(results) == 0:
        print(f"[INFO] No face detected in: {img_path}")
        return None

    face = results[0]
    x, y, w, h = face['box']

    # Clamp all four edges. Clamping only x/y would keep the original
    # width/height and shift the far edge of the crop.
    x1, y1 = max(0, x), max(0, y)
    x2 = min(img.shape[1], x + w)
    y2 = min(img.shape[0], y + h)

    face_img = img[y1:y2, x1:x2]
    if face_img.size == 0:
        # cv2.resize raises on an empty crop.
        print(f"[INFO] No face detected in: {img_path}")
        return None
    face_img = cv2.resize(face_img, (160, 160))

    embedding = embedder.embeddings([face_img])[0]
    embedding = l2_normalize(embedding)
    return embedding

# Build the face database from photo directory
def build_face_database(folder):
    """Map each image's file stem in ``folder`` to its face embedding.

    Only ``.jpg``, ``.jpeg`` and ``.png`` files (case-insensitive) are
    processed. Files for which ``get_face_embedding`` returns ``None`` are
    reported and skipped. Files sharing a stem overwrite each other.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    database = {}

    for file in os.listdir(folder):
        if not file.lower().endswith(image_suffixes):
            continue

        image_path = os.path.join(folder, file)
        name, _extension = os.path.splitext(file)
        print(f"Processing: {file}")

        embedding = get_face_embedding(image_path)
        if embedding is None:
            print(f"[SKIPPED] {file}")
            continue
        database[name] = embedding

    return database

face_database = build_face_database(photo_dir)
print(f"✅ Loaded {len(face_database)} valid faces from Photo folder.")

# Get top N matches with cosine similarity
def get_top_matches(face_img, database, top_n=3):
    """Return the ``top_n`` database entries most similar to ``face_img``.

    Args:
        face_img: cropped face image (array).
        database: mapping of name -> reference embedding.
        top_n: maximum number of matches to return.

    Returns:
        A list of ``(name, similarity)`` tuples sorted from most to least
        similar. The list is empty when the crop is empty or the database
        has no entries.
    """
    # cv2.resize raises on an empty crop; there is nothing to match anyway.
    if face_img is None or face_img.size == 0 or not database:
        return []

    face_img = cv2.resize(face_img, (160, 160))
    embedding = embedder.embeddings([face_img])[0]
    embedding = l2_normalize(embedding)

    # One similarity call against all references instead of one per entry.
    names = list(database)
    scores = cosine_similarity([embedding], [database[name] for name in names])[0]

    similarities = sorted(zip(names, scores), key=lambda x: x[1], reverse=True)  # Higher is better
    return similarities[:top_n]

# Start webcam recognition with stillness detection
video = cv2.VideoCapture(0)
print("📷 Press 'q' to quit...")

# Parameters for stillness detection
still_threshold = 10  # max movement in pixels to consider still
still_duration = 2    # seconds to hold still before capture
margin = 10           # pixels added around the face box before embedding

last_face_pos = None
still_start_time = None
captured = False
top_matches = []

# Per frame: track the largest detected face. Once its box has barely moved
# for `still_duration` seconds, capture it once, match it against
# face_database, and keep showing the best match until the face moves or
# disappears. try/finally releases the camera and windows on any exit.
try:
    while True:
        ret, frame = video.read()
        if not ret:
            break

        results = detector.detect_faces(frame)

        if len(results) == 0:
            # No face detected, reset everything
            last_face_pos = None
            still_start_time = None
            captured = False
            top_matches = []
        else:
            # Only process the largest face (closest)
            largest_face = max(results, key=lambda f: f['box'][2] * f['box'][3])
            x, y, w, h = largest_face['box']
            x, y = max(0, x), max(0, y)

            # Movement = summed absolute change of x, y, w, h since last frame
            if last_face_pos is not None:
                lx, ly, lw, lh = last_face_pos
                movement = abs(x - lx) + abs(y - ly) + abs(w - lw) + abs(h - lh)
            else:
                movement = None

            if movement is not None and movement < still_threshold:
                if still_start_time is None:
                    still_start_time = time.time()
                else:
                    elapsed = time.time() - still_start_time

                    if elapsed < still_duration:
                        # Countdown is shown only while still waiting.
                        # Drawing it after the capture produced "0s" and
                        # negative values behind the match label.
                        remaining = int(still_duration - elapsed) + 1
                        countdown_label = f"Hold still... {remaining}s"
                        (label_width, label_height), baseline = cv2.getTextSize(
                            countdown_label, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
                        cv2.rectangle(frame, (x, y - label_height - baseline - 10),
                                      (x + label_width, y), (0, 255, 255), cv2.FILLED)
                        cv2.putText(frame, countdown_label, (x, y - 5),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)
                    elif not captured:
                        # Capture face image with margin, clamped to the frame
                        x1 = max(0, x - margin)
                        y1 = max(0, y - margin)
                        x2 = min(frame.shape[1], x + w + margin)
                        y2 = min(frame.shape[0], y + h + margin)
                        face_img = frame[y1:y2, x1:x2]

                        # Compare with database and get top matches; an
                        # empty crop cannot be resized, so treat it as no match.
                        if face_img.size == 0:
                            top_matches = []
                        else:
                            top_matches = get_top_matches(face_img, face_database)

                        print(f"\nCaptured face after being still for {still_duration} seconds:")
                        for match_name, sim in top_matches:
                            print(f"  {match_name}: {sim * 100:.2f}% similarity")

                        captured = True
            else:
                # Movement too big or first detection: reset timer and capture flag
                still_start_time = None
                captured = False
                top_matches = []

            last_face_pos = (x, y, w, h)

            # Draw bounding box and label
            if captured and top_matches:
                # Green box with best match label
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                best_name, best_sim = top_matches[0]
                label = f"{best_name} ({best_sim*100:.1f}%)"
                (label_width, label_height), baseline = cv2.getTextSize(
                    label, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
                cv2.rectangle(frame, (x, y - label_height - baseline - 5),
                              (x + label_width, y), (0, 255, 0), cv2.FILLED)
                cv2.putText(frame, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 2)
            else:
                # Yellow box while waiting
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 255), 2)

        cv2.imshow('Face Recognition with MTCNN + FaceNet', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    video.release()
    cv2.destroyAllWindows()



Processing: 24WMA08802.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Processing: 24WMA08803.jpg
[INFO] No face detected in: photos\24WMA08803.jpg
[SKIPPED] 24WMA08803.jpg
Processing: 24WMH08807.jpg
[INFO] No face detected in: photos\24WMH08807.jpg
[SKIPPED] 24WMH08807.jpg
Processing: 24WMR08820.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Processing: 24WMR08821.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Processing: 24WMR08822.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Processing: 24WMR08824.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Processing: 24WMR08826.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Processing: 24WMR08827.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Processing: 24WMR08828.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Proce

# Barcode Detection - Gan Khai Li

# Barcode Detection - Kit Chin Jie Ying

# Real Time Human Tracking

In [None]:
# !pip install ultralytics deep-sort-realtime

In [6]:
import time
import cv2 as cv
import torch
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort

# Configuration
model_path = "yolov8n.pt"   # YOLO weights file loaded below
threshold = 0.4             # passed to model.predict as the confidence cut-off
max_frames = 300            # NOTE(review): not referenced by the loop in this cell

# Load YOLO model on the GPU when torch reports CUDA as available, else CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = YOLO(model_path)
model.to(device)

# Deep SORT tracker
# `half` follows CUDA availability; `bgr=True` matches the OpenCV frames
# that are handed to update_tracks() in the loop.
tracker = DeepSort(
    max_age=30,
    n_init=3,
    max_iou_distance=0.7,
    nms_max_overlap=1.0,
    max_cosine_distance=0.2,
    nn_budget=None,
    embedder="mobilenet",
    half=torch.cuda.is_available(),
    bgr=True,
)

def draw_box_with_label(img, tlbr, label, color=(0, 255, 0)):
    """Draw a box and a filled caption strip with ``label`` onto ``img``.

    Args:
        img: image to draw on (modified in place).
        tlbr: box as (left, top, right, bottom); values are cast to int.
        label: caption text, rendered in black on a ``color`` background
            directly above the box.
        color: colour of the box outline and caption background.
    """
    left, top, right, bottom = (int(v) for v in tlbr)
    font, scale, thickness = cv.FONT_HERSHEY_SIMPLEX, 0.6, 2

    cv.rectangle(img, (left, top), (right, bottom), color, 2)

    (text_w, text_h), _ = cv.getTextSize(label, font, scale, thickness)
    strip_top_left = (left, top - text_h - 6)
    strip_bottom_right = (left + text_w + 6, top)
    cv.rectangle(img, strip_top_left, strip_bottom_right, color, -1)
    cv.putText(img, label, (left + 3, top - 6), font, scale, (0, 0, 0), thickness)

# Initialize variable
video = cv.VideoCapture(0)
fps_smooth = None
person_class_id = 0

# Per frame: detect people with YOLO, feed the detections to Deep SORT, draw
# every confirmed track that was updated this frame, and overlay a smoothed
# FPS figure. try/finally releases the camera and windows on any exit.
try:
    while True:
        ret, frame = video.read()

        if not ret:
            break

        frame = cv.flip(frame, 1)
        t0 = time.time()

        # Detect person
        results = model.predict(
            frame,
            conf=threshold,
            iou=0.45,
            verbose=False,
            classes=[person_class_id],
            device=device
        )

        # Convert the YOLO detections.
        # update_tracks expects each box as [left, top, width, height];
        # passing YOLO's corner format [x1, y1, x2, y2] makes the tracker
        # read x2/y2 as a size and produces oversized, drifting boxes.
        detections = []
        if len(results):
            r = results[0]
            if r.boxes is not None and len(r.boxes) > 0:
                for box in r.boxes:
                    x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().tolist()
                    conf = float(box.conf[0].cpu().numpy())
                    detections.append([[x1, y1, x2 - x1, y2 - y1], conf, "person"])

        # Update tracker
        tracks = tracker.update_tracks(detections, frame=frame)

        for trk in tracks:
            # Skip tentative tracks and tracks not matched in this frame
            if not trk.is_confirmed() or trk.time_since_update > 0:
                continue
            track_id = trk.track_id
            tlbr = trk.to_tlbr()
            label = f"ID {track_id}"
            draw_box_with_label(frame, tlbr, label, color=(0, 255, 0))

        # FPS (detection + tracking time only), exponentially smoothed
        dt = time.time() - t0
        fps = 1.0 / dt if dt > 0 else 0.0
        fps_smooth = fps if fps_smooth is None else fps_smooth * 0.9 + fps * 0.1
        cv.putText(frame, f"FPS: {fps_smooth:.1f}", (10, 30),
                    cv.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

        cv.imshow("Real Time Human Tracking", frame)
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    video.release()
    cv.destroyAllWindows()