# Simple Face Recognition using Euclidean Distance

This notebook implements a basic face recognition system using raw pixel values and Euclidean distance for the LFW (Labeled Faces in the Wild) dataset.

## Approach

1. **Feature Extraction**: Raw pixel values from grayscale images, resized to 100x100 and lightly blurred (3x3 Gaussian), scaled to [0,1] and then z-score normalized per image
2. **Distance Metric**: Euclidean distance between flattened feature vectors
3. **Decision Rule**: Classify as same person if distance < threshold, different person otherwise
4. **Threshold Optimization**: Find optimal threshold using development data

## Key Components

- `EuclideanFaceRecognition`: Main class handling image loading, feature extraction, and verification
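- `find_optimal_threshold()`: Method of `EuclideanFaceRecognition` that searches a grid of distance thresholds and keeps the one with the highest accuracy on development pairs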
- `evaluate_face_recognition()`: Computes accuracy, precision, recall, and F1-score

## Dataset Structure

- Images organized in folders by person name
- Pairs files specify which image pairs to compare
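- Format: the first line is a header (`total_pairs` or `num_folds pairs_per_fold`); every following line is `person1 img1 person2 img2` (different persons) or `person img1 img2` (same person)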

In [44]:
import numpy as np
import cv2
import os
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import pandas as pd

In [None]:
class EuclideanFaceRecognition:
    """Face verification from raw pixels compared with Euclidean distance.

    Every image is resized to 100x100, converted to grayscale, blurred with a
    3x3 Gaussian, flattened, scaled by 1/255 and finally z-scored.  Two images
    are declared "same person" when the distance between their feature vectors
    is below a threshold.
    """

    def __init__(self, dataset_path="FaceRecognitionDset/lfw_funneled"):
        """Remember the dataset root; images live in ``<root>/<person>/<person>_NNNN.jpg``."""
        self.dataset_path = dataset_path
        # Not read or written by any method of this class; kept so that code
        # touching the attribute keeps working.
        self.face_features = {}

    def load_image(self, image_path):
        """Load one image and return it as a flat float32 vector of 100*100 gray values.

        Returns None (after printing a message) when the file cannot be read
        or OpenCV fails while processing it.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread reports an unreadable file by returning None
                # instead of raising, so handle it explicitly.
                print(f"Error loading image {image_path}: file could not be read")
                return None

            img = cv2.resize(img, (100, 100))  # Smaller size to reduce noise
            # imread yields BGR, so convert straight to gray; the former
            # BGR -> RGB -> GRAY detour applied the same channel weights.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Apply Gaussian blur to reduce noise
            img = cv2.GaussianBlur(img, (3, 3), 0)

            return img.flatten().astype(np.float32)  # Flatten to 1D array
        except Exception as e:
            # Best effort: a single bad image must not abort a whole run.
            print(f"Error loading image {image_path}: {e}")
            return None

    def extract_features(self, person_name, image_num):
        """Return the z-scored pixel vector for ``<person_name>_<image_num:04d>.jpg``.

        Returns None when the file does not exist or cannot be loaded.
        """
        image_path = os.path.join(self.dataset_path, person_name,
                                  f"{person_name}_{image_num:04d}.jpg")
        if not os.path.exists(image_path):
            return None

        features = self.load_image(image_path)
        if features is None:
            return None

        # Scale to [0, 1], then z-score (zero mean, unit std); the epsilon
        # avoids a division by zero on a constant image.
        features = features / 255.0
        return (features - features.mean()) / (features.std() + 1e-8)

    def euclidean_distance(self, features1, features2):
        """Return the Euclidean distance, or ``inf`` when either vector is None."""
        if features1 is None or features2 is None:
            return float('inf')
        return np.linalg.norm(features1 - features2)

    def verify_pair(self, person1, img1, person2, img2, threshold=0.6):
        """Return True when the two images are closer than ``threshold``.

        A pair with a missing/unreadable image is reported as False.

        The default threshold is kept for backward compatibility only: the
        distances this notebook prints are on the order of 100, so callers
        should pass the value returned by ``find_optimal_threshold``.
        """
        features1 = self.extract_features(person1, img1)
        features2 = self.extract_features(person2, img2)

        if features1 is None or features2 is None:
            return False

        return bool(self.euclidean_distance(features1, features2) < threshold)

    @staticmethod
    def _pair_lines(lines):
        """Return the lines that describe pairs, dropping a leading count header."""
        if lines:
            header = lines[0].split()
            # The header ("total" or "folds pairs_per_fold") holds only
            # integers, whereas a pair line is expected to start with a
            # person name, so an all-digit first line is taken as the header.
            if not header or all(token.isdigit() for token in header):
                return lines[1:]
        return lines

    def _collect_distances(self, pairs_file, max_pairs):
        """Read up to ``max_pairs // 2`` usable pairs per class and return their distances.

        Returns ``(distances_same, distances_diff)``.  Pairs whose images
        cannot be loaded are skipped and do not count towards the limit.
        """
        with open(pairs_file, 'r') as f:
            lines = f.readlines()

        per_class = max_pairs // 2
        distances_same = []
        distances_diff = []

        for line in self._pair_lines(lines):
            if len(distances_same) >= per_class and len(distances_diff) >= per_class:
                break

            parts = line.split()
            if len(parts) == 3 and len(distances_same) < per_class:
                # Same person pair: name, image number, image number
                person, img1, img2 = parts
                person1 = person2 = person
                bucket = distances_same
            elif len(parts) == 4 and len(distances_diff) < per_class:
                # Different person pair: name, image number, name, image number
                person1, img1, person2, img2 = parts
                bucket = distances_diff
            else:
                continue

            features1 = self.extract_features(person1, int(img1))
            features2 = self.extract_features(person2, int(img2))
            if features1 is not None and features2 is not None:
                bucket.append(self.euclidean_distance(features1, features2))

        return distances_same, distances_diff

    def find_optimal_threshold(self, pairs_file, max_pairs=100):
        """Find the distance threshold that maximizes accuracy on ``pairs_file``.

        Up to ``max_pairs // 2`` same-person and ``max_pairs // 2``
        different-person pairs are used.  100 evenly spaced thresholds between
        0.8 * (smallest distance) and 1.2 * (largest distance) are scored, and
        the lowest threshold reaching the best accuracy is returned.

        Raises:
            ValueError: if no usable same-person pair or no usable
                different-person pair was found, because a threshold cannot
                be fitted without examples of both classes.
        """
        distances_same, distances_diff = self._collect_distances(pairs_file, max_pairs)

        print(f"Processed {len(distances_same)} same-person pairs, {len(distances_diff)} different-person pairs")

        if not distances_same or not distances_diff:
            raise ValueError(
                "Need at least one usable same-person pair and one usable "
                "different-person pair to fit a threshold "
                f"(got {len(distances_same)} and {len(distances_diff)}) in {pairs_file!r}"
            )

        print(f"Same person distances: mean={np.mean(distances_same):.3f}, std={np.std(distances_same):.3f}")
        print(f"Different person distances: mean={np.mean(distances_diff):.3f}, std={np.std(distances_diff):.3f}")

        # Candidate thresholds span slightly beyond the observed distances
        min_dist = min(min(distances_same), min(distances_diff))
        max_dist = max(max(distances_same), max(distances_diff))
        thresholds = np.linspace(min_dist * 0.8, max_dist * 1.2, 100)

        same = np.asarray(distances_same)
        diff = np.asarray(distances_diff)
        # Row i counts the pairs classified correctly at thresholds[i]:
        # same-person pairs must be < threshold, different-person pairs >= it.
        correct = ((same[None, :] < thresholds[:, None]).sum(axis=1)
                   + (diff[None, :] >= thresholds[:, None]).sum(axis=1))
        accuracies = correct / (same.size + diff.size)

        # argmax returns the first maximum, i.e. the lowest best threshold
        best_idx = int(np.argmax(accuracies))
        best_accuracy = accuracies[best_idx]
        if best_accuracy > 0:
            best_threshold = thresholds[best_idx]
        else:
            # No candidate classified anything correctly: fall back to the
            # midpoint between the two class means.
            best_threshold = np.mean([np.mean(distances_same), np.mean(distances_diff)])

        print(f"Optimal threshold: {best_threshold:.3f} (accuracy: {best_accuracy:.3f})")
        return float(best_threshold)

In [46]:
def evaluate_face_recognition(recognizer, pairs_file, threshold, max_pairs=200):
    """Evaluate face verification on the pairs listed in ``pairs_file``.

    Args:
        recognizer: object exposing ``verify_pair(person1, img1, person2, img2, threshold)``.
        pairs_file: text file with an optional count header followed by one
            pair per line; 3 fields mean same person, 4 fields mean different
            persons.
        threshold: distance threshold forwarded to ``verify_pair``.
        max_pairs: at most ``max_pairs // 2`` pairs of each kind are evaluated.

    Returns:
        ``(accuracy, precision, recall, f1)``.  Each metric is 0 when its
        denominator is 0 (for example when the file contains no pairs).

    Every evaluated pair counts with whatever ``verify_pair`` returned for it.
    """
    with open(pairs_file, 'r') as f:
        lines = f.readlines()

    # Drop the count header ("total" or "folds pairs_per_fold") when present.
    # It holds only integers, whereas a pair line is expected to start with a
    # person name, so an all-digit first line is taken as the header.
    if lines:
        header = lines[0].split()
        if not header or all(token.isdigit() for token in header):
            lines = lines[1:]

    per_class = max_pairs // 2
    same_count = 0
    diff_count = 0
    tp = fp = fn = tn = 0

    for line in lines:
        if same_count >= per_class and diff_count >= per_class:
            break

        parts = line.split()

        if len(parts) == 3 and same_count < per_class:
            # Same person pair (ground truth: True)
            person, img1, img2 = parts
            if recognizer.verify_pair(person, int(img1), person, int(img2), threshold):
                tp += 1
            else:
                fn += 1
            same_count += 1

        elif len(parts) == 4 and diff_count < per_class:
            # Different person pair (ground truth: False)
            person1, img1, person2, img2 = parts
            if recognizer.verify_pair(person1, int(img1), person2, int(img2), threshold):
                fp += 1
            else:
                tn += 1
            diff_count += 1

    # Derive every metric from the confusion counts, with the same
    # zero-denominator guard throughout.
    total = same_count + diff_count
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Evaluation Results:")
    print(f"Tested {same_count} same-person pairs, {diff_count} different-person pairs")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-Score: {f1:.3f}")
    print(f"True Positives: {tp}, False Positives: {fp}")
    print(f"True Negatives: {tn}, False Negatives: {fn}")

    return accuracy, precision, recall, f1

In [47]:
# Build the recognizer with its default dataset location
recognizer = EuclideanFaceRecognition()

# Tune the decision threshold on the development training pairs
optimal_threshold = recognizer.find_optimal_threshold(
    pairs_file="FaceRecognitionDset/pairsDevTrain.txt",
    max_pairs=100,
)

Processed 50 same-person pairs, 50 different-person pairs
Same person distances: mean=120.793, std=27.523
Different person distances: mean=128.519, std=19.252
Optimal threshold: 111.657 (accuracy: 0.570)


In [48]:
# Score the tuned threshold on the development test pairs
print("Testing on development test set:")
dev_metrics = evaluate_face_recognition(
    recognizer,
    "FaceRecognitionDset/pairsDevTest.txt",
    optimal_threshold,
    max_pairs=200,
)
dev_accuracy, dev_precision, dev_recall, dev_f1 = dev_metrics

Testing on development test set:
Evaluation Results:
Tested 100 same-person pairs, 100 different-person pairs
Accuracy: 0.595
Precision: 0.686
Recall: 0.350
F1-Score: 0.464
True Positives: 35, False Positives: 16
True Negatives: 84, False Negatives: 65


In [49]:
# Score the same threshold on pairs drawn from the full evaluation file
print("\nTesting on full evaluation set:")
full_metrics = evaluate_face_recognition(
    recognizer,
    "FaceRecognitionDset/pairs.txt",
    optimal_threshold,
    max_pairs=400,
)
full_accuracy, full_precision, full_recall, full_f1 = full_metrics


Testing on full evaluation set:
Evaluation Results:
Tested 200 same-person pairs, 200 different-person pairs
Accuracy: 0.610
Precision: 0.729
Recall: 0.350
F1-Score: 0.473
True Positives: 70, False Positives: 26
True Negatives: 174, False Negatives: 130


In [50]:
# Example: spot-check two hand-picked pairs with the tuned threshold
print("\nManual verification examples:")

# --- Two images of the same person ---
person = "George_W_Bush"
img1, img2 = 1, 2
is_same = recognizer.verify_pair(person, img1, person, img2, optimal_threshold)
features1, features2 = (recognizer.extract_features(person, num) for num in (img1, img2))
if not (features1 is None or features2 is None):
    # Recompute the distance so it can be shown next to the decision
    distance = recognizer.euclidean_distance(features1, features2)
    print(f"Same person ({person}): Distance = {distance:.3f}, Verified = {is_same}")

# --- First image of two different people ---
person1, person2 = "George_W_Bush", "Tony_Blair"
img1 = img2 = 1
is_same = recognizer.verify_pair(person1, img1, person2, img2, optimal_threshold)
features1 = recognizer.extract_features(person1, img1)
features2 = recognizer.extract_features(person2, img2)
if not (features1 is None or features2 is None):
    distance = recognizer.euclidean_distance(features1, features2)
    print(f"Different persons ({person1} vs {person2}): Distance = {distance:.3f}, Verified = {is_same}")


Manual verification examples:
Same person (George_W_Bush): Distance = 122.061, Verified = False
Different persons (George_W_Bush vs Tony_Blair): Distance = 110.591, Verified = True
