In [13]:
%pip install facenet-pytorch pandas tqdm scikit-learn opencv-python opencv-contrib-python

Collecting opencv-contrib-python
  Obtaining dependency information for opencv-contrib-python from https://files.pythonhosted.org/packages/f3/78/b504ca8f7a312918d184e0b8093c62bc9a110d8154f658b591ef5c020d65/opencv_contrib_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata
  Downloading opencv_contrib_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Downloading opencv_contrib_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl (46.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: opencv-contrib-python
Successfully installed opencv-contrib-python-4.11.0.86

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to rest

In [6]:
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from torch.utils.data import DataLoader
from torchvision import datasets

import numpy as np
from tqdm import tqdm
import random
from torchvision import transforms

RNG_SEED = 42

random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)
np.random.seed(RNG_SEED)


In [2]:
# Check M1 support
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

Running on device: mps


# Define MTCNN baseline
We use the default params for now

In [7]:
mtcnn = MTCNN(
    image_size=160, margin=0, min_face_size=20,
    thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=True,
    keep_all=True, device=device
)

In [8]:
# Need to be changed
DATA_DIR = "./data"
TRAIN_TEST_SPLIT = 0.8

In [8]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
])

tensor_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])


def collate_fn(batch):
    images, labels = zip(*batch)
    return list(images), list(labels)

# Dataset for training
dataset = datasets.ImageFolder(f"{DATA_DIR}", transform=transform)
dataset_train, dataset_test = torch.utils.data.random_split(dataset, [TRAIN_TEST_SPLIT, 1 - TRAIN_TEST_SPLIT])
loader_train = DataLoader(dataset_train, collate_fn=collate_fn, batch_size=8, shuffle=True)

# Dataset for testing
loader_test = DataLoader(dataset_test, collate_fn=collate_fn, batch_size=8, shuffle=True,)
tensor_loader = DataLoader(dataset_test, collate_fn=collate_fn, batch_size=8, shuffle=True)


## Testing MTCNN accuracy with manually labelled data

In [12]:
count = 0
test_count = 0
false_positives = 0
false_negatives = 0

# mtcnn = MTCNN(factor=0.95)

for X, Y in tqdm(loader_test):
    for i in range(0, len(X)):
        x = X[i]
        y = Y[i]

        x_aligned, probs = mtcnn.detect(x)

        if x_aligned is not None:
            if y == 0:  
                count += 1 
        else:
            if y == 1:  
                count += 1 
                

print(f"{(count / 10000):8f}")

100%|██████████| 10278/10278 [22:16<00:00,  7.69it/s]

6.894200





## Training SVM Model


In [4]:
import os
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score


In [36]:
def preprocess_data(loader):
    X = []
    y = []
    for images, labels in tqdm(loader, desc="Flattening data"):
        # Convert images to numpy arrays and flatten
        images_flat = [np.array(img).flatten() for img in images]
        X.extend(images_flat)
        y.extend(labels)
    return np.array(X), np.array(y)

In [37]:
X_train, y_train = preprocess_data(loader_train)
X_test, y_test = preprocess_data(loader_test)

Flattening data: 100%|██████████| 8223/8223 [06:20<00:00, 21.60it/s]
Flattening data: 100%|██████████| 2056/2056 [01:37<00:00, 21.16it/s]


In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM Model 
svm_model = svm.SVC(kernel="linear")
svm_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = svm_model.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


## Training Logistic Regression

### Preprocess data

In [63]:
BATCH_SIZE = 8
IMG_SIZE = 256
NUM_CLASSES = 2
BOVW_CLUSTERS = 500

# We'll do basic transforms: resize + tensor + normalization.
img_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

# Get datasets & loaders
dataset = datasets.ImageFolder(f"{DATA_DIR}", transform=img_transforms)
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [TRAIN_TEST_SPLIT, 1 - TRAIN_TEST_SPLIT])
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

### Get SIFT features

In [65]:
import cv2

# Create SIFT extractor
sift = cv2.SIFT_create()

In [66]:
def tensor_to_opencv_img(tensor_img):
    """
    Convert a single image from a PyTorch tensor (C,H,W) to a NumPy array (H,W) or (H,W,3).
    We'll convert to grayscale for SIFT.
    """
    # tensor_img shape: (3, H, W) if color
    # Move to CPU, convert to numpy
    img_np = tensor_img.cpu().numpy()

    # img_np shape is (3, H, W). We can convert to (H, W, 3) by transposing
    img_np = np.transpose(img_np, (1, 2, 0))  # (H, W, 3)

    # Convert to uint8 [0..255] if necessary
    img_np = (img_np * 255.0).clip(0, 255).astype(np.uint8)

    # Convert to grayscale
    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
    return gray

def extract_descriptors_from_dataloader(dataloader):
    """
    Loop through an entire DataLoader, extract SIFT descriptors for each image.
    """
    descriptors_per_image = []
    labels_list = []

    # Assume we already have train_loader that yields (images, labels)
    for images, labels in tqdm(dataloader):
        # images shape: (batch_size, 3, H, W)
        # labels shape: (batch_size,)
        batch_size = images.size(0)
        for i in range(batch_size):
            # Convert one image to grayscale OpenCV format
            gray_img = tensor_to_opencv_img(images[i])
            # Extract SIFT descriptors
            kp, desc = sift.detectAndCompute(gray_img, None)
            if desc is not None:
                descriptors_per_image.append(desc)
            else:
                # Some images might have no descriptors
                descriptors_per_image.append(np.zeros((0,128), dtype=np.float32))

            # We also keep the label so we can match it up later
            labels_list.append(labels[i].item())

    return descriptors_per_image, labels_list

def build_bovw_histogram(descriptors, kmeans_model):
    """
    Given SIFT descriptors (num_keypoints,128) for ONE image,
    assign each descriptor to the nearest cluster and build a histogram of size BOVW_CLUSTERS.
    """
    hist = np.zeros((BOVW_CLUSTERS,), dtype=np.float32)
    if descriptors is None or len(descriptors) == 0:
        return hist  # no keypoints => zero histogram

    words = kmeans_model.predict(descriptors)
    for w in words:
        hist[w] += 1

    return hist

In [67]:
print("Collecting SIFT descriptors from train_loader...")
all_descriptors, all_labels = extract_descriptors_from_dataloader(train_loader)
print(f"Collected descriptors from {len(all_descriptors)} training images.")

# Stack all descriptors into one large array for K-Means (excluding empty ones)
desc_nonempty = [d for d in all_descriptors if d.shape[0] > 0]
if len(desc_nonempty) > 0:
    all_train_desc = np.vstack(desc_nonempty)
else:
    all_train_desc = np.zeros((0, 128), dtype=np.float32)

Collecting SIFT descriptors from train_loader...


  0%|          | 0/8223 [00:00<?, ?it/s]python(33412) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(33416) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
100%|██████████| 8223/8223 [05:32<00:00, 24.76it/s]


Collected descriptors from 65777 training images.


In [68]:
from sklearn.cluster import KMeans

if all_train_desc.shape[0] == 0:
    print("No descriptors found in training set! Can't build K-Means.")
    exit()

print(f"Running K-Means on {all_train_desc.shape[0]} descriptors with {BOVW_CLUSTERS} clusters...")
kmeans = KMeans(n_clusters=BOVW_CLUSTERS, random_state=RNG_SEED, verbose=1)
kmeans.fit(all_train_desc)
print("K-Means done.")

Running K-Means on 18050453 descriptors with 500 clusters...
Initialization complete
Iteration 0, inertia 1849718472704.0.
Iteration 1, inertia 1344751009792.0.
Iteration 2, inertia 1320826961920.0.
Iteration 3, inertia 1310717640704.0.
Iteration 4, inertia 1304969609216.0.
Iteration 5, inertia 1301103771648.0.
Iteration 6, inertia 1298242732032.0.
Iteration 7, inertia 1295986327552.0.
Iteration 8, inertia 1294159314944.0.
Iteration 9, inertia 1292659982336.0.
Iteration 10, inertia 1291431444480.0.
Iteration 11, inertia 1290394271744.0.
Iteration 12, inertia 1289526968320.0.
Iteration 13, inertia 1288796504064.0.
Iteration 14, inertia 1288169717760.0.
Iteration 15, inertia 1287635599360.0.
Iteration 16, inertia 1287166885888.0.
Iteration 17, inertia 1286761611264.0.
Iteration 18, inertia 1286393823232.0.
Iteration 19, inertia 1286061162496.0.
Iteration 20, inertia 1285761269760.0.
Iteration 21, inertia 1285486018560.0.
Iteration 22, inertia 1285236981760.0.
Iteration 23, inertia 128500

In [69]:
from sklearn.preprocessing import normalize

train_histograms = []
train_labels = []

idx = 0
print("Building BoVW histograms for training set...")
for desc in tqdm(all_descriptors):
    hist = build_bovw_histogram(desc, kmeans)
    train_histograms.append(hist)
    train_labels.append(all_labels[idx])
    idx += 1

train_histograms = np.array(train_histograms, dtype=np.float32)
train_labels = np.array(train_labels, dtype=np.int64)

# (Optional) Normalize histograms
train_histograms = normalize(train_histograms, norm='l2', axis=1)

print("Train BoVW shape:", train_histograms.shape)  # (num_train_images, NUM_CLUSTERS)


Building BoVW histograms for training set...


100%|██████████| 65777/65777 [00:54<00:00, 1212.75it/s]


Train BoVW shape: (65777, 500)


### Define the Logistic Regression Model

In [70]:
from sklearn.linear_model import LogisticRegression

print("Training Logistic Regression on BoVW histograms...")
clf = LogisticRegression(max_iter=1000)
clf.fit(train_histograms, train_labels)
print("Logistic Regression training complete.")


Training Logistic Regression on BoVW histograms...
Logistic Regression training complete.


In [71]:
print("Extracting SIFT descriptors from test_loader...")
test_descriptors_list, test_labels_list = extract_descriptors_from_dataloader(test_loader)

print("Building BoVW histograms for the test set...")
test_histograms = []
for desc in tqdm(test_descriptors_list):
    hist = build_bovw_histogram(desc, kmeans)
    test_histograms.append(hist)

test_histograms = np.array(test_histograms, dtype=np.float32)
test_histograms = normalize(test_histograms, norm='l2', axis=1)
test_labels = np.array(test_labels_list, dtype=np.int64)

print("Predicting on test histograms...")
test_preds = clf.predict(test_histograms)

Extracting SIFT descriptors from test_loader...


  0%|          | 0/2056 [00:00<?, ?it/s]python(33991) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(33995) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
100%|██████████| 2056/2056 [01:33<00:00, 22.06it/s]


Building BoVW histograms for the test set...


100%|██████████| 16443/16443 [00:07<00:00, 2184.94it/s]

Predicting on test histograms...





In [72]:
accuracy = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 70.43%


## Training Gaussian Mixture model


In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler

In [None]:
# digits = load_digits()
X, y = [], []
for images, labels in tqdm(tensor_loader):
    # Flatten images to shape
    images_flat = [img.numpy().transpose(1, 2, 0).flatten() for img in images]
    X.extend(images_flat)
    y.extend(labels)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = np.asarray(y, dtype=int)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

n_classes = len(np.unique(y_train))
gmm_models = []

In [None]:
for label in tqdm(range(n_classes)):
    X_class = X_train[y_train == label]
    
    gmm = GaussianMixture(n_components=n_classes, covariance_type='full', random_state=42)
    gmm.fit(X_class)
    gmm_models.append(gmm)

# gmm = GaussianMixture(n_components=len(idx_to_class), random_state=42)
# gmm.fit(X_train)

# # Step 5: Predict Labels
# y_pred = gmm.predict(X_test)

In [None]:
y_pred = []

for sample in tqdm(X_test):
    likelihoods = gmm.score_samples(sample.reshape(1, -1))
    
    y_pred.append(np.argmax(likelihoods))
    
    
accuracy = accuracy_score(y_test, y_pred)

accuracy

print(accuracy)

# Labeling all images using MTCNN



In [None]:
from PIL import Image

result = []

def chunks(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]
        
data_dir = "data/"
batch_size = 10
all_files = [
    os.path.join(root, f)
    for root, _, files in os.walk(data_dir)
    for f in files if f.endswith("webp")
]
batches = list(chunks(all_files, batch_size))

for batch in tqdm(batches):
    for i in range(len(batch)):
        if batch[i].endswith("webp"):
            image_path = os.path.join(data_dir, batch[i])
            try:
                img = Image.open(image_path).convert('RGB')
                boxes, probs = mtcnn.detect(img)
                
                label = "face" if boxes is not None else "no face"
                
                # Append result
                result.append({
                    "filename": batch[i],
                    "label": label
                })

            except Exception as e:
                print("Error processing {batch[i]}: {e}")

  0%|          | 21/7219 [00:28<2:43:06,  1.36s/it]


KeyboardInterrupt: 

In [None]:
import csv 

output_csv = 'mtcnn_labels.csv'

with open(output_csv, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["filename", "label"])
    writer.writeheader()
    for result in result:
        writer.writerow(result)

In [None]:

# Fit the scaler on smaller chunks of the training data
batch_size = 1000  # Adjust based on your available memory
for i in tqdm(range(0, len(X_train), batch_size)):
    batch = X_train[i:i + batch_size]
    scaler.partial_fit(batch)  # Fit incrementally using partial batches

# Transform training and test data in chunks
def transform_in_batches(data, batch_size, scaler):
    transformed_data = []
    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i:i + batch_size]
        transformed_batch = scaler.transform(batch)
        transformed_data.append(transformed_batch)
    return np.vstack(transformed_data)

X_train_scaled = transform_in_batches(X_train, batch_size, scaler)
X_test_scaled = transform_in_batches(X_test, batch_size, scaler)