# Face and Gestures Analysis: Face Identification Challenge

Students:
* Manuel Parma (255570)
* Àlex Montoya (242873)
* Marina Riba (229240)

## Set up and imports

In [None]:
import os
import numpy as np
from imageio.v2 import imread
from scipy.io import loadmat
import itertools
import cv2 as cv
import torch
import torch.nn as nn
import torchvision.transforms as tf
import matplotlib.pyplot as plt
from PIL import Image

## Face detection algorithm from Lab 1

We use the Viola-Jones detector that we fine-tuned during Lab 1 to crop each input image down to the detected face.

In [None]:
# face recognition code
# The Haar cascade file is looked up inside the installed cv2 package directory.
classifier_file = os.path.dirname(cv.__file__) + "/data/haarcascade_frontalface_alt.xml"
face_cascade = cv.CascadeClassifier(classifier_file)

def face_detection(img, scaleFactor=1.1, minNeighbors=6, minSize=(100, 100)):
    """
    Method for detecting faces in an image using the Viola-Jones algorithm.

    :param img: Image data (array) to detect the faces on. Arrays with three
        dimensions are converted to grey before detection; anything else is
        passed to the detector unchanged.
    :param scaleFactor: Forwarded to ``detectMultiScale``.
    :param minNeighbors: Forwarded to ``detectMultiScale``.
    :param minSize: Forwarded to ``detectMultiScale`` as a tuple; any
        two-element sequence is accepted.
    :return: Cropped image on the largest detected face, None if no face is detected
    """
    if len(img.shape) == 3:
        # convert to grey image
        # NOTE(review): this assumes BGR channel order, while the callers in this
        # notebook load images with imageio (presumably RGB) - confirm whether
        # COLOR_RGB2GRAY is the intended conversion.
        gray_image = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    else:
        gray_image = img

    # we run multiple tests for the three parameters and got the best performance
    # (approx. 85% accuracy) with these values.
    faces = face_cascade.detectMultiScale(gray_image,
                                          scaleFactor=scaleFactor,
                                          minNeighbors=minNeighbors,
                                          minSize=tuple(minSize))

    if len(faces) == 0:
        return None

    # we keep the biggest bounding box (area = width * height); on ties the
    # first one reported by the detector wins
    x, y, w, h = max(faces, key=lambda rect: rect[2] * rect[3])

    # crop the ORIGINAL image (not the grey copy) to that box
    return img[y:y+h, x:x+w]

## Data Augmentation and Cleaning

In [None]:
# Read the annotation file and keep only the entry stored under the
# 'AGC_Challenge3_TRAINING' key, with singleton dimensions squeezed away.
AGC_Challenge3_TRAINING = np.squeeze(
    loadmat("AGC_Challenge3_Training.mat")['AGC_Challenge3_TRAINING']
)

# Each 'imageName' entry is itself a small sequence: flatten one level.
imageName = [
    name
    for entry in AGC_Challenge3_TRAINING['imageName']
    for name in entry
]

# Same for the identities: join all the per-image arrays into one flat list.
ids = np.concatenate(list(AGC_Challenge3_TRAINING['id'])).ravel().tolist()

In [None]:
def group_by_labels(images, labels):
    """
    Group image files by their label.

    :param images: Iterable of image file names.
    :param labels: Iterable of labels, paired position by position with ``images``
        (pairing stops at the shorter of the two).
    :return: dict mapping each label to the list of its image files, with labels
        and files kept in order of first appearance.
    """
    grouped = {}

    for imagefile, label in zip(images, labels):
        # create the bucket the first time a label shows up, then append to it
        grouped.setdefault(label, []).append(imagefile)

    return grouped

# label -> list of image file names, built from the `imageName` and `ids` lists
label_to_images = group_by_labels(imageName, ids)

The next cell splits the TRAINING dataset into one folder per label.

In [None]:
# First we will group files by class
# Set to True to (re)build the per-class folders; left off so that running the
# whole notebook does not touch the files on disk.
PERFORM_CLASSIFICATION = False

root_dir = "./TRAINING/"
output_dir = "./TRAINING_CLASS/"

if PERFORM_CLASSIFICATION:
    os.makedirs(output_dir, exist_ok=True)

    for label, images in label_to_images.items():
        class_dir = os.path.join(output_dir, str(label))

        # the folder is created for every label, including the skipped -1 one
        os.makedirs(class_dir, exist_ok=True)

        # images labelled -1 are not copied
        if int(label) == -1:
            continue

        print(f"Moving class {label}")

        for image_file in images:
            image_path = os.path.join(root_dir, image_file)

            # Open the image and save a copy in the class folder; the context
            # manager releases the file handle as soon as the copy is written
            with Image.open(image_path) as image:
                image.save(f"{class_dir}/{image_file}")


Looking at the training data (and after some failed first attempts to build a model), we realized that the dataset is heavily unbalanced towards "unknown" or "unidentified" images (those with label -1).

*   Identified faces: 480
*   Unknown (label -1): 720

We also realized that the data was insufficient for most of the identities (most had only 4 images). We decided to enlarge the dataset by downloading images from Google Images for each of the classes. Since the dataset grew in size, we then ran the face detection algorithm on it to discard the images in which no face was detected.

Additionally, we added fake face images (from "thispersondoesnotexist.com") as the "-1" label, as well as some generic images, and ran them through Viola-Jones again. The goal was to make the CNN able to predict unknown identities and to handle the case where there is no face in the image.

We also removed the images from the TRAINING directory provided by the teachers, so this data can be used for unseen testing with the challenge script.

In [None]:
# Set to True to (re)build the cropped-face dataset; left off so that running
# the whole notebook does not touch the files on disk.
PERFORM_TRANSFORMATION = False

root_dir = "./TRAINING_CLASS/"
output_dir = "./TRAINING_AUGMENTED/"

if PERFORM_TRANSFORMATION:
    os.makedirs(output_dir, exist_ok=True)

    # Get the classes of the dataset
    classes = sorted(os.listdir(root_dir))

    for class_label in classes:
        root_class_dir = os.path.join(root_dir, str(class_label))
        output_class_dir = os.path.join(output_dir, str(class_label))
        os.makedirs(output_class_dir, exist_ok=True)

        image_files = os.listdir(root_class_dir)
        faces_count = 0

        for idx, imagefile in enumerate(image_files):
            image_path = os.path.join(root_class_dir, imagefile)
            _, image_extension = os.path.splitext(image_path)

            image = imread(image_path)
            face_detected = face_detection(image)

            # images without a detected face are dropped from the dataset
            if face_detected is None:
                continue

            # the output is named after the position of the source file in the
            # folder listing, keeping the original extension
            out_path = os.path.join(output_class_dir, f"{idx}{image_extension}")
            Image.fromarray(face_detected).convert('RGB').save(out_path)
            faces_count += 1

        # report against the number of files actually processed (the loop index
        # would be one short, and wrong for an empty folder)
        print(f"Transformed class {class_label}: {faces_count}/{len(image_files)} faces")

print("FINISHED!")

## Data Loader

In [None]:
from sklearn.model_selection import train_test_split

training_dir = "./TRAINING_AUGMENTED/"

# Get the classes of the dataset (one entry per item found in training_dir)
classes = sorted(os.listdir(training_dir))

# "translations" of labels to trainable numbers
labels_to_number = {}
number_to_labels = {}

imagefiles = []
labels = []

for idx, class_name in enumerate(classes):
    # Get the image paths of the current class
    class_dir = os.path.join(training_dir, class_name)
    class_files = os.listdir(class_dir)

    # Create the dictionaries: class folder name <-> consecutive integer
    labels_to_number[class_name] = idx
    number_to_labels[idx] = class_name

    for image_file in class_files:
        image_path = os.path.join(class_dir, image_file)

        # nested folders are ignored, everything else counts as a sample
        if os.path.isdir(image_path):
            continue

        imagefiles.append(image_path)
        labels.append(idx)

# split into training and validation (80% / 20%, fixed seed)
data_train, data_val, labels_train, labels_val = train_test_split(
    imagefiles,
    labels,
    test_size=0.2,
    random_state=42,
)
print("training size:", len(data_train))
print("validation size:", len(data_val))

For each input image, we apply each of the augmentations specified below separately. That is, each image appears 6 times in the training data loader: once as the original and once for each of the 5 transformations. This allowed us to increase the training size very easily.

In [None]:
# transformations for normalizing input (training variant: includes a random flip)
# NOTE(review): mean/std look like the usual ImageNet channel statistics - confirm.
tr = tf.Compose([
    tf.RandomHorizontalFlip(),
    tf.ToTensor(),
    tf.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# deterministic counterpart of `tr` (no random flip), used when a dataset is
# built without augmentation so that validation samples are never altered
tr_eval = tf.Compose([
    tf.ToTensor(),
    tf.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

resize = tf.Resize((224, 224))

# each entry is applied on its own to the resized image (one extra sample per entry)
augmentations = [
    tf.RandomRotation(degrees=(-30, 30)), 
    tf.GaussianBlur(kernel_size=(5, 9)), 
    tf.Compose([tf.Resize((50, 50)), tf.Resize((224, 224)),]),
    tf.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5),
    tf.Compose([tf.RandomCrop(size=(170, 170)), tf.Resize((224, 224)),]),
]

# Data loader for images and labels/identities
class FacesDataset(torch.utils.data.Dataset):
    """
    Dataset of (image tensor, label) pairs read lazily from image paths.

    With ``augment_data=True`` every image contributes one sample for the
    original plus one per entry of ``augmentations``, and the random flip of
    ``tr`` is applied. With ``augment_data=False`` there is exactly one sample
    per image and it is only resized, converted to a tensor and normalized.
    """

    def __init__(self, images_names, images_labels, augment_data=False):
        """
        :param images_names: Sequence of image file paths.
        :param images_labels: Sequence of labels, aligned with ``images_names``.
        :param augment_data: Whether to add the augmented variants of each image.
        """
        self.images_names = images_names
        self.labels = images_labels
        self.augment_data = augment_data

        # list of (image path, label, transforms to apply)
        self.__data = []

        for idx in range(len(images_names)):
          self.__load_image(idx)

    def __load_image(self, index):
        """Register the sample(s) for the image at ``index``; no pixel data is read here."""
        image_path = self.images_names[index]
        label = self.labels[index]

        # keep original image
        self.__data.append((
            image_path, label, None
        ))
        
        if self.augment_data:
            for aug in augmentations:
                self.__data.append((
                    image_path, label, aug
                ))

    def __getitem__(self, index):
        """Load, resize, optionally augment and normalize the sample at ``index``."""
        image_path, label, aug = self.__data[index]

        # convert() loads the pixels, so the file can be closed right away
        with Image.open(image_path) as image_file:
            image = image_file.convert('RGB')

        # Apply the preprocessing
        image = resize(image)

        if aug is not None:
            image = aug(image)

        # the random flip is part of the augmentation and must not touch
        # datasets created with augment_data=False (e.g. validation)
        to_tensor = tr if self.augment_data else tr_eval
        image = to_tensor(image)

        return image, label

    def __len__(self):
        """Number of samples, counting the augmented variants."""
        return len(self.__data)

We applied the transformations to the training dataset, but not to the validation dataset (to make sure the model is learning the correct features).

In [None]:
batch_size = 50

# Training Dataset (with the augmented variants of every image)
print("Loading training dataset...")
train_dataset = FacesDataset(data_train, labels_train, augment_data=True)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print(f"Training size: {len(train_dataset)}")

# Validation Dataset (no augmentation)
print("Loading validation dataset...")
val_dataset = FacesDataset(data_val, labels_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
# this line used to be labelled "Training size" although it reports the validation set
print(f"Validation size: {len(val_dataset)}")


# CNN

Below are some of the models we used. Since they were modified after each training session, not all of them are included. VGGSimple5 (which came after 5 iterations of VGGSimple) was the model that performed best for us.

In [ ]:
# Simple CNN similar to VGG (Source: Deep Learning course 2023 at UPF)
class VGGSimple(nn.Module):
    """
    Two blocks of two 3x3 convolutions, each block followed by a 5x5 max-pooling,
    and a single linear classifier on top.

    The classifier expects a flattened 8x8x128 feature map, which is what a
    224x224 input produces (224 -> 44 -> 8 after the two poolings).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # layer creation order determines which random numbers each layer draws
        # at initialisation, so it is part of the (seeded) behaviour

        # first block: 3 -> 64 -> 64 channels
        self.conv11 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv12 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)

        # second block: 64 -> 128 -> 128 channels
        self.conv21 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv22 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)

        # shared by both blocks (no parameters)
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=5)

        self.fc = nn.Linear(in_features=8 * 8 * 128, out_features=num_classes)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        features = x

        for conv in (self.conv11, self.conv12):
            features = self.relu(conv(features))
        features = self.maxpool(features)

        for conv in (self.conv21, self.conv22):
            features = self.relu(conv(features))
        features = self.maxpool(features)

        # one flat feature vector per sample
        flat = features.view(len(features), -1)
        return self.fc(flat)

In [None]:
class VGGSimple5(nn.Module):
    """
    Two blocks of three 3x3 convolutions, each block followed by a 5x5
    max-pooling and dropout, then two linear layers (8192 -> 80 -> num_classes)
    with dropout in between.

    The first linear layer expects a flattened 8x8x128 feature map, which is
    what a 224x224 input produces (224 -> 44 -> 8 after the two poolings).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # layer creation order determines which random numbers each layer draws
        # at initialisation, so it is part of the (seeded) behaviour

        # first block: 3 -> 64 -> 64 -> 64 channels
        self.conv11 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv12 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv13 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)

        # second block: 64 -> 64 -> 128 -> 128 channels
        self.conv20 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv21 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv22 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)

        # shared by both blocks (no parameters)
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=5)

        self.fc1 = nn.Linear(in_features=8 * 8 * 128, out_features=80)
        self.fc2 = nn.Linear(in_features=80, out_features=num_classes)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.15)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        features = x

        for conv in (self.conv11, self.conv12, self.conv13):
            features = self.relu(conv(features))
        features = self.dropout(self.maxpool(features))

        for conv in (self.conv20, self.conv21, self.conv22):
            features = self.relu(conv(features))
        features = self.dropout(self.maxpool(features))

        # one flat feature vector per sample
        flat = features.view(len(features), -1)

        # there is no activation between the two linear layers, only dropout
        hidden = self.dropout(self.fc1(flat))
        return self.fc2(hidden)

In [None]:
class SeparableConv2d(nn.Module):
    """
    Depthwise-separable convolution: a per-channel (depthwise) convolution
    followed by a 1x1 (pointwise) convolution that mixes the channels.

    :param in_channels: Number of input channels.
    :param out_channels: Number of output channels.
    :param kernel_size: Kernel size of the depthwise convolution.
    :param bias: Whether both convolutions use a bias term.
    :param padding: Padding of the depthwise convolution. The default of 1
        keeps the spatial size for ``kernel_size=3``; pass ``kernel_size // 2``
        to keep it for other odd kernel sizes.
    """

    def __init__(self, in_channels, out_channels, kernel_size, bias=False, padding=1):
        super(SeparableConv2d, self).__init__()
        # groups=in_channels: every input channel gets its own filter
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, 
                                   groups=in_channels, bias=bias, padding=padding)
        self.pointwise = nn.Conv2d(in_channels, out_channels, 
                                   kernel_size=1, bias=bias)
    
    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise one."""
        out = self.depthwise(x)
        out = self.pointwise(out)
        return out

In [None]:
class VGGSeparableNew(nn.Module):
    """
    Two blocks of three separable 3x3 convolutions, each block followed by a
    5x5 max-pooling and dropout, then two linear layers
    (8192 -> 110 -> num_classes) with dropout in between.

    The first linear layer expects a flattened 8x8x128 feature map, which is
    what a 224x224 input produces (224 -> 44 -> 8 after the two poolings).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # layer creation order determines which random numbers each layer draws
        # at initialisation, so it is part of the (seeded) behaviour

        # first block: 3 -> 64 -> 64 -> 64 channels
        self.conv11 = SeparableConv2d(3, 64, kernel_size=3)
        self.conv12 = SeparableConv2d(64, 64, kernel_size=3)
        self.conv13 = SeparableConv2d(64, 64, kernel_size=3)

        # second block: 64 -> 128 -> 128 -> 128 channels
        self.conv21 = SeparableConv2d(64, 128, kernel_size=3)
        self.conv22 = SeparableConv2d(128, 128, kernel_size=3)
        self.conv23 = SeparableConv2d(128, 128, kernel_size=3)

        # shared by both blocks (no parameters)
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=5)

        self.fc1 = nn.Linear(in_features=8 * 8 * 128, out_features=110)
        self.fc2 = nn.Linear(in_features=110, out_features=num_classes)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.15)

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of images."""
        features = x

        for conv in (self.conv11, self.conv12, self.conv13):
            features = self.relu(conv(features))
        features = self.dropout(self.maxpool(features))

        for conv in (self.conv21, self.conv22, self.conv23):
            features = self.relu(conv(features))
        features = self.dropout(self.maxpool(features))

        # one flat feature vector per sample
        flat = features.view(len(features), -1)

        # there is no activation between the two linear layers, only dropout
        hidden = self.dropout(self.fc1(flat))
        return self.fc2(hidden)

# Training

In [None]:
# Functions from Deep Learning course 2023 at UPF

# Train function
def train(CNN, train_loader, val_loader, optimizer, num_epochs=5, model_name='model.ckpt', device='cpu', results_path='./results/'):
    """
    Train ``CNN`` with cross-entropy loss, validating and saving a checkpoint
    after every epoch.

    :param CNN: Model to train (already placed on ``device``).
    :param train_loader: Iterable of (images, labels) training batches.
    :param val_loader: Iterable of (images, labels) validation batches.
    :param optimizer: Optimizer built on the parameters of ``CNN``.
    :param num_epochs: Number of passes over ``train_loader``.
    :param model_name: File name suffix of the checkpoints; each epoch is saved
        as ``epoch_<n>_<model_name>`` with ``n`` starting at 0.
    :param device: Device the batches are moved to.
    :param results_path: Folder where the checkpoints are written.
    :return: Tuple ``(train losses, validation losses, validation scores)``,
        each a list with one value per epoch. The score is whatever ``test``
        returns for the validation loader.
    """
    # Create the results directory up front, so a bad path fails before any
    # training time is spent
    os.makedirs(results_path, exist_ok=True)

    CNN.train()  # Set the model in train mode
    total_step = len(train_loader)
    losses_list = []
    val_losses_list = []
    val_acc_list = []
    criterion = nn.CrossEntropyLoss()

    # Iterate over epochs
    for epoch in range(num_epochs):
        # Training
        loss_avg = 0
        nBatches = 0

        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            # CrossEntropyLoss needs integer (long) class indices
            labels = labels.to(device=device, dtype=torch.long)

            # Forward pass
            outputs = CNN(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_avg += loss.item()
            nBatches += 1

            # running average of the epoch so far, every 10 batches
            if (i + 1) % 10 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                      .format(epoch + 1, num_epochs, i + 1, total_step, loss_avg / nBatches))

        print('Epoch [{}/{}], Training Loss: {:.4f}'
              .format(epoch + 1, num_epochs, loss_avg / nBatches))
        losses_list.append(loss_avg / nBatches)

        # Validation
        CNN.eval()  # Set the model in evaluation mode
        val_loss_avg = 0
        val_nBatches = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device=device, dtype=torch.long)

                outputs = CNN(images)
                val_loss_avg += criterion(outputs, labels).item()
                val_nBatches += 1

        # average over batches (not over samples)
        val_loss_avg /= val_nBatches
        print('Epoch [{}/{}], Validation Loss: {:.4f}'.format(epoch + 1, num_epochs, val_loss_avg))
        val_losses_list.append(val_loss_avg)

        # second pass over the validation data to get the challenge score
        val_acc = test(CNN, val_loader)
        val_acc_list.append(val_acc)
        print(f'Validation Accuracy: {val_acc}')

        CNN.train()  # Set the model back to train mode

        # os.path.join works whether or not results_path ends with a separator
        checkpoint_path = os.path.join(results_path, f"epoch_{epoch}_" + model_name)
        torch.save(CNN.state_dict(), checkpoint_path)

    return losses_list, val_losses_list, val_acc_list

To calculate the validation accuracy, we used the F1-Score method provided by the course for the challenge.

In [None]:
def CHALL_AGC_ComputeRecognScores(auto_ids, true_ids, label_map=None):
    """
    Compute face recognition score

    INPUTS
      - auto_ids: The results of the automatic face recognition algorithm,
        as a sequence of training class numbers.

      - true_ids: The ground truth, as a sequence of training class numbers.

      - label_map: Mapping from training class number to the original label
        (anything ``int()`` accepts). Defaults to the notebook-level
        ``number_to_labels`` dictionary.

    OUTPUT
      - FR_score: The final recognition score (F-score with beta = 1, where
        the label -1 means "no known identity"). 0.0 when there is no true
        positive, false positive or false negative at all.

    RAISES
      - ValueError: if the two inputs do not have the same length.

    --------------------------------------------------------------------
    AGC Challenge
    Universitat Pompeu Fabra
    """
    # the previous `assert ('...')` could never fail: a non-empty string is truthy
    if len(auto_ids) != len(true_ids):
        raise ValueError('Inputs must be of the same len')

    if label_map is None:
        label_map = number_to_labels

    def to_teacher_id(value):
        # convert to teacher's values; the 0 class and -1 represent the same
        original = int(label_map[value])
        return -1 if original == 0 else original

    auto_ids = [to_teacher_id(value) for value in auto_ids]
    true_ids = [to_teacher_id(value) for value in true_ids]

    f_beta = 1
    pairs = list(zip(auto_ids, true_ids))

    # known identity, correctly predicted
    nTP = sum(1 for auto, true in pairs if true != -1 and auto == true)
    # an identity was predicted, but it is the wrong one (or there was none)
    nFP = sum(1 for auto, true in pairs if auto != -1 and auto != true)
    # known identity predicted as unknown
    nFN = sum(1 for auto, true in pairs if auto == -1 and true != -1)

    denominator = (1 + f_beta ** 2) * nTP + f_beta ** 2 * nFN + nFP
    if denominator == 0:
        # e.g. everything is unknown and predicted as unknown: nothing to score
        return 0.0

    FR_score = (1 + f_beta ** 2) * nTP / denominator

    return FR_score


# Test function
def test(CNN, test_loader, device=None):
    """
    Evaluate ``CNN`` on ``test_loader`` with the challenge recognition score.

    :param CNN: Model to evaluate; it is switched to evaluation mode and left there.
    :param test_loader: Iterable of (images, labels) batches.
    :param device: Device the images are moved to. Defaults to the device of
        the model's first parameter (CPU for a model without parameters), so
        the function does not depend on a notebook-level ``device`` variable.
    :return: ``CHALL_AGC_ComputeRecognScores`` of the predictions, times 100.
    """
    if device is None:
        first_param = next(CNN.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    CNN.eval()
    real_ids = []
    predicted_ids = []

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)

            # get network predictions
            outputs = CNN(images)

            # get predicted class (index of the highest score)
            _, predicted = torch.max(outputs, 1)

            # labels are only needed as plain Python numbers
            real_ids.extend(labels.tolist())
            predicted_ids.extend(predicted.cpu().tolist())

    # return the score as a percentage
    return CHALL_AGC_ComputeRecognScores(predicted_ids, real_ids) * 100

In [None]:
def plot_loss(loss_hist, val_loss_hist, model_name, results_path):
    """
    Plot the training and validation loss curves and save them as
    ``Loss_<model_name>.png`` inside ``results_path``.

    :param loss_hist: Sequence of training losses.
    :param val_loss_hist: Sequence of validation losses.
    :param model_name: Used as plot title and in the file name.
    :param results_path: Existing folder where the image is written.
    """
    # visualize the results
    plt.plot(loss_hist, '-.r', linewidth=1.0, label='train_loss')
    plt.plot(val_loss_hist, '-b', linewidth=1.0, label='val_loss')
    plt.xlabel('train step', fontsize=14)
    plt.ylabel('loss', fontsize=14)
    plt.title(model_name)
    plt.legend()
    # os.path.join works whether or not results_path ends with a separator
    plt.savefig(os.path.join(results_path, f"Loss_{model_name}.png"))
    # clear the current figure so the next plot starts empty
    plt.clf()
  
def plot_acc(acc_hist, model_name, results_path):
    """
    Plot the validation accuracy curve and save it as
    ``Accuracy_<model_name>.png`` inside ``results_path``.

    :param acc_hist: Sequence of validation accuracies.
    :param model_name: Used as plot title and in the file name.
    :param results_path: Existing folder where the image is written.
    """
    plt.plot(acc_hist, '-b', linewidth=1.0, label='validation')
    plt.xlabel('train step', fontsize=14)
    plt.ylabel('accuracy', fontsize=14)
    plt.title(model_name)
    plt.legend()
    # os.path.join works whether or not results_path ends with a separator
    plt.savefig(os.path.join(results_path, f"Accuracy_{model_name}.png"))
    # clear the current figure so the next plot starts empty
    plt.clf()
    

In [None]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
import random
import gc
import pickle

# here we can specify which models and learning rates to train
# (each entry is a (model class, learning rate) pair; one training run per entry)
training_combinations = [(VGGSimple5, 0.01)]

num_epochs=25

# use the first GPU when available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# one output unit per class folder found by the data loader cell
num_classes = len(classes)
print(f"Num of classes: {num_classes}")

for model_class, lr in training_combinations:
    # free cuda memory
    gc.collect()
    torch.cuda.empty_cache()
    
    # re-seed every generator BEFORE building the model, so each run starts
    # from the same random state
    random.seed(42)
    torch.manual_seed(42)
    np.random.seed(42)
    
    my_model = model_class(num_classes)
    model_name = model_class.__name__ + "_SGD"
    # one results folder per (model, learning rate) combination
    results_path = f'./results/{model_name}_{lr}/'

    print("Model:", model_name)
    print(f"Model parameters: {count_parameters(my_model)}")
    print("Learning rate:", lr)
    print("Results folder:", results_path)
    
    # optimizer = torch.optim.Adam(my_model.parameters(), lr = lr, weight_decay=lr/10)
    # SGD with momentum; the weight decay is tied to the learning rate (lr / 10)
    optimizer = torch.optim.SGD(my_model.parameters(), lr = lr, momentum=0.9, weight_decay=lr/10)

    model = my_model.to(device)
        
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    
    # train() also writes one checkpoint per epoch into results_path
    losses_list, val_losses_list, val_acc_list = train(model, train_loader, val_loader, optimizer,
            num_epochs=num_epochs, model_name=f'{model_name}.ckpt', device=device,
            results_path=results_path)
    
    plot_loss(losses_list, val_losses_list, model_name, results_path=results_path)
    plot_acc(val_acc_list, model_name, results_path=results_path)
    
    # store the class-number <-> label dictionaries next to the checkpoints
    output = dict()

    output["labels_to_number"] = labels_to_number
    output["number_to_labels"] = number_to_labels
    
    with open(results_path + "label_dicts.pk", "wb") as f:
        pickle.dump(output, f)
        
    print("-" * 50)