# PyTorch face detection for ML

*Lionnus Kesting (ETHZ)*

Import packages

In [None]:
# Start with the basics
import numpy as np
from pandas import DataFrame
import os
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Library for plotting
import matplotlib.pyplot as plt

# pytorch
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import torch.nn.functional as F
import torchvision.transforms.functional as TF
import torch.optim as optim
import torchvision
from torchvision import transforms
from torchvision.io import read_image

# Import ai8x specifics
#import ai8x

In [None]:
#@title Set device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

## Define dataset and dataloader classes

In [None]:
class WIDERFacesDataset(Dataset):
    """WIDER FACE images paired with a single bounding box per sample.

    The annotation file ``wider_face_bbx_gt.txt`` is read from ``data_path``.
    Only images annotated with zero or one face are kept; an image without a
    face gets an all-zero box. Every image is resized to ``image_size`` and
    its box is rescaled by the same factors.

    Args:
        data_path: Directory holding the images and the annotation file.
        transform: Optional callable applied to the resized RGB image.
        image_size: ``(width, height)`` every image is resized to.
    """

    # Integer fields on one bounding-box line: x, y, w, h followed by
    # blur, expression, illumination, invalid, occlusion and pose.
    _NUM_BBOX_FIELDS = 10

    def __init__(self, data_path, transform=None, image_size=(64, 64)):
        self.data_path = data_path
        self.annotation_file = os.path.join(self.data_path, "wider_face_bbx_gt.txt")
        self.transform = transform
        self.image_size = image_size

        # Load data and annotations
        self.data, self.annotations = self.load_data()

    def __len__(self):
        """Return the number of kept (zero- or one-face) images."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(image, bbox)`` for sample ``idx``.

        ``image`` is the resized RGB image (after ``transform`` when one is
        set) and ``bbox`` is a 4-element float tensor ``[x, y, w, h]`` in the
        coordinates of the resized image.
        """
        annotation = self.annotations[idx]
        image = self.load_image(os.path.join(self.data_path, annotation['image']))

        target_w, target_h = self.image_size
        # image.shape is (height, width, channels), hence the swapped indices
        bboxes_resized = self.resize_bbox(
            annotation['bboxes'], image.shape[1], image.shape[0], target_w, target_h
        )
        # cv2.resize takes the target size as (width, height)
        image = cv2.resize(image, (target_w, target_h))

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor([float(v) for v in bboxes_resized[0]])

    def load_data(self):
        """Parse the annotation file.

        Each record is an image path, a face count and then one line per face
        (or a single all-zero placeholder line when the count is 0).

        Returns:
            tuple: ``(data, annotations)``. ``data`` lists the image paths;
            ``annotations`` holds one dict per kept image with the keys
            ``'image'``, ``'bboxes'`` and ``'labels'``.
        """
        with open(self.annotation_file, 'r') as f:
            lines = f.read().splitlines()

        data = []
        annotations = []
        i = 0
        while i < len(lines):
            image_name = lines[i].strip()
            if not image_name:
                # Tolerate blank lines, e.g. padding at the end of the file
                i += 1
                continue

            num_bboxes = int(lines[i + 1])
            first_row = i + 2
            # Images without faces still carry one placeholder line to skip
            i = first_row + max(num_bboxes, 1)

            # Only single-box (or face-less) images are usable as regression
            # targets, so crowded images are dropped without parsing them.
            if num_bboxes > 1:
                continue

            if num_bboxes == 1:
                # split() without an argument copes with trailing/double spaces
                fields = [int(v) for v in lines[first_row].split()[:self._NUM_BBOX_FIELDS]]
            else:
                fields = [0] * self._NUM_BBOX_FIELDS

            label = {
                'name': image_name.split('/')[1],
                'faces': num_bboxes,
                'type': int(image_name.split("--")[0]),
                'blur': fields[4],
                'expression': fields[5],
                'illumination': fields[6],
                'invalid': fields[7],
                'occlusion': fields[8],
                'pose': fields[9],
            }
            annotations.append({
                'image': image_name,
                'bboxes': [fields[:4]],
                'labels': label,
            })
            data.append(os.path.join(self.data_path, image_name))

        return data, annotations

    def resize_bbox(self, bboxes, dim_x_init, dim_y_init, dim_x, dim_y):
        """Rescale ``[x, y, w, h]`` boxes from one image size to another.

        Args:
            bboxes: Iterable of boxes, each indexable as ``[x, y, w, h]``.
            dim_x_init, dim_y_init: Original image width and height.
            dim_x, dim_y: Target image width and height.

        Returns:
            list: Rescaled boxes with coordinates truncated to ``int``.
        """
        # The factors depend only on the image sizes, so compute them once
        scale_x = dim_x / dim_x_init
        scale_y = dim_y / dim_y_init
        return [
            [
                int(bbox[0] * scale_x),
                int(bbox[1] * scale_y),
                int(bbox[2] * scale_x),
                int(bbox[3] * scale_y),
            ]
            for bbox in bboxes
        ]

    def load_image(self, path):
        """Read the image at ``path`` and return it as an RGB array.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure with None instead of raising
            raise FileNotFoundError(f"Could not read image: {path}")
        # OpenCV loads images as BGR
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
def widerfaces_get_datasets(data, load_train=True, load_test=True):
    """
    Build the WIDER Faces train/test datasets from the local image folders.

    `data` is a `(data_dir, args)` pair; only `data_dir` is used. Each split
    comes back as a `WIDERFacesDataset`, or as `None` when its flag is False.
    The images have different sizes; the dataset class rescales them.
    """
    data_dir, _args = data

    # Both splits currently use the same pipeline: ToTensor only (the
    # Rescale / RandomCrop / ai8x.normalize steps are switched off).
    train_dataset = None
    if load_train:
        print("Loading training dataset")
        train_dataset = WIDERFacesDataset(
            data_path=os.path.join(data_dir, "widerface", "WIDER_train/images"),
            transform=transforms.Compose([transforms.ToTensor()]),
        )

    test_dataset = None
    if load_test:
        # The validation images stand in for the test set because the real
        # test set is unlabeled.
        test_dataset = WIDERFacesDataset(
            data_path=os.path.join(data_dir, "widerface", "WIDER_val/images"),
            transform=transforms.Compose([transforms.ToTensor()]),
        )

    return train_dataset, test_dataset


# Registry entry describing the WIDER Faces dataset and the function that loads it.
datasets = [
    {
        'name': 'widerfaces',
        # (channels, height, width): WIDERFacesDataset resizes every image to
        # 64 x 64, which is also the default input size of WIDERFaceNet.
        'input': (3, 64, 64),
        # One bounding box per image, regressed as four floats
        'output': [('x', float), ('y', float), ('w', float), ('h', float)],
        'regression': True,
        'loader': widerfaces_get_datasets,
    },
]

#### Test dataset class

In [None]:
# Build the dataset straight from the local folder. No transform is passed,
# so the image of each sample is whatever cv2.resize returned.
data_path = 'WIDER_faces/train'
print('before dataset')
dataset = WIDERFacesDataset(data_path)

# Fetch one fixed sample and split it into image and box
sample = dataset[6]
image, bboxes = sample[0], sample[1]

# Inspect the sample
print("Image shape:", image.shape)
print("Bounding boxes:", bboxes)

# Draw the image and outline its single box in red
fig, ax = plt.subplots()
ax.imshow(image)

bbox = bboxes
x, y, w, h = bbox
rect = patches.Rectangle((x, y), w, h, linewidth=2, edgecolor='r', facecolor='none')
ax.add_patch(rect)

# Axis labels and title in one call
ax.set(xlabel='X', ylabel='Y', title='Random image with Bounding Boxes')

plt.show()

## Test the dataset and dataloader defined above

#### Define plotting function for bounding boxes

In [None]:
def plotBoundaryBox(image,bbox):
    """Show an image tensor with one (x, y, w, h) box outlined in red.

    `image` must support `permute(1, 2, 0)`; the permuted result is what gets
    drawn. `bbox` must unpack into exactly four values.
    """
    figure, axes = plt.subplots()

    # Move the first axis to the end before handing the tensor to imshow
    axes.imshow(image.permute(1, 2, 0))

    left, top, width, height = bbox
    outline = patches.Rectangle(
        (left, top), width, height, linewidth=2, edgecolor='r', facecolor='none'
    )
    axes.add_patch(outline)

    axes.set(xlabel='X', ylabel='Y')
    plt.show()

In [None]:
# Dataset root plus a placeholder for the args slot, which the loader ignores
args = 0
data = ('/home/lionnus/OneDrive/Ubuntu/MLonMCU/maxim7800-face-detection', 0)
train_dataset, test_dataset = widerfaces_get_datasets(data, load_train=True)

# Look at one fixed training sample
sample = train_dataset[99]
image, target = sample
bbox = target
print("Image shape:", image.shape)
print("Bounding boxes:", bbox)

# Wrap the training set in a DataLoader, then plot the inspected sample
batch_size = 4
fixed_size = (128, 128)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
plotBoundaryBox(image, target)

### Easy way: torchvision's built-in WIDERFace dataset

In [None]:
###################################################################################################
#
# Copyright (C) 2019-2021 Maxim Integrated Products, Inc. All Rights Reserved.
#
# Maxim Integrated Products, Inc. Default Copyright Notice:
# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html
#
###################################################################################################
"""
Classes and functions used to utilize the dataset.
"""
import os

from torchvision import transforms

import ai8x
from torchvision.datasets import WIDERFace


class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size, read as (height, width). If int, smaller
            of image edges is matched to output_size keeping aspect ratio the
            same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        """Resize the image and scale its bounding boxes by the same factors.

        Args:
            sample: ``(image, target)``. ``image`` is indexed as
                ``(height, width, ...)`` and must be an array OpenCV can
                resize. ``target`` is either the boxes themselves or a dict
                holding them under ``'bboxes'``.

        Returns:
            tuple: ``(image, bboxes)`` with the resized image and the scaled
            boxes as a float ndarray.
        """
        image, target = sample[0], sample[1]
        # Accept both target layouts so the transform also works on the plain
        # (image, bboxes) pairs produced by the other transforms in this file
        bboxes = target['bboxes'] if isinstance(target, dict) else target

        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        # torchvision.transforms has no `resize` function; use OpenCV, which
        # expects the target size as (width, height)
        img = cv2.resize(image, (new_w, new_h))

        # x-like columns scale with the width and y-like columns with the
        # height. Repeating the (x, y) factor pair covers both 2-column
        # points and 4-column (x, y, w, h) boxes.
        bboxes = np.asarray(bboxes, dtype=float)
        scale = np.tile([new_w / w, new_h / h], bboxes.shape[-1] // 2)
        bboxes = bboxes * scale

        return img, bboxes


class RandomCrop(object):
    """Crop randomly the image in a sample.

    Args:
        output_size (tuple or int): Desired output size. If int, square crop
            is made. If tuple, it is read as (height, width).
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size

    def __call__(self, sample):
        """Cut a random window out of the image and shift the boxes with it.

        Args:
            sample: ``(image, target)``. ``image`` is indexed as
                ``(height, width, ...)``. ``target`` is either the boxes
                themselves or a dict holding them under ``'bboxes'``.

        Returns:
            tuple: ``(image, bboxes)`` with the cropped image and the boxes
            as a float ndarray whose first two columns (x, y) are shifted by
            the crop origin.

        Raises:
            ValueError: If the crop is larger than the image.
        """
        image, target = sample[0], sample[1]
        bboxes = target['bboxes'] if isinstance(target, dict) else target

        h, w = image.shape[:2]
        new_h, new_w = self.output_size
        if new_h > h or new_w > w:
            raise ValueError(
                f"Crop size {(new_h, new_w)} exceeds image size {(h, w)}"
            )

        # randint's upper bound is exclusive: the +1 makes the last valid
        # offset reachable and allows a crop exactly as large as the image
        top = np.random.randint(0, h - new_h + 1)
        left = np.random.randint(0, w - new_w + 1)

        image = image[top: top + new_h,
                      left: left + new_w]

        bboxes = np.asarray(bboxes, dtype=float)
        if bboxes.size:
            # Only the position columns move; width/height columns of
            # (x, y, w, h) boxes are unaffected by a crop
            offset = np.zeros(bboxes.shape[-1])
            offset[:2] = [left, top]
            bboxes = bboxes - offset

        return image, bboxes


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        """Return ``(image, bboxes)`` as tensors.

        Args:
            sample: ``(image, target)``. ``image`` is an H x W x C ndarray.
                ``target`` is either the boxes themselves or a dict holding
                them under ``'bboxes'``.

        Returns:
            tuple: the image as a C x H x W tensor and the boxes as a tensor.
        """
        image, target = sample[0], sample[1]
        # Rescale and RandomCrop hand over plain box arrays, so accept both
        # the dict layout and the bare boxes
        bboxes = target['bboxes'] if isinstance(target, dict) else target

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        # as_tensor also accepts lists and existing tensors, unlike from_numpy
        return torch.from_numpy(image), torch.as_tensor(bboxes)

def widerfaces_get_datasets(data, load_train=True, load_test=True):
    """
    Load the WIDER Faces dataset through torchvision.

    Args:
        data: `(data_dir, args)` pair; `data_dir` is passed to torchvision as
            the dataset root, `args` is unused.
        load_train: Build the training dataset when true.
        load_test: Build the evaluation dataset when true.

    Returns:
        `(train_dataset, test_dataset)`; an entry is `None` when its flag is
        false. The evaluation dataset is the validation split.
    """
    (data_dir, args) = data

    if load_train:
        print("Loading training dataset")
        # Rescale / RandomCrop / ai8x.normalize are currently switched off
        train_transform = transforms.Compose([
            transforms.ToTensor()
        ])

        print('Function arguments to WIDERFace are: {}'.format(data_dir))

        train_dataset = torchvision.datasets.WIDERFace(root=data_dir,split='train', transform=train_transform)
        print('Train dataset is: {}'.format(train_dataset))
        # Report the type of each component of a sample rather than dumping
        # the whole image tensor
        print('Train dataset data types are: {}'.format(
            [type(part).__name__ for part in train_dataset[0]]))
    else:
        train_dataset = None

    if load_test:
        test_transform = transforms.Compose([
            transforms.ToTensor()
        ])
        # Load validation dataset instead of test dataset, since test dataset is unlabeled
        test_dataset = torchvision.datasets.WIDERFace(root=data_dir,split='val', transform=test_transform)
    else:
        test_dataset = None

    return train_dataset, test_dataset


# Registry entry describing the WIDER Faces dataset and the function that loads it.
# NOTE(review): 'input' declares 128 x 128 images, but the torchvision-based
# loader registered below applies only ToTensor (no resize) — confirm the
# intended input size.
datasets = [
    {
        'name': 'widerfaces',
        'input': (3, 128, 128),
        # Four float regression targets named x, y, w, h
        'output': [('x', float), ('y', float), ('w', float), ('h', float)],
        'regression': True,
        'loader': widerfaces_get_datasets,
    },
]

## Define the Network

In [None]:
###################################################################################################
# WIDER Faces Network
# Lionnus Kesting
# Machine Learning on Microcontrollers
# 2023 - ETH Zurich
###################################################################################################
"""
WIDERFaceNet network description
"""
from signal import pause
from torch import nn

import ai8x

import matplotlib
import matplotlib.pyplot as plt

## Function to calculate linear layer dimensions
def conv_shape(x, k=1, p=0, s=1, d=1):
    """Return the output length of a convolution/pooling window along one axis.

    x is the input length, k the kernel size, p the padding, s the stride and
    d the dilation. The result is truncated to an int.
    """
    covered = x + 2*p - d*(k - 1) - 1
    steps = covered / s
    return int(steps + 1)

class WIDERFaceNet(nn.Module):
    """Small CNN that maps an image to four regression outputs.

    Three stages of conv(3x3, padding 1) -> ReLU -> max-pool(2x2, stride 2)
    with 4, 8 and 16 output channels feed a 32-unit hidden layer and a
    4-value output layer.

    Args:
        num_channels: Number of input channels.
        dimensions: Spatial size of the input images, as a pair.
        bias: Whether the convolution layers use a bias term.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: If ``dimensions`` is too small to survive three poolings.
    """

    def __init__(self, num_channels=3, dimensions = (64,64), bias=False, **kwargs):
        super().__init__()
        self.conv1 = nn.Conv2d(num_channels, 4, kernel_size=3, padding=1, bias=bias)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=bias)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=bias)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # The padded 3x3 convolutions keep the spatial size; each 2x2,
        # stride-2 pooling halves it, rounding down. Track both axes so the
        # first linear layer matches whatever `dimensions` was requested
        # (the default 64 x 64 input gives 8 x 8, i.e. 16 * 8 * 8 features).
        dim_a, dim_b = dimensions
        for _ in range(3):
            dim_a, dim_b = dim_a // 2, dim_b // 2
        if dim_a < 1 or dim_b < 1:
            raise ValueError(
                f"dimensions={dimensions} is too small for three 2x2 poolings"
            )

        self.fc1 = nn.Linear(16 * dim_a * dim_b, 32)
        self.relu4 = nn.ReLU()

        self.fc2 = nn.Linear(32, 4)

    def forward(self, x):
        """Run a batch of images through the network; returns a (batch, 4) tensor."""
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        x = self.conv3(x)
        x = self.relu3(x)
        x = self.pool3(x)

        # Flatten everything except the batch axis for the linear layers
        x = x.view(x.size(0), -1)

        x = self.fc1(x)
        x = self.relu4(x)

        x = self.fc2(x)

        return x

def widerfacenet(pretrained=False, **kwargs):
    """
    Factory for WIDERFaceNet; keyword arguments go straight to its constructor.

    `pretrained` must be falsy, otherwise the assertion below fails.
    """
    assert not pretrained
    network = WIDERFaceNet(**kwargs)
    return network

"""
Network description
"""
# Registry entry naming the model factory defined above.
# NOTE(review): in the ai8x training framework 'dim' presumably denotes the
# convolution dimensionality (2 for a Conv2d network) rather than the number
# of input channels — confirm that 3 is intended.
models = [
    {
        'name': 'widerfacenet',
        'min_input': 1,
        'dim': 3,
    }
]


In [None]:
# Expose only GPU 0 to CUDA.
# NOTE(review): presumably this has to run before CUDA is first initialised to
# take effect — confirm the cell order.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

## Train the model

In [None]:
# Training configuration
batch_size, learning_rate, num_epochs = 32, 0.001, 10

# Expose only GPU 0 to CUDA; training itself is pinned to the CPU here
# (automatic CUDA detection is switched off).
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})
device = "cpu"

# Load both WIDER Faces splits; the second tuple slot is the unused args value
args = 0
data = ('/home/lionnus/OneDrive/Ubuntu/MLonMCU/maxim7800-face-detection', 0)
train_dataset, test_dataset = widerfaces_get_datasets(data, load_train=True, load_test=True)

# Shuffle only the training batches
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:

# Create an instance of the model on the selected device
model = WIDERFaceNet().to(device)

# Mean squared error between predicted and annotated box values, minimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for images, bboxes in train_loader:
        images = images.to(device)
        labels = bboxes.to(device)

        optimizer.zero_grad()
        predicted_coords = model(images)
        # Compare against the copy that was moved to `device`; using the
        # original `bboxes` would mix devices as soon as training runs on GPU
        loss = criterion(predicted_coords, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss}")

# Evaluation loop
model.eval()
total_loss = 0

# No gradients are needed while scoring the held-out data
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        predicted_coords = model(images)
        loss = criterion(predicted_coords, labels)

        total_loss += loss.item()

avg_loss = total_loss / len(test_loader)
print(f"Average Test Loss: {avg_loss}")

# Save the trained model 
# torch.save(model.state_dict(), "widerfacenet.pth")


#### Save the new model

In [None]:
# Models are stored in a "models" folder inside the current working directory
current_dir = os.getcwd()
models_dir = os.path.join(current_dir, "models")

# Create the folder if needed; exist_ok avoids the check-then-create race
os.makedirs(models_dir, exist_ok=True)

# Checkpoints are named "<base_model_name><version>.pth"
base_model_name = "widerfacenet_v"

# Find the highest version number already present
latest_version = 0
for file_name in os.listdir(models_dir):
    if not file_name.startswith(base_model_name):
        continue
    version_str = file_name[len(base_model_name):].split(".")[0]
    try:
        version = int(version_str)
    except ValueError:
        # Not a numbered checkpoint (e.g. "widerfacenet_vfinal.pth"): skip it
        # instead of aborting the save
        continue
    if version > latest_version:
        latest_version = version

# The new checkpoint gets the next free version number
new_version = latest_version + 1
new_model_path = os.path.join(models_dir, f"{base_model_name}{new_version}.pth")

# Save only the weights (state dict), not the whole model object
torch.save(model.state_dict(), new_model_path)

In [None]:
# Load the saved weights into a fresh model on the CPU.
# NOTE(review): the save cell numbers checkpoints from v1 upward, so a
# "widerfacenet_v0.pth" file has to come from elsewhere — confirm this path.
model = WIDERFaceNet()
device="cpu"
model.load_state_dict(torch.load("models/widerfacenet_v0.pth",map_location=torch.device('cpu')))
model.eval()

# Run the trained model on a standalone picture instead of a dataset sample
from PIL import Image, ImageOps
image_test = Image.open('/home/lionnus/OneDrive/Ubuntu/MLonMCU/maxim7800-face-detection/test_pic2.jpg')
# Apply the EXIF orientation so the picture is upright
image_test = ImageOps.exif_transpose(image_test)
# The network's first layer expects 3 channels; force RGB so grayscale or
# RGBA files do not fail inside the model
image_test = image_test.convert('RGB')

# Preprocess the image
transform = transforms.Compose([
    transforms.Resize((64,64)),  # Resize to match the model's input size
    transforms.ToTensor(),  # Convert image to tensor
])
# Add the batch axis the model expects
image_test_trans = transform(image_test).unsqueeze(0)

# Run model on image; no gradients are needed for inference
with torch.no_grad():
    predicted_coords = model(image_test_trans.to(device))
# Convert to numpy
predicted_coords = predicted_coords.cpu().detach().numpy()
print('BBOX: ',predicted_coords[0])

# Create a figure and plot the image (channels last for imshow)
fig, ax = plt.subplots()
ax.imshow(image_test_trans.squeeze(0).permute(1, 2, 0))

# Plot the predicted box on the image
x, y, w, h = predicted_coords[0]
rect = patches.Rectangle((x, y), w, h, linewidth=2, edgecolor='r', facecolor='none')
ax.add_patch(rect)
# Set plot title and labels
ax.set_title('Predicted Image with Coordinates')
ax.set_xlabel('X')
ax.set_ylabel('Y')

# Show the plot
plt.show()
