In [None]:
# Hot reload changed source files. See
# https://stackoverflow.com/questions/56059651
%reload_ext autoreload
%autoreload 2
from IPython import display
import albumentations as alb
from src import training, util, augmentation
import torch as t
import torchvision as tv
import cv2
import os

# Global definitions: every directory the notebook reads from or writes to,
# plus the torch device that is passed to the model and the training loop.
input_data_dir = "./data/images"
augmented_images_dir = "./data/augmented2"  # derivative images produced during augmentation
test_data_dir = f"{augmented_images_dir}/test"
train_data_dir = f"{augmented_images_dir}/train"
model_dir = "./data/models"  # model snapshots saved during training
# Use the GPU when one is available, otherwise fall back to the CPU.
cuda_device = t.device("cuda") if t.cuda.is_available() else t.device("cpu")

# Create the whole directory layout up front; already existing directories
# are left untouched.
for required_dir in (input_data_dir, augmented_images_dir, test_data_dir, train_data_dir, model_dir):
    os.makedirs(required_dir, exist_ok=True)

# 1. Recording your face

Generate webcam snapshots at 10 fps for around 30 seconds with your face in view and for another 10 seconds without. While recording your face, rotate your head and move around.

```sh
$ mkdir -p ./data/images
$ ffmpeg -f v4l2 -framerate 10 -video_size 800x600 -i /dev/video0 -r 10/1 "./data/images/%03d.png"
```

# 2. Manually create bounding boxes

Using the program _labelme_, create the bounding boxes for your images. For images without a face, do not set a bounding box.

```sh
labelme --keep-prev --autosave --labels face --nodata --output ./data/images ./data/images
```

The resulting directory contains the images in PNG format and, for each image, a JSON file of the same name which contains the bounding boxes.

```sh
$ tree ./data/images
# ├── 001.json   // images containing our face
# ├── 001.png
# ├── 002.json
# ├── 002.png
# ├── ...
# ├── 350.json
# ├── 350.png
# ├── 351.json   // images showing the naked background
# ├── 351.png
# ├── ...
# ├── 425.json
# └── 425.png
```

# 3. Augmentation

We extend our dataset by creating derivative images using the [albumentations](https://albumentations.ai/) library. It offers image transformations behind a simple interface, which we shall use to increase both the variance and the volume of our training data.

In [None]:
# Zoom step: shift-scale-rotate with shifting and rotation switched off, so
# only the random scaling remains (zoom-in and zoom-out). Zoomed-out images
# are padded with a constant value, because extrapolating the border could
# produce spurious reflections of faces at the image edges.
zoom = alb.ShiftScaleRotate(
    shift_limit=0.0,
    scale_limit=(-0.5, 0.5),
    rotate_limit=0.0,
    border_mode=cv2.BORDER_CONSTANT,
    value=0.0,
    p=0.5,
)
# Brightness/contrast and gamma changes emulate different lighting conditions.
lighting = [alb.RandomBrightnessContrast(p=0.5), alb.RandomGamma(p=0.75)]
# Preprocessing for training: reduce the image size and use grayscale.
preprocessing = [alb.Resize(244, 244, p=1.0), alb.ToGray(p=1.0)]

pipeline = [
    zoom,
    alb.RandomCrop(height=450, width=450, p=1.0),
    alb.HorizontalFlip(p=0.5),
    *lighting,
    *preprocessing,
]

# Generate the samples for the training set first, then for the test set.
# NOTE(review): both sets are derived from the same input images.
for sample_count, output_dir in [(25, train_data_dir), (5, test_data_dir)]:
    augmentation.generate_samples(pipeline, sample_count, input_data_dir, output_dir)


# 4. Training

We build our model around a pre-trained VGG16.

In [None]:

# The model to train, created on the globally selected device.
# NOTE(review): `freeze_vgg16_weights=True` presumably keeps the pre-trained
# VGG16 weights fixed during training -- confirm in `src/training`.
model = training.VGG16DualHead(device=cuda_device, freeze_vgg16_weights=True)

# Data loading pipeline. Images are converted from numpy to torch image format
# and then cast to a float tensor; labels only get the float-tensor cast.
transforms = tv.transforms.Compose(
    [
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda img: t.FloatTensor(img)),
    ]
)
label_transforms = tv.transforms.Compose([tv.transforms.Lambda(lambda label: t.FloatTensor(label))])

# Training data is served shuffled in batches of 32.
train_data = training.DataFromDisk(
    train_data_dir,
    transform_x=transforms,
    transform_y=label_transforms,
)
train_loader = t.utils.data.DataLoader(train_data, shuffle=True, batch_size=32)

# Test data is served in its stored order in batches of up to 500.
test_data = training.DataFromDisk(
    test_data_dir,
    transform_x=transforms,
    transform_y=label_transforms,
)
test_loader = t.utils.data.DataLoader(test_data, batch_size=500)

# Configure the optimization algorithm and a decaying learning rate: on every
# `scheduler.step()` the learning rate is multiplied by 0.90.
optimizer = t.optim.Adam(model.parameters(), lr=1e-3)
scheduler = t.optim.lr_scheduler.MultiplicativeLR(optimizer, lambda epoch: 0.90)

# Run training loop. One snapshot is written per epoch, so `n_epochs` also
# bounds which snapshot files exist afterwards.
n_epochs = 50
test_loss_mean = []
train_loss_mean = []
for epoch_ii in range(n_epochs):
    training.train_epoch(model, cuda_device, train_loader, optimizer)
    scheduler.step()

    # Evaluate performance on the training and the test set to get a measure
    # for how well we're doing. Note that we use the full data set to compute
    # the metrics which takes a lot of time. In practice, we could also
    # evaluate performance on only a subset of the data, or only every n-th
    # epoch.
    test_perf = training.eval_performance(model, test_loader)
    train_perf = training.eval_performance(model, train_loader)
    test_loss_mean.append(test_perf["loss_mean"])
    train_loss_mean.append(train_perf["loss_mean"])

    # Save model's state_dict, one file per epoch (zero-padded epoch index).
    t.save(model.state_dict(), f"{model_dir}/face_detector_epoch{epoch_ii:03d}.pth")

    # Display training progress in a graph, replacing the previous output.
    plot = util.plot_training_progress(train_loss_mean, test_loss_mean)
    display.clear_output(wait=True)
    display.display(plot)
    # No manual increment of `epoch_ii` here: the `for` statement rebinds it
    # on every iteration.


# 5. Deployment

Finally, the trained model can be used to do inference. With some glue code, we're able to draw a predicted bounding box on top of our webcam image.

In [None]:
# Load a model from a snapshot file. The training loop saves one `.pth` file
# per epoch; choose the epoch to load according to the training process.
model = training.VGG16DualHead.from_state_dict_pth(f"{model_dir}/face_detector_epoch030.pth")

# Preprocessor for webcam images
def prep_for_model(cam_img):
    """Convert a webcam image into the tensor that is passed to the model.

    The image is center-cropped to 600x600, resized to 244x244, converted to
    grayscale and finally turned into a torch tensor with torchvision's
    ``to_tensor``.

    Args:
        cam_img: The webcam image (assumed to be a numpy image as accepted by
            albumentations -- TODO confirm against ``util.read_cam``).

    Returns:
        The preprocessed image as returned by ``to_tensor``.
    """
    cropped = alb.center_crop(cam_img, 600, 600)
    shrunk = alb.resize(cropped, 244, 244)
    gray = alb.to_gray(shrunk)
    return tv.transforms.functional.to_tensor(gray)

# Inference only: put the model into evaluation mode so that any dropout or
# batch-norm layers behave deterministically.
model.eval()

# Minimum conviction the model must report before the bounding box is drawn.
conviction_threshold = 0.9

try:
    for cam_img in util.read_cam():
        prep_img = prep_for_model(cam_img)

        # No gradients are needed for inference, so skip autograd tracking.
        # NOTE(review): assumes the model lives on the same device as
        # `prep_img` (CPU) -- confirm in `from_state_dict_pth`.
        with t.no_grad():
            hyp = model(prep_img).cpu().numpy()

        # NOTE(review): assumes a flat output of 5 values, the first four
        # being the bounding box and the fifth the conviction -- confirm
        # against the model's forward pass.
        bbox, conviction = hyp[:4], hyp[4]

        # If our face is in view, draw a bounding box
        if conviction > conviction_threshold:
            img_w_bbox = util.overlay_bboxes(prep_img, [bbox])
        else:
            img_w_bbox = prep_img
        display.clear_output(wait=True)
        display.display(util.ImgType.convert(img_w_bbox, util.ImgType.pil))
except KeyboardInterrupt:
    pass  # graceful stopping of loop running in notebook


# Limitations

Being an instructive guide, the approach described here has several limitations which we shall make explicit.

- [ ] VGG16 limitations
- [ ] Single-channel model for grayscale data (see VGG16 limitations)
- [ ] Run the preprocessing once and store the result as a single blob in memory