## manual data augmentation

In this notebook the code written to augment the data is consolidated into a few lines of code.
`src.utils` should not need any further explanation.

The code basically emulates the behavior of the Keras image data generator, but also accepts another parameter, `repetitions`.
The data is processed into the interim folder, where the modifications are applied before training.
Therefore the manipulations do not take place during training, but in a separate preprocessing phase that does not cost any training time.

The images are copied to the `interim` directory first.
After the data is fully distributed into `interim`, the data gets augmented "in place", meaning we only work with the `interim` data and the result is an `interim` directory with all the augmented data (in this case 32000 images per class).

`inline_augment_images` returns a list of dictionaries.
These dictionaries contain all necessary information to create records in the upcoming cell.

The parameters should be self-explanatory.

In [1]:
import json
import shutil
import os
from os import path

import cv2
import numpy as np
from tqdm import tqdm

def augment_images_by_label(src_dir, target_dir, label_idx, target_size=(None, None),
    repetitions=1, h_flip=False, v_flip=False, rotation_range=0, quantity=None):
    """Augments the images labelwise

    Every '.jpeg' file in src_dir is read, optionally flipped and resized,
    rotated by a random angle and written to target_dir under a numeric name.

    Arguments:
        src_dir: Where the (raw) images are taken from.
        target_dir: Where the augmented images are going to be saved.
        label_idx: Label index stored as first entry of each record's 'label'.
        target_size: Size of the output image, handed to cv2.resize.
            If at least one entry is None, the original size is used.
        repetitions: Accepted but not used by this function; every call
            performs exactly one pass over src_dir.
        h_flip: If True, every image is flipped horizontally.
        v_flip: If True, every image is flipped vertically.
        rotation_range: Upper bound (in degrees) of the uniformly drawn
            rotation angle; the lower bound is 0.
        quantity: How many images should be taken from the original dataset?
            None means => all.

    Returns:
        A list of dictionaries, one per written image, with the keys
        'image_path' (path of the written file) and 'label'
        ([label_idx, angle]).
    """
    import warnings

    filenames = [name for name in os.listdir(src_dir) if name.endswith('.jpeg')]
    filenames = filenames[:quantity]
    if not filenames:
        # Nothing to augment: do not create target_dir for an empty selection.
        return []

    os.makedirs(target_dir, exist_ok=True)
    # Output files are named by a running index that continues after the
    # entries already present in target_dir. Counting once up front avoids
    # listing the (growing) directory again for every single image.
    next_index = len(os.listdir(target_dir))

    data_list = []
    for filename in filenames:
        image_path = path.join(src_dir, filename)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            warnings.warn('Skipping unreadable image: %s' % image_path)
            continue

        # cv2.flip: flipCode 1 mirrors around the y-axis (horizontal flip),
        # flipCode 0 mirrors around the x-axis (vertical flip).
        if h_flip:
            image = cv2.flip(image, 1)
        if v_flip:
            image = cv2.flip(image, 0)
        if None not in target_size:
            image = cv2.resize(image, target_size)

        angle = np.random.uniform(0.0, rotation_range)
        # image.shape is (rows, cols, ...), whereas OpenCV expects the
        # rotation center as (x, y) and the output size as (width, height).
        height, width = image.shape[:2]
        rotmat = cv2.getRotationMatrix2D((width / 2.0, height / 2.0), angle, 1.0)
        image = cv2.warpAffine(image, rotmat, (width, height))

        # the angle should be labeled between [0, 180)
        # NOTE(review): formula kept unchanged; confirm that it yields the
        # intended label for rotation angles above 180 degrees.
        label_angle = int(abs(h_flip*360-abs(v_flip*180-angle))) % 180

        out_path = path.join(target_dir, str(next_index) + '.jpeg')
        next_index += 1
        data_list.append({
            'image_path': out_path,
            'label': [label_idx, label_angle],
        })
        cv2.imwrite(out_path, image)

    return data_list

def augment_images(src_dir, target_dir, repetitions=1, *args, **kwargs):
    """Augments images in src_dir and saves them to target_dir.

    Each sub-directory of src_dir is treated as one label. The labels are
    sorted by name before they are numbered, so the same set of label
    directories always receives the same indices, no matter in which order
    the file system lists them. Entries that are not directories are ignored.

    Arguments:
        src_dir: Where the (raw) images are taken from.
        target_dir: Where the augmented images are going to be saved.
        repetitions: How often should this run over the original dataset?
        *args, **kwargs: Forwarded to augment_images_by_label.

    Returns:
        The shuffled list of all records returned by augment_images_by_label.
        The same list is written to 'config.json' inside target_dir.
    """

    os.makedirs(target_dir, exist_ok=True)
    data_list = []

    labels = sorted(
        entry for entry in os.listdir(src_dir)
        if path.isdir(path.join(src_dir, entry)))

    for label_idx, label in enumerate(labels):
        actual_src_dir = path.join(src_dir, label)
        actual_target_dir = path.join(target_dir, label)

        for _ in tqdm(range(repetitions)):
            data_list += augment_images_by_label(
                actual_src_dir, actual_target_dir, label_idx, *args, **kwargs)

    np.random.shuffle(data_list)

    with open(path.join(target_dir, 'config.json'), 'w') as config:
        json.dump(data_list, config)

    return data_list

def inline_augment_images(directory, *args, **kwargs):
    """Augments images inplace (source and target directory are the same).

    Arguments:
        directory: source and target directory.
        *args, **kwargs: Forwarded to augment_images.

    Returns:
        The list of records returned by augment_images.

    An intermediate 'directory_tmp' is created.
    It is removed after the operation has finished.
    """

    tmp = directory + '_tmp'
    os.rename(directory, tmp)
    data_list = augment_images(tmp, directory, *args, **kwargs)

    # Cleanup stays best effort (a failure does not abort the run), but it is
    # reported: with ignore_errors=True rmtree never raises, which would make
    # the handler below unreachable.
    try:
        shutil.rmtree(tmp)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))

    return data_list

In [2]:
from src.utils import reset_and_distribute_data, encode_image_data_as_record

# Raw input, working copy and final output locations below 'data'.
raw, interim, processed = (
    path.join('data', sub) for sub in ('srp_raw_02', 'interim', 'processed'))

# Start from a fresh split of the raw data and drop old records.
reset_and_distribute_data(raw, interim, [1000, 100, 100])
shutil.rmtree(processed, ignore_errors=True)

target_size = (32, 32)

# Only the training split is repeated, flipped and rotated.
train_data_list = inline_augment_images(
    path.join(interim, 'train'),
    target_size=target_size,
    repetitions=32,
    h_flip=True,
    v_flip=True,
    rotation_range=360,
)

# Validation and test data are only resized (processed in this order).
validate_data_list, test_data_list = (
    inline_augment_images(path.join(interim, split), target_size=target_size)
    for split in ('validate', 'test')
)

100%|██████████| 32/32 [13:10<00:00, 24.70s/it]
100%|██████████| 32/32 [14:11<00:00, 26.62s/it]
100%|██████████| 32/32 [14:29<00:00, 27.16s/it]
100%|██████████| 1/1 [00:01<00:00,  1.48s/it]
100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
100%|██████████| 1/1 [00:01<00:00,  1.09s/it]


`encode_image_data_as_record` takes the previously described list; the feature built from each entry is hardcoded in this instance as:
```python
feature = {
    'image': # /path/to/image
    'label': # [label_idx, angle]
}
```

These features are then used to create protobufs.
Protobufs can be read by TensorFlow very efficiently and without the overhead of decoding the data (the decoding takes place during the preprocessing here).

This procedure also gives way more control over how the data is stored. For example the angle in this instance is saved, because maybe in the future it may be interesting to determine the angle of a linear node, where the data is way easier to generate by using only vertical lines (and the randomization is done by the augmentation).

In [3]:
labels = os.listdir(raw)

# One record file per split inside the processed directory.
train_record, validate_record, test_record = (
    path.join(processed, split + '.tfrecord')
    for split in ('train', 'validate', 'test'))

# Serialize the record lists produced by the augmentation step.
encode_image_data_as_record(train_data_list, train_record)
encode_image_data_as_record(validate_data_list, validate_record)
encode_image_data_as_record(test_data_list, test_record)

The next step is to create data generators or iterators for the model which can read these tensorflow records.