# Preprocessing

This notebook does the preprocessing for the dataset.

1. The bounding boxes and labels are extracted from the annotation files
2. The image, bounding box and label are grouped and accumulated in a list
3. For training, a train-validation split of 80/20 is done by shuffling the extracted training data and splitting
4. The resulting splits are saved to CSV files for a `CSVGenerator` in the training section to consume

In [1]:
import glob
import csv
import numpy as np

In [2]:
def extract_box(path):
    """Extract the annotated bounding boxes from one annotation file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is expected to hold at least five
    whitespace-separated fields, ``label x y x_offset y_offset``; any
    further fields on the line are ignored.

    :param path: text file path
    :return: list of ``(label, (x1, y1), (x2, y2))`` tuples, where
        ``(x1, y1)`` is the point read from the file and ``(x2, y2)`` is
        that point plus the two offsets
    :raises ValueError: if a non-blank line has fewer than five fields or
        a coordinate field is not an integer literal
    """
    output = []

    with open(path) as annotation_file:
        for i, line in enumerate(annotation_file):
            fields = line.split()

            # Lines read from a file keep their trailing newline, so a
            # blank line is still a truthy string. Test the split fields
            # instead so blank / whitespace-only lines are skipped rather
            # than failing the unpacking below.
            if i == 0 or not fields:
                continue

            label, x_1, y_1, x_off, y_off, *_ = fields
            pt_1 = (int(x_1), int(y_1))
            pt_2 = (pt_1[0] + int(x_off), pt_1[1] + int(y_off))
            output.append((label, pt_1, pt_2))

    return output

def create_csv(image_dir, annotation_dir, csv_out_path, val_out_path=None, val_split=None):
    """Write one CSV row per annotated box, optionally holding out a validation set.

    Images and annotation files are paired purely by position after sorting
    both glob results, so the two directories must contain matching files
    with the same sort order. ``zip`` stops at the shorter list, so any
    surplus files in either directory are silently ignored.

    Each row has the form ``image_path,x1,y1,x2,y2,label``.

    :param image_dir: directory prefix of the images (globbed as ``image_dir + '*'``)
    :param annotation_dir: directory prefix of the annotation text files
        (globbed as ``annotation_dir + '*'``)
    :param csv_out_path: path of the CSV that receives the rows (the
        training part only, when ``val_split`` is given)
    :param val_out_path: path of the validation CSV; defaults to
        ``./data/validation.csv`` when ``None``
    :param val_split: fraction of the rows to hold out for validation; a
        falsy value (``None`` or ``0``) disables the split
    """
    image_paths = sorted(glob.glob(image_dir + '*'))
    annotations_paths = sorted(glob.glob(annotation_dir + '*'))

    # One row per box, so an image with several boxes appears on several rows.
    rows = []
    for image_path, annotations_path in zip(image_paths, annotations_paths):
        # annotation is (label, (x1, y1), (x2, y2)); saved as image,x1,y1,x2,y2,label
        for label, (x_1, y_1), (x_2, y_2) in extract_box(annotations_path):
            rows.append([image_path, x_1, y_1, x_2, y_2, label])

    if val_split:
        # Shuffle in place, then cut off the first val_split fraction.
        # NOTE: the split is made over box rows, not over images, so boxes
        # of the same image can end up in both files.
        np.random.shuffle(rows)
        val_size = int(len(rows) * val_split)
        if val_out_path is None:
            val_out_path = './data/validation.csv'
        # newline='' lets the csv module control line endings itself;
        # without it rows are terminated with '\r\r\n' on Windows.
        with open(val_out_path, 'w', newline='') as csv_file:
            csv.writer(csv_file).writerows(rows[:val_size])
        rows = rows[val_size:]

    with open(csv_out_path, 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(rows)

In [3]:
# This is the root directory where the training data is extracted.
# NOTE: machine-specific absolute path; change it before running elsewhere.
data_dir = '/media/appsyoon/New Volume/Machine Learning/data/'
# Training data paths: pos/ is passed as the image directory and posGt/ as
# the annotation directory below.
train_dir = data_dir + 'detectiondata/train/'
train_image_dir = train_dir + 'pos/'
train_annotation_dir = train_dir + 'posGt/'

# Destination of the training CSV (relative to the working directory).
out_path = './data/train.csv'

# val_split=0.2 holds out 20% of the rows; val_out_path is not passed, so
# create_csv writes them to its default, ./data/validation.csv.
create_csv(train_image_dir, train_annotation_dir, out_path, val_split=0.2)

In [4]:
# The test data images are in the same root dir as training
# (data_dir is defined in the previous cell).
test_image_dir = data_dir + 'detectiondata/test/pos/'
# The annotations are downloaded separately and extracted into data_dir/evaluation/.
test_annotation_dir = data_dir + 'evaluation/annotations/'

# Destination of the test CSV (relative to the working directory).
test_out_path = './data/test.csv'

# No val_split is passed, so every row goes into test_out_path unshuffled.
create_csv(test_image_dir, test_annotation_dir, test_out_path)