# Imports

In [None]:
import os
import numpy as np
import cv2
import random

# Methods

In [None]:
def fetch_files(dir:str, filetype:list, arr:list = None):
    """Recursively collect files under `dir` whose extension is in `filetype`.

    Args:
        dir: Directory to scan; sub-directories are scanned recursively.
        filetype: Accepted extensions without the leading dot, e.g.
            ['jpg', 'png']. Matching is case-sensitive.
        arr: Optional list to append the matches to. A fresh list is created
            when omitted, so results never leak between separate calls.

    Returns:
        The list (`arr` itself when one was passed in) of `os.DirEntry`
        objects for every matching file.
    """
    # A literal [] default would be created once and shared by every call
    if arr is None:
        arr = []
    with os.scandir(dir) as content:
        for item in content:
            if item.is_dir():
                # Child paths are built with '/' so nested entries keep
                # forward-slash separated paths
                fetch_files(dir + '/' + item.name, filetype, arr)
            else:
                # Require a real '.' so a file literally named e.g. 'jpg'
                # is not mistaken for an image
                _, dot, extension = item.name.rpartition('.')
                if dot and extension in filetype:
                    arr.append(item)
    return arr

def square_bounding_box(startX:int, startY:int, endX:int, endY:int):
    """Grow the shorter side of a bounding box so the box becomes square.

    The box is expanded around its centre; it is never shrunk. When the
    difference between height and width is odd, the extra pixel goes to the
    end coordinate so the result is exactly square.

    Args:
        startX: Left edge of the box.
        startY: Top edge of the box.
        endX: Right edge of the box.
        endY: Bottom edge of the box.

    Returns:
        Tuple (startX, startY, endX, endY) of the squared box. Coordinates
        may fall outside the image and are expected to be clamped by the
        caller.
    """
    box_h, box_w = endY - startY, endX - startX
    # Comparing the sides directly (instead of dividing them) keeps
    # zero-width boxes from raising ZeroDivisionError
    diff = box_h - box_w
    if diff > 0:
        # Taller than wide: widen
        before = diff // 2
        startX, endX = startX - before, endX + (diff - before)
    else:
        # Wider than tall (or already square): heighten
        diff = -diff
        before = diff // 2
        startY, endY = startY - before, endY + (diff - before)
    return startX, startY, endX, endY

# Config

In [None]:
# Directory
# dataset_dir is scanned recursively for source images; the cropped faces
# are written below save_dir.
dataset_dir = './dataset/org'
save_dir = './dataset/new'

# Face detect model
# Detections with a confidence at or below conf_threshold are discarded.
conf_threshold = 0.7
# Network definition and weights, resolved relative to the working directory
# (presumably the OpenCV res10 SSD Caffe face detector, going by the names)
prototxt_path = 'deploy.prototxt'
weights_path = 'res10_300x300_ssd_iter_140000_fp16.caffemodel'
face_detect = cv2.dnn.readNet(prototxt_path, weights_path)

# Image properties
# Side length (pixels) of the saved square crops; the detector input blob is
# built at twice this size.
image_size = 96
# Every .jpg / .png file found under dataset_dir (extension match is
# case-sensitive)
images = fetch_files(dataset_dir, ['jpg', 'png'])

# Dataset creator

In [None]:
# For every source image: detect a face, crop it to a square, resize it to
# image_size x image_size and save it under save_dir/<parent folder name>/.
for image in images:
    # Read image
    frame = cv2.imread(image.path)

    # Check that image is valid: cv2.imread returns None instead of raising
    # when a file cannot be decoded, so this must happen before frame.shape
    if frame is None:
        continue
    h, w = frame.shape[:2]

    # OpenCV DNN pre-processing
    blob = cv2.dnn.blobFromImage(frame, 1.0, (image_size * 2, image_size * 2))
    face_detect.setInput(blob)
    faces = face_detect.forward()

    # For all faces detected in frame
    for i in range(faces.shape[2]):

        # Get the confidence that it is a face
        confidence = faces[0, 0, i, 2]

        if confidence > conf_threshold:
            # Get coordinates for face (the detector output is scaled by the
            # frame width/height to get pixel coordinates)
            box = faces[0, 0, i, 3:7] * np.array([w, h, w, h])
            startX, startY, endX, endY = box.astype("int")

            # Make the bounding box square
            startX, startY, endX, endY = square_bounding_box(startX, startY, endX, endY)

            # Make sure the box is within the dimensions of the frame
            startX, startY = max(0, startX), max(0, startY)
            endX, endY = min(w, endX), min(h, endY)

            # Pre-process frame
            frame_crop = frame[startY:endY, startX:endX]

            # Resize bounding box
            # NOTE: interpolation must be passed by keyword; the third
            # positional parameter of cv2.resize is dst, not interpolation
            try:
                frame_crop_resize = cv2.resize(frame_crop, (image_size, image_size), interpolation=cv2.INTER_AREA)
            except cv2.error:
                # Raised e.g. for an empty crop; give up on this image
                break

            # Filename & path formatting
            # The crop is stored in a folder named after the source image's
            # parent folder. splitext keeps names containing several dots
            # intact (e.g. 'a.b.jpg' -> 'a.b' + '.jpg').
            img_path = os.path.join(save_dir, os.path.basename(os.path.dirname(image.path)))
            img_stem, img_suffix = os.path.splitext(image.name)
            img_name = img_stem + ('' if i == 0 else '_duplicate_' + str(i)) + img_suffix
            img_str = os.path.join(img_path, img_name)

            # Make sure the target folder exists
            os.makedirs(img_path, exist_ok=True)

            # Save image to disk
            cv2.imwrite(img_str, frame_crop_resize)

        # Only save the first found face for every image
        # NOTE: Comment out this break to include all detected faces
        break

In [None]:
# Delete excess files to make dataset uniform
# Every folder in save_dir is trimmed down to the size of the smallest folder
# by deleting randomly chosen files.
folder_name = []
folder_size = []
folder_files = []
for folder in os.listdir(save_dir):
    folder_path = save_dir + '/' + folder
    # Ignore stray files lying directly in save_dir
    if not os.path.isdir(folder_path):
        continue
    files = os.listdir(folder_path)
    folder_name.append(folder)
    folder_size.append(len(files))
    folder_files.append(files)

# Nothing to balance when save_dir has no sub-folders (min() would raise)
if folder_size:
    target_size = min(folder_size)
    # Folders are paired with their own file lists instead of being looked up
    # by size, so folders that happen to have equal sizes are each handled
    for folder, files in zip(folder_name, folder_files):
        # random.sample draws distinct files uniformly from the whole folder;
        # it returns an empty list for folders already at the target size
        for name in random.sample(files, len(files) - target_size):
            os.remove(save_dir + '/' + folder + '/' + name)