In [None]:
import xml.etree.ElementTree as ET
import lxml
import os
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import tensorflow as tf 
import numpy as np 
import cv2

In [None]:
""" setting up google drive and defining directories """

from google.colab import drive

# Path at which Google Drive is mounted in the Colab runtime.
MOUNTPOINT = '/content/drive'

# Project root on the drive. Images and annotations each have a
# 'train' and a 'test' sub-directory.
DATADIR = os.path.join(MOUNTPOINT, 'MyDrive', 'Project_Face_Mask')
DATADIR_train, DATADIR_test = (
    os.path.join(DATADIR, split) for split in ('train', 'test')
)
DATADIR_annotations = os.path.join(DATADIR, 'Annotations')
DATADIR_annotations_train, DATADIR_annotations_test = (
    os.path.join(DATADIR_annotations, split) for split in ('train', 'test')
)



In [None]:
# (Re)mount Google Drive. Any stale mount is flushed and released first so
# the cell can be re-run safely. The mount path comes from MOUNTPOINT (set
# in the configuration cell) so it cannot diverge from the DATADIR* paths
# that are derived from the same constant.
from google.colab import drive
drive.flush_and_unmount()
drive.mount(MOUNTPOINT)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
def get_data(root_dir, annotation_directory, limit, target_size=416):
    """
    Extract face samples (image, label, bounding box) from annotated images.

    Walks `annotation_directory`, parses every file found as an XML
    annotation, loads the referenced image from `root_dir`, resizes it to
    `target_size` x `target_size` and emits one sample per annotated face
    until `limit` samples have been collected for each of the three labels.

    The annotation layout is addressed by position, not by tag name:
    root[2] is the image file name, root[4] the image size, root[6:] the
    annotated objects; inside an object, child 0 is the label and child 6
    the bounding box. Box coordinates 0 and 2 are scaled by size[0] and
    coordinates 1 and 3 by size[1] (presumably xmin/xmax by width and
    ymin/ymax by height -- verify against the annotation format).

    Parameters:
      root_dir: directory containing the images
      annotation_directory: directory (walked recursively) containing the
        annotation files
      limit: maximum number of faces to extract per label
      target_size: edge length in pixels of the resized square image;
        bounding boxes are rescaled to the same size (default 416)

    Returns:
      img_data: numpy array of sample dicts with the keys "img" (resized
        image), "size" (original size from the annotation), "label"
        (0 = unmasked_face, 1 = masked_face, 2 = incorrectly_masked_face)
        and "boxs" (rescaled integer box coordinates)
      number_bboxs: number of samples in img_data
    """
    label_ids = {
        'unmasked_face': 0,
        'masked_face': 1,
        'incorrectly_masked_face': 2,
    }
    collected = dict.fromkeys(label_ids, 0)
    number_bboxs = 0
    img_data = []

    for root_d, _directories, filenames in os.walk(annotation_directory):
        for filename in filenames:
            # Stop as soon as every label has reached its quota.
            if all(count >= limit for count in collected.values()):
                return np.array(img_data), number_bboxs

            root = ET.parse(os.path.join(root_d, filename)).getroot()
            image_path = os.path.join(root_dir, root[2].text)

            # Best effort: images that cannot be read or resized are skipped
            # (cv2.imread yields None for an unreadable file, which makes
            # cv2.resize raise).
            try:
                pict = cv2.imread(image_path)
                picture = cv2.resize(pict, (target_size, target_size))
            except Exception:
                continue

            for obj in root[6:]:
                name = obj[0].text
                # Skip only this face when its label is unknown or already
                # full; later faces in the same image may still be needed.
                if name not in label_ids or collected[name] >= limit:
                    continue

                size = np.array([float(elem.text) for elem in root[4]])

                # Rescale the bounding box to the resized image. A zero image
                # size yields inf/nan, which int() rejects with OverflowError
                # or ValueError; such a face is reported and skipped.
                try:
                    boxs = [
                        int(float(obj[6][axis].text) / size[axis % 2] * target_size)
                        for axis in range(4)
                    ]
                except (OverflowError, ValueError) as e:
                    print(str(e))
                    continue

                # Count the face only once it is known to be usable, so a
                # rejected box never consumes part of the label quota.
                collected[name] += 1
                number_bboxs += 1
                img_data.append({
                    "img": picture,
                    "size": size,
                    "label": label_ids[name],
                    "boxs": boxs,
                })

    return np.array(img_data), number_bboxs

In [None]:
def gen_ds(img_data):
    """
    Build a tf.data.Dataset from the extracted data.

    Parameters:
      img_data: the pair returned by get_data. Only its first element is
        used: the sequence of sample dicts, each providing the keys
        "img", "label" and "boxs".

    Returns:
      ds: dataset zipping (image, label, bounding box), one element per
        sample
    """
    # get_data returns (samples, number_bboxs); the count is not needed here.
    samples = img_data[0]

    imgs = tf.data.Dataset.from_tensor_slices(
        [sample['img'] for sample in samples]
    )
    labels = tf.data.Dataset.from_tensor_slices(
        tf.ragged.constant([sample['label'] for sample in samples])
    )
    boxs = tf.data.Dataset.from_tensor_slices(
        tf.ragged.constant([sample['boxs'] for sample in samples])
    )

    return tf.data.Dataset.zip((imgs, labels, boxs))


In [None]:
# Build the training dataset with at most 1000 faces per label.
img_data_train = get_data(DATADIR_train, DATADIR_annotations_train, 1000)
ds_train = gen_ds(img_data_train)

# The test split below cannot be executed: the storage in Google Drive was
# exhausted and we needed to delete the data. The calls are kept as comments
# (not as a trailing string literal, which Jupyter would echo as cell output)
# so they can be re-enabled later.
#
# img_data_test = get_data(DATADIR_test, DATADIR_annotations_test, 300)
# ds_test = gen_ds(img_data_test)
