#### Module imports

In [1]:
import os
import random

import numpy                as np
import sqlalchemy           as sqla
import matplotlib.image     as mpimg
import matplotlib.pyplot    as plt
import api.global_variables as glb

from re                     import compile, IGNORECASE
from tqdm                   import tqdm
from uuid                   import UUID
from filecmp                import cmp
from IFR.api                import *
from IFR.classes            import *
from IFR.functions          import *
from sklearn.cluster        import DBSCAN

#### Parameters

In [2]:
# Toggles / flags:
build_n_save      = False # build and save face detectors & verifiers
create_database   = False # create database
save_new_database = True  # save database
align             = True  # perform face alignment

# Paths:
saved_detectors = 'api/saved_models/detectors' # face detector save directory
saved_verifiers = 'api/saved_models/verifiers' # face verifier save directory

SQLITE_DB_FP    = 'api/data/database/loki_test.sqlite' # full path of new database
img_path        = 'api/data/img'                       # image directory to be used

# Other:
load_detectors  = ['retinaface']
load_verifiers  = ['ArcFace']

use_detector    = 'retinaface'
use_verifier    = 'ArcFace'

normalization   = 'base'
metric          = 'cosine'

# DBSCAN
dbscan_eps         = 0.5
dbscan_min_samples = 3

#### Initialization

In [3]:
# Builds and saves face detectors and verifiers (depending on 'build_n_save')
if build_n_save:
    # All face detector and verifier names
    detector_names = ['opencv', 'ssd', 'mtcnn', 'retinaface']
    verifier_names = ['VGG-Face', 'Facenet', 'Facenet512', 'OpenFace',
                      'DeepFace', 'DeepID' , 'ArcFace']

    # Builds all face detectors and verifiers
    detectors = batch_build_detectors(detector_names, show_prog_bar=True,
                                        verbose=False)
    verifiers = batch_build_verifiers(verifier_names, show_prog_bar=True,
                                        verbose=False)

    # Prints the number of face detectors and verifiers built
    print('Number of detectors built:', len(detectors))
    print('Number of verifiers built:', len(verifiers), '\n')

    # Saves each face detector model
    for name, obj in detectors.items():
        status = save_built_model(name, obj, saved_detectors, overwrite=True,
                                    verbose=True)
    print('')

    # Saves each face verifier model
    for name, obj in verifiers.items():
        status = save_built_model(name, obj, saved_verifiers, overwrite=True,
                                    verbose=True)
    print('')

In [4]:
# ---------------------- Loading / creating face detectors ---------------------

# Loads (or creates) all face detectors
print('  -> Loading / creating face detectors:')
detector_models = init_load_detectors(load_detectors, saved_detectors)
print('\n> Detectors:', detector_models, sep='\n')

  -> Loading / creating face detectors:
[load_built_model] Loading model retinaface.pickle: failed! Reason: retinaface.pickle does not exist in api/saved_models/detectors
[build_detector] Building retinaface: success!


> Detectors:
{'retinaface': <tensorflow.python.eager.def_function.Function object at 0x7f413466d4f0>}


In [5]:
# ---------------------- Loading / creating face verifiers ---------------------

# Loads (or creates) all face verifiers
print('  -> Loading / creating face verifiers:')
verifier_models = init_load_verifiers(load_verifiers, saved_verifiers)
print('\n> Verifiers:', verifier_models, sep='\n')

  -> Loading / creating face verifiers:
[load_built_model] Loading model ArcFace.pickle: success!

> Verifiers:
{'ArcFace': <keras.engine.functional.Functional object at 0x7f413466db80>}


#### Database

In [6]:
# Tries to load a database if it exists. If not, create a new one.
print('  -> Loading / creating database: ', end='')
glb.sqla_engine = load_database(SQLITE_DB_FP)
if glb.sqla_engine is None:
    raise AssertionError('Failed to load or create database!')
else:
    print('success!')
print('')

# Tries to load a session if it exists. If not, create a new one.
print('  -> Loading / creating session: ', end='')
glb.sqla_session = start_session(glb.sqla_engine)
if glb.sqla_session is None:
    raise AssertionError('Failed to create session!')
else:
    print('success!')
print('')

  -> Loading / creating database: success!

  -> Loading / creating session: success!



#### Staging area

In [7]:

test_dir       = 'api/data/img'
detector_name  = 'retinaface'
verifier_names = ['ArcFace']
align          = True
show_prog_bar  = False
tags           = []
uids           = []
normalization  = 'base'
auto_grouping  = True
eps            = 0.5
min_samples    = 2
metric         = 'cosine'
pct            = 0.02
check_models   = False
verbose        = False
image_dir      = glb.IMG_DIR
auto_rename    = True

glb.DEBUG      = False

In [58]:
from sqlalchemy import delete

# Params structure
params = CreateDatabaseParams(detector_name  = detector_name,
                              verifier_names = verifier_names,
                              align          = align,
                              normalization  = normalization,
                              auto_grouping  = auto_grouping,
                              eps            = eps,
                              min_samples    = min_samples,
                              metric         = metric,
                              pct            = pct,
                              check_models   = check_models,
                              verbose        = verbose)

# ------------------------------------------------------------------------------

def repopulate_temp_file_table(tpaths):
    """
    
    """

    # First, tries to clear everything in the 'proc_files_temp' table
    try:
        stmt = delete(ProcessedFilesTemp)
        glb.sqla_session.execute(stmt)
        glb.sqla_session.commit()
    except Exception as excpt:
        glb.sqla_session.rollback()
        print("Error when clearing 'proc_files_temp' table",
             f'(reason: {excpt})')
        return True

    if glb.DEBUG:
        print("Populating 'proc_files_temp' table")

    # Loops through each temporary path in 'tpaths'
    for tpath in tpaths:
        # Adds each file name and size to the 'proc_files_temp' table
        glb.sqla_session.add(ProcessedFilesTemp(
                                        filename=tpath[tpath.rindex('/')+1:],
                                        filesize=os.path.getsize(tpath))
                            )

    # Commits the changes
    if glb.DEBUG:
        print('Committing newly added temporary files')
    glb.sqla_session.commit()

    return False

# ------------------------------------------------------------------------------

def file_is_not_unique2(fpath, proc_qry=None):
    """
    TODO: Update documentation
    """
    # Initializes 'is_not_unique' flag
    is_not_unique = False

    # Obtains the processed files from the ProcessedFiles table if it is not
    # provided by the user (i.e. proc_qry is None)
    if proc_qry is None:
        proc_qry = glb.sqla_session.query(ProcessedFiles)

    # Creates a subquery to find if there is/are file(s) in the ProcessedFiles
    # table with the same file size and determines the number of files with the
    # same size
    subqry   = proc_qry.filter(
                        ProcessedFiles.filesize.like(os.path.getsize(fpath))
                              )
    n_subqry = len(subqry.all())

    # Checks if there is at least 1 matching file in the subquery
    if n_subqry > 0:
        # Loops through each matching file
        for j in range(0, n_subqry):
            # Determines if the files are the same (and should be skipped)
            is_not_unique = img_files_are_same(fpath, subqry.all()[j].filepath)
                                
            # Current file matches another one. It's not unique so no need to
            # continue this loop
            if is_not_unique:
                break

    return is_not_unique

# ------------------------------------------------------------------------------

def process_image_zip_file2(myfile, image_dir, auto_rename=True,
                            valid_exts=['.jpg', '.png', '.npy']):
    """
    Processes a zip file containing image files. The zip file ('myfile') is
    assumed to have only valid image files (i.e. '.jpg', '.png', etc).
    
    The contents of the zip file are extracted to a named temporary directory.
    Then each file is checked to see if they have already been processed (exists
    with the same file name and size in the ProcessedFiles table of the
    database) OR if they are duplicate files. A file is considered a duplicate
    if there is at least one file in the 'image_dir' directory that:
        
        1. has the same file size (checked via filecmp.cmp(..., shallow=False))
        2. has the same contents (checked via filecmp.cmp(..., shallow=False))
        3. has the same image width and height (checked via imagesize)
    
    An existing file or duplicate file is ignored during the extraction process.
    If 'auto_rename' is True, then each unique file with the same name as a file
    in 'image_dir' directory gets renamed to a unique identifier using uuid4()
    from the uuid library. If, however, 'auto_rename' is False then the file is
    also skipped despite being a unique file.

    Finally, all unique (possibly renamed) files are moved from the temporary
    directory to the 'image_dir' directory, and the temporary directory is
    deleted.

    Effectively, this function attempts to extract only unique (non-existing)
    image files from the zip file provided and rename them if necessary.

    Inputs:
        1. myfile      - zip file obtained through FastAPI [zip file].

        2. image_dir   - path to directory in which the extracted images will be
                            saved to [string].

        3. auto_rename - toggles between automatic renaming of image files with
                            a non-unique name [boolean, default=True].

    Output:
        1. list with the paths of each image file that was skipped [list of
            strings].

    Signature:
        skipped_files = process_image_zip_file(myfile, image_dir,
                                                auto_rename=True)
    """
    # Create temporary directory and extract all files to it
    with TemporaryDirectory(prefix="create_database_from_zip-") as tempdir:
        # with ZipFile(BytesIO(myfile.file.read()), 'r') as myzip:  # uncomment after development
        with ZipFile(myfile, 'r') as myzip:                # remove after development
            # Extracts all files in the zip folder
            myzip.extractall(tempdir)
            
            # Obtains all file names, temporary file names and temporary file
            # paths. Also initializes skipped_files list
            skipped_files = []
            all_fnames = [name.split('/')[-1] for name in os.listdir(image_dir)]
            all_tnames = [name.split('/')[-1] for name in os.listdir(tempdir)]

            # Filter files by valid extension
            filt_tnames = filter_files_by_ext(all_tnames, valid_exts=valid_exts)
            filt_tpaths = [os.path.join(tempdir, name) for name in filt_tnames]

            #print('filt_tnames:\n', filt_tnames, sep='')

            # Repopulates the 'proc_files_temp' table
            if repopulate_temp_file_table(filt_tpaths):
                raise AssertionError("Could not repopulate"\
                                   + "'proc_files_temp' table.")

            # Obtains the processed files from the ProcessedFiles table
            # proc_files = glb.sqla_session.query(ProcessedFiles)

            # 
            query  = select(ProcessedFiles.filename,
                            ProcessedFilesTemp.filename).join(\
                            ProcessedFilesTemp, ProcessedFiles.filesize ==\
                            ProcessedFilesTemp.filesize)
            result = glb.sqla_session.execute(query)
            invalid_names = [tup[1] for tup in result.all()]
            print('invalid names:'.ljust(14), invalid_names)

            # Loops through each file extracted in the temporary directory
            for i, tname, tpath in zip(range(0, len(filt_tnames)), filt_tnames,
                                             filt_tpaths):
                # ------------------------- File Check -------------------------
                # 


                raise AssertionError('DEBUGGING')

                skip_this_file = file_is_not_unique(tpath, proc_qry=proc_files)

                # Skips the current file if skip_this_file=True
                if skip_this_file:
                    print(f'File skipped (file check failed): {tpath}')
                    skipped_files.append(tpath)
                    continue
                
                
                # ------------------------ Auto renaming -----------------------
                # Checks if the current file name matches any of the other
                # files, renaming them using an unique id if 'auto_rename' is
                # True. If 'auto_rename' is False (and file requires renaming)
                # skip this file and add it to the skipped file names list.
                if tname in all_fnames:
                    if auto_rename:
                        new_name = str(uuid4()) + '.' + tname.split('.')[-1] # uid.extension
                    else:
                        print(f'File skipped (file exists + no auto rename): {tpath}')
                        skipped_files.append(tpath)
                        continue

                # Otherwise, dont rename it
                else:
                    new_name = tname

                # Move file to appropriate directory
                new_fp = os.path.join(image_dir, new_name)
                old_fp = os.path.join(tempdir, tname)
                sh_move(old_fp, new_fp)

    return skipped_files

# ------------------------------------------------------------------------------



# ------------------------------------------------------------------------------

def create_database_from_zip(myfile, params, image_dir, auto_rename,
                    table_names = ['person', 'representation', 'proc_files']):
    """
    API endpoint: create_database_from_zip()

    Creates an SQLite database from a zip file. The zip file is expected to
    contain image files in any of the following formats: .jpg, .png, .npy.

    The images in the zip file are extracted to a temporary directory. Any image
    with the same name of another image in the 'image directory' is either
    renamed (auto_rename=True) or skipped (auto_rename=False). Renamed images
    are renamed using a random unique object identifier obtained by uuid4() from
    the uuid library.

    Parameters:
    - myfile: a zip file

    - params: a structure with the following parameters:
        1. detector_name  - name of face detector model [string].
        2. verifier_names - list of names of face verifier models [list of
                            strings].
        3. align          - perform face alignment flag (default=True)
                            [boolean].
        4. normalization  - name of image normalization [string].
        5. auto_grouping  - toggles whether Representations should be grouped /
                            clusted automatically using the DBSCAN algorithm
                            (default=True) [boolean].
        6. eps            - maximum distance between two samples for one to be
                            considered as in the neighborhood of the other. This
                            is the most important DBSCAN parameter to choose
                            appropriately for the specific data set and distance
                            function (default=0.5) [float].
        7. min_samples    - the number of samples (or total weight) in a
                            neighborhood for a point to be considered as a core
                            point. This includes the point itself
                            (min_samples=2) [integer].
        8. metric         - the metric used when calculating distance between
                            instances in a feature array. It must be an option
                            allowed by sklearn.metrics.pairwise_distances
                            (default='cosine') [string].
        9. pct            - used to filter faces which are smaller than this
                            percentage of the original image's area (width x
                            height) [float].
       10. check_models   - toggles if the function should check if all desired
                            face detector & verifiers are correctly loaded. If
                            they are not, builds them from scratch, exitting if
                            the building fails [boolean].
       11. verbose        - output messages to server's console [boolean].

        [Example] JSON schema:
        {
          "detector_name": "retinaface",
          "verifier_names": ["ArcFace"],
          "align": true,
          "normalization": "base",
          "auto_grouping": true,
          "eps": 0.5,
          "min_samples": 2,
          "metric": "cosine",
          "pct": 0.02,
          "check_models": true,
          "verbose": false
        }

    - image_dir   : full path to directory containing images (string,
                     default: <glb.IMG_DIR>)

    - db_dir      : full path to directory containing saved database (string,
                     default: <glb.RDB_DIR>)

    - auto_rename : flag to force auto renaming of images in the zip file with
                     names that match images already in the image directory
                     (boolean, default: True)

    - force_create: flag to force database creation even if one already exists,
                     overwritting the old one (boolean, default: True)

    Output:\n
        JSON-encoded dictionary with the following key/value pairs is returned:
            1. length: length of the newly created database OR of the currently
                loaded one if this process is skipped (i.e. force_create=False
                with existing database loaded)
            
            2. message: informative message string
    """   
    # Initialize output message
    output_msg = ''

    # If image directory provided is None or is not a directory, use default
    # directory
    if not image_dir or not os.path.isdir(image_dir):
        global img_dir
        output_msg += 'Image dir is None, does not exist or is not a '\
                   +  'directory. Using default directory instead.\n'
        image_dir = img_dir

    # Database does not exist
    if  database_is_empty(glb.sqla_engine):
        # Do nothing, but set message
        output_msg += 'Database does not exist! '\
                   +  'Please create one before using this endpoint.\n'

    # Face Representation table does not exist
    elif not all_tables_exist(glb.sqla_engine, table_names):
        # Do nothing, but set message
        output_msg += "Face representation table ('representation') "\
                   +  'does not exist! Please ensure that this table exists '\
                   +  'before using this endpoint.\n'

    # Otherwise (database is not empty and table exists), 
    else:
        # Initialize dont_skip flag as True
        dont_skip   = True

        # Extract zip files
        output_msg += 'Extracting images in zip:'

        print('Entered else (main body)')

        try:
            # Process the zip file containing the image files
            skipped_files = process_image_zip_file2(myfile, image_dir,
                                                    auto_rename=auto_rename)
            output_msg += ' success! '

        except Exception as excpt:
            dont_skip   = False
            output_msg += f' failed (reason: {excpt}).'

        raise AssertionError('FOR DEBUGGING - assertion error break!')

        # Processes face images from the image directory provided if 'dont_skip'
        # is True
        if dont_skip:
            output_msg += 'Creating database: '
            records = process_faces_from_dir(image_dir, glb.models, glb.models,
                            detector_name  = params.detector_name,
                            verifier_names = params.verifier_names,
                            normalization  = params.normalization,
                            align          = params.align,
                            auto_grouping  = params.auto_grouping,
                            eps            = params.eps,
                            min_samples    = params.min_samples,
                            metric         = params.metric,
                            pct            = params.pct,
                            check_models   = params.check_models,
                            verbose        = params.verbose)
        
            # Commits the records and updates the message
            glb.sqla_session.commit()
            output_msg += ' success!'
        else:
            records = []

    return {'n_records':len(records), 'n_skipped':len(skipped_files),
            'skipped_files':skipped_files, 'message':output_msg}

# ------------------------------------------------------------------------------

In [19]:
do_process_faces_from_dir = False
if do_process_faces_from_dir:
    records = process_faces_from_dir(test_dir, detector_models, verifier_models,
                                      detector_name  = detector_name,
                                      verifier_names = verifier_names,
                                      normalization  = normalization,
                                      align          = align,
                                      auto_grouping  = auto_grouping,
                                      eps            = eps,
                                      min_samples    = min_samples,
                                      metric         = metric,
                                      pct            = pct,
                                      check_models   = check_models,
                                      verbose        = verbose)

Processing face images: 100%|██████████| 4/4 [00:20<00:00,  5.12s/it]


In [59]:
test_process_zip_file = True
if test_process_zip_file:
    myfile = 'api/data/test1a.zip'
    skipped_files = process_image_zip_file2(myfile, image_dir, auto_rename=True)

    print('')
    print(len(skipped_files), 'skipped files:')
    for skpd_file in skipped_files:
        print(f'  > {skpd_file}')
    if len(skipped_files) == 0:
        print('  None')
    print('')

result:        [('img01.jpg', 'img01.jpg'), ('img02.jpg', 'img02.jpg'), ('img03.jpg', 'img03.jpg'), ('img53.jpg', 'img53.jpg')]
invalid names: ['img01.jpg', 'img02.jpg', 'img03.jpg', 'img53.jpg']


AssertionError: DEBUGGING

In [None]:
test_zip_endpoint = False
if test_zip_endpoint:
    myfile = 'api/data/test1.zip'
    output = create_database_from_zip(myfile, params, image_dir, auto_rename)

In [None]:
from filecmp import cmp
import imagesize

tst_dir = 'api/data/img'
tst_pth = 'api/data/img/img34_copy_test.jpg'
tst_pth = 'api/data/img/img12.jpg'

# ------------------------------------------------------------------------------

def remove_img_file_duplicates(trgt_dir, dont_delete=False):
    """
    Detects and removes (if dont_delete=False) all duplicate image files in a
    target directory 'trgt_dir'. Also returns a list with the name of all
    duplicate files, regardless if they were deleted or not. The algorithm works
    in the following way:

        1. The full path and file size of all files (in the directory) are
            obtained. A list with all unique file sizes is calculated.

        2. For each file size in the unique file size list:
            2.1. The indicies of all files with a matching file size are
                  obtained.

            2.2. If there are multiple matches, the first file (corresponding to
                  the first index) is set as the reference file for comparison.
                  Its width and height are calculated without loading the entire
                  image to memory.

            2.3. Every other match is compared to the reference file. The
                  comparison is made by using filecmp.cmp() (with
                  shallow=False). Their widths and heights are also calculated
                  and compared to the reference image's width and height.

            2.4. If any match is deemed the same (filecmp.cmp() results in True
                  and has the same width and height), the matching file is
                  considered a duplicate and is deleted (unless
                  dont_delete=True). The file's name is also stored in the
                  duplicate file names' list.

        3. Returns a list with the names of all duplicate files (regardless if
            they were deleted or not).

    Inputs:
        1. trgt_dir    - path to target directory [string].

        2. dont_delete - toggles if the function should delete the duplicate
                          files or not [boolean, default=False].

    Output:
        1. Returns the names of all duplicate files (regardless if they were
            deleted or not) [list of strings].

    Signature:
        dup_file_names = remove_img_file_duplicates(trgt_dir, dont_delete=False)
    """
    # Initialize duplicate files' name list
    dup_files = []

    # Obtains all file full paths ('all_files'), their file sizes ('all_sizes')
    # and a list of all unique file sizes ('unq_sizes')
    all_files = [os.path.join(trgt_dir, pth) for pth in os.listdir(trgt_dir)]
    all_sizes = np.array([os.path.getsize(pth) for pth in all_files])
    unq_sizes = np.unique(all_sizes)

    # Loops through all unique file sizes
    for sze in unq_sizes:
        # Gets the indices of all files with the same current file size
        ii = np.where(all_sizes == sze)[0]

        # If there are multiple matches, compare them to see if there are
        # duplicates. Otherwise, just continue
        if len(ii) > 1:
            # Sets the first index (file) as a reference file (for comparison)
            # and obtains their width and height
            refw, refh = imagesize.get(all_files[ii[0]])

            # Loops through each remaining file index
            for i in ii[1:]:
                # Calculates the current matched file's width and height
                wi, hi = imagesize.get(all_files[i])

                # Files have the same size, content and image size
                if cmp(all_files[ii[0]], all_files[i], shallow=False)\
                    and refw == wi and refh == hi:
                    # Appends the duplicate file's name
                    dup_files.append(all_files[i])

                    # Removes the duplicate file if dont_delete=False
                    if not dont_delete:
                        os.remove(all_files[i])

    return dup_files

# ------------------------------------------------------------------------------

def img_file_is_duplicate(img_path, file_fps, file_sizes):
    """
    
    """
    # Initializes is_duplicate flag as False (assumes its unique)
    is_duplicate = False

    # Ensures the file_sizes list is a numpy array
    if not isinstance(file_sizes, np.ndarray):
        file_sizes = np.array(file_sizes)

    # Gets the indices of all files with the file as the current file's size
    ii = np.where(file_sizes == os.path.getsize(img_path))[0]

    # If there are multiple matches, compare them to see if there are
    # duplicates. Otherwise, this file is unique so just return False
    if len(ii) > 1:
        # Gets the width and height of the input file 
        refw, refh = imagesize.get(img_path)

        # Loops through each matched files' index
        for i in ii:
            # Calculates the current matched file's width and height
            wi, hi = imagesize.get(file_fps[i])

            # Files have the same size, content and image size, so this file is
            # a duplicate
            if cmp(img_path, file_fps[i], shallow=False) and refw == wi\
                and refh == hi:
                is_duplicate = True
                break

    return is_duplicate

# ------------------------------------------------------------------------------

dup_file_names = remove_img_file_duplicates(tst_dir, dont_delete=True)

print('Duplicate files:')
for name in dup_file_names:
    print('  > ', name)
if len(dup_file_names) == 0:
    print('  None')

print('\n', '-'*79, '\n', sep='')
files_fps = [os.path.join(tst_dir, pth) for pth in os.listdir(tst_dir)]
files_sizes = np.array([os.path.getsize(pth) for pth in files_fps])

print('File is duplicate:',
        img_file_is_duplicate(tst_pth, files_fps, files_sizes))