#### Module imports

In [None]:
import os
import random

import numpy                as np
import sqlalchemy           as sqla
import matplotlib.image     as mpimg
import matplotlib.pyplot    as plt
import api.global_variables as glb

from re                     import compile, IGNORECASE
from tqdm                   import tqdm
from uuid                   import UUID
from filecmp                import cmp
from IFR.api                import *
from IFR.classes            import *
from IFR.functions          import *
from sklearn.cluster        import DBSCAN

#### Parameters

In [None]:
# ------------------------------ Toggles / flags -------------------------------
# Build and save face detectors & verifiers
build_n_save = False
# Create database
create_database = False
# Save database
save_new_database = True
# Perform face alignment
align = True

# ----------------------------------- Paths ------------------------------------
# Save directories of the face detectors and face verifiers
saved_detectors = 'api/saved_models/detectors'
saved_verifiers = 'api/saved_models/verifiers'

# Full path of the database and the image directory to be used
SQLITE_DB_FP = 'api/data/database/loki.sqlite'
img_path = 'api/data/img'

# ----------------------------------- Other ------------------------------------
# Names of the models to load, and of the ones to use
load_detectors = ['retinaface']
load_verifiers = ['ArcFace']
use_detector = 'retinaface'
use_verifier = 'ArcFace'

# Image normalization and distance metric names
normalization = 'base'
metric = 'cosine'

# ----------------------------------- DBSCAN -----------------------------------
dbscan_eps = 0.5
dbscan_min_samples = 3

#### Initialization

In [None]:
# Optional step, executed only when 'build_n_save' is True: builds every face
# detector and face verifier listed below, then saves each built model.
if build_n_save:
    # Names of all face detectors and verifiers to build
    detector_names = ['opencv', 'ssd', 'mtcnn', 'retinaface']
    verifier_names = ['VGG-Face', 'Facenet', 'Facenet512', 'OpenFace',
                      'DeepFace', 'DeepID', 'ArcFace']

    # Builds the detectors first, then the verifiers
    detectors = batch_build_detectors(detector_names,
                                      show_prog_bar=True, verbose=False)
    verifiers = batch_build_verifiers(verifier_names,
                                      show_prog_bar=True, verbose=False)

    # Reports how many models of each kind were built
    print('Number of detectors built:', len(detectors))
    print('Number of verifiers built:', len(verifiers), '\n')

    # Saves every detector to 'saved_detectors' and every verifier to
    # 'saved_verifiers' (overwriting existing files), printing a blank line
    # after each batch
    for built_models, save_dir in ((detectors, saved_detectors),
                                   (verifiers, saved_verifiers)):
        for name, obj in built_models.items():
            status = save_built_model(name, obj, save_dir,
                                      overwrite=True, verbose=True)
        print('')

In [None]:
# ------------------------ Face detector initialization ------------------------

# Obtains the face detectors named in 'load_detectors' through
# init_load_detectors(), handing it the 'saved_detectors' directory, and prints
# the returned object
print('  -> Loading / creating face detectors:')
detector_models = init_load_detectors(load_detectors, saved_detectors)
print('\n> Detectors:\n', detector_models, sep='')

In [None]:
# ------------------------ Face verifier initialization ------------------------

# Obtains the face verifiers named in 'load_verifiers' through
# init_load_verifiers(), handing it the 'saved_verifiers' directory, and prints
# the returned object
print('  -> Loading / creating face verifiers:')
verifier_models = init_load_verifiers(load_verifiers, saved_verifiers)
print('\n> Verifiers:\n', verifier_models, sep='')

#### Database

In [None]:
# Stores the engine returned by load_database() in the global variables module.
# A None engine is treated as a fatal error.
print('  -> Loading / creating database: ', end='')
glb.sqla_engine = load_database(SQLITE_DB_FP)
if glb.sqla_engine is None:
    raise AssertionError('Failed to load or create database!')
print('success!\n')

# Stores the session returned by start_session() (for the engine above) in the
# global variables module. A None session is treated as a fatal error.
print('  -> Loading / creating session: ', end='')
glb.sqla_session = start_session(glb.sqla_engine)
if glb.sqla_session is None:
    raise AssertionError('Failed to create session!')
print('success!\n')

#### Functions

In [None]:
# ------------------------------------------------------------------------------

def process_image_zip_file_nb(myfile, image_dir, t_check=True, n_token=2,
                              valid_exts=None):
    """
    Extracts the image files contained in a zip file ('myfile') into the
    'image_dir' directory, skipping files that duplicate an already processed
    file and renaming files whose name clashes with one.

    The following steps are performed:

        1. The zip file is extracted to a named temporary directory, which is
            then handed to flatten_dir_structure() together with 'valid_exts'
            and 'n_token'.

        2. Every extracted file for which image_is_uncorrupted() returns False
            is deleted.

        3. The 'proc_files_temp' table is repopulated with the remaining files
            through repopulate_temp_file_table().

        4. Each temporary file is compared against EVERY file registered in the
            ProcessedFiles table that has the same file size. If the contents
            of any of them are identical (filecmp.cmp(..., shallow=False)), the
            temporary file is a duplicate: it is deleted and reported as
            skipped.

        5. Every remaining temporary file that shares its name with a
            registered file (regardless of size) is renamed using
            rename_file_w_hex_token().

        6. All files left in the temporary directory are moved to 'image_dir',
            and the temporary directory is deleted.

    Note that duplicates and name clashes are only detected against files
    registered in the ProcessedFiles table; files that sit in 'image_dir' but
    are not registered there are not checked.

    Inputs:
        1. myfile     - zip file, passed as is to zipfile.ZipFile (e.g. a path)
                            [string or file-like object].

        2. image_dir  - path to directory in which the extracted images will be
                            saved to [string].

        3. t_check    - forwarded to image_is_uncorrupted() as
                            'transpose_check' [boolean, default=True].

        4. n_token    - forwarded to flatten_dir_structure() as 'n_token'
                            [integer, default=2].

        5. valid_exts - file extensions forwarded to flatten_dir_structure().
                            If None, ['.jpg', '.png', '.npy'] is used [list of
                            strings, default=None].

    Output:
        1. list with the names of each image file that was skipped because it
            is a duplicate [list of strings].

    Raises:
        AssertionError if the 'proc_files_temp' table can not be repopulated.

    Signature:
        skipped_files = process_image_zip_file_nb(myfile, image_dir,
                                            t_check=True, n_token=2,
                                            valid_exts=['.jpg', '.png', '.npy'])
    """
    # Avoids a mutable default argument
    if valid_exts is None:
        valid_exts = ['.jpg', '.png', '.npy']

    skipped_files = []

    # Create temporary directory and extract all files to it
    with TemporaryDirectory(prefix="process_image_zip_file-") as tempdir:
        with ZipFile(myfile, 'r') as myzip:
            myzip.extractall(path=tempdir)

        # Flattens the directory structure and filters the files by valid
        # extensions
        flatten_dir_structure(tempdir, valid_exts=valid_exts, n_token=n_token)

        # Removes corrupted images and keeps the paths of the valid ones
        tpaths = []
        for file in os.listdir(tempdir):
            pth = os.path.join(tempdir, file)
            if image_is_uncorrupted(pth, transpose_check=t_check):
                tpaths.append(pth)
            else:
                os.remove(pth)

        # Repopulates the 'proc_files_temp' table (a truthy return value
        # signals failure)
        if repopulate_temp_file_table(tpaths):
            raise AssertionError("Could not repopulate "
                                 "'proc_files_temp' table.")

        # Queries the database to figure out which files have the SAME size
        query  = select(ProcessedFiles.filename,
                        ProcessedFilesTemp.filename).join(
                        ProcessedFilesTemp, ProcessedFiles.filesize ==
                        ProcessedFilesTemp.filesize)
        result = glb.sqla_session.execute(query)

        # Groups the matches by temporary file. A temporary file may match
        # several registered files, so it must be judged against all of them
        # BEFORE it is removed or moved.
        same_size = {}
        for fname, tname in result:
            same_size.setdefault(tname, []).append(fname)

        # Names of the temporary files that clash with a registered file name
        needs_rename = set()

        for tname, fnames in same_size.items():
            tname_fullpath = os.path.join(tempdir, tname)

            # A registered file that no longer exists on disk can not be
            # compared, so it never counts as a duplicate
            is_duplicate = False
            for fname in fnames:
                fname_fullpath = os.path.join(image_dir, fname)
                if os.path.isfile(fname_fullpath) and \
                        cmp(fname_fullpath, tname_fullpath, shallow=False):
                    is_duplicate = True
                    break

            if is_duplicate:
                # Same contents as an existing file: drop it and report it
                os.remove(tname_fullpath)
                skipped_files.append(tname)
            elif tname in fnames:
                # Same name & size but different contents: rename later
                needs_rename.add(tname)

        # Queries for files that have SAME name and DIFFERENT size from the
        # existing ones, which also have to be renamed
        query = select(ProcessedFilesTemp.filename).join(ProcessedFiles,
                (ProcessedFilesTemp.filename == ProcessedFiles.filename)
                & (ProcessedFilesTemp.filesize != ProcessedFiles.filesize))
        result = glb.sqla_session.execute(query)
        needs_rename.update(row.filename for row in result)

        # Renames the clashing files inside the temporary directory
        for filename in sorted(needs_rename):
            src = os.path.join(tempdir, filename)

            # The file may already have been removed as a duplicate of a
            # differently named file
            if not os.path.isfile(src):
                continue

            # rename_file_w_hex_token() is used as returning the new file name
            filename_renamed = rename_file_w_hex_token(filename)
            print(filename_renamed, filename)
            sh_move(src, os.path.join(tempdir, filename_renamed))

        # Now it's safe to move the remaining files in tempdir directly to
        # image_dir (ensuring they are not directories)
        for file in os.listdir(tempdir):
            src = os.path.join(tempdir, file)
            if not os.path.isdir(src):
                sh_move(src, os.path.join(image_dir, file))

    return skipped_files
    
# ------------------------------------------------------------------------------





# ------------------------------------------------------------------------------

def process_faces_from_dir(img_dir, detector_models, verifier_models,
                        detector_name='retinaface', verifier_names=None,
                        normalization='base', align=True, auto_grouping=True,
                        eps=0.5, min_samples=2, metric='cosine', pct=0.02,
                        check_models=True, verbose=False):
    """
    Processes face images contained in the directory 'img_dir'. If there are no
    images in the directory, an assertion error is raised. Images whose file
    name is already registered in the ProcessedFiles table are skipped. The
    'processing' includes the following steps, performed per image:
        1. Faces are detected in the image using the 'detector_name' face
            detector.

        2. If a detected face (region) is too small, it is discarded. This
            filtering is determined by 'pct'. If a region's area is smaller than
            the original image's area multiplied by this percentage factor
            'pct', then it is discarded. This helps with detection of tiny faces
            which are not useful for recognition.

        3. For each filtered face, the deep neural embeddings (which is just a
            vector representation of the face) is calculated.

        4. A face representation object (see help(FaceRep) for more details) is
            created for each face, and the image is registered in the
            ProcessedFiles table.

    An optional 'fifth' step is performed if 'auto_grouping' is True. The
    function tries to group similar face representations using the DBSCAN
    algorithm on the embeddings, such that each group corresponds to faces of
    (ideally) the same person. If multiple face verifiers were passed to this
    function, the grouping is performed using the embeddings obtained from the
    FIRST face verifier in the list.

    Afterwards the face representations are added to the session and committed,
    one Person row is inserted per group number found in the FaceRep table, the
    'person_id' of each grouped FaceRep is set to the matching Person id, and
    finally the 'group_no' of every linked FaceRep and Person is set to -2.

    If 'check_models' is True, then the function ensures that:
        1. the 'detector_name' face detector is in the 'detector_models'
            dictionary.

        2. the 'verifier_names' face verifier is in the 'verifier_models'
            dictionary.

    In both cases, if a detector or verifier is not in the respective
    dictionary, the function attempts to build them from scratch. If the
    building process fails, then an assertion error is raised as either a face
    detector and/or verifier will be missing.

    IMPORTANT: This function uses the 'sqla_session' global variable from the
    'global_variables.py' module, and COMMITS its changes to that session.

    Inputs:
         1. img_dir         - full path to the directory containing the images
                                [string].

         2. detector_models - dictionary of face detector model names (keys) and
                                objects (values) [dictionary].

         3. verifier_models - dictionary of face verifier model names (keys) and
                                objects (values) [dictionary].

         4. detector_name   - chosen face detector's name. Options: opencv, ssd,
                                mtcnn or retinaface [string,
                                default='retinaface'].

         5. verifier_names  - chosen face verifiers' name(s). Options: VGG-Face,
                                OpenFace, Facenet, Facenet512, DeepFace, DeepID
                                and ArcFace. Can be either a string (with a
                                single name) or a list of string (with several
                                names). None is equivalent to ['ArcFace']
                                [string or list of strings, default=None].

         6. normalization   - normalizes the face image and may increase face
                                recognition performance depending on the
                                normalization type and the face verifier model.
                                Options: base, raw, Facenet, Facenet2018,
                                VGGFace, VGGFace2 and ArcFace [string,
                                default='base'].

         7. align           - toggles if face images should be aligned. This
                                improves face recognition performance at the
                                cost of some speed [boolean, default=True].

         8. auto_grouping   - toggles whether the faces should be grouped
                                automatically using the DBSCAN algorithm. If
                                multiple verifier names are passed, uses the
                                embeddings of the first verifier during the
                                clustering procedure [boolean, default=True].

         9. eps             - the maximum distance between two samples for one
                                to be considered as in the neighborhood of the
                                other. This is the most important DBSCAN
                                parameter to choose appropriately for the
                                specific data set and distance function
                                [float, default=0.5].

        10. min_samples     - the number of samples (or total weight) in a
                                neighborhood for a point to be considered as a
                                core point. This includes the point itself
                                [integer, default=2].

        11. metric          - the metric used when calculating distance between
                                instances in a feature array. It must be one of
                                the options allowed by
                                sklearn.metrics.pairwise_distances
                                [string, default='cosine'].

        12. pct             - percentage of image area as a decimal. This will
                                be used to filter out 'small' detections [float,
                                default=0.02].

        13. check_models    - toggles if the function should ensure the face
                                detectors and verifiers are contained in the
                                respective dictionaries [boolean, default=True].

        14. verbose         - toggles the function's warnings and other messages
                                [boolean, default=False].

    Output:
        1. returns a list of the FaceRep objects created [list of FaceRep
            objects].

    Raises:
        AssertionError if 'img_dir' contains no images, or if 'check_models' is
        True and a face detector / verifier could not be ensured.

    Signature:
        records = process_faces_from_dir(img_dir, detector_models,
                        verifier_models, detector_name='retinaface',
                        verifier_names=['ArcFace'], normalization='base',
                        align=True, auto_grouping=True, eps=0.5, min_samples=2,
                        metric='cosine', pct=0.02, check_models=True,
                        verbose=False)
    """
    # Normalizes 'verifier_names' to a list: None means the default verifier,
    # and a single name is wrapped so that verifier_names[0] is a full name
    # (and not the first character of a string)
    if verifier_names is None:
        verifier_names = ['ArcFace']
    elif isinstance(verifier_names, str):
        verifier_names = [verifier_names]

    # Initializes records (which will be a list of FaceReps) and the embeddings
    # used for grouping (aligned index by index with 'records')
    records = []
    embds   = []

    # Assuming img_dir is a directory containing images
    img_paths = get_image_paths(img_dir)
    img_paths.sort()

    # No images found - for now just raise an assertion error
    if len(img_paths) == 0:
        raise AssertionError('No images in the directory specified')

    # Ensures that the face detector and verifiers exist
    if check_models:
        # Ensures face detectors exist
        ret1, detector_models = ensure_detectors_exists(models=detector_models,
                                                detector_names=[detector_name],
                                                verbose=verbose)

        # Ensures face verifiers exist
        ret2, verifier_models = ensure_verifiers_exists(models=verifier_models,
                                                verifier_names=verifier_names,
                                                verbose=verbose)

        # Explicit raise (instead of an assert statement) so the check is not
        # stripped when Python runs with optimizations enabled
        if not (ret1 and ret2):
            raise AssertionError('Could not ensure existence of '
                        + f'face detectors ({ret1}) or verifiers ({ret2})!')

    # Obtains the processed files names from the ProcessedFiles table to skip
    # already processed files (a set gives constant-time membership tests)
    proc_fnames = glb.sqla_session.query(ProcessedFiles.filename)
    proc_fnames = {item[0] for item in proc_fnames.all()}

    # Loops through each image in the 'img_dir' directory
    for img_path in tqdm(img_paths, desc='Processing face images',
                         disable=False):
        # File name without its directory (works even if the path has no '/')
        img_name = os.path.basename(img_path)

        # Skips the current file if it has already been processed
        if img_name in proc_fnames:
            if glb.DEBUG:
                print(f'Skipping: {img_path}'.ljust(40), '(already processed)')
            continue

        # Detects faces
        output = do_face_detection(img_path, detector_models=detector_models,
                                    detector_name=detector_name, align=align,
                                    verbose=verbose)

        # Filter regions & faces which are too small
        image_size             = mpimg.imread(img_path).shape
        filtered_regions, idxs = discard_small_regions(output['regions'],
                                                        image_size, pct=pct)
        filtered_faces         = [output['faces'][idx] for idx in idxs]

        # Calculates the deep neural embeddings for each face image in outputs
        embeddings = calc_embeddings(filtered_faces, verifier_models,
                                     verifier_names=verifier_names,
                                     normalization=normalization)

        # Loops through each (region, embedding) pair and create a record
        # (FaceRep object). 'image_name' and 'image_fp' are currently not being
        # used in this approach, and group_no=-1 means "no group".
        for region, cur_embds in zip(filtered_regions, embeddings):
            record = FaceRep(image_name_orig=img_name,
                        image_name='', image_fp_orig=img_path,
                        image_fp='', group_no=-1, region=region,
                        embeddings=cur_embds)

            # Appends each record to the records list
            records.append(record)

            # If auto grouping is True, then store each calculated embedding
            if auto_grouping:
                embds.append(cur_embds[verifier_names[0]])

        # After file has been processed, add it to the ProcessedFiles table
        glb.sqla_session.add(ProcessedFiles(filename=img_name,
                                            filesize=os.path.getsize(img_path)))

    if glb.DEBUG:
        print('Commits processed files')
    glb.sqla_session.commit()

    # Clusters Representations together using the DBSCAN algorithm
    if auto_grouping and len(embds) > 0:
        results = DBSCAN(eps=eps, min_samples=min_samples,
                         metric=metric).fit(embds)

        # Updates the 'group_no' attribute of each record IF the label is not
        # -1 (because -1 is already the default value and means "no group")
        for record, lbl in zip(records, results.labels_):
            if lbl != -1:
                record.group_no = int(lbl)

    # Loops through each record and add them to the global session
    if glb.DEBUG:
        print('add representation to FaceRep table')
    for record in records:
        glb.sqla_session.add(record)
    glb.sqla_session.commit()

    # Add how many person to Person table as the detected clusters
    if glb.DEBUG:
        print('add person to Person table')
    subquery = (select(FaceRep.group_no)
                .where(FaceRep.group_no > -1)
                .group_by(FaceRep.group_no)
                .order_by(FaceRep.group_no))
    query = insert(Person).from_select(["group_no"], subquery)
    glb.sqla_session.execute(query)
    glb.sqla_session.commit()

    # Populate the person_id field in FaceRep with the corresponding ID in
    # Person table
    if glb.DEBUG:
        print('Create joins between Person and FaceRep tables')
    subquery = (select(Person.id)
                .where(FaceRep.group_no == Person.group_no)
                .where(FaceRep.group_no > -1))
    query = (update(FaceRep)
             .values(person_id = subquery.scalar_subquery())
             .where(FaceRep.group_no > -1))
    if glb.DEBUG:
        print(query)
    glb.sqla_session.execute(query)
    glb.sqla_session.commit()

    # Set group_no to -2 for the representation that have been linked with
    # person, so group numbers of a later run are not mixed with these
    if glb.DEBUG:
        print('Set group_no to -2 for FaceRep and Person that have been already linked together')
    query = update(FaceRep).values(group_no = -2).where(FaceRep.group_no > -1)
    glb.sqla_session.execute(query)
    query = update(Person).values(group_no = -2).where(Person.group_no > -1)
    glb.sqla_session.execute(query)
    glb.sqla_session.commit()

    # Return representation database
    return records

# ------------------------------------------------------------------------------

def faces_import_from_zip_nb(myfile, params, image_dir, n_token=2,
                             t_check=True):
    """
    Notebook version of the API endpoint: create_database_from_zip()

    Imports the faces contained in the images of a zip file into the currently
    loaded database. The zip file is expected to contain image files in any of
    the following formats: .jpg, .png, .npy.

    The zip file is handled by process_image_zip_file_nb(), which moves its
    images to the image directory. If that succeeds, the image directory is
    processed by process_faces_from_dir(). Nothing is processed if the database
    is empty or if any of the expected tables ('person', 'representation',
    'proc_files', 'proc_files_temp') does not exist; the returned message
    explains why.

    This function relies on the module-level 'detector_models' and
    'verifier_models' names, and on the 'sqla_engine' / 'sqla_session' global
    variables from the 'global_variables.py' module.

    Parameters:
    - myfile: a zip file (forwarded to process_image_zip_file_nb())

    - params: a structure with the following parameters:
        1. detector_name  - name of face detector model [string].
        2. verifier_names - list of names of face verifier models [list of
                            strings].
        3. align          - perform face alignment flag (default=True)
                            [boolean].
        4. normalization  - name of image normalization [string].
        5. auto_grouping  - toggles whether Representations should be grouped /
                            clustered automatically using the DBSCAN algorithm
                            (default=True) [boolean].
        6. eps            - maximum distance between two samples for one to be
                            considered as in the neighborhood of the other. This
                            is the most important DBSCAN parameter to choose
                            appropriately for the specific data set and distance
                            function (default=0.5) [float].
        7. min_samples    - the number of samples (or total weight) in a
                            neighborhood for a point to be considered as a core
                            point. This includes the point itself
                            (default=2) [integer].
        8. metric         - the metric used when calculating distance between
                            instances in a feature array. It must be an option
                            allowed by sklearn.metrics.pairwise_distances
                            (default='cosine') [string].
        9. pct            - used to filter faces which are smaller than this
                            percentage of the original image's area (width x
                            height) [float].
       10. check_models   - toggles if the function should check if all desired
                            face detector & verifiers are correctly loaded. If
                            they are not, builds them from scratch, exiting if
                            the building fails [boolean].
       11. verbose        - output messages to server's console [boolean].

        [Example] JSON schema:
        {
          "detector_name": "retinaface",
          "verifier_names": ["ArcFace"],
          "align": true,
          "normalization": "base",
          "auto_grouping": true,
          "eps": 0.5,
          "min_samples": 2,
          "metric": "cosine",
          "pct": 0.02,
          "check_models": true,
          "verbose": false
        }

    - image_dir: full path to directory containing images. If it is None, does
                  not exist or is not a directory, <glb.IMG_DIR> is used
                  instead (string)

    - n_token  : forwarded to process_image_zip_file_nb() (integer,
                  default: 2)

    - t_check  : forwarded to process_image_zip_file_nb() (boolean,
                  default: True)

    Output:
        Dictionary with the following key/value pairs is returned:
            1. n_records: number of face representations created (0 if
                nothing was processed)

            2. n_skipped: number of files of the zip file that were skipped

            3. skipped_files: names of the skipped files

            4. message: informative message string
    """
    # These are hard-coded constants for now
    table_names = ['person', 'representation', 'proc_files', 'proc_files_temp']
    valid_exts  = ['.jpg', '.png', '.npy']

    # Initializes the outputs up front so every branch below can return them
    output_msg    = ''
    skipped_files = []
    records       = []

    # If image directory provided is None or is not a directory, use default
    # directory
    if not image_dir or not os.path.isdir(image_dir):
        output_msg += 'Image dir is None, does not exist or is not a '\
                   +  'directory. Using default directory instead.\n'
        image_dir = glb.IMG_DIR

    # Database does not exist
    if database_is_empty(glb.sqla_engine):
        # Do nothing, but set message
        output_msg += 'Database does not exist! '\
                   +  'Please create one before using this endpoint.\n'

    # At least one of the expected tables does not exist
    elif not all_tables_exist(glb.sqla_engine, table_names):
        # Do nothing, but set message
        output_msg += "Face representation table ('representation') "\
                   +  'does not exist! Please ensure that this table exists '\
                   +  'before using this endpoint.\n'

    # Otherwise (database is not empty and tables exist)
    else:
        output_msg += 'Extracting images in zip:'

        try:
            # Process the zip file containing the image files
            skipped_files = process_image_zip_file_nb(myfile, image_dir,
                                            t_check=t_check, n_token=n_token,
                                            valid_exts=valid_exts)

        except Exception as excpt:
            # Extraction failed: report the reason and process nothing
            output_msg += f' failed (reason: {excpt}).'

        else:
            output_msg += ' success! '
            output_msg += 'Creating database: '

            # Processes face images from the image directory
            records = process_faces_from_dir(image_dir,
                            detector_models, verifier_models,
                            detector_name  = params.detector_name,
                            verifier_names = params.verifier_names,
                            normalization  = params.normalization,
                            align          = params.align,
                            auto_grouping  = params.auto_grouping,
                            eps            = params.eps,
                            min_samples    = params.min_samples,
                            metric         = params.metric,
                            pct            = params.pct,
                            check_models   = params.check_models,
                            verbose        = params.verbose)

            # Commits the records and updates the message
            glb.sqla_session.commit()
            output_msg += ' success!'

    return {'n_records':len(records), 'n_skipped':len(skipped_files),
            'skipped_files':skipped_files, 'message':output_msg}

# ------------------------------------------------------------------------------

#### Parameters

In [None]:
# ------------------------------ Run parameters --------------------------------

# Directories
test_dir  = 'api/data/img'
image_dir = glb.IMG_DIR

# Face detection / verification
detector_name  = 'retinaface'
verifier_names = ['ArcFace']
align          = True
normalization  = 'base'
pct            = 0.02
check_models   = False

# Grouping (DBSCAN)
auto_grouping = True
eps           = 0.5
min_samples   = 2
metric        = 'cosine'

# Zip file processing
auto_rename = True
t_check     = True
n_token     = 2
valid_exts  = ['.jpg', '.png', '.npy']

# Miscellaneous
show_prog_bar = False
tags          = []
uids          = []
verbose       = False

# Debug flag of the global variables module
glb.DEBUG = False

# Bundles the parameters above into the params structure
params = CreateDatabaseParams(
    detector_name=detector_name,
    verifier_names=verifier_names,
    align=align,
    normalization=normalization,
    auto_grouping=auto_grouping,
    eps=eps,
    min_samples=min_samples,
    metric=metric,
    pct=pct,
    check_models=check_models,
    verbose=verbose,
)


#### Populating database from dir or zip

In [None]:
# Set the flag below to True to process the face images of 'test_dir' using
# the parameters defined in the 'Parameters' section
do_process_faces_from_dir = False
if do_process_faces_from_dir:
    records = process_faces_from_dir(
        test_dir, detector_models, verifier_models,
        detector_name=detector_name,
        verifier_names=verifier_names,
        normalization=normalization,
        align=align,
        auto_grouping=auto_grouping,
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        pct=pct,
        check_models=check_models,
        verbose=verbose,
    )

In [None]:
# Set the flag below to True to import the faces of a test zip file and print
# the resulting output surrounded by blank lines
test_faces_import_from_zip_nb = False
if test_faces_import_from_zip_nb:
    myfile = 'api/data/test2b.zip'
    output = faces_import_from_zip_nb(myfile, params, image_dir)

    print('\n' + str(output) + '\n')

In [None]:
# Set the flag below to True to process each test zip file in turn and list
# the files that were skipped
do_process_image_zip_file_nb = False
if do_process_image_zip_file_nb:
    myfiles = ['api/data/test1a.zip', 'api/data/test2.zip']

    for myfile in myfiles:
        print(f'Current zip {myfile}:')
        skipped_files = process_image_zip_file_nb(
            myfile, image_dir, t_check=True, n_token=2,
            valid_exts=['.jpg', '.png', '.npy'])

        # One numbered line per skipped file, or 'None' if there are none
        print(f'Skipped files (# {len(skipped_files)}):')
        if skipped_files:
            for idx, skipped in enumerate(skipped_files):
                print(f'  > ({idx}) {skipped}')
            print('')
        else:
            print('  None\n')
        print('')

#### Staging area

In [None]:
def create_group2person_map(group_nos, person_ids, verbose=True):
    """
    Maps each distinct group number to a person id.

    For every unique value in 'group_nos', the most frequent person id among
    the records of that group is chosen (on ties, the smallest id wins, as
    np.bincount(...).argmax() returns the first maximum). If the most frequent
    id is 0, the group is given a brand new id instead, starting at
    max(person_ids) + 1 and increasing by one for each such group.

    Inputs:
        1. group_nos  - group number of each record [array-like of ints].
        2. person_ids - person id of each record, aligned with 'group_nos'.
                        Must be non-negative integers (np.bincount
                        requirement) [array-like of ints].
        3. verbose    - if True, prints the mapping chosen for each group
                        [bool, default=True].

    Output:
        1. dictionary mapping each unique group number to a person id. Empty
           if both inputs are empty.

    Note: every distinct group number is mapped, including negative ones, so
    all records sharing a negative group number end up mapped to one id.
    """
    # Accepts plain lists / tuples as well as arrays: boolean-mask indexing
    # below only works on numpy arrays
    group_nos  = np.asarray(group_nos)
    person_ids = np.asarray(person_ids)

    # Nothing to map (max() of an empty sequence would raise otherwise)
    if group_nos.size == 0 and person_ids.size == 0:
        return {}

    # First id that is guaranteed not to clash with an existing person id
    new_person_id = person_ids.max() + 1

    group2person = {}
    for u_gno in np.unique(group_nos):
        # Determines the person id associated to the group number
        est_person_id = np.bincount(person_ids[group_nos == u_gno]).argmax()

        # Id 0 dominates this group, so a fresh id is allocated for it
        if est_person_id == 0:
            est_person_id  = new_person_id
            new_person_id += 1

        if verbose:
            print('unique group no:', u_gno,
                  ' |  est. person id:', est_person_id)

        group2person[u_gno] = est_person_id

    return group2person

# ------------------------------------------------------------------------------

def group_embeddings(verifier_name, eps=0.5, min_samples=2, metric='cosine'):
    """
    Clusters the stored embeddings with DBSCAN and writes the resulting group
    number of each record back to the FaceRep table.

    Inputs:
        1. verifier_name - name of the verifier whose embeddings are fetched
                           through get_embeddings_as_array [string].
        2. eps           - DBSCAN 'eps' parameter [float, default=0.5].
        3. min_samples   - DBSCAN 'min_samples' parameter [int, default=2].
        4. metric        - DBSCAN 'metric' parameter [string,
                           default='cosine'].

    Output:
        1. tuple (person_ids, group_nos, group2person, result) where:
            - person_ids   : array of person ids, with missing (None) ids
                             replaced by 0;
            - group_nos    : array of DBSCAN labels;
            - group2person : output of create_group2person_map;
            - result       : object returned by DBSCAN(...).fit(...).

    Raises:
        AssertionError if the embeddings could not be obtained.

    NOTE(review): this relies on get_embeddings_as_array returning one
    embedding per FaceRep row, in the same order as the SELECT below - TODO
    confirm. It also assumes FaceRep exposes an 'id' column identifying each
    row (the raw SQL elsewhere in this notebook reads 'representation.id').
    """
    # Attempts to get all embeddings stored in the database, given the chosen
    # verifier model's name
    try:
        embds = get_embeddings_as_array(verifier_name)
    except Exception as excpt:
        print(f'Could not get embeddings (reason: {excpt})')
        raise AssertionError('Could not get embeddings.') from excpt

    # Clusters embeddings using DBSCAN algorithm
    result = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(embds)

    # Gets the id and person id of every face representation
    stmt = select(FaceRep.id, FaceRep.person_id)
    rows = glb.sqla_session.execute(stmt).all()

    # Missing person ids are encoded as 0
    rep_ids    = [row[0] for row in rows]
    person_ids = np.array([0 if row[1] is None else row[1] for row in rows])
    group_nos  = np.array(result.labels_)

    group2person = create_group2person_map(group_nos, person_ids)

    # Stores each record's own group number. A single UPDATE without a WHERE
    # clause would try to write the whole label array into every row, so the
    # rows are updated one by one (int() converts the numpy label into a plain
    # Python integer for the database driver).
    for rep_id, group_no in zip(rep_ids, group_nos):
        stmt = update(FaceRep).where(FaceRep.id == rep_id)\
                              .values(group_no=int(group_no))
        glb.sqla_session.execute(stmt)
    glb.sqla_session.commit()

    return person_ids, group_nos, group2person, result



# ------------------------------------------------------------------------------

# for updating the FaceRep inside a cluster, then use this query:
# (existing_person_id is the value you got from previous query)

# update(FaceRep).values(person_id = existing_person_id).where((FaceRep.group_no == cluster_x) & (FaceRep.person_id == None))


# Analytically, these are the steps after importing new images:

# 1. Run DBSCAN on all the images in img_dir. This will fill the group_no field with the cluster ID for both existing and new images.
# 2. For each cluster:
#   2.1. We get one record whose person_id is already not null.
#   2.2. For the remaining records in the cluster whose person_id is null, we update the field with that person ID.
#   2.3. We set group_no = -2 for all of them in the cluster.

# Clusters the 'ArcFace' embeddings and keeps all four returned values as
# notebook-level variables
person_ids, group_nos, group2person, dbscan_out = group_embeddings(
    'ArcFace', eps=0.5, min_samples=2, metric='cosine')

# Prints a two-column table (group number | person id), one row per record
print()
print('g. nos | p. ids')
print('|'.join(['-' * 7] * 2))
for pid, gnb in zip(person_ids, group_nos):
    print(' | '.join((str(gnb).center(6), str(pid).center(6))))

# Prints the group number -> person id mapping, indented, after a blank line
print(f'\n    {group2person!s}')


## Luca's code starts here

In [None]:
# Clustering settings used by this cell (they become notebook-level variables)
verifier_name, eps, min_samples, metric = 'ArcFace', 0.5, 2, 'cosine'

# Fetches the embeddings for the chosen verifier. Any failure is reported and
# re-raised as an AssertionError.
try:
    embds = get_embeddings_as_array(verifier_name)
except Exception as excpt:
    print(f'Could not get embeddings (reason: {excpt})')
    raise AssertionError('Could not get embeddings.')

# Clusters embeddings using DBSCAN algorithm and shows the label of each one
result = DBSCAN(
    eps=eps,
    min_samples=min_samples,
    metric=metric,
).fit(embds)
print(result.labels_)

In [None]:
# Empties the temporary clustering table and commits the deletion
glb.sqla_session.execute(delete(tempClustering))
glb.sqla_session.commit()

# Re-populates it with one row per label in 'result.labels_'. Each label goes
# through int() first (presumably so the driver receives plain Python integers
# rather than numpy ones).
group_no = [dict(group_no=int(label)) for label in result.labels_]
query    = insert(tempClustering).values(group_no)
glb.sqla_session.execute(query)
glb.sqla_session.commit()


SELECT temp_clustering.group_no 
FROM temp_clustering
0
0
1
1
1
1
2
2
1
1
2
2
2
0


In [None]:
# check the temp table: prints the SELECT statement itself, then every
# group_no currently stored in tempClustering (one per line)
query = select(tempClustering.group_no)
print(query)
# NOTE: 'result' is rebound here from the DBSCAN output to a query result, so
# the cell that reads 'result.labels_' can only be re-run after the clustering
# cell has been executed again
result = glb.sqla_session.execute(query)
for item in result.all():
    print(item.group_no)

In [None]:
# Builds the list of distinct cluster ids found in the temp table. GROUP BY
# collapses the rows so that each group_no appears once.
new_cluster_list = []

query = select(tempClustering.group_no).group_by(tempClustering.group_no)
print(query)

result = glb.sqla_session.execute(query)
new_cluster_list.extend(row.group_no for row in result.all())

print(new_cluster_list)

In [None]:
# Propagates known person ids inside each cluster: if any record of the
# cluster already has a person id, that id is copied to the records of the
# same cluster that have none. Otherwise the ids of the cluster's records are
# printed.
#
# NOTE(review): every value in 'new_cluster_list' is processed, including -1
# if present (the label DBSCAN gives to noise samples) - confirm whether
# unrelated noise samples should really share a person id.
for cluster in new_cluster_list:
    # Looks for one record of this cluster with a person id already set.
    # ('!= None' / '== None' are intentional: SQLAlchemy renders them as
    # IS NOT NULL / IS NULL.)
    query  = select(FaceRep.person_id).where(
        (FaceRep.group_no == cluster) & (FaceRep.person_id != None)).limit(1)
    result = glb.sqla_session.execute(query).first()

    if result is not None:
        # we have a match
        print("match for", cluster)
        person_id = result.person_id
        print(person_id)

        # Fills in the person id only where it is still missing
        query = update(FaceRep).values(person_id=person_id).where(
            (FaceRep.group_no == cluster) & (FaceRep.person_id == None))
        glb.sqla_session.execute(query)
        glb.sqla_session.commit()

    else:
        print("no match for", cluster)

        # Lists the records of the *current* cluster, provided no record in it
        # has a person id. The cluster id is passed as a bound parameter
        # instead of being hard-coded to 0.
        query  = text("SELECT rep1.id FROM representation AS rep1 "
                      "WHERE rep1.group_no = :cluster "
                      "AND NOT EXISTS ("
                      "SELECT 1 FROM representation AS rep2 "
                      "WHERE rep2.group_no = :cluster "
                      "AND rep2.person_id IS NOT NULL)")
        result = glb.sqla_session.execute(query, {'cluster': cluster})
        for item in result:
            print(item.id)


In [18]:
# Prototype of the "no match" query: prints the raw SQL, then the id of every
# row it returns.
# NOTE: 'cluster' is assigned but never used - the SQL text below hard-codes
# group_no == 0 in both the outer query and the subquery.
cluster = 0
query = text("SELECT rep1.id FROM representation AS rep1 WHERE (rep1.group_no == 0) & ((SELECT rep2.id FROM representation AS rep2 WHERE (rep2.group_no == 0) & (rep2.person_id IS NOT NULL)) IS NULL)")
print(query)
result = glb.sqla_session.execute(query)
for item in result:
    print(item.id)

SELECT rep1.id FROM representation AS rep1 WHERE (rep1.group_no == 0) & ((SELECT rep2.id FROM representation AS rep2 WHERE (rep2.group_no == 0) & (rep2.person_id IS NOT NULL)) IS NULL)
1
2
14
