# Imports and Initial Drive Mount


In [7]:
# install library
# !pip install -U tensorflow-addons
# !pip install facenet-pytorch


In [8]:
import os
# Mount drive
# from google.colab import drive
# drive.mount("/content/drive")
# path = "/content/drive/My Drive/Colab Notebooks/face_recognize"
# os.chdir(path)

import time
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras import models, layers, metrics, optimizers, Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
import cv2
import math
import io
import pickle
import tensorflow_datasets as tfds
import random
import csv

from train_tensorflow.inceptionresnetv1 import InceptionResNetV1
from train_tensorflow.models import convert_model_to_embedding,\
    LayerBeforeArcFace, ArcFaceLoss, \
    call_instance_model, call_instance_model_old, convert_dense_layer_to_arcface,\
    special_convert_dense_layer_to_arcface
from train_tensorflow.Classify import Classify
from tool.FormatFunction import FormatFunction
from tool.FileFunction import FileFunction
from tool.GlobalValue import GlobalValue


In [9]:
# Report how many GPUs TensorFlow can see before any model is built.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  1


# Train

## Init value

In [10]:
# --- Run switches -----------------------------------------------------------
# True: rescan the raw dataset folders and rewrite the pickled caches.
# False: reuse the caches already on disk.
READ_RAW_DATA_THEN_SAVE = False
# True: take the Dense -> ArcFace conversion branch instead of resuming
# from the newest checkpoint.
is_convert = False

# --- Model identity ---------------------------------------------------------
head_type = "Dense"
backbone_type = "InceptionResNetV2Old"
MODEL_NAME = "160-64-{}-{}(v1)".format(backbone_type, head_type)
# Checkpoints are written to <cwd>/save_model/<MODEL_NAME>.
path_save_model = os.path.join(os.getcwd(), "save_model", MODEL_NAME)

# --- Hyper-parameters and helpers -------------------------------------------
# NOTE(review): MODEL_NAME is prefixed "160-64" but the image width here is
# 260 -- confirm that the non-square 160x260 input is intended.
hyper_parameters = dict(
    image_size=[160, 260],
    batch_size=64,
    shuffle_size=1000,
    ratio_train=0.9,
    ratio_test=0.1,
    ratio_valid=0.0,
    epochs=40,
    small_epochs=50,
    image_each_class=15,
)
global_value = GlobalValue(**hyper_parameters)
format_function = FormatFunction(global_value)
file_function = FileFunction()


## Prepare folders and cached data

In [11]:
# Create the folder that holds this model's checkpoints.
# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safely re-runnable.
os.makedirs(path_save_model, exist_ok=True)

# Every pickled cache lives under <cwd>/cache/data; raw images live under
# <cwd>/dataset/<dataset name>.
cache_data_dir = os.path.join(os.getcwd(), "cache", "data")
dataset_dir = os.path.join(os.getcwd(), "dataset")

if READ_RAW_DATA_THEN_SAVE:
    # On a first run the cache folder does not exist yet; without this the
    # open(..., 'wb') calls below fail with FileNotFoundError.
    os.makedirs(cache_data_dir, exist_ok=True)

# Label dictionary: the training cell indexes it with each image's parent
# folder name to obtain the label used for training.
path = os.path.join(cache_data_dir, "label_dict.pkl")
if READ_RAW_DATA_THEN_SAVE:
    label_dict = dict()
    for dataset_name in ("CASIA_align", "AFDB"):
        label_dict.update(format_function.get_label_dict(
            os.path.join(dataset_dir, dataset_name)))
    with open(path, 'wb') as file:
        pickle.dump(label_dict, file)
# Always read the dictionary back from the cache so both modes use exactly
# what is on disk.
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were produced by this notebook.
with open(path, 'rb') as f:
    label_dict = pickle.load(f)

# Save the image path lists to file so later runs can read them faster.
if READ_RAW_DATA_THEN_SAVE:
    # Paths of the images without masks.
    path_image_no_mask = list()
    for dataset_name in ("CASIA_align", "AFDB"):
        path_image_no_mask.extend(file_function.get_data_path_by_dictionary(
            os.path.join(dataset_dir, dataset_name)))
    saved_path = os.path.join(cache_data_dir, "path_image_no_mask.pkl")
    with open(saved_path, 'wb') as file:
        pickle.dump(path_image_no_mask, file)

    # Paths of the masked images.
    path_image_mask = list()
    for dataset_name in ("CASIA_mask", "AFDB_mask"):
        path_image_mask.extend(file_function.get_data_path_by_dictionary(
            os.path.join(dataset_dir, dataset_name)))
    saved_path = os.path.join(cache_data_dir, "path_image_mask.pkl")
    with open(saved_path, 'wb') as file:
        pickle.dump(path_image_mask, file)


## Start training

# Train version 1


In [12]:
# Build the model, resume from the newest checkpoint (or convert a Dense
# model), compile it, then train one "big epoch" per loop iteration, saving
# weights and appending a CSV log row after each.

# Drop Keras' global state before building a new model, so that re-running
# this cell in the same kernel does not stack a second model's state on top
# of the first.
# NOTE(review): the recorded OOM names `batch_normalization_205` for an early
# layer, which suggests a model had already been built in that kernel --
# confirm whether restarting the kernel also avoids the OOM.
tf.keras.backend.clear_session()

# Create embedding model
# tf.config.run_functions_eagerly(True)
input_shape = (global_value.IMAGE_SIZE[0], global_value.IMAGE_SIZE[1], 3)
# NOTE(review): `input` shadows the builtin and is not used in this cell;
# kept in case a later cell reads it.
input = Input(shape=input_shape)
model = call_instance_model_old(input_shape, num_classes=len(label_dict), embd_shape=512, head_type=head_type,
                                backbone_type=backbone_type)

# Number of the next epoch to train; checkpoints are named epoch<N>.h5.
actual_epochs = 1
if is_convert:
    path_dense = os.path.join(os.getcwd(), "save_model", "160-64-InceptionResNetV1-Dense(v1)",
                              "epoch35.h5")
    # NOTE(review): `model_name` is not defined in any cell visible here, so
    # this branch raises NameError on a fresh kernel -- confirm which value
    # the helper expects before enabling is_convert.
    model = special_convert_dense_layer_to_arcface(path_dense, input_shape,
                                                   len(label_dict), 512, model_name=model_name)
else:
    # ----find the latest epoch: advance until the first missing checkpoint
    while os.path.exists(os.path.join(
            path_save_model, "epoch{}.h5".format(actual_epochs))):
        actual_epochs += 1

    # Load saved model (the checkpoint just before the first missing one)
    if actual_epochs != 1:
        load_path = os.path.join(
            path_save_model, "epoch{}.h5".format(actual_epochs-1))
        print("load ", load_path)
        model.load_weights(load_path)

# Pick the loss that matches the classification head.
if head_type == 'ArcFace':
    loss_function = ArcFaceLoss(scale=30)
elif head_type == "Dense":
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
else:
    # Without this branch an unknown head type only surfaces later as a
    # NameError on `loss_function`.
    raise ValueError(
        "Unsupported head_type {!r}; expected 'ArcFace' or 'Dense'".format(head_type))
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=loss_function,
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)
# model.summary()
# tf.keras.utils.plot_model(model, to_file="model.png", show_shapes=True)

# Locations used on every iteration of the training loop.
cache_data_dir = os.path.join(os.getcwd(), "cache", "data")
log_dir = os.path.join(os.getcwd(), "cache", "log")
# Create the log folder up front so a missing directory cannot fail the log
# write after a full epoch of training.
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, MODEL_NAME+".csv")

# Normal train network
for _ in range(global_value.EPOCHS):
    # Measure time
    now = time.time()

    # Read data paths from the cache files, then limit them.
    # The lists are reloaded on every iteration because they are extended and
    # shuffled in place below.
    # presumably get_data_path_with_limit keeps at most IMAGE_EACH_CLASS
    # images per class -- confirm in FileFunction.
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # produced by this notebook.
    with open(os.path.join(cache_data_dir, "path_image_no_mask.pkl"), 'rb') as f:
        path_image_no_mask = pickle.load(f)
        path_image_no_mask = file_function.get_data_path_with_limit(
            path_image_no_mask, global_value.IMAGE_EACH_CLASS)
    with open(os.path.join(cache_data_dir, "path_image_mask.pkl"), 'rb') as f:
        path_image_mask = pickle.load(f)
        path_image_mask = file_function.get_data_path_with_limit(
            path_image_mask, global_value.IMAGE_EACH_CLASS)

    # Combine data paths (masked + unmasked) and shuffle them
    path_image_no_mask.extend(path_image_mask)
    random.shuffle(path_image_no_mask)

    # Index label (change label of data from string to number): the label
    # string is the image's parent folder name.
    label_index = list()
    for image_path in path_image_no_mask:
        label = image_path.split(os.sep)[-2]
        label_index.append(label_dict[label])
    path_dataset = tf.data.Dataset.from_tensor_slices(path_image_no_mask)
    label_dataset = tf.data.Dataset.from_tensor_slices(label_index)
    origin_dataset = tf.data.Dataset.zip((path_dataset, label_dataset))

    # Repeat data
    # origin_dataset  = origin_dataset.shuffle(global_value.SHUFFLE_SIZE).repeat(2)

    # Split train, test dataset. The literals mirror the ratio_train /
    # ratio_test / ratio_valid values passed to GlobalValue.
    train_dataset, test_dataset, _ = format_function.get_dataset_partition(
        origin_dataset, 0.9, 0.1, 0)

    # read data from path
    train_dataset = train_dataset.map(
        format_function.process_image, num_parallel_calls=tf.data.AUTOTUNE)
    test_dataset = test_dataset.map(
        format_function.process_image, num_parallel_calls=tf.data.AUTOTUNE)

    # augmentation data(flip, rotate,...)
    train_dataset = train_dataset.map(
        format_function.augment_data, num_parallel_calls=tf.data.AUTOTUNE)
    # NOTE(review): the validation split is augmented as well, which makes
    # val_* metrics depend on the augmentation -- confirm this is intended.
    test_dataset = test_dataset.map(
        format_function.augment_data, num_parallel_calls=tf.data.AUTOTUNE)

    # batch data
    train_dataset = train_dataset.batch(global_value.BATCH_SIZE)
    test_dataset = test_dataset.batch(global_value.BATCH_SIZE)

    # Prefetch to overlap input preparation with training
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

    print("--------------------------big epoch {}--------------------------".format(actual_epochs))
    history = model.fit(
        train_dataset,
        epochs=1,
        validation_data=test_dataset
    )
    model.save_weights(os.path.join(
        path_save_model, "epoch{}.h5".format(actual_epochs)))

    # Append one log row per big epoch.
    # NOTE(review): each history.history[...] value is a list (one entry per
    # fit epoch), so the cells are written as list reprs such as "[0.5]".
    # The format is kept unchanged to stay consistent with rows already in
    # the file.
    with open(log_path, "a", newline='') as f:
        row = [actual_epochs, history.history['loss'], history.history['sparse_categorical_accuracy'],
               history.history['val_loss'], history.history['val_sparse_categorical_accuracy'], time.time() - now]
        writer = csv.writer(f)
        writer.writerow(row)
    actual_epochs += 1
    # https://stackoverflow.com/questions/64734474/how-to-calculate-accuracy-for-facial-recognition-system


load  g:\My Drive\Colab Notebooks\face_recognize\save_model\160-64-InceptionResNetV2Old-Dense(v1)\epoch34.h5
--------------------------big epoch 35--------------------------


ResourceExhaustedError: Graph execution error:

Detected at node 'InceptionResNetV2Old/batch_normalization_205/FusedBatchNormV3' defined at (most recent call last):
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\traitlets\config\application.py", line 1041, in launch_instance
      app.start()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\kernelapp.py", line 724, in start
      self.io_loop.start()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\asyncio\base_events.py", line 1899, in _run_once
      handle._run()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\kernelbase.py", line 512, in dispatch_queue
      await self.process_one()
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\kernelbase.py", line 501, in process_one
      await dispatch(*args)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\kernelbase.py", line 408, in dispatch_shell
      await result
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\kernelbase.py", line 731, in execute_request
      reply_content = await reply_content
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\ipkernel.py", line 417, in do_execute
      res = shell.run_cell(
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\ipykernel\zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\IPython\core\interactiveshell.py", line 2945, in run_cell
      result = self._run_cell(
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\IPython\core\interactiveshell.py", line 3000, in _run_cell
      return runner(coro)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\IPython\core\interactiveshell.py", line 3203, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\IPython\core\interactiveshell.py", line 3382, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\nguye\AppData\Local\Temp\ipykernel_38876\3380202306.py", line 103, in <module>
      history = model.fit(
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\layers\normalization\batch_normalization.py", line 850, in call
      outputs = self._fused_batch_norm(inputs, training=training)
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\layers\normalization\batch_normalization.py", line 660, in _fused_batch_norm
      output, mean, variance = control_flow_util.smart_cond(
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\utils\control_flow_util.py", line 108, in smart_cond
      return tf.__internal__.smart_cond.smart_cond(
    File "c:\Users\nguye\Anaconda3\envs\python310\lib\site-packages\keras\layers\normalization\batch_normalization.py", line 634, in _fused_batch_norm_training
      return tf.compat.v1.nn.fused_batch_norm(
Node: 'InceptionResNetV2Old/batch_normalization_205/FusedBatchNormV3'
OOM when allocating tensor with shape[64,64,77,127] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node InceptionResNetV2Old/batch_normalization_205/FusedBatchNormV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_74862]