# Import libraries

In [7]:
import os
import cv2
import dlib
import tqdm
import glob
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import face_recognition as fr

### Tensorflow dependencies ###
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

from imutils.video import WebcamVideoStream

### Other constants ###
batch_size = 64

real_dir = '/home/minhhieu/Desktop/Hieu/datasets/scene01/real'
attack_dir = '/home/minhhieu/Desktop/Hieu/datasets/scene01/attack'

real_imgs_dir = '/home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/real'
attack_imgs_dir = '/home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/attack'

tfrecord_output_path = '/home/minhhieu/Desktop/Hieu/datasets/MSU_MFSD_presentation_attack.tfrecord'

# Convert videos data to images data

In [91]:
# with face detection
def videos_to_images(video_dir, output_imgs_dir, images_per_vid=30):
    print(f'[INFO] Converting {video_dir} --> {output_imgs_dir}')
    files = glob.glob(f'{video_dir}/*.mov') + glob.glob(f'{video_dir}/*.mp4')
    for file in tqdm.tqdm(files):
        num_frames = 0
        vid_stream = cv2.VideoCapture(file)
        
        while(num_frames < images_per_vid):
            filename = file.split('.')[0]
            filename = filename.split('/')[-1]
            filename = filename + f"_{num_frames}.jpg"
            filename = os.path.join(output_imgs_dir, filename)
            
            ret, img = vid_stream.read()

            if(ret):
                cv2.imwrite(filename, img)
                num_frames += 1
            else:
                break
        
        vid_stream.release()
        
def crop_face_images(imgs_dir):
    print(f'[INFO] Cropping face from images in {imgs_dir}...')
    files = glob.glob(f'{imgs_dir}/*.jpg') + glob.glob(f'{imgs_dir}/*.png') + glob.glob(f'{imgs_dir}/*.jpeg')
    for file in tqdm.tqdm(files):
        img = cv2.imread(file)
        locations = fr.face_locations(img)
        if(len(locations) > 0):
            y1, x2, y2, x1 = locations[0]
            face = img[y1:y2, x1:x2]
            cv2.imwrite(file, face)
    
videos_to_images(real_dir, real_imgs_dir, images_per_vid=30)
videos_to_images(attack_dir, attack_imgs_dir, images_per_vid=10)

crop_face_images(real_imgs_dir)
crop_face_images(attack_imgs_dir)

  1%|▏         | 1/69 [00:00<00:11,  6.05it/s]

[INFO] Converting /home/minhhieu/Desktop/Hieu/datasets/scene01/real --> /home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/real


100%|██████████| 69/69 [00:08<00:00,  7.90it/s]
  1%|          | 2/210 [00:00<00:11, 17.37it/s]

[INFO] Converting /home/minhhieu/Desktop/Hieu/datasets/scene01/attack --> /home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/attack


100%|██████████| 210/210 [00:10<00:00, 19.40it/s]
  0%|          | 1/2070 [00:00<04:14,  8.12it/s]

[INFO] Cropping face from images in /home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/real...


100%|██████████| 2070/2070 [03:55<00:00,  8.80it/s]
  0%|          | 3/6300 [00:00<05:32, 18.95it/s]

[INFO] Cropping face from images in /home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/attack...


100%|██████████| 6300/6300 [05:09<00:00, 20.34it/s]


# Parse video data to tfrecord of (image, label) pairs

In [92]:
detector = dlib.get_frontal_face_detector()

def _bytes_feature(value):
    if(isinstance(value, type(tf.constant(0)))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def make_example(filename, label):
    assert isinstance(filename, str) and isinstance(label, int)
    
    feature = {
        'image/filename' : _bytes_feature(str.encode(filename)),
        'image/encoded' : _bytes_feature(open(filename, 'rb').read()),
        'image/label' : _int64_feature(label),
    }
    
    return tf.train.Example(features=tf.train.Features(feature=feature))

def parse_images_to_tfrecord(files, output_path, images_per_file=30,):
    print(f'[INFO] parsing images from {os.path.dirname(files[0])} to tfrecords ... ')
    
    samples = []
    for file in tqdm.tqdm(files):        
        label = file.split('/')[-2]
        label = 1 if label == 'attack' else 0 
        
        samples.append((file, label))
    
                
    print(f'[INFO] Writing to tfrecord file : {output_path} ...')
    with tf.io.TFRecordWriter(output_path) as writer:
        for filename, label in samples:
            tf_example = make_example(filename, label)
            writer.write(tf_example.SerializeToString())
            
    print('DONE!')
            
attack_imgs_files = glob.glob(f'{attack_imgs_dir}/*.jpg')
real_imgs_files = glob.glob(f'{real_imgs_dir}/*.jpg')
parse_images_to_tfrecord(attack_imgs_files+real_imgs_files, tfrecord_output_path, images_per_file=1)

100%|██████████| 8370/8370 [00:00<00:00, 2058539.02it/s]

[INFO] parsing images from /home/minhhieu/Desktop/Hieu/datasets/scene01/imgs/attack to tfrecords ... 
[INFO] Writing to tfrecord file : /home/minhhieu/Desktop/Hieu/datasets/MSU_MFSD_presentation_attack.tfrecord ...





DONE!


# Read data back from tfrecord

In [52]:
def standardize_image(image):
    mean = tf.math.reduce_mean(image)
    std = tf.math.reduce_std(image)
    
    image = tf.math.subtract(image, mean)
    image = tf.math.divide(image, std)
    
    return image

def normalize_image(image):
    image = tf.math.subtract(image, 127.5)
    image = tf.math.divide(image, 127.5)
    
    return image

def _parse_tfrecord(color_space=None):
    def _parse_func(example_proto, img_dim=(128, 128), color_space=color_space):
        feature_desc = {
            'image/filename' : tf.io.FixedLenFeature([], tf.string),
            'image/encoded' : tf.io.FixedLenFeature([], tf.string),
            'image/label' : tf.io.FixedLenFeature([], tf.int64)
        }

        x = tf.io.parse_single_example(example_proto, feature_desc)
        image = tf.image.decode_jpeg(x['image/encoded'])
        image = tf.image.resize(image, img_dim)
        image = tf.cast(image, dtype=tf.float32)
        
        ### Convert colorspace if needed ###
        if(color_space is not None):
            if(color_space == 'hsv') : image = tf.image.rgb_to_hsv(image)

        ### Normalize/Standardization ###
        image = standardize_image(image)
        image = tf.convert_to_tensor(image)
        depth = tf.ones((52,52,1))
        if(x['image/label'] == tf.constant(1, dtype=tf.int64)): # if spoof -> depth is zero
            depth = tf.zeros((52,52,1))
        label = tf.one_hot(x['image/label'], depth=2)

        return image, label, depth

    return _parse_func

def read_from_tfrecord(tfrecord_file, colorspace=None,batch_size=64, buffer_size=40000):
    dataset_len = int(sum(1 for _ in tf.data.TFRecordDataset(tfrecord_file)))
    parse_func = _parse_tfrecord(color_space=colorspace)
    
    num_batches = dataset_len // batch_size
    train_size = int(num_batches * 0.7)
    test_size = num_batches - train_size
    
    raw_dataset = tf.data.TFRecordDataset(tfrecord_file)
    raw_dataset = raw_dataset.repeat()
    raw_dataset = raw_dataset.shuffle(buffer_size=buffer_size)
    
    dataset = raw_dataset.map(parse_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    
    train_dataset = dataset.take(train_size)
    test_dataset = dataset.skip(train_size)
    
    return dataset_len, train_dataset, test_dataset

# dataset_len, train_dataset, test_dataset = read_from_tfrecord(tfrecord_output_path, batch_size=64, colorspace='hsv')
# x, y, depth = next(iter(train_dataset))
# x = x.numpy()[0]
# print(y, depth)
# plt.imshow(x)
# plt.show()

# Create patch-based CNN model

In [4]:
class RandomCrop(tf.keras.layers.Layer):
    def __init__(self, input_shape, patch_dim):
        super(RandomCrop, self).__init__()
        self.patch_dim = patch_dim
                
    def call(self, inputs):
        return tf.image.random_crop(inputs, size=self.patch_dim)


def _conv_bn_pool_block(filters, kernel_size=5, pool_size=2):
    block = Sequential([
        Conv2D(filters, kernel_size=kernel_size, strides=1, padding='same'),
        BatchNormalization(),
        MaxPooling2D(pool_size=pool_size)
    ])
    
    return block

def patch_based_cnn(input_dim=(128, 128, 3), patch_dim=(96,96,3), batch_size=64, name='PatchBasedCNN'):
    inputs = Input(shape=input_dim)
    patch_inputs = RandomCrop(input_dim, patch_dim=(batch_size, patch_dim[0], patch_dim[1], patch_dim[2]))(inputs)
    
    x = _conv_bn_pool_block(50)(patch_inputs)
    x = _conv_bn_pool_block(100)(x)
    x = _conv_bn_pool_block(150)(x)
    x = _conv_bn_pool_block(200)(x)
    x = _conv_bn_pool_block(250)(x)
    x = Flatten()(x)
    
    fc = Dense(1000, activation='relu')(x)
    fc = BatchNormalization()(fc)
    fc = Dropout(0.5)(fc)
    
    fc = Dense(400, activation='relu')(fc)
    fc = BatchNormalization()(fc)
    output = Dense(2, activation='softmax')(fc)

    model = Model(inputs = inputs, outputs=output, name=name)
    return model    

model = patch_based_cnn()
model.summary()

Model: "PatchBasedCNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128, 128, 3)]     0         
_________________________________________________________________
random_crop (RandomCrop)     (64, 96, 96, 3)           0         
_________________________________________________________________
sequential (Sequential)      (64, 48, 48, 50)          4000      
_________________________________________________________________
sequential_1 (Sequential)    (64, 24, 24, 100)         125500    
_________________________________________________________________
sequential_2 (Sequential)    (64, 12, 12, 150)         375750    
_________________________________________________________________
sequential_3 (Sequential)    (64, 6, 6, 200)           751000    
_________________________________________________________________
sequential_4 (Sequential)    (64, 3, 3, 250)         

# Create depth-based CNN model

In [5]:
def _3conv_pool_block(filters, kernel_size=3, pool_size=2):
    assert isinstance(filters, list) and len(filters) == 3
    block = Sequential([
        Conv2D(filters[0], kernel_size=kernel_size, strides=1, padding='same'),
        LeakyReLU(alpha=0.2),
        Conv2D(filters[1], kernel_size=kernel_size, strides=1, padding='same'),
        LeakyReLU(alpha=0.2),
        Conv2D(filters[2], kernel_size=kernel_size, strides=1, padding='same'),
        LeakyReLU(alpha=0.2),
        MaxPooling2D(pool_size=pool_size)
    ])
    
    return block

def _conv_convT_block(filters, conv_kernel=3, convT_kernel=6):
    assert isinstance(filters, list) and len(filters) == 2
    block = Sequential([
        Conv2D(filters[0], kernel_size=conv_kernel, strides=1, padding='same'),
        Conv2DTranspose(filters[1], kernel_size=convT_kernel, strides=1, padding='valid')
    ])

    return block

def depth_based_cnn(input_shape=(128, 128, 3), name="DepthBasedCNN"):
    inputs = Input(shape=input_shape)
    
    x = _3conv_pool_block([64, 64, 128])(inputs)
    x = _3conv_pool_block([128,256,160])(x)
    x = _conv_convT_block([128, 128])(x)
    x = _conv_convT_block([128, 128])(x)
    x = _conv_convT_block([160, 160])(x)
    x = _conv_convT_block([320, 320])(x)
    output = Conv2D(1, kernel_size=3, strides=1, padding='same')(x)
    
    model = Model(inputs=inputs, outputs=output, name=name)
    return model

model = depth_based_cnn()
model.summary()

Model: "DepthBasedCNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 128, 128, 3)]     0         
_________________________________________________________________
sequential_5 (Sequential)    (None, 64, 64, 128)       112576    
_________________________________________________________________
sequential_6 (Sequential)    (None, 32, 32, 160)       811552    
_________________________________________________________________
sequential_7 (Sequential)    (None, 37, 37, 128)       774400    
_________________________________________________________________
sequential_8 (Sequential)    (None, 42, 42, 128)       737536    
_________________________________________________________________
sequential_9 (Sequential)    (None, 47, 47, 160)       1106240   
_________________________________________________________________
sequential_10 (Sequential)   (None, 52, 52, 320)     

# Combine the two models and train

In [50]:
def full_model(input_dim=(128, 128, 3)):
    inputs = Input(shape=input_dim)
    
    depth_cnn = depth_based_cnn()
    patch_cnn = patch_based_cnn()
    
    spoof_score = patch_cnn(inputs)
    depth_map   = depth_cnn(inputs)
    
    model = Model(inputs=inputs, outputs=[spoof_score, depth_map], name='DepthBased_PatchBased_CNN')
    return model

losses = {
    'PatchBasedCNN' : tf.keras.losses.BinaryCrossentropy(),
    'DepthBasedCNN' : tf.keras.losses.MeanSquaredError()
}

optimizer = Adam(lr=0.0001, amsgrad=True, beta_1=0.5)

model = full_model()
model.compile(optimizer=optimizer, loss=losses)
print(model.summary())

Model: "DepthBased_PatchBased_CNN"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 128, 128, 3) 0                                            
__________________________________________________________________________________________________
PatchBasedCNN (Functional)      (64, 2)              5165302     input_12[0][0]                   
__________________________________________________________________________________________________
DepthBasedCNN (Functional)      (None, 52, 52, 1)    7693025     input_12[0][0]                   
Total params: 12,858,327
Trainable params: 12,854,027
Non-trainable params: 4,300
__________________________________________________________________________________________________
None


In [60]:
for i in range(100):
    print(f'Epoch {i+1}/100')
    for j in range(dataset_len // batch_size):
        images, labels, depths = next(iter(train_dataset))
        y = {
            'DepthBasedCNN' : depths,
            'PatchBasedCNN' : labels
        }
        
        total, depth_loss, patch_loss = model.train_on_batch(images, y)
        print(f'    Batch #{j+1}, depth loss = {depth_loss}, patch loss = {patch_loss}, total = {total}')

Epoch 1/100
    Batch #1, depth loss = 0.3752802014350891, path loss = 0.12390800565481186, total = 0.49918821454048157
    Batch #2, depth loss = 0.414543092250824, path loss = 0.13238564133644104, total = 0.5469287633895874
    Batch #3, depth loss = 0.2534486651420593, path loss = 0.19448532164096832, total = 0.44793397188186646
    Batch #4, depth loss = 0.13797500729560852, path loss = 0.14190071821212769, total = 0.2798757255077362
    Batch #5, depth loss = 0.41948434710502625, path loss = 0.12218107283115387, total = 0.5416654348373413
    Batch #6, depth loss = 0.2716611623764038, path loss = 0.16272607445716858, total = 0.4343872368335724
    Batch #7, depth loss = 0.22607339918613434, path loss = 0.1834670901298523, total = 0.40954047441482544
    Batch #8, depth loss = 0.33743685483932495, path loss = 0.16236276924610138, total = 0.49979960918426514
    Batch #9, depth loss = 0.27310189604759216, path loss = 0.15228591859340668, total = 0.42538779973983765
    Batch #10, de

    Batch #77, depth loss = 0.24274104833602905, path loss = 0.13089409470558167, total = 0.3736351430416107
    Batch #78, depth loss = 0.20622959733009338, path loss = 0.14956827461719513, total = 0.3557978868484497
    Batch #79, depth loss = 0.16658452153205872, path loss = 0.13125042617321014, total = 0.29783493280410767
    Batch #80, depth loss = 0.23372139036655426, path loss = 0.15867149829864502, total = 0.3923928737640381
    Batch #81, depth loss = 0.4161258041858673, path loss = 0.1313847154378891, total = 0.5475105047225952
    Batch #82, depth loss = 0.16484759747982025, path loss = 0.21807944774627686, total = 0.3829270601272583
    Batch #83, depth loss = 0.1633087545633316, path loss = 0.16806866228580475, total = 0.33137741684913635
    Batch #84, depth loss = 0.411620557308197, path loss = 0.16352209448814392, total = 0.5751426219940186
    Batch #85, depth loss = 0.1519843339920044, path loss = 0.16450132429599762, total = 0.3164856433868408
    Batch #86, depth lo

    Batch #22, depth loss = 0.1359390914440155, path loss = 0.20385272800922394, total = 0.33979183435440063
    Batch #23, depth loss = 0.18059226870536804, path loss = 0.2087990641593933, total = 0.38939133286476135
    Batch #24, depth loss = 0.10217852145433426, path loss = 0.17205068469047546, total = 0.2742291986942291
    Batch #25, depth loss = 0.12012328207492828, path loss = 0.15105900168418884, total = 0.2711822986602783
    Batch #26, depth loss = 0.2355806529521942, path loss = 0.25329798460006714, total = 0.48887863755226135
    Batch #27, depth loss = 0.18251237273216248, path loss = 0.23588205873966217, total = 0.41839444637298584


KeyboardInterrupt: 

# Train two models separately