In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from mtcnn.mtcnn import MTCNN
import json
from video_loader import VideoReader, ExtractedFeatureLoader
import cv2
import matplotlib.pyplot as plt
import gc
%matplotlib inline


In [14]:
class EfficientNetLite(object):
    """Run a TFLite model and read one tensor, addressed by index, as its output.

    NOTE(review): ``output_layer_ind`` is a raw interpreter tensor index. If it
    points at an intermediate tensor rather than a declared model output,
    confirm that the installed TFLite runtime still holds its values after
    ``invoke()``.
    """

    def __init__(self, path, output_layer_ind=158):
        """Load the model and allocate its tensors.

        Arguments:
            path {str} -- path to the ``.tflite`` model file

        Keyword Arguments:
            output_layer_ind {int} -- index of the tensor returned by
                ``extract_from_image`` (default: {158})
        """
        self.interpreter = tf.lite.Interpreter(model_path=path)
        self.interpreter.allocate_tensors()
        # Ask the interpreter where its input lives instead of assuming that
        # the input is always tensor 0.
        self.in_ind = self.interpreter.get_input_details()[0]['index']
        self.out_ind = output_layer_ind

    def extract_from_image(self, img):
        """Feed ``img`` to the model, run it once and return the chosen tensor.

        Arguments:
            img -- input batch handed unchanged to ``Interpreter.set_tensor``;
                it must already have the shape and dtype the model expects

        Returns:
            the value of tensor ``self.out_ind`` as given by ``get_tensor``
        """
        # Note strange behavior with RuntimeError has been observed
        self.interpreter.set_tensor(self.in_ind, img)
        self.interpreter.invoke()
        return self.interpreter.get_tensor(self.out_ind)

    def get_output_shapes(self):
        """Return the full shape of the output tensor.

        The interpreter is not invoked here; only the shape of the tensor as
        currently allocated is read. The leading dimension is kept.
        """
        return self.interpreter.get_tensor(self.out_ind).shape
        
class BlazefaceNetLite(object):
    """Run a TFLite model and read two tensors, addressed by index, as outputs.

    The two tensors are returned in the order given by ``output_layer_inds``;
    the methods below call them ``out16`` and ``out8``.

    NOTE(review): the indices are raw interpreter tensor indices. If they point
    at intermediate tensors rather than declared model outputs, confirm that
    the installed TFLite runtime still holds their values after ``invoke()``.
    """

    def __init__(self, path, output_layer_inds=[114, 157]):
        """Load the model and allocate its tensors.

        Arguments:
            path {str} -- path to the ``.tflite`` model file

        Keyword Arguments:
            output_layer_inds {list} -- two tensor indices to read after each
                invocation (default: {[114, 157]})
        """
        self.interpreter = tf.lite.Interpreter(model_path=path)
        self.interpreter.allocate_tensors()
        # Ask the interpreter where its input lives instead of assuming that
        # the input is always tensor 0.
        self.in_ind = self.interpreter.get_input_details()[0]['index']
        # Copy so instances never share (and cannot corrupt) the default list.
        self.out_ind = list(output_layer_inds)

    def extract_from_image(self, img):
        """Feed ``img`` to the model, run it once and return both tensors.

        Arguments:
            img -- input batch handed unchanged to ``Interpreter.set_tensor``;
                it must already have the shape and dtype the model expects

        Returns:
            tuple -- ``(out16, out8)``: the values of tensors
                ``self.out_ind[0]`` and ``self.out_ind[1]``
        """
        # Note strange behavior with RuntimeError has been observed
        self.interpreter.set_tensor(self.in_ind, img)
        self.interpreter.invoke()
        first_ind, second_ind = self.out_ind[0], self.out_ind[1]
        out16 = self.interpreter.get_tensor(first_ind)
        out8 = self.interpreter.get_tensor(second_ind)

        return out16, out8

    def get_output_shapes(self):
        """Return the shapes of both output tensors without the leading dimension.

        The interpreter is not invoked here; only the shapes of the tensors as
        currently allocated are read.

        Returns:
            tuple -- ``(shape_of_first, shape_of_second)``
        """
        out_shape1 = self.interpreter.get_tensor(self.out_ind[0]).shape[1:]
        out_shape2 = self.interpreter.get_tensor(self.out_ind[1]).shape[1:]

        return out_shape1, out_shape2
    

class DeepFakeLoadExtractFeatures(object):
    """Load a real/fake video pair and turn both clips into per-frame features.

    Pipeline: ``get_frames`` (read matching frames from both videos) ->
    ``transform_vid`` (resize + normalise) -> ``extract_features`` (one TFLite
    forward pass per frame). ``transform_map`` wraps the whole pipeline in a
    ``tf.py_function`` so it can be used with ``tf.data.Dataset.map``.
    """

    def __init__(self, chan_means=[0.485*255, 0.456*255, 0.406*255],
                       chan_std_dev=[0.229*255, 0.224*255, 0.225*255],
                       resize_shape=(300,300),
                       seq_length=298,
                       feat_extractor_path='',
                       feat_extractor_output_layers=[114, 157],
                       mode="train"):
        """Store the preprocessing settings and build the feature extractor.

        Keyword Arguments:
            chan_means {list} -- per-channel values subtracted from every frame
                (default: {[0.485*255, 0.456*255, 0.406*255]})
            chan_std_dev {list} -- per-channel values every frame is divided by
                after the subtraction (default: {[0.229*255, 0.224*255, 0.225*255]})
            resize_shape {tuple} -- target size passed to ``tf.image.resize``
                (default: {(300,300)})
            seq_length {int} -- number of frames per clip in the returned
                feature tensors (default: {298})
            feat_extractor_path {str} -- path of the ``.tflite`` model handed to
                ``BlazefaceNetLite`` (default: {''})
            feat_extractor_output_layers {list} -- the two tensor indices the
                extractor reads (default: {[114, 157]})
            mode {str} -- stored on the instance; no method of this class reads
                it (default: {"train"})
        """

        self.chan_means = chan_means
        self.chan_std_dev = chan_std_dev
        self.resize_shape = resize_shape
        self.seq_length = seq_length
        self.mode = mode
        self.reader = VideoReader()
        # Despite the attribute name, the extractor is a BlazefaceNetLite.
        self.efficientnet_extractor = BlazefaceNetLite(path=feat_extractor_path,
                                      output_layer_inds=feat_extractor_output_layers)

        # (shape of first output, shape of second output), leading dim removed
        self.frame_feature_shapes = self.efficientnet_extractor.get_output_shapes()

    def _sample_frame_indices(self, frame_count):
        """Choose the frame indices of one clip inside a ``frame_count``-frame video.

        At most ``seq_length`` indices are returned. When the video is longer
        than ``seq_length`` the clip starts at a random offset; otherwise every
        frame ``0 .. frame_count - 1`` is returned.

        Arguments:
            frame_count {int} -- number of frames available in the video

        Returns:
            [np.ndarray] -- increasing integer frame indices, all below
                ``frame_count``

        Raises:
            ValueError -- if ``frame_count`` is not positive
        """
        if frame_count <= 0:
            raise ValueError("Cannot sample frames from a video with "
                             "{} frames".format(frame_count))

        num_frames = min(self.seq_length, frame_count)
        slack = frame_count - num_frames
        # randint(0) raises, so only draw an offset when there is room to move
        start = np.random.randint(slack) if slack > 0 else 0
        # linspace includes its end point; clamp it to the last valid frame
        stop = min(start + num_frames, frame_count - 1)
        # builtin int: the np.int alias no longer exists in current NumPy
        return np.linspace(start, stop, num=num_frames, dtype=int)

    def get_frames(self, fnames):
        """Read the same frame indices from the real and the fake video.

        Arguments:
            fnames {tf.Tensor} -- eager string tensor; element 0 is the path of
                the real video and element 1 the path of the fake video

        Returns:
            tuple -- ``(real_vid, fake_vid)``: the first value returned by
                ``VideoReader._read_frames_at_indices`` for each video
                (presumably a stacked frame array with frames on axis 0 --
                TODO confirm against ``video_loader``)

        Raises:
            ValueError -- if the fake video reports no frames
        """
        paths = fnames.numpy()
        real = paths[0].decode('utf-8')
        fake = paths[1].decode('utf-8')

        real_capture = cv2.VideoCapture(real)
        fake_capture = cv2.VideoCapture(fake)

        # Release both captures even when reading fails part-way through.
        try:
            # Counts should be equal between real and fakes
            frame_count = int(fake_capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                raise ValueError("No frames reported for video: {}".format(fake))

            # Base inds on same frame grab to use matching video frames
            frame_idxs = self._sample_frame_indices(frame_count)

            real_vid, _ = self.reader._read_frames_at_indices(real, real_capture, frame_idxs)
            fake_vid, _ = self.reader._read_frames_at_indices(fake, fake_capture, frame_idxs)
        finally:
            real_capture.release()
            fake_capture.release()

        return real_vid, fake_vid

    def normalize(self, video, chan_means, chan_std_dev):
        """Subtract ``chan_means`` from ``video`` and divide by ``chan_std_dev``.

        A new object is returned; the argument is never modified in place.

        Arguments:
            video {tf.Tensor} -- tensorflow reshaped video data
            chan_means {array} -- values subtracted from the video (broadcast)
            chan_std_dev {array} -- values the shifted video is divided by

        Returns:
            [tf.Tensor] -- normalized video data
        """
        return (video - chan_means) / chan_std_dev

    def transform_vid(self, filenames):
        """Read, resize and normalise the frames of a real/fake pair.

        Arguments:
            filenames {tf.Tensor} -- see ``get_frames``

        Returns:
            tuple -- ``(real_vid, fake_vid)`` after ``tf.image.resize`` to
                ``resize_shape`` and ``normalize``
        """
        real_vid, fake_vid = self.get_frames(filenames)

        real_vid = tf.image.resize(real_vid, size=self.resize_shape)
        fake_vid = tf.image.resize(fake_vid, size=self.resize_shape)
        real_vid = self.normalize(real_vid, self.chan_means, self.chan_std_dev)
        fake_vid = self.normalize(fake_vid, self.chan_means, self.chan_std_dev)

        return real_vid, fake_vid

    def _extract_pair_features(self, rvid, fvid):
        """Run the extractor on every frame of both clips and stack the results.

        Output buffers always hold ``seq_length`` frames. When a clip is
        shorter, the trailing frames stay zero.

        Returns:
            tuple -- ``(features16, features8)``; in each tensor row 0 holds
                the real clip and row 1 the fake clip
        """
        shape16, shape8 = self.frame_feature_shapes

        # zeros (not empty) so frames missing from a short clip are well defined
        real_output16 = np.zeros((self.seq_length, *shape16), dtype=np.float32)
        fake_output16 = np.zeros((self.seq_length, *shape16), dtype=np.float32)
        real_output8 = np.zeros((self.seq_length, *shape8), dtype=np.float32)
        fake_output8 = np.zeros((self.seq_length, *shape8), dtype=np.float32)

        # Never index past the frames that were actually read.
        n_frames = min(self.seq_length, int(rvid.shape[0]), int(fvid.shape[0]))
        # The extractor is fed one frame at a time as a batch of one.
        batch_shape = (1, *self.resize_shape, 3)
        extract = self.efficientnet_extractor.extract_from_image

        for i in range(n_frames):
            real_output16[i], real_output8[i] = extract(tf.reshape(rvid[i], batch_shape))
            fake_output16[i], fake_output8[i] = extract(tf.reshape(fvid[i], batch_shape))

        return (tf.stack((real_output16, fake_output16)),
                tf.stack((real_output8, fake_output8)))

    def extract_features(self, filenames):
        """Load a real/fake pair and return its stacked per-frame features.

        Arguments:
            filenames {tf.Tensor} -- see ``get_frames``

        Returns:
            tuple -- ``(features16, features8)`` as built by
                ``_extract_pair_features``
        """
        rvid, fvid = self.transform_vid(filenames)
        out = self._extract_pair_features(rvid, fvid)

        # Drop the decoded frames right away; they are the largest objects here.
        del rvid, fvid
        gc.collect()

        return out

    def transform_map(self, x):
        """``tf.data`` map function: filename pair -> (features16, features8, labels).

        Arguments:
            x {tf.Tensor} -- the ``[real, fake]`` filename pair

        Returns:
            tuple -- both feature tensors and a constant ``[[0.0], [1.0]]``
                label tensor (row 0 = real, row 1 = fake, matching the stack
                order of the features)
        """
        results16, results8 = tf.py_function(func=self.extract_features,
                                        inp=[x],
                                        Tout=[tf.float32, tf.float32])
        # py_function outputs carry no static shape; none is set here.
        labels = tf.constant([[0.0], [1.0]])
        return results16, results8, labels

class DeepFakeLoadExtractFeaturesV2(object):
    """Two-stage loader: decode a real/fake video pair, then extract features.

    Video decoding (``load_videos_map``) and feature extraction
    (``extract_feats_map``) are separate ``tf.data`` map functions, so they can
    be mapped with different settings. Frames are scaled to ``[-1, 1]`` rather
    than normalised with per-channel statistics.
    """

    def __init__(self, chan_means=[0.485*255, 0.456*255, 0.406*255],
                       chan_std_dev=[0.229*255, 0.224*255, 0.225*255],
                       resize_shape=(300,300),
                       seq_length=298,
                       feat_extractor_path='',
                       feat_extractor_output_layers=[114, 157],
                       mode="train"):
        """Store the preprocessing settings and build the feature extractor.

        Keyword Arguments:
            chan_means {list} -- stored and forwarded to ``normalize``, which
                ignores it (default: {[0.485*255, 0.456*255, 0.406*255]})
            chan_std_dev {list} -- stored and forwarded to ``normalize``, which
                ignores it (default: {[0.229*255, 0.224*255, 0.225*255]})
            resize_shape {tuple} -- target size passed to ``tf.image.resize``
                (default: {(300,300)})
            seq_length {int} -- number of frames per clip in the returned
                feature tensors (default: {298})
            feat_extractor_path {str} -- path of the ``.tflite`` model handed to
                ``BlazefaceNetLite`` (default: {''})
            feat_extractor_output_layers {list} -- the two tensor indices the
                extractor reads (default: {[114, 157]})
            mode {str} -- stored on the instance; no method of this class reads
                it (default: {"train"})
        """

        self.chan_means = chan_means
        self.chan_std_dev = chan_std_dev
        self.resize_shape = resize_shape
        self.seq_length = seq_length
        self.mode = mode
        self.reader = VideoReader()
        # Despite the attribute name, the extractor is a BlazefaceNetLite.
        self.efficientnet_extractor = BlazefaceNetLite(path=feat_extractor_path,
                                      output_layer_inds=feat_extractor_output_layers)

        # (shape of first output, shape of second output), leading dim removed
        self.frame_feature_shapes = self.efficientnet_extractor.get_output_shapes()

    def _sample_frame_indices(self, frame_count):
        """Choose the frame indices of one clip inside a ``frame_count``-frame video.

        At most ``seq_length`` indices are returned. When the video is longer
        than ``seq_length`` the clip starts at a random offset; otherwise every
        frame ``0 .. frame_count - 1`` is returned.

        Arguments:
            frame_count {int} -- number of frames available in the video

        Returns:
            [np.ndarray] -- increasing integer frame indices, all below
                ``frame_count``

        Raises:
            ValueError -- if ``frame_count`` is not positive
        """
        if frame_count <= 0:
            raise ValueError("Cannot sample frames from a video with "
                             "{} frames".format(frame_count))

        num_frames = min(self.seq_length, frame_count)
        slack = frame_count - num_frames
        # randint(0) raises, so only draw an offset when there is room to move
        start = np.random.randint(slack) if slack > 0 else 0
        # linspace includes its end point; clamp it to the last valid frame
        stop = min(start + num_frames, frame_count - 1)
        # builtin int: the np.int alias no longer exists in current NumPy
        return np.linspace(start, stop, num=num_frames, dtype=int)

    def get_frames(self, fnames):
        """Read one clip from the real video and one from the fake video.

        Both clips normally use the same frame indices. With probability 0.1
        the fake clip is read from a freshly drawn set of indices instead.

        Arguments:
            fnames {tf.Tensor} -- eager string tensor; element 0 is the path of
                the real video and element 1 the path of the fake video

        Returns:
            tuple -- ``(real_vid, fake_vid)``: the first value returned by
                ``VideoReader._read_frames_at_indices`` for each video
                (presumably a stacked frame array with frames on axis 0 --
                TODO confirm against ``video_loader``)

        Raises:
            ValueError -- if the fake video reports no frames
        """
        paths = fnames.numpy()
        real = paths[0].decode('utf-8')
        fake = paths[1].decode('utf-8')

        real_capture = cv2.VideoCapture(real)
        fake_capture = cv2.VideoCapture(fake)

        # Release both captures even when reading fails part-way through.
        try:
            # Counts should be equal between real and fakes
            frame_count = int(fake_capture.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                raise ValueError("No frames reported for video: {}".format(fake))

            # Base inds on same frame grab to use matching video frames
            frame_idxs = self._sample_frame_indices(frame_count)

            real_vid, _ = self.reader._read_frames_at_indices(real, real_capture, frame_idxs)

            if np.random.choice([0,1], p=[0.9, 0.1]) == 1:
                # Throw an easy differable example randomly
                frame_idxs = self._sample_frame_indices(frame_count)

            fake_vid, _ = self.reader._read_frames_at_indices(fake, fake_capture, frame_idxs)
        finally:
            real_capture.release()
            fake_capture.release()

        return real_vid, fake_vid

    def normalize(self, video, chan_means, chan_std_dev):
        """Scale ``video`` with ``video / 127.5 - 1.0``.

        ``chan_means`` and ``chan_std_dev`` are accepted only to keep the same
        signature as the per-channel variant; they are not used. A new object
        is returned; the argument is never modified in place.

        Arguments:
            video {tf.Tensor} -- tensorflow reshaped video data
            chan_means {array} -- unused
            chan_std_dev {array} -- unused

        Returns:
            [tf.Tensor] -- normalized video data
        """
        return video / 127.5 - 1.0

    def transform_vid(self, filenames):
        """Read, resize and normalise the frames of a real/fake pair.

        Arguments:
            filenames {tf.Tensor} -- see ``get_frames``

        Returns:
            tuple -- ``(real_vid, fake_vid)`` after ``tf.image.resize`` to
                ``resize_shape`` and ``normalize``
        """
        real_vid, fake_vid = self.get_frames(filenames)

        real_vid = tf.image.resize(real_vid, size=self.resize_shape)
        fake_vid = tf.image.resize(fake_vid, size=self.resize_shape)
        real_vid = self.normalize(real_vid, self.chan_means, self.chan_std_dev)
        fake_vid = self.normalize(fake_vid, self.chan_means, self.chan_std_dev)

        return real_vid, fake_vid

    def _extract_pair_features(self, rvid, fvid):
        """Run the extractor on every frame of both clips and stack the results.

        Output buffers always hold ``seq_length`` frames. When a clip is
        shorter, the trailing frames stay zero.

        Returns:
            tuple -- ``(features16, features8)``; in each tensor row 0 holds
                the real clip and row 1 the fake clip
        """
        shape16, shape8 = self.frame_feature_shapes

        # zeros (not empty) so frames missing from a short clip are well defined
        real_output16 = np.zeros((self.seq_length, *shape16), dtype=np.float32)
        fake_output16 = np.zeros((self.seq_length, *shape16), dtype=np.float32)
        real_output8 = np.zeros((self.seq_length, *shape8), dtype=np.float32)
        fake_output8 = np.zeros((self.seq_length, *shape8), dtype=np.float32)

        # Never index past the frames that were actually read.
        n_frames = min(self.seq_length, int(rvid.shape[0]), int(fvid.shape[0]))
        # The extractor is fed one frame at a time as a batch of one.
        batch_shape = (1, *self.resize_shape, 3)
        extract = self.efficientnet_extractor.extract_from_image

        for i in range(n_frames):
            real_output16[i], real_output8[i] = extract(tf.reshape(rvid[i], batch_shape))
            fake_output16[i], fake_output8[i] = extract(tf.reshape(fvid[i], batch_shape))

        return (tf.stack((real_output16, fake_output16)),
                tf.stack((real_output8, fake_output8)))

    def extract_features(self, videos):
        """Extract per-frame features from an already decoded real/fake pair.

        Arguments:
            videos {tf.Tensor} -- stacked pair as produced by
                ``load_videos_map``: index 0 is the real clip, index 1 the fake

        Returns:
            tuple -- ``(features16, features8)`` as built by
                ``_extract_pair_features``
        """
        rvid, fvid = videos[0], videos[1]
        out = self._extract_pair_features(rvid, fvid)

        del rvid, fvid
        gc.collect()

        return out

    def load_videos_map(self, x):
        """``tf.data`` map function: filename pair -> stacked ``(real, fake)`` frames.

        Loading map function for parallel loading call.

        Arguments:
            x {tf.Tensor} -- the ``[real, fake]`` filename pair

        Returns:
            [tf.Tensor] -- ``tf.stack((real_vid, fake_vid))``
        """
        real_vid, fake_vid = tf.py_function(func=self.transform_vid,
                                inp=[x],
                                Tout=[tf.float32, tf.float32])
        return tf.stack((real_vid, fake_vid))


    def extract_feats_map(self, x):
        """``tf.data`` map function: stacked frames -> (features16, features8, labels).

        Arguments:
            x {tf.Tensor} -- output of ``load_videos_map``

        Returns:
            tuple -- both feature tensors and a constant ``[[0.0], [1.0]]``
                label tensor (row 0 = real, row 1 = fake, matching the stack
                order of the features)
        """
        results16, results8 = tf.py_function(func=self.extract_features,
                                        inp=[x],
                                        Tout=[tf.float32, tf.float32])
        # py_function outputs carry no static shape; none is set here.
        labels = tf.constant([[0.0], [1.0]])
        return results16, results8, labels

In [50]:
# Smoke test for the V1 extractor on one hard-coded pair of local videos.
# get_frames treats element 0 as the "real" clip and element 1 as the "fake".
# NOTE(review): element 0 points into a FAKE/ directory -- fine for a shape
# check, but confirm the ordering before trusting any labels from this pair.
# NOTE(review): absolute /home/kevin paths make this cell machine-specific.
vids = tf.constant(['/home/kevin/data/deepfakes_data/source/train_val_sort/train/FAKE/gthvvygfcj.mp4', '/home/kevin/hdfhmlsrzn.mp4'])
# 30 frames per clip, each resized to 128x128 before feature extraction.
extractor = DeepFakeLoadExtractFeatures(
            feat_extractor_path='models/blazeface/face_detection_front.tflite',
            seq_length=30,
            resize_shape=(128,128))

In [51]:
# Decode, preprocess and run the extractor on the pair; `outs` is a 2-tuple of stacked feature tensors.
outs = extractor.extract_features(vids)

In [52]:
# Shape of the first feature tensor (axis 0 stacks real and fake).
outs[0].shape

TensorShape([2, 30, 16, 16, 88])

In [53]:
# Shape of the second feature tensor (axis 0 stacks real and fake).
outs[1].shape

TensorShape([2, 30, 8, 8, 96])

### Test full implementation

In [27]:
# Dataset params
# CSV that is read below for its 'real' and 'fake' columns.
# NOTE(review): absolute path -- machine-specific.
data_pairs_path = '/home/kevin/deepfake-proj/data/source/fake_to_real_maps_82k.csv'
# Frame size passed to the extractor as resize_shape.
resize_shape = (128,128)
# Frames per clip (the extractor's seq_length).
sequence_len = 64
# Buffer size given to Dataset.prefetch.
prefetch_num = 10
# Passed as test_size to train_test_split, i.e. the validation fraction.
train_val_split = 0.015
# num_parallel_calls for the video-loading map.
n_calls = 8

In [28]:
# Column order matters: the loaders read element 0 as the real video and
# element 1 as the fake one.
df_pairs = pd.read_csv(data_pairs_path)[['real', 'fake']]
# Fixed seed so train/validation membership is identical on every
# Restart & Run All.
# NOTE(review): rows are split independently; if several fakes share one real
# video, that real video can land in both sets -- confirm this is acceptable.
train_df, val_df = train_test_split(df_pairs, test_size=train_val_split, random_state=42)
print(len(train_df))
print(len(val_df))

80770
1230


In [29]:
# One dataset element per CSV row: a 2-element string tensor [real, fake].
train_ds = tf.data.Dataset.from_tensor_slices(train_df.to_numpy())
val_ds = tf.data.Dataset.from_tensor_slices(val_df.to_numpy())

In [30]:
# Rebinds `extractor` (previously the V1 smoke-test instance) to the two-stage V2 loader,
# configured from the dataset params cell.
extractor = DeepFakeLoadExtractFeaturesV2(
            feat_extractor_path='models/blazeface/face_detection_front.tflite',
            seq_length=sequence_len,
            resize_shape=resize_shape)

In [31]:
# Stage 1: decode + resize + normalise each pair, n_calls elements at a time.
# NOTE: this rebinds train_ds/val_ds, so re-running the cell stacks another map on top.
train_ds = train_ds.map(extractor.load_videos_map, num_parallel_calls=n_calls)
val_ds = val_ds.map(extractor.load_videos_map, num_parallel_calls=n_calls)

In [32]:
# Pull a single element through the loading stage and print it as a sanity check.
for item in train_ds.take(1):
    print(item)

tf.Tensor(
[[[[[ 4.56617594e-01  4.25245047e-01  3.62499952e-01]
    [ 4.99264717e-01  4.67892170e-01  4.05147076e-01]
    [ 5.49754858e-01  5.18382311e-01  4.39950943e-01]
    ...
    [ 8.27450991e-01  7.41176486e-01  6.23529434e-01]
    [ 7.80392170e-01  6.86274529e-01  5.92156887e-01]
    [ 7.56862760e-01  6.78431392e-01  5.84313750e-01]]

   [[ 4.33088183e-01  4.01715636e-01  3.54656816e-01]
    [ 4.58823562e-01  4.27451015e-01  3.80392194e-01]
    [ 4.90196109e-01  4.58823562e-01  3.96078467e-01]
    ...
    [ 7.74019599e-01  7.03431368e-01  5.85784316e-01]
    [ 7.33333349e-01  6.78431392e-01  5.52941203e-01]
    [ 7.33333349e-01  6.70588255e-01  5.68627477e-01]]

   [[ 4.15196061e-01  3.83823514e-01  3.36764693e-01]
    [ 4.35294151e-01  4.03921604e-01  3.41176510e-01]
    [ 4.58823562e-01  4.27451015e-01  3.64705920e-01]
    ...
    [ 7.41176486e-01  6.78431392e-01  5.92156887e-01]
    [ 7.33333349e-01  6.70588255e-01  5.84313750e-01]
    [ 6.86274529e-01  6.47058845e-01  5.529

In [33]:
# Stage 2: per-frame feature extraction, then prefetch. No num_parallel_calls is
# given here, so this map uses tf.data's default for that argument.
# NOTE: this rebinds train_ds/val_ds, so re-running the cell stacks another map on top.
train_ds = train_ds.map(extractor.extract_feats_map).prefetch(prefetch_num)
val_ds = val_ds.map(extractor.extract_feats_map).prefetch(prefetch_num)

In [None]:
# Pull a single element through the full pipeline and print it as a sanity check.
# `item` is left bound afterwards; the cells below inspect it.
for item in train_ds.take(1):
    print(item)

In [23]:
# Number of components in one dataset element (extract_feats_map returns two feature tensors plus labels).
len(item)

3

In [26]:
# Third component: the constant label tensor built in extract_feats_map.
item[2]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.],
       [1.]], dtype=float32)>