In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from mtcnn.mtcnn import MTCNN
import json
from video_loader import VideoReader, ExtractedFeatureLoader
import cv2
import matplotlib.pyplot as plt
import gc
%matplotlib inline


In [2]:
class EfficientNetLite(object):
    
    def __init__(self, path,  output_layer_ind=158):
        self.interpreter = tf.lite.Interpreter(model_path=path)
        self.interpreter.allocate_tensors()
        self.out_ind = output_layer_ind
    
    def extract_from_image(self, img):
        # Note strange behavior with RuntimeError has been observed
        
        self.interpreter.set_tensor(0, img)
        self.interpreter.invoke()
        output_data = self.interpreter.get_tensor(self.out_ind)
        
        return output_data
    
    def get_output_shapes(self):
        
        out_shapes = self.interpreter.get_tensor(self.out_ind).shape
        
        return out_shapes
        

    

class DeepFakeLoadExtractFeatures(object):
    def __init__(self, chan_means=[0.485*255, 0.456*255, 0.406*255],
                       chan_std_dev=[0.229*255, 0.224*255, 0.225*255],
                       resize_shape=(300,300),
                       seq_length=298,
                       feat_extractor_path='',
                       feat_extractor_output_layer=158,
                       mode="train"):
        """[summary]
        
        Keyword Arguments:
            chan_means {list} -- [description] (default: {[0.485, 0.456, 0.406]})
            chan_std_dev {list} -- [description] (default: {[0.229, 0.224, 0.225]})
            resize_shape {tuple} -- [description] (default: {(300,300)})
            seq_length {int} -- [description] (default: {298})
            mode {str} -- [description] (default: {"train"})
        """

        self.chan_means = chan_means
        self.chan_std_dev = chan_std_dev
        self.resize_shape = resize_shape
        self.seq_length = seq_length
        self.mode = mode
        self.reader = VideoReader()
        self.efficientnet_extractor = EfficientNetLite(path=feat_extractor_path, 
                                      output_layer_ind=feat_extractor_output_layer)
        
        self.frame_feature_shapes = self.efficientnet_extractor.get_output_shapes()[1:]
        
    def get_frames(self, fnames):

        num_frames = self.seq_length
        
        real = fnames.numpy()[0].decode('utf-8')
        fake = fnames.numpy()[1].decode('utf-8')
        
        real_capture = cv2.VideoCapture(real)
        fake_capture = cv2.VideoCapture(fake)
        
        
        # Counts should be equal between real and fakes
        frame_count = int(fake_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        
        # Useful if loading total video size
        if frame_count < num_frames:
            num_frames = frame_count
        
        # Base inds on same frame grab to use matching video frames
        start = np.random.randint(frame_count-num_frames)
        frame_idxs = np.linspace(start, start+num_frames, num=num_frames, dtype=np.int)
        
        real_vid, _ = self.reader._read_frames_at_indices(real, real_capture, frame_idxs)
        fake_vid, _ = self.reader._read_frames_at_indices(fake, fake_capture, frame_idxs)
        
        real_capture.release()
        fake_capture.release()
        
        return real_vid, fake_vid
    
    def normalize(self, video, chan_means, chan_std_dev):
        """[summary]

        Arguments:
            video {tf.Tensor} -- tensorflow reshaped video data
            chan_means {array} -- [description]
            chan_std_dev {array} -- [description]

        Returns:
            [tf.Tensor] -- normalized video data
        """
        
        video -= chan_means
        video /= chan_std_dev

        return video
    
    def transform_vid(self, filenames):

        
        chan_means = self.chan_means
        chan_std_dev = self.chan_std_dev
        resize_shape = self.resize_shape
        
        # For kaggle only
        # fname = parts[-1].numpy().decode('utf-8')
        # global filelog
        # filelog.append(fname)
        
        real_vid, fake_vid = self.get_frames(filenames)
        

        real_vid = tf.image.resize(real_vid, size=resize_shape)
        fake_vid = tf.image.resize(fake_vid, size=resize_shape)
        real_vid = self.normalize(real_vid, chan_means, chan_std_dev)
        fake_vid = self.normalize(fake_vid, chan_means, chan_std_dev)

        return real_vid, fake_vid
    
    def extract_features(self, filenames):
        
        rvid, fvid = self.transform_vid(filenames)
        
        real_output_seq = np.empty((self.seq_length, *self.frame_feature_shapes), dtype=np.float32)
        fake_output_seq = np.empty((self.seq_length, *self.frame_feature_shapes), dtype=np.float32)
        
        for i in range(self.seq_length):
            
            real_output_seq[i] = self.efficientnet_extractor.\
                                      extract_from_image(tf.reshape(rvid[i], 
                                                        (1, *self.resize_shape, 3)))
            fake_output_seq[i] = self.efficientnet_extractor.\
                                      extract_from_image(tf.reshape(fvid[i], 
                                                        (1, *self.resize_shape, 3)))
        del rvid, fvid
        gc.collect()
        return tf.stack((real_output_seq, fake_output_seq))
    
    def transform_map(self, x):
        result_tensor = tf.py_function(func=self.extract_features,
                                        inp=[x],
                                        Tout=[tf.float32])
        # Convention is that result_tensor[0] = real_vid, result_tensor[1] = fake_vid
        result_tensor[0].set_shape((2,None,None,None,None))
        labels = tf.constant([[0.0], [1.0]])
        return result_tensor[0], labels

In [3]:
vids = tf.constant(['/home/kevin/data/deepfakes_data/source/train_val_sort/train/FAKE/gthvvygfcj.mp4', '/home/kevin/hdfhmlsrzn.mp4'])
extractor = DeepFakeLoadExtractFeatures(
            feat_extractor_path='models/efficientnet-lite0/efficientnet-lite0-fp32.tflite',
            seq_length=30,
            resize_shape=(224,224))

In [4]:
outs = extractor.extract_features(vids)

In [5]:
outs

<tf.Tensor: shape=(2, 30, 7, 7, 320), dtype=float32, numpy=
array([[[[[ 2.68818378e+00, -2.57375896e-01, -6.07490063e-01, ...,
            1.49994075e-01,  2.79033661e-01, -1.07991886e+00],
          [ 3.81975555e+00,  4.01659071e-01,  6.44412041e-02, ...,
            1.80464840e+00,  5.81324577e-01, -5.42699814e-01],
          [-1.53665543e+00,  1.36009860e+00,  2.75563240e-01, ...,
            4.93231344e+00, -1.49635649e+00,  3.91575336e+00],
          ...,
          [-1.15486157e+00,  2.05128026e+00,  1.88769245e+00, ...,
            5.78962755e+00, -1.67881536e+00,  3.47742653e+00],
          [ 2.64917707e+00,  9.81077611e-01,  3.00055552e+00, ...,
            2.23114586e+00, -1.73668861e-01,  8.23930740e-01],
          [ 1.92368686e+00,  4.87299979e-01,  8.85223389e-01, ...,
           -1.29479110e-01,  2.58852005e-01, -1.10142994e+00]],

         [[ 3.67319632e+00, -5.15828669e-01, -7.52059460e-01, ...,
           -4.15484250e-01,  1.31238317e+00, -1.46930027e+00],
          [ 3

### Test full implementation

In [6]:
# Dataset params
data_pairs_path = '/home/kevin/data/deepfakes_data/source/labels/fake_to_real_mapping.csv'
resize_shape = (224,224)
sequence_len = 30
prefetch_num = 10
train_val_split = 0.015

In [7]:
df_pairs = pd.read_csv(data_pairs_path)[['real', 'fake']]
train_df, val_df = train_test_split(df_pairs, test_size = train_val_split)
print(len(train_df))
print(len(val_df))

97635
1487


In [8]:
train_ds = tf.data.Dataset.from_tensor_slices(train_df.to_numpy())
val_ds = tf.data.Dataset.from_tensor_slices(val_df.to_numpy())

In [9]:
extractor = DeepFakeLoadExtractFeatures(
            feat_extractor_path='models/efficientnet-lite0/efficientnet-lite0-fp32.tflite',
            seq_length=sequence_len,
            resize_shape=resize_shape)

In [10]:
train_ds = train_ds.map(lambda x: extractor.transform_map(x)).prefetch(prefetch_num)
val_ds = val_ds.map(lambda x: extractor.transform_map(x)).prefetch(prefetch_num)

In [11]:
i = 0
for vids, labels in train_ds:
    print(vids.shape)
    print(labels.shape)
    i += 1
    if i > 3:
        break

(2, 30, 7, 7, 320)
(2, 1)
(2, 30, 7, 7, 320)
(2, 1)
(2, 30, 7, 7, 320)
(2, 1)
(2, 30, 7, 7, 320)
(2, 1)


In [13]:
np.save('test.npy', vids.numpy())

In [15]:
np.load('test.npy', allow_pickle=True).shape

(2, 30, 7, 7, 320)

In [37]:
img = tf.random.uniform(shape=[1,224,224,3])

In [46]:
interpreter = tf.lite.Interpreter(model_path='/home/kevin/Downloads/efficientnet-lite0/efficientnet-lite0-fp32.tflite')

In [47]:
interpreter.resize_tensor_input(0, (224,224,3))
interpreter.allocate_tensors()

RuntimeError: tensorflow/lite/kernels/conv.cc:316 input->dims->size != 4 (3 != 4)Node number 0 (CONV_2D) failed to prepare.


In [40]:
%%time
for i in range(30):
    interpreter.set_tensor(0, img)
    interpreter.invoke()
    output_data = interpreter.get_tensor(158)

CPU times: user 1.19 s, sys: 28.5 ms, total: 1.21 s
Wall time: 511 ms


In [15]:
output_data.shape

(1, 7, 7, 320)

In [35]:
interpreter.set_tensor(0, img)
interpreter.invoke()
output_data = interpreter.get_tensor(158)

ValueError: Cannot set tensor: Tensor is unallocated. Try calling allocate_tensors() first

In [45]:
interpreter.get_tensor(158).shape

(1, 7, 7, 320)

In [43]:
type(output_data)

numpy.ndarray

In [48]:
tf.stack((output_data, output_data))

<tf.Tensor: shape=(2, 1, 7, 7, 320), dtype=float32, numpy=
array([[[[[-5.33798933e-01, -9.35195625e-01,  1.34539652e+00, ...,
            6.04043663e-01, -2.47729301e-01, -2.25424480e+00],
          [ 3.20929885e-01, -1.55747151e+00,  1.53465509e+00, ...,
            3.10086191e-01, -4.14801598e-01, -1.84390545e+00],
          [-4.34443951e-02, -1.43075681e+00,  1.67901564e+00, ...,
            5.48883736e-01, -6.98149681e-01, -1.54888439e+00],
          ...,
          [-9.57852602e-02, -1.22169435e-01,  1.86268711e+00, ...,
            1.49149954e-01, -8.21370602e-01, -1.67614269e+00],
          [ 4.31544662e-01,  3.04008782e-01,  1.36211109e+00, ...,
           -2.81420887e-01, -7.10749626e-01, -1.76082706e+00],
          [-4.79822516e-01,  1.04507983e-01,  1.23279238e+00, ...,
            2.66000926e-01, -6.13765240e-01, -1.72692585e+00]],

         [[ 2.53289342e-01, -2.17129970e+00,  2.02890873e+00, ...,
            2.93990552e-01, -4.61992264e-01, -1.99078655e+00],
          [ 4.

In [21]:
for r, f in df_pairs.to_numpy():
    print(r.split('/')[-1].split('.')[0])
    print(f.split('/')[-1].split('.')[0])
    break

lxeqbyddvt
gthvvygfcj


In [None]:
np.linspace(0, 300, )

In [6]:
reader = VideoReader()


In [32]:
path = '/home/kevin/data/deepfakes_data/source/train_val_sort/train/FAKE/gthvvygfcj.mp4'
capture = cv2.VideoCapture(path)
fc = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
print(fc)
idxs = np.linspace(0, 290, num=290, dtype='int', endpoint=False)
arr, _ = reader._read_frames_at_indices(path,capture, idxs)
capture.release()

300


In [33]:
arr.shape

(290, 1080, 1920, 3)

## Testing the extracted feature loader

In [2]:
import glob

In [3]:
feats = glob.glob('../data/intermediate/whole_videos_7x7x320/*.npy')

In [4]:
df_feats = pd.DataFrame()
df_feats['feats'] = np.array(feats)
train_df, val_df = train_test_split(df_feats, test_size = 0.015)

In [5]:
len(train_df)

97635

In [6]:
len(val_df)

1487

In [7]:
val_df.feats.to_list()

['../data/intermediate/whole_videos_7x7x320/ncmdnjgijn_fsmwlfevvp.npy',
 '../data/intermediate/whole_videos_7x7x320/fumtejbdie_sagfyyshga.npy',
 '../data/intermediate/whole_videos_7x7x320/cqxxumarvp_eiayatksud.npy',
 '../data/intermediate/whole_videos_7x7x320/lcuqagpkgi_ppvegigiuf.npy',
 '../data/intermediate/whole_videos_7x7x320/dadrotzwag_pfhhczoboq.npy',
 '../data/intermediate/whole_videos_7x7x320/vwlhtziexs_wmzvnzklzq.npy',
 '../data/intermediate/whole_videos_7x7x320/kmsfmffrrp_cuohqkgmse.npy',
 '../data/intermediate/whole_videos_7x7x320/kshlyriwdh_vltrakrzec.npy',
 '../data/intermediate/whole_videos_7x7x320/jbpxyyacyj_vvfakkchyo.npy',
 '../data/intermediate/whole_videos_7x7x320/qfalftifws_hvtwtohuor.npy',
 '../data/intermediate/whole_videos_7x7x320/boobpetndu_xlettggvts.npy',
 '../data/intermediate/whole_videos_7x7x320/nnfgkpbpgy_ckmqcjdxjz.npy',
 '../data/intermediate/whole_videos_7x7x320/hazsnizlii_mzamrxtqfa.npy',
 '../data/intermediate/whole_videos_7x7x320/rohwvmkvjg_yggapsxth

In [8]:
train_ds = tf.data.Dataset.from_tensor_slices(train_df.feats.to_list())
val_ds = tf.data.Dataset.from_tensor_slices(val_df.feats.to_list())
loader = ExtractedFeatureLoader()


In [9]:
train_ds = train_ds.map(lambda x: loader.tflow_map(x)).batch(2)
val_ds = val_ds.map(lambda x: loader.tflow_map(x)).batch(2)

In [10]:
# for fname in val_ds:
#     print(fname.numpy().decode('utf-8'))

In [11]:
for vid, label in val_ds:
    print(vid.shape)
    print(label)
    break

(2, 2, 32, 7, 7, 320)
tf.Tensor(
[[[0.]
  [1.]]

 [[0.]
  [1.]]], shape=(2, 2, 1), dtype=float32)


In [12]:
train_pair = tf.reshape(vid, shape=(2*2, *vid.shape[2:]))
label = tf.reshape(label, shape=(2*2, *label.shape[2:]))

In [13]:
train_pair.shape

TensorShape([4, 32, 7, 7, 320])

In [14]:
label.shape

TensorShape([4, 1])

In [15]:
label

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[0.],
       [1.],
       [0.],
       [1.]], dtype=float32)>