In [1]:
import os, sys, time
import cv2
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [1]:
# Directory holding the clips to score, relative to the notebook's working directory.
test_dir = "./input/deepfake-detection-challenge/own_test/"

# Sorted names of every entry in test_dir whose name ends in ".mp4" (case-sensitive).
test_videos = sorted(name for name in os.listdir(test_dir) if name.endswith(".mp4"))

# NOTE(review): frame_h / frame_l only appear in a commented-out expression in the
# visible part of this notebook -- confirm whether they are still needed.
frame_h, frame_l = 5, 5

# Display how many videos were found.
len(test_videos)

7

In [2]:
test_videos

['_a_charan.mp4',
 '_deepest_fake.mp4',
 '_result-charan1.mp4',
 '_vig.mp4',
 'all.mp4',
 'mohit.mp4',
 'panchal.mp4']

In [5]:
# Report the library versions this notebook is running against.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")

PyTorch version: 1.6.0
CUDA version: 10.1
cuDNN version: 7604


In [6]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")
print(gpu)

device(type='cuda', index=0)

In [7]:
import sys

# Prepend both extra module directories to the import search path in one step.
# The resulting order matches two successive insert(0, ...) calls: the
# deepfakes-inference-demo directory ends up ahead of the blazeface-pytorch one.
# Presumably these are where the later `blazeface` / `helpers` imports resolve from.
sys.path[:0] = ["/input/deepfakes-inference-demo", "/input/blazeface-pytorch"]

In [8]:
from blazeface import BlazeFace
# Build the BlazeFace face detector on the selected device, then load its weights
# and anchors from the blazeface-pytorch input directory.
# os.path.join keeps the paths portable: a backslash-separated literal only
# resolves on Windows, where the joined result is the exact same string.
facedet = BlazeFace().to(gpu)
facedet.load_weights(os.path.join("input", "blazeface-pytorch", "blazeface.pth"))
facedet.load_anchors(os.path.join("input", "blazeface-pytorch", "anchors.npy"))
# train(False) takes the detector out of training mode; binding the return value
# to `_` keeps it from being echoed as the cell output.
_ = facedet.train(False)

In [12]:
from helpers.read_video_1 import VideoReader
from helpers.face_extract_1 import FaceExtractor

# Number of frames requested from each video; predict_on_video_set also passes it
# as the inference batch size. Hard-coded; the frame_h * frame_l product is not used.
frames_per_video = 81 #frame_h * frame_l
video_reader = VideoReader()


def video_read_fn(x):
    """Read `frames_per_video` frames from the video at path `x`."""
    return video_reader.read_frames(x, num_frames=frames_per_video)


# The extractor gets the frame-reading callback and the face detector.
face_extractor = FaceExtractor(video_read_fn, facedet)

In [14]:
# Side length, in pixels, of the square face crops fed to the classifier.
# predict_on_video uses it both as the resize target and to size its batch buffer.
input_size = 480

In [15]:
from torchvision.transforms import Normalize

# Per-channel mean / standard deviation applied to inputs already scaled to [0, 1].
# These are the ImageNet statistics conventionally used with torchvision models.
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
normalize_transform = Normalize(mean=mean, std=std)

In [16]:
def isotropically_resize_image(img, size, resample=cv2.INTER_AREA):
    """Resize `img` so that its longer side equals `size`, keeping the aspect ratio.

    Args:
        img: image array whose first two dimensions are (height, width).
        size: target length, in pixels, of the longer side.
        resample: OpenCV interpolation flag forwarded to cv2.resize.

    Returns:
        The resized image produced by cv2.resize. The shorter side is scaled
        proportionally (rounded down) but never below one pixel.
    """
    h, w = img.shape[:2]
    if w > h:
        new_w = size
        # Floor division can reach 0 for extreme aspect ratios, and cv2.resize
        # rejects a zero-sized target, so keep at least one pixel.
        new_h = max(1, h * size // w)
    else:
        new_h = size
        new_w = max(1, w * size // h)

    # cv2.resize takes the target size as (width, height).
    return cv2.resize(img, (new_w, new_h), interpolation=resample)


def make_square_image(img):
    """Pad `img` with zeros along the bottom and right edges until it is square.

    The original pixels stay anchored at the top-left corner; the side of the
    result equals the larger of the input's height and width.
    """
    height, width = img.shape[:2]
    side = max(height, width)
    pad_bottom = side - height
    pad_right = side - width
    # Border arguments are (top, bottom, left, right): nothing is added on the
    # top or left.
    return cv2.copyMakeBorder(
        img, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0
    )

In [17]:
import torch.nn as nn
import torchvision.models as models

class MyResNeXt(models.resnet.ResNet):
    """torchvision ResNet built from grouped Bottleneck blocks with a one-output head.

    The block type, layer counts, group count and width per group are the same
    settings torchvision uses for its resnext50_32x4d model. The final fully
    connected layer is replaced by a 2048 -> 1 linear layer.
    """

    def __init__(self, training=True):
        # `training` is accepted for call compatibility but is not used here.
        backbone_config = dict(
            block=models.resnet.Bottleneck,
            layers=[3, 4, 6, 3],
            groups=32,
            width_per_group=4,
        )
        super().__init__(**backbone_config)
        # Swap the parent's classification layer for a single-output layer.
        self.fc = nn.Linear(in_features=2048, out_features=1)

In [18]:
# Load the classifier weights onto the selected device and put the model in eval mode.
# NOTE: torch.load unpickles the file, which can execute arbitrary code -- only load
# checkpoints from a trusted source.
# os.path.join keeps the path portable: a backslash-separated literal only resolves
# on Windows, where the joined result is the exact same string.
# NOTE(review): the directory is spelled "deepfakes-inferece-demo" here but
# "deepfakes-inference-demo" in the sys.path cell -- confirm which one matches the disk.
checkpoint = torch.load(
    os.path.join("input", "deepfakes-inferece-demo", "resnext.pth"),
    map_location=gpu,
)

model = MyResNeXt().to(gpu)
model.load_state_dict(checkpoint)
_ = model.eval()

# The weights now live in the model; drop the loaded state dict.
del checkpoint

In [19]:
def predict_on_video(video_path, batch_size):
    """Score one video by averaging the model's sigmoid output over its faces.

    Args:
        video_path: path of the video, handed to face_extractor.process_video.
        batch_size: fixed number of slots in the inference batch. At most this
            many faces are scored; unused slots are left as all-zero images and
            are excluded from the average.

    Returns:
        The mean sigmoid output over the scored faces as a float (presumably the
        probability that the video is fake -- confirm against the training
        labels). Returns 0.5 when no face is found or when any step raises;
        errors are printed rather than propagated.
    """
    try:
        # Find the faces for N frames in the video.
        faces = face_extractor.process_video(video_path)

        # Only look at one face per frame.
        face_extractor.keep_only_best_face(faces)

        if len(faces) > 0:
            # NOTE: When running on the CPU, the batch size must be fixed
            # or else memory usage will blow up. (Bug in PyTorch?)
            x = np.zeros((batch_size, input_size, input_size, 3), dtype=np.uint8)

            n = 0            # faces copied into the batch
            total_faces = 0  # faces seen, including those that did not fit
            for frame_data in faces:
                for face in frame_data["faces"]:
                    total_faces += 1
                    if n >= batch_size:
                        # Batch is full: count the face but skip the resize work.
                        continue

                    # Resize to the model's required input size.
                    # We keep the aspect ratio intact and add zero
                    # padding if necessary.
                    resized_face = isotropically_resize_image(face, input_size)
                    resized_face = make_square_image(resized_face)
                    x[n] = resized_face
                    n += 1

                    # Test time augmentation (horizontal flips) is not applied.
                    # TODO: not sure yet if this helps or not

            if total_faces > batch_size:
                # Report the real number of faces, once, instead of once per
                # surplus face.
                print("WARNING: have %d faces but batch size is %d" % (total_faces, batch_size))

            if n > 0:
                x = torch.tensor(x, device=gpu).float()

                # Preprocess the images: NHWC -> NCHW, scale to [0, 1], normalize.
                x = x.permute((0, 3, 1, 2))

                # Normalized one image at a time (kept as a per-image call).
                for i in range(len(x)):
                    x[i] = normalize_transform(x[i] / 255.)

                # Make a prediction, then take the average.
                with torch.no_grad():
                    y_pred = model(x)
                    # Flatten with reshape(-1) rather than squeeze(): squeeze()
                    # collapses a batch of one to a 0-dim tensor, which cannot
                    # be sliced and made every batch_size=1 call fall through
                    # to the 0.5 fallback.
                    y_pred = torch.sigmoid(y_pred).reshape(-1)
                    # Average only the real faces, not the zero-filled slots.
                    return y_pred[:n].mean().item()

    except Exception as e:
        # Deliberate best-effort: one bad video must not abort the whole run.
        print("Prediction error on video %s: %s" % (video_path, str(e)))

    return 0.5

In [21]:
from concurrent.futures import ThreadPoolExecutor

def predict_on_video_set(videos, num_workers):
    """Run predict_on_video on every file in `videos` using a thread pool.

    Args:
        videos: sequence of file names, each joined onto `test_dir`.
        num_workers: maximum number of worker threads.

    Returns:
        A list with one prediction per video, in the same order as `videos`.
    """
    def score(filename):
        full_path = os.path.join(test_dir, filename)
        return predict_on_video(full_path, batch_size=frames_per_video)

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        # Executor.map yields results in input order, whatever order they finish in.
        return list(pool.map(score, videos))

In [22]:
# Gates the timing run in the `if speed_test:` cell; set to False to skip it.
speed_test = True  # toggled by hand

In [23]:
# train_videos

In [24]:
if speed_test:
    # Time inference on (at most) the first five test videos.
    start_time = time.time()
    speedtest_videos = test_videos[:5]
    predictions = predict_on_video_set(speedtest_videos, num_workers=4)
    elapsed = time.time() - start_time
    if speedtest_videos:
        print("Elapsed %f sec. Average per video: %f sec." % (elapsed, elapsed / len(speedtest_videos)))
    else:
        # No videos were found: skip the per-video average, which would
        # otherwise divide by zero.
        print("Elapsed %f sec. No videos to average over." % elapsed)

Elapsed 41.435339 sec. Average per video: 8.287068 sec.


In [25]:
# y_pred = predict_on_img(r"E:\My Documents\charan.jpg", batch_size=1)

In [26]:
speedtest_videos    

['_a_charan.mp4',
 '_deepest_fake.mp4',
 '_result-charan1.mp4',
 '_vig.mp4',
 'all.mp4']

In [27]:
predictions

[0.533103883266449,
 0.4669482111930847,
 0.6251113414764404,
 0.05502110347151756,
 0.4842233657836914]

In [28]:
test_videos

['_a_charan.mp4',
 '_deepest_fake.mp4',
 '_result-charan1.mp4',
 '_vig.mp4',
 'all.mp4',
 'mohit.mp4',
 'panchal.mp4']

In [29]:
# Score every test video with four worker threads. This rebinds `predictions`,
# replacing any value left by the speed-test cell.
predictions = predict_on_video_set(test_videos, num_workers=4)

In [30]:
predictions

[0.533103883266449,
 0.4669482111930847,
 0.6251113414764404,
 0.05502110347151756,
 0.4842233657836914,
 0.6406271457672119,
 0.47697553038597107]