Tests that the shapes of the model outputs are what we expect.

In [1]:
import os
import time 

import torch
import numpy as np
import matplotlib.pyplot as plt
from torchsummary import summary

import cheapfake.contrib.models as models
import cheapfake.contrib.dataset as dataset
import cheapfake.contrib.transforms as transforms

%matplotlib notebook

In [2]:
# Configuration for this shape check.
random_seed = 41
sample_index = 41
# NOTE(review): absolute, machine-specific path — consider making this configurable.
metadata_path = "/home/shu/cheapfake/cheapfake/contrib/balanced_metadata_fs03.csv"

dfdataset = dataset.DeepFakeDataset(
    metadata_path=metadata_path,
    frame_transform=transforms.BatchRescale(4),
    sequential_audio=True,
    random_seed=random_seed,
    verbose=False,
)
# Index the dataset directly instead of calling the dunder method by hand.
frames, audio, audio_stft = dfdataset[sample_index]
# Keep only the first 75 frames of the sample.
frames = frames[:75]

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = models.CheapFake(device=device)

# Move the inputs to the selected device rather than hard-coding .cuda(),
# so the CPU fallback chosen above actually works on machines without CUDA.
frames_on_device = frames.float().to(device)
audio_on_device = audio.float().to(device)

# CUDA kernels are launched asynchronously; synchronize around the timed region
# so the measurement covers the actual computation and not just kernel launches.
if device.type == "cuda":
    torch.cuda.synchronize(device)
start_time = time.time()
landmarks, fan_embedding, lipnet_embedding = model(frames_on_device, audio_on_device)
if device.type == "cuda":
    torch.cuda.synchronize(device)
end_time = time.time()

print("Prediction step took {} seconds".format(end_time - start_time))

TypeError: forward() missing 1 required positional argument: 'audio'

In [3]:
# Report the shape of each embedding returned by the model, FAN first, then LipNet.
for embedding in (fan_embedding, lipnet_embedding):
    print(embedding.shape)

torch.Size([1, 256])
torch.Size([1, 256])


In [5]:
# Stack the two embeddings along dimension 0 and show the resulting shape.
stacked_embeddings = torch.cat([fan_embedding, lipnet_embedding], dim=0)
print(stacked_embeddings.shape)

torch.Size([2, 256])


We also need to create an embedding for the LipNet output.

In [None]:
class LipNetEncoder(torch.nn.Module):
    """Encode a 2D feature map into a fixed-size embedding vector.

    The network applies two (Conv2d -> BatchNorm2d -> MaxPool2d) stages,
    flattens the result, and projects it with a single linear layer followed
    by a ReLU.

    Parameters
    ----------
    in_channels : int, optional
        Number of channels of the input feature map. Defaults to 1.
    input_size : tuple of (int, int), optional
        Spatial size ``(height, width)`` of the input feature map. It is used
        to size the linear layer. The default ``(75, 512)`` yields the original
        hard-coded flattened size of ``25 * 18 * 128``.
        NOTE(review): confirm this matches the actual LipNet output size.
    embedding_dim : int, optional
        Size of the output embedding. Defaults to 256.

    Raises
    ------
    ValueError
        If ``input_size`` is too small to survive the two 2x2 poolings.
    """

    def __init__(self, in_channels=1, input_size=(75, 512), embedding_dim=256):
        super().__init__()

        # Each 2x2/stride-2 max-pool floors the spatial size by half; the
        # padded 3x3 convolutions leave the spatial size unchanged.
        height, width = input_size
        pooled_height = height // 2 // 2
        pooled_width = width // 2 // 2
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                "input_size {} is too small for two 2x2 max-pooling stages".format(
                    (height, width)
                )
            )

        self.conv1 = torch.nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding=1)
        self.batchnorm1 = torch.nn.BatchNorm2d(16)
        self.maxpool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = torch.nn.Conv2d(16, 25, kernel_size=3, stride=1, padding=1)
        self.batchnorm2 = torch.nn.BatchNorm2d(25)
        self.maxpool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        self.flatten = torch.nn.Flatten()
        self.fc1 = torch.nn.Linear(25 * pooled_height * pooled_width, embedding_dim)
        self.relu = torch.nn.ReLU(inplace=True)

    def forward(self, x):
        """Compute the embedding for a batch of feature maps.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape ``(batch, in_channels, height, width)`` whose spatial
            size must pool down to the size the linear layer was built for.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(batch, embedding_dim)``.
        """
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.batchnorm2(x)
        x = self.maxpool2(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)

        return x

In [None]:
# Build the encoder on the device selected earlier instead of hard-coding .cuda(),
# so this cell also runs when the notebook falls back to the CPU.
lipnet_encoder = LipNetEncoder().to(device)
# Add leading batch and channel axes so the 2D embedding becomes (1, 1, rows, cols).
# This is equivalent to indexing with [None, :, :, None] and then permuting the
# trailing axis to position 1.
reshaped_embedding = lipnet_embedding[None, None, :, :]
print(reshaped_embedding.shape)
output = lipnet_encoder(reshaped_embedding.float().to(device))
print(output.shape)

In [None]:
# Stack the FAN embedding and the encoded LipNet output along dimension 0
# and show the resulting shape.
fan_and_lipnet = torch.cat([fan_embedding, output], dim=0)
print(fan_and_lipnet.shape)

In [None]:
import sys
# Put the external checkout on the module search path (position 1) so the `mmid`
# imports below can resolve.
# NOTE(review): absolute, machine-specific path — verify it on your machine.
sys.path.insert(1, "/home/shu/i2ai")
# NOTE(review): VGGVox does not appear to be used in the visible cells of this notebook.
from mmid.audio_models.VGGVox import VGGVox
from mmid.audio_models.ResNetSE34L import ResNetSE34L

In [None]:
# Embed the audio STFT with ResNetSE34L. Every axis after the first is flattened
# into one before the tensor is passed to the model.
audio_model = ResNetSE34L()
flattened_stft = audio_stft.view(audio_stft.shape[0], -1)
audio_embeddings = audio_model(flattened_stft.float())
print(audio_embeddings.shape)

# Stack the FAN, encoded LipNet, and audio embeddings along dimension 0
# and show the resulting shape.
all_embeddings = torch.cat(
    [fan_embedding, output, audio_embeddings.to(device)],
    dim=0,
)
print(all_embeddings.shape)

Test the integration of the code above with the recently edited code in `models.py`.

In [None]:
model = models.CheapFake(device=device)

# NOTE(review): a traceback recorded earlier in this notebook reports that
# forward() requires an 'audio' argument — confirm that the current models.py
# accepts a frames-only call.
frames_on_device = frames.float().to(device)

# CUDA kernels are launched asynchronously; synchronize around the timed region
# so the measurement covers the actual computation and not just kernel launches.
if device.type == "cuda":
    torch.cuda.synchronize(device)
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
start_time = time.perf_counter()
fan_output, fan_embedding, lipnet_embedding = model(frames_on_device)
if device.type == "cuda":
    torch.cuda.synchronize(device)
end_time = time.perf_counter()

print(fan_output.shape)
print(fan_embedding.shape)
print(lipnet_embedding.shape)
print("Entire operation took {} seconds".format(end_time - start_time))