In [1]:
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

from sda.encoder_image import Encoder
from sda.img_generator import Generator
from sda.rnn_audio import RNN

from scipy import signal
from skimage import transform as tf
import numpy as np
from PIL import Image
import contextlib
import shutil
import skvideo.io as sio
import scipy.io.wavfile as wav
import ffmpeg
import face_alignment
from pydub import AudioSegment
from pydub.utils import mediainfo

import glob

# Set random seed for reproducibility
manualSeed = 999
#manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)

dev = torch.device("cuda:0")

Random Seed:  999


In [2]:
audio_filenames = glob.glob('/home/jarrod/dev/speech-driven-animation/data/*/*.wav')

In [3]:
audio_filenames

['/home/jarrod/dev/speech-driven-animation/data/s3/sbag9a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/sbba9a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/pbwc9a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/bgbn6n.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/brbm4n.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/lbid3a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/lbbe1a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/lwbs1a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/lbwe5a.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/lway9s.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/swbb9s.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/sgai6p.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/prap4p.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/sgiu9s.wav',
 '/home/jarrod/dev/speech-driven-animation/data/s3/pbwc7s.wav',
 '/home/jarrod/dev/speech-driven-animati

In [4]:
# custom weights initialization called on netG and netD
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

# Network

In [5]:
class getDataSample():
    def __init__(self, model_path="grid", gpu=-1):

        if model_path == "grid":
            model_path = "/home/jarrod/dev/speech-driven-animation/sda/data/grid.dat"
#         elif model_path == "timit":
#             model_path = os.path.split(__file__)[0] + "/data/timit.dat"
#         elif model_path == "crema":
#             model_path = os.path.split(__file__)[0] + "/data/crema.dat"

        if gpu < 0:
            self.device = torch.device("cpu")
            model_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
            self.fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, device="cpu", flip_input=False)
        else:
            self.device = torch.device("cuda:" + str(gpu))
            model_dict = torch.load(model_path, map_location=lambda storage, loc: storage.cuda(gpu))
            self.fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, device="cuda:" + str(gpu),
                                                   flip_input=False)

        self.stablePntsIDs = [33, 36, 39, 42, 45]
        self.mean_face = model_dict["mean_face"]
        self.img_size = model_dict["img_size"]
        self.audio_rate = model_dict["audio_rate"]
        self.video_rate = model_dict["video_rate"]
        self.audio_feat_len = model_dict['audio_feat_len']
        self.audio_feat_samples = model_dict['audio_feat_samples']
        self.id_enc_dim = model_dict['id_enc_dim']
        self.rnn_gen_dim = model_dict['rnn_gen_dim']
        self.aud_enc_dim = model_dict['aud_enc_dim']
        # I think this is the size of the noise vector
        self.aux_latent = model_dict['aux_latent']
        # sequential noise is a boolean value
        self.sequential_noise = model_dict['sequential_noise']
        self.conversion_dict = {'s16': np.int16, 's32': np.int32}
        
        # image preprocessing
        self.img_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((self.img_size[0], self.img_size[1])),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    def preprocess_img(self, img):
        src = self.fa.get_landmarks(img)[0][self.stablePntsIDs, :]
        dst = self.mean_face[self.stablePntsIDs, :]
        tform = tf.estimate_transform('similarity', src, dst)  # find the transformation matrix
        warped = tf.warp(img, inverse_map=tform.inverse, output_shape=self.img_size)  # wrap the frame image
        warped = warped * 255  # note output from wrap is double image (value range [0,1])
        warped = warped.astype('uint8')

        return warped

    def _cut_sequence_(self, seq, cutting_stride, pad_samples):
        pad_left = torch.zeros(pad_samples // 2, 1)
        pad_right = torch.zeros(pad_samples - pad_samples // 2, 1)

        seq = torch.cat((pad_left, seq), 0)
        seq = torch.cat((seq, pad_right), 0)

        stacked = seq.narrow(0, 0, self.audio_feat_samples).unsqueeze(0)
        iterations = (seq.size()[0] - self.audio_feat_samples) // cutting_stride + 1
        for i in range(1, iterations):
            stacked = torch.cat((stacked, seq.narrow(0, i * cutting_stride, self.audio_feat_samples).unsqueeze(0)))
        return stacked.to(self.device)

    def __call__(self, img, audio, fs=None, aligned=False):
        if isinstance(img, str):  # if we have a path then grab the image
            frm = Image.open(img)
            frm.thumbnail((400, 400))
            frame = np.array(frm)
        else:
            frame = img

        # handle aligning the face with the model's learned "mean face"
        # may also do some preprocessing
        if not aligned:
            frame = self.preprocess_img(frame)

        # if we have a path then grab the audio clip
        if isinstance(audio, str):  
            info = mediainfo(audio)
            fs = int(info['sample_rate'])
            audio = np.array(AudioSegment.from_file(audio, info['format_name']).set_channels(1).get_array_of_samples())

            if info['sample_fmt'] in self.conversion_dict:
                audio = audio.astype(self.conversion_dict[info['sample_fmt']])
            else:
                if max(audio) > np.iinfo(np.int16).max:
                    audio = audio.astype(np.int32)
                else:
                    audio = audio.astype(np.int16)

        if fs is None:
            raise AttributeError("Audio provided without specifying the rate. Specify rate or use audio file!")

        if audio.ndim > 1 and audio.shape[1] > 1:
            audio = audio[:, 0]

        max_value = np.iinfo(audio.dtype).max
        
        if fs != self.audio_rate:
            seq_length = audio.shape[0]
            speech = torch.from_numpy(
                signal.resample(audio, int(seq_length * self.audio_rate / float(fs))) / float(max_value)).float()
            speech = speech.view(-1, 1)
            
        else:
            audio = torch.from_numpy(audio / float(max_value)).float()
            speech = audio.view(-1, 1)

        # take the input image and preprocess it    
        frame = self.img_transform(frame).to(self.device)

        cutting_stride = int(self.audio_rate / float(self.video_rate))
        audio_seq_padding = self.audio_feat_samples - cutting_stride

        # Create new sequences of the audio windows
        audio_feat_seq = self._cut_sequence_(speech, cutting_stride, audio_seq_padding)
        frame = frame.unsqueeze(0)
        audio_feat_seq = audio_feat_seq.unsqueeze(0)
        audio_feat_seq_length = audio_feat_seq.size()[1]
    
        return speech, audio_feat_seq, audio_feat_seq_length, frame

In [8]:
class videoGenerator(nn.Module):
    def __init__(self, gpu=-1):
        
        super(videoGenerator, self).__init__()
            
        self.device = torch.device("cuda:" + str(gpu))    
        
        # size of noise vector
        self.aux_latent = 10
        self.sequential_noise = True
        self.img_size = (128,96)
        self.rnn_gen_dim = 256
        self.id_enc_dim = 128
        self.aud_enc_dim = 256
        self.audio_feat_len = 0.2
        self.audio_rate = 50000

        # audio encoder
        self.encoder = RNN(self.audio_feat_len, self.aud_enc_dim, self.rnn_gen_dim,
                           self.audio_rate, init_kernel=0.005, init_stride=0.001)
        
        

        # id_image encoder
        self.encoder_id = Encoder(self.id_enc_dim, self.img_size)
        skip_channels = list(self.encoder_id.channels)
        skip_channels.reverse()

        # generator
        self.generator = Generator(self.img_size, self.rnn_gen_dim, condition_size=self.id_enc_dim,
                                   num_gen_channels=self.encoder_id.channels[-1],
                                   skip_channels=skip_channels, aux_size=self.aux_latent,
                                   sequential_noise=self.sequential_noise)
        
    def _broadcast_elements_(self, batch, repeat_no):
        total_tensors = []
        for i in range(0, batch.size()[0]):
            total_tensors += [torch.stack(repeat_no * [batch[i]])]

        return torch.stack(total_tensors)    
    
    
    def forward(self, audio_feat_seq, audio_feat_seq_length, frame):
        
        print(frame.requires_grad)
        
        # create audio encoding from the RNN
        z = self.encoder(audio_feat_seq, [audio_feat_seq_length])  # Encoding for the motion
        
        
        print(z)
        
        
        # generate the noise input for the generator
        noise = torch.FloatTensor(1, audio_feat_seq_length, self.aux_latent).normal_(0, 0.33).to(self.device)
        
        # create encoding from image
        z_id, skips = self.encoder_id(frame, retain_intermediate=True)
        
        # the decoder is abstracted from the encoder, so pass the intermediate outputs of each
        # conv layer (the skips) to the generator (which is the decoder)
        skip_connections = []
        for skip_variable in skips:
            skip_connections.append(self._broadcast_elements_(skip_variable, z.size()[1]))
        skip_connections.reverse()

        z_id = self._broadcast_elements_(z_id, z.size()[1])
        gen_video = self.generator(z, c=z_id, aux=noise, skip=skip_connections)
        
        return gen_video

In [9]:
class frameDiscriminator(nn.Module):
    def __init__(self):
        super(frameDiscriminator, self).__init__()
        # number of channels = 3 id_frame, and 3 target frame
        self.nc = 3
        self.ndf = 64
        
        self.main = nn.Sequential(
            # input is (nc) x 96 x 128
            nn.Conv2d(self.nc, self.ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 48 x 64
            nn.Conv2d(self.ndf, self.ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(self.ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
           
            # state size. (ndf*2) x 24 x 32
            nn.Conv2d(self.ndf * 2, self.ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(self.ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 12 x 16
            nn.Conv2d(self.ndf * 4, self.ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(self.ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 6 x 8
            nn.Conv2d(self.ndf * 8, self.ndf * 8, 4, 1, 0, bias=False),
        )
        self.fc1 = nn.Linear(self.ndf*8*5*3, 1)
        self.output = nn.Sigmoid()

    def forward(self, x):
        x = self.main(x)
#         print(x.shape)
        x = x.view(-1, self.ndf*8*5*3)
        x = self.fc1(x)
        
        return self.output(x) 

In [10]:
va = getDataSample(gpu=0, model_path="grid")

In [11]:
aud, a1, a2, frame = va("example/male_face2.jpg", "example/hello_world.wav")

In [13]:
del va

NameError: name 'va' is not defined

In [11]:
frameD = frameDiscriminator().to(dev)
frameD.apply(weights_init)

frameDiscriminator(
  (main): Sequential(
    (0): Conv2d(3, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv2d(256, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (9): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.2, inplace=True)
    (11): Conv2d(512, 512, kernel_size=(4, 4), stride=(1, 1), bias=False)
  )
  (fc1): Linear(in_features=7680, out_feature

In [12]:
ntest = videoGenerator(gpu=0).to(dev)
ntest.train()

videoGenerator(
  (encoder): RNN(
    (encoder): Encoder(
      (cl): ModuleList(
        (0): Conv1d(1, 16, kernel_size=(250,), stride=(50,), padding=(100,))
        (1): Conv1d(16, 32, kernel_size=(4,), stride=(2,), padding=(1,))
        (2): Conv1d(32, 64, kernel_size=(4,), stride=(2,), padding=(1,))
        (3): Conv1d(64, 128, kernel_size=(4,), stride=(2,), padding=(1,))
        (4): Conv1d(128, 256, kernel_size=(10,), stride=(5,), padding=(3,))
        (5): Conv1d(256, 256, kernel_size=(5,), stride=(1,))
      )
      (activations): ModuleList(
        (0): Sequential(
          (0): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): ReLU(inplace=True)
        )
        (1): Sequential(
          (0): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): ReLU(inplace=True)
        )
        (2): Sequential(
          (0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=T

In [13]:
criterion = nn.BCELoss()
a1.shape

torch.Size([1, 71, 10000, 1])

In [14]:
a1 = a1.to(dev)
a1.requires_grad = True
frame = frame.to(dev)
frame.requires_grad = True
label = torch.full((71,1), 1, device=dev)

In [15]:
output = ntest(a1, a2, frame)
d_out = frameD(output)
err = criterion(d_out, label)

True
encoder11  True
encoder22  False


NameError: name 'x1' is not defined

In [None]:
output

In [14]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        

    def forward(self, x, lengths):
        seq_length = x.size()[1]
        print("encoder11 ", x.requires_grad)
        x = x.reshape(-1, 1, 10)
        print("encoder22 ", x.requires_grad)
        
        return x

In [15]:
# audio encoder
# size of noise vector
aux_latent = 10
sequential_noise = True
img_size = (128,96)
rnn_gen_dim = 256
id_enc_dim = 128
aud_enc_dim = 256
audio_feat_len = 0.2
audio_rate = 50000
# encoder = RNN(audio_feat_len, aud_enc_dim, rnn_gen_dim, audio_rate, init_kernel=0.005, init_stride=0.001)
        
encoder = RNN()

In [16]:
encoder = encoder.to(dev)

In [32]:
X = torch.ones((1,71,10000,1))
X.requires_grad = True

In [34]:
a1.requires_grad = True

In [38]:
a3 = torch.Tensor([a2])
a3.requires_grad = True
z = encoder(a1, a3)

encoder11  True
encoder22  False


In [31]:
X.dtype

torch.float32

In [36]:
a1 *= 0

In [39]:
dir(a1)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__cuda_array_interface__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ilshift__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rfloordiv__',
 '__rmul__',
 '__rpow__',
 '__rshift__',
 '__rsub__',
 '__rtrued

In [56]:
err.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn