# Inputs

In [1]:
video_fn = 'data/bibi_cabinet_meetings_indexed_trimed/0001.mp4'
video_frames_per_seconds = 20
audio_fn = 'audio_features/wav_files/0001.mp4.wav'
output_fn = '0001_res.mp4'

In [2]:
EVERY_N_SECONDS = 1.0/video_frames_per_seconds
temp_video_folder = 'temp_video_folder'

# Parameters (Do Not Change! (unless you change the models))

In [30]:
audio_frames_per_seconds = 200
bibi_inpainting_model = './video_model.pt'
audio_to_lips_model = './audio_model.h5'
face_predictor_path = './shape_predictor_68_face_landmarks.dat'

In [4]:
from tqdm import tqdm
import scipy.io.wavfile as wav
import numpy as np
from python_speech_features import logfbank, mfcc
import cv2
import dlib
import pickle as pkl
from imutils import face_utils
import moviepy.editor as mpe
from skimage.transform import resize
from tqdm import tqdm
import keras
from keras.models import Model, Sequential
from keras.layers import Input,Dense,LSTM,Dropout,LeakyReLU,Activation
from keras.layers import TimeDistributed,Flatten,Conv2D,Reshape,concatenate,Lambda
import sklearn
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import os
import scipy.misc
import shutil

Using TensorFlow backend.


# Data Load

In [7]:
def load_data(input_video_fn, input_audio_fn):
    assert('wav' in input_audio_fn)
    rate, sig = wav.read(input_audio_fn)
    mfccFeat = mfcc(sig, rate)
    fbankFeat = logfbank(sig, rate)
    audio_features = np.concatenate((mfccFeat, fbankFeat), axis=1)
    
    detector, predictor = dlib.get_frontal_face_detector(), dlib.shape_predictor(face_predictor_path)
    video_path = input_video_fn
    video = mpe.VideoFileClip(video_path)
    video_duration = video.duration

    res, res_cut_params, res_masked, res_ff = [], [], [], []
    frame = video.get_frame(0)
    frame_width = frame.shape[1]
    frame_height = frame.shape[0]

    def get_face_features_from_frame(frame,last_frame_rects):
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        rects = detector(frame, 0)

        if rects is None or len(rects)!=1:
            rects = last_frame_rects
        if len(rects) != 1:
            raise Exception("WHOA??! More than one face in the image!")

        dlib_features = predictor(gray, rects[0])
        return face_utils.shape_to_np(dlib_features),rects
    
    last_frame_rects = dlib.rectangles()
    last_frame_rects.append(
        dlib.rectangle(int(frame_width/4),int(frame_height/4),int(frame_width/4),int(frame_height/4)))
    
    with tqdm(np.arange(0, video_duration, EVERY_N_SECONDS), desc='processing_frames') as _tqdm:
        for frame_time in _tqdm:
            frame = video.get_frame(frame_time)
            face_features,last_frame_rects = get_face_features_from_frame(frame,last_frame_rects)
            frame = np.array(frame, dtype=np.float32)/256
            
            size_i = face_features[:,0].max() - face_features[:,0].min()
            size_j = size_i
            mean_j, mean_i = int(np.median(face_features[48:,0])), int(np.median(face_features[48:,1]))
            size_i_3 = int(size_i/3)
            size_j_2 = int(size_j/2)
            frame_cut = [mean_i-3*size_i_3,mean_i+size_i_3,mean_j-size_j_2,mean_j+size_j_2]
            frame_masked = frame[frame_cut[0]:frame_cut[1],frame_cut[2]:frame_cut[3]]
            frame_masked = resize(frame_masked, (100,74), anti_aliasing=True).astype(np.float32)
            frame_masked[67:87, 16:-16] = -1
            
            face_features = face_features.astype(np.float32)
            face_features[:,0] -= face_features[:,0].min()
            face_features[:,1] -= face_features[:,1].min()
            face_features[:,0] /= face_features[:,0].max()
            face_features[:,1] /= face_features[:,1].max()

            res.append(np.array(frame))
            res_cut_params.append(frame_cut)
            res_masked.append(frame_masked)
            res_ff.append(face_features)
        
    res, res_cut_params, res_ff = np.array(res), np.array(res_cut_params), np.array(res_ff)
    res_masked = np.array(res_masked)
    return res, res_cut_params, res_masked, res_ff, audio_features

In [8]:
frames, frames_cut_params, frames_masked, frames_face_features, audio_features = load_data(video_fn, audio_fn)

W0710 22:34:50.599924 140279326533376 sigproc.py:82] frame length (1103) is greater than FFT size (512), frame will be truncated. Increase NFFT to avoid.
W0710 22:34:52.741321 140279326533376 sigproc.py:82] frame length (1103) is greater than FFT size (512), frame will be truncated. Increase NFFT to avoid.
processing_frames: 100%|██████████| 200/200 [00:34<00:00,  6.04it/s]


In [9]:
duration = min(frames.shape[0]/video_frames_per_seconds, 
               audio_features.shape[0]/audio_frames_per_seconds)

# Audio Model

In [10]:
def create_combined_model():
    face_input =Input(shape=(48*2,),          name='face_input',  dtype='float32')
    audio_input=Input(shape=(100, 39,), name='audio_input', dtype='float32')
    
    # face layers:
    face=Dense(32, activation='sigmoid',               name='face_dense_1')     (face_input)    
    face=Dropout(0.4,                                  name='face_dropout_1')   (face)
    face=Dense(5,                                      name='face_dense_2')     (face)
    
    face_only_output = Dense(40, activation='sigmoid', name='face_only_output') (face)
        
    # audio layer:
    audio = TimeDistributed(
        Dense(30, activation='sigmoid',    name='audio_TD_1'))      (audio_input)
    audio = Dropout(0.2,                   name='audio_dropout_1') (audio)
    audio = LSTM(25,return_sequences=True, name='audio_lstm_1')    (audio)
    audio = Reshape((1,100,25),      name='audio_reshape_1') (audio)
    audio = Dropout(0.4,                   name='audio_dropout_2') (audio)
    audio = Conv2D(20, kernel_size=(25,25), strides=5, activation='sigmoid',
       data_format='channels_first',       name='audio_conv2d_1')  (audio)
    audio = Flatten(                       name='audio_flatten_1') (audio)
    audio = Dropout(0.4,                   name='audio_dropout_3') (audio)
    audio = Dense(5, activation='sigmoid', name='audio_dense_1')   (audio)
    
    audio_boosting = Dense(40, 
        activation='sigmoid',              name='audio_boosting')   (audio)
    audio_boosting = Lambda(lambda x:x*2-1,name='audio_boosting_lambda')(
                                                                    audio_boosting)
    audio_boosting_output=keras.layers.Add(name='audio_boosting_output')(
                                                                    [face_only_output, audio_boosting])
        
    #concatenate_layer:
    conc = concatenate([audio, face],      name='concatenation_1')
    conc = Dense(20, activation='sigmoid', name='conc_dense_1') (conc)
    
    conc_output = Dense(40,
        activation='sigmoid',              name='conc_output')  (conc)

    model_combined  = Model(inputs=[audio_input, face_input],
                            outputs=[face_only_output, audio_boosting_output, conc_output])
    return model_combined

audio_model = create_combined_model()
audio_model.load_weights(audio_to_lips_model)

W0710 22:35:30.517032 140279326533376 deprecation_wrapper.py:119] From /home/jupyter/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0710 22:35:30.539246 140279326533376 deprecation_wrapper.py:119] From /home/jupyter/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0710 22:35:30.554142 140279326533376 deprecation_wrapper.py:119] From /home/jupyter/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:131: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0710 22:35:30.554769 140279326533376 deprecation_wrapper.py:119] From /home/jupyter/.local/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with

In [11]:
def loadpkl(fn):
    with open(fn,'rb') as f:
        return pkl.load(f)
scalerX=       loadpkl('scalerX')
scalerX_extra= loadpkl('scalerX_extra')
scalery=       loadpkl('scalery')

In [12]:
assert(duration>0.5)

In [13]:
def get_audio_features_for_prediction():
    len_X = int(duration * video_frames_per_seconds)
    audio_features_X = []
    audio_features_FF = np.array(frames_face_features[:len_X])
    for i in range(len_X):
        i_mid = int(i * audio_frames_per_seconds / video_frames_per_seconds)
        i_start = i_mid - 50
        i_end = i_mid + 50
        if i_start < 0:
            i_end -= i_start
            i_start = 0
        elif i_end > audio_features.shape[0]:
            dif = i_end - audio_features.shape[0]
            i_start -= dif
            i_end -= dif
        audio_features_X.append(audio_features[i_start:i_end])
    return np.array(audio_features_X), audio_features_FF[:,:48], audio_features_FF[:, 48:]

audio_features_X, audio_features_X_FF, audio_features_y = get_audio_features_for_prediction()

In [14]:
audio_features_X = scalerX.transform(audio_features_X.reshape(audio_features_X.shape[0],-1)).reshape(-1,100,39)
audio_features_X_FF = scalerX_extra.transform(audio_features_X_FF.reshape((audio_features_X_FF.shape[0],-1)))

# Predict

In [15]:
lips = audio_model.predict({'audio_input': audio_features_X, 'face_input': audio_features_X_FF})[2]

In [16]:
lips = scalery.inverse_transform(lips)
lips = lips.reshape(-1,20,2)
lips[:,:,0] += audio_features_y[:,:,0].mean(axis=1, keepdims=True)
lips[:,:,1] += audio_features_y[:,:,1].mean(axis=1, keepdims=True)

In [17]:
audio_features_X_FF = scalerX_extra.inverse_transform(audio_features_X_FF).reshape((-1,48,2))

In [18]:
face_features_predicted = np.concatenate((audio_features_X_FF, lips), axis=1)

# Video Model

In [19]:
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
device

device(type='cuda', index=0)

In [20]:
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.leakyReLU = nn.LeakyReLU(0.2, inplace=False)
        self.conv1 = nn.Conv2d(3, 6, 4, 2, (1,0))
        self.conv2 = nn.Conv2d(6, 12, 4, 2, (0,1))
        self.batchnorm1 = nn.BatchNorm2d(12)
        self.conv3 = nn.Conv2d(12, 12, 4, 2, (1,0))
        self.batchnorm2 = nn.BatchNorm2d(12)
        self.conv4 = nn.Conv2d(12, 12, 4, 2, 1)
        self.batchnorm3 = nn.BatchNorm2d(12)
        self.linear1 = nn.Linear(288, 30)
        self.linear2 = nn.Linear(30, 60)
        
        self.linear1_ff = nn.Linear(136, 40)
        self.linear2_ff = nn.Linear(40, 10)
        
        self.up_linear1 = nn.Linear(70, 30)
        self.up_linear2 = nn.Linear(30, 288)
        
        self.up_stage_0 = nn.Conv2d(24, 3, 3, 1, 1)

        self.up_convt1 = nn.ConvTranspose2d(24,12,4,2,1)
        self.up_batchnorm1 = nn.BatchNorm2d(12)
        self.up_stage_1 = nn.Conv2d(24, 3, 3, 1, 1)

        self.up_convt2 = nn.ConvTranspose2d(24,12,4,2,(1,0))
        self.up_batchnorm2 = nn.BatchNorm2d(12)
        self.up_stage_2 = nn.Conv2d(24, 3, 3, 1, 1)

        self.up_convt3 = nn.ConvTranspose2d(24,6,4,2,(0,1))
        self.up_batchnorm3 = nn.BatchNorm2d(6)
        self.up_stage_3 = nn.Conv2d(12, 3, 3, 1, 1)

        self.up_convt4 = nn.ConvTranspose2d(12,3,4,2,(1,0))
        self.up_batchnorm4 = nn.BatchNorm2d(3)
        
        self.res_conv1 = nn.Conv2d(3,10,5,1,2)
        self.res_conv2 = nn.Conv2d(10,3,5,1,2)
        self.res_conv3 = nn.Conv2d(6,3,5,1,2)
        
        self.sigmoid = nn.Sigmoid()
        
        self.upsample = nn.Upsample((100,74))
        
    def forward(self, original_image, face_features, return_stage=-1):
        y = self.leakyReLU(self.linear1_ff(face_features))
        y = self.leakyReLU(self.linear2_ff(y))
        
        x_down_1 =original_image.clone()
        x_down_1[original_image==-1]=0
        
        x_down_2 = self.leakyReLU(self.conv1(x_down_1))

        x_down_3 = self.conv2(x_down_2)
        x_down_3 = self.leakyReLU(self.batchnorm1(x_down_3))

        x_down_4 = self.conv3(x_down_3)
        x_down_4 = self.leakyReLU(self.batchnorm2(x_down_4))

        x_down_5 = self.conv4(x_down_4)
        x_down_5 = self.leakyReLU(self.batchnorm3(x_down_5))

        x = x_down_5.reshape((-1,288))
        
        x = self.leakyReLU(self.linear1(x))
        x = self.leakyReLU(self.linear2(x))

        x = torch.cat((x,y), dim=1)
        
        x = self.leakyReLU(self.up_linear1(x))
        x = self.leakyReLU(self.up_linear2(x))
        
        x = x.reshape((-1, 12, 6, 4))
        
        x = torch.cat((x,x_down_5), dim=1)
        x0 = self.upsample(self.sigmoid(self.up_stage_0(x)))
        
        x = self.leakyReLU(self.up_batchnorm1(self.up_convt1(x)))
        x = torch.cat((x,x_down_4), dim=1)
        x1 = self.upsample(self.sigmoid(self.up_stage_1(x)))
        
        x = self.leakyReLU(self.up_batchnorm2(self.up_convt2(x)))
        x = torch.cat((x,x_down_3), dim=1)
        x2 = self.upsample(self.sigmoid(self.up_stage_2(x)))
        
        x = self.leakyReLU(self.up_batchnorm3(self.up_convt3(x)))
        x = torch.cat((x,x_down_2), dim=1)
        x3 = self.upsample(self.sigmoid(self.up_stage_3(x)))
        
        x = self.sigmoid(self.up_batchnorm4(self.up_convt4(x)))
        x4 = x
        
        x = x.clone()
        x[original_image!=-1] = original_image[original_image!=-1]
        
        x_start = x.clone()
        x = self.leakyReLU(self.res_conv1(x))
        x = self.leakyReLU(self.res_conv2(x))
        x = torch.cat((x,x_start),dim=1)
        x = torch.sigmoid(self.res_conv3(x)).clone()
        mask = original_image!=-1
        x[mask] = original_image[mask]
        x5 = x
        
        return x0, x1, x2, x3, x4, x5

generator = Generator()
generator = generator.to(device)

In [31]:
generator.load_state_dict(torch.load(bibi_inpainting_model))
generator.eval()

Generator(
  (leakyReLU): LeakyReLU(negative_slope=0.2)
  (conv1): Conv2d(3, 6, kernel_size=(4, 4), stride=(2, 2), padding=(1, 0))
  (conv2): Conv2d(6, 12, kernel_size=(4, 4), stride=(2, 2), padding=(0, 1))
  (batchnorm1): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(12, 12, kernel_size=(4, 4), stride=(2, 2), padding=(1, 0))
  (batchnorm2): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(12, 12, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  (batchnorm3): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear1): Linear(in_features=288, out_features=30, bias=True)
  (linear2): Linear(in_features=30, out_features=60, bias=True)
  (linear1_ff): Linear(in_features=136, out_features=40, bias=True)
  (linear2_ff): Linear(in_features=40, out_features=10, bias=True)
  (up_linear1): Linear(in_features=70, out_features=30, bias=True)
  (up_linear2): Line

In [32]:
with tqdm(range(face_features_predicted.shape[0]),desc='predicting video') as _tqdm:
    for i in _tqdm:
        pred = generator(
            torch.Tensor([frames_masked[i].transpose([2,0,1])]).to(device),
            torch.Tensor([face_features_predicted[i].reshape(-1)]).to(device)
        )[5].detach().cpu().numpy().transpose([0,2,3,1])[0]
        i1,i2,j1,j2 = frames_cut_params[i]
        i_size,j_size = i2-i1, j2-j1
        frames[i,i1:i2,j1:j2]=resize(pred,(i_size,j_size),anti_aliasing=True).astype(np.float32)

predicting video: 100%|██████████| 200/200 [00:02<00:00, 73.11it/s]


In [33]:
os.mkdir(temp_video_folder)

In [34]:
with tqdm(range(frames_masked.shape[0]), desc='saving temp images') as _tqdm:
    for i in _tqdm:
        scipy.misc.toimage(frames[i], cmin=0.0, cmax=1.0).save(os.path.join(temp_video_folder,'%05d.png' % i))

`toimage` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use Pillow's ``Image.fromarray`` directly instead.
  This is separate from the ipykernel package so we can avoid doing imports until

saving temp images: 100%|██████████| 200/200 [00:28<00:00,  6.75it/s]


In [35]:
IMAGE_DIR_FMT = os.path.join(temp_video_folder,'%05d.png')
!ffmpeg -framerate $video_frames_per_seconds -i $IMAGE_DIR_FMT -i $audio_fn -shortest temp_video_.mp4 

ffmpeg version 3.2.12-1~deb9u1 Copyright (c) 2000-2018 the FFmpeg developers
  built with gcc 6.3.0 (Debian 6.3.0-18+deb9u1) 20170516
  configuration: --prefix=/usr --extra-version='1~deb9u1' --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libebur128 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxvid --enable-libzmq --enable-libzvbi --ena

In [36]:
!ffmpeg -i temp_video_.mp4 -i $audio_fn -shortest $output_fn 

ffmpeg version 3.2.12-1~deb9u1 Copyright (c) 2000-2018 the FFmpeg developers
  built with gcc 6.3.0 (Debian 6.3.0-18+deb9u1) 20170516
  configuration: --prefix=/usr --extra-version='1~deb9u1' --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libebur128 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxvid --enable-libzmq --enable-libzvbi --ena

In [37]:
!ffmpeg -i $output_fn

ffmpeg version 3.2.12-1~deb9u1 Copyright (c) 2000-2018 the FFmpeg developers
  built with gcc 6.3.0 (Debian 6.3.0-18+deb9u1) 20170516
  configuration: --prefix=/usr --extra-version='1~deb9u1' --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libebur128 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxvid --enable-libzmq --enable-libzvbi --e

In [38]:
os.remove('temp_video_.mp4')

In [39]:
shutil.rmtree(temp_video_folder)