**Config**

In [3]:
#
# https://github.com/yoyo-nb/Thin-Plate-Spline-Motion-Model
# https://github.com/AliaksandrSiarohin/monkey-net
# https://arxiv.org/abs/1812.08861
# https://papers.nips.cc/paper/2019/file/31c0b36aef265d9221af80872ceb62f9-Paper.pdf
#
# https://www.youtube.com/watch?v=XjObqq6we4U
# https://colab.research.google.com/drive/11pf0SkMIhz-d5Lo-m7XakXrgVHhycWg6?usp=sharing#scrollTo=czsWABcK_2KE
# https://discord.com/channels/1054529544817741927/1054529545568538756
# https://www.d-id.com/creative-reality-studio/
# 
# NOTE: As a detour, we will use a related project (Monkey-Net) to learn the methods more quickly.
# http://192.168.1.239:7878/notebooks/demo.ipynb
#
# Putting audio back into the resulting video
# ffmpeg -an -i result.mp4 -vn -i assets/video_intro.mp4 -c:a copy -c:v copy result_with_audio.mp4
# check: https://youtu.be/zZTOsm6Wm2w?t=270
#
# Changing frame rate:
# ffmpeg -i samfridman.mov -filter:v fps=30 lexfridman.mp4

In [33]:
import torch

# Edit the config.
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the later `.to(device)` / `torch.load(..., map_location=device)` calls
# still work on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
dataset_name = "vox"  # one of: 'vox', 'taichi', 'ted', 'mgif'

# Input still image, driving video, and where the generated video is written.
source_image_path = "./assets/anderson.png"
driving_video_path = "./assets/anastasi.mp4"
output_video_path = "./generated.mp4"

config_path = "config/vox-256.yaml"
checkpoint_path = "checkpoints/vox.pth.tar"
predict_mode = "relative"  # one of: 'standard', 'relative', 'avd'
# When animating a face in 'relative' mode, setting this to True can give a
# better-quality result.
find_best_frame = False

# Side length (in pixels) that the source image and every driving frame are
# resized to: 384*384 for ted, 256*256 for vox, taichi and mgif.
pixel = 384 if dataset_name == "ted" else 256


**Read image and video**

In [None]:
import imageio
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from skimage.transform import resize
from IPython.display import HTML
import warnings
# Silence every warning for the rest of the kernel session. This keeps the
# cell output clean but also hides deprecation notices from the libraries
# used below.
warnings.filterwarnings("ignore")


def _iter_frames(video_reader):
    """Yield frames from `video_reader` until it is exhausted.

    A RuntimeError raised by the reader while iterating is treated as the end
    of the stream (best-effort read, as in the original loop). Errors raised
    by the consumer of this generator are not caught here.
    """
    try:
        yield from video_reader
    except RuntimeError:
        return


# Source still: scale to pixel x pixel and keep only the first three channels.
source_image = imageio.imread(source_image_path)
source_image = resize(source_image, (pixel, pixel))[..., :3]

reader = imageio.get_reader(driving_video_path)
try:
    fps = reader.get_meta_data()['fps']
    print("fps: ", fps)

    # Resize each frame as it is read so the full-resolution frames never have
    # to be held in memory all at once.
    driving_video = [resize(frame, (pixel, pixel))[..., :3]
                     for frame in _iter_frames(reader)]
finally:
    # Always release the reader, even if reading or resizing fails.
    reader.close()

if not driving_video:
    raise ValueError("no frames could be read from " + driving_video_path)

def display(source, driving, generated=None):
    """Build a side-by-side animation of the source, driving and generated frames.

    Args:
        source: image shown in the left-most panel of every animation frame.
        driving: sequence of frames; one animation frame is produced per entry.
        generated: optional sequence indexed in step with `driving`; when
            given, its frames are appended as a third panel and the figure is
            made wider.

    Returns:
        A `matplotlib.animation.ArtistAnimation` over the concatenated panels.
        The backing figure is closed before returning.
    """
    has_generated = generated is not None
    fig = plt.figure(figsize=(8 + 4 * has_generated, 6))

    artists = []
    for idx, driving_frame in enumerate(driving):
        panels = [source, driving_frame]
        if has_generated:
            panels.append(generated[idx])
        # Panels are joined along axis 1, i.e. placed next to each other.
        strip = np.concatenate(panels, axis=1)
        artist = plt.imshow(strip, animated=True)
        plt.axis('off')
        artists.append([artist])

    ani = animation.ArtistAnimation(fig, artists, interval=50, repeat_delay=1000)
    plt.close()
    return ani
    

# Preview the source image next to the driving video as an embedded HTML5 video.
preview_animation = display(source_image, driving_video)
HTML(preview_animation.to_html5_video())

**Create a model and load checkpoints**

In [None]:
import yaml
import torch
from modules.inpainting_network import InpaintingNetwork
from modules.keypoint_detector import KPDetector
from modules.dense_motion import DenseMotionNetwork
from modules.avd_network import AVDNetwork

# Parse the model configuration.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; only point
# config_path at files you trust.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.Loader)

model_params = config['model_params']
common_params = model_params['common_params']

# Instantiate the four networks from their sections of the config.
inpainting = InpaintingNetwork(**model_params['generator_params'],
                               **common_params)
kp_detector = KPDetector(**common_params)
dense_motion_network = DenseMotionNetwork(**common_params,
                                          **model_params['dense_motion_params'])
avd_network = AVDNetwork(num_tps=common_params['num_tps'],
                         **model_params['avd_network_params'])

# Move every network onto the configured device.
for network in (kp_detector, dense_motion_network, inpainting, avd_network):
    network.to(device)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load(checkpoint_path, map_location=device)

# Restore the weights. The AVD weights are optional: when the checkpoint has no
# 'avd_network' entry, avd_network keeps its freshly initialised parameters.
for network, state_key in ((inpainting, 'inpainting_network'),
                           (kp_detector, 'kp_detector'),
                           (dense_motion_network, 'dense_motion_network')):
    network.load_state_dict(checkpoint[state_key])
if 'avd_network' in checkpoint:
    avd_network.load_state_dict(checkpoint['avd_network'])

# Switch all networks to evaluation mode. avd_network.eval() is kept as the
# final expression so the cell output is unchanged.
for network in (inpainting, kp_detector, dense_motion_network):
    network.eval()
avd_network.eval()

**Perform image animation**

In [None]:
from demo import make_animation
from skimage import img_as_ubyte

def _animate(frames):
    """Call make_animation on `frames` with the notebook's source image,
    networks, device and prediction mode."""
    return make_animation(source_image, frames, inpainting, kp_detector,
                          dense_motion_network, avd_network,
                          device=device, mode=predict_mode)


if predict_mode == 'relative' and find_best_frame:
    from demo import find_best_frame as _find
    # NOTE(review): the third positional argument of the upstream
    # demo.find_best_frame(source, driving, cpu) is a *cpu* flag, so it must be
    # True when running on the CPU. The previous `device.type=='cuda'` passed
    # the inverse. Confirm against the local demo.py.
    i = _find(source_image, driving_video, device.type == 'cpu')
    print ("Best frame: " + str(i))
    # Animate outwards from the best frame in both directions, then stitch the
    # halves back into original order. Frame i is in both halves, so it is
    # dropped from the forward half.
    driving_forward = driving_video[i:]
    driving_backward = driving_video[:(i+1)][::-1]
    predictions_forward = _animate(driving_forward)
    predictions_backward = _animate(driving_backward)
    predictions = predictions_backward[::-1] + predictions_forward[1:]
else:
    predictions = _animate(driving_video)

# Save the resulting video at the frame rate read from the driving video.
imageio.mimsave(output_video_path, [img_as_ubyte(frame) for frame in predictions], fps=fps)

HTML(display(source_image, driving_video, predictions).to_html5_video())

In [None]:
!rm generated_with_audio.mp4
!ffmpeg -an -i generated.mp4 -vn -i assets/anastasi.mp4 -c:a copy -c:v copy generated_with_audio.mp4