In [1]:
from models import create_model
from utils.options import parse
from PIL import Image
import torch
import numpy as np

In [2]:
import cv2

In [3]:
# Appearance sampler: parse its YAML config with is_train=False.
app_opt_path = './configs/sampler/sampler_high_res.yml'
app_opt = parse(app_opt_path, is_train=False)
# Override two options before the model is built: the checkpoint path and 'dist'.
app_opt['pretrained_sampler'] = './pretrained_models/sampler_high_res.pth'
app_opt['dist'] = False  # presumably disables distributed execution — confirm in utils.options
# Build the model from the options; load_network() presumably reads the checkpoint set above.
app_model = create_model(app_opt)
app_model.load_network()

Working with z of shape (1, 256, 32, 16) = 131072 dimensions.


In [4]:
# Motion (video transformer) model: parse its YAML config with is_train=False.
motion_opt_path = './configs/video_transformer/video_trans_high_res.yml'
motion_opt = parse(motion_opt_path, is_train=False)
# NOTE(review): the video-transformer checkpoint is stored under the
# 'pretrained_sampler' key — confirm this is the key the model actually reads.
motion_opt['pretrained_sampler'] = './pretrained_models/video_trans_high_res.pth'
motion_opt['dist'] = False  # presumably disables distributed execution — confirm in utils.options
# Build the model from the options; load_network() presumably reads the checkpoint set above.
motion_model = create_model(motion_opt)
motion_model.load_network()

Working with z of shape (1, 256, 32, 16) = 131072 dimensions.
Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off]




Loading model from: D:\DIPLOMA\Code\Text2Performer\venv\Lib\site-packages\lpips\weights\v0.1\vgg.pth


In [5]:
def load_raw_image(img_path, downsample=True):
    """Open an image file and return a Lanczos-resized copy of it.

    Args:
        img_path: Path of the image file to read.
        downsample: When True, both dimensions are halved (integer
            division). When False, the image is resized to its own size.

    Returns:
        The object returned by ``Image.resize`` for the opened image.
    """
    with open(img_path, 'rb') as handle:
        source = Image.open(handle)
        src_w, src_h = source.size
        # A divisor of 1 keeps the size but still runs the image through resize,
        # so the result is produced while the file handle is open.
        divisor = 2 if downsample else 1
        target_size = (src_w // divisor, src_h // divisor)
        resized = source.resize(size=target_size, resample=Image.LANCZOS)

    return resized

In [6]:
import os

In [7]:
from utils.util import set_random_seed

In [8]:
# Fix the seed through the project helper before any sampling happens
# (assumes it seeds every RNG the models draw from — confirm in utils.util).
set_random_seed(8)

In [9]:
# Root output directory: every later cell builds its output paths from save_dir.
save_dir = './results'
os.makedirs(save_dir, exist_ok=True)  # exist_ok: re-running the cell is harmless

In [10]:
# Sample an appearance from a text description. The two returned values are
# reused by every motion-sampling call further down; the second argument is
# an output path under save_dir.
# The prompt is passed as plain text: it must not carry a leading quote
# character, which would otherwise be fed to the text encoder as part of
# the description.
x_identity, x_pose = app_model.sample_appearance(
    ['The dress the person wears has long sleeves and it is of short length. Its texture is pure color.'],
    f'{save_dir}/exampler.png'
)

Sample timestep    1

In [11]:
# One text prompt per generated clip; clip N is written to f'{save_dir}/sequenceN'.
# Commented-out prompts are optional extra clips (sequence3, sequence4).
motion_prompts = [
    'The lady moves to the right.',
    'The person is moving to the center from the right.',
    # 'She turns right from the front to the side.',
    # 'She turns right from the side to the back.',
]

for seq_number, prompt in enumerate(motion_prompts, start=1):
    seq_dir = f'{save_dir}/sequence{seq_number}'
    # Fresh all-zero embedding buffer for each clip, on the model's device.
    video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
    motion_model.sample_multinomial_text_embeddings(
        x_identity, x_pose,
        [prompt],
        8, list(range(0, 8)),
        video_embeddings_pred,
        seq_dir)
    motion_model.refine_synthesized(x_identity, seq_dir)

In [13]:
def inter_sequence_inter(first_seq_idx, second_seq_idx):
    """Synthesize a transition clip between two already-generated sequences.

    Frame ``007.png`` of sequence ``first_seq_idx`` and frame ``000.png`` of
    sequence ``second_seq_idx`` are read from ``save_dir``, encoded with
    ``motion_model.get_quantized_frame_embedding`` and written into the first
    and last 32 rows of a zero-initialised ``(1, 8*32, 128)`` buffer. The
    buffer is handed to the sampler with the prompt ``'empty'`` and frame
    indices 1..6 (presumably the frames left to synthesize — confirm against
    ``sample_multinomial_text_embeddings``). Output goes to
    ``{save_dir}/sequence{first_seq_idx}_{second_seq_idx}``, which is then
    passed to ``refine_synthesized``.

    Uses the notebook globals ``motion_model``, ``save_dir``, ``x_identity``,
    ``x_pose`` and ``load_raw_image``.

    Args:
        first_seq_idx: Identifier of the sequence the transition starts from.
        second_seq_idx: Identifier of the sequence the transition ends at.

    Returns:
        None. Results are written to disk by the model.
    """
    # Use the model's own device for every tensor. Guessing it from
    # torch.cuda.is_available() can disagree with where the model and the
    # embedding buffer actually live.
    device = motion_model.device

    def _embed_frame(frame_path):
        """Load one saved frame and return its embedding, channels last."""
        frame = load_raw_image(frame_path, downsample=False)
        # HWC image -> CHW float32 array.
        frame = np.array(frame).transpose(2, 0, 1).astype(np.float32)
        # Rescale so that 0 maps to -1 and 255 maps to 1.
        frame = frame / 127.5 - 1
        frame = torch.from_numpy(frame).unsqueeze(0).to(device)
        embedding = motion_model.get_quantized_frame_embedding(frame)
        # (1, C, ...) -> (1, C, N) -> (1, N, C) with C = img_embed_dim // 2.
        return embedding.view(
            1, motion_model.img_embed_dim // 2, -1).permute(0, 2, 1).contiguous()

    out_dir = f'{save_dir}/sequence{first_seq_idx}_{second_seq_idx}'
    video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(device)

    # Anchor the clip: first 32 rows come from the end of the first sequence,
    # last 32 rows from the start of the second sequence.
    video_embeddings_pred[:, :32, :] = _embed_frame(
        f'{save_dir}/sequence{first_seq_idx}/007.png')
    video_embeddings_pred[:, -32:, :] = _embed_frame(
        f'{save_dir}/sequence{second_seq_idx}/000.png')

    motion_model.sample_multinomial_text_embeddings(x_identity, x_pose,
                                                    ['empty'],
                                                    8, list(range(1, 7)),
                                                    video_embeddings_pred,
                                                    out_dir)
    motion_model.refine_synthesized(x_identity, out_dir)

In [14]:
# Bridge sequence1 -> sequence2. Each call reads 007.png of the first sequence
# and 000.png of the second, so the commented-out pairs only work once those
# sequence directories have been generated.
inter_sequence_inter(1, 2)
# inter_sequence_inter(2, 3)
# inter_sequence_inter(3, 4)
# inter_sequence_inter(4, 5)

In [15]:
def intra_sequence_inter(seq_idx):
    """Interpolate between every pair of consecutive frames of one sequence.

    For each of the 7 consecutive frame pairs ``(k, k+1)`` in
    ``{save_dir}/sequence{seq_idx}``, both frames are encoded with
    ``motion_model.get_quantized_frame_embedding`` and written into the first
    and last 32 rows of a shared ``(1, 8*32, 128)`` buffer. The sampler is
    then called with the prompt ``'empty'`` and frame indices 1..6 (presumably
    the frames left to synthesize — confirm against
    ``sample_multinomial_text_embeddings``), with ``save_idx`` set to
    ``k*8 .. k*8+7``. All output goes to
    ``{save_dir}/sequence{seq_idx}_interpolated``, which is finally passed to
    ``refine_synthesized``.

    Uses the notebook globals ``motion_model``, ``save_dir``, ``x_identity``,
    ``x_pose`` and ``load_raw_image``.

    Args:
        seq_idx: Identifier of the sequence; it is only formatted into paths,
            so both ints (``1``) and strings (``'1_2'``) work.

    Returns:
        None. Results are written to disk by the model.
    """
    # Use the model's own device for every tensor. Guessing it from
    # torch.cuda.is_available() can disagree with where the model and the
    # embedding buffer actually live.
    device = motion_model.device

    src_dir = f'{save_dir}/sequence{seq_idx}'
    out_dir = f'{save_dir}/sequence{seq_idx}_interpolated'

    # The buffer is allocated once and reused for every frame pair.
    video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(device)

    for frame_idx in range(7):
        # (rows of the buffer to fill, frame that fills them): start then end.
        anchors = (
            (slice(None, 32), f'{src_dir}/{frame_idx:03d}.png'),
            (slice(-32, None), f'{src_dir}/{frame_idx+1:03d}.png'),
        )
        for rows, frame_path in anchors:
            frame = load_raw_image(frame_path, downsample=False)
            # HWC image -> CHW float32 array, rescaled so 0 -> -1 and 255 -> 1.
            frame = np.array(frame).transpose(2, 0, 1).astype(np.float32)
            frame = frame / 127.5 - 1
            frame = torch.from_numpy(frame).unsqueeze(0).to(device)

            embedding = motion_model.get_quantized_frame_embedding(frame)
            # (1, C, ...) -> (1, C, N) -> (1, N, C) with C = img_embed_dim // 2.
            embedding = embedding.view(
                1, motion_model.img_embed_dim // 2, -1).permute(0, 2, 1).contiguous()
            video_embeddings_pred[:, rows, :] = embedding

        motion_model.sample_multinomial_text_embeddings(
            x_identity, x_pose,
            ['empty'],
            8, list(range(1, 7)),
            video_embeddings_pred,
            out_dir,
            save_idx=list(range(frame_idx*8, (frame_idx+1)*8)))

    motion_model.refine_synthesized(x_identity, out_dir)

In [16]:
# Interpolate inside sequence1; output goes to f'{save_dir}/sequence1_interpolated'.
# The commented-out calls need the corresponding sequenceN directories to exist.
intra_sequence_inter(1)
# intra_sequence_inter(2)
# intra_sequence_inter(3)
# intra_sequence_inter(4)
# intra_sequence_inter(5)

In [17]:
# Same interpolation applied to the transition clip: the string '1_2' is only
# formatted into paths, so this reads f'{save_dir}/sequence1_2'.
intra_sequence_inter('1_2')
# intra_sequence_inter('2_3')
# intra_sequence_inter('3_4')
# intra_sequence_inter('4_5')

In [18]:
video_file_name = f'{save_dir}/video.mp4'

# Ordered list of frame paths for the final video. For each sequence index the
# sequence itself comes first, then the transition to the next sequence. In
# both cases the interpolated folder (56 frames) wins over the plain one
# (8 frames); a folder that does not exist contributes nothing.
images = []
for seq_idx in range(1, 7):
    for clip_name in (f'sequence{seq_idx}', f'sequence{seq_idx}_{seq_idx+1}'):
        for suffix, frame_count in (('_interpolated', 56), ('', 8)):
            clip_dir = f'{save_dir}/{clip_name}{suffix}'
            if os.path.exists(clip_dir):
                print(clip_dir)
                images.extend(
                    f'{clip_dir}/{frame_no:03d}.png' for frame_no in range(frame_count))
                # First existing variant is used; skip the fallback.
                break
        

./results/sequence1_interpolated
./results/sequence1_2_interpolated
./results/sequence2


In [19]:
import shutil

In [20]:
# Sanity check: total number of frame paths collected for the video.
len(images)

120

In [21]:
# Flatten the collected frames into a single directory, renumbered
# consecutively (000.png, 001.png, ...) in the order they appear in `images`.
all_frames_dir = f'{save_dir}/all_frames'
os.makedirs(all_frames_dir, exist_ok=True)

for idx, image in enumerate(images):
    shutil.copy(image, f'{all_frames_dir}/{idx:03d}.png')

In [22]:
# Run the model's stabilization over the flattened frames; results go to target_dir.
target_dir = f'{save_dir}/all_frames_stabilized'
os.makedirs(target_dir, exist_ok=True)

# fix_video_len is the number of collected frames, so `images` must still be the
# list built for all_frames_dir when this cell runs.
motion_model.video_stabilization(x_identity, all_frames_dir, target_dir, fix_video_len=len(images))

i= 0
i= 1
i= 2
i= 3
i= 4
i= 5
i= 6
i= 7
i= 8
i= 9
i= 10
i= 11
i= 12
i= 13
i= 14
i= 15
i= 16
i= 17
i= 18
i= 19
i= 20
i= 21
i= 22
i= 23
i= 24
i= 25
i= 26
i= 27
i= 28
i= 29
i= 30
i= 31
i= 32
i= 33
i= 34
i= 35
i= 36
i= 37
i= 38
i= 39
i= 40
i= 41
i= 42
i= 43
i= 44
i= 45
i= 46
i= 47
i= 48
i= 49
i= 50
i= 51
i= 52
i= 53
i= 54
i= 55
i= 56
i= 57
i= 58
i= 59
i= 60
i= 61
i= 62
i= 63
i= 64
i= 65
i= 66
i= 67
i= 68
i= 69
i= 70
i= 71
i= 72
i= 73
i= 74
i= 75
i= 76
i= 77
i= 78
i= 79
i= 80
i= 81
i= 82
i= 83
i= 84
i= 85
i= 86
i= 87
i= 88
i= 89
i= 90
i= 91
i= 92
i= 93
i= 94
i= 95
i= 96
i= 97
i= 98
i= 99
i= 100
i= 101
i= 102
i= 103
i= 104
i= 105
i= 106
i= 107
i= 108
i= 109
i= 110
i= 111
i= 112
i= 113
i= 114
i= 115
i= 116
i= 117
i= 118
i= 119


In [23]:
video_file_name = f'{save_dir}/video.mp4'

# Use the stabilized frames that actually exist in target_dir, ordered by
# frame number, instead of guessing a fixed count of paths: cv2.imread
# returns None for a missing file, and those Nones must not reach the writer.
images = [
    f'{target_dir}/{name}'
    for name in sorted(
        (entry for entry in os.listdir(target_dir)
         if entry.endswith('.png') and entry[:-4].isdecimal()),
        key=lambda entry: int(entry[:-4]))
]
if not images:
    raise FileNotFoundError(f'No stabilized frames found in {target_dir}')

# The first frame fixes the video resolution.
frame = cv2.imread(images[0])
if frame is None:
    raise IOError(f'Could not read frame: {images[0]}')
height, width = frame.shape[:2]

# Lower-case 'mp4v' is the tag the MP4 container expects for this codec.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter(video_file_name, fourcc, 48, (width, height))
try:
    if not video.isOpened():
        raise IOError(f'Could not open video writer for {video_file_name}')
    for image in images:
        frame = cv2.imread(image)
        if frame is None:
            raise IOError(f'Could not read frame: {image}')
        video.write(frame)
finally:
    # Always finalize the file, even if a frame failed to load.
    video.release()