In [11]:
import torch
# Mixed-dtype check: multiply an integer index tensor by a float tensor and
# print each operand's dtype/device before printing the product.
idx_tensor = torch.arange(15)
tensor = torch.randn(15)
print(f"{idx_tensor.dtype} {idx_tensor.device}")
print(f"{tensor.dtype} {tensor.device}")
print(torch.mul(idx_tensor, tensor))

torch.int64 cpu
torch.float32 cpu
tensor([ -0.0000,   0.5068,   2.9216,  -1.5335,   3.3608,  12.0714,   6.3862,
         -7.4236,   0.9255,   7.3898,  22.2197,  11.5649,   0.0366, -25.1163,
        -21.7017])


In [6]:
import torch 

# Mixed-precision check on the GPU: multiply a float32 tensor by a float16 one.
# Both operands are sampled directly on the device in their final dtype instead
# of being created on the host, cast, and then copied across.
tensor1 = torch.randn(15, dtype=torch.float32, device="cuda")
tensor2 = torch.randn(15, dtype=torch.float16, device="cuda")
print(tensor1.dtype, tensor1.device)
print(tensor2.dtype, tensor2.device)
# No dtype check is needed before multiplying operands of different float widths.
print(tensor1 * tensor2)

torch.float32 cuda:0
torch.float16 cuda:0
tensor([-0.6114, -0.0056, -0.0560, -0.3981, -1.2259,  0.7283, -0.0042, -0.1142,
        -2.0781,  0.8457, -0.1566,  0.1703,  1.0486, -0.0755, -0.2323],
       device='cuda:0')


In [3]:
import torch
import sys
import os
from pathlib import Path
sys.path.append(str(Path(os.path.abspath("")).parents[2]))

from src.sadtalker.src.facerender.modules.make_animation import keypoint_transformation

# Smoke test: feed random float16 inputs through keypoint_transformation and
# print the shape of the transformed keypoints.
batch_size = 50
kp_canonical = {"value": torch.randn(batch_size, 15, 3).type(torch.float16)}

# Head-pose entries and their trailing sizes, listed in sampling order.
he_source = {
    key: torch.randn(batch_size, width).type(torch.float16)
    for key, width in (
        ("yaw", 66),
        ("pitch", 66),
        ("roll", 66),
        ("t", 3),
        ("exp", 45),
    )
}

kp_source = keypoint_transformation(kp_canonical, he_source)
print(kp_source["value"].shape)

torch.Size([50, 15, 3])


In [20]:
import time
import sys
import os
from pathlib import Path

sys.path.append(str(Path(os.path.abspath("")).parents[2]))
from src.sadtalker.src.facerender.modules.util import make_coordinate_grid, kp2gaussian

# Time the project's make_coordinate_grid and kp2gaussian helpers.
# NOTE(review): this cell uses `torch` but does not import it; it relies on an
# earlier cell having done so.
batch_size = 50
kp_driving = {
    "value": torch.randn(batch_size, 15, 3).type(torch.float32).to("cuda:0"),
}
cuda_device = kp_driving["value"].device

start = time.time()
identity_grid = make_coordinate_grid((batch_size, 15, 3))
print(time.time() - start, identity_grid.shape)

spatial_size = torch.Size([16, 64, 64])
# CUDA work is queued asynchronously, so drain the device before starting the
# clock and again before stopping it; otherwise the interval can miss work
# that is still running (or include work queued earlier).
torch.cuda.synchronize(cuda_device)
start = time.time()
gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01)
torch.cuda.synchronize(cuda_device)
print(time.time() - start, gaussian_driving.shape)

0.00101470947265625 torch.Size([50, 15, 3, 3])
0.28836512565612793 torch.Size([50, 15, 16, 64, 64])


In [21]:
import torch
from torch.profiler import (
    profile,
    record_function,
    ProfilerActivity,
    tensorboard_trace_handler,
)


def make_coordinate_grid(
    spatial_size: tuple[int, int, int],
    device: "torch.device | str | None" = None,
) -> torch.Tensor:
    """
    Build a dense grid of normalised (x, y, z) coordinates.

    Args:
        spatial_size: ``(d, h, w)`` - number of samples along depth, height
            and width.
        device: Device on which the grid is created. ``None`` (the default)
            lets PyTorch pick its default device, which is what the function
            did before this parameter existed.

    Returns:
        A tensor of shape ``(d, h, w, 3)``. ``grid[k, j, i]`` holds
        ``(x_i, y_j, z_k)``, where every axis is sampled evenly from -1 to 1
        inclusive. An axis of length 1 is placed at -1 rather than becoming
        NaN through a 0/0 division.
    """
    d, h, w = spatial_size

    def _axis(n: int) -> torch.Tensor:
        # max(..., 1) only matters for n == 1, where (n - 1) would be zero.
        return 2 * (torch.arange(n, device=device) / max(n - 1, 1)) - 1

    x, y, z = _axis(w), _axis(h), _axis(d)

    # Broadcast each 1-D axis over the full volume; stack() materialises the
    # result, so no in-place unsqueeze on intermediate tensors is needed.
    xx = x.view(1, 1, w).expand(d, h, w)
    yy = y.view(1, h, 1).expand(d, h, w)
    zz = z.view(d, 1, 1).expand(d, h, w)

    return torch.stack((xx, yy, zz), dim=3)


def kp2gaussian(kp, spatial_size, kp_variance):
    """
    Transform keypoints into a gaussian-like volumetric representation.

    Args:
        kp: Mapping with a ``"value"`` tensor of shape ``(*leading, 3)``.
        spatial_size: ``(d, h, w)`` passed to ``make_coordinate_grid``.
        kp_variance: Divisor applied to the squared distance in the exponent.

    Returns:
        Tensor of shape ``(*leading, d, h, w)`` equal to
        ``exp(-0.5 * ||grid - keypoint||^2 / kp_variance)``, on the keypoints'
        device.
    """
    mean = kp["value"]
    leading_shape = tuple(mean.shape[:-1])

    # Move only the single (d, h, w, 3) grid to the keypoints' device. The
    # subtraction below broadcasts it against every keypoint, so the grid is
    # never repeated to (*leading, d, h, w, 3) on the host and copied across.
    coordinate_grid = make_coordinate_grid(spatial_size).to(
        mean.device, non_blocking=True
    )
    coordinate_grid = coordinate_grid.view(
        (1,) * len(leading_shape) + tuple(coordinate_grid.shape)
    )

    # One broadcastable (1, 1, 1, 3) slot per keypoint.
    mean = mean.view(*leading_shape, 1, 1, 1, 3)

    mean_sub = coordinate_grid - mean

    return torch.exp(-0.5 * (mean_sub**2).sum(-1) / kp_variance)


import os

# Profile one kp2gaussian call (CPU + CUDA activity, shapes, memory, stacks)
# and save the result as a Chrome trace.
batch_size = 30
kp_driving = {"value": torch.randn(batch_size, 15, 3).type(torch.float32).to("cuda:0")}
spatial_size = (16, 64, 64)
kp_variance = 0.01
trace_path = "./log/kp2gaussian.json"


with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    # on_trace_ready=tensorboard_trace_handler("./log/kp2gaussian"),
) as prof:
    with record_function("kp2gaussian"):
        kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=kp_variance)

# Make sure the output directory exists so the export does not fail on a
# fresh checkout where ./log has never been created.
os.makedirs(os.path.dirname(trace_path), exist_ok=True)
prof.export_chrome_trace(trace_path)

In [1]:
import torch 
import time
# start = time.time()
# coordinate_grid = torch.randn((50, 15, 16, 64, 64, 3), device="cpu")
# print(time.time() - start)


def make_coordinate_grid(spatial_size, device="cpu"):
    """
    Build a ``(d, h, w, 3)`` grid of normalised (x, y, z) coordinates while
    printing the device of every intermediate tensor.

    Args:
        spatial_size: ``(d, h, w)`` - number of samples along depth, height
            and width.
        device: Device on which the axes are created. Defaults to ``"cpu"``,
            the value that used to be hard-coded in the body.

    Returns:
        Tensor of shape ``(d, h, w, 3)`` whose last axis is ``(x, y, z)``,
        each sampled evenly from -1 to 1 inclusive. An axis of length 1 is
        placed at -1 rather than becoming NaN through a 0/0 division.
    """
    d, h, w = spatial_size
    x = torch.arange(w, device=device)
    y = torch.arange(h, device=device)
    z = torch.arange(d, device=device)
    print(x.device, y.device, z.device)

    # max(..., 1) only matters for a singleton axis, where (n - 1) is zero.
    x = 2 * (x / max(w - 1, 1)) - 1
    y = 2 * (y / max(h - 1, 1)) - 1
    z = 2 * (z / max(d - 1, 1)) - 1
    print(x.device, y.device, z.device)

    yy = y.view(1, h, 1).expand(d, h, w)
    xx = x.view(1, 1, w).expand(d, h, w)
    zz = z.view(d, 1, 1).expand(d, h, w)
    print(yy.device, xx.device, zz.device)

    # stack() adds the trailing coordinate axis without mutating xx/yy/zz.
    meshed = torch.stack((xx, yy, zz), dim=3)
    print(meshed.device)
    return meshed

# spatial_size = (16, 64, 64)
# start = time.time()
# make_coordinate_grid(spatial_size)
# print(time.time() - start)

def headpose_pred_to_degree(pred):  # slow
    """
    Collapse per-bin scores into one value per row.

    Softmax is taken over dim 1, the expected bin index is computed under that
    distribution, and the result is mapped through ``index * 3 - 99``.

    Args:
        pred: 2-D tensor of scores, one row per sample and one column per bin.

    Returns:
        1-D tensor with one value per row of ``pred``.
    """
    num_bins = pred.shape[1]
    bin_index = torch.arange(num_bins, device=pred.device)
    probabilities = torch.softmax(pred, dim=1)
    # NOTE: the integer bin_index and floating-point probabilities may have
    # different dtypes; the product does not require them to match.
    expected_bin = (probabilities * bin_index).sum(dim=1)
    return expected_bin * 3 - 99

# pred = torch.randn(50, 66).to("cpu")
# start = time.time()
# headpose_pred_to_degree(pred)
# print(time.time() - start)

# Allocate four large random tensors on "cuda:3", one per second, and keep
# them referenced in `predictions` so they are not freed.
# Each tensor has 1024**3 elements (4 GiB apiece if the default dtype is
# float32 - TODO confirm the default dtype has not been changed).
predictions = []
for _ in range(4):
    time.sleep(1)  # pause between allocations
    predictions.append(torch.randn((1 * 1024**3), device="cuda:3"))


In [20]:
import torch
import time
from torch.nn.parallel import DataParallel
import sys
import os
from pathlib import Path

sys.path.append(str(Path(os.path.abspath("")).parents[2]))

from src.sadtalker.src.utils.init_path import init_path
from src.sadtalker.src.facerender.animate import AnimateFromCoeff
from src.sadtalker.src.facerender.modules.generator import OcclusionAwareSPADEGenerator
from src.sadtalker.src.facerender.modules.make_animation import keypoint_transformation

# SadTalker resources are resolved relative to `parents[2]` of the absolute
# path of the current working directory.
image_size = 256
image_preprocess = "crop"
checkpoint_path, gfpgan_path, config_path = (
    Path(os.path.abspath("")).parents[2] / relative_dir
    for relative_dir in (
        "src/sadtalker/checkpoints",
        "src/sadtalker/gfpgan/weights",
        "src/sadtalker/src/config",
    )
)
# The fifth positional argument is passed as False; its meaning is defined by
# init_path.
sadtalker_paths = init_path(
    *(str(path) for path in (checkpoint_path, gfpgan_path, config_path)),
    image_size,
    False,
    image_preprocess,
)

batch_size = 30
def generate(
    source_image: torch.Tensor,
    source_semantics: torch.Tensor,
    target_semantics: torch.Tensor,
    generator: (
        OcclusionAwareSPADEGenerator | DataParallel[OcclusionAwareSPADEGenerator]
    ),
    device,
    dtype,
):
    """
    Run one generator forward pass on randomly generated keypoint inputs.

    Canonical keypoints and two sets of head-pose estimates (source and
    driving) are sampled with ``torch.rand``, each set is passed through
    ``keypoint_transformation``, and the generator is called under
    ``torch.no_grad()``.

    Args:
        source_image: Image batch handed to the generator. Its first dimension
            sets the batch size of every random tensor created here.
        source_semantics: Accepted for signature compatibility; not used.
        target_semantics: Accepted for signature compatibility; not used.
        generator: Callable invoked as
            ``generator(source_image, kp_source=..., kp_driving=...)``.
        device: Device for the random tensors.
        dtype: Dtype for the random tensors.

    Returns:
        Whatever ``generator`` returns.
    """
    # Take the batch size from the actual input rather than a notebook global,
    # so the random tensors always match `source_image`, even if the global is
    # changed after the image batch was built.
    num_samples = source_image.shape[0]

    def _rand(*trailing):
        # Uniform random tensor of shape (num_samples, *trailing).
        return torch.rand((num_samples, *trailing), device=device, dtype=dtype)

    def _random_head_pose():
        # One random head-pose estimate dict; entries are sampled in key order.
        return {
            "yaw": _rand(66),
            "pitch": _rand(66),
            "roll": _rand(66),
            "t": _rand(3),
            "exp": _rand(45),
        }

    with torch.no_grad():
        kp_canonical = {"value": _rand(15, 3)}
        kp_source = keypoint_transformation(kp_canonical, _random_head_pose())
        kp_driving = keypoint_transformation(kp_canonical, _random_head_pose())
        return generator(source_image, kp_source=kp_source, kp_driving=kp_driving)

# Build the model on cuda:1 in float32, then create random stand-in inputs
# and cast/move them to the model's dtype and device.
model = AnimateFromCoeff(sadtalker_paths, device="cuda:1", dtype=torch.float32)

source_image = (
    torch.rand(batch_size, 3, 256, 256).type(model.dtype).to(model.device)
)
source_semantics = (
    torch.rand(batch_size, 70, 27).type(model.dtype).to(model.device)
)
target_semantics = (
    torch.rand(batch_size, 8, 70, 27).type(model.dtype).to(model.device)
)


using safetensor as default
Loading checkpoint from /home/80026129/PROJECTS/oncobot/src/sadtalker/checkpoints/SadTalker_V0.0.2_256.safetensors
OcclusionAwareSPADEGenerator model size: 385.550 MB
KPDetector model size: 160.013 MB
HEEstimator model size: 115.643 MB
MappingNet model size: 38.892 MB
dtype: torch.float32 device: cuda:1 dp_device_ids: None


In [25]:
def _generate_once():
    """Run a single generate() call with the notebook's current inputs and model."""
    generate(
        source_image,
        source_semantics,
        target_semantics,
        model.generator,
        model.device,
        model.dtype,
    )


# Warm-up iterations. The device is synchronised before the clock starts and
# again before it is read.
for _ in range(2):
    torch.cuda.synchronize()
    start = time.time()
    print("start", flush=True)
    _generate_once()
    print("end", flush=True)
    torch.cuda.synchronize()
    elapsed = time.time() - start
    print("warmup time", elapsed, flush=True)

# Timed iterations - currently disabled, since range(0) never executes.
for _ in range(0):
    torch.cuda.synchronize()
    start = time.time()
    _generate_once()
    torch.cuda.synchronize()
    elapsed = time.time() - start
    print("time", elapsed, flush=True)

start
end
warmup time 0.03161358833312988
start
end
warmup time 2.487888813018799


: 