In [None]:
# 1. Install necessary libraries
!pip install diffusers transformers accelerate huggingface_hub


In [None]:
# 2. Imports
import torch
from torch import autocast
from diffusers import DiffusionPipeline
from huggingface_hub import snapshot_download
import gc

In [None]:
import pynvml
import torch

# Initialize NVML once at the start of your notebook, before get_gpu_metrics()
# (defined below) issues its NVML queries.
pynvml.nvmlInit()

def get_gpu_metrics(device_idx=0):
    """
    Collect a snapshot of GPU memory and utilization figures for one device.

    The snapshot combines PyTorch's ``memory_allocated`` / ``memory_reserved``
    counters with readings queried through NVML. The values are returned to the
    caller; nothing is printed.

    Args:
        device_idx: Device index, passed unchanged to the ``torch.cuda`` calls
            and to ``pynvml.nvmlDeviceGetHandleByIndex``.

    Returns:
        dict with the keys ``pytorch_allocated_MB``, ``pytorch_reserved_MB``,
        ``nvml_used_MB``, ``total_memory_MB``, ``memory_utilization_percent``,
        ``sm_utilization_percent`` and ``memory_bandwidth_utilization_percent``.
        ``memory_utilization_percent`` is the PyTorch-allocated memory as a
        share of the device total; it is not derived from the NVML reading.
    """
    def _bytes_to_mb(num_bytes):
        # Two successive divisions by 1024 convert bytes to MB.
        return num_bytes / 1024 / 1024

    # Figures reported by PyTorch
    allocated_mb = _bytes_to_mb(torch.cuda.memory_allocated(device_idx))
    reserved_mb = _bytes_to_mb(torch.cuda.memory_reserved(device_idx))
    device_props = torch.cuda.get_device_properties(device_idx)
    total_mb = _bytes_to_mb(device_props.total_memory)

    # Figures reported by NVML
    nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
    nvml_memory = pynvml.nvmlDeviceGetMemoryInfo(nvml_handle)
    nvml_utilization = pynvml.nvmlDeviceGetUtilizationRates(nvml_handle)

    return dict(
        pytorch_allocated_MB=allocated_mb,
        pytorch_reserved_MB=reserved_mb,
        nvml_used_MB=_bytes_to_mb(nvml_memory.used),
        total_memory_MB=total_mb,
        memory_utilization_percent=100 * allocated_mb / total_mb,
        sm_utilization_percent=nvml_utilization.gpu,
        memory_bandwidth_utilization_percent=nvml_utilization.memory,
    )

In [None]:
# 3. Download and Load Model
# Fetch a local snapshot of the checkpoint repository from the Hugging Face Hub.
model_path = snapshot_download(repo_id="cerspense/zeroscope_v2_576w")

# Build the pipeline with float16 weights, then move it to the GPU.
pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
pipe = pipe.to("cuda")


def _passthrough_safety_checker(images, **kwargs):
    """Return the images untouched together with an all-False flag list."""
    return images, [False] * len(images)


pipe.safety_checker = _passthrough_safety_checker  # Disable safety checker
pipe.unet.eval()


In [None]:
# Benchmark configuration read by later cells: batch_size selects how many prompts
# are taken from the prompt list, and both values size the latent tensor.
batch_size = 1
num_frames = 8

In [None]:

# Pool of example prompts the benchmark draws from.
prompt_list_full = [
    "A cinematic sunset over the mountains",
    "A bustling city skyline at night",
    "A peaceful forest in autumn",
    "A futuristic space station orbiting a planet",
    "A snowy village during Christmas",
    "A tropical beach at sunrise",
    "An ancient castle on a misty hill",
    "A colorful coral reef underwater",
    "A cute cat",
    "Dancing cat video",
    "Harry Potter",
    "A cute european village",
]

# Pick exactly one prompt per batch element. The pool is cycled so that batch sizes
# larger than the pool still yield `batch_size` prompts; a plain slice would silently
# return fewer prompts than the latent batch dimension expects.
if batch_size < 1:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
prompt_list = [prompt_list_full[i % len(prompt_list_full)] for i in range(batch_size)]

# Get tokenizer and text_encoder from pipeline
tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder

# Tokenize prompt: pad every prompt to the tokenizer's maximum length and truncate
# anything longer, so all rows have the same length.
text_inputs = tokenizer(
    prompt_list,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

# Token ids are moved to the GPU for the text encoder.
input_ids = text_inputs.input_ids.to("cuda")


In [None]:
# Encode text
# Autograd tracking is disabled for this forward pass.
with torch.no_grad():
    encoder_hidden_states = text_encoder(input_ids)[0]  # element 0 of the encoder output; expected shape (batch_size, seq_len, hidden_dim) — TODO confirm


In [None]:
# 5. Prepare Static Latents and Timestep
# Fixed timestep value (50); it is not randomly sampled.
timestep = torch.tensor([50], device="cuda", dtype=torch.float16)

# Gaussian-noise latents with shape (batch_size, 4, num_frames, 64, 64).
latent_shape = (batch_size, 4, num_frames, 64, 64)
latents = torch.randn(*latent_shape, dtype=torch.float16, device="cuda")


In [None]:
# Torch Compile
# This one line decreased the inference time by 3x (speed-up as reported by the author).
# NOTE(review): re-running this cell passes the already-compiled module to
# torch.compile again; reload the pipeline before re-running it.
pipe.unet = torch.compile(pipe.unet)

In [None]:
# 6. Warmup UNet Forward
# Run a few un-captured forward passes before CUDA Graph capture. If the UNet has
# been wrapped by torch.compile, the first call is also where compilation happens.
# Following the warm-up recipe in the PyTorch CUDA Graphs notes, the passes run on a
# side stream that is ordered after the current stream and joined back afterwards.
num_warmup_iters = 3
warmup_stream = torch.cuda.Stream()
warmup_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(warmup_stream), torch.no_grad(), autocast("cuda"):
    for _ in range(num_warmup_iters):
        _ = pipe.unet(latents, timestep, encoder_hidden_states).sample
torch.cuda.current_stream().wait_stream(warmup_stream)

# Make sure all warm-up work has finished before reporting completion.
torch.cuda.synchronize()

print("Warmup UNet Forward")


In [None]:
# 7. Capture UNet forward with CUDA Graph
# Graph capture increased SM utilization to almost 100%
print("Capturing UNet with CUDA Graph...")

# One UNet forward pass over `latents`, `timestep` and `encoder_hidden_states` is
# recorded into `graph`; `unet_output` holds the result of that captured pass.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph), torch.no_grad(), autocast("cuda"):
    unet_output = pipe.unet(latents, timestep, encoder_hidden_states).sample

# Wait for the device to go idle before announcing completion.
torch.cuda.synchronize()
print("Capture complete!")


In [None]:
# 8. Replay and Profile
# Replays the captured CUDA Graph `num_repeats` times under the PyTorch profiler,
# timing each replay with CUDA events and sampling GPU metrics after each one.
profile_logdir = "./graph_profile_log"

print("Profiling CUDA Graph replays...")

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

total_time_ms = 0.0
num_repeats = 10

# Warmup replay
graph.replay()
torch.cuda.synchronize()

metrics_per_step = []

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(profile_logdir),
    record_shapes=False,
    with_stack=False,
    profile_memory=False
) as prof:
    for _ in range(num_repeats):
        # Bracket only the replay with the two events so the measured interval
        # excludes host-side work such as the NVML queries below.
        start_event.record()
        graph.replay()
        end_event.record()

        # elapsed_time() requires both events to have completed; waiting on the
        # end event also guarantees the replay has finished.
        end_event.synchronize()
        elapsed_time_ms = start_event.elapsed_time(end_event)
        total_time_ms += elapsed_time_ms

        # Collect and save metrics (outside the timed region)
        step_metrics = get_gpu_metrics()
        metrics_per_step.append(step_metrics)

        # Housekeeping between replays; not part of the timed interval.
        torch.cuda.empty_cache()
        gc.collect()

        # Advance the profiler schedule by one step.
        prof.step()

print("Profiling complete! 🚀")

avg_time_per_replay_ms = total_time_ms / num_repeats

print("\n🚀 Benchmark Results:")
print(f"Batch size = {batch_size}, Num Frames = {num_frames}")
print(f"Total time for {num_repeats} replays: {total_time_ms:.2f} ms")
print(f"Average time per replay: {avg_time_per_replay_ms:.2f} ms")

In [None]:
import matplotlib.pyplot as plt

# Pull one series per metric out of the per-replay snapshots.
# 'memory_utilization_percent' is the PyTorch-allocated share of device memory,
# as computed in get_gpu_metrics().
sm_utilization = [snapshot['sm_utilization_percent'] for snapshot in metrics_per_step]
mem_utilization = [snapshot['memory_utilization_percent'] for snapshot in metrics_per_step]
bandwidth_utilization = [
    snapshot['memory_bandwidth_utilization_percent'] for snapshot in metrics_per_step
]

# Draw all three series on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
for series, label in (
    (sm_utilization, 'SM Utilization (%)'),
    (mem_utilization, 'Memory Utilization (%)'),
    (bandwidth_utilization, 'Memory Bandwidth Usage (%)'),
):
    ax.plot(series, label=label)

ax.set_xlabel('Replay Step')
ax.set_ylabel('Percentage (%)')
ax.set_title('GPU Utilization Metrics Over Steps')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# List the profiler output directory (long format, hidden files included, oldest first).
! ls -lart {profile_logdir}

In [None]:
#%load_ext tensorboard

In [None]:
#%tensorboard --logdir {profile_logdir}