In [1]:
# --- Model -----------------------------------------------------------------
# Hub identifier handed to DiffusionPipeline.from_pretrained in the next cell.
model_id = "THUDM/CogView4-6B"

# --- Prompts ---------------------------------------------------------------
# The triple-quoted prompt intentionally keeps its leading and trailing
# newline; it is passed to encode_prompt exactly as written.
prompt = """
A photorealistic close-up of a single, iridescent hummingbird hovering mid-air, its wings a blur of sapphire and emerald, drinking nectar from a luminous, bioluminescent flower that emits soft, swirling particles of golden light. The background is a hyper-detailed, otherworldly jungle at twilight, with colossal, crystalline trees reflecting a nebula-filled sky. In the foreground, a single dewdrop clings precariously to a spiderweb woven with threads of pure silver. The overall atmosphere should be one of serene magic and vibrant detail plus perfect clarity, sharp focus, intricate detail, expressive style, a rich deep aesthetic, overall epic composition.
"""
negative_prompt = "cartoon, anime, poor quality, poor clarity, ugly, jpeg artifacts, cropped, lowres, error, out of frame, watermark"

# --- Sampling parameters (forwarded unchanged to the pipeline call) ---------
guidance_scale = 4
num_inference_steps = 20
width, height = 1536, 640  # output image size in pixels

In [2]:
import diffusers
import torch, time, gc, os, subprocess

def flush() -> None:
    """Release memory between pipeline stages.

    Runs a Python garbage-collection pass and then asks PyTorch to empty its
    CUDA caching allocator. Called right after a pipeline object is deleted.
    """
    # Collect first so objects that are no longer referenced are freed
    # before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
def bytes_to_giga_bytes(bytes):
    """Convert a byte count to binary gigabytes (1 GiB = 2**30 bytes).

    Args:
        bytes: Size in bytes (int or float). The name shadows the builtin but
            is kept because it is part of the function's call signature.

    Returns:
        The size as a float number of gibibytes.
    """
    one_gib = 1 << 30  # 1024 * 1024 * 1024
    return bytes / one_gib
# ---------------------------------------------------------------------------
# Two-stage generation: only part of the model is resident on the GPU at a time.
#   Stage 1 loads the pipeline without transformer/VAE and encodes the prompts.
#   Stage 2 loads the pipeline without text encoder/tokenizer and denoises
#           from the precomputed embeddings.
# Reads from the config cell: model_id, prompt, negative_prompt,
# guidance_scale, num_inference_steps, width, height.
# ---------------------------------------------------------------------------
device = "cuda"
dtype = torch.bfloat16
time_start = time.time()

# Stage 1: prompt encoding. transformer=None / vae=None skip those components.
emb_prompts = diffusers.DiffusionPipeline.from_pretrained(model_id, transformer=None, vae=None, torch_dtype=dtype).to(device)

with torch.no_grad():
    prompt_embeds, negative_prompt_embeds = emb_prompts.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)

# Drop the text-encoder pipeline before the next stage is loaded.
del emb_prompts
flush()
print(f"   ... Prompts embeded.. {time.time() - time_start:.2f} seconds, Max vram: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated()):.2f} GB\n   ... Prompt shape {prompt_embeds.shape} ... Negative shape {negative_prompt_embeds.shape}\n   ... Generating Image..")

# Stage 2: image generation. text_encoder=None / tokenizer=None skip those
# components, since the embeddings are already computed.
time_gen = time.time()
pipeline = diffusers.DiffusionPipeline.from_pretrained(model_id, text_encoder=None, tokenizer=None, torch_dtype=dtype).to(device)

with torch.inference_mode():
    image = pipeline(
        prompt_embeds=prompt_embeds.to(device).to(dtype),
        negative_prompt_embeds=negative_prompt_embeds.to(device).to(dtype),
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
    ).images[0]

del pipeline
flush()

# Persist the result FIRST: the steps below are optional reporting and must
# never be able to abort the cell before the image is on disk.
filename = f"cog4_cfg_{guidance_scale}_steps_{num_inference_steps}_{int(time.time())}.png"
image.save(filename)

# GPU telemetry is best-effort. nvidia-smi may be missing (OSError), exit
# non-zero (CalledProcessError) or exceed the 1 s timeout (TimeoutExpired);
# the latter two are both subclasses of subprocess.SubprocessError.
try:
    result = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used,temperature.gpu,utilization.gpu', '--format=csv,noheader'], encoding='utf-8', timeout=1.0)
except (OSError, subprocess.SubprocessError) as exc:
    # Keep the trailing newline that nvidia-smi output normally carries so the
    # summary below is laid out the same way.
    result = f"unavailable ({type(exc).__name__})\n"

# NOTE: max_memory_allocated() is never reset between stages, so this figure
# is the peak across both stages, not of stage 2 alone.
print(f"   ... Generated in {time.time() - time_gen:.2f} secs, mem/temp/use: {result}   ... Max mem allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated()):.2f} GB")

# os.startfile exists only on Windows; elsewhere just report where the file is.
if hasattr(os, "startfile"):
    os.startfile(filename)
else:
    print(f"   ... Saved to {os.path.abspath(filename)}")

Loading pipeline components...:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

   ... Prompts embeded.. 18.45 seconds, Max vram: 16.47 GB
   ... Prompt shape torch.Size([1, 144, 4096]) ... Negative shape torch.Size([1, 32, 4096])
   ... Generating Image..


Loading pipeline components...:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

   ... Generated in 29.43 secs, mem/temp/use: 1791 MiB, 54, 100 %
   ... Max mem allocated: 17.09 GB
