In [1]:
from diffusers import FluxPipeline, FluxTransformer2DModel
from transformers import T5EncoderModel
import torch, time, os, subprocess, gc

def flush():
    gc.collect()
    torch.cuda.empty_cache()

prompt = """
Skeuomorphism,gobo lighting,long exposure, photorealistic close-up of an iridescent hummingbird hovering mid-air, drinking nectar from a bioluminescent flower, a dewdrop clings precariously to a spiderweb woven of pure silver. land art
"""

width = 1536
height = 512
guidance_scale = 2.5
num_inference_steps = 20

t1_start, dtype, model_id = time.time(), torch.bfloat16, "black-forest-labs/FLUX.1-dev"
float8_storage_dtype = torch.float8_e4m3fn #or float8_e5m2

t5_encoder = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2", torch_dtype=dtype).to("cuda")
text_encoder = FluxPipeline.from_pretrained(model_id, text_encoder_2=t5_encoder, transformer=None, vae=None, torch_dtype=dtype).to("cuda") 

with torch.no_grad():
    prompt_embeds, pooled_prompt_embeds, _ = text_encoder.encode_prompt(prompt=prompt, prompt_2=prompt, max_sequence_length=512)

embeddings_prompt_embeds = prompt_embeds.detach().cpu()
embeddings_pooled_prompt_embeds = pooled_prompt_embeds.detach().cpu()

del text_encoder
del t5_encoder
flush()
print(f"Encoded {time.time() - t1_start:.2f} secs")

t2_start = time.time()
transformer = FluxTransformer2DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=dtype).to("cuda")
transformer.enable_layerwise_casting(storage_dtype=float8_storage_dtype)
flush() 

pipeline = FluxPipeline.from_pretrained(model_id, torch_dtype=dtype, transformer=transformer, text_encoder_2=None, text_encoder=None, tokenizer_2=None, tokenizer=None).to("cuda")

prompt_embeds_gen = embeddings_prompt_embeds.to("cuda").to(dtype)
pooled_prompt_embeds_gen = embeddings_pooled_prompt_embeds.to("cuda").to(dtype)
flush()

gen_start_time = time.time()
with torch.no_grad():
    output = pipeline(prompt_embeds=prompt_embeds_gen, pooled_prompt_embeds=pooled_prompt_embeds_gen, width=width, height=height, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps)

image, timestamp = output.images[0], time.time()
filename = f"flux_cfg{guidance_scale}_steps{num_inference_steps}_layerwise_{float8_storage_dtype}_{timestamp}.png"
image.save(filename); os.startfile(filename) # Windows specific

result = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used,temperature.gpu,utilization.gpu', '--format=csv,noheader'], encoding='utf-8', timeout=1.0)
print(f"Max VRAM Peak: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GBMem used / temp C / utilized: {result}")
print(f"Inference time: {timestamp - gen_start_time:.2f} secs, total time: {time.time() - t2_start:.2f} secs")

del pipeline;del transformer;del prompt_embeds_gen;del pooled_prompt_embeds_gen
flush()



Multiple distributions found for package optimum. Picked distribution: optimum


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Encoded 3.71 secs


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Max VRAM Peak: 22.19 GBMem used / temp C / utilized: 20490 MiB, 54, 64 %

Inference time: 11.99 secs, total time: 20.13 secs
