In [8]:
from diffusers import StableDiffusionXLPipeline, AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection
import torch
from safetensors.torch import load_file, save_file
from safetensors import safe_open
from time import perf_counter
import psutil
import gc

def preload_safetensors_to_cpu(safetensors_path):
    """Read a ``.safetensors`` checkpoint into CPU memory.

    :param safetensors_path: path of the checkpoint file to read.
    :return: whatever ``safetensors.torch.load_file`` returns for that file
        when asked to place the tensors on the ``"cpu"`` device.
    """
    # Ask safetensors to materialise the tensors on the CPU.
    return load_file(safetensors_path, device="cpu")

def get_device_memory_usage(device=0):
    """Print total, free and used memory of one CUDA device in GB.

    The free/total byte counts come from ``torch.cuda.mem_get_info(device)``;
    "used" is derived as total minus free. 1 GB is taken as 1024**3 bytes.

    :param device: CUDA device index to query (defaults to 0).
    """
    bytes_per_gb = 1024 ** 3
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)

    # Convert first, then subtract, so "used" matches the printed figures.
    total_gb = total_bytes / bytes_per_gb
    free_gb = free_bytes / bytes_per_gb
    used_gb = total_gb - free_gb

    report_lines = [
        f"GPU {device} Memory Usage:",
        f"  Total Memory: {total_gb:.2f} GB",
        f"  Free Memory: {free_gb:.2f} GB",
        f"  Used Memory: {used_gb:.2f} GB",
    ]
    print("\n".join(report_lines))

def get_cpu_memory_usage():
    """Print total, used and available host RAM in GB (1 GB = 1024**3 bytes).

    Values are read from ``psutil.virtual_memory()``. The line labelled
    "Free Memory" shows the ``available`` field, not the ``free`` field.
    """
    bytes_per_gb = 1024 ** 3
    vm = psutil.virtual_memory()

    report_lines = [
        "CPU Memory Usage:",
        f"  Total Memory: {vm.total / bytes_per_gb:.2f} GB",
        f"  Used Memory: {vm.used / bytes_per_gb:.2f} GB",
        f"  Free Memory: {vm.available / bytes_per_gb:.2f} GB",
    ]
    print("\n".join(report_lines))

def get_gpu_memory_usage(device=None):
    """Print the PyTorch allocator statistics for one CUDA device in GB.

    Reports the device's total memory together with the memory currently
    allocated and reserved through ``torch.cuda`` (1 GB = 1024**3 bytes).

    All three figures are read for the same device. Previously the
    allocated/reserved counters were taken from the current device while the
    total was always taken from device 0, so the report could mix two GPUs.

    :param device: CUDA device index to report on. ``None`` (the default)
        means the current CUDA device.
    """
    if device is None:
        device = torch.cuda.current_device()

    bytes_per_gb = 1024 ** 3
    gpu_memory_allocated = torch.cuda.memory_allocated(device) / bytes_per_gb
    gpu_memory_reserved = torch.cuda.memory_reserved(device) / bytes_per_gb
    gpu_memory_total = torch.cuda.get_device_properties(device).total_memory / bytes_per_gb

    print("GPU Memory Usage:")
    print(f"  Total Memory: {gpu_memory_total:.2f} GB")
    print(f"  Allocated Memory: {gpu_memory_allocated:.2f} GB")
    print(f"  Reserved Memory: {gpu_memory_reserved:.2f} GB")

# Example usage: print the host-RAM report, then the allocator report,
# then the device-level report, in that order.
for _print_report in (get_cpu_memory_usage, get_gpu_memory_usage, get_device_memory_usage):
    _print_report()

CPU Memory Usage:
  Total Memory: 62.55 GB
  Used Memory: 9.94 GB
  Free Memory: 51.51 GB
GPU Memory Usage:
  Total Memory: 14.58 GB
  Allocated Memory: 6.57 GB
  Reserved Memory: 6.84 GB
GPU 0 Memory Usage:
  Total Memory: 14.58 GB
  Free Memory: 7.64 GB
  Used Memory: 6.94 GB


In [2]:
import gc
# Run a garbage-collection pass before clearing the CUDA cache below.
gc.collect()

# Clear the GPU memory cache (skipped when CUDA is not available).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

.safetensors 方式

In [2]:
# Load the SDXL base model: each component individually, then the pipeline
# assembled from those components. The whole sequence is timed.
start = perf_counter()

_sdxl_base_dir = '/data1/workspace/javeyqiu/models/stable-diffusion-xl-base-1.0'
# Options shared by every from_pretrained call on the base checkpoint.
_sdxl_fp16_options = dict(
    use_safetensors=True,
    torch_dtype=torch.float16,
    variant='fp16',
)

text_encoder = CLIPTextModel.from_pretrained(
    _sdxl_base_dir, subfolder='text_encoder', **_sdxl_fp16_options
).to('cuda')
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    _sdxl_base_dir, subfolder='text_encoder_2', **_sdxl_fp16_options
).to('cuda')

print('unet')
unet = UNet2DConditionModel.from_pretrained(
    _sdxl_base_dir, subfolder='unet', **_sdxl_fp16_options
).to('cuda')

print('vae')
# The VAE comes from a separate repository, resolved through the local hub cache.
vae = AutoencoderKL.from_pretrained(
    'madebyollin/sdxl-vae-fp16-fix',
    cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
    torch_dtype=torch.float16,
).to('cuda')

print('pipe')
# Hand the already-loaded components to the pipeline constructor.
pipe = StableDiffusionXLPipeline.from_pretrained(
    _sdxl_base_dir,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    unet=unet,
    vae=vae,
    **_sdxl_fp16_options,
).to('cuda')

endtime = perf_counter() - start
print("加载一个 SD-XL 基座模型 所需时间为: ", endtime, "s")
get_cpu_memory_usage()
get_gpu_memory_usage()

unet
vae
pipe


Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 45.49it/s]

加载一个 SD-XL 基座模型 所需时间为:  61.070760504342616 s
CPU Memory Usage:
  Total Memory: 62.55 GB
  Used Memory: 9.47 GB
  Free Memory: 51.98 GB
GPU Memory Usage:
  Total Memory: 14.58 GB
  Allocated Memory: 6.57 GB
  Reserved Memory: 6.84 GB





.bin 加载方式

In [4]:
# Serialize each loaded component object as a whole with torch.save
# (the module object itself is passed, not its state_dict).
# NOTE(review): assumes ./stable-diffusion-xl-base-1.0/ already exists — confirm.
torch.save(vae, './stable-diffusion-xl-base-1.0/vae.bin')
torch.save(unet, './stable-diffusion-xl-base-1.0/unet.bin')
torch.save(text_encoder, './stable-diffusion-xl-base-1.0/text_encoder.bin')
torch.save(text_encoder_2, './stable-diffusion-xl-base-1.0/text_encoder_2.bin')

In [3]:
# Time loading the four components from whole-module .bin files and
# assembling the pipeline from them.
start = perf_counter()

# These .bin files contain entire pickled module objects, not plain tensors.
# Recent PyTorch releases default torch.load to weights_only=True, which
# refuses such files, so full unpickling is requested explicitly.
# WARNING: full unpickling can execute arbitrary code — only load files
# that were produced locally and are trusted.
text_encoder = torch.load('./stable-diffusion-xl-base-1.0/text_encoder.bin', weights_only=False).to("cuda")
text_encoder_2 = torch.load('./stable-diffusion-xl-base-1.0/text_encoder_2.bin', weights_only=False).to("cuda")
unet = torch.load('./stable-diffusion-xl-base-1.0/unet.bin', weights_only=False).to("cuda")
vae = torch.load('./stable-diffusion-xl-base-1.0/vae.bin', weights_only=False).to("cuda")

# Hand the already-loaded components to the pipeline constructor.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "/data1/workspace/javeyqiu/models/stable-diffusion-xl-base-1.0",
    unet=unet,
    vae=vae,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    use_safetensors=True,
    torch_dtype=torch.float16,
    variant="fp16"
).to("cuda")
endtime = perf_counter()-start
print("加载一个 SD-XL 基座模型 所需时间为: ", endtime, "s")
get_cpu_memory_usage()
get_gpu_memory_usage()

Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 38.92it/s]


加载一个 SD-XL 基座模型 所需时间为:  2.7703366917558014 s
Max memory allocated: 6.5670599937438965 GB


Zero Copy 方法

In [3]:
from typing import *
import copy
def extract_tensors(m: torch.nn.Module) -> Tuple[torch.nn.Module, List[Dict]]:
    """
    Split a PyTorch model into a weight-less skeleton and its tensors.

    :param m: model to process; it is left unmodified.
    :return: ``(m_copy, tensors)`` where ``m_copy`` is a deep copy of ``m``
        in eval mode with every parameter and buffer set to ``None``, and
        ``tensors`` holds one ``{"params": {...}, "buffers": {...}}`` dict
        per module, in ``m.named_modules()`` order. The values are detached
        ``torch.Tensor`` clones (they are NOT converted to NumPy arrays).
    """
    tensors = []
    for _, module in m.named_modules():
        # Detach before cloning so the copy is never recorded by autograd.
        params = {
            name: param.detach().clone()
            for name, param in module.named_parameters(recurse=False)
        }
        buffers = {
            name: buf.detach().clone()
            for name, buf in module.named_buffers(recurse=False)
        }
        tensors.append({"params": params, "buffers": buffers})

    # Copy the whole model, then strip every parameter and buffer out of the
    # copy. Note that the deep copy itself still duplicates all weights
    # until they are replaced with None below.
    m_copy = copy.deepcopy(m)
    for _, module in m_copy.named_modules():
        # Collect the names first; the module is mutated while stripping.
        tensor_names = (
            [name for name, _ in module.named_parameters(recurse=False)]
            + [name for name, _ in module.named_buffers(recurse=False)]
        )
        for name in tensor_names:
            setattr(module, name, None)

    # Make sure the copy is configured for inference.
    m_copy.eval()
    return m_copy, tensors

In [5]:
# Time splitting each component into a stripped copy plus its tensor list.
# NOTE(review): .cpu() is called on the original model objects; Module.cpu()
# presumably moves them in place, leaving the originals on the CPU after
# this cell — confirm that is intended.
start = perf_counter()
text_encoder_copy, text_encoder_tensors = extract_tensors(text_encoder.cpu())
text_encoder_2_copy, text_encoder_2_tensors = extract_tensors(text_encoder_2.cpu())
unet_copy, unet_tensors = extract_tensors(unet.cpu())
vae_copy, vae_tensors = extract_tensors(vae.cpu())
endtime = perf_counter()-start
print("所需时间为: ", endtime, "s")

所需时间为:  1.1928686192259192 s


In [6]:
# Persist the extracted tensor lists. They are written into
# ./stable-diffusion-xl-base-1.0/ because that is the directory the notebook
# later reads the *_tensors.bin files from; saving into the working
# directory left those reads pointing at files that were never written.
# The directory must already exist.
torch.save(text_encoder_tensors, './stable-diffusion-xl-base-1.0/text_encoder_tensors.bin')
torch.save(text_encoder_2_tensors, './stable-diffusion-xl-base-1.0/text_encoder_2_tensors.bin')
torch.save(unet_tensors, './stable-diffusion-xl-base-1.0/unet_tensors.bin')
torch.save(vae_tensors, './stable-diffusion-xl-base-1.0/vae_tensors.bin')

In [8]:
# Reload the per-component tensor lists from disk.
# NOTE(review): these files must already exist under
# ./stable-diffusion-xl-base-1.0/ — confirm they are written there.
text_encoder_tensors = torch.load('./stable-diffusion-xl-base-1.0/text_encoder_tensors.bin')
text_encoder_2_tensors = torch.load('./stable-diffusion-xl-base-1.0/text_encoder_2_tensors.bin')
unet_tensors = torch.load('./stable-diffusion-xl-base-1.0/unet_tensors.bin')
vae_tensors = torch.load('./stable-diffusion-xl-base-1.0/vae_tensors.bin')

In [12]:
def replace_tensors(m: torch.nn.Module, tensors: List[Dict]):
    """
    Restore the tensors that extract_tensors() stripped out of a
    PyTorch model. The model is modified in place.

    :param m: stripped model whose parameters and buffers are to be set.
    :param tensors: one ``{"params": {...}, "buffers": {...}}`` dict per
        module, in ``m.named_modules()`` order.
    :raises ValueError: if ``tensors`` does not contain exactly one entry
        per module of ``m`` (previously the extra modules or entries were
        silently ignored).
    """
    modules = [module for _, module in m.named_modules()]
    if len(modules) != len(tensors):
        raise ValueError(
            f"model has {len(modules)} modules but {len(tensors)} tensor "
            "dicts were supplied; they must come from the same architecture"
        )

    for module, tensor_dict in zip(modules, tensors):
        # There are separate APIs to set parameters and buffers.
        for name, array in tensor_dict["params"].items():
            module.register_parameter(
                name, torch.nn.Parameter(torch.as_tensor(array)))

        # register_buffer() defaults to persistent=True, which would turn a
        # non-persistent buffer into one that shows up in state_dict().
        # Keep whatever persistence the module already records for the name.
        non_persistent = getattr(module, "_non_persistent_buffers_set", ())
        for name, array in tensor_dict["buffers"].items():
            module.register_buffer(
                name,
                torch.as_tensor(array),
                persistent=name not in non_persistent,
            )

# Time restoring the tensors into each stripped copy and moving it to the GPU.
start = perf_counter()
replace_tensors(text_encoder_copy, text_encoder_tensors)
text_encoder_new = text_encoder_copy.to("cuda")
replace_tensors(text_encoder_2_copy, text_encoder_2_tensors)
text_encoder_2_new = text_encoder_2_copy.to("cuda")
replace_tensors(unet_copy, unet_tensors)
unet_new = unet_copy.to("cuda")
replace_tensors(vae_copy, vae_tensors)
vae_new = vae_copy.to("cuda")
endtime = perf_counter()-start
print("所需时间为: ", endtime, "s")

所需时间为:  0.8135069417767227 s


State_dict 方法

In [16]:
# Time reading the four safetensors checkpoints into CPU memory.
start_time = perf_counter()
text_encoder_path = "/data1/workspace/javeyqiu/models/stable-diffusion-xl-base-1.0/text_encoder/model.fp16.safetensors"
text_encoder_2_path = "/data1/workspace/javeyqiu/models/stable-diffusion-xl-base-1.0/text_encoder_2/model.fp16.safetensors"
# NOTE(review): unlike the two text-encoder paths, this file name has no
# ".fp16" infix — confirm the full-precision UNet file is the intended one.
unet_model_path = "/data1/workspace/javeyqiu/models/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors"
vae_model_path = "/data1/workspace/javeyqiu/models/huggingface/hub/models--madebyollin--sdxl-vae-fp16-fix/snapshots/207b116dae70ace3637169f1ddd2434b91b3a8cd/diffusion_pytorch_model.safetensors"
text_encoder_weights = preload_safetensors_to_cpu(text_encoder_path)
text_encoder_2_weights = preload_safetensors_to_cpu(text_encoder_2_path)
unet_weights = preload_safetensors_to_cpu(unet_model_path)
vae_weights = preload_safetensors_to_cpu(vae_model_path)
endtime = perf_counter()-start_time
print("将 SD-XL 权重加载到系统内存上用时为", endtime, "s")

将 SD-XL 权重加载到系统内存上用时为 0.0871807630173862 s


In [19]:
# Time copying the preloaded state dicts into the already-constructed modules.
# NOTE(review): the printed message speaks of loading into GPU memory;
# presumably that only holds if these modules are on the GPU when this cell
# runs — confirm.
start = perf_counter()
text_encoder.load_state_dict(text_encoder_weights)
text_encoder_2.load_state_dict(text_encoder_2_weights)
unet.load_state_dict(unet_weights)
vae.load_state_dict(vae_weights)
endtime = perf_counter()-start
print("将state_dict从内存中加载到显存上的时间为", endtime, "s")


将state_dict从内存中加载到显存上的时间为 0.21457289392128587 s
