In [1]:
from diffusers import DiffusionPipeline, AutoencoderKL, ControlNetModel, StableDiffusionXLControlNetPipeline
import torch
from time import perf_counter
from safetensors.torch import load_file, save_file
from safetensors import safe_open
import psutil
import gc
import os

def preload_safetensors_to_cpu(safetensors_path):
    """Load a ``.safetensors`` checkpoint into CPU memory.

    Args:
        safetensors_path: Path of the ``.safetensors`` file to read.

    Returns:
        The object returned by ``safetensors.torch.load_file`` for that file
        when called with ``device="cpu"``.
    """
    return load_file(safetensors_path, device="cpu")

def get_device_memory_usage(device=0):
    """Print the total, free and used memory of one CUDA device, in GB.

    The figures come from ``torch.cuda.mem_get_info``; "used" is derived as
    total minus free.

    Args:
        device: Device handed to ``torch.cuda.mem_get_info`` (defaults to 0).

    Returns:
        None. The report is written to stdout.
    """
    bytes_per_gb = 1024 ** 3
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    free_gb = free_bytes / bytes_per_gb
    total_gb = total_bytes / bytes_per_gb

    report_lines = [
        f"GPU {device} Memory Usage:",
        f"  Total Memory: {total_gb:.2f} GB",
        f"  Free Memory: {free_gb:.2f} GB",
        f"  Used Memory: {total_gb - free_gb:.2f} GB",
    ]
    print("\n".join(report_lines))

def get_cpu_memory_usage(verbose=False):
    """Print host (CPU) memory usage in GB.

    Args:
        verbose: When False (the default) only the system-wide used memory is
            printed, which is the same output as calling the function with no
            arguments has always produced. When True, the system total, the
            available memory and the resident set size (RSS) of the current
            process are printed as well, instead of having to uncomment
            print statements by hand.

    Returns:
        None. The report is written to stdout.
    """
    bytes_per_gb = 1024 ** 3
    memory_info = psutil.virtual_memory()

    print("CPU Memory Usage:")
    if verbose:
        print(f"  Total Memory: {memory_info.total / bytes_per_gb:.2f} GB")
    print(f"  Used Memory: {memory_info.used / bytes_per_gb:.2f} GB")
    if verbose:
        # The "Free" line reports psutil's `available` field, not its `free` field.
        print(f"  Free Memory: {memory_info.available / bytes_per_gb:.2f} GB")
        # The process is only queried when its RSS is actually going to be shown.
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        print(f"  RSS Memory: {rss_bytes / bytes_per_gb:.2f} GB")

def get_gpu_memory_usage(verbose=False):
    """Print this process's CUDA memory figures from PyTorch, in GB.

    Args:
        verbose: When False (the default) only the reserved memory is printed,
            which is the same output as calling the function with no arguments
            has always produced. When True, the device's total memory and the
            allocated memory are printed as well, instead of having to
            uncomment print statements by hand.

    Returns:
        None. The report is written to stdout.
    """
    bytes_per_gb = 1024 ** 3
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb

    total_gb = allocated_gb = None
    if verbose:
        # Read the total from the current device so it describes the same
        # device as the allocated/reserved figures (previously hard-coded to 0).
        current_device = torch.cuda.current_device()
        device_props = torch.cuda.get_device_properties(current_device)
        total_gb = device_props.total_memory / bytes_per_gb
        allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb

    # All values are gathered before printing so a failing query leaves no partial report.
    print("GPU Memory Usage:")
    if verbose:
        print(f"  Total Memory: {total_gb:.2f} GB")
        print(f"  Allocated Memory: {allocated_gb:.2f} GB")
    print(f"  Reserved Memory: {reserved_gb:.2f} GB")

# Baseline snapshot before any model is loaded: host RAM first, then the
# PyTorch CUDA figures, then the device-wide figures for GPU 0.
for report_usage in (get_cpu_memory_usage, get_gpu_memory_usage, get_device_memory_usage):
    report_usage()

  from .autonotebook import tqdm as notebook_tqdm


CPU Memory Usage:
  Used Memory: 17.50 GB
GPU Memory Usage:
  Reserved Memory: 0.00 GB
GPU 0 Memory Usage:
  Total Memory: 14.58 GB
  Free Memory: 6.96 GB
  Used Memory: 7.62 GB


In [6]:
import gc
# Run a garbage-collection pass first.
gc.collect()

# Then clear the GPU memory cache, but only when CUDA is available.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    torch.cuda.empty_cache()

In [3]:
# Time how long it takes to put the SDXL base pipeline on the GPU:
# the VAE is loaded on its own, then handed to the pipeline.
start = perf_counter()

vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix",
    cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
    torch_dtype=torch.float16,
)
vae = vae.to("cuda")

pipe = DiffusionPipeline.from_pretrained(
    "/data1/workspace/javeyqiu/models/stable-diffusion-xl-base-1.0",
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

endtime = perf_counter() - start
print("加载一个 SD-XL 基座模型 所需时间为: ", endtime, "s")

# Memory snapshot with the base pipeline loaded.
get_cpu_memory_usage()
get_gpu_memory_usage()

Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00,  8.49it/s]


加载一个 SD-XL 基座模型 所需时间为:  3.2940734419971704 s
memory allocated: (6.087009429931641, 6.575650691986084, 14.58062744140625)


In [None]:
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
# Path of the fp16 depth-ControlNet checkpoint file in the local hub cache.
controlnet_path = "/data1/workspace/javeyqiu/models/huggingface/hub/models--diffusers--controlnet-depth-sdxl-1.0/snapshots/17bb97973f29801224cd66f192c5ffacf82648b4/diffusion_pytorch_model.fp16.safetensors"

# Build the ControlNet from that single checkpoint file.
# NOTE(review): no torch_dtype is passed here, unlike the from_pretrained calls
# elsewhere in this notebook; confirm the resulting dtype is the intended one.
controlnet_depth = ControlNetModel.from_single_file(controlnet_path)

从硬盘读取

In [6]:
# Time one full round trip for the depth ControlNet: load it from the hub
# cache, move it to the GPU, then move it back to the CPU.
start = perf_counter()

controlnet_depth = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
controlnet_depth = controlnet_depth.to("cuda")
get_cpu_memory_usage()  # host memory while the model sits on the GPU

controlnet_depth = controlnet_depth.to('cpu')

# Note: the memory report above runs inside the timed region.
endtime = perf_counter() - start
print("增加一个 controlnet pipe所需时间为: ", endtime, "s")
get_cpu_memory_usage()  # host memory with the model back on the CPU

# Drop the model, collect garbage and clear the CUDA cache, then report host
# memory once more to see how much is given back.
del controlnet_depth
gc.collect()
torch.cuda.empty_cache()

get_cpu_memory_usage()

  Used Memory: 6.03 GB
  RSS Memory': 0.75 GB
增加一个 controlnet pipe所需时间为:  1.7664242559112608 s
  Used Memory: 8.28 GB
  RSS Memory': 3.00 GB
  Used Memory: 6.10 GB
  RSS Memory': 0.82 GB


In [2]:
from collections import OrderedDict
# Two insertion-ordered caches: one for models kept on the GPU, one for
# models kept on the CPU.
gpu_cache = OrderedDict()
cpu_cache = OrderedDict()

# Load three copies of the depth ControlNet onto the GPU, reporting memory
# after each one.
for i in range(3):
    controlnet_depth = ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0",
        cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
    )
    controlnet_depth = controlnet_depth.to("cuda")
    gpu_cache[f'diffusers/controlnet-depth-sdxl-1.0-{i}'] = controlnet_depth

    get_cpu_memory_usage()
    get_gpu_memory_usage()

CPU Memory Usage:
  Used Memory: 6.92 GB
GPU Memory Usage:
  Reserved Memory: 2.47 GB
CPU Memory Usage:
  Used Memory: 7.79 GB
GPU Memory Usage:
  Reserved Memory: 4.92 GB
CPU Memory Usage:
  Used Memory: 8.66 GB
GPU Memory Usage:
  Reserved Memory: 7.37 GB


In [3]:
# For each new copy: put it on the GPU, then move the oldest GPU-cached model
# to the CPU cache so the number of models on the GPU stays constant.
for i in range(4, 10):
    controlnet_depth = ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0",
        cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
    )
    controlnet_depth = controlnet_depth.to("cuda")
    gpu_cache[f'diffusers/controlnet-depth-sdxl-1.0-{i}'] = controlnet_depth

    # popitem(last=False) removes the entry that was inserted first.
    model_name, model = gpu_cache.popitem(last=False)
    model.to('cpu')
    cpu_cache[model_name] = model

    get_cpu_memory_usage()
    get_gpu_memory_usage()

CPU Memory Usage:
  Used Memory: 11.03 GB
GPU Memory Usage:
  Reserved Memory: 9.81 GB
CPU Memory Usage:
  Used Memory: 13.48 GB
GPU Memory Usage:
  Reserved Memory: 9.81 GB
CPU Memory Usage:
  Used Memory: 15.87 GB
GPU Memory Usage:
  Reserved Memory: 9.84 GB
CPU Memory Usage:
  Used Memory: 18.23 GB
GPU Memory Usage:
  Reserved Memory: 9.84 GB
CPU Memory Usage:
  Used Memory: 20.65 GB
GPU Memory Usage:
  Reserved Memory: 9.87 GB
CPU Memory Usage:
  Used Memory: 23.06 GB
GPU Memory Usage:
  Reserved Memory: 9.87 GB


In [5]:
# Steady-state cache rotation: each iteration loads one model onto the GPU,
# moves the oldest GPU model to the CPU cache, and drops the oldest CPU model.
for i in range(20, 30):
    start = perf_counter()

    # SECURITY: this file is a pickled model object (presumably written by
    # `torch.save(controlnet_depth, ...)`). Unpickling can execute arbitrary
    # code, so only load files you created yourself.
    # `weights_only=False` is explicit because PyTorch >= 2.6 defaults to
    # `weights_only=True`, which refuses to unpickle a whole model object.
    controlnet_depth = torch.load(
        './stable-diffusion-xl-base-1.0/controlnet_depth.bin',
        weights_only=False,
    ).to("cuda")
    gpu_cache[f'diffusers/controlnet-depth-sdxl-1.0-{i}'] = controlnet_depth

    # Move the oldest GPU-cached model to the CPU cache.
    model_name, model = gpu_cache.popitem(last=False)
    model.to('cpu')
    cpu_cache[model_name] = model

    # Drop the oldest CPU-cached model. The popped pair is discarded without
    # binding it, so no name keeps the model alive and `_` (which IPython uses
    # for the last cell output) is not overwritten.
    cpu_cache.popitem(last=False)
    gc.collect()

    endtime = perf_counter() - start
    print("增加一个 controlnet pipe所需时间为: ", endtime, "s")
    get_cpu_memory_usage()
    get_gpu_memory_usage()

增加一个 controlnet pipe所需时间为:  1.1605215673334897 s
CPU Memory Usage:
  Used Memory: 26.02 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.1453118841163814 s
CPU Memory Usage:
  Used Memory: 26.02 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.1303139389492571 s
CPU Memory Usage:
  Used Memory: 26.02 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.1489184750244021 s
CPU Memory Usage:
  Used Memory: 26.02 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.2040403131395578 s
CPU Memory Usage:
  Used Memory: 26.23 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.2347728172317147 s
CPU Memory Usage:
  Used Memory: 26.58 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.1448842212557793 s
CPU Memory Usage:
  Used Memory: 26.61 GB
GPU Memory Usage:
  Reserved Memory: 9.89 GB
增加一个 controlnet pipe所需时间为:  1.242498458828777 s


In [5]:
# The file loaded below was presumably produced once by this (now disabled) line:
# torch.save(controlnet_depth, './stable-diffusion-xl-base-1.0/controlnet_depth.bin')

# SECURITY: the file is a pickled model object; unpickling can execute
# arbitrary code, so only load files you created yourself.
# `weights_only=False` is explicit because PyTorch >= 2.6 defaults to
# `weights_only=True`, which refuses to unpickle a whole model object.
controlnet_depth = torch.load(
    './stable-diffusion-xl-base-1.0/controlnet_depth.bin',
    weights_only=False,
).to("cuda")

In [9]:
# Report host memory first, then the PyTorch CUDA figures.
for report_usage in (get_cpu_memory_usage, get_gpu_memory_usage):
    report_usage()

CPU Memory Usage:
  Total Memory: 62.55 GB
  Used Memory: 10.63 GB
  Free Memory: 50.81 GB
GPU Memory Usage:
  Total Memory: 14.58 GB
  Allocated Memory: 2.36 GB
  Reserved Memory: 2.56 GB


In [None]:
# Load the depth ControlNet onto the GPU ten times, dropping it right away
# each time, and print host memory after every round.
for i in range(10):
    controlnet_depth = ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0",
        cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
    )
    controlnet_depth = controlnet_depth.cuda()

    del controlnet_depth
    gc.collect()
    get_cpu_memory_usage()

In [6]:
import gc
# Garbage-collect before starting the "load from memory" experiments.
gc.collect()

# Clear the GPU memory cache when a CUDA device is present.
has_cuda = torch.cuda.is_available()
if has_cuda:
    torch.cuda.empty_cache()

从内存读取

In [3]:
# Load the depth ControlNet and place it on the GPU.
controlnet_depth = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
controlnet_depth = controlnet_depth.to("cuda")

# Also read the same checkpoint file into CPU memory.
controlnet_depth_path = "/data1/workspace/javeyqiu/models/huggingface/hub/models--diffusers--controlnet-depth-sdxl-1.0/snapshots/17bb97973f29801224cd66f192c5ffacf82648b4/diffusion_pytorch_model.fp16.safetensors"
controlnet_depth_weights = preload_safetensors_to_cpu(controlnet_depth_path)

In [7]:
# Start a fresh plain-dict cache of GPU-resident ControlNets, keyed by a short name.
gpu_cache = {}
gpu_cache['depth'] = controlnet_depth

In [8]:
# Time how long it takes to derive a ControlNet pipeline from the already
# loaded base pipeline, using the depth ControlNet held in `gpu_cache`.
start = perf_counter()
# controlnet_depth.load_state_dict(controlnet_depth_weights)
pipe_depth = StableDiffusionXLControlNetPipeline.from_pipe(
    pipe,
    controlnet=gpu_cache['depth'],
)
endtime = perf_counter()-start
print("增加一个 controlnet pipe所需时间为: ", endtime, "s")

# `get_memory_usage()` is not defined anywhere in this notebook, so the
# original line raised NameError on a fresh kernel. The tuple is built inline
# instead: (allocated, reserved, device total) in GB. That is presumably what
# the removed helper returned, judging by the recorded output of this cell.
bytes_per_gb = 1024 ** 3
memory_usage_gb = (
    torch.cuda.memory_allocated() / bytes_per_gb,
    torch.cuda.memory_reserved() / bytes_per_gb,
    torch.cuda.get_device_properties(0).total_memory / bytes_per_gb,
)
print("memory allocated: {}".format(memory_usage_gb))

增加一个 controlnet pipe所需时间为:  0.2635579979978502 s
memory allocated: (8.458267211914062, 8.953990459442139, 14.58062744140625)


In [9]:
# Time swapping the ControlNet: load the canny model, cache it on the GPU and
# build a pipeline from `pipe_depth` that uses it.
start = perf_counter()

# NOTE(review): unlike the depth loads, no variant="fp16" / use_safetensors is
# passed here; confirm that is intended.
controlnet_canny = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0",
    cache_dir='/data1/workspace/javeyqiu/models/huggingface/hub',
    torch_dtype=torch.float16,
)
controlnet_canny = controlnet_canny.to("cuda")
gpu_cache['canny'] = controlnet_canny

# `pipe` is rebound here: from this point on it is the canny ControlNet pipeline.
pipe = StableDiffusionXLControlNetPipeline.from_pipe(
    pipe_depth,
    controlnet=gpu_cache['canny'],
)

endtime = perf_counter() - start
print("更换一个 controlnet pipe所需时间为: ", endtime, "s")

更换一个 controlnet pipe所需时间为:  49.76634782506153 s


In [27]:
import gc
# Final cleanup: garbage-collect, then clear the GPU memory cache if CUDA is
# available.
gc.collect()

cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.empty_cache()