Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions src/diffusers/pipelines/pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,9 +371,7 @@ def module_is_sequentially_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
return False

return hasattr(module, "_hf_hook") and not isinstance(
module._hf_hook, (accelerate.hooks.CpuOffload, accelerate.hooks.AlignDevicesHook)
)
return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`enable_sequential_cpu_offload` installs `AlignDevicesHook`, so I think the original check here was a mistake. It did not affect the results because, once this type of hook is installed, you cannot move the models to other devices anyway.

i.e.,
with this change, this script will now throw the ValueError we intended to throw.

Before this change, you would instead get `NotImplementedError: Cannot copy out of meta tensor; no data!` when it tried to move the models to the GPU.

from diffusers import StableDiffusionPipeline
import torch
from accelerate.hooks import CpuOffload, AlignDevicesHook, remove_hook_from_module

num_inference_steps = 2
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
pipe.enable_sequential_cpu_offload()
pipe.to("cuda")


def module_is_offloaded(module):
if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"):
Expand Down Expand Up @@ -939,6 +937,16 @@ def _execution_device(self):
return torch.device(module._hf_hook.execution_device)
return self.device

def remove_all_hooks(self):
    r"""
    Removes all hooks that were added when using `enable_sequential_cpu_offload` or `enable_model_cpu_offload`.

    Every entry of `self.components` that is a `torch.nn.Module` carrying an `_hf_hook` attribute has that hook
    removed via `accelerate.hooks.remove_hook_from_module`. Afterwards `self._all_hooks` is reset to an empty list.
    """
    for model in self.components.values():
        if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"):
            # An `AlignDevicesHook` on the component is taken as the marker of sequential offload; in that case
            # the removal is requested recursively (`recurse=True`), otherwise only the component itself is
            # un-hooked.
            is_sequential_cpu_offload = isinstance(model._hf_hook, accelerate.hooks.AlignDevicesHook)
            accelerate.hooks.remove_hook_from_module(model, recurse=is_sequential_cpu_offload)
    self._all_hooks = []
Comment thread
yiyixuxu marked this conversation as resolved.

def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
Expand All @@ -963,6 +971,8 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

self.remove_all_hooks()

torch_device = torch.device(device)
device_index = torch_device.index

Expand All @@ -979,15 +989,13 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
device = torch.device(f"{device_type}:{self._offload_gpu_id}")
self._offload_device = device

if self.device.type != "cpu":

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.device.type == "cpu" does not mean all the models are on cpu device, no?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. This checks if the pipeline has been moved to GPU before enable_model_cpu_offload has been called. If this is the case, it moves the whole pipeline to CPU first. But you're right that it's not needed since all components are moved to CPU anyway.

self.to("cpu", silence_dtype_warnings=True)
device_mod = getattr(torch, self.device.type, None)
if hasattr(device_mod, "empty_cache") and device_mod.is_available():
device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
self.to("cpu", silence_dtype_warnings=True)
device_mod = getattr(torch, device.type, None)
if hasattr(device_mod, "empty_cache") and device_mod.is_available():
device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist)

all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}

self._all_hooks = []
hook = None
for model_str in self.model_cpu_offload_seq.split("->"):
model = all_model_components.pop(model_str, None)
Expand Down Expand Up @@ -1021,11 +1029,6 @@ def maybe_free_model_hooks(self):
# `enable_model_cpu_offload` has not be called, so silently do nothing
return

for hook in self._all_hooks:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a question. Why is hook removal here not functionally equivalent to the one defined in remove_all_hooks?

What is hook.remove not accomplishing that leads to the device mismatch?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My question too. Maybe the recursive removal?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. These methods (e.g. `self._all_hooks[0].remove()`) only remove hooks installed by `enable_model_cpu_offload`; they won't remove the hooks installed by `enable_sequential_cpu_offload`.
  2. This code is inside `maybe_free_model_hooks`, which is only called at the end of each inference loop. As soon as it removes the hooks, it immediately calls `enable_model_cpu_offload()` again, which installs a new set of hooks. So if you explicitly call `enable_model_cpu_offload()` on the pipeline again, it will try to install hooks on top of the existing ones and cause the device mismatch error.
pipe.enable_model_cpu_offload()  # -> this install hooks to the models
out = pipe(...)  # removed the hooks and installed new hooks (inside the maybe_free_model_hooks method)
pipe_img2img =different_pipeline_class(**pipe.components) 
pipe.enable_model_cpu_offload() # install hooks again (without removing the existing ones)

Here I'm moving the hook removal to the beginning of each CPU-offload method; inside `maybe_free_model_hooks`, we now only check whether we need to call `enable_model_cpu_offload` again.

# offload model and remove hook from model
hook.offload()
hook.remove()

# make sure the model is in the same state as before calling it
self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))

Expand All @@ -1048,6 +1051,7 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un
from accelerate import cpu_offload
else:
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
self.remove_all_hooks()

torch_device = torch.device(device)
device_index = torch_device.index
Expand Down
92 changes: 92 additions & 0 deletions tests/pipelines/test_pipelines_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,98 @@ def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
)

@unittest.skipIf(
torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
)
def test_cpu_offload_forward_pass_twice(self, expected_max_diff=2e-4):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we split this up into two tests? One for model and one for sequential.

import accelerate

generator_device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)

for component in pipe.components.values():
    if hasattr(component, "set_default_attn_processor"):
        component.set_default_attn_processor()

pipe.set_progress_bar_config(disable=None)

# First run with model CPU offload enabled.
pipe.enable_model_cpu_offload()
inputs = self.get_dummy_inputs(generator_device)
output_with_offload = pipe(**inputs)[0]

# Enabling offload a second time on the same pipeline must not change the output.
pipe.enable_model_cpu_offload()
inputs = self.get_dummy_inputs(generator_device)
output_with_offload_twice = pipe(**inputs)[0]

max_diff = np.abs(to_np(output_with_offload) - to_np(output_with_offload_twice)).max()
self.assertLess(
    max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results"
)
offloaded_modules = [
    v
    for k, v in pipe.components.items()
    if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
]
# The failure message is passed as the `msg` argument of `assertTrue`; previously it sat in a discarded
# tuple next to the call and was never reported.
self.assertTrue(
    all(v.device.type == "cpu" for v in offloaded_modules),
    f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}",
)

offloaded_modules_with_hooks = [v for v in offloaded_modules if hasattr(v, "_hf_hook")]
# Check the type of the installed hook (`v._hf_hook`), not the type of the module itself.
self.assertTrue(
    all(isinstance(v._hf_hook, accelerate.hooks.CpuOffload) for v in offloaded_modules_with_hooks),
    f"Not installed correct hook: {[v for v in offloaded_modules_with_hooks if not isinstance(v._hf_hook, accelerate.hooks.CpuOffload)]}",
)

@unittest.skipIf(
    torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
    reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
)
def test_sequential_offload_forward_pass_twice(self, expected_max_diff=2e-4):
    """
    Enable sequential CPU offload twice on the same pipeline and check that the second call leaves the
    inference result unchanged, that the offloaded components report the `meta` device, and that any hook
    installed on them is an `AlignDevicesHook`.
    """
    import accelerate

    generator_device = "cpu"
    components = self.get_dummy_components()
    pipe = self.pipeline_class(**components)

    for component in pipe.components.values():
        if hasattr(component, "set_default_attn_processor"):
            component.set_default_attn_processor()

    pipe.set_progress_bar_config(disable=None)

    # First run with sequential offload enabled.
    pipe.enable_sequential_cpu_offload()
    inputs = self.get_dummy_inputs(generator_device)
    output_with_offload = pipe(**inputs)[0]

    # Second call on the same pipeline (the original line misspelled the method name as
    # `nable_sequential_cpu_offload`, which raised AttributeError).
    pipe.enable_sequential_cpu_offload()
    inputs = self.get_dummy_inputs(generator_device)
    output_with_offload_twice = pipe(**inputs)[0]

    max_diff = np.abs(to_np(output_with_offload) - to_np(output_with_offload_twice)).max()
    self.assertLess(
        max_diff,
        expected_max_diff,
        "running sequential offloading second time should not affect the inference results",
    )
    offloaded_modules = [
        v
        for k, v in pipe.components.items()
        if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
    ]
    # The failure message is passed as the `msg` argument of `assertTrue`; previously it sat in a discarded
    # tuple next to the call and was never reported.
    self.assertTrue(
        all(v.device.type == "meta" for v in offloaded_modules),
        f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'meta']}",
    )

    offloaded_modules_with_hooks = [v for v in offloaded_modules if hasattr(v, "_hf_hook")]
    # Check the type of the installed hook (`v._hf_hook`), not the type of the module itself.
    self.assertTrue(
        all(isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook) for v in offloaded_modules_with_hooks),
        f"Not installed correct hook: {[v for v in offloaded_modules_with_hooks if not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook)]}",
    )

@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
Expand Down