-
Notifications
You must be signed in to change notification settings - Fork 7.1k
cpu_offload: remove all hooks before offload #7448
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
11fff3f
ffda7e5
7b3d1bf
783b8c9
5127c99
0a574b4
3a93fbe
225f2d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -371,9 +371,7 @@ def module_is_sequentially_offloaded(module): | |
| if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): | ||
| return False | ||
|
|
||
| return hasattr(module, "_hf_hook") and not isinstance( | ||
| module._hf_hook, (accelerate.hooks.CpuOffload, accelerate.hooks.AlignDevicesHook) | ||
| ) | ||
| return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook) | ||
|
|
||
| def module_is_offloaded(module): | ||
| if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"): | ||
|
|
@@ -939,6 +937,16 @@ def _execution_device(self): | |
| return torch.device(module._hf_hook.execution_device) | ||
| return self.device | ||
|
|
||
def remove_all_hooks(self):
    r"""
    Detaches the accelerate hooks installed by `enable_sequential_cpu_offload` or `enable_model_cpu_offload`.

    Every entry of `self.components` that is a `torch.nn.Module` and carries an `_hf_hook` attribute has its hook
    removed. Sub-modules are only visited when the installed hook is an `accelerate.hooks.AlignDevicesHook`.
    Afterwards `self._all_hooks` is reset to an empty list.
    """
    for component in self.components.values():
        # Only torch modules can carry accelerate hooks; skip everything else.
        if not isinstance(component, torch.nn.Module):
            continue
        if not hasattr(component, "_hf_hook"):
            continue

        installed_hook = component._hf_hook
        # An `AlignDevicesHook` is taken as the sign of sequential offload, in which case the
        # removal has to recurse into the sub-modules as well.
        recurse_into_children = isinstance(installed_hook, accelerate.hooks.AlignDevicesHook)
        accelerate.hooks.remove_hook_from_module(component, recurse=recurse_into_children)

    self._all_hooks = []
|
yiyixuxu marked this conversation as resolved.
|
||
|
|
||
| def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): | ||
| r""" | ||
| Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared | ||
|
|
@@ -963,6 +971,8 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t | |
| else: | ||
| raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") | ||
|
|
||
| self.remove_all_hooks() | ||
|
|
||
| torch_device = torch.device(device) | ||
| device_index = torch_device.index | ||
|
|
||
|
|
@@ -979,15 +989,13 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t | |
| device = torch.device(f"{device_type}:{self._offload_gpu_id}") | ||
| self._offload_device = device | ||
|
|
||
| if self.device.type != "cpu": | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. This checks if the pipeline has been moved to GPU before |
||
| self.to("cpu", silence_dtype_warnings=True) | ||
| device_mod = getattr(torch, self.device.type, None) | ||
| if hasattr(device_mod, "empty_cache") and device_mod.is_available(): | ||
| device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) | ||
| self.to("cpu", silence_dtype_warnings=True) | ||
| device_mod = getattr(torch, device.type, None) | ||
| if hasattr(device_mod, "empty_cache") and device_mod.is_available(): | ||
| device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) | ||
|
|
||
| all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} | ||
|
|
||
| self._all_hooks = [] | ||
| hook = None | ||
| for model_str in self.model_cpu_offload_seq.split("->"): | ||
| model = all_model_components.pop(model_str, None) | ||
|
|
@@ -1021,11 +1029,6 @@ def maybe_free_model_hooks(self): | |
| # `enable_model_cpu_offload` has not be called, so silently do nothing | ||
| return | ||
|
|
||
| for hook in self._all_hooks: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a question. Why is hook removal here not functionally equivalent to the one defined in What is
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My question too. Maybe the recursive removal?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
pipe.enable_model_cpu_offload() # -> this install hooks to the models
out = pipe(...) # removed the hooks and installed new hooks (inside the maybe_free_model_hooks method)
pipe_img2img =different_pipeline_class(**pipe.components)
pipe.enable_model_cpu_offload() # install hooks again (without removing the existing ones)
Here I'm moving the work of removing hooks to the beginning of each cpu_offload method; and inside |
||
| # offload model and remove hook from model | ||
| hook.offload() | ||
| hook.remove() | ||
|
|
||
| # make sure the model is in the same state as before calling it | ||
| self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda")) | ||
|
|
||
|
|
@@ -1048,6 +1051,7 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un | |
| from accelerate import cpu_offload | ||
| else: | ||
| raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") | ||
| self.remove_all_hooks() | ||
|
|
||
| torch_device = torch.device(device) | ||
| device_index = torch_device.index | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1101,6 +1101,98 @@ def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4): | |
| f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}", | ||
| ) | ||
|
|
||
@unittest.skipIf(
    torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
    reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
)
def test_cpu_offload_forward_pass_twice(self, expected_max_diff=2e-4):
    """Check that enabling model CPU offload a second time does not change the pipeline output.

    The pipeline is run once after `enable_model_cpu_offload()`, then offload is enabled again and the
    pipeline is run a second time. The two outputs must agree within `expected_max_diff`, every
    offloadable module must sit on the CPU, and every hook found on those modules must be a `CpuOffload`.
    """
    # NOTE(review): a reviewer asked whether this could be split into two tests, one for model offload
    # and one for sequential offload.
    import accelerate

    generator_device = "cpu"
    components = self.get_dummy_components()
    pipe = self.pipeline_class(**components)

    for component in pipe.components.values():
        if hasattr(component, "set_default_attn_processor"):
            component.set_default_attn_processor()

    pipe.set_progress_bar_config(disable=None)

    pipe.enable_model_cpu_offload()
    inputs = self.get_dummy_inputs(generator_device)
    output_with_offload = pipe(**inputs)[0]

    pipe.enable_model_cpu_offload()
    inputs = self.get_dummy_inputs(generator_device)
    output_with_offload_twice = pipe(**inputs)[0]

    max_diff = np.abs(to_np(output_with_offload) - to_np(output_with_offload_twice)).max()
    self.assertLess(
        max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results"
    )

    offloaded_modules = [
        v
        for k, v in pipe.components.items()
        if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
    ]

    # The message must be the second argument of the assertion; wrapping the call and the message
    # in a tuple silently drops the message.
    not_offloaded = [v for v in offloaded_modules if v.device.type != "cpu"]
    self.assertFalse(not_offloaded, f"Not offloaded: {not_offloaded}")

    # Inspect the hook stored on the module (`_hf_hook`), not the module itself: a module is never
    # an instance of `CpuOffload`.
    modules_with_wrong_hook = [
        v
        for v in offloaded_modules
        if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload)
    ]
    self.assertFalse(modules_with_wrong_hook, f"Not installed correct hook: {modules_with_wrong_hook}")
|
|
||
@unittest.skipIf(
    torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
    reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
)
def test_sequential_offload_forward_pass_twice(self, expected_max_diff=2e-4):
    """Check that enabling sequential CPU offload a second time does not change the pipeline output.

    The pipeline is run once after `enable_sequential_cpu_offload()`, then offload is enabled again and
    the pipeline is run a second time. The two outputs must agree within `expected_max_diff`, every
    offloadable module must report the `meta` device, and every hook found on those modules must be an
    `AlignDevicesHook`.
    """
    import accelerate

    generator_device = "cpu"
    components = self.get_dummy_components()
    pipe = self.pipeline_class(**components)

    for component in pipe.components.values():
        if hasattr(component, "set_default_attn_processor"):
            component.set_default_attn_processor()

    pipe.set_progress_bar_config(disable=None)

    pipe.enable_sequential_cpu_offload()
    inputs = self.get_dummy_inputs(generator_device)
    output_with_offload = pipe(**inputs)[0]

    # Fixed typo: the original called the non-existent `pipe.nable_sequential_cpu_offload()`.
    pipe.enable_sequential_cpu_offload()
    inputs = self.get_dummy_inputs(generator_device)
    output_with_offload_twice = pipe(**inputs)[0]

    max_diff = np.abs(to_np(output_with_offload) - to_np(output_with_offload_twice)).max()
    self.assertLess(
        max_diff,
        expected_max_diff,
        "running sequential offloading second time should not affect the inference results",
    )

    offloaded_modules = [
        v
        for k, v in pipe.components.items()
        if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload
    ]

    # The message must be the second argument of the assertion; wrapping the call and the message
    # in a tuple silently drops the message.
    not_offloaded = [v for v in offloaded_modules if v.device.type != "meta"]
    self.assertFalse(not_offloaded, f"Not offloaded: {not_offloaded}")

    def _is_align_devices_hook(hook):
        # NOTE(review): accelerate appears to wrap stacked hooks in a `SequentialHook` (exposing them
        # as `.hooks`); accept that wrapper when every member is an `AlignDevicesHook` - confirm
        # against the installed accelerate version.
        if isinstance(hook, accelerate.hooks.SequentialHook):
            return all(isinstance(h, accelerate.hooks.AlignDevicesHook) for h in hook.hooks)
        return isinstance(hook, accelerate.hooks.AlignDevicesHook)

    # Inspect the hook stored on the module (`_hf_hook`), not the module itself: a module is never
    # an instance of `AlignDevicesHook`.
    modules_with_wrong_hook = [
        v for v in offloaded_modules if hasattr(v, "_hf_hook") and not _is_align_devices_hook(v._hf_hook)
    ]
    self.assertFalse(modules_with_wrong_hook, f"Not installed correct hook: {modules_with_wrong_hook}")
|
|
||
| @unittest.skipIf( | ||
| torch_device != "cuda" or not is_xformers_available(), | ||
| reason="XFormers attention is only available with CUDA and `xformers` installed", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`enable_sequential_cpu_offload` installs `AlignDevicesHook`, so I think it is a mistake here. It did not affect the results because, when this type of hook is installed, you would not be able to move the models to other devices anyway, i.e.,
with this change, this script will now throw the
`ValueError` we intended to throw. Before this change, you would get a
`NotImplementedError: Cannot copy out of meta tensor; no data!` when it tried to move the models to GPU.