From 9159ed7b29e5e662f5717a82cf540dcce7adf10d Mon Sep 17 00:00:00 2001 From: KimbingNg Date: Thu, 27 Nov 2025 15:40:12 +0800 Subject: [PATCH 1/3] Fixes #12673. Wrong default_stream is used, leading to wrong execution order when record_stream is enabled. --- src/diffusers/hooks/group_offloading.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py index 38f291f5203c..e3900f169721 100644 --- a/src/diffusers/hooks/group_offloading.py +++ b/src/diffusers/hooks/group_offloading.py @@ -153,27 +153,27 @@ def _pinned_memory_tensors(self): finally: pinned_dict = None - def _transfer_tensor_to_device(self, tensor, source_tensor): + def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream): tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking) if self.record_stream: - tensor.data.record_stream(self._torch_accelerator_module.current_stream()) + tensor.data.record_stream(default_stream) - def _process_tensors_from_modules(self, pinned_memory=None): + def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None): for group_module in self.modules: for param in group_module.parameters(): source = pinned_memory[param] if pinned_memory else param.data - self._transfer_tensor_to_device(param, source) + self._transfer_tensor_to_device(param, source, default_stream) for buffer in group_module.buffers(): source = pinned_memory[buffer] if pinned_memory else buffer.data - self._transfer_tensor_to_device(buffer, source) + self._transfer_tensor_to_device(buffer, source, default_stream) for param in self.parameters: source = pinned_memory[param] if pinned_memory else param.data - self._transfer_tensor_to_device(param, source) + self._transfer_tensor_to_device(param, source, default_stream) for buffer in self.buffers: source = pinned_memory[buffer] if pinned_memory else buffer.data - 
self._transfer_tensor_to_device(buffer, source) + self._transfer_tensor_to_device(buffer, source, default_stream) def _onload_from_disk(self): if self.stream is not None: @@ -208,12 +208,13 @@ def _onload_from_memory(self): self.stream.synchronize() context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream) + default_stream = self._torch_accelerator_module.current_stream() with context: if self.stream is not None: with self._pinned_memory_tensors() as pinned_memory: - self._process_tensors_from_modules(pinned_memory) + self._process_tensors_from_modules(pinned_memory, default_stream=default_stream) else: - self._process_tensors_from_modules(None) + self._process_tensors_from_modules(None, default_stream=default_stream) def _offload_to_disk(self): # TODO: we can potentially optimize this code path by checking if the _all_ the desired From 4d39e8ca6b79cc3705847f91e3f23198a862d6f7 Mon Sep 17 00:00:00 2001 From: KimbingNg Date: Thu, 27 Nov 2025 15:53:13 +0800 Subject: [PATCH 2/3] update --- src/diffusers/hooks/group_offloading.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py index e3900f169721..11b8dfd15222 100644 --- a/src/diffusers/hooks/group_offloading.py +++ b/src/diffusers/hooks/group_offloading.py @@ -208,13 +208,14 @@ def _onload_from_memory(self): self.stream.synchronize() context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream) - default_stream = self._torch_accelerator_module.current_stream() + default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None + with context: if self.stream is not None: with self._pinned_memory_tensors() as pinned_memory: self._process_tensors_from_modules(pinned_memory, default_stream=default_stream) else: - self._process_tensors_from_modules(None, default_stream=default_stream) + 
self._process_tensors_from_modules(None) def _offload_to_disk(self): # TODO: we can potentially optimize this code path by checking if the _all_ the desired From e56f51176c77fb353ac5565e11b3806943a90e1e Mon Sep 17 00:00:00 2001 From: KimbingNg Date: Thu, 27 Nov 2025 21:08:24 +0800 Subject: [PATCH 3/3] Update test --- tests/models/test_modeling_common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 6f4c3d544b45..034a1add6359 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -1814,9 +1814,6 @@ def _run_forward(model, inputs_dict): torch.manual_seed(0) return model(**inputs_dict)[0] - if self.__class__.__name__ == "AutoencoderKLCosmosTests" and offload_type == "leaf_level": - pytest.skip("With `leaf_type` as the offloading type, it fails. Needs investigation.") - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() torch.manual_seed(0) model = self.model_class(**init_dict)