From 9b599c82f5b3f0b2289b4de3807391e558dc3982 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Somoza?=
Date: Mon, 24 Jun 2024 03:44:23 -0400
Subject: [PATCH 1/3] initial fix

---
 .../pipeline_stable_diffusion_3_img2img.py             | 2 +-
 .../schedulers/scheduling_flow_match_euler_discrete.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
index 2555224b8c23..f888fb6c1de3 100644
--- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py
@@ -852,7 +852,7 @@ def __call__(
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
-        latent_timestep = timesteps[:1].repeat(batch_size * num_inference_steps)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
         # 5. Prepare latent variables
         if latents is None:
diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
index 08e9d4f3a95d..02a16bc36f5a 100644
--- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
@@ -129,7 +129,14 @@ def scale_noise(
         if self.step_index is None:
             self._init_step_index(timestep)
 
-        sigma = self.sigmas[self.step_index]
+        sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype)
+        schedule_timesteps = self.timesteps.to(sample.device)
+        step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep]
+        sigma = sigmas[step_indices].flatten()
+
+        while len(sigma.shape) < len(noise.shape):
+            sigma = sigma.unsqueeze(-1)
+
         sample = sigma * noise + (1.0 - sigma) * sample
 
         return sample

From aef08ed4d3f09f2947262d4f05e9516fe4c2b913 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Somoza?=
Date: Wed, 26 Jun 2024 18:32:37 -0400
Subject: [PATCH 2/3] apply suggestion

---
 .../schedulers/scheduling_flow_match_euler_discrete.py | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
index 02a16bc36f5a..ba50e16d9a51 100644
--- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
@@ -129,12 +129,29 @@ def scale_noise(
         if self.step_index is None:
             self._init_step_index(timestep)
 
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype)
-        schedule_timesteps = self.timesteps.to(sample.device)
-        step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep]
-        sigma = sigmas[step_indices].flatten()
+
+        if sample.device.type == "mps" and torch.is_floating_point(timestep):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
+            timestep = timestep.to(sample.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(sample.device)
+            timestep = timestep.to(sample.device)
+
+        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timestep.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timestep.shape[0]
 
-        while len(sigma.shape) < len(noise.shape):
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(sample.shape):
             sigma = sigma.unsqueeze(-1)
 
         sample = sigma * noise + (1.0 - sigma) * sample

From 1cceb36c36d58b5661a435c458ffb739cbbd953b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Somoza?=
Date: Wed, 26 Jun 2024 22:38:35 -0400
Subject: [PATCH 3/3] delete step_index line

---
 .../schedulers/scheduling_flow_match_euler_discrete.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
index ba50e16d9a51..83ce63981abd 100644
--- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py
@@ -126,9 +126,6 @@ def scale_noise(
             `torch.FloatTensor`:
                 A scaled input sample.
         """
-        if self.step_index is None:
-            self._init_step_index(timestep)
-
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype)
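
Note (not part of the patch series above): the sketch below mirrors the per-sample sigma lookup that the patched scale_noise performs, written as a standalone function so the batching behaviour is easy to see. The name scale_noise_sketch, the toy 4-step schedule, and the plain begin_index/step_index arguments are illustrative assumptions; the real scheduler keeps that state on self and resolves indices with index_for_timestep.

import torch


def scale_noise_sketch(sample, timestep, noise, sigmas, schedule_timesteps, begin_index=None, step_index=None):
    # Mirror of the patched logic: resolve one sigma per batch element, not a single scalar.
    sigmas = sigmas.to(device=sample.device, dtype=sample.dtype)
    schedule_timesteps = schedule_timesteps.to(sample.device)
    timestep = timestep.to(sample.device)

    if begin_index is None:
        # Training, or a pipeline that does not set a begin index: look each timestep up in the schedule.
        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timestep]
    elif step_index is not None:
        # Called after the first denoising step (the inpainting case in the patch).
        step_indices = [step_index] * timestep.shape[0]
    else:
        # Called before the first denoising step to build the initial latent (the img2img case).
        step_indices = [begin_index] * timestep.shape[0]

    sigma = sigmas[step_indices].flatten()
    while sigma.ndim < sample.ndim:
        sigma = sigma.unsqueeze(-1)
    return sigma * noise + (1.0 - sigma) * sample


# Toy flow-match schedule: sigma goes 1.0 -> 0.25 over 4 steps, timesteps = sigma * 1000.
sigmas = torch.linspace(1.0, 0.0, 5)[:-1]
timesteps = sigmas * 1000
sample = torch.randn(2, 4, 8, 8)           # batch of 2, e.g. num_images_per_prompt=2
noise = torch.randn_like(sample)
latent_timestep = timesteps[:1].repeat(2)  # one timestep per image, as in the pipeline fix
noisy = scale_noise_sketch(sample, latent_timestep, noise, sigmas, timesteps, begin_index=0)
print(noisy.shape)  # torch.Size([2, 4, 8, 8])

With a batch of two images, latent_timestep carries two entries and two sigmas are gathered and broadcast over the latent shape, which is the case the batch_size * num_images_per_prompt change in the pipeline exercises.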