From 42e6d51a17a9bf26cd19107a0eb81e14508415d6 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 15 Aug 2022 16:45:04 +0000
Subject: [PATCH 1/7] [Half precision] Make sure half-precision is correct

---
 src/diffusers/models/embeddings.py            |  6 ++---
 src/diffusers/models/resnet.py                |  8 ++++--
 src/diffusers/models/unet_2d.py               |  5 ++++
 src/diffusers/models/unet_2d_condition.py     |  8 ++++--
 .../pipeline_stable_diffusion.py              | 27 ++++++++++++++-----
 src/diffusers/schedulers/scheduling_ddim.py   | 10 ++++---
 6 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index 8b52859e972e..8d1052173e66 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -32,10 +32,10 @@ def get_timestep_embedding(
     assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
 
     half_dim = embedding_dim // 2
+    exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32)
+    exponent = exponent / (half_dim - downscale_freq_shift)
 
-    emb_coeff = -math.log(max_period) / (half_dim - downscale_freq_shift)
-    emb = torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
-    emb = torch.exp(emb * emb_coeff)
+    emb = torch.exp(exponent).to(device=timesteps.device)
     emb = timesteps[:, None].float() * emb[None, :]
 
     # scale embeddings
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index 98244261d1cc..15cf6e26a955 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -331,7 +331,9 @@ def __init__(
     def forward(self, x, temb, hey=False):
         h = x
 
-        h = self.norm1(h)
+        # make sure hidden states is in float32
+        # when running in half-precision
+        h = self.norm1(h.float()).type(h.dtype)
         h = self.nonlinearity(h)
 
         if self.upsample is not None:
@@ -347,7 +349,9 @@ def forward(self, x, temb, hey=False):
             temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
             h = h + temb
 
-        h = self.norm2(h)
+        # make sure hidden states is in float32
+        # when running in half-precision
+        h = self.norm2(h.float()).type(h.dtype)
         h = self.nonlinearity(h)
 
         h = self.dropout(h)
diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index 6203d76f2586..6b020ea932a5 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -132,6 +132,9 @@ def forward(
         elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
             timesteps = timesteps[None].to(sample.device)
 
+        # broadcast to batch dimension
+        timesteps = timesteps.broadcast_to(sample.shape[0])
+
         t_emb = self.time_proj(timesteps)
         emb = self.time_embedding(t_emb)
 
@@ -166,6 +169,8 @@ def forward(
                 sample = upsample_block(sample, res_samples, emb)
 
         # 6. post-process
+        # make sure hidden states is in float32
+        # when running in half-precision
         sample = self.conv_norm_out(sample)
         sample = self.conv_act(sample)
         sample = self.conv_out(sample)
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index a39223811a00..25c4e37d8a6d 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -133,6 +133,9 @@ def forward(
         elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
             timesteps = timesteps[None].to(sample.device)
 
+        # broadcast to batch dimension
+        timesteps = timesteps.broadcast_to(sample.shape[0])
+
         t_emb = self.time_proj(timesteps)
         emb = self.time_embedding(t_emb)
 
@@ -172,8 +175,9 @@ def forward(
                 sample = upsample_block(hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples)
 
         # 6. post-process
-
-        sample = self.conv_norm_out(sample)
+        # make sure hidden states is in float32
+        # when running in half-precision
+        sample = self.conv_norm_out(sample.float()).type(sample.dtype)
         sample = self.conv_act(sample)
         sample = self.conv_out(sample)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 0f309625ae44..53a882265c75 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -50,7 +50,13 @@ def __call__(
         self.text_encoder.to(torch_device)
 
         # get prompt text embeddings
-        text_input = self.tokenizer(prompt, padding=True, truncation=True, return_tensors="pt")
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
         text_embeddings = self.text_encoder(text_input.input_ids.to(torch_device))[0]
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
@@ -74,25 +80,32 @@ def __call__(
         latents = torch.randn(
             (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
             generator=generator,
+            device=torch_device,
         )
-        latents = latents.to(torch_device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_kwargs = {}
+        extra_forward_kwargs = {}
         if accepts_eta:
-            extra_kwargs["eta"] = eta
-
-        self.scheduler.set_timesteps(num_inference_steps)
+            extra_forward_kwargs["eta"] = eta
 
         for t in tqdm(self.scheduler.timesteps):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
 
             # predict the noise residual
+            t = t + 1
             noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
 
             # perform guidance
@@ -101,7 +114,7 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs)["prev_sample"]
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_forward_kwargs)["prev_sample"]
 
         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index ed76873f8a96..d23d4fff304c 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -59,6 +59,7 @@ def __init__(
         trained_betas=None,
         timestep_values=None,
         clip_sample=True,
+        do_neg_alpha_one=True,
         tensor_format="pt",
     ):
 
@@ -75,7 +76,7 @@ def __init__(
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
-        self.one = np.array(1.0)
+        self.negative_alpha_cumprod = np.array(1.0) if do_neg_alpha_one else self.alphas_cumprod[0]
 
         # setable values
         self.num_inference_steps = None
@@ -86,7 +87,7 @@ def __init__(
 
     def _get_variance(self, timestep, prev_timestep):
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.negative_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
@@ -94,11 +95,12 @@ def _get_variance(self, timestep, prev_timestep):
 
         return variance
 
-    def set_timesteps(self, num_inference_steps):
+    def set_timesteps(self, num_inference_steps, offset=0):
         self.num_inference_steps = num_inference_steps
         self.timesteps = np.arange(
             0, self.config.num_train_timesteps, self.config.num_train_timesteps // self.num_inference_steps
         )[::-1].copy()
+        self.timesteps += offset
         self.set_format(tensor_format=self.tensor_format)
 
     def step(
@@ -126,7 +128,7 @@ def step(
 
         # 2. compute alphas, betas
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.negative_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
 
         # 3. compute predicted original sample from predicted noise also called

From 4667928023eb3b8bdaf31781faf6317d8bbc8e2a Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 15 Aug 2022 18:45:53 +0200
Subject: [PATCH 2/7] Update src/diffusers/models/unet_2d.py

---
 src/diffusers/models/unet_2d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index 6b020ea932a5..db4c33690c9d 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -171,7 +171,7 @@ def forward(
         # 6. post-process
         # make sure hidden states is in float32
         # when running in half-precision
-        sample = self.conv_norm_out(sample)
+        sample = self.conv_norm_out(sample.float()).type(sample.dtype)
         sample = self.conv_act(sample)
         sample = self.conv_out(sample)
 

From 760a0710cb03dfa38240c70b782392fd782af310 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 15 Aug 2022 18:47:28 +0200
Subject: [PATCH 3/7] Update
 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py      | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 53a882265c75..e08d876c18a9 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -105,7 +105,6 @@ def __call__(
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
 
             # predict the noise residual
-            t = t + 1
             noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
 
             # perform guidance

From f3d19e1cf03940e1e2c7cbb9402e40225c943de5 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 15 Aug 2022 16:57:04 +0000
Subject: [PATCH 4/7] correct some tests

---
 tests/test_modeling_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index 7c8e9be63560..8b72779f5a99 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -37,6 +37,7 @@
     PNDMScheduler,
     ScoreSdeVePipeline,
     ScoreSdeVeScheduler,
+    StableDiffusionPipeline,
     UNet2DModel,
     VQModel,
 )
@@ -45,8 +46,6 @@
 from diffusers.testing_utils import floats_tensor, slow, torch_device
 from diffusers.training_utils import EMAModel
 
-from ..src.diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline
-
 
 torch.backends.cuda.matmul.allow_tf32 = False
 

From c7743d526929a441d27d67c9ac3330ad63105e05 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 16 Aug 2022 09:58:53 +0200
Subject: [PATCH 5/7] Apply suggestions from code review

Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index e08d876c18a9..a5d09abd88bf 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -96,9 +96,9 @@ def __call__(
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_forward_kwargs = {}
+        extra_step_kwargs = {}
         if accepts_eta:
-            extra_forward_kwargs["eta"] = eta
+            extra_step_kwargs["eta"] = eta
 
         for t in tqdm(self.scheduler.timesteps):
             # expand the latents if we are doing classifier free guidance
@@ -113,7 +113,7 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_forward_kwargs)["prev_sample"]
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)["prev_sample"]
 
         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents

From b30c8c7c5ee36dbbbff8bd74a823ce60037a2902 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 16 Aug 2022 08:29:08 +0000
Subject: [PATCH 6/7] finalize

---
 src/diffusers/schedulers/scheduling_ddim.py | 13 ++++---
 tests/test_modeling_utils.py                | 38 ++++++++++++++-------
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index d23d4fff304c..d513fa9d5c7f 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -59,7 +59,7 @@ def __init__(
         trained_betas=None,
         timestep_values=None,
         clip_sample=True,
-        do_neg_alpha_one=True,
+        clip_alpha_at_one=True,
         tensor_format="pt",
     ):
 
@@ -76,7 +76,12 @@ def __init__(
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
-        self.negative_alpha_cumprod = np.array(1.0) if do_neg_alpha_one else self.alphas_cumprod[0]
+
+        # At every step in ddim, we are looking into the previous alphas_cumprod
+        # For the final step, there is no previous alphas_cumprod because we are already at 0
+        # `clip_alpha_at_one` decides whether we set this paratemer simply to one or
+        # whether we use the final alpha of the "non-previous" one.
+        self.final_alpha_cumprod = np.array(1.0) if clip_alpha_at_one else self.alphas_cumprod[0]
 
         # setable values
         self.num_inference_steps = None
@@ -87,7 +92,7 @@ def __init__(
 
     def _get_variance(self, timestep, prev_timestep):
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.negative_alpha_cumprod
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
@@ -128,7 +133,7 @@ def step(
 
         # 2. compute alphas, betas
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.negative_alpha_cumprod
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
         beta_prod_t = 1 - alpha_prod_t
 
         # 3. compute predicted original sample from predicted noise also called
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index 8b72779f5a99..ea75bf7de8de 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -666,7 +666,7 @@ def test_output_pretrained(self):
 
         output_slice = output[0, -1, -3:, -3:].flatten()
         # fmt: off
-        expected_output_slice = torch.tensor([-4.0078e-01, -3.8304e-04, -1.2681e-01, -1.1462e-01,  2.0095e-01, 1.0893e-01, -8.8248e-02, -3.0361e-01, -9.8646e-03])
+        expected_output_slice = torch.tensor([-4.0078e-01, -3.8304e-04, -1.2681e-01, -1.1462e-01, 2.0095e-01, 1.0893e-01, -8.8248e-02, -3.0361e-01, -9.8646e-03])
         # fmt: on
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
 
@@ -841,35 +841,46 @@ def test_ldm_text2img_fast(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     @slow
+    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is suppused to run on GPU")
     def test_stable_diffusion(self):
-        ldm = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1-diffusers")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1-diffusers")
 
         prompt = "A painting of a squirrel eating a burger"
-        generator = torch.manual_seed(0)
-        image = ldm([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="numpy")[
-            "sample"
-        ]
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np")
+
+        image = output["sample"]
 
         image_slice = image[0, -3:, -3:, -1]
 
         # TODO: update the expected_slice
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.9256, 0.9340, 0.8933, 0.9361, 0.9113, 0.8727, 0.9122, 0.8745, 0.8099])
+        expected_slice = np.array([0.8983, 0.9198, 0.9107, 0.8959, 0.9157, 0.9199, 0.9236, 0.9311, 0.8891])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     @slow
-    def test_stable_diffusion_fast(self):
-        ldm = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1-diffusers")
+    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is suppused to run on GPU")
+    def test_stable_diffusion_fast_ddim(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1-diffusers")
+
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            clip_alpha_at_one=False,
+        )
+        sd_pipe.scheduler = scheduler
 
         prompt = "A painting of a squirrel eating a burger"
-        generator = torch.manual_seed(0)
-        image = ldm([prompt], generator=generator, num_inference_steps=1, output_type="numpy")["sample"]
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")["sample"]
 
         image_slice = image[0, -3:, -3:, -1]
 
         # TODO: update the expected_slice
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344])
+        expected_slice = np.array([0.8364, 0.8308, 0.8678, 0.8391, 0.8325, 0.8678, 0.8373, 0.8596, 0.8697])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     @slow
@@ -887,6 +898,9 @@ def test_score_sde_ve_pipeline(self):
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 256, 256, 3)
+        import ipdb
+
+        ipdb.set_trace()
         expected_slice = np.array([0.64363, 0.5868, 0.3031, 0.2284, 0.7409, 0.3216, 0.25643, 0.6557, 0.2633])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 

From 387a6b0b6f0d88c5d3840662083d4439afd5ca90 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 16 Aug 2022 08:41:09 +0000
Subject: [PATCH 7/7] finish

---
 tests/test_modeling_utils.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index f45277a8d27c..894a4294d664 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -847,14 +847,17 @@ def test_stable_diffusion(self):
 
         prompt = "A painting of a squirrel eating a burger"
         generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np")
+        with torch.autocast("cuda"):
+            output = sd_pipe(
+                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np"
+            )
 
         image = output["sample"]
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.8983, 0.9198, 0.9107, 0.8959, 0.9157, 0.9199, 0.9236, 0.9311, 0.8891])
+        expected_slice = np.array([0.898, 0.9194, 0.91, 0.8955, 0.915, 0.919, 0.9233, 0.9307, 0.8887])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     @slow
@@ -873,14 +876,15 @@ def test_stable_diffusion_fast_ddim(self):
 
         prompt = "A painting of a squirrel eating a burger"
         generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
-        
+
+        with torch.autocast("cuda"):
+            output = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
         image = output["sample"]
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.8364, 0.8308, 0.8678, 0.8391, 0.8325, 0.8678, 0.8373, 0.8596, 0.8697])
+        expected_slice = np.array([0.8354, 0.83, 0.866, 0.838, 0.8315, 0.867, 0.836, 0.8584, 0.869])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     @slow
@@ -898,9 +902,7 @@ def test_score_sde_ve_pipeline(self):
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 256, 256, 3)
-        import ipdb
 
-        ipdb.set_trace()
         expected_slice = np.array([0.64363, 0.5868, 0.3031, 0.2284, 0.7409, 0.3216, 0.25643, 0.6557, 0.2633])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2