huggingface
diff --git a/‎src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py‎
Lines changed: 18 additions & 11 deletions b/‎src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py‎
Lines changed: 18 additions & 11 deletions
diff --git a/‎src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py‎
Lines changed: 2 additions & 2 deletions b/‎src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/diffusers/schedulers/scheduling_ddim_inverse.py‎
Lines changed: 86 additions & 20 deletions b/‎src/diffusers/schedulers/scheduling_ddim_inverse.py‎
Lines changed: 86 additions & 20 deletions
@@ -992,7 +992,7 @@ def generate_mask(
         )
 
         # 4. Preprocess image
-        image = preprocess(image).repeat_interleave(num_maps_per_mask, dim=0)
+        image = self.image_processor.preprocess(image).repeat_interleave(num_maps_per_mask, dim=0)
 
         # 5. Set timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -1176,7 +1176,7 @@ def invert(
         do_classifier_free_guidance = guidance_scale > 1.0
 
         # 3. Preprocess image
-        image = preprocess(image)
+        image = self.image_processor.preprocess(image)
 
         # 4. Prepare latent variables
         num_images_per_prompt = 1
@@ -1201,9 +1201,9 @@ def invert(
 
         # 7. Noising loop where we obtain the intermediate noised latent image for each timestep.
         num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
-        inverted_latents = [latents.detach().clone()]
-        with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
-            for i, t in enumerate(timesteps[:-1]):
+        inverted_latents = []
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
@@ -1270,7 +1270,7 @@ def invert(
         # 8. Post-processing
         image = None
         if decode_latents:
-            image = self.decode_latents(latents.flatten(0, 1).detach())
+            image = self.decode_latents(latents.flatten(0, 1))
 
         # 9. Convert to PIL.
         if decode_latents and output_type == "pil":
@@ -1291,7 +1291,7 @@ def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
-        image_latents: torch.FloatTensor = None,
+        image_latents: Union[torch.FloatTensor, PIL.Image.Image] = None,
         inpaint_strength: Optional[float] = 0.8,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
@@ -1447,7 +1447,13 @@ def __call__(
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, inpaint_strength, device)
 
         # 6. Preprocess image latents
-        image_latents = preprocess(image_latents)
+        if isinstance(image_latents, list) and any(isinstance(l, torch.Tensor) and l.ndim == 5 for l in image_latents):
+            image_latents = torch.cat(image_latents).detach()
+        elif isinstance(image_latents, torch.Tensor) and image_latents.ndim == 5:
+            image_latents = image_latents.detach()
+        else:
+            image_latents = self.image_processor.preprocess(image_latents).detach()
+
         latent_shape = (self.vae.config.latent_channels, latent_height, latent_width)
         if image_latents.shape[-3:] != latent_shape:
             raise ValueError(
@@ -1458,8 +1464,9 @@ def __call__(
             image_latents = image_latents.reshape(batch_size, len(timesteps), *latent_shape)
         if image_latents.shape[:2] != (batch_size, len(timesteps)):
             raise ValueError(
-                f"`image_latents` must have batch size {batch_size} with latent images from {len(timesteps)} timesteps, "
-                f"but has batch size {image_latents.shape[0]} with latent images from {image_latents.shape[1]} timesteps."
+                f"`image_latents` must have batch size {batch_size} with latent images from {len(timesteps)}"
+                f" timesteps, but has batch size {image_latents.shape[0]} with latent images from"
+                f" {image_latents.shape[1]} timesteps."
             )
         image_latents = image_latents.transpose(0, 1).repeat_interleave(num_images_per_prompt, dim=1)
         image_latents = image_latents.to(device=device, dtype=prompt_embeds.dtype)
@@ -1468,7 +1475,7 @@ def __call__(
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
         # 8. Denoising loop
-        latents = image_latents[0].detach().clone()
+        latents = image_latents[0].clone()
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
 
@@ -1183,8 +1183,8 @@ def invert(
 
         # 7. Denoising loop where we obtain the cross-attention maps.
         num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
-        with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
-            for i, t in enumerate(timesteps[:-1]):
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
 
@@ -90,6 +90,43 @@ def alpha_bar_fn(t):
     return torch.tensor(betas, dtype=torch.float32)
 
 
+# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
+def rescale_zero_terminal_snr(betas):
+    """
+    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
+
+
+    Args:
+        betas (`torch.FloatTensor`):
+            the betas that the scheduler is being initialized with.
+
+    Returns:
+        `torch.FloatTensor`: rescaled betas with zero terminal SNR
+    """
+    # Convert betas to alphas_bar_sqrt
+    alphas = 1.0 - betas
+    alphas_cumprod = torch.cumprod(alphas, dim=0)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= alphas_bar_sqrt_T
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
+    alphas = torch.cat([alphas_bar[0:1], alphas])
+    betas = 1 - alphas
+
+    return betas
+
+
 class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
     """
     DDIMInverseScheduler is the reverse scheduler of [`DDIMScheduler`].
@@ -126,9 +163,19 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
             https://imagen.research.google/video/paper.pdf)
+        timestep_spacing (`str`, default `"leading"`):
+            The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample
+            Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information.
+        rescale_betas_zero_snr (`bool`, default `False`):
+            whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf).
+            This can enable the model to generate very bright and dark samples instead of limiting it to samples with
+            medium brightness. Loosely related to
+            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
     """
 
     order = 1
+    ignore_for_config = ["kwargs"]
+    _deprecated_kwargs = ["set_alpha_to_zero"]
 
     @register_to_config
     def __init__(
@@ -139,18 +186,20 @@ def __init__(
         beta_schedule: str = "linear",
         trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
         clip_sample: bool = True,
-        set_alpha_to_zero: bool = True,
+        set_alpha_to_one: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
         clip_sample_range: float = 1.0,
+        timestep_spacing: str = "leading",
+        rescale_betas_zero_snr: bool = False,
         **kwargs,
     ):
-        if kwargs.get("set_alpha_to_one", None) is not None:
+        if kwargs.get("set_alpha_to_zero", None) is not None:
             deprecation_message = (
-                "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead."
+                "The `set_alpha_to_zero` argument is deprecated. Please use `set_alpha_to_one` instead."
             )
-            deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False)
-            set_alpha_to_zero = kwargs["set_alpha_to_one"]
+            deprecate("set_alpha_to_zero", "1.0.0", deprecation_message, standard_warn=False)
+            set_alpha_to_one = kwargs["set_alpha_to_zero"]
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
         elif beta_schedule == "linear":
@@ -166,15 +215,19 @@ def __init__(
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
 
+        # Rescale for zero SNR
+        if rescale_betas_zero_snr:
+            self.betas = rescale_zero_terminal_snr(self.betas)
+
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
 
         # At every step in inverted ddim, we are looking into the next alphas_cumprod
-        # For the final step, there is no next alphas_cumprod, and the index is out of bounds
-        # `set_alpha_to_zero` decides whether we set this parameter simply to zero
+        # For the initial step, there is no current alphas_cumprod, and the index is out of bounds
+        # `set_alpha_to_one` decides whether we set this parameter simply to one
         # in this case, self.step() just output the predicted noise
-        # or whether we use the final alpha of the "non-previous" one.
-        self.final_alpha_cumprod = torch.tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1]
+        # or whether we use the initial alpha used in training the diffusion model.
+        self.initial_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
 
         # standard deviation of the initial noise distribution
         self.init_noise_sigma = 1.0
@@ -215,12 +268,29 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
             )
 
         self.num_inference_steps = num_inference_steps
-        step_ratio = self.config.num_train_timesteps // self.num_inference_steps
-        # creates integer timesteps by multiplying by ratio
-        # casting to int to avoid issues when num_inference_step is power of 3
-        timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64)
+
+        # "leading" and "trailing" correspond to the annotations in Table 2 of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates timesteps by stepping down from num_train_timesteps by ratio, then reversing
+            # rounding before casting to int since ratio comes from true division and may be fractional
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)[::-1]).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'."
+            )
+
+        # Roll timesteps array by one to reflect reversed origin and destination semantics for each step
+        timesteps = np.roll(timesteps, 1)
+        timesteps[0] = int(timesteps[1] - step_ratio)
         self.timesteps = torch.from_numpy(timesteps).to(device)
-        self.timesteps += self.config.steps_offset
 
     def step(
         self,
@@ -237,12 +307,8 @@ def step(
 
         # 2. compute alphas, betas
         # change original implementation to exactly match noise levels for analogous forward process
-        alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = (
-            self.alphas_cumprod[prev_timestep]
-            if prev_timestep < self.config.num_train_timesteps
-            else self.final_alpha_cumprod
-        )
+        alpha_prod_t = self.alphas_cumprod[timestep] if timestep >= 0 else self.initial_alpha_cumprod
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep]
 
         beta_prod_t = 1 - alpha_prod_t