From fcb578dd5cc6a2b5f0b941b715b40006fc74b280 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 8 Mar 2024 05:45:10 +0000 Subject: [PATCH 1/8] updates --- .../pipeline_stable_cascade_combined.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index cd3592b49ac0..6b317777e408 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -19,7 +19,7 @@ from ...models import StableCascadeUNet from ...schedulers import DDPMWuerstchenScheduler -from ...utils import replace_example_docstring +from ...utils import is_torch_version, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel from .pipeline_stable_cascade import StableCascadeDecoderPipeline @@ -29,9 +29,9 @@ TEXT2IMAGE_EXAMPLE_DOC_STRING = """ Examples: ```py - >>> from diffusions import StableCascadeCombinedPipeline - - >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade-combined", torch_dtype=torch.bfloat16).to( + >>> import torch + >>> from diffusers import StableCascadeCombinedPipeline + >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=torch.bfloat16).to( ... "cuda" ... ) >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" @@ -155,10 +155,8 @@ def __call__( height: int = 512, width: int = 512, prior_num_inference_steps: int = 60, - prior_timesteps: Optional[List[float]] = None, prior_guidance_scale: float = 4.0, num_inference_steps: int = 12, - decoder_timesteps: Optional[List[float]] = None, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, prompt_embeds: Optional[torch.FloatTensor] = None, @@ -253,6 +251,11 @@ def __call__( [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. """ + dtype = self.decoder_pipe.decoder.dtype + if is_torch_version("<", "2.2.0") and dtype == torch.bfloat16: + raise ValueError( + "`StableCascadeCombinedPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype." + ) prior_outputs = self.prior_pipe( prompt=prompt if prompt_embeds is None else None, From 7bc2f92fe63462ae7187204dcf6a743d58be3db6 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 8 Mar 2024 06:09:51 +0000 Subject: [PATCH 2/8] update --- .../source/en/api/pipelines/stable_cascade.md | 36 ++++++++++++------- .../pipeline_stable_cascade_combined.py | 5 ++- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index 37df68bb03fd..e71d6e5e2d4c 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -12,13 +12,13 @@ specific language governing permissions and limitations under the License. # Stable Cascade -This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main -difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this -important? 
The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes. -How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being -encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a -1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the -highly compressed latent space. Previous versions of this architecture, achieved a 16x cost reduction over Stable +This model is built upon the [Würstchen](https://openreview.net/forum?id=gU58d5QeGv) architecture and its main +difference to other models like Stable Diffusion is that it is working at a much smaller latent space. Why is this +important? The smaller the latent space, the **faster** you can run inference and the **cheaper** the training becomes. +How small is the latent space? Stable Diffusion uses a compression factor of 8, resulting in a 1024x1024 image being +encoded to 128x128. Stable Cascade achieves a compression factor of 42, meaning that it is possible to encode a +1024x1024 image to 24x24, while maintaining crisp reconstructions. The text-conditional model is then trained in the +highly compressed latent space. Previous versions of this architecture, achieved a 16x cost reduction over Stable Diffusion 1.5. Therefore, this kind of model is well suited for usages where efficiency is important. Furthermore, all known extensions @@ -30,11 +30,11 @@ The original codebase can be found at [Stability-AI/StableCascade](https://githu Stable Cascade consists of three models: Stage A, Stage B and Stage C, representing a cascade to generate images, hence the name "Stable Cascade". -Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion. -However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a -spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves -a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the -image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible +Stage A & B are used to compress images, similar to what the job of the VAE is in Stable Diffusion. +However, with this setup, a much higher compression of images can be achieved. While the Stable Diffusion models use a +spatial compression factor of 8, encoding an image with resolution of 1024 x 1024 to 128 x 128, Stable Cascade achieves +a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while being able to accurately decode the +image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible for generating the small 24 x 24 latents given a text prompt. ## Uses @@ -53,7 +53,7 @@ Excluded uses are described below. ### Out-of-Scope Use -The model was not trained to be factual or true representations of people or events, +The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model. The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy). 
@@ -63,6 +63,16 @@ The model should not be used in any way that violates Stability AI's [Acceptable - Faces and people in general may not be generated properly. - The autoencoding part of the model is lossy. + + +There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead. + +In order to use the `torch.bfloat16` datatype with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. + +If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`. + + + ## StableCascadeCombinedPipeline diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index 6b317777e408..7e9f05f60ff1 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -31,9 +31,8 @@ ```py >>> import torch >>> from diffusers import StableCascadeCombinedPipeline - >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=torch.bfloat16).to( - ... "cuda" - ... ) + >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16) + >>> pipe.enable_model_cpu_offload() >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> images = pipe(prompt=prompt) ``` From cc61f8248c524aa5d6eface60f49eb69b7589f08 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 8 Mar 2024 06:26:38 +0000 Subject: [PATCH 3/8] update --- .../source/en/api/pipelines/stable_cascade.md | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index e71d6e5e2d4c..60bf4e49c455 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -37,6 +37,44 @@ a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while b image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible for generating the small 24 x 24 latents given a text prompt. 
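For illustration (not part of the patch series), a quick back-of-the-envelope check of the sizes quoted above; the factor of 42 is approximate, and the latent resolution the model actually works with is 24 x 24:

```python
# Spatial sizes implied by the compression factors discussed above (illustrative only).
image_size = 1024

sd_latent = image_size // 8              # Stable Diffusion VAE, factor 8   -> 128 x 128
cascade_latent = round(image_size / 42)  # Stable Cascade Stages A & B, ~42 -> 24 x 24

print(sd_latent, cascade_latent)  # 128 24
```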
+## Usage example + +```python +import torch +from diffusers import ( + StableCascadeDecoderPipeline, + StableCascadePriorPipeline, +) + +prompt = "an image of a shiba inu, donning a spacesuit and helmet" +negative_prompt = "" + +prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16) +decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16) + +prior.enable_model_cpu_offload() +prior_output = prior( + prompt=prompt, + height=1024, + width=1024, + negative_prompt=negative_prompt, + guidance_scale=4.0, + num_images_per_prompt=1, + num_inference_steps=20 +) + +decoder.enable_model_cpu_offload() +decoder_output = decoder( + image_embeddings=prior_output.image_embeddings.to(torch.float16), + prompt=prompt, + negative_prompt=negative_prompt, + guidance_scale=0.0, + output_type="pil", + num_inference_steps=10 +).images[0] +decoder_output.save("cascade.png") +``` + ## Uses ### Direct Use @@ -67,7 +105,7 @@ The model should not be used in any way that violates Stability AI's [Acceptable There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead. -In order to use the `torch.bfloat16` datatype with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. +In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`. From 1d90d753dcbbacabe3444cf0f8fe7177d82da95f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 8 Mar 2024 12:41:26 +0100 Subject: [PATCH 4/8] Update docs/source/en/api/pipelines/stable_cascade.md Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/stable_cascade.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index 60bf4e49c455..3b404884e0e1 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -41,10 +41,7 @@ for generating the small 24 x 24 latents given a text prompt. 
```python import torch -from diffusers import ( - StableCascadeDecoderPipeline, - StableCascadePriorPipeline, -) +from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline prompt = "an image of a shiba inu, donning a spacesuit and helmet" negative_prompt = "" From 2e1cfd34529c99df89711ac80d7d761542408bc0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sat, 9 Mar 2024 05:44:07 +0000 Subject: [PATCH 5/8] update --- .../source/en/api/pipelines/stable_cascade.md | 66 ++++++++++++++++--- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index 60bf4e49c455..0582a2a46541 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -37,6 +37,20 @@ a compression factor of 42. This encodes a 1024 x 1024 image to 24 x 24, while b image. This comes with the great benefit of cheaper training and inference. Furthermore, Stage C is responsible for generating the small 24 x 24 latents given a text prompt. +The Stage C model operates on the small 24 x 24 latents and denoises the latents conditioned on text prompts. The model is also the largest component in the Cascade pipeline and is meant to be used with the `StableCascadePriorPipeline` + +The Stage B and Stage A models are used with the `StableCascadeDecoderPipeline` and are responsible for generating the final image given the small 24 x 24 latents. + + + +There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead. + +In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. + +If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`. 
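For illustration (not part of the patch series), a minimal sketch of that fallback; it reuses the `is_torch_version` helper that the runtime check added to the combined pipeline is built on, and only arguments already shown in the examples below:

```python
import torch
from diffusers import StableCascadeDecoderPipeline
from diffusers.utils import is_torch_version

# bfloat16 in the decoder needs PyTorch >= 2.2.0; on older versions, load the
# bf16 variant weights and have them cast to float16 at load time instead.
decoder_dtype = torch.bfloat16 if is_torch_version(">=", "2.2.0") else torch.float16

decoder = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", variant="bf16", torch_dtype=decoder_dtype
)
```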
+ + + ## Usage example ```python @@ -75,6 +89,48 @@ decoder_output = decoder( decoder_output.save("cascade.png") ``` +## Using the Lite Versions of the Stage B and Stage C models + +```python +import torch +from diffusers import ( + StableCascadeDecoderPipeline, + StableCascadePriorPipeline, + StableCascadeUNet, +) + +prompt = "an image of a shiba inu, donning a spacesuit and helmet" +negative_prompt = "" + +prior_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior_lite") +decoder_unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder_lite") + +prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet) +decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet) + +prior.enable_model_cpu_offload() +prior_output = prior( + prompt=prompt, + height=1024, + width=1024, + negative_prompt=negative_prompt, + guidance_scale=4.0, + num_images_per_prompt=1, + num_inference_steps=20 +) + +decoder.enable_model_cpu_offload() +decoder_output = decoder( + image_embeddings=prior_output.image_embeddings, + prompt=prompt, + negative_prompt=negative_prompt, + guidance_scale=0.0, + output_type="pil", + num_inference_steps=20 +).images[0] +decoder_output.save("cascade.png") +``` + ## Uses ### Direct Use @@ -101,16 +157,6 @@ The model should not be used in any way that violates Stability AI's [Acceptable - Faces and people in general may not be generated properly. - The autoencoding part of the model is lossy. - - -There are some restrictions on data types that can be used with the Stable Cascade models. The official checkpoints for the `StableCascadePriorPipeline` do not support the `torch.float16` data type. Please use `torch.bfloat16` instead. - -In order to use the `torch.bfloat16` data type with the `StableCascadeDecoderPipeline` you need to have PyTorch 2.2.0 or higher installed. This also means that using the `StableCascadeCombinedPipeline` with `torch.bfloat16` requires PyTorch 2.2.0 or higher, since it calls the `StableCascadeDecoderPipeline` internally. - -If it is not possible to install PyTorch 2.2.0 or higher in your environment, the `StableCascadeDecoderPipeline` can be used on its own with the `torch.float16` data type. You can download the full precision or `bf16` variant weights for the pipeline and cast the weights to `torch.float16`. 
- - - ## StableCascadeCombinedPipeline From 7b37f654078820f4086385a41e6abfdcaae3d734 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sun, 10 Mar 2024 15:58:09 +0000 Subject: [PATCH 6/8] update --- docs/source/en/api/pipelines/stable_cascade.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index 0582a2a46541..1e7798768b41 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -126,7 +126,7 @@ decoder_output = decoder( negative_prompt=negative_prompt, guidance_scale=0.0, output_type="pil", - num_inference_steps=20 + num_inference_steps=10 ).images[0] decoder_output.save("cascade.png") ``` From 0ebdefed9705dcc2ec830a784497205816d03619 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Sun, 10 Mar 2024 16:15:10 +0000 Subject: [PATCH 7/8] update --- .../source/en/api/pipelines/stable_cascade.md | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index 1e7798768b41..63aef2f1d4b9 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -131,6 +131,56 @@ decoder_output = decoder( decoder_output.save("cascade.png") ``` +## Loading original checkpoints with `from_single_file` + +Loading the original format checkpoints is supported via `from_single_file` method in the StableCascadeUNet. + +```python +import torch +from diffusers import ( + StableCascadeDecoderPipeline, + StableCascadePriorPipeline, + StableCascadeUNet, +) + +prompt = "an image of a shiba inu, donning a spacesuit and helmet" +negative_prompt = "" + +prior_unet = StableCascadeUNet.from_single_file( + "https://huggingface.co/stabilityai/stable-cascade/resolve/main/stage_c_bf16.safetensors", + torch_dtype=torch.bfloat16 +) +decoder_unet = StableCascadeUNet.from_single_file( + "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors", + torch_dtype=torch.bfloat16 +) + +prior = StableCascadePriorPipeline.from_pretrained("../checkpoints/stable-cascade-prior", prior=prior_unet, torch_dtype=torch.bfloat16) +decoder = StableCascadeDecoderPipeline.from_pretrained("../checkpoints/stable-cascade", decoder=decoder_unet, torch_dtype=torch.bfloat16) + +prior.enable_model_cpu_offload() +prior_output = prior( + prompt=prompt, + height=1024, + width=1024, + negative_prompt=negative_prompt, + guidance_scale=4.0, + num_images_per_prompt=1, + num_inference_steps=20 +) + +decoder.enable_model_cpu_offload() +decoder_output = decoder( + image_embeddings=prior_output.image_embeddings, + prompt=prompt, + negative_prompt=negative_prompt, + guidance_scale=0.0, + output_type="pil", + num_inference_steps=10 +).images[0] +decoder_output.save("cascade-single-file.png") +``` + ## Uses ### Direct Use From ca9ea112b4fb986a43cc3ced93d8cb3b2076820e Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 11 Mar 2024 03:40:58 +0000 Subject: [PATCH 8/8] update --- docs/source/en/api/pipelines/stable_cascade.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_cascade.md b/docs/source/en/api/pipelines/stable_cascade.md index 63aef2f1d4b9..1ab554ca82a8 100644 --- a/docs/source/en/api/pipelines/stable_cascade.md +++ b/docs/source/en/api/pipelines/stable_cascade.md @@ -155,8 +155,8 @@ decoder_unet = StableCascadeUNet.from_single_file( 
torch_dtype=torch.bfloat16 ) -prior = StableCascadePriorPipeline.from_pretrained("../checkpoints/stable-cascade-prior", prior=prior_unet, torch_dtype=torch.bfloat16) -decoder = StableCascadeDecoderPipeline.from_pretrained("../checkpoints/stable-cascade", decoder=decoder_unet, torch_dtype=torch.bfloat16) +prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", prior=prior_unet, torch_dtype=torch.bfloat16) +decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", decoder=decoder_unet, torch_dtype=torch.bfloat16) prior.enable_model_cpu_offload() prior_output = prior(
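Taken together, the series gates `torch.bfloat16` in `StableCascadeCombinedPipeline` behind PyTorch 2.2.0 and reworks the Stable Cascade docs around the two-stage, lite, and single-file workflows. For illustration (not part of the patches), a short end-to-end sketch of the combined pipeline as documented after this series, with argument values mirroring the examples above:

```python
import torch
from diffusers import StableCascadeCombinedPipeline

# With torch.bfloat16 the combined pipeline requires PyTorch >= 2.2.0
# (the runtime check added in the first patch of this series).
pipe = StableCascadeCombinedPipeline.from_pretrained(
    "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()

image = pipe(
    prompt="an image of a shiba inu, donning a spacesuit and helmet",
    negative_prompt="",
    height=1024,
    width=1024,
    prior_guidance_scale=4.0,
    prior_num_inference_steps=20,
    num_inference_steps=10,
    decoder_guidance_scale=0.0,
).images[0]
image.save("cascade-combined.png")
```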