From eb45fc1bf3ff116d16f0f06147323b0c7acecf11 Mon Sep 17 00:00:00 2001
From: stevhliu <steven.liu@huggingface.co>
Date: Fri, 10 Oct 2025 09:08:45 -0700
Subject: [PATCH 1/4] fix syntax

---
 docs/source/en/api/pipelines/marigold.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/api/pipelines/marigold.md b/docs/source/en/api/pipelines/marigold.md
index 81e103afeb64..bb6e94de33d7 100644
--- a/docs/source/en/api/pipelines/marigold.md
+++ b/docs/source/en/api/pipelines/marigold.md
@@ -75,7 +75,7 @@ The following is a summary of the recommended checkpoints, all of which produce
 | [prs-eth/marigold-depth-v1-1](https://huggingface.co/prs-eth/marigold-depth-v1-1)                   | Depth        | Affine-invariant depth prediction assigns each pixel a value between 0 (near plane) and 1 (far plane), with both planes determined by the model during inference.                    |
 | [prs-eth/marigold-normals-v0-1](https://huggingface.co/prs-eth/marigold-normals-v0-1)               | Normals      | The surface normals predictions are unit-length 3D vectors in the screen space camera, with values in the range from -1 to 1.                                                        |
 | [prs-eth/marigold-iid-appearance-v1-1](https://huggingface.co/prs-eth/marigold-iid-appearance-v1-1) | Intrinsics   | InteriorVerse decomposition is comprised of Albedo and two BRDF material properties: Roughness and Metallicity.                                                                      | 
-| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1)     | Intrinsics   | HyperSim decomposition of an image &nbsp\\(I\\)&nbsp is comprised of Albedo &nbsp\\(A\\), Diffuse shading &nbsp\\(S\\), and Non-diffuse residual &nbsp\\(R\\): &nbsp\\(I = A*S+R\\). |
+| [prs-eth/marigold-iid-lighting-v1-1](https://huggingface.co/prs-eth/marigold-iid-lighting-v1-1)     | Intrinsics   | HyperSim decomposition of an image $I$ is comprised of Albedo $A$, Diffuse shading $S$, and Non-diffuse residual $R$: $I = A*S+R$. |
 
 > [!TIP]
 > Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff 

From 74e7313da5e797f8501b8a358e2525f940dce36a Mon Sep 17 00:00:00 2001
From: stevhliu <steven.liu@huggingface.co>
Date: Fri, 10 Oct 2025 09:15:07 -0700
Subject: [PATCH 2/4] unwrap shape formulas in Marigold docstrings

---
 .../pipelines/marigold/pipeline_marigold_depth.py        | 7 ++-----
 .../pipelines/marigold/pipeline_marigold_intrinsics.py   | 9 ++-------
 .../pipelines/marigold/pipeline_marigold_normals.py      | 7 ++-----
 3 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
index da991aefbd4a..1b9f8526e95f 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
@@ -86,12 +86,9 @@ class MarigoldDepthOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
-            width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
+            Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
-            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
-            for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
index c809de18f469..516bab3b003b 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
@@ -99,14 +99,9 @@ class MarigoldIntrinsicsOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
-            \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
-            \times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
-            the intrinsic image decomposition.
+            Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width \times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of the intrinsic image decomposition.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
-            numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
-            height \times width \times 3$ for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width \times 3$ for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
index 192ed590a489..e9171ed8c273 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
@@ -81,12 +81,9 @@ class MarigoldNormalsOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
-            width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
+            Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
-            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
-            for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.

From b92b98b3c252e1a28627c79652631d0b28382bd8 Mon Sep 17 00:00:00 2001
From: stevhliu <steven.liu@huggingface.co>
Date: Fri, 10 Oct 2025 09:23:33 -0700
Subject: [PATCH 3/4] style

---
 .../pipelines/marigold/pipeline_marigold_depth.py        | 7 +++++--
 .../pipelines/marigold/pipeline_marigold_intrinsics.py   | 9 +++++++--
 .../pipelines/marigold/pipeline_marigold_normals.py      | 7 +++++--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
index 1b9f8526e95f..da991aefbd4a 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
@@ -86,9 +86,12 @@ class MarigoldDepthOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
+            Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
+            width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
+            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
+            for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
index 516bab3b003b..c809de18f469 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
@@ -99,9 +99,14 @@ class MarigoldIntrinsicsOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width \times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of the intrinsic image decomposition.
+            Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
+            \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
+            \times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
+            the intrinsic image decomposition.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width \times 3$ for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
+            numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
+            height \times width \times 3$ for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
index e9171ed8c273..192ed590a489 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
@@ -81,9 +81,12 @@ class MarigoldNormalsOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
+            Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
+            width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
+            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
+            for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.

From 31162ff52052788fdcf0280bae83c757a96d49df Mon Sep 17 00:00:00 2001
From: stevhliu <steven.liu@huggingface.co>
Date: Fri, 10 Oct 2025 09:29:18 -0700
Subject: [PATCH 4/4] use inline code for shapes in Marigold docstrings

---
 .../marigold/pipeline_marigold_depth.py          | 11 +++++------
 .../marigold/pipeline_marigold_intrinsics.py     | 16 ++++++++--------
 .../marigold/pipeline_marigold_normals.py        | 11 +++++------
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
index da991aefbd4a..92ec16fd455b 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_depth.py
@@ -86,15 +86,14 @@ class MarigoldDepthOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
-            width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
+            Predicted depth maps with values in the range [0, 1]. The shape is `numimages × 1 × height × width` for
+            `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
-            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
-            for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `numimages × 1 ×
+            height × width` for `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
-            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+            The shape is `(numimages * numensemble) × 4 × latentheight × latentwidth`.
     """
 
     prediction: Union[np.ndarray, torch.Tensor]
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
index c809de18f469..bef9ca77c708 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_intrinsics.py
@@ -99,17 +99,17 @@ class MarigoldIntrinsicsOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted image intrinsics with values in the range [0, 1]. The shape is $(numimages * numtargets) \times 3
-            \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times height \times width
-            \times 3$ for `np.ndarray`, where `numtargets` corresponds to the number of predicted target modalities of
-            the intrinsic image decomposition.
+            Predicted image intrinsics with values in the range [0, 1]. The shape is `(numimages * numtargets) × 3 ×
+            height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for `np.ndarray`,
+            where `numtargets` corresponds to the number of predicted target modalities of the intrinsic image
+            decomposition.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $(numimages *
-            numtargets) \times 3 \times height \times width$ for `torch.Tensor` or $(numimages * numtargets) \times
-            height \times width \times 3$ for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `(numimages *
+            numtargets) × 3 × height × width` for `torch.Tensor` or `(numimages * numtargets) × height × width × 3` for
+            `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
-            The shape is $(numimages * numensemble) \times (numtargets * 4) \times latentheight \times latentwidth$.
+            The shape is `(numimages * numensemble) × (numtargets * 4) × latentheight × latentwidth`.
     """
 
     prediction: Union[np.ndarray, torch.Tensor]
diff --git a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
index 192ed590a489..485a39c995ec 100644
--- a/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
+++ b/src/diffusers/pipelines/marigold/pipeline_marigold_normals.py
@@ -81,15 +81,14 @@ class MarigoldNormalsOutput(BaseOutput):
 
     Args:
         prediction (`np.ndarray`, `torch.Tensor`):
-            Predicted normals with values in the range [-1, 1]. The shape is $numimages \times 3 \times height \times
-            width$ for `torch.Tensor` or $numimages \times height \times width \times 3$ for `np.ndarray`.
+            Predicted normals with values in the range [-1, 1]. The shape is `numimages × 3 × height × width` for
+            `torch.Tensor` or `numimages × height × width × 3` for `np.ndarray`.
         uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
-            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
-            \times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
-            for `np.ndarray`.
+            Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is `numimages × 1 ×
+            height × width` for `torch.Tensor` or `numimages × height × width × 1` for `np.ndarray`.
         latent (`None`, `torch.Tensor`):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
-            The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
+            The shape is `(numimages * numensemble) × 4 × latentheight × latentwidth`.
     """
 
     prediction: Union[np.ndarray, torch.Tensor]