From d65fbc2af36fccc309d8e0ea456e8490b9075ddd Mon Sep 17 00:00:00 2001
From: Mohammad Sadegh Salehi
Date: Sun, 26 Oct 2025 19:48:59 +0000
Subject: [PATCH 1/3] Fix overflow in rgblike_to_depthmap by safe dtype casting (torch & NumPy)

---
 src/diffusers/image_processor.py | 39 ++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 0e3082eada8a..db138ff8e01c 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -1045,16 +1045,37 @@ def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) ->
     def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
         r"""
         Convert an RGB-like depth image to a depth map.
-
-        Args:
-            image (`Union[np.ndarray, torch.Tensor]`):
-                The RGB-like depth image to convert.
-
-        Returns:
-            `Union[np.ndarray, torch.Tensor]`:
-                The corresponding depth map.
         """
-        return image[:, :, 1] * 2**8 + image[:, :, 2]
+        # 1. Cast the tensor to a larger integer type (e.g., int32)
+        # to safely perform the multiplication by 256.
+        # 2. Perform the 16-bit combination: High-byte * 256 + Low-byte.
+        # 3. Cast the final result to the desired depth map type (uint16) if needed
+        # before returning, though leaving it as int32/int64 is often safer
+        # for return value from a library function.
+
+        if isinstance(image, torch.Tensor):
+            # Cast to a safe dtype (e.g., int32 or int64) for the calculation
+            image_safe = image.to(torch.int32)
+
+            # Calculate the depth map
+            depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
+
+            # You may want to cast the final result to uint16, but casting to a
+            # larger int type (like int32) is sufficient to fix the overflow.
+            # depth_map = depth_map.to(torch.uint16) # Uncomment if uint16 is strictly required
+            return depth_map
+
+        elif isinstance(image, np.ndarray):
+            # NumPy equivalent: Cast to a safe dtype (e.g., np.int32)
+            image_safe = image.astype(np.int32)
+
+            # Calculate the depth map
+            depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
+
+            # depth_map = depth_map.astype(np.uint16) # Uncomment if uint16 is strictly required
+            return depth_map
+        else:
+            raise TypeError("Input image must be a torch.Tensor or np.ndarray")

     def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]:
         r"""

From c3b6e6e95d3d8825a65d37f75e72d4a14a75777a Mon Sep 17 00:00:00 2001
From: Mohammad Sadegh Salehi <34940948+MohammadSadeghSalehi@users.noreply.github.com>
Date: Thu, 6 Nov 2025 09:48:28 +0000
Subject: [PATCH 2/3] Fix: store original dtype and cast back after safe computation

---
 src/diffusers/image_processor.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index db138ff8e01c..4c3138b8f29a 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -1055,6 +1055,7 @@ def rgblike_to_depthmap(image: Union[np.ndar

         if isinstance(image, torch.Tensor):
             # Cast to a safe dtype (e.g., int32 or int64) for the calculation
+            original_dtype = image.dtype
             image_safe = image.to(torch.int32)

             # Calculate the depth map
@@ -1063,17 +1064,18 @@ def rgblike_to_depthmap(image: Union[np.ndar
             # You may want to cast the final result to uint16, but casting to a
             # larger int type (like int32) is sufficient to fix the overflow.
             # depth_map = depth_map.to(torch.uint16) # Uncomment if uint16 is strictly required
-            return depth_map
+            return depth_map.to(original_dtype)

         elif isinstance(image, np.ndarray):
             # NumPy equivalent: Cast to a safe dtype (e.g., np.int32)
+            original_dtype = image.dtype
             image_safe = image.astype(np.int32)

             # Calculate the depth map
             depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]

             # depth_map = depth_map.astype(np.uint16) # Uncomment if uint16 is strictly required
-            return depth_map
+            return depth_map.astype(original_dtype)
         else:
             raise TypeError("Input image must be a torch.Tensor or np.ndarray")


From 1c8b5dc36cbd727e3e7628e620fbc027494a92af Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 6 Nov 2025 16:16:21 +0000
Subject: [PATCH 3/3] Apply style fixes

---
 src/diffusers/image_processor.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 4c3138b8f29a..067d876ffcd8 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -1049,31 +1049,31 @@ def rgblike_to_depthmap(image: Union[np.ndar
         # 1. Cast the tensor to a larger integer type (e.g., int32)
         # to safely perform the multiplication by 256.
         # 2. Perform the 16-bit combination: High-byte * 256 + Low-byte.
-        # 3. Cast the final result to the desired depth map type (uint16) if needed 
-        # before returning, though leaving it as int32/int64 is often safer 
+        # 3. Cast the final result to the desired depth map type (uint16) if needed
+        # before returning, though leaving it as int32/int64 is often safer
         # for return value from a library function.
-        
+
         if isinstance(image, torch.Tensor):
             # Cast to a safe dtype (e.g., int32 or int64) for the calculation
             original_dtype = image.dtype
-            image_safe = image.to(torch.int32) 
-            
+            image_safe = image.to(torch.int32)
+
             # Calculate the depth map
             depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
-            
-            # You may want to cast the final result to uint16, but casting to a 
+
+            # You may want to cast the final result to uint16, but casting to a
             # larger int type (like int32) is sufficient to fix the overflow.
             # depth_map = depth_map.to(torch.uint16) # Uncomment if uint16 is strictly required
             return depth_map.to(original_dtype)
-            
+
         elif isinstance(image, np.ndarray):
             # NumPy equivalent: Cast to a safe dtype (e.g., np.int32)
             original_dtype = image.dtype
             image_safe = image.astype(np.int32)
-            
+
             # Calculate the depth map
             depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
-            
+
             # depth_map = depth_map.astype(np.uint16) # Uncomment if uint16 is strictly required
             return depth_map.astype(original_dtype)
         else:
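
Note (not part of the patch series above): a minimal, standalone NumPy sketch of the overflow the series addresses, using made-up pixel values rather than anything from the diffusers test suite. The RGB-like encoding stores a 16-bit depth value with the high byte in channel 1 and the low byte in channel 2; combining them directly on a uint8 array can wrap or raise under integer-promotion rules, while widening to int32 first, as the patched rgblike_to_depthmap does, recovers the intended value.

import numpy as np

# Hypothetical RGB-like depth image (uint8): channel 1 = high byte, channel 2 = low byte.
rgb_like = np.zeros((2, 2, 3), dtype=np.uint8)
rgb_like[..., 1] = 3    # high byte
rgb_like[..., 2] = 200  # low byte -> intended depth value: 3 * 256 + 200 = 968

# Safe combination, mirroring the patched function: widen the dtype before multiplying.
image_safe = rgb_like.astype(np.int32)
depth_map = image_safe[:, :, 1] * 256 + image_safe[:, :, 2]
print(depth_map.dtype, int(depth_map[0, 0]))  # int32 968

# The pre-patch expression `image[:, :, 1] * 2**8 + image[:, :, 2]` may instead be
# evaluated in uint8 under recent NumPy/PyTorch promotion rules, so 968 cannot be
# represented and the result wraps or errors out.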