From 6fbf1e472d498d2fad04419c074f45329ca1863e Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 22 Oct 2024 08:24:14 +0200
Subject: [PATCH 1/6] Add SD3.5 Large single-file detection and transformer conversion

---
 src/diffusers/loaders/single_file_utils.py | 77 +++++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 236fbd0c2295..60017c9d4bb0 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -75,6 +75,7 @@
     "stable_cascade_stage_b": "down_blocks.1.0.channelwise.0.weight",
     "stable_cascade_stage_c": "clip_txt_mapper.weight",
     "sd3": "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias",
+    "sd35_large": "model.diffusion_model.joint_blocks.37.x_block.mlp.fc1.weight",
     "animatediff": "down_blocks.0.motion_modules.0.temporal_transformer.transformer_blocks.0.attention_blocks.0.pos_encoder.pe",
     "animatediff_v2": "mid_block.motion_modules.0.temporal_transformer.norm.bias",
     "animatediff_sdxl_beta": "up_blocks.2.motion_modules.0.temporal_transformer.norm.weight",
@@ -113,6 +114,9 @@
     "sd3": {
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
     },
+    "sd35_large": {
+        "pretrained_model_name_or_path": "diffusers-internal-dev/test-dummy-3",
+    },
     "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
     "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},
@@ -504,9 +508,12 @@ def infer_diffusers_model_type(checkpoint):
     ):
         model_type = "stable_cascade_stage_b"
 
-    elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint:
+    elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216:
         model_type = "sd3"
 
+    elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint:
+        model_type = "sd35_large"
+
     elif CHECKPOINT_KEY_NAMES["animatediff"] in checkpoint:
         if CHECKPOINT_KEY_NAMES["animatediff_scribble"] in checkpoint:
             model_type = "animatediff_scribble"
@@ -1670,6 +1677,22 @@ def swap_scale_shift(weight, dim):
     return new_weight
 
 
+def get_attn2_layers(state_dict):
+    attn2_layers = []
+    for key in state_dict.keys():
+        if "attn2." in key:
+            # Extract the layer number from the key
+            layer_num = int(key.split(".")[1])
+            attn2_layers.append(layer_num)
+
+    return tuple(sorted(set(attn2_layers)))
+
+
+def get_caption_projection_dim(state_dict):
+    caption_projection_dim = state_dict["context_embedder.weight"].shape[0]
+    return caption_projection_dim
+
+
 def convert_sd3_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
     converted_state_dict = {}
     keys = list(checkpoint.keys())
@@ -1678,7 +1701,10 @@ def convert_sd3_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
             checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k)
 
     num_layers = list(set(int(k.split(".", 2)[1]) for k in checkpoint if "joint_blocks" in k))[-1] + 1  # noqa: C401
-    caption_projection_dim = 1536
+    dual_attention_layers = get_attn2_layers(checkpoint)
+
+    caption_projection_dim = get_caption_projection_dim(checkpoint)
+    has_qk_norm = any("ln_q" in key for key in checkpoint.keys())
 
     # Positional and patch embeddings.
     converted_state_dict["pos_embed.pos_embed"] = checkpoint.pop("pos_embed")
@@ -1735,6 +1761,21 @@ def convert_sd3_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
         converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.weight"] = torch.cat([context_v])
         converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.bias"] = torch.cat([context_v_bias])
 
+        # qk norm
+        if has_qk_norm:
+            converted_state_dict[f"transformer_blocks.{i}.attn.norm_q.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.x_block.attn.ln_q.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn.norm_k.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.x_block.attn.ln_k.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn.norm_added_q.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.attn.ln_q.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn.norm_added_k.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.context_block.attn.ln_k.weight"
+            )
+
         # output projections.
         converted_state_dict[f"transformer_blocks.{i}.attn.to_out.0.weight"] = checkpoint.pop(
             f"joint_blocks.{i}.x_block.attn.proj.weight"
@@ -1750,6 +1791,38 @@ def convert_sd3_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
                 f"joint_blocks.{i}.context_block.attn.proj.bias"
             )
 
+        if i in dual_attention_layers:
+            # Q, K, V
+            sample_q2, sample_k2, sample_v2 = torch.chunk(
+                checkpoint.pop(f"joint_blocks.{i}.x_block.attn2.qkv.weight"), 3, dim=0
+            )
+            sample_q2_bias, sample_k2_bias, sample_v2_bias = torch.chunk(
+                checkpoint.pop(f"joint_blocks.{i}.x_block.attn2.qkv.bias"), 3, dim=0
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_q.weight"] = torch.cat([sample_q2])
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_q.bias"] = torch.cat([sample_q2_bias])
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_k.weight"] = torch.cat([sample_k2])
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_k.bias"] = torch.cat([sample_k2_bias])
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_v.weight"] = torch.cat([sample_v2])
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_v.bias"] = torch.cat([sample_v2_bias])
+
+            # qk norm
+            if has_qk_norm:
+                converted_state_dict[f"transformer_blocks.{i}.attn2.norm_q.weight"] = checkpoint.pop(
+                    f"joint_blocks.{i}.x_block.attn2.ln_q.weight"
+                )
+                converted_state_dict[f"transformer_blocks.{i}.attn2.norm_k.weight"] = checkpoint.pop(
+                    f"joint_blocks.{i}.x_block.attn2.ln_k.weight"
+                )
+
+            # output projections.
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_out.0.weight"] = checkpoint.pop(
+                f"joint_blocks.{i}.x_block.attn2.proj.weight"
+            )
+            converted_state_dict[f"transformer_blocks.{i}.attn2.to_out.0.bias"] = checkpoint.pop(
+                f"joint_blocks.{i}.x_block.attn2.proj.bias"
+            )
+
         # norms.
         converted_state_dict[f"transformer_blocks.{i}.norm1.linear.weight"] = checkpoint.pop(
             f"joint_blocks.{i}.x_block.adaLN_modulation.1.weight"

From cd9f01b947ada8162368b15b9189320f085a2ddc Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 22 Oct 2024 08:31:24 +0200
Subject: [PATCH 2/6] Point sd35_large default pipeline path at stabilityai/stable-diffusion-3-5-large

---
 src/diffusers/loaders/single_file_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 60017c9d4bb0..fa06d07a64c5 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -115,7 +115,7 @@
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
     },
     "sd35_large": {
-        "pretrained_model_name_or_path": "diffusers-internal-dev/test-dummy-3",
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-5-large",
     },
     "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},

From 89933791dae90b3b30d706fe6cd70b781e0fdb77 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 22 Oct 2024 09:04:38 +0200
Subject: [PATCH 3/6] Add sd35_large_turbo default pipeline path

---
 src/diffusers/loaders/single_file_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index fa06d07a64c5..997166c33c4e 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -117,6 +117,9 @@
     "sd35_large": {
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-5-large",
     },
+    "sd35_large_turbo": {
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-5-large-turbo",
+    },
     "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
     "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},

From 9fc676ca1d38d97d6fcbab9aaad161469481a673 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 22 Oct 2024 09:11:15 +0200
Subject: [PATCH 4/6] Document single-file loading of the SD3.5 transformer

---
 .../stable_diffusion/stable_diffusion_3.md    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
index fd026f07c923..714ee1a8eb67 100644
--- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
@@ -313,6 +313,26 @@ image = pipe("a picture of a cat holding a sign that says hello world").images[0
 image.save('sd3-single-file-t5-fp8.png')
 ```
 
+### Loading the single file checkpoint for the Stable Diffusion 3.5 Transformer Model
+
+```python
+import torch
+from diffusers import SD3Transformer2DModel, StableDiffusion3Pipeline
+
+transformer = SD3Transformer2DModel.from_single_file(
+    "https://huggingface.co/stabilityai/stable-diffusion-3-5-large/blob/main/sd3.5_large.safetensors",
+    torch_dtype=torch.bfloat16,
+)
+pipe = StableDiffusion3Pipeline.from_pretrained(
+    "stabilityai/stable-diffusion-3-5-large",
+    transformer=transformer,
+    torch_dtype=torch.bfloat16,
+)
+pipe.enable_model_cpu_offload()
+image = pipe("a cat holding a sign that says hello world").images[0]
+image.save("sd35.png")
+```
+
 ## StableDiffusion3Pipeline
 
 [[autodoc]] StableDiffusion3Pipeline

From eae7c6041bb1fb03124d69f1f97218e2d5e20318 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 22 Oct 2024 11:05:38 +0200
Subject: [PATCH 5/6] Use 3.5 instead of 3-5 in SD3.5 repo ids (docs and default paths)

---
 .../en/api/pipelines/stable_diffusion/stable_diffusion_3.md   | 4 ++--
 src/diffusers/loaders/single_file_utils.py                    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
index 714ee1a8eb67..8170c5280d38 100644
--- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
+++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
@@ -320,11 +320,11 @@ import torch
 from diffusers import SD3Transformer2DModel, StableDiffusion3Pipeline
 
 transformer = SD3Transformer2DModel.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-3-5-large/blob/main/sd3.5_large.safetensors",
+    "https://huggingface.co/stabilityai/stable-diffusion-3.5-large/blob/main/sd3.5_large.safetensors",
     torch_dtype=torch.bfloat16,
 )
 pipe = StableDiffusion3Pipeline.from_pretrained(
-    "stabilityai/stable-diffusion-3-5-large",
+    "stabilityai/stable-diffusion-3.5-large",
     transformer=transformer,
     torch_dtype=torch.bfloat16,
 )
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 997166c33c4e..d8f8f13bf082 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -115,10 +115,10 @@
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-medium-diffusers",
     },
     "sd35_large": {
-        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-5-large",
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large",
     },
     "sd35_large_turbo": {
-        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3-5-large-turbo",
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large-turbo",
     },
     "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},

From 86e8d8af81b84697f15430c4adb2f1b38d6a7b37 Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Tue, 22 Oct 2024 12:25:12 +0200
Subject: [PATCH 6/6] Remove sd35_large_turbo default pipeline path

---
 src/diffusers/loaders/single_file_utils.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index d8f8f13bf082..d1bad8b5a7cd 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -117,9 +117,6 @@
     "sd35_large": {
         "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large",
     },
-    "sd35_large_turbo": {
-        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large-turbo",
-    },
     "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"},
     "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"},
     "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"},