7 changes: 7 additions & 0 deletions scripts/convert_diffusers_to_original_stable_diffusion.py
@@ -177,6 +177,8 @@ def reshape_weight_for_sd(w):


def convert_vae_state_dict(vae_state_dict):
# print(vae_state_dict["encoder.mid_block.attentions.0.to_k.bias"].shape)

mapping = {k: k for k in vae_state_dict.keys()}
for k, v in mapping.items():
for sd_part, hf_part in vae_conversion_map:
@@ -188,6 +190,7 @@ def convert_vae_state_dict(vae_state_dict):
v = v.replace(hf_part, sd_part)
mapping[k] = v
new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}

weights_to_convert = ["q", "k", "v", "proj_out"]
keys_to_rename = {}
for k, v in new_state_dict.items():
@@ -198,9 +201,13 @@ def convert_vae_state_dict(vae_state_dict):
for weight_name, real_weight_name in vae_extra_conversion_map:
if f"mid.attn_1.{weight_name}.weight" in k or f"mid.attn_1.{weight_name}.bias" in k:
keys_to_rename[k] = k.replace(weight_name, real_weight_name)

# print(keys_to_rename)
for k, v in keys_to_rename.items():
if k in new_state_dict:
print(f"Renaming {k} to {v}")
if "encoder.mid.attn_1.k.bias" in v:
print(new_state_dict[k].shape, reshape_weight_for_sd(new_state_dict[k]).shape)
new_state_dict[v] = reshape_weight_for_sd(new_state_dict[k])
del new_state_dict[k]
return new_state_dict
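
For context, reshape_weight_for_sd (defined earlier in this script) maps diffusers' 2D nn.Linear attention weights onto the 4D 1x1-conv layout of the original SD checkpoint, and the debug prints added above appear to probe what it does to a 1D bias. A minimal sketch, assuming the helper keeps its usual one-line form in this script:

import torch

def reshape_weight_for_sd(w):
    # HF stores the VAE attention projections as linear weights (out, in);
    # the original SD checkpoint expects 1x1 conv weights (out, in, 1, 1).
    return w.reshape(*w.shape, 1, 1)

w = torch.randn(512, 512)
print(reshape_weight_for_sd(w).shape)  # torch.Size([512, 512, 1, 1])

# Applying the same reshape to a 1D bias (the case the prints above are
# checking) yields (512, 1, 1) rather than the expected (512,).
b = torch.randn(512)
print(reshape_weight_for_sd(b).shape)  # torch.Size([512, 1, 1])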
27 changes: 24 additions & 3 deletions src/diffusers/loaders/single_file_utils.py
@@ -59,6 +59,7 @@
"xl_refiner": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml",
"upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml",
"controlnet": "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml",
"test_single_file_sd": "https://huggingface.co/datasets/sayakpaul/sample-datasets/raw/main/tiny-sd-single-file-config.yaml",
}

CHECKPOINT_KEY_NAMES = {
@@ -278,6 +279,12 @@ def infer_original_config_file(class_name, checkpoint):
elif class_name == "ControlNetModel":
config_url = CONFIG_URLS["controlnet"]

elif len(checkpoint) == 512:
config_url = CONFIG_URLS["test_single_file_sd"]

elif len(checkpoint) == 701:
config_url = CONFIG_URLS["test_single_file_sdxl"]
Comment on lines +282 to +286

Member (Author):

I don't think there's any other sane way for us to verify this.

Collaborator:

Can't we add this into the test module? Why have it in src if it's only relevant to tests? We can pass it in using Pipeline.from_single_file(original_config_file="<file url>"). original_config_file can be a URL now.

else:
config_url = CONFIG_URLS["v1"]
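
Following up on the collaborator's suggestion above: a minimal sketch of passing the test-only config from the call site instead, assuming a Stable Diffusion pipeline and an illustrative checkpoint path (the URL is the one added to CONFIG_URLS in this diff):

from diffusers import StableDiffusionPipeline

# original_config_file accepts a URL, so the test-only YAML can stay out
# of src/ and be supplied explicitly (checkpoint path is illustrative).
pipe = StableDiffusionPipeline.from_single_file(
    "path/to/tiny-sd-checkpoint.safetensors",
    original_config_file=(
        "https://huggingface.co/datasets/sayakpaul/sample-datasets/raw/main/"
        "tiny-sd-single-file-config.yaml"
    ),
)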

@@ -1010,9 +1017,12 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
return new_checkpoint


def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False):
def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False, subfolder=None):
try:
config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
if subfolder is None:
config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
else:
config = CLIPTextConfig.from_pretrained(config_name, subfolder=subfolder)
except Exception:
raise ValueError(
f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'."
@@ -1189,6 +1199,7 @@ def create_diffusers_vae_model_from_ldm(

if is_accelerate_available():
for param_name, param in diffusers_format_vae_checkpoint.items():
# print(param_name, param.shape)
set_module_tensor_to_device(vae, param_name, "cpu", value=param)
else:
vae.load_state_dict(diffusers_format_vae_checkpoint)
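
For reference, the accelerate path assigns tensors one at a time instead of a single load_state_dict call, which is what allows the model to be built on the meta device without allocating weights first. A minimal sketch of the pattern, with an illustrative stand-in for the converted checkpoint:

from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
from diffusers import AutoencoderKL

# Build the VAE without allocating weight memory, then materialize each
# parameter on CPU from the (stand-in) converted state dict.
with init_empty_weights():
    vae = AutoencoderKL()

state_dict = AutoencoderKL().state_dict()  # stand-in for the converted checkpoint
for name, tensor in state_dict.items():
    set_module_tensor_to_device(vae, name, "cpu", value=tensor)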
@@ -1231,7 +1242,17 @@ def create_text_encoders_and_tokenizers_from_ldm(
tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)

except Exception:
raise ValueError(
try:
if not local_files_only:
config_name = "hf-internal-testing/tiny-sd-pipe"
text_encoder = create_text_encoder_from_ldm_clip_checkpoint(
config_name, checkpoint, local_files_only=False, subfolder="text_encoder"
)
tokenizer = CLIPTokenizer.from_pretrained(config_name, subfolder="tokenizer", local_files_only=False)
else:
raise ValueError("This option needs `local_files_only` set to False.")
except Exception:
raise ValueError(
f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'."
)
else:
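
The fallback mirrors the text-encoder change above: when the primary config_name cannot be loaded, both components are fetched from a pipeline-style test repo with explicit subfolders. A minimal sketch of the tokenizer side:

from transformers import CLIPTokenizer

# Pipeline repos keep tokenizer files under a tokenizer/ subfolder.
tokenizer = CLIPTokenizer.from_pretrained(
    "hf-internal-testing/tiny-sd-pipe", subfolder="tokenizer"
)
print(tokenizer("a photo of an astronaut").input_ids)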
4 changes: 1 addition & 3 deletions src/diffusers/models/modeling_utils.py
@@ -124,9 +124,7 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
) from e
except (UnicodeDecodeError, ValueError):
raise OSError(
f"Unable to load weights from checkpoint file for '{checkpoint_file}' "
f"at '{checkpoint_file}'. "
"If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
f"Unable to load weights from checkpoint file for '{checkpoint_file} at '{checkpoint_file}'."
)

