From d68635f9504163ac110325b47ce70e87122b6984 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 12 Feb 2024 14:24:24 +0000 Subject: [PATCH 1/4] update --- src/diffusers/loaders/autoencoder.py | 5 ++++- src/diffusers/loaders/single_file_utils.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py index 960552f9790b..b4e31ab00042 100644 --- a/src/diffusers/loaders/autoencoder.py +++ b/src/diffusers/loaders/autoencoder.py @@ -118,7 +118,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): ) image_size = kwargs.pop("image_size", None) - component = create_diffusers_vae_model_from_ldm(class_name, original_config, checkpoint, image_size=image_size) + scaling_factor = kwargs.pop("scaling_factor", None) + component = create_diffusers_vae_model_from_ldm( + class_name, original_config, checkpoint, image_size=image_size, scaling_factor=scaling_factor + ) vae = component["vae"] if torch_dtype is not None: vae = vae.to(torch_dtype) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 3a9f6e88238a..11103ba84d29 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -1173,7 +1173,7 @@ def create_diffusers_unet_model_from_ldm( def create_diffusers_vae_model_from_ldm( - pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=0.18125 + pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=None ): # import here to avoid circular imports from ..models import AutoencoderKL From 2d5e9c2e39d63cf6505fe9b117a173a2938a6327 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 12 Feb 2024 16:10:28 +0000 Subject: [PATCH 2/4] update --- src/diffusers/loaders/autoencoder.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py index b4e31ab00042..38577e79a16c 100644 --- a/src/diffusers/loaders/autoencoder.py +++ b/src/diffusers/loaders/autoencoder.py @@ -92,6 +92,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): """ original_config_file = kwargs.pop("original_config_file", None) + config_file = kwargs.pop("config_file", None) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) @@ -103,6 +104,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): use_safetensors = kwargs.pop("use_safetensors", True) class_name = cls.__name__ + + if (config_file is not None) and (original_config_file is not None): + raise ValueError( + "You cannot pass both `config_file` and `original_config_file` to `from_single_file`. Please use only one of these arguments." 
+            )
+
+        original_config_file = original_config_file or config_file
         original_config, checkpoint = fetch_ldm_config_and_checkpoint(
             pretrained_model_link_or_path=pretrained_model_link_or_path,
             class_name=class_name,

From e7b2032082cf09a325cb2a0896f27f708860bb9c Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Mon, 12 Feb 2024 18:22:36 +0000
Subject: [PATCH 3/4] update

---
 src/diffusers/loaders/single_file_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 11103ba84d29..90776fe90147 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -175,6 +175,7 @@
 }
 
 LDM_VAE_KEY = "first_stage_model."
+LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215
 LDM_UNET_KEY = "model.diffusion_model."
 LDM_CONTROLNET_KEY = "control_model."
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
@@ -518,7 +519,9 @@ def create_vae_diffusers_config(original_config, image_size, scaling_factor=None
     Creates a config for the diffusers based on the config of the LDM model.
     """
     vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
-    scaling_factor = scaling_factor or original_config["model"]["params"]["scale_factor"]
+    scaling_factor = (
+        scaling_factor or original_config["model"]["params"]["scale_factor"] or LDM_VAE_DEFAULT_SCALING_FACTOR
+    )
     block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
     down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)

From 57afaff2708dd229020701ad9c9c108d853a58b2 Mon Sep 17 00:00:00 2001
From: Dhruv Nair
Date: Tue, 13 Feb 2024 03:47:00 +0000
Subject: [PATCH 4/4] update

---
 src/diffusers/loaders/autoencoder.py       | 10 ++++++++++
 src/diffusers/loaders/single_file_utils.py |  7 ++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
index 38577e79a16c..4bcdda9bf6ef 100644
--- a/src/diffusers/loaders/autoencoder.py
+++ b/src/diffusers/loaders/autoencoder.py
@@ -38,6 +38,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
                     - A link to the `.ckpt` file (for example
                       `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
                     - A path to a *file* containing all pipeline weights.
+            config_file (`str`, *optional*):
+                Filepath to the configuration YAML file associated with the model. If not provided, it will default to:
+                https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
             torch_dtype (`str` or `torch.dtype`, *optional*):
                 Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
                 dtype is automatically derived from the model's weights.
@@ -65,6 +68,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
             image_size (`int`, *optional*, defaults to 512):
                 The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
                 Diffusion v2 base model. Use 768 for Stable Diffusion v2.
+            scaling_factor (`float`, *optional*, defaults to 0.18215):
+                The component-wise standard deviation of the trained latent space computed using the first batch of the
+                training set. This is used to scale the latent space to have unit variance when training the diffusion
+                model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+                diffusion model. When decoding, the latents are scaled back to the original scale with the formula:
+                `z = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the
+                [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
             use_safetensors (`bool`, *optional*, defaults to `None`):
                 If set to `None`, the safetensors weights are downloaded if they're available **and** if the
                 safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors

diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index 90776fe90147..8368c37ea658 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -519,9 +519,10 @@ def create_vae_diffusers_config(original_config, image_size, scaling_factor=None
     Creates a config for the diffusers based on the config of the LDM model.
     """
     vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
-    scaling_factor = (
-        scaling_factor or original_config["model"]["params"]["scale_factor"] or LDM_VAE_DEFAULT_SCALING_FACTOR
-    )
+    if scaling_factor is None and "scale_factor" in original_config["model"]["params"]:
+        scaling_factor = original_config["model"]["params"]["scale_factor"]
+    elif scaling_factor is None:
+        scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR
     block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
     down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
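Taken together, the series lets `from_single_file` accept an explicit `scaling_factor` (and a `config_file` alias for `original_config_file`), falling back first to the `scale_factor` entry of the original LDM config and then to 0.18215. A minimal usage sketch of the resulting behaviour follows; the checkpoint and YAML paths are placeholders, and the 0.13025 override is only an example value (the SDXL VAE's factor), not part of the patches:

# Illustrative only: paths below are placeholders, not files shipped with diffusers.
import torch

from diffusers import AutoencoderKL

# Explicit override (patch 1): `scaling_factor` is forwarded to
# create_diffusers_vae_model_from_ldm. When omitted, the loader uses the
# original config's `scale_factor` if present (patch 4), else
# LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215 (patch 3).
vae = AutoencoderKL.from_single_file(
    "path/to/checkpoint.safetensors",
    scaling_factor=0.13025,  # example value; the SDXL VAE uses this factor
    torch_dtype=torch.float16,
)

# `config_file` alias (patch 2); passing it together with
# `original_config_file` raises a ValueError.
vae = AutoencoderKL.from_single_file(
    "path/to/checkpoint.safetensors",
    config_file="path/to/v1-inference.yaml",
)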