25 changes: 18 additions & 7 deletions scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -207,12 +207,12 @@ def conv_attn_to_linear(checkpoint):
                 checkpoint[key] = checkpoint[key][:, :, 0]
 
 
-def create_unet_diffusers_config(original_config):
+def create_unet_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
-    model_params = original_config.model.params
     unet_params = original_config.model.params.unet_config.params
+    vae_params = original_config.model.params.first_stage_config.params.ddconfig
 
     block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
@@ -230,8 +230,10 @@ def create_unet_diffusers_config(original_config):
             up_block_types.append(block_type)
             resolution //= 2
 
+    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
+
     config = dict(
-        sample_size=model_params.image_size,
+        sample_size=image_size // vae_scale_factor,
         in_channels=unet_params.in_channels,
         out_channels=unet_params.out_channels,
         down_block_types=tuple(down_block_types),
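To make the new sample_size arithmetic concrete, here is a small worked example. The ch_mult value is the one the published Stable Diffusion VAE configs ship with (an assumption about the contents of original_config):

```python
# Worked example of the latent-space sample_size computation introduced above.
# Assumption: the LDM VAE config uses ch_mult = [1, 2, 4, 4], as in the
# published Stable Diffusion configs.
ch_mult = [1, 2, 4, 4]
vae_scale_factor = 2 ** (len(ch_mult) - 1)  # three downsampling stages -> 8

assert 512 // vae_scale_factor == 64  # SD v1.x / v2 Base: 64x64 latents
assert 768 // vae_scale_factor == 96  # SD v2 (768): 96x96 latents
```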
@@ -245,7 +247,7 @@ def create_unet_diffusers_config(original_config):
     return config
 
 
-def create_vae_diffusers_config(original_config):
+def create_vae_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
@@ -257,7 +259,7 @@ def create_vae_diffusers_config(original_config):
     up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
 
     config = dict(
-        sample_size=vae_params.resolution,
+        sample_size=image_size,
         in_channels=vae_params.in_channels,
         out_channels=vae_params.out_ch,
         down_block_types=tuple(down_block_types),
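Note the contrast with the UNet: the VAE encodes and decodes full-resolution images, so its sample_size stays in pixel space and the flag is passed through unchanged. A quick check, assuming original_config has been loaded from the LDM YAML with OmegaConf as elsewhere in this script:

```python
# Minimal sketch: the VAE sample_size is the pixel-space image size itself,
# with no division by vae_scale_factor.
vae_config = create_vae_diffusers_config(original_config, image_size=512)
assert vae_config["sample_size"] == 512
```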
@@ -653,6 +655,15 @@ def convert_ldm_clip_checkpoint(checkpoint):
         type=str,
         help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
     )
+    parser.add_argument(
+        "--image_size",
+        default=512,
[Review thread on the added default=512 line]
Contributor: Can we default to None and have the script detect whether it's v2 or v1?
Contributor: But let's merge now, I'll add this later :-)

+        type=int,
+        help=(
+            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
+            " Base. Use 768 for Stable Diffusion v2."
+        ),
+    )
     parser.add_argument(
         "--extract_ema",
         action="store_true",
@@ -712,7 +723,7 @@ def convert_ldm_clip_checkpoint(checkpoint):
         raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
 
     # Convert the UNet2DConditionModel model.
-    unet_config = create_unet_diffusers_config(original_config)
+    unet_config = create_unet_diffusers_config(original_config, image_size=args.image_size)
     converted_unet_checkpoint = convert_ldm_unet_checkpoint(
         checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
     )
@@ -721,7 +732,7 @@ def convert_ldm_clip_checkpoint(checkpoint):
     unet.load_state_dict(converted_unet_checkpoint)
 
     # Convert the VAE model.
-    vae_config = create_vae_diffusers_config(original_config)
+    vae_config = create_vae_diffusers_config(original_config, image_size=args.image_size)
     converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
 
     vae = AutoencoderKL(**vae_config)