From 1d13041aa231bc29faa0577927bb784bffc0c017 Mon Sep 17 00:00:00 2001
From: leejet
Date: Tue, 14 Oct 2025 23:12:39 +0800
Subject: [PATCH 1/3] fix: resolve precision issues in SDXL VAE under fp16

---
 README.md            |  1 -
 conditioner.hpp      |  2 +-
 ggml_extend.hpp      | 70 +++++++++++++++++++++++---------------------
 qwen_image.hpp       |  2 +-
 stable-diffusion.cpp | 12 ++++----
 vae.hpp              | 12 ++++++++
 6 files changed, 55 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 0a27bc1c0..516b71950 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,6 @@ API and command-line option may change frequently.***
 - Image Models
     - SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
     - SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
-        - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
     - [SD3/SD3.5](./docs/sd3.md)
     - [Flux-dev/Flux-schnell](./docs/flux.md)
    - [Chroma](./docs/chroma.md)
diff --git a/conditioner.hpp b/conditioner.hpp
index abd6dbc3f..4f9efb8cf 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1457,7 +1457,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                                       const ConditionerParams& conditioner_params) {
         std::string prompt;
         std::vector<std::pair<int, ggml_tensor*>> image_embeds;
-        size_t system_prompt_length = 0;
+        size_t system_prompt_length          = 0;
         int prompt_template_encode_start_idx = 34;
         if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
             LOG_INFO("QwenImageEditPlusPipeline");
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index ca91121dc..d8df0d8f6 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -975,38 +975,28 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       struct ggml_tensor* w,
                                                       struct ggml_tensor* b,
-                                                      int s0 = 1,
-                                                      int s1 = 1,
-                                                      int p0 = 0,
-                                                      int p1 = 0,
-                                                      int d0 = 1,
-                                                      int d1 = 1) {
-    x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
-    if (b != NULL) {
-        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
-        // b = ggml_repeat(ctx, b, x);
-        x = ggml_add_inplace(ctx, x, b);
+                                                      int s0      = 1,
+                                                      int s1      = 1,
+                                                      int p0      = 0,
+                                                      int p1      = 0,
+                                                      int d0      = 1,
+                                                      int d1      = 1,
+                                                      bool direct = false,
+                                                      float scale = 1.f) {
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, scale);
+    }
+    if (direct) {
+        x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    } else {
+        x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    }
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, 1.f / scale);
     }
-    return x;
-}
-
-// w: [OC*IC, KD, KH, KW]
-// x: [N*IC, ID, IH, IW]
-__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
-                                                             struct ggml_tensor* x,
-                                                             struct ggml_tensor* w,
-                                                             struct ggml_tensor* b,
-                                                             int s0 = 1,
-                                                             int s1 = 1,
-                                                             int p0 = 0,
-                                                             int p1 = 0,
-                                                             int d0 = 1,
-                                                             int d1 = 1) {
-    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
     if (b != NULL) {
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
-        // b = ggml_repeat(ctx, b, x);
-        x = ggml_add(ctx, x, b);
+        x = ggml_add_inplace(ctx, x, b);
     }
     return x;
 }
@@ -2067,6 +2057,7 @@ class Conv2d : public UnaryBlock {
     std::pair<int, int> dilation;
     bool bias;
     bool direct = false;
+    float scale = 1.f;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
         enum ggml_type wtype = GGML_TYPE_F16;
@@ -2097,6 +2088,10 @@ class Conv2d : public UnaryBlock {
         direct = true;
     }
 
+    void set_scale(float scale_value) {
+        scale = scale_value;
+    }
+
     std::string get_desc() {
         return "Conv2d";
     }
@@ -2107,11 +2102,18 @@ class Conv2d : public UnaryBlock {
         if (bias) {
             b = params["bias"];
         }
-        if (direct) {
-            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-        } else {
-            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-        }
+        return ggml_nn_conv_2d(ctx,
+                               x,
+                               w,
+                               b,
+                               stride.second,
+                               stride.first,
+                               padding.second,
+                               padding.first,
+                               dilation.second,
+                               dilation.first,
+                               direct,
+                               scale);
     }
 };
 
diff --git a/qwen_image.hpp b/qwen_image.hpp
index 630e5536e..ce4e62dce 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -535,7 +535,7 @@ namespace Qwen {
             }
         }
         LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
-        qwen_image = QwenImageModel(qwen_image_params);
+        qwen_image     = QwenImageModel(qwen_image_params);
         qwen_image.init(params_ctx, tensor_types, prefix);
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 42912805c..3de931437 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -330,13 +330,6 @@ class StableDiffusionGGML {
 
         if (sd_version_is_sdxl(version)) {
             scale_factor = 0.13025f;
-            if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) {
-                LOG_WARN(
-                    "!!!It looks like you are using SDXL model. "
-                    "If you find that the generated images are completely black, "
-                    "try specifying SDXL VAE FP16 Fix with the --vae parameter. "
-                    "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
-            }
         } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
         } else if (sd_version_is_flux(version)) {
@@ -517,6 +510,11 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the vae model");
                 first_stage_model->enable_conv2d_direct();
             }
+            if (version == VERSION_SDXL && strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0) {
+                float vae_conv_2d_scale = 1.f / 32.f;
+                LOG_WARN("No VAE specified with --vae, using Conv2D scale %.3f", vae_conv_2d_scale);
+                first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
+            }
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
diff --git a/vae.hpp b/vae.hpp
index 622b8bb93..20d97a2ad 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -530,6 +530,7 @@ struct VAE : public GGMLRunner {
                          struct ggml_context* output_ctx) = 0;
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
     virtual void enable_conv2d_direct(){};
+    virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
 };
 
 struct AutoEncoderKL : public VAE {
@@ -558,6 +559,17 @@ struct AutoEncoderKL : public VAE {
         }
     }
 
+    void set_conv2d_scale(float scale) {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->set_scale(scale);
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }
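Note (a sketch accompanying the patches, not part of them): the core of PATCH 1/3 is the `scale` parameter threaded into ggml_nn_conv_2d. The input is multiplied by `scale` before the convolution and by `1/scale` afterwards, before the bias is added, and the SDXL pipeline applies `scale = 1/32` to every Conv2d block of the VAE, replacing the old advice to download a separately fixed fp16 VAE. The toy C++ program below only illustrates why this helps, under the assumption from the removed README note that the conv path stores activations in fp16; the function name `conv_element` and the weight/activation magnitudes are invented for illustration, and none of this is ggml or stable-diffusion.cpp code. It assumes a compiler with _Float16 support (recent GCC or Clang on x86-64 or AArch64). An activation above the fp16 maximum (65504) collapses to inf as soon as it is stored in fp16 and poisons the whole output, whereas the pre-scaled activation stays representable and the full-precision rescale recovers a result close to the exact one.

// Toy sketch, not project code: weights and (scaled) inputs are stored in fp16,
// products are accumulated in fp32, and the input scaling is undone on the
// fp32 result, before any bias would be added.
#include <cstdio>

using half = _Float16;

// One convolution output element under a given input scale.
static float conv_element(const float* w, const float* x, int n, float scale) {
    float acc = 0.f;
    for (int i = 0; i < n; ++i) {
        half wi = static_cast<half>(w[i]);
        half xi = static_cast<half>(x[i] * scale);  // fp16 storage of the scaled input
        acc += static_cast<float>(wi) * static_cast<float>(xi);
    }
    return acc / scale;  // undo the input scaling in full precision
}

int main() {
    const int n = 3 * 3 * 512;  // one 3x3 conv output over 512 input channels
    static float w[n], x[n];
    for (int i = 0; i < n; ++i) {
        w[i] = 0.25f;  // made-up weight magnitude
        x[i] = 8.0f;   // ordinary activation magnitude
    }
    x[0] = 70000.0f;   // outlier activation beyond the fp16 maximum, rounds to inf in fp16

    double exact = 0.0;
    for (int i = 0; i < n; ++i) {
        exact += static_cast<double>(w[i]) * x[i];
    }

    printf("exact (fp64)       : %.1f\n", exact);
    printf("fp16, no scaling   : %f\n", conv_element(w, x, n, 1.f));          // becomes inf
    printf("fp16, scale = 1/32 : %.1f\n", conv_element(w, x, n, 1.f / 32.f)); // close to exact
    return 0;
}

Because the inverse scale is applied before ggml_add_inplace adds the bias, the bias itself is never scaled, matching the order of operations in the patched ggml_nn_conv_2d; the 1/32 factor mirrors the `vae_conv_2d_scale = 1.f / 32.f` that PATCH 1/3 applies when no external VAE is given and that PATCH 2/3 makes forceable via --force-sdxl-vae-conv-scale.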
From 80ecc326950bd44e43e2fcd60876a57836117d7d Mon Sep 17 00:00:00 2001
From: leejet
Date: Wed, 15 Oct 2025 21:36:58 +0800
Subject: [PATCH 2/3] add --force-sdxl-vae-conv-scale option

---
 examples/cli/main.cpp | 5 +++++
 stable-diffusion.cpp  | 8 ++++++--
 stable-diffusion.h    | 1 +
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index b1d83a06d..474d433af 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -128,6 +128,7 @@ struct SDParams {
     float flow_shift = INFINITY;
 
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    bool force_sdxl_vae_conv_scale       = false;
 
     SDParams() {
         sd_sample_params_init(&sample_params);
@@ -194,6 +195,7 @@ void print_params(SDParams params) {
     printf("    seed:                      %zd\n", params.seed);
     printf("    batch_count:               %d\n", params.batch_count);
     printf("    vae_tiling:                %s\n", params.vae_tiling_params.enabled ? "true" : "false");
+    printf("    force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false");
     printf("    upscale_repeats:           %d\n", params.upscale_repeats);
     printf("    chroma_use_dit_mask:       %s\n", params.chroma_use_dit_mask ? "true" : "false");
     printf("    chroma_use_t5_mask:        %s\n", params.chroma_use_t5_mask ? "true" : "false");
@@ -287,6 +289,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)\n");
     printf("  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n");
     printf("  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n");
+    printf("  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae\n");
     printf("  --vae-on-cpu                       keep vae in cpu (for low vram)\n");
     printf("  --clip-on-cpu                      keep clip in cpu (for low vram)\n");
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
@@ -557,6 +560,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
 
     options.bool_options = {
         {"", "--vae-tiling", "", true, &params.vae_tiling_params.enabled},
+        {"", "--force-sdxl-vae-conv-scale", "", true, &params.force_sdxl_vae_conv_scale},
         {"", "--offload-to-cpu", "", true, &params.offload_params_to_cpu},
         {"", "--control-net-cpu", "", true, &params.control_net_cpu},
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
@@ -1361,6 +1365,7 @@ int main(int argc, const char* argv[]) {
         params.diffusion_flash_attn,
         params.diffusion_conv_direct,
         params.vae_conv_direct,
+        params.force_sdxl_vae_conv_scale,
         params.chroma_use_dit_mask,
         params.chroma_use_t5_mask,
         params.chroma_t5_mask_pad,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 3de931437..8071f6f09 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -510,9 +510,13 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the vae model");
                 first_stage_model->enable_conv2d_direct();
             }
-            if (version == VERSION_SDXL && strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0) {
+            if (version == VERSION_SDXL &&
+                (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
                 float vae_conv_2d_scale = 1.f / 32.f;
-                LOG_WARN("No VAE specified with --vae, using Conv2D scale %.3f", vae_conv_2d_scale);
+                LOG_WARN(
+                    "No VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
+                    "using Conv2D scale %.3f",
+                    vae_conv_2d_scale);
                 first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
             }
             first_stage_model->alloc_params_buffer();
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 1d3ed85c1..4d6af69be 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -153,6 +153,7 @@ typedef struct {
     bool diffusion_flash_attn;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
+    bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;

From 34f006024de43a50574d2433c885193acf5ff75c Mon Sep 17 00:00:00 2001
From: leejet
Date: Wed, 15 Oct 2025 21:42:39 +0800
Subject: [PATCH 3/3] update docs

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 516b71950..18721eb27 100644
--- a/README.md
+++ b/README.md
@@ -363,6 +363,7 @@ arguments:
   --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)
   --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
   --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae
   --vae-on-cpu                       keep vae in cpu (for low vram)
   --clip-on-cpu                      keep clip in cpu (for low vram)
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)