diff --git a/examples/cli/README.md b/examples/cli/README.md index cf90b1e5f..4dcfb89b0 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -129,8 +129,8 @@ Generation Options: --hires-upscale-tile-size highres fix upscaler tile size, reserved for model-backed upscalers (default: 128) --cfg-scale unconditional guidance scale: (default: 7.0) - --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same - as --cfg-scale) + --img-cfg-scale image guidance scale for inpaint or image edit models: (default: same as + --cfg-scale) --guidance distilled guidance scale for models with guidance input (default: 3.5) --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium @@ -140,8 +140,8 @@ Generation Options: res_2s; 1 for euler_a, er_sde and dpm++2s_a) --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) - --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models - (default: same as --cfg-scale) + --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or image edit models (default: + same as --cfg-scale) --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 138b1e698..673699fde 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -940,7 +940,7 @@ ArgOptions SDGenerationParams::get_options() { &sample_params.guidance.txt_cfg}, {"", "--img-cfg-scale", - "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", + "image guidance scale for inpaint or image edit models: (default: same as --cfg-scale)", &sample_params.guidance.img_cfg}, {"", "--guidance", @@ -972,7 +972,7 @@ ArgOptions SDGenerationParams::get_options() { &high_noise_sample_params.guidance.txt_cfg}, {"", "--high-noise-img-cfg-scale", - "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", + "(high noise) image guidance scale for inpaint or image edit models (default: same as --cfg-scale)", &high_noise_sample_params.guidance.img_cfg}, {"", "--high-noise-guidance", diff --git a/examples/server/README.md b/examples/server/README.md index b9266ba5c..d971f5fe4 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -231,8 +231,8 @@ Default Generation Options: --hires-upscale-tile-size highres fix upscaler tile size, reserved for model-backed upscalers (default: 128) --cfg-scale unconditional guidance scale: (default: 7.0) - --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same - as --cfg-scale) + --img-cfg-scale image guidance scale for inpaint or image edit models: (default: same as + --cfg-scale) --guidance distilled guidance scale for models with guidance input (default: 3.5) --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium @@ -242,8 +242,8 @@ Default Generation Options: res_2s; 1 for euler_a, er_sde and dpm++2s_a) --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) - --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models - (default: same as --cfg-scale) + --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or image edit models (default: + same as --cfg-scale) --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 28598eb67..8cfc82733 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -109,6 +109,19 @@ const char* sampling_methods_str[] = { /*================================================== Helper Functions ================================================*/ +static bool sd_version_supports_ref_latent_img_cfg(SDVersion version) { + return version == VERSION_FLUX || + sd_version_is_flux2(version) || + sd_version_is_qwen_image(version) || + sd_version_is_longcat(version) || + sd_version_is_z_image(version); +} + +static bool sd_version_supports_img_cfg(SDVersion version, bool has_ref_images) { + return sd_version_is_inpaint_or_unet_edit(version) || + (has_ref_images && sd_version_supports_ref_latent_img_cfg(version)); +} + void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, float linear_end = 0.0120f, @@ -2059,13 +2072,19 @@ class StableDiffusionGGML { cond, &controls); + static const std::vector> empty_ref_latents; + bool uncond_without_ref_latents = !img_cond.empty() && + !ref_latents.empty() && + sd_version_supports_ref_latent_img_cfg(version); + auto run_condition = [&](const SDCondition& condition, - const sd::Tensor* c_concat_override = nullptr, - const std::vector* local_skip_layers = nullptr) -> sd::Tensor { + const sd::Tensor* c_concat_override = nullptr, + const std::vector* local_skip_layers = nullptr, + const std::vector>* ref_latents_override = nullptr) -> sd::Tensor { diffusion_params.context = condition.c_crossattn.empty() ? nullptr : &condition.c_crossattn; diffusion_params.c_concat = c_concat_override != nullptr ? c_concat_override : (condition.c_concat.empty() ? nullptr : &condition.c_concat); diffusion_params.y = condition.c_vector.empty() ? nullptr : &condition.c_vector; - diffusion_params.ref_latents = condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images; + diffusion_params.ref_latents = ref_latents_override != nullptr ? ref_latents_override : (condition.c_ref_images.empty() ? &ref_latents : &condition.c_ref_images); if (sd_version_is_unet(version)) { diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength}; @@ -2140,7 +2159,10 @@ class StableDiffusionGGML { LOG_DEBUG("Skipping layers at uncond step %d\n", step); uncond_skip_layers = &skip_layer_guidance.layers(); } - uncond_out = run_condition(uncond, nullptr, uncond_skip_layers); + uncond_out = run_condition(uncond, + nullptr, + uncond_skip_layers, + uncond_without_ref_latents ? &empty_ref_latents : nullptr); if (uncond_out.empty()) { return {}; } @@ -3149,6 +3171,7 @@ struct GenerationRequest { bool use_img_cond = false; bool use_high_noise_uncond = false; bool use_high_noise_img_cond = false; + bool has_ref_images = false; const sd_cache_params_t* cache_params = nullptr; int batch_count = 1; int shifted_timestep = 0; @@ -3182,6 +3205,7 @@ struct GenerationRequest { eta = sd_img_gen_params->sample_params.eta; increase_ref_index = sd_img_gen_params->increase_ref_index; auto_resize_ref_image = sd_img_gen_params->auto_resize_ref_image; + has_ref_images = sd_img_gen_params->ref_images_count > 0; guidance = sd_img_gen_params->sample_params.guidance; pm_params = sd_img_gen_params->pm_params; hires = sd_img_gen_params->hires; @@ -3305,17 +3329,22 @@ struct GenerationRequest { sd_guidance_params_t* guidance, bool* use_uncond, bool* use_img_cond, + bool has_ref_images, const char* stage_name = nullptr) { GGML_ASSERT(guidance != nullptr); GGML_ASSERT(use_uncond != nullptr); GGML_ASSERT(use_img_cond != nullptr); // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) // img_cfg == txt_cfg means that img_cfg is not used + bool img_cfg_was_unset = !std::isfinite(guidance->img_cfg); if (!std::isfinite(guidance->img_cfg)) { guidance->img_cfg = guidance->txt_cfg; } - if (!sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version)) { + if (!sd_version_supports_img_cfg(sd_ctx->sd->version, has_ref_images)) { + if (!img_cfg_was_unset && guidance->img_cfg != guidance->txt_cfg) { + LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance"); + } guidance->img_cfg = guidance->txt_cfg; } @@ -3344,12 +3373,13 @@ struct GenerationRequest { resolve_hires(); seed = resolve_seed(seed); - resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond); + resolve_guidance(sd_ctx, &guidance, &use_uncond, &use_img_cond, has_ref_images); if (sd_ctx->sd->high_noise_diffusion_model) { resolve_guidance(sd_ctx, &high_noise_guidance, &use_high_noise_uncond, &use_high_noise_img_cond, + has_ref_images, "high noise: "); } @@ -3949,6 +3979,7 @@ static std::optional prepare_image_generation_latents(sd LOG_WARN("This model needs at least one reference image; using an empty reference"); ref_images.push_back(sd::zeros({request->width, request->height, 3, 1})); request->guidance.img_cfg = request->guidance.txt_cfg; + request->use_img_cond = false; } if (!ref_images.empty()) { @@ -4104,6 +4135,10 @@ static std::optional prepare_image_generation_embeds(sd_c cond.c_concat = latents->concat_latent; // TODO: optimize } + bool use_ref_latent_img_cfg = request->use_img_cond && + !latents->ref_images.empty() && + sd_version_supports_ref_latent_img_cfg(sd_ctx->sd->version); + SDCondition uncond; if (request->use_uncond || request->use_high_noise_uncond) { bool zero_out_masked = false; @@ -4121,6 +4156,23 @@ static std::optional prepare_image_generation_embeds(sd_c } } + SDCondition img_cond; + if (request->use_img_cond) { + if (use_ref_latent_img_cfg) { + img_cond = uncond; + + std::vector> empty_ref_images; + condition_params.ref_images = &empty_ref_images; + uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(sd_ctx->sd->n_threads, + condition_params); + if (uncond.c_concat.empty()) { + uncond.c_concat = latents->uncond_concat_latent; // TODO: optimize + } + } else { + img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); + } + } + int64_t t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - prepare_start_ms) * 1.0f / 1000); @@ -4129,9 +4181,7 @@ static std::optional prepare_image_generation_embeds(sd_c } ImageGenerationEmbeds embeds; - if (request->use_img_cond) { - embeds.img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); - } + embeds.img_cond = std::move(img_cond); embeds.cond = std::move(cond); embeds.uncond = std::move(uncond); embeds.id_cond = std::move(id_cond);