From d8aa85274e653908f43caaf857047cbe6c3487be Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 00:14:39 -0300 Subject: [PATCH 1/7] refactor: move DDIM/TCD initial noise scaling out of the samplers We don't have the noise component isolated during img2img at the sampler's code, so move the initial scaling to the latent initialization code. Note that, for normal txt2img, this scale factor is very close to 1 due to the large initial sigma. But it gets larger for small sigmas, e.g. with low-strength i2i. --- src/denoiser.hpp | 8 ++------ src/stable-diffusion.cpp | 9 ++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/denoiser.hpp b/src/denoiser.hpp index c9c9d881d..da6fcc42b 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1311,9 +1311,7 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; int prev_timestep = timestep - TIMESTEPS / steps; float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - x *= std::sqrt(sigma * sigma + 1) / sigma; - } else { + if (i > 0) { x *= std::sqrt(sigma * sigma + 1); } @@ -1376,9 +1374,7 @@ static sd::Tensor sample_tcd(denoise_cb_t model, int timestep_s = (int)floor((1 - eta) * prev_timestep); float sigma = static_cast(compvis_sigmas[timestep]); - if (i == 0) { - x *= std::sqrt(sigma * sigma + 1) / sigma; - } else { + if (i > 0) { x *= std::sqrt(sigma * sigma + 1); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 683a07d53..b753b475e 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1619,8 +1619,15 @@ class StableDiffusionGGML { } int64_t t0 = ggml_time_us(); + + // scales the initial noise for DDIM and TCD + // NOTE: may not be strictly needed, since with the initial + // ~14.6 sigma, its value is very close to 1 (~1.002) + float ddim_noise_scale = ((method == DDIM_TRAILING_SAMPLE_METHOD || method == TCD_SAMPLE_METHOD) + ? std::sqrt(sigmas[0] * sigmas[0] + 1) / sigmas[0] + : 1.0); sd::Tensor x_t = !noise.empty() - ? denoiser->noise_scaling(sigmas[0], noise, init_latent) + ? denoiser->noise_scaling(sigmas[0] * ddim_noise_scale, noise, init_latent) : init_latent; sd::Tensor denoised = x_t; SamplePreviewContext preview = prepare_sample_preview_context(); From 6f7bfa3e493a0ae1f4b0b60b76fcdbbcae3e38bc Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 08:23:38 -0300 Subject: [PATCH 2/7] refactor: move DDIM/TCD scaling to the end of the loop Also tweaks the criteria to check for the last iteration, to make a follow-up change easier to read. --- src/denoiser.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/denoiser.hpp b/src/denoiser.hpp index da6fcc42b..c7f9c4df4 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1311,9 +1311,6 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; int prev_timestep = timestep - TIMESTEPS / steps; float sigma = static_cast(compvis_sigmas[timestep]); - if (i > 0) { - x *= std::sqrt(sigma * sigma + 1); - } auto model_output_opt = model(x, sigma, i + 1); if (model_output_opt.empty()) { @@ -1341,6 +1338,11 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, if (eta > 0) { x += std_dev_t * sd::Tensor::randn_like(x, rng); } + + if (prev_timestep >= 0) { + // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1 + x *= std::sqrt(1 / alpha_prod_t_prev); + } } return x; } @@ -1374,10 +1376,6 @@ static sd::Tensor sample_tcd(denoise_cb_t model, int timestep_s = (int)floor((1 - eta) * prev_timestep); float sigma = static_cast(compvis_sigmas[timestep]); - if (i > 0) { - x *= std::sqrt(sigma * sigma + 1); - } - auto model_output_opt = model(x, sigma, i + 1); if (model_output_opt.empty()) { return {}; @@ -1402,6 +1400,11 @@ static sd::Tensor sample_tcd(denoise_cb_t model, x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x + std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); } + + if (prev_timestep >= 0) { + // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1 + x *= std::sqrt(1 / alpha_prod_t_prev); + } } return x; } From 626efc1669325a2039b5c1d505e54803585440ea Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 08:38:01 -0300 Subject: [PATCH 3/7] fix: replace DDIM/TCD handling of out-of-bounds alpha_cumprod Instead of the arbitrarily-chosen first alpha_cumprod (which is very close to 1), use a constant 1, which is the value actually used at the start of the cumulative product calculations. This also avoids the need to check for the last iteration when scaling x for the next loop. --- src/denoiser.hpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/denoiser.hpp b/src/denoiser.hpp index c7f9c4df4..14ea7b5d5 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1320,7 +1320,7 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, model_output = (x - model_output) * (1.0f / sigma); float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast(alphas_cumprod[prev_timestep]) : 1.0f; float beta_prod_t = 1.0f - alpha_prod_t; sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - @@ -1339,10 +1339,7 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, x += std_dev_t * sd::Tensor::randn_like(x, rng); } - if (prev_timestep >= 0) { - // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1 - x *= std::sqrt(1 / alpha_prod_t_prev); - } + x *= std::sqrt(1 / alpha_prod_t_prev); } return x; } @@ -1385,7 +1382,7 @@ static sd::Tensor sample_tcd(denoise_cb_t model, float alpha_prod_t = static_cast(alphas_cumprod[timestep]); float beta_prod_t = 1.0f - alpha_prod_t; - float alpha_prod_t_prev = static_cast(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]); + float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast(alphas_cumprod[prev_timestep]) : 1.0f; float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); float beta_prod_s = 1.0f - alpha_prod_s; @@ -1401,10 +1398,7 @@ static sd::Tensor sample_tcd(denoise_cb_t model, std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); } - if (prev_timestep >= 0) { - // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1 - x *= std::sqrt(1 / alpha_prod_t_prev); - } + x *= std::sqrt(1 / alpha_prod_t_prev); } return x; } From fb1b3fddce358841aceb13c380c9e0903f6f8ed3 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 10:16:30 -0300 Subject: [PATCH 4/7] fix: use the simple scheduler with DDIM Apart from the rounding criteria, Simple is equivalent to the hardcoded DDIM scheduler. So, drop the local compvis_sigmas and alphas_cumprod tables, and recover the alpha_cumprod values from the provided sigmas vector. This partially fixes DDIM behavior with image to image, since we rely on the input sigmas vector to provide the appropriate noise levels. It also allows combining DDIM with different schedulers. --- src/denoiser.hpp | 25 +++++-------------------- src/stable-diffusion.cpp | 2 ++ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/denoiser.hpp b/src/denoiser.hpp index 14ea7b5d5..e45e3535c 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1290,27 +1290,12 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, const std::vector& sigmas, std::shared_ptr rng, float eta) { - float beta_start = 0.00085f; - float beta_end = 0.0120f; - std::vector alphas_cumprod(TIMESTEPS); - std::vector compvis_sigmas(TIMESTEPS); - for (int i = 0; i < TIMESTEPS; i++) { - alphas_cumprod[i] = - (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * - (1.0f - - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), - 2)); - compvis_sigmas[i] = - std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); - } int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - int timestep = static_cast(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1; - int prev_timestep = timestep - TIMESTEPS / steps; - float sigma = static_cast(compvis_sigmas[timestep]); + + float sigma = sigmas[i]; + float sigma_to = sigmas[i + 1]; auto model_output_opt = model(x, sigma, i + 1); if (model_output_opt.empty()) { @@ -1319,8 +1304,8 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, sd::Tensor model_output = std::move(model_output_opt); model_output = (x - model_output) * (1.0f / sigma); - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); - float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast(alphas_cumprod[prev_timestep]) : 1.0f; + float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f); + float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f); float beta_prod_t = 1.0f - alpha_prod_t; sd::Tensor pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) - diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index b753b475e..d01ff52a5 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2424,6 +2424,8 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me } if (sample_method == LCM_SAMPLE_METHOD) { return LCM_SCHEDULER; + } else if (sample_method == DDIM_TRAILING_SAMPLE_METHOD) { + return SIMPLE_SCHEDULER; } return DISCRETE_SCHEDULER; } From b660006212387df09f29bfe2762cbf1052167a6d Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 11:42:39 -0300 Subject: [PATCH 5/7] fix: use the LCM scheduler with TCD LCM and TCD timesteps are identical, so adapt the TCD code to use the provided sigmas vector, keeping the internal tables to obtain the timesteps from the sigmas. As with DDIM, this partially fixes TCD for image to image operations. An alternative could be using the input sigmas to obtain the timestep and prev_timestep values, then obtain new sigma and alpha_cumprod values from the tables, keeping most of the code as-is. It'd be more complex, though: input sigmas that happened to be too close to one another could end up rounded to the same timestep value, and it's not clear how to best guard against that. --- src/denoiser.hpp | 28 ++++++++++++++++++++-------- src/stable-diffusion.cpp | 2 +- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/denoiser.hpp b/src/denoiser.hpp index e45e3535c..fd30fb573 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1350,13 +1350,25 @@ static sd::Tensor sample_tcd(denoise_cb_t model, std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); } - int original_steps = 50; - int steps = static_cast(sigmas.size()) - 1; + auto get_timestep_from_sigma = [&](float s) -> int { + auto it = std::lower_bound(compvis_sigmas.begin(), compvis_sigmas.end(), s); + if (it == compvis_sigmas.begin()) return 0; + if (it == compvis_sigmas.end()) return TIMESTEPS - 1; + int idx_high = static_cast(std::distance(compvis_sigmas.begin(), it)); + int idx_low = idx_high - 1; + if (std::abs(compvis_sigmas[idx_high] - s) < std::abs(compvis_sigmas[idx_low] - s)) { + return idx_high; + } + return idx_low; + }; + + int steps = static_cast(sigmas.size()) - 1; for (int i = 0; i < steps; i++) { - int timestep = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps)); - int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); + + float sigma_to = sigmas[i + 1]; + int prev_timestep = get_timestep_from_sigma(sigma_to); int timestep_s = (int)floor((1 - eta) * prev_timestep); - float sigma = static_cast(compvis_sigmas[timestep]); + float sigma = sigmas[i]; auto model_output_opt = model(x, sigma, i + 1); if (model_output_opt.empty()) { @@ -1365,9 +1377,9 @@ static sd::Tensor sample_tcd(denoise_cb_t model, sd::Tensor model_output = std::move(model_output_opt); model_output = (x - model_output) * (1.0f / sigma); - float alpha_prod_t = static_cast(alphas_cumprod[timestep]); + float alpha_prod_t = 1.0f / (sigma * sigma + 1.0f); float beta_prod_t = 1.0f - alpha_prod_t; - float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast(alphas_cumprod[prev_timestep]) : 1.0f; + float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f); float alpha_prod_s = static_cast(alphas_cumprod[timestep_s]); float beta_prod_s = 1.0f - alpha_prod_s; @@ -1378,7 +1390,7 @@ static sd::Tensor sample_tcd(denoise_cb_t model, x = std::sqrt(alpha_prod_s) * pred_original_sample + std::sqrt(beta_prod_s) * model_output; - if (eta > 0 && i != steps - 1) { + if (eta > 0 && sigma_to > 0.0f) { x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x + std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index d01ff52a5..ea1a6eef9 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2422,7 +2422,7 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me return EXPONENTIAL_SCHEDULER; } } - if (sample_method == LCM_SAMPLE_METHOD) { + if (sample_method == LCM_SAMPLE_METHOD || sample_method == TCD_SAMPLE_METHOD) { return LCM_SCHEDULER; } else if (sample_method == DDIM_TRAILING_SAMPLE_METHOD) { return SIMPLE_SCHEDULER; From 3827d1b2a383c08c5ca1fe42c5c900f42c6d4efe Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 09:59:13 -0300 Subject: [PATCH 6/7] refactor: fold DDIM/TCD scaling into the x update calculations --- src/denoiser.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/denoiser.hpp b/src/denoiser.hpp index fd30fb573..e3fc82464 100644 --- a/src/denoiser.hpp +++ b/src/denoiser.hpp @@ -1317,14 +1317,13 @@ static sd::Tensor sample_ddim_trailing(denoise_cb_t model, (1.0f - alpha_prod_t / alpha_prod_t_prev); float std_dev_t = eta * std::sqrt(variance); - x = std::sqrt(alpha_prod_t_prev) * pred_original_sample + - std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output; + x = pred_original_sample + + std::sqrt((1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2))/ alpha_prod_t_prev) * model_output; if (eta > 0) { - x += std_dev_t * sd::Tensor::randn_like(x, rng); + x+= std_dev_t / std::sqrt(alpha_prod_t_prev) * sd::Tensor::randn_like(x, rng); } - x *= std::sqrt(1 / alpha_prod_t_prev); } return x; } @@ -1387,15 +1386,14 @@ static sd::Tensor sample_tcd(denoise_cb_t model, std::sqrt(beta_prod_t) * model_output) * (1.0f / std::sqrt(alpha_prod_t)); - x = std::sqrt(alpha_prod_s) * pred_original_sample + - std::sqrt(beta_prod_s) * model_output; + x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * pred_original_sample + + std::sqrt(beta_prod_s / alpha_prod_t_prev) * model_output; if (eta > 0 && sigma_to > 0.0f) { x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x + - std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor::randn_like(x, rng); + std::sqrt(1.0f / alpha_prod_t_prev - 1.0f / alpha_prod_s) * sd::Tensor::randn_like(x, rng); } - x *= std::sqrt(1 / alpha_prod_t_prev); } return x; } From 64d01533fa9d94696b4ffafbb7327d5201e4804a Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 11 Apr 2026 15:55:29 -0300 Subject: [PATCH 7/7] fix: correct DDIM/TCD for low-strength image to image As explained in a previous commit, the initial noise scaling has very little effect for text to image; but for image to image, the lower the denoising strength, the stronger it gets, until the model isn't able to compensate for it. I've placed this change at the end of the series to make it easier to test the results. --- src/stable-diffusion.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index ea1a6eef9..c07531e82 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1619,15 +1619,8 @@ class StableDiffusionGGML { } int64_t t0 = ggml_time_us(); - - // scales the initial noise for DDIM and TCD - // NOTE: may not be strictly needed, since with the initial - // ~14.6 sigma, its value is very close to 1 (~1.002) - float ddim_noise_scale = ((method == DDIM_TRAILING_SAMPLE_METHOD || method == TCD_SAMPLE_METHOD) - ? std::sqrt(sigmas[0] * sigmas[0] + 1) / sigmas[0] - : 1.0); sd::Tensor x_t = !noise.empty() - ? denoiser->noise_scaling(sigmas[0] * ddim_noise_scale, noise, init_latent) + ? denoiser->noise_scaling(sigmas[0], noise, init_latent) : init_latent; sd::Tensor denoised = x_t; SamplePreviewContext preview = prepare_sample_preview_context();