From d8aa85274e653908f43caaf857047cbe6c3487be Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 00:14:39 -0300
Subject: [PATCH 1/7] refactor: move DDIM/TCD initial noise scaling out of the
 samplers

The sampler code has no access to the isolated noise component
during img2img, so move the initial scaling to the latent
initialization code.

Note that, for normal txt2img, this scale factor is very close
to 1 due to the large initial sigma, but it gets larger for
small sigmas, e.g. with low-strength img2img.
---
 src/denoiser.hpp         | 8 ++------
 src/stable-diffusion.cpp | 9 ++++++++-
 2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index c9c9d881d..da6fcc42b 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -1311,9 +1311,7 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
         int timestep      = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
         int prev_timestep = timestep - TIMESTEPS / steps;
         float sigma       = static_cast<float>(compvis_sigmas[timestep]);
-        if (i == 0) {
-            x *= std::sqrt(sigma * sigma + 1) / sigma;
-        } else {
+        if (i > 0) {
             x *= std::sqrt(sigma * sigma + 1);
         }
 
@@ -1376,9 +1374,7 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
         int timestep_s    = (int)floor((1 - eta) * prev_timestep);
         float sigma       = static_cast<float>(compvis_sigmas[timestep]);
 
-        if (i == 0) {
-            x *= std::sqrt(sigma * sigma + 1) / sigma;
-        } else {
+        if (i > 0) {
             x *= std::sqrt(sigma * sigma + 1);
         }
 
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 683a07d53..b753b475e 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -1619,8 +1619,15 @@ class StableDiffusionGGML {
         }
 
         int64_t t0                   = ggml_time_us();
+
+        // scales the initial noise for DDIM and TCD
+        // NOTE: may not be strictly needed, since with the initial
+        // ~14.6 sigma, its value is very close to 1 (~1.002)
+        float ddim_noise_scale       = ((method == DDIM_TRAILING_SAMPLE_METHOD || method == TCD_SAMPLE_METHOD)
+                                           ? std::sqrt(sigmas[0] * sigmas[0] + 1) / sigmas[0]
+                                           : 1.0);
         sd::Tensor<float> x_t        = !noise.empty()
-                                           ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
+                                           ? denoiser->noise_scaling(sigmas[0] * ddim_noise_scale, noise, init_latent)
                                            : init_latent;
         sd::Tensor<float> denoised   = x_t;
         SamplePreviewContext preview = prepare_sample_preview_context();

From 6f7bfa3e493a0ae1f4b0b60b76fcdbbcae3e38bc Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 08:23:38 -0300
Subject: [PATCH 2/7] refactor: move DDIM/TCD scaling to the end of the loop

Also tweak the criterion used to detect the last iteration, to
make a follow-up change easier to read.
---
 src/denoiser.hpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index da6fcc42b..c7f9c4df4 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -1311,9 +1311,6 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
         int timestep      = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
         int prev_timestep = timestep - TIMESTEPS / steps;
         float sigma       = static_cast<float>(compvis_sigmas[timestep]);
-        if (i > 0) {
-            x *= std::sqrt(sigma * sigma + 1);
-        }
 
         auto model_output_opt = model(x, sigma, i + 1);
         if (model_output_opt.empty()) {
@@ -1341,6 +1338,11 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
         if (eta > 0) {
             x += std_dev_t * sd::Tensor<float>::randn_like(x, rng);
         }
+
+        if (prev_timestep >= 0) {
+            // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1
+            x *= std::sqrt(1 / alpha_prod_t_prev);
+        }
     }
     return x;
 }
@@ -1374,10 +1376,6 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
         int timestep_s    = (int)floor((1 - eta) * prev_timestep);
         float sigma       = static_cast<float>(compvis_sigmas[timestep]);
 
-        if (i > 0) {
-            x *= std::sqrt(sigma * sigma + 1);
-        }
-
         auto model_output_opt = model(x, sigma, i + 1);
         if (model_output_opt.empty()) {
             return {};
@@ -1402,6 +1400,11 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
             x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x +
                 std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
         }
+
+        if (prev_timestep >= 0) {
+            // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1
+            x *= std::sqrt(1 / alpha_prod_t_prev);
+        }
     }
     return x;
 }

From 626efc1669325a2039b5c1d505e54803585440ea Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 08:38:01 -0300
Subject: [PATCH 3/7] fix: replace DDIM/TCD handling of out-of-bounds
 alpha_cumprod

Instead of the arbitrarily-chosen first alpha_cumprod (which is
very close to 1), use a constant 1, which is the value actually
used at the start of the cumulative product calculations.

This also avoids the need to check for the last iteration when
scaling x for the next loop.
---
 src/denoiser.hpp | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index c7f9c4df4..14ea7b5d5 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -1320,7 +1320,7 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
         model_output                   = (x - model_output) * (1.0f / sigma);
 
         float alpha_prod_t      = static_cast<float>(alphas_cumprod[timestep]);
-        float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
+        float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast<float>(alphas_cumprod[prev_timestep]) : 1.0f;
         float beta_prod_t       = 1.0f - alpha_prod_t;
 
         sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -
@@ -1339,10 +1339,7 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
             x += std_dev_t * sd::Tensor<float>::randn_like(x, rng);
         }
 
-        if (prev_timestep >= 0) {
-            // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1
-            x *= std::sqrt(1 / alpha_prod_t_prev);
-        }
+        x *= std::sqrt(1 / alpha_prod_t_prev);
     }
     return x;
 }
@@ -1385,7 +1382,7 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
 
         float alpha_prod_t      = static_cast<float>(alphas_cumprod[timestep]);
         float beta_prod_t       = 1.0f - alpha_prod_t;
-        float alpha_prod_t_prev = static_cast<float>(prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]);
+        float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast<float>(alphas_cumprod[prev_timestep]) : 1.0f;
         float alpha_prod_s      = static_cast<float>(alphas_cumprod[timestep_s]);
         float beta_prod_s       = 1.0f - alpha_prod_s;
 
@@ -1401,10 +1398,7 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
                 std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
         }
 
-        if (prev_timestep >= 0) {
-            // sigma_prev * sigma_prev + 1 = (1 - alpha_prod_t_prev) / alpha_prod_t_prev + 1
-            x *= std::sqrt(1 / alpha_prod_t_prev);
-        }
+        x *= std::sqrt(1 / alpha_prod_t_prev);
     }
     return x;
 }

From fb1b3fddce358841aceb13c380c9e0903f6f8ed3 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 10:16:30 -0300
Subject: [PATCH 4/7] fix: use the simple scheduler with DDIM

Apart from the rounding criterion, the Simple scheduler is
equivalent to the hardcoded DDIM schedule. So, drop the local
compvis_sigmas and alphas_cumprod tables, and recover the
alpha_cumprod values from the provided sigmas vector
(alpha_cumprod = 1 / (sigma^2 + 1)).

This partially fixes DDIM behavior with image to image, since
we rely on the input sigmas vector to provide the appropriate
noise levels. It also allows combining DDIM with different
schedulers.
---
 src/denoiser.hpp         | 25 +++++--------------------
 src/stable-diffusion.cpp |  2 ++
 2 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index 14ea7b5d5..e45e3535c 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -1290,27 +1290,12 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
                                               const std::vector<float>& sigmas,
                                               std::shared_ptr<RNG> rng,
                                               float eta) {
-    float beta_start = 0.00085f;
-    float beta_end   = 0.0120f;
-    std::vector<double> alphas_cumprod(TIMESTEPS);
-    std::vector<double> compvis_sigmas(TIMESTEPS);
-    for (int i = 0; i < TIMESTEPS; i++) {
-        alphas_cumprod[i] =
-            (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
-            (1.0f -
-             std::pow(sqrtf(beta_start) +
-                          (sqrtf(beta_end) - sqrtf(beta_start)) *
-                              ((float)i / (TIMESTEPS - 1)),
-                      2));
-        compvis_sigmas[i] =
-            std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]);
-    }
 
     int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        int timestep      = static_cast<int>(roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps))) - 1;
-        int prev_timestep = timestep - TIMESTEPS / steps;
-        float sigma       = static_cast<float>(compvis_sigmas[timestep]);
+
+        float sigma       = sigmas[i];
+        float sigma_to    = sigmas[i + 1];
 
         auto model_output_opt = model(x, sigma, i + 1);
         if (model_output_opt.empty()) {
@@ -1319,8 +1304,8 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
         sd::Tensor<float> model_output = std::move(model_output_opt);
         model_output                   = (x - model_output) * (1.0f / sigma);
 
-        float alpha_prod_t      = static_cast<float>(alphas_cumprod[timestep]);
-        float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast<float>(alphas_cumprod[prev_timestep]) : 1.0f;
+        float alpha_prod_t      = 1.0f / (sigma * sigma + 1.0f);
+        float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f);
         float beta_prod_t       = 1.0f - alpha_prod_t;
 
         sd::Tensor<float> pred_original_sample = ((x / std::sqrt(sigma * sigma + 1)) -
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index b753b475e..d01ff52a5 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -2424,6 +2424,8 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me
     }
     if (sample_method == LCM_SAMPLE_METHOD) {
         return LCM_SCHEDULER;
+    } else if (sample_method == DDIM_TRAILING_SAMPLE_METHOD) {
+        return SIMPLE_SCHEDULER;
     }
     return DISCRETE_SCHEDULER;
 }

From b660006212387df09f29bfe2762cbf1052167a6d Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 11:42:39 -0300
Subject: [PATCH 5/7] fix: use the LCM scheduler with TCD

LCM and TCD timesteps are identical, so adapt the TCD code to use
the provided sigmas vector, keeping the internal tables to obtain
the timesteps from the sigmas.

As with DDIM, this partially fixes TCD for image to image operations.

An alternative could be using the input sigmas to obtain the timestep
and prev_timestep values, then obtain new sigma and alpha_cumprod
values from the tables, keeping most of the code as-is. It'd be more
complex, though: input sigmas that happened to be too close to one
another could end up rounded to the same timestep value, and it's
not clear how to best guard against that.
---
 src/denoiser.hpp         | 28 ++++++++++++++++++++--------
 src/stable-diffusion.cpp |  2 +-
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index e45e3535c..fd30fb573 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -1350,13 +1350,25 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
             std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]);
     }
 
-    int original_steps = 50;
-    int steps          = static_cast<int>(sigmas.size()) - 1;
+    auto get_timestep_from_sigma = [&](float s) -> int {
+        auto it = std::lower_bound(compvis_sigmas.begin(), compvis_sigmas.end(), s);
+        if (it == compvis_sigmas.begin()) return 0;
+        if (it == compvis_sigmas.end()) return TIMESTEPS - 1;
+        int idx_high = static_cast<int>(std::distance(compvis_sigmas.begin(), it));
+        int idx_low  = idx_high - 1;
+        if (std::abs(compvis_sigmas[idx_high] - s) < std::abs(compvis_sigmas[idx_low] - s)) {
+            return idx_high;
+        }
+        return idx_low;
+    };
+
+    int steps = static_cast<int>(sigmas.size()) - 1;
     for (int i = 0; i < steps; i++) {
-        int timestep      = TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor(i * ((float)original_steps / steps));
-        int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
+
+        float sigma_to    = sigmas[i + 1];
+        int prev_timestep = get_timestep_from_sigma(sigma_to);
         int timestep_s    = (int)floor((1 - eta) * prev_timestep);
-        float sigma       = static_cast<float>(compvis_sigmas[timestep]);
+        float sigma       = sigmas[i];
 
         auto model_output_opt = model(x, sigma, i + 1);
         if (model_output_opt.empty()) {
@@ -1365,9 +1377,9 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
         sd::Tensor<float> model_output = std::move(model_output_opt);
         model_output                   = (x - model_output) * (1.0f / sigma);
 
-        float alpha_prod_t      = static_cast<float>(alphas_cumprod[timestep]);
+        float alpha_prod_t      = 1.0f / (sigma * sigma + 1.0f);
         float beta_prod_t       = 1.0f - alpha_prod_t;
-        float alpha_prod_t_prev = prev_timestep >= 0 ? static_cast<float>(alphas_cumprod[prev_timestep]) : 1.0f;
+        float alpha_prod_t_prev = 1.0f / (sigma_to * sigma_to + 1.0f);
         float alpha_prod_s      = static_cast<float>(alphas_cumprod[timestep_s]);
         float beta_prod_s       = 1.0f - alpha_prod_s;
 
@@ -1378,7 +1390,7 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
         x = std::sqrt(alpha_prod_s) * pred_original_sample +
             std::sqrt(beta_prod_s) * model_output;
 
-        if (eta > 0 && i != steps - 1) {
+        if (eta > 0 && sigma_to > 0.0f) {
             x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x +
                 std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
         }
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index d01ff52a5..ea1a6eef9 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -2422,7 +2422,7 @@ enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_me
             return EXPONENTIAL_SCHEDULER;
         }
     }
-    if (sample_method == LCM_SAMPLE_METHOD) {
+    if (sample_method == LCM_SAMPLE_METHOD || sample_method == TCD_SAMPLE_METHOD) {
         return LCM_SCHEDULER;
     } else if (sample_method == DDIM_TRAILING_SAMPLE_METHOD) {
         return SIMPLE_SCHEDULER;

From 3827d1b2a383c08c5ca1fe42c5c900f42c6d4efe Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 09:59:13 -0300
Subject: [PATCH 6/7] refactor: fold DDIM/TCD scaling into the x update
 calculations

---
 src/denoiser.hpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index fd30fb573..e3fc82464 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -1317,14 +1317,13 @@ static sd::Tensor<float> sample_ddim_trailing(denoise_cb_t model,
                          (1.0f - alpha_prod_t / alpha_prod_t_prev);
         float std_dev_t = eta * std::sqrt(variance);
 
-        x = std::sqrt(alpha_prod_t_prev) * pred_original_sample +
-            std::sqrt(1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2)) * model_output;
+        x = pred_original_sample +
+            std::sqrt((1.0f - alpha_prod_t_prev - std::pow(std_dev_t, 2))/ alpha_prod_t_prev) * model_output;
 
         if (eta > 0) {
-            x += std_dev_t * sd::Tensor<float>::randn_like(x, rng);
+            x+= std_dev_t / std::sqrt(alpha_prod_t_prev) * sd::Tensor<float>::randn_like(x, rng);
         }
 
-        x *= std::sqrt(1 / alpha_prod_t_prev);
     }
     return x;
 }
@@ -1387,15 +1386,14 @@ static sd::Tensor<float> sample_tcd(denoise_cb_t model,
                                                   std::sqrt(beta_prod_t) * model_output) *
                                                  (1.0f / std::sqrt(alpha_prod_t));
 
-        x = std::sqrt(alpha_prod_s) * pred_original_sample +
-            std::sqrt(beta_prod_s) * model_output;
+        x = std::sqrt(alpha_prod_s / alpha_prod_t_prev) * pred_original_sample +
+            std::sqrt(beta_prod_s / alpha_prod_t_prev) * model_output;
 
         if (eta > 0 && sigma_to > 0.0f) {
             x = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * x +
-                std::sqrt(1.0f - alpha_prod_t_prev / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
+                std::sqrt(1.0f / alpha_prod_t_prev - 1.0f / alpha_prod_s) * sd::Tensor<float>::randn_like(x, rng);
         }
 
-        x *= std::sqrt(1 / alpha_prod_t_prev);
     }
     return x;
 }

From 64d01533fa9d94696b4ffafbb7327d5201e4804a Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 11 Apr 2026 15:55:29 -0300
Subject: [PATCH 7/7] fix: correct DDIM/TCD for low-strength image to image

As explained in a previous commit, the initial noise scaling
has very little effect for text to image; but for image to
image, the lower the denoising strength, the stronger it gets,
until the model isn't able to compensate for it. So remove the
scaling and pass sigmas[0] to noise_scaling() unchanged.

I've placed this change at the end of the series to make it
easier to test the results.
---
 src/stable-diffusion.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index ea1a6eef9..c07531e82 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -1619,15 +1619,8 @@ class StableDiffusionGGML {
         }
 
         int64_t t0                   = ggml_time_us();
-
-        // scales the initial noise for DDIM and TCD
-        // NOTE: may not be strictly needed, since with the initial
-        // ~14.6 sigma, its value is very close to 1 (~1.002)
-        float ddim_noise_scale       = ((method == DDIM_TRAILING_SAMPLE_METHOD || method == TCD_SAMPLE_METHOD)
-                                           ? std::sqrt(sigmas[0] * sigmas[0] + 1) / sigmas[0]
-                                           : 1.0);
         sd::Tensor<float> x_t        = !noise.empty()
-                                           ? denoiser->noise_scaling(sigmas[0] * ddim_noise_scale, noise, init_latent)
+                                           ? denoiser->noise_scaling(sigmas[0], noise, init_latent)
                                            : init_latent;
         sd::Tensor<float> denoised   = x_t;
         SamplePreviewContext preview = prepare_sample_preview_context();