From 8db26cd0fa0ee4e6ed4995228ef4eab8b03ab054 Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Sun, 16 Nov 2025 14:30:50 +0800
Subject: [PATCH] feat: add cpu rng

---
 README.md              |   4 +-
 examples/cli/README.md |   2 +-
 examples/cli/main.cpp  |   2 +-
 rng_mt19937.hpp        | 147 +++++++++++++++++++++++++++++++++++++++++
 stable-diffusion.cpp   |   4 ++
 stable-diffusion.h     |   1 +
 6 files changed, 157 insertions(+), 3 deletions(-)
 create mode 100644 rng_mt19937.hpp
diff --git a/README.md b/README.md
index 5cc6e4458..058cd5818 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,9 @@ API and command-line option may change frequently.***
     - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
     - `DPM++ 2S a`
     - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
-- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Cross-platform reproducibility
+    - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
+    - `--rng cpu`, consistent with the `comfyui RNG`
 - Embedds generation parameters into png output as webui-compatible text string
 
 ## Quick Start
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 84df1a1e8..30f3489a4 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -94,7 +94,7 @@ Options:
   -M, --mode                               run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
   --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                            type of the weight file
-  --rng                                    RNG, one of [std_default, cuda], default: cuda
+  --rng                                    RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)
   -s, --seed                               RNG seed (default: 42, use random seed for < 0)
   --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                            tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index a2df09497..333e7ac71 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1124,7 +1124,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          on_type_arg},
         {"",
          "--rng",
-         "RNG, one of [std_default, cuda], default: cuda",
+         "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
          on_rng_arg},
         {"-s",
          "--seed",
diff --git a/rng_mt19937.hpp b/rng_mt19937.hpp
new file mode 100644
index 000000000..7e6199886
--- /dev/null
+++ b/rng_mt19937.hpp
@@ -0,0 +1,147 @@
+#ifndef __RNG_MT19937_HPP__
+#define __RNG_MT19937_HPP__
+
+#include <cmath>
+#include <vector>
+
+#include "rng.hpp"
+
+// RNG imitating PyTorch's CPU randn.
+// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine
+// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution
+class MT19937RNG : public RNG {  // MT19937-based RNG whose randn() mirrors PyTorch's CPU normal sampling
+    static const int N             = 624;          // number of 32-bit words in the generator state
+    static const int M             = 397;          // word offset used by the recurrence in next_state()
+    static const uint32_t MATRIX_A = 0x9908b0dfU;  // XORed in by twist() when the low bit of v is set
+    static const uint32_t UMASK    = 0x80000000U;  // most significant bit
+    static const uint32_t LMASK    = 0x7fffffffU;  // least significant 31 bits
+
+    struct State {
+        uint64_t seed_;                  // seed given to the last manual_seed() call
+        int left_;                       // countdown; rand_uint32() calls next_state() when it reaches 0
+        bool seeded_;                    // set to true by manual_seed()
+        uint32_t next_;                  // index into state_ of the next word to output
+        std::array<uint32_t, N> state_;  // generator state words
+        bool has_next_gauss = false;     // true while next_gauss holds an unused sample
+        double next_gauss   = 0.0;       // cached second Box-Muller sample, stored unscaled (no mean/std)
+    };
+
+    State s;
+
+    uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); }  // top bit of u, low 31 bits of v
+    uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); }
+    void next_state() {  // regenerates all N state words in place and rewinds the output index
+        uint32_t* p = s.state_.data();
+        s.left_     = N;
+        s.next_     = 0;
+        for (int j = N - M + 1; --j; p++)  // first N - M words: partner word is M ahead
+            p[0] = p[M] ^ twist(p[0], p[1]);
+        for (int j = M; --j; p++)  // next M - 1 words: partner word wraps around (M - N is negative)
+            p[0] = p[M - N] ^ twist(p[0], p[1]);
+        p[0] = p[M - N] ^ twist(p[0], s.state_[0]);  // last word pairs with the already-updated first word
+    }
+
+    uint32_t rand_uint32() {  // returns the next tempered 32-bit output
+        if (--s.left_ == 0)
+            next_state();
+        uint32_t y = s.state_[s.next_++];
+        y ^= (y >> 11);
+        y ^= (y << 7) & 0x9d2c5680U;
+        y ^= (y << 15) & 0xefc60000U;
+        y ^= (y >> 18);
+        return y;
+    }
+
+    uint64_t rand_uint64() {  // two consecutive 32-bit outputs; the first drawn becomes the high half
+        uint64_t high = static_cast<uint64_t>(rand_uint32());
+        uint64_t low  = static_cast<uint64_t>(rand_uint32());
+        return (high << 32) | low;
+    }
+
+    template <typename T, typename V>
+    T uniform_real(V val, T from, T to) {  // maps the low mantissa-width bits of val into [from, to)
+        constexpr auto MASK    = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
+        constexpr auto DIVISOR = static_cast<T>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
+        T x                    = (val & MASK) * DIVISOR;
+        return (x * (to - from) + from);
+    }
+
+    double normal_double_value(double mean, double std) {  // one N(mean, std) sample via Box-Muller, in double
+        if (s.has_next_gauss) {
+            s.has_next_gauss = false;
+            return s.next_gauss * std + mean;  // scale with THIS call's parameters, as PyTorch does
+        }
+        double u1 = uniform_real(rand_uint64(), 0., 1.);  // double
+        double u2 = uniform_real(rand_uint64(), 0., 1.);  // double
+
+        double r         = std::sqrt(-2.0 * std::log1p(-u2));
+        double theta     = 2.0 * 3.14159265358979323846 * u1;
+        double value     = r * std::cos(theta) * std + mean;
+        s.next_gauss     = r * std::sin(theta);  // keep the raw sample; mean/std are applied when it is consumed
+        s.has_next_gauss = true;
+        return value;
+    }
+
+    void normal_fill_16(float* data, float mean, float std) {  // turns 16 uniforms in data[0..15] into 16 normals in place
+        for (int j = 0; j < 8; ++j) {
+            float u1    = 1.0f - data[j];  // [0, 1) -> (0, 1] so that log() never sees 0
+            float u2    = data[j + 8];
+            float r     = std::sqrt(-2.0f * std::log(u1));
+            float theta = 2.0f * 3.14159265358979323846 * u2;  // evaluated in double, then narrowed to float
+            data[j]     = r * std::cos(theta) * std + mean;
+            data[j + 8] = r * std::sin(theta) * std + mean;
+        }
+    }
+
+    void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) {  // fills data[0..size) with normals
+        if (size >= 16) {
+            for (int64_t i = 0; i < size; i++) {  // pass 1: one 32-bit draw per element
+                data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
+            }
+            for (int64_t i = 0; i < size - 15; i += 16) {  // pass 2: transform every complete group of 16
+                normal_fill_16(data + i, mean, std);
+            }
+            if (size % 16 != 0) {
+                // Size is not a multiple of 16: redraw uniforms for the last 16 slots (overlapping earlier output) and transform them.
+                data = data + size - 16;
+                for (int64_t i = 0; i < 16; i++) {
+                    data[i] = uniform_real(rand_uint32(), 0.f, 1.f);
+                }
+                normal_fill_16(data, mean, std);
+            }
+        } else {
+            // Fewer than 16 values: use the scalar double-precision sampler instead, kept this way for consistency with PyTorch.
+            for (int64_t i = 0; i < size; i++) {
+                data[i] = static_cast<float>(normal_double_value(mean, std));
+            }
+        }
+    }
+
+public:
+    MT19937RNG(uint64_t seed = 0) { manual_seed(seed); }  // seeds the generator immediately
+
+    void manual_seed(uint64_t seed) override {  // re-seeds the generator and drops any cached normal sample
+        s.seed_     = seed;
+        s.seeded_   = true;
+        s.state_[0] = static_cast<uint32_t>(seed & 0xffffffffU);  // only the low 32 bits of the seed are used
+        for (int j = 1; j < N; j++) {
+            uint32_t prev = s.state_[j - 1];
+            s.state_[j]   = 1812433253U * (prev ^ (prev >> 30)) + j;
+        }
+        s.left_          = 1;  // forces next_state() on the first rand_uint32() call
+        s.next_          = 0;
+        s.has_next_gauss = false;
+    }
+
+    std::vector<float> randn(uint32_t n) override {  // returns n samples drawn with mean 0 and std 1
+        std::vector<float> out;
+        out.resize(n);
+        randn(out.data(), static_cast<int64_t>(out.size()));
+        return out;
+    }
+};
+
+#endif  // __RNG_MT19937_HPP__
\ No newline at end of file
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 3e71ec0b1..58068f961 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -2,6 +2,7 @@
 
 #include "model.h"
 #include "rng.hpp"
+#include "rng_mt19937.hpp"
 #include "rng_philox.hpp"
 #include "stable-diffusion.h"
 #include "util.h"
@@ -200,6 +201,8 @@ class StableDiffusionGGML {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (sd_ctx_params->rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
+        } else if (sd_ctx_params->rng_type == CPU_RNG) {
+            rng = std::make_shared<MT19937RNG>();
         }
 
         ggml_log_set(ggml_log_callback_default, nullptr);
@@ -2127,6 +2130,7 @@ enum sd_type_t str_to_sd_type(const char* str) {
 const char* rng_type_to_str[] = {
     "std_default",
     "cuda",
+    "cpu",
 };
 
 const char* sd_rng_type_name(enum rng_type_t rng_type) {
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 5cb239489..58226ec42 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -31,6 +31,7 @@ extern "C" {
 enum rng_type_t {
     STD_DEFAULT_RNG,
     CUDA_RNG,
+    CPU_RNG,
     RNG_TYPE_COUNT
 };