From 8db26cd0fa0ee4e6ed4995228ef4eab8b03ab054 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 16 Nov 2025 14:30:50 +0800 Subject: [PATCH] feat: add cpu rng --- README.md | 4 +- examples/cli/README.md | 2 +- examples/cli/main.cpp | 2 +- rng_mt19937.hpp | 147 +++++++++++++++++++++++++++++++++++++++++ stable-diffusion.cpp | 4 ++ stable-diffusion.h | 1 + 6 files changed, 157 insertions(+), 3 deletions(-) create mode 100644 rng_mt19937.hpp diff --git a/README.md b/README.md index 5cc6e4458..058cd5818 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,9 @@ API and command-line option may change frequently.*** - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - `DPM++ 2S a` - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) -- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) +- Cross-platform reproducibility + - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG` + - `--rng cpu`, consistent with the `comfyui RNG` - Embedds generation parameters into png output as webui-compatible text string ## Quick Start diff --git a/examples/cli/README.md b/examples/cli/README.md index 84df1a1e8..30f3489a4 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -94,7 +94,7 @@ Options: -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). 
If not specified, the default is the type of the weight file - --rng RNG, one of [std_default, cuda], default: cuda + --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) -s, --seed RNG seed (default: 42, use random seed for < 0) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index a2df09497..333e7ac71 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1124,7 +1124,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { on_type_arg}, {"", "--rng", - "RNG, one of [std_default, cuda], default: cuda", + "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", on_rng_arg}, {"-s", "--seed", diff --git a/rng_mt19937.hpp b/rng_mt19937.hpp new file mode 100644 index 000000000..7e6199886 --- /dev/null +++ b/rng_mt19937.hpp @@ -0,0 +1,147 @@ +#ifndef __RNG_MT19937_HPP__ +#define __RNG_MT19937_HPP__ + +#include +#include + +#include "rng.hpp" + +// RNG imitating torch cpu randn on CPU. 
+// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16 +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution +class MT19937RNG : public RNG { + static const int N = 624; + static const int M = 397; + static const uint32_t MATRIX_A = 0x9908b0dfU; + static const uint32_t UMASK = 0x80000000U; + static const uint32_t LMASK = 0x7fffffffU; + + struct State { + uint64_t seed_; + int left_; + bool seeded_; + uint32_t next_; + std::array state_; + bool has_next_gauss = false; + double next_gauss = 0.0f; + }; + + State s; + + uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); } + uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? 
MATRIX_A : 0); } + void next_state() { + uint32_t* p = s.state_.data(); + s.left_ = N; + s.next_ = 0; + for (int j = N - M + 1; --j; p++) + p[0] = p[M] ^ twist(p[0], p[1]); + for (int j = M; --j; p++) + p[0] = p[M - N] ^ twist(p[0], p[1]); + p[0] = p[M - N] ^ twist(p[0], s.state_[0]); + } + + uint32_t rand_uint32() { + if (--s.left_ == 0) + next_state(); + uint32_t y = s.state_[s.next_++]; + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680U; + y ^= (y << 15) & 0xefc60000U; + y ^= (y >> 18); + return y; + } + + uint64_t rand_uint64() { + uint64_t high = (uint64_t)rand_uint32(); + uint64_t low = (uint64_t)rand_uint32(); + return (high << 32) | low; + } + + template + T uniform_real(V val, T from, T to) { + constexpr auto MASK = static_cast((static_cast(1) << std::numeric_limits::digits) - 1); + constexpr auto DIVISOR = static_cast(1) / (static_cast(1) << std::numeric_limits::digits); + T x = (val & MASK) * DIVISOR; + return (x * (to - from) + from); + } + + double normal_double_value(double mean, double std) { + if (s.has_next_gauss) { + s.has_next_gauss = false; + return s.next_gauss; + } + double u1 = uniform_real(rand_uint64(), 0., 1.); // double + double u2 = uniform_real(rand_uint64(), 0., 1.); // double + + double r = std::sqrt(-2.0 * std::log1p(-u2)); + double theta = 2.0 * 3.14159265358979323846 * u1; + double value = r * std::cos(theta) * std + mean; + s.next_gauss = r * std::sin(theta) * std + mean; + s.has_next_gauss = true; + return value; + } + + void normal_fill_16(float* data, float mean, float std) { + for (int j = 0; j < 8; ++j) { + float u1 = 1.0f - data[j]; + float u2 = data[j + 8]; + float r = std::sqrt(-2.0f * std::log(u1)); + float theta = 2.0f * 3.14159265358979323846 * u2; + data[j] = r * std::cos(theta) * std + mean; + data[j + 8] = r * std::sin(theta) * std + mean; + } + } + + void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) { + if (size >= 16) { + for (int64_t i = 0; i < size; i++) { + data[i] = 
uniform_real(rand_uint32(), 0.f, 1.f); + } + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16(data + i, mean, std); + } + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (int64_t i = 0; i < 16; i++) { + data[i] = uniform_real(rand_uint32(), 0.f, 1.f); + } + normal_fill_16(data, mean, std); + } + } else { + // Strange handling, hard to understand, but keeping it consistent with PyTorch. + for (int64_t i = 0; i < size; i++) { + data[i] = (float)normal_double_value(mean, std); + } + } + } + +public: + MT19937RNG(uint64_t seed = 0) { manual_seed(seed); } + + void manual_seed(uint64_t seed) override { + s.seed_ = seed; + s.seeded_ = true; + s.state_[0] = (uint32_t)(seed & 0xffffffffU); + for (int j = 1; j < N; j++) { + uint32_t prev = s.state_[j - 1]; + s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j; + } + s.left_ = 1; + s.next_ = 0; + s.has_next_gauss = false; + } + + std::vector randn(uint32_t n) override { + std::vector out; + out.resize(n); + randn((float*)out.data(), out.size()); + return out; + } +}; + +#endif // __RNG_MT19937_HPP__ \ No newline at end of file diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3e71ec0b1..58068f961 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2,6 +2,7 @@ #include "model.h" #include "rng.hpp" +#include "rng_mt19937.hpp" #include "rng_philox.hpp" #include "stable-diffusion.h" #include "util.h" @@ -200,6 +201,8 @@ class StableDiffusionGGML { rng = std::make_shared(); } else if (sd_ctx_params->rng_type == CUDA_RNG) { rng = std::make_shared(); + } else if (sd_ctx_params->rng_type == CPU_RNG) { + rng = std::make_shared(); } ggml_log_set(ggml_log_callback_default, nullptr); @@ -2127,6 +2130,7 @@ enum sd_type_t str_to_sd_type(const char* str) { const char* rng_type_to_str[] = { "std_default", "cuda", + "cpu", }; const char* sd_rng_type_name(enum rng_type_t rng_type) { diff --git a/stable-diffusion.h b/stable-diffusion.h index 5cb239489..58226ec42 
100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -31,6 +31,7 @@ extern "C" { enum rng_type_t { STD_DEFAULT_RNG, CUDA_RNG, + CPU_RNG, RNG_TYPE_COUNT };