From 70afad4c81a9712b8181fbea63eefdb441d5ada0 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 16 Nov 2025 15:14:42 +0800 Subject: [PATCH] feat: support independent sampler rng --- examples/cli/README.md | 1 + examples/cli/main.cpp | 54 ++++++++++++++++++++++++++++++------------ stable-diffusion.cpp | 38 ++++++++++++++++++++--------- stable-diffusion.h | 1 + 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/examples/cli/README.md b/examples/cli/README.md index ffa582eed..c9cb46b20 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -95,6 +95,7 @@ Options: --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the type of the weight file --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) + --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng -s, --seed RNG seed (default: 42, use random seed for < 0) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 45dff7f16..a8d81c2ef 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -110,21 +110,22 @@ struct SDParams { int fps = 16; float vace_strength = 1.f; - float strength = 0.75f; - float control_strength = 0.9f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool offload_params_to_cpu = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; - bool canny_preprocess = false; - bool color = false; - int upscale_repeats = 1; + float strength = 0.75f; + float control_strength = 0.9f; + rng_type_t rng_type = CUDA_RNG; + rng_type_t sampler_rng_type = RNG_TYPE_COUNT; + int64_t seed = 42; + bool verbose = false; + bool offload_params_to_cpu = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; + bool canny_preprocess = false; + bool color = false; + int upscale_repeats = 1; // Photo Maker std::string photo_maker_path; @@ -214,6 +215,7 @@ void print_params(SDParams params) { printf(" flow_shift: %.2f\n", params.flow_shift); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); + printf(" sampler rng: %s\n", sd_rng_type_name(params.sampler_rng_type)); printf(" seed: %zd\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false"); @@ -886,6 +888,20 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.sampler_rng_type = str_to_rng_type(arg); + if (params.sampler_rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid sampler rng type %s\n", + arg); + return -1; + } + return 1; + }; + auto on_schedule_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -1126,6 +1142,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { "--rng", "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", on_rng_arg}, + {"", + "--sampler-rng", + "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", + on_sampler_rng_arg}, {"-s", "--seed", "RNG seed (default: 42, use random seed for < 0)", @@ -1323,6 +1343,9 @@ std::string get_image_params(SDParams params, int64_t seed) { parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; parameter_string += "Model: " + sd_basename(params.model_path) + ", "; parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; + if (params.sampler_rng_type != RNG_TYPE_COUNT) { + parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(params.sampler_rng_type)) + ", "; + } parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_params.sample_method)); if (params.sample_params.scheduler != DEFAULT) { parameter_string += " " + std::string(sd_schedule_name(params.sample_params.scheduler)); @@ -1761,6 +1784,7 @@ int main(int argc, const char* argv[]) { params.n_threads, params.wtype, params.rng_type, + params.sampler_rng_type, params.prediction, params.lora_apply_mode, params.offload_params_to_cpu, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 7ca8ffaa6..fb01b4053 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -99,10 +99,11 @@ class StableDiffusionGGML { bool vae_decode_only = false; bool free_params_immediately = false; - std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; - float shift_factor = 0.f; + std::shared_ptr rng = std::make_shared(); + std::shared_ptr sampler_rng = nullptr; + int n_threads = -1; + float scale_factor = 0.18215f; + float shift_factor = 0.f; std::shared_ptr cond_stage_model; std::shared_ptr clip_vision; // for svd or wan2.1 i2v @@ -188,6 +189,16 @@ class StableDiffusionGGML { } } + std::shared_ptr get_rng(rng_type_t rng_type) { + if (rng_type == STD_DEFAULT_RNG) { + return std::make_shared(); + } else if (rng_type == CPU_RNG) { + return std::make_shared(); + } else { // default: CUDA_RNG + return std::make_shared(); + } + } + bool init(const sd_ctx_params_t* sd_ctx_params) { n_threads = sd_ctx_params->n_threads; vae_decode_only = sd_ctx_params->vae_decode_only; @@ -197,12 +208,11 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; - if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { - rng = std::make_shared(); - } else if (sd_ctx_params->rng_type == CUDA_RNG) { - rng = std::make_shared(); - } else if (sd_ctx_params->rng_type == CPU_RNG) { - rng = std::make_shared(); + rng = get_rng(sd_ctx_params->rng_type); + if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT) { + sampler_rng = get_rng(sd_ctx_params->sampler_rng_type); + } else { + sampler_rng = rng; } ggml_log_set(ggml_log_callback_default, nullptr); @@ -1735,7 +1745,7 @@ class StableDiffusionGGML { return denoised; }; - sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); + sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta); if (inverse_noise_scaling) { x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); @@ -2290,6 +2300,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->n_threads = get_num_physical_cores(); sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; + sd_ctx_params->sampler_rng_type = RNG_TYPE_COUNT; sd_ctx_params->prediction = DEFAULT_PRED; sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; @@ -2330,6 +2341,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "n_threads: %d\n" "wtype: %s\n" "rng_type: %s\n" + "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" "keep_clip_on_cpu: %s\n" @@ -2359,6 +2371,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->n_threads, sd_type_name(sd_ctx_params->wtype), sd_rng_type_name(sd_ctx_params->rng_type), + sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), @@ -2814,6 +2827,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); + sd_ctx->sd->sampler_rng->manual_seed(cur_seed); struct ggml_tensor* x_t = init_latent; struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); @@ -2940,6 +2954,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); int sample_steps = sd_img_gen_params->sample_params.sample_steps; @@ -3231,6 +3246,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); int64_t t0 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 58226ec42..5c1e956d5 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -172,6 +172,7 @@ typedef struct { int n_threads; enum sd_type_t wtype; enum rng_type_t rng_type; + enum rng_type_t sampler_rng_type; enum prediction_t prediction; enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu;