From 4af94ee0cf28630a9df90c6f59e9c117c37f12b4 Mon Sep 17 00:00:00 2001 From: leejet Date: Wed, 3 Dec 2025 00:50:13 +0800 Subject: [PATCH 1/2] split SDParams to SDCliParams/SDContextParams/SDGenerationParams --- examples/cli/main.cpp | 2822 +++++++++++++++++++++-------------------- stable-diffusion.cpp | 10 +- util.cpp | 26 +- 3 files changed, 1499 insertions(+), 1359 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index bc5444281..ab9e97118 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -62,188 +62,6 @@ enum SDMode { MODE_COUNT }; -struct SDParams { - int n_threads = -1; - SDMode mode = IMG_GEN; - std::string model_path; - std::string clip_l_path; - std::string clip_g_path; - std::string clip_vision_path; - std::string t5xxl_path; - std::string llm_path; - std::string llm_vision_path; - std::string diffusion_model_path; - std::string high_noise_diffusion_model_path; - std::string vae_path; - std::string taesd_path; - std::string esrgan_path; - std::string control_net_path; - std::string embedding_dir; - sd_type_t wtype = SD_TYPE_COUNT; - std::string tensor_type_rules; - std::string lora_model_dir; - std::string output_path = "output.png"; - std::string init_image_path; - std::string end_image_path; - std::string mask_image_path; - std::string control_image_path; - std::vector ref_image_paths; - std::string control_video_path; - bool auto_resize_ref_image = true; - bool increase_ref_index = false; - - std::string prompt; - std::string negative_prompt; - - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; - int batch_count = 1; - - std::vector skip_layers = {7, 8, 9}; - sd_sample_params_t sample_params; - - std::vector high_noise_skip_layers = {7, 8, 9}; - sd_sample_params_t high_noise_sample_params; - - std::string easycache_option; - sd_easycache_params_t easycache_params; - - float moe_boundary = 0.875f; - int video_frames = 1; - int fps = 16; - float vace_strength = 1.f; - - float 
strength = 0.75f; - float control_strength = 0.9f; - rng_type_t rng_type = CUDA_RNG; - rng_type_t sampler_rng_type = RNG_TYPE_COUNT; - int64_t seed = 42; - bool verbose = false; - bool offload_params_to_cpu = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; - bool canny_preprocess = false; - bool color = false; - int upscale_repeats = 1; - - // Photo Maker - std::string photo_maker_path; - std::string pm_id_images_dir; - std::string pm_id_embed_path; - float pm_style_strength = 20.f; - - bool chroma_use_dit_mask = true; - bool chroma_use_t5_mask = false; - int chroma_t5_mask_pad = 1; - float flow_shift = INFINITY; - - prediction_t prediction = DEFAULT_PRED; - lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; - - sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; - bool force_sdxl_vae_conv_scale = false; - - preview_t preview_method = PREVIEW_NONE; - int preview_interval = 1; - std::string preview_path = "preview.png"; - float preview_fps = 16; - bool taesd_preview = false; - bool preview_noisy = false; - - SDParams() { - sd_sample_params_init(&sample_params); - sd_sample_params_init(&high_noise_sample_params); - high_noise_sample_params.sample_steps = -1; - sd_easycache_params_init(&easycache_params); - } -}; - -void print_params(SDParams params) { - char* sample_params_str = sd_sample_params_to_str(¶ms.sample_params); - char* high_noise_sample_params_str = sd_sample_params_to_str(¶ms.high_noise_sample_params); - printf("Option: \n"); - printf(" n_threads: %d\n", params.n_threads); - printf(" mode: %s\n", modes_str[params.mode]); - printf(" model_path: %s\n", params.model_path.c_str()); - printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? 
sd_type_name(params.wtype) : "unspecified"); - printf(" clip_l_path: %s\n", params.clip_l_path.c_str()); - printf(" clip_g_path: %s\n", params.clip_g_path.c_str()); - printf(" clip_vision_path: %s\n", params.clip_vision_path.c_str()); - printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str()); - printf(" llm_path: %s\n", params.llm_path.c_str()); - printf(" llm_vision_path: %s\n", params.llm_vision_path.c_str()); - printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str()); - printf(" high_noise_diffusion_model_path: %s\n", params.high_noise_diffusion_model_path.c_str()); - printf(" vae_path: %s\n", params.vae_path.c_str()); - printf(" taesd_path: %s\n", params.taesd_path.c_str()); - printf(" esrgan_path: %s\n", params.esrgan_path.c_str()); - printf(" control_net_path: %s\n", params.control_net_path.c_str()); - printf(" embedding_dir: %s\n", params.embedding_dir.c_str()); - printf(" photo_maker_path: %s\n", params.photo_maker_path.c_str()); - printf(" pm_id_images_dir: %s\n", params.pm_id_images_dir.c_str()); - printf(" pm_id_embed_path: %s\n", params.pm_id_embed_path.c_str()); - printf(" pm_style_strength: %.2f\n", params.pm_style_strength); - printf(" output_path: %s\n", params.output_path.c_str()); - printf(" init_image_path: %s\n", params.init_image_path.c_str()); - printf(" end_image_path: %s\n", params.end_image_path.c_str()); - printf(" mask_image_path: %s\n", params.mask_image_path.c_str()); - printf(" control_image_path: %s\n", params.control_image_path.c_str()); - printf(" ref_images_paths:\n"); - for (auto& path : params.ref_image_paths) { - printf(" %s\n", path.c_str()); - }; - printf(" control_video_path: %s\n", params.control_video_path.c_str()); - printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false"); - printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); - printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? 
"true" : "false"); - printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); - printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false"); - printf(" vae_on_cpu: %s\n", params.vae_on_cpu ? "true" : "false"); - printf(" diffusion flash attention: %s\n", params.diffusion_flash_attn ? "true" : "false"); - printf(" diffusion Conv2d direct: %s\n", params.diffusion_conv_direct ? "true" : "false"); - printf(" vae_conv_direct: %s\n", params.vae_conv_direct ? "true" : "false"); - printf(" control_strength: %.2f\n", params.control_strength); - printf(" prompt: %s\n", params.prompt.c_str()); - printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); - printf(" clip_skip: %d\n", params.clip_skip); - printf(" width: %d\n", params.width); - printf(" height: %d\n", params.height); - printf(" sample_params: %s\n", SAFE_STR(sample_params_str)); - printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str)); - printf(" moe_boundary: %.3f\n", params.moe_boundary); - printf(" prediction: %s\n", sd_prediction_name(params.prediction)); - printf(" lora_apply_mode: %s\n", sd_lora_apply_mode_name(params.lora_apply_mode)); - printf(" flow_shift: %.2f\n", params.flow_shift); - printf(" strength(img2img): %.2f\n", params.strength); - printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); - printf(" sampler rng: %s\n", sd_rng_type_name(params.sampler_rng_type)); - printf(" seed: %zd\n", params.seed); - printf(" batch_count: %d\n", params.batch_count); - printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false"); - printf(" force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false"); - printf(" upscale_repeats: %d\n", params.upscale_repeats); - printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); - printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? 
"true" : "false"); - printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); - printf(" video_frames: %d\n", params.video_frames); - printf(" easycache: %s (threshold=%.3f, start=%.2f, end=%.2f)\n", - params.easycache_params.enabled ? "enabled" : "disabled", - params.easycache_params.reuse_threshold, - params.easycache_params.start_percent, - params.easycache_params.end_percent); - printf(" vace_strength: %.2f\n", params.vace_strength); - printf(" fps: %d\n", params.fps); - printf(" preview_mode: %s (%s)\n", previews_str[params.preview_method], params.preview_noisy ? "noisy" : "denoised"); - printf(" preview_interval: %d\n", params.preview_interval); - free(sample_params_str); - free(high_noise_sample_params_str); -} - #if defined(_WIN32) static std::string utf16_to_utf8(const std::wstring& wstr) { if (wstr.empty()) @@ -322,1076 +140,1427 @@ struct ArgOptions { std::vector float_options; std::vector bool_options; std::vector manual_options; + + static std::string wrap_text(const std::string& text, size_t width, size_t indent) { + std::ostringstream oss; + size_t line_len = 0; + size_t pos = 0; + + while (pos < text.size()) { + // Preserve manual newlines + if (text[pos] == '\n') { + oss << '\n' + << std::string(indent, ' '); + line_len = indent; + ++pos; + continue; + } + + // Add the character + oss << text[pos]; + ++line_len; + ++pos; + + // If the current line exceeds width, try to break at the last space + if (line_len >= width) { + std::string current = oss.str(); + size_t back = current.size(); + + // Find the last space (for a clean break) + while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') + --back; + + // If found a space to break on + if (back > 0 && current[back - 1] != '\n') { + std::string before = current.substr(0, back - 1); + std::string after = current.substr(back); + oss.str(""); + oss.clear(); + oss << before << "\n" + << std::string(indent, ' ') << after; + } else { + // If no space found, just break at width + 
oss << "\n" + << std::string(indent, ' '); + } + line_len = indent; + } + } + + return oss.str(); + } + /* Prints all registered options as an aligned table: option names (plus a value-type hint) in the left column, wrapped description in the right column. */ + void print() const { + constexpr size_t max_line_width = 120; + + struct Entry { + std::string names; + std::string desc; + }; + std::vector<Entry> entries; + + auto add_entry = [&](const std::string& s, const std::string& l, + const std::string& desc, const std::string& hint = "") { + std::ostringstream ss; + if (!s.empty()) + ss << s; + if (!s.empty() && !l.empty()) + ss << ", "; + if (!l.empty()) + ss << l; + if (!hint.empty()) + ss << " " << hint; + entries.push_back({ss.str(), desc}); + }; + + for (auto& o : string_options) + add_entry(o.short_name, o.long_name, o.desc, "<string>"); + for (auto& o : int_options) + add_entry(o.short_name, o.long_name, o.desc, "<int>"); + for (auto& o : float_options) + add_entry(o.short_name, o.long_name, o.desc, "<float>"); + for (auto& o : bool_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : manual_options) + add_entry(o.short_name, o.long_name, o.desc); + + size_t max_name_width = 0; + for (auto& e : entries) + max_name_width = std::max(max_name_width, e.names.size()); + + for (auto& e : entries) { + size_t indent = 2 + max_name_width + 4; + size_t desc_width = (max_line_width > indent ? 
max_line_width - indent : 40); + std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent); + std::cout << " " << std::left << std::setw(static_cast<int>(max_name_width) + 4) + << e.names << wrapped_desc << "\n"; + } + } }; -bool parse_options(int argc, const char** argv, ArgOptions& options) { +bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& options_list) { bool invalid_arg = false; std::string arg; + /* parse_options walks argv[1..] once and looks each argument up in every ArgOptions group in order. match_and_apply scans one option table for `arg` (short or long name), runs apply_fn on the first hit and reports whether anything matched. */ + auto match_and_apply = [&](auto& opts, auto&& apply_fn) -> bool { + for (auto& option : opts) { + if ((option.short_name.size() > 0 && arg == option.short_name) || + (option.long_name.size() > 0 && arg == option.long_name)) { + apply_fn(option); + return true; + } + } + return false; + }; + for (int i = 1; i < argc; i++) { - bool found_arg = false; arg = argv[i]; + bool found_arg = false; - for (auto& option : options.string_options) { - if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { - found_arg = true; - if (++i >= argc) { - invalid_arg = true; - break; - } - *option.target = argv_to_utf8(i, argv); - } + for (auto& options : options_list) { + if (match_and_apply(options.string_options, [&](auto& option) { + if (++i >= argc) { + invalid_arg = true; + return; + } + *option.target = argv_to_utf8(i, argv); + found_arg = true; + })) + break; + + if (match_and_apply(options.int_options, [&](auto& option) { + if (++i >= argc) { + invalid_arg = true; + return; + } + try { *option.target = std::stoi(argv[i]); } catch (const std::exception&) { invalid_arg = true; return; } /* std::stoi throws on non-numeric or out-of-range text; report it as an invalid parameter instead of terminating */ + found_arg = true; + })) + break; + + if (match_and_apply(options.float_options, [&](auto& option) { + if (++i >= argc) { + invalid_arg = true; + return; + } + try { *option.target = std::stof(argv[i]); } catch (const std::exception&) { invalid_arg = true; return; } /* same handling as the int case: std::stof throws on malformed input */ + found_arg = true; + })) + break; + + if (match_and_apply(options.bool_options, [&](auto& option) { + *option.target = option.keep_true ? 
true : false; + found_arg = true; + })) + break; + + if (match_and_apply(options.manual_options, [&](auto& option) { + int ret = option.cb(argc, argv, i); + if (ret < 0) { + invalid_arg = true; + return; + } + i += ret; + found_arg = true; + })) + break; } + if (invalid_arg) { - break; + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + return false; + } + if (!found_arg) { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + return false; } + } - for (auto& option : options.int_options) { - if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { - found_arg = true; - if (++i >= argc) { - invalid_arg = true; - break; + return true; +} + +struct SDCliParams { + SDMode mode = IMG_GEN; + std::string output_path = "output.png"; + + bool verbose = false; + bool canny_preprocess = false; + + preview_t preview_method = PREVIEW_NONE; + int preview_interval = 1; + std::string preview_path = "preview.png"; + int preview_fps = 16; + bool taesd_preview = false; + bool preview_noisy = false; + bool color = false; + + bool normal_exit = false; + + ArgOptions get_options() { + ArgOptions options; + + options.string_options = { + {"-o", + "--output", + "path to write result image to (default: ./output.png)", + &output_path}, + {"", + "--preview-path", + "path to write preview image to (default: ./preview.png)", + &preview_path}, + }; + + options.int_options = { + {"", + "--preview-interval", + "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)", + &preview_interval}, + }; + + options.bool_options = { + {"", + "--canny", + "apply canny preprocessor (edge detection)", + true, &canny_preprocess}, + {"-v", + "--verbose", + "print extra info", + true, &verbose}, + {"", + "--color", + "colors the logging tags according to level", + true, &color}, + {"", + "--taesd-preview-only", + 
std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")", + true, &taesd_preview}, + {"", + "--preview-noisy", + "enables previewing noisy inputs of the models rather than the denoised outputs", + true, &preview_noisy}, + + }; + + auto on_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* mode_c_str = argv[index]; + if (mode_c_str != nullptr) { + int mode_found = -1; + for (int i = 0; i < MODE_COUNT; i++) { + if (!strcmp(mode_c_str, modes_str[i])) { + mode_found = i; + } + } + if (mode_found == -1) { + fprintf(stderr, + "error: invalid mode %s, must be one of [%s]\n", + mode_c_str, SD_ALL_MODES_STR); + exit(1); } - *option.target = std::stoi(argv[i]); + mode = (SDMode)mode_found; } - } - if (invalid_arg) { - break; - } + return 1; + }; - for (auto& option : options.float_options) { - if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { - found_arg = true; - if (++i >= argc) { - invalid_arg = true; - break; + auto on_preview_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* preview = argv[index]; + int preview_found = -1; /* deliberately not named preview_method: a local of that name would hide the member assigned below, leaving it unchanged */ + for (int m = 0; m < PREVIEW_COUNT; m++) { + if (!strcmp(preview, previews_str[m])) { + preview_found = m; } - *option.target = std::stof(argv[i]); } + if (preview_found == -1) { + fprintf(stderr, "error: invalid preview method %s\n", + preview); + return -1; + } + preview_method = (preview_t)preview_found; + return 1; + }; + + auto on_help_arg = [&](int argc, const char** argv, int index) { + normal_exit = true; + return -1; + }; + + options.manual_options = { + {"-M", + "--mode", + "run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen", + on_mode_arg}, + {"", + "--preview", + std::string("preview method. 
must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")", + on_preview_arg}, + {"-h", + "--help", + "show this help message and exit", + on_help_arg}, + }; + + return options; + }; + + bool process_and_check() { + if (output_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: output_path\n"); + return false; } - if (invalid_arg) { - break; + + if (mode == CONVERT) { + if (output_path == "output.png") { + output_path = "output.gguf"; + } } + return true; + } + + std::string to_string() const { + std::ostringstream oss; + oss << "SDCliParams {\n" + << " mode: " << modes_str[mode] << ",\n" + << " output_path: \"" << output_path << "\",\n" + << " verbose: " << (verbose ? "true" : "false") << ",\n" + << " color: " << (color ? "true" : "false") << ",\n" + << " canny_preprocess: " << (canny_preprocess ? "true" : "false") << ",\n" + << " preview_method: " << previews_str[preview_method] << ",\n" + << " preview_interval: " << preview_interval << ",\n" + << " preview_path: \"" << preview_path << "\",\n" + << " preview_fps: " << preview_fps << ",\n" + << " taesd_preview: " << (taesd_preview ? "true" : "false") << ",\n" + << " preview_noisy: " << (preview_noisy ? 
"true" : "false") << "\n" + << "}"; + return oss.str(); + } +}; + +struct SDContextParams { + int n_threads = -1; + std::string model_path; + std::string clip_l_path; + std::string clip_g_path; + std::string clip_vision_path; + std::string t5xxl_path; + std::string llm_path; + std::string llm_vision_path; + std::string diffusion_model_path; + std::string high_noise_diffusion_model_path; + std::string vae_path; + std::string taesd_path; + std::string esrgan_path; + std::string control_net_path; + std::string embedding_dir; + std::string photo_maker_path; + sd_type_t wtype = SD_TYPE_COUNT; + std::string tensor_type_rules; + std::string lora_model_dir; + + rng_type_t rng_type = CUDA_RNG; + rng_type_t sampler_rng_type = RNG_TYPE_COUNT; + bool offload_params_to_cpu = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; + + bool chroma_use_dit_mask = true; + bool chroma_use_t5_mask = false; + int chroma_t5_mask_pad = 1; + + prediction_t prediction = DEFAULT_PRED; + lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; + + sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; + bool force_sdxl_vae_conv_scale = false; + + float flow_shift = INFINITY; + + ArgOptions get_options() { + ArgOptions options; + options.string_options = { + {"-m", + "--model", + "path to full model", + &model_path}, + {"", + "--clip_l", + "path to the clip-l text encoder", &clip_l_path}, + {"", "--clip_g", + "path to the clip-g text encoder", + &clip_g_path}, + {"", + "--clip_vision", + "path to the clip-vision encoder", + &clip_vision_path}, + {"", + "--t5xxl", + "path to the t5xxl text encoder", + &t5xxl_path}, + {"", + "--llm", + "path to the llm text encoder. 
For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)", + &llm_path}, + {"", + "--llm_vision", + "path to the llm vit", + &llm_vision_path}, + {"", + "--qwen2vl", + "alias of --llm. Deprecated.", + &llm_path}, + {"", + "--qwen2vl_vision", + "alias of --llm_vision. Deprecated.", + &llm_vision_path}, + {"", + "--diffusion-model", + "path to the standalone diffusion model", + &diffusion_model_path}, + {"", + "--high-noise-diffusion-model", + "path to the standalone high noise diffusion model", + &high_noise_diffusion_model_path}, + {"", + "--vae", + "path to standalone vae model", + &vae_path}, + {"", + "--taesd", + "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", + &taesd_path}, + {"", + "--control-net", + "path to control net model", + &control_net_path}, + {"", + "--embd-dir", + "embeddings directory", + &embedding_dir}, + {"", + "--lora-model-dir", + "lora model directory", + &lora_model_dir}, + + {"", + "--tensor-type-rules", + "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", + &tensor_type_rules}, + {"", + "--photo-maker", + "path to PHOTOMAKER model", + &photo_maker_path}, + {"", + "--upscale-model", + "path to esrgan model.", + &esrgan_path}, + }; + + options.int_options = { + {"-t", + "--threads", + "number of threads to use during computation (default: -1). 
" + "If threads <= 0, then threads will be set to the number of CPU physical cores", + &n_threads}, + {"", + "--chroma-t5-mask-pad", + "t5 mask pad size of chroma", + &chroma_t5_mask_pad}, + }; + + options.float_options = { + {"", + "--vae-tile-overlap", + "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", + &vae_tiling_params.target_overlap}, + {"", + "--flow-shift", + "shift value for Flow models like SD3.x or WAN (default: auto)", + &flow_shift}, + }; + + options.bool_options = { + {"", + "--vae-tiling", + "process vae in tiles to reduce memory usage", + true, &vae_tiling_params.enabled}, + {"", + "--force-sdxl-vae-conv-scale", + "force use of conv scale on sdxl vae", + true, &force_sdxl_vae_conv_scale}, + {"", + "--offload-to-cpu", + "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", + true, &offload_params_to_cpu}, + {"", + "--control-net-cpu", + "keep controlnet in cpu (for low vram)", + true, &control_net_cpu}, + {"", + "--clip-on-cpu", + "keep clip in cpu (for low vram)", + true, &clip_on_cpu}, + {"", + "--vae-on-cpu", + "keep vae in cpu (for low vram)", + true, &vae_on_cpu}, + {"", + "--diffusion-fa", + "use flash attention in the diffusion model", + true, &diffusion_flash_attn}, + {"", + "--diffusion-conv-direct", + "use ggml_conv2d_direct in the diffusion model", + true, &diffusion_conv_direct}, + {"", + "--vae-conv-direct", + "use ggml_conv2d_direct in the vae model", + true, &vae_conv_direct}, + {"", + "--chroma-disable-dit-mask", + "disable dit mask for chroma", + false, &chroma_use_dit_mask}, + {"", + "--chroma-enable-t5-mask", + "enable t5 mask for chroma", + true, &chroma_use_t5_mask}, + }; + + auto on_type_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + wtype = str_to_sd_type(arg); + if (wtype == SD_TYPE_COUNT) { + fprintf(stderr, "error: invalid weight format %s\n", + arg); + return -1; + } + return 1; + 
}; + /* Each callback below consumes the single value that follows its flag: it returns 1 on success and -1 when the value is missing or not recognised. */ + auto on_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + rng_type = str_to_rng_type(arg); + if (rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid rng type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + sampler_rng_type = str_to_rng_type(arg); + if (sampler_rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid sampler rng type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_prediction_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + prediction = str_to_prediction(arg); + if (prediction == PREDICTION_COUNT) { + fprintf(stderr, "error: invalid prediction type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + lora_apply_mode = str_to_lora_apply_mode(arg); + if (lora_apply_mode == LORA_APPLY_MODE_COUNT) { + fprintf(stderr, "error: invalid lora apply mode %s\n", + arg); + return -1; + } + return 1; + }; - for (auto& option : options.bool_options) { - if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { - found_arg = true; - if (option.keep_true) { - *option.target = true; + auto on_tile_size_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string tile_size_str = argv[index]; + size_t x_pos = tile_size_str.find('x'); + try { + if (x_pos != std::string::npos) { + std::string tile_x_str = tile_size_str.substr(0, x_pos); + std::string tile_y_str = tile_size_str.substr(x_pos + 1); + vae_tiling_params.tile_size_x = std::stoi(tile_x_str); + 
vae_tiling_params.tile_size_y = std::stoi(tile_y_str); } else { - *option.target = false; + vae_tiling_params.tile_size_x = vae_tiling_params.tile_size_y = std::stoi(tile_size_str); } + } catch (const std::invalid_argument&) { + return -1; + } catch (const std::out_of_range&) { + return -1; } - } - if (invalid_arg) { - break; - } + return 1; + }; - for (auto& option : options.manual_options) { - if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { - found_arg = true; - int ret = option.cb(argc, argv, i); - if (ret < 0) { - invalid_arg = true; - break; + auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string rel_size_str = argv[index]; + size_t x_pos = rel_size_str.find('x'); + try { + if (x_pos != std::string::npos) { + std::string rel_x_str = rel_size_str.substr(0, x_pos); + std::string rel_y_str = rel_size_str.substr(x_pos + 1); + vae_tiling_params.rel_size_x = std::stof(rel_x_str); + vae_tiling_params.rel_size_y = std::stof(rel_y_str); + } else { + vae_tiling_params.rel_size_x = vae_tiling_params.rel_size_y = std::stof(rel_size_str); } - i += ret; + } catch (const std::invalid_argument&) { + return -1; + } catch (const std::out_of_range&) { + return -1; } - } - if (invalid_arg) { - break; - } - if (!found_arg) { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - return false; - } - } - if (invalid_arg) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - return false; + return 1; + }; + + options.manual_options = { + {"", + "--type", + "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). 
" + "If not specified, the default is the type of the weight file", + on_type_arg}, + {"", + "--rng", + "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", + on_rng_arg}, + {"", + "--sampler-rng", + "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", + on_sampler_rng_arg}, + {"", + "--prediction", + "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]", + on_prediction_arg}, + {"", + "--lora-apply-mode", + "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " + "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." + "The immediately mode may have precision and compatibility issues with quantized parameters, " + "but it usually offers faster inference speed and, in some cases, lower memory usage. " + "The at_runtime mode, on the other hand, is exactly the opposite.", + on_lora_apply_mode_arg}, + {"", + "--vae-tile-size", + "tile size for vae tiling, format [X]x[Y] (default: 32x32)", + on_tile_size_arg}, + {"", + "--vae-relative-tile-size", + "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", + on_relative_tile_size_arg}, + }; + + return options; } - return true; -} -static std::string wrap_text(const std::string& text, size_t width, size_t indent) { - std::ostringstream oss; - size_t line_len = 0; - size_t pos = 0; - - while (pos < text.size()) { - // Preserve manual newlines - if (text[pos] == '\n') { - oss << '\n' - << std::string(indent, ' '); - line_len = indent; - ++pos; - continue; + bool process_and_check(SDMode mode) { + if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); + return false; } - // Add the character - oss << text[pos]; - 
++line_len; - ++pos; - - // If the current line exceeds width, try to break at the last space - if (line_len >= width) { - std::string current = oss.str(); - size_t back = current.size(); - - // Find the last space (for a clean break) - while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') - --back; - - // If found a space to break on - if (back > 0 && current[back - 1] != '\n') { - std::string before = current.substr(0, back - 1); - std::string after = current.substr(back); - oss.str(""); - oss.clear(); - oss << before << "\n" - << std::string(indent, ' ') << after; - } else { - // If no space found, just break at width - oss << "\n" - << std::string(indent, ' '); + if (mode == UPSCALE) { + if (esrgan_path.length() == 0) { + fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); + return false; } - line_len = indent; } - } - return oss.str(); -} + if (n_threads <= 0) { + n_threads = sd_get_num_physical_cores(); + } -void print_usage(int argc, const char* argv[], const ArgOptions& options) { - constexpr size_t max_line_width = 120; + return true; + } - std::cout << "Usage: " << argv[0] << " [options]\n\n"; - std::cout << "Options:\n"; + std::string to_string() const { + std::ostringstream oss; + oss << "SDContextParams {\n" + << " n_threads: " << n_threads << ",\n" + << " model_path: \"" << model_path << "\",\n" + << " clip_l_path: \"" << clip_l_path << "\",\n" + << " clip_g_path: \"" << clip_g_path << "\",\n" + << " clip_vision_path: \"" << clip_vision_path << "\",\n" + << " t5xxl_path: \"" << t5xxl_path << "\",\n" + << " llm_path: \"" << llm_path << "\",\n" + << " llm_vision_path: \"" << llm_vision_path << "\",\n" + << " diffusion_model_path: \"" << diffusion_model_path << "\",\n" + << " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n" + << " vae_path: \"" << vae_path << "\",\n" + << " taesd_path: \"" << taesd_path << "\",\n" + << " esrgan_path: \"" << esrgan_path << "\",\n" + << " 
control_net_path: \"" << control_net_path << "\",\n" + << " embedding_dir: \"" << embedding_dir << "\",\n" + /* wtype keeps its SD_TYPE_COUNT default until --type is given; print a placeholder for that sentinel instead of asking for its type name */ << " wtype: " << (wtype < SD_TYPE_COUNT ? sd_type_name(wtype) : "unspecified") << ",\n" + << " tensor_type_rules: \"" << tensor_type_rules << "\",\n" + << " lora_model_dir: \"" << lora_model_dir << "\",\n" + << " photo_maker_path: \"" << photo_maker_path << "\",\n" + << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" + << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" + << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << ",\n" + << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" + << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" + << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" + << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" + << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" + << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" + << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" + << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" + << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" + << " prediction: " << sd_prediction_name(prediction) << ",\n" + << " lora_apply_mode: " << sd_lora_apply_mode_name(lora_apply_mode) << ",\n" + << " vae_tiling_params: { " + << vae_tiling_params.enabled << ", " + << vae_tiling_params.tile_size_x << ", " + << vae_tiling_params.tile_size_y << ", " + << vae_tiling_params.target_overlap << ", " + << vae_tiling_params.rel_size_x << ", " + << vae_tiling_params.rel_size_y << " },\n" + << " force_sdxl_vae_conv_scale: " << (force_sdxl_vae_conv_scale ? 
"true" : "false") << "\n" + << "}"; + return oss.str(); + } - struct Entry { - std::string names; - std::string desc; - }; - std::vector entries; - - auto add_entry = [&](const std::string& s, const std::string& l, - const std::string& desc, const std::string& hint = "") { - std::ostringstream ss; - if (!s.empty()) - ss << s; - if (!s.empty() && !l.empty()) - ss << ", "; - if (!l.empty()) - ss << l; - if (!hint.empty()) - ss << " " << hint; - entries.push_back({ss.str(), desc}); - }; + sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) { + sd_ctx_params_t sd_ctx_params = { + model_path.c_str(), + clip_l_path.c_str(), + clip_g_path.c_str(), + clip_vision_path.c_str(), + t5xxl_path.c_str(), + llm_path.c_str(), + llm_vision_path.c_str(), + diffusion_model_path.c_str(), + high_noise_diffusion_model_path.c_str(), + vae_path.c_str(), + taesd_path.c_str(), + control_net_path.c_str(), + lora_model_dir.c_str(), + embedding_dir.c_str(), + photo_maker_path.c_str(), + tensor_type_rules.c_str(), + vae_decode_only, + free_params_immediately, + n_threads, + wtype, + rng_type, + sampler_rng_type, + prediction, + lora_apply_mode, + offload_params_to_cpu, + clip_on_cpu, + control_net_cpu, + vae_on_cpu, + diffusion_flash_attn, + taesd_preview, + diffusion_conv_direct, + vae_conv_direct, + force_sdxl_vae_conv_scale, + chroma_use_dit_mask, + chroma_use_t5_mask, + chroma_t5_mask_pad, + flow_shift, + }; + return sd_ctx_params; + } +}; + +template +static std::string vec_to_string(const std::vector& v) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < v.size(); i++) { + oss << v[i]; + if (i + 1 < v.size()) + oss << ", "; + } + oss << "]"; + return oss.str(); +} - for (auto& o : options.string_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : options.int_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : options.float_options) - add_entry(o.short_name, 
o.long_name, o.desc, ""); - for (auto& o : options.bool_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : options.manual_options) - add_entry(o.short_name, o.long_name, o.desc); - - size_t max_name_width = 0; - for (auto& e : entries) - max_name_width = std::max(max_name_width, e.names.size()); - - for (auto& e : entries) { - size_t indent = 2 + max_name_width + 4; - size_t desc_width = (max_line_width > indent ? max_line_width - indent : 40); - std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent); - std::cout << " " << std::left << std::setw(static_cast(max_name_width) + 4) - << e.names << wrapped_desc << "\n"; +static std::string vec_str_to_string(const std::vector& v) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < v.size(); i++) { + oss << "\"" << v[i] << "\""; + if (i + 1 < v.size()) + oss << ", "; } + oss << "]"; + return oss.str(); } -void parse_args(int argc, const char** argv, SDParams& params) { - ArgOptions options; - options.string_options = { - {"-m", - "--model", - "path to full model", - ¶ms.model_path}, - {"", - "--clip_l", - "path to the clip-l text encoder", ¶ms.clip_l_path}, - {"", "--clip_g", - "path to the clip-g text encoder", - ¶ms.clip_g_path}, - {"", - "--clip_vision", - "path to the clip-vision encoder", - ¶ms.clip_vision_path}, - {"", - "--t5xxl", - "path to the t5xxl text encoder", - ¶ms.t5xxl_path}, - {"", - "--llm", - "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)", - ¶ms.llm_path}, - {"", - "--llm_vision", - "path to the llm vit", - ¶ms.llm_vision_path}, - {"", - "--qwen2vl", - "alias of --llm. Deprecated.", - ¶ms.llm_path}, - {"", - "--qwen2vl_vision", - "alias of --llm_vision. 
Deprecated.", - ¶ms.llm_vision_path}, - {"", - "--diffusion-model", - "path to the standalone diffusion model", - ¶ms.diffusion_model_path}, - {"", - "--high-noise-diffusion-model", - "path to the standalone high noise diffusion model", - ¶ms.high_noise_diffusion_model_path}, - {"", - "--vae", - "path to standalone vae model", - ¶ms.vae_path}, - {"", - "--taesd", - "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", - ¶ms.taesd_path}, - {"", - "--control-net", - "path to control net model", - ¶ms.control_net_path}, - {"", - "--embd-dir", - "embeddings directory", - ¶ms.embedding_dir}, - {"", - "--lora-model-dir", - "lora model directory", - ¶ms.lora_model_dir}, - {"-i", - "--init-img", - "path to the init image", - ¶ms.init_image_path}, - {"", - "--end-img", - "path to the end image, required by flf2v", - ¶ms.end_image_path}, - {"", - "--tensor-type-rules", - "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", - ¶ms.tensor_type_rules}, - {"", - "--photo-maker", - "path to PHOTOMAKER model", - ¶ms.photo_maker_path}, - {"", - "--pm-id-images-dir", - "path to PHOTOMAKER input id images dir", - ¶ms.pm_id_images_dir}, - {"", - "--pm-id-embed-path", - "path to PHOTOMAKER v2 id embed", - ¶ms.pm_id_embed_path}, - {"", - "--mask", - "path to the mask image", - ¶ms.mask_image_path}, - {"", - "--control-image", - "path to control image, control net", - ¶ms.control_image_path}, - {"", - "--control-video", - "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " - "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " - "such as 00.png, 01.png, ... 
etc.", - ¶ms.control_video_path}, - {"-o", - "--output", - "path to write result image to (default: ./output.png)", - ¶ms.output_path}, - {"-p", - "--prompt", - "the prompt to render", - ¶ms.prompt}, - {"-n", - "--negative-prompt", - "the negative prompt (default: \"\")", - ¶ms.negative_prompt}, - {"", - "--preview-path", - "path to write preview image to (default: ./preview.png)", - ¶ms.preview_path}, - {"", - "--upscale-model", - "path to esrgan model.", - ¶ms.esrgan_path}, - }; - - options.int_options = { - {"-t", - "--threads", - "number of threads to use during computation (default: -1). " - "If threads <= 0, then threads will be set to the number of CPU physical cores", - ¶ms.n_threads}, - {"", - "--upscale-repeats", - "Run the ESRGAN upscaler this many times (default: 1)", - ¶ms.upscale_repeats}, - {"-H", - "--height", - "image height, in pixel space (default: 512)", - ¶ms.height}, - {"-W", - "--width", - "image width, in pixel space (default: 512)", - ¶ms.width}, - {"", - "--steps", - "number of sample steps (default: 20)", - ¶ms.sample_params.sample_steps}, - {"", - "--high-noise-steps", - "(high noise) number of sample steps (default: -1 = auto)", - ¶ms.high_noise_sample_params.sample_steps}, - {"", - "--clip-skip", - "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). " - "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", - ¶ms.clip_skip}, - {"-b", - "--batch-count", - "batch count", - ¶ms.batch_count}, - {"", - "--chroma-t5-mask-pad", - "t5 mask pad size of chroma", - ¶ms.chroma_t5_mask_pad}, - {"", - "--video-frames", - "video frames (default: 1)", - ¶ms.video_frames}, - {"", - "--fps", - "fps (default: 24)", - ¶ms.fps}, - {"", - "--timestep-shift", - "shift timestep for NitroFusion models (default: 0). 
" - "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", - ¶ms.sample_params.shifted_timestep}, - {"", - "--preview-interval", - "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)", - ¶ms.preview_interval}, - }; - - options.float_options = { - {"", - "--cfg-scale", - "unconditional guidance scale: (default: 7.0)", - ¶ms.sample_params.guidance.txt_cfg}, - {"", - "--img-cfg-scale", - "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", - ¶ms.sample_params.guidance.img_cfg}, - {"", - "--guidance", - "distilled guidance scale for models with guidance input (default: 3.5)", - ¶ms.sample_params.guidance.distilled_guidance}, - {"", - "--slg-scale", - "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium", - ¶ms.sample_params.guidance.slg.scale}, - {"", - "--skip-layer-start", - "SLG enabling point (default: 0.01)", - ¶ms.sample_params.guidance.slg.layer_start}, - {"", - "--skip-layer-end", - "SLG disabling point (default: 0.2)", - ¶ms.sample_params.guidance.slg.layer_end}, - {"", - "--eta", - "eta in DDIM, only for DDIM and TCD (default: 0)", - ¶ms.sample_params.eta}, - {"", - "--high-noise-cfg-scale", - "(high noise) unconditional guidance scale: (default: 7.0)", - ¶ms.high_noise_sample_params.guidance.txt_cfg}, - {"", - "--high-noise-img-cfg-scale", - "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", - ¶ms.high_noise_sample_params.guidance.img_cfg}, - {"", - "--high-noise-guidance", - "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", - ¶ms.high_noise_sample_params.guidance.distilled_guidance}, - {"", - "--high-noise-slg-scale", - "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", - ¶ms.high_noise_sample_params.guidance.slg.scale}, - 
{"", - "--high-noise-skip-layer-start", - "(high noise) SLG enabling point (default: 0.01)", - ¶ms.high_noise_sample_params.guidance.slg.layer_start}, - {"", - "--high-noise-skip-layer-end", - "(high noise) SLG disabling point (default: 0.2)", - ¶ms.high_noise_sample_params.guidance.slg.layer_end}, - {"", - "--high-noise-eta", - "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)", - ¶ms.high_noise_sample_params.eta}, - {"", - "--strength", - "strength for noising/unnoising (default: 0.75)", - ¶ms.strength}, - {"", - "--pm-style-strength", - "", - ¶ms.pm_style_strength}, - {"", - "--control-strength", - "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", - ¶ms.control_strength}, - {"", - "--moe-boundary", - "timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1", - ¶ms.moe_boundary}, - {"", - "--flow-shift", - "shift value for Flow models like SD3.x or WAN (default: auto)", - ¶ms.flow_shift}, - {"", - "--vace-strength", - "wan vace strength", - ¶ms.vace_strength}, - {"", - "--vae-tile-overlap", - "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", - ¶ms.vae_tiling_params.target_overlap}, - }; +struct SDGenerationParams { + std::string prompt; + std::string negative_prompt; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; + int batch_count = 1; + std::string init_image_path; + std::string end_image_path; + std::string mask_image_path; + std::string control_image_path; + std::vector ref_image_paths; + std::string control_video_path; + bool auto_resize_ref_image = true; + bool increase_ref_index = false; - options.bool_options = { - {"", - "--vae-tiling", - "process vae in tiles to reduce memory usage", - true, ¶ms.vae_tiling_params.enabled}, - {"", - "--force-sdxl-vae-conv-scale", - "force use of conv scale on sdxl vae", - true, ¶ms.force_sdxl_vae_conv_scale}, - {"", - 
"--offload-to-cpu", - "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", - true, ¶ms.offload_params_to_cpu}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, ¶ms.control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, ¶ms.clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, ¶ms.vae_on_cpu}, - {"", - "--diffusion-fa", - "use flash attention in the diffusion model", - true, ¶ms.diffusion_flash_attn}, - {"", - "--diffusion-conv-direct", - "use ggml_conv2d_direct in the diffusion model", - true, ¶ms.diffusion_conv_direct}, - {"", - "--vae-conv-direct", - "use ggml_conv2d_direct in the vae model", - true, ¶ms.vae_conv_direct}, - {"", - "--canny", - "apply canny preprocessor (edge detection)", - true, ¶ms.canny_preprocess}, - {"-v", - "--verbose", - "print extra info", - true, ¶ms.verbose}, - {"", - "--color", - "colors the logging tags according to level", - true, ¶ms.color}, - {"", - "--chroma-disable-dit-mask", - "disable dit mask for chroma", - false, ¶ms.chroma_use_dit_mask}, - {"", - "--chroma-enable-t5-mask", - "enable t5 mask for chroma", - true, ¶ms.chroma_use_t5_mask}, - {"", - "--increase-ref-index", - "automatically increase the indices of references images based on the order they are listed (starting with 1).", - true, ¶ms.increase_ref_index}, - {"", - "--disable-auto-resize-ref-image", - "disable auto resize of ref images", - false, ¶ms.auto_resize_ref_image}, - {"", - "--taesd-preview-only", - std::string("prevents usage of taesd for decoding the final image. 
(for use with --preview ") + previews_str[PREVIEW_TAE] + ")", - true, ¶ms.taesd_preview}, - {"", - "--preview-noisy", - "enables previewing noisy inputs of the models rather than the denoised outputs", - true, ¶ms.preview_noisy}}; - - auto on_mode_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* mode = argv[index]; - if (mode != nullptr) { - int mode_found = -1; - for (int i = 0; i < MODE_COUNT; i++) { - if (!strcmp(mode, modes_str[i])) { - mode_found = i; - } - } - if (mode_found == -1) { - fprintf(stderr, - "error: invalid mode %s, must be one of [%s]\n", - mode, SD_ALL_MODES_STR); - exit(1); - } - params.mode = (SDMode)mode_found; - } - return 1; - }; + std::vector skip_layers = {7, 8, 9}; + sd_sample_params_t sample_params; - auto on_type_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.wtype = str_to_sd_type(arg); - if (params.wtype == SD_TYPE_COUNT) { - fprintf(stderr, "error: invalid weight format %s\n", - arg); - return -1; - } - return 1; - }; + std::vector high_noise_skip_layers = {7, 8, 9}; + sd_sample_params_t high_noise_sample_params; - auto on_rng_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.rng_type = str_to_rng_type(arg); - if (params.rng_type == RNG_TYPE_COUNT) { - fprintf(stderr, "error: invalid rng type %s\n", - arg); - return -1; - } - return 1; - }; + std::string easycache_option; + sd_easycache_params_t easycache_params; - auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.sampler_rng_type = str_to_rng_type(arg); - if (params.sampler_rng_type == RNG_TYPE_COUNT) { - fprintf(stderr, "error: invalid sampler rng type %s\n", - arg); - return -1; - } - return 1; - }; + float moe_boundary = 0.875f; + int 
video_frames = 1; + int fps = 16; + float vace_strength = 1.f; - auto on_scheduler_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.sample_params.scheduler = str_to_scheduler(arg); - if (params.sample_params.scheduler == SCHEDULER_COUNT) { - fprintf(stderr, "error: invalid scheduler %s\n", - arg); - return -1; - } - return 1; - }; + float strength = 0.75f; + float control_strength = 0.9f; - auto on_prediction_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.prediction = str_to_prediction(arg); - if (params.prediction == PREDICTION_COUNT) { - fprintf(stderr, "error: invalid prediction type %s\n", - arg); - return -1; - } - return 1; - }; + int64_t seed = 42; - auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.lora_apply_mode = str_to_lora_apply_mode(arg); - if (params.lora_apply_mode == LORA_APPLY_MODE_COUNT) { - fprintf(stderr, "error: invalid lora apply model %s\n", - arg); - return -1; - } - return 1; - }; + // Photo Maker + std::string pm_id_images_dir; + std::string pm_id_embed_path; + float pm_style_strength = 20.f; - auto on_sample_method_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.sample_params.sample_method = str_to_sample_method(arg); - if (params.sample_params.sample_method == SAMPLE_METHOD_COUNT) { - fprintf(stderr, "error: invalid sample method %s\n", - arg); - return -1; - } - return 1; - }; + int upscale_repeats = 1; - auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - params.high_noise_sample_params.sample_method = str_to_sample_method(arg); - if 
(params.high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { - fprintf(stderr, "error: invalid high noise sample method %s\n", - arg); - return -1; - } - return 1; - }; + SDGenerationParams() { + sd_sample_params_init(&sample_params); + sd_sample_params_init(&high_noise_sample_params); + } - auto on_seed_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - params.seed = std::stoll(argv[index]); - return 1; - }; + ArgOptions get_options() { + ArgOptions options; + options.string_options = { + {"-p", + "--prompt", + "the prompt to render", + &prompt}, + {"-n", + "--negative-prompt", + "the negative prompt (default: \"\")", + &negative_prompt}, + {"-i", + "--init-img", + "path to the init image", + &init_image_path}, + {"", + "--end-img", + "path to the end image, required by flf2v", + &end_image_path}, + {"", + "--mask", + "path to the mask image", + &mask_image_path}, + {"", + "--control-image", + "path to control image, control net", + &control_image_path}, + {"", + "--control-video", + "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " + "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " + "such as 00.png, 01.png, ... 
etc.", + &control_video_path}, + {"", + "--pm-id-images-dir", + "path to PHOTOMAKER input id images dir", + &pm_id_images_dir}, + {"", + "--pm-id-embed-path", + "path to PHOTOMAKER v2 id embed", + &pm_id_embed_path}, + }; - auto on_help_arg = [&](int argc, const char** argv, int index) { - print_usage(argc, argv, options); - exit(0); - return 0; - }; + options.int_options = { + {"-H", + "--height", + "image height, in pixel space (default: 512)", + &height}, + {"-W", + "--width", + "image width, in pixel space (default: 512)", + &width}, + {"", + "--steps", + "number of sample steps (default: 20)", + &sample_params.sample_steps}, + {"", + "--high-noise-steps", + "(high noise) number of sample steps (default: -1 = auto)", + &high_noise_sample_params.sample_steps}, + {"", + "--clip-skip", + "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). " + "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", + &clip_skip}, + {"-b", + "--batch-count", + "batch count", + &batch_count}, + {"", + "--video-frames", + "video frames (default: 1)", + &video_frames}, + {"", + "--fps", + "fps (default: 24)", + &fps}, + {"", + "--timestep-shift", + "shift timestep for NitroFusion models (default: 0). 
" + "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", + &sample_params.shifted_timestep}, + {"", + "--upscale-repeats", + "Run the ESRGAN upscaler this many times (default: 1)", + &upscale_repeats}, + }; - auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string layers_str = argv[index]; - if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { - return -1; - } + options.float_options = { + {"", + "--cfg-scale", + "unconditional guidance scale: (default: 7.0)", + &sample_params.guidance.txt_cfg}, + {"", + "--img-cfg-scale", + "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", + &sample_params.guidance.img_cfg}, + {"", + "--guidance", + "distilled guidance scale for models with guidance input (default: 3.5)", + &sample_params.guidance.distilled_guidance}, + {"", + "--slg-scale", + "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 
0 means disabled, a value of 2.5 is nice for sd3.5 medium", + &sample_params.guidance.slg.scale}, + {"", + "--skip-layer-start", + "SLG enabling point (default: 0.01)", + &sample_params.guidance.slg.layer_start}, + {"", + "--skip-layer-end", + "SLG disabling point (default: 0.2)", + &sample_params.guidance.slg.layer_end}, + {"", + "--eta", + "eta in DDIM, only for DDIM and TCD (default: 0)", + &sample_params.eta}, + {"", + "--high-noise-cfg-scale", + "(high noise) unconditional guidance scale: (default: 7.0)", + &high_noise_sample_params.guidance.txt_cfg}, + {"", + "--high-noise-img-cfg-scale", + "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", + &high_noise_sample_params.guidance.img_cfg}, + {"", + "--high-noise-guidance", + "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", + &high_noise_sample_params.guidance.distilled_guidance}, + {"", + "--high-noise-slg-scale", + "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", + &high_noise_sample_params.guidance.slg.scale}, + {"", + "--high-noise-skip-layer-start", + "(high noise) SLG enabling point (default: 0.01)", + &high_noise_sample_params.guidance.slg.layer_start}, + {"", + "--high-noise-skip-layer-end", + "(high noise) SLG disabling point (default: 0.2)", + &high_noise_sample_params.guidance.slg.layer_end}, + {"", + "--high-noise-eta", + "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)", + &high_noise_sample_params.eta}, + {"", + "--strength", + "strength for noising/unnoising (default: 0.75)", + &strength}, + {"", + "--pm-style-strength", + "", + &pm_style_strength}, + {"", + "--control-strength", + "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", + &control_strength}, + {"", + "--moe-boundary", + "timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1", + &moe_boundary}, + {"", + "--vace-strength", + "wan vace strength", + &vace_strength}, + }; - layers_str = layers_str.substr(1, layers_str.size() - 2); + options.bool_options = { + {"", + "--increase-ref-index", + "automatically increase the indices of references images based on the order they are listed (starting with 1).", + true, + &increase_ref_index}, + {"", + "--disable-auto-resize-ref-image", + "disable auto resize of ref images", + false, + &auto_resize_ref_image}, + }; - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument& e) { + auto on_seed_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { return -1; } - } - params.skip_layers = layers; - return 1; - }; + seed = std::stoll(argv[index]); + return 1; + }; - auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string layers_str = argv[index]; - if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { - return -1; - } + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + sample_params.sample_method = str_to_sample_method(arg); + if (sample_params.sample_method == SAMPLE_METHOD_COUNT) { + fprintf(stderr, "error: invalid sample method %s\n", + arg); + return -1; + } + return 1; + }; - layers_str = layers_str.substr(1, layers_str.size() - 2); + auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + high_noise_sample_params.sample_method = str_to_sample_method(arg); + if 
(high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { + fprintf(stderr, "error: invalid high noise sample method %s\n", + arg); + return -1; + } + return 1; + }; - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument& e) { + auto on_scheduler_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { return -1; } - } - params.high_noise_skip_layers = layers; - return 1; - }; + const char* arg = argv[index]; + sample_params.scheduler = str_to_scheduler(arg); + if (sample_params.scheduler == SCHEDULER_COUNT) { + fprintf(stderr, "error: invalid scheduler %s\n", + arg); + return -1; + } + return 1; + }; - auto on_ref_image_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - params.ref_image_paths.push_back(argv[index]); - return 1; - }; + auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string layers_str = argv[index]; + if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { + return -1; + } - auto on_tile_size_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string tile_size_str = argv[index]; - size_t x_pos = tile_size_str.find('x'); - try { - if (x_pos != std::string::npos) { - std::string tile_x_str = tile_size_str.substr(0, x_pos); - std::string tile_y_str = tile_size_str.substr(x_pos + 1); - params.vae_tiling_params.tile_size_x = std::stoi(tile_x_str); - params.vae_tiling_params.tile_size_y = std::stoi(tile_y_str); - } else { - params.vae_tiling_params.tile_size_x = params.vae_tiling_params.tile_size_y = std::stoi(tile_size_str); + layers_str = layers_str.substr(1, layers_str.size() - 2); 
+ + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + std::sregex_token_iterator end; + std::vector tokens(iter, end); + std::vector layers; + for (const auto& token : tokens) { + try { + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument&) { + return -1; + } } - } catch (const std::invalid_argument& e) { - return -1; - } catch (const std::out_of_range& e) { - return -1; - } - return 1; - }; + skip_layers = layers; + return 1; + }; - auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string rel_size_str = argv[index]; - size_t x_pos = rel_size_str.find('x'); - try { - if (x_pos != std::string::npos) { - std::string rel_x_str = rel_size_str.substr(0, x_pos); - std::string rel_y_str = rel_size_str.substr(x_pos + 1); - params.vae_tiling_params.rel_size_x = std::stof(rel_x_str); - params.vae_tiling_params.rel_size_y = std::stof(rel_y_str); - } else { - params.vae_tiling_params.rel_size_x = params.vae_tiling_params.rel_size_y = std::stof(rel_size_str); + auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string layers_str = argv[index]; + if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { + return -1; } - } catch (const std::invalid_argument& e) { - return -1; - } catch (const std::out_of_range& e) { - return -1; - } - return 1; - }; - auto on_preview_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* preview = argv[index]; - int preview_method = -1; - for (int m = 0; m < PREVIEW_COUNT; m++) { - if (!strcmp(preview, previews_str[m])) { - preview_method = m; + layers_str = layers_str.substr(1, layers_str.size() - 2); + + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + 
std::sregex_token_iterator end; + std::vector tokens(iter, end); + std::vector layers; + for (const auto& token : tokens) { + try { + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument&) { + return -1; + } } - } - if (preview_method == -1) { - fprintf(stderr, "error: preview method %s\n", - preview); - return -1; - } - params.preview_method = (preview_t)preview_method; - return 1; - }; + high_noise_skip_layers = layers; + return 1; + }; - auto on_easycache_arg = [&](int argc, const char** argv, int index) { - const std::string default_values = "0.2,0.15,0.95"; - auto looks_like_value = [](const std::string& token) { - if (token.empty()) { - return false; + auto on_ref_image_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; } - if (token[0] != '-') { - return true; + ref_image_paths.push_back(argv[index]); + return 1; + }; + + auto on_easycache_arg = [&](int argc, const char** argv, int index) { + const std::string default_values = "0.2,0.15,0.95"; + auto looks_like_value = [](const std::string& token) { + if (token.empty()) { + return false; + } + if (token[0] != '-') { + return true; + } + if (token.size() == 1) { + return false; + } + unsigned char next = static_cast(token[1]); + return std::isdigit(next) || token[1] == '.'; + }; + + std::string option_value; + int consumed = 0; + if (index + 1 < argc) { + std::string next_arg = argv[index + 1]; + if (looks_like_value(next_arg)) { + option_value = argv_to_utf8(index + 1, argv); + consumed = 1; + } } - if (token.size() == 1) { - return false; + if (option_value.empty()) { + option_value = default_values; } - unsigned char next = static_cast(token[1]); - return std::isdigit(next) || token[1] == '.'; + easycache_option = option_value; + return consumed; }; - std::string option_value; - int consumed = 0; - if (index + 1 < argc) { - std::string next_arg = argv[index + 1]; - if (looks_like_value(next_arg)) { - option_value = argv_to_utf8(index + 1, argv); 
- consumed = 1; - } + options.manual_options = { + {"-s", + "--seed", + "RNG seed (default: 42, use random seed for < 0)", + on_seed_arg}, + {"", + "--sampling-method", + "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " + "(default: euler for Flux/SD3/Wan, euler_a otherwise)", + on_sample_method_arg}, + {"", + "--high-noise-sampling-method", + "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" + " default: euler for Flux/SD3/Wan, euler_a otherwise", + on_high_noise_sample_method_arg}, + {"", + "--scheduler", + "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete", + on_scheduler_arg}, + {"", + "--skip-layers", + "layers to skip for SLG steps (default: [7,8,9])", + on_skip_layers_arg}, + {"", + "--high-noise-skip-layers", + "(high noise) layers to skip for SLG steps (default: [7,8,9])", + on_high_noise_skip_layers_arg}, + {"-r", + "--ref-image", + "reference image for Flux Kontext models (can be used multiple times)", + on_ref_image_arg}, + {"", + "--easycache", + "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)", + on_easycache_arg}, + + }; + + return options; + } + + bool process_and_check(SDMode mode) { + if (width <= 0) { + fprintf(stderr, "error: the width must be greater than 0\n"); + return false; + } + + if (height <= 0) { + fprintf(stderr, "error: the height must be greater than 0\n"); + return false; } - if (option_value.empty()) { - option_value = default_values; + + if (sample_params.sample_steps <= 0) { + fprintf(stderr, "error: the sample_steps must be greater than 0\n"); + return false; } - params.easycache_option = option_value; - return consumed; - }; - options.manual_options = { - {"-M", - "--mode", - "run mode, one of [img_gen, vid_gen, 
upscale, convert], default: img_gen", - on_mode_arg}, - {"", - "--type", - "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " - "If not specified, the default is the type of the weight file", - on_type_arg}, - {"", - "--rng", - "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", - on_rng_arg}, - {"", - "--sampler-rng", - "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", - on_sampler_rng_arg}, - {"-s", - "--seed", - "RNG seed (default: 42, use random seed for < 0)", - on_seed_arg}, - {"", - "--sampling-method", - "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " - "(default: euler for Flux/SD3/Wan, euler_a otherwise)", - on_sample_method_arg}, - {"", - "--prediction", - "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]", - on_prediction_arg}, - {"", - "--lora-apply-mode", - "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " - "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." - "The immediately mode may have precision and compatibility issues with quantized parameters, " - "but it usually offers faster inference speed and, in some cases, lower memory usage. 
" - "The at_runtime mode, on the other hand, is exactly the opposite.", - on_lora_apply_mode_arg}, - {"", - "--scheduler", - "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete", - on_scheduler_arg}, - {"", - "--skip-layers", - "layers to skip for SLG steps (default: [7,8,9])", - on_skip_layers_arg}, - {"", - "--high-noise-sampling-method", - "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" - " default: euler for Flux/SD3/Wan, euler_a otherwise", - on_high_noise_sample_method_arg}, - {"", - "--high-noise-skip-layers", - "(high noise) layers to skip for SLG steps (default: [7,8,9])", - on_high_noise_skip_layers_arg}, - {"-r", - "--ref-image", - "reference image for Flux Kontext models (can be used multiple times)", - on_ref_image_arg}, - {"-h", - "--help", - "show this help message and exit", - on_help_arg}, - {"", - "--vae-tile-size", - "tile size for vae tiling, format [X]x[Y] (default: 32x32)", - on_tile_size_arg}, - {"", - "--vae-relative-tile-size", - "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", - on_relative_tile_size_arg}, - {"", - "--preview", - std::string("preview method. 
must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")", - on_preview_arg}, - {"", - "--easycache", - "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)", - on_easycache_arg}, - }; + if (high_noise_sample_params.sample_steps <= 0) { + high_noise_sample_params.sample_steps = -1; + } - if (!parse_options(argc, argv, options)) { - print_usage(argc, argv, options); - exit(1); - } + if (strength < 0.f || strength > 1.f) { + fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); + return false; + } - if (!params.easycache_option.empty()) { - float values[3] = {0.0f, 0.0f, 0.0f}; - std::stringstream ss(params.easycache_option); - std::string token; - int idx = 0; - while (std::getline(ss, token, ',')) { - auto trim = [](std::string& s) { - const char* whitespace = " \t\r\n"; - auto start = s.find_first_not_of(whitespace); - if (start == std::string::npos) { - s.clear(); - return; + if (!easycache_option.empty()) { + float values[3] = {0.0f, 0.0f, 0.0f}; + std::stringstream ss(easycache_option); + std::string token; + int idx = 0; + while (std::getline(ss, token, ',')) { + auto trim = [](std::string& s) { + const char* whitespace = " \t\r\n"; + auto start = s.find_first_not_of(whitespace); + if (start == std::string::npos) { + s.clear(); + return; + } + auto end = s.find_last_not_of(whitespace); + s = s.substr(start, end - start + 1); + }; + trim(token); + if (token.empty()) { + fprintf(stderr, "error: invalid easycache option '%s'\n", easycache_option.c_str()); + return false; } - auto end = s.find_last_not_of(whitespace); - s = s.substr(start, end - start + 1); - }; - trim(token); - if (token.empty()) { - fprintf(stderr, "error: invalid easycache option '%s'\n", params.easycache_option.c_str()); - exit(1); + if (idx >= 3) { + fprintf(stderr, "error: easycache expects 
exactly 3 comma-separated values (threshold,start,end)\n"); + return false; + } + try { + values[idx] = std::stof(token); + } catch (const std::exception&) { + fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str()); + return false; + } + idx++; } - if (idx >= 3) { + if (idx != 3) { fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); - exit(1); + return false; } - try { - values[idx] = std::stof(token); - } catch (const std::exception&) { - fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str()); - exit(1); + if (values[0] < 0.0f) { + fprintf(stderr, "error: easycache threshold must be non-negative\n"); + return false; } - idx++; - } - if (idx != 3) { - fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); - exit(1); - } - if (values[0] < 0.0f) { - fprintf(stderr, "error: easycache threshold must be non-negative\n"); - exit(1); - } - if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) { - fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n"); - exit(1); + if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) { + fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n"); + return false; + } + easycache_params.enabled = true; + easycache_params.reuse_threshold = values[0]; + easycache_params.start_percent = values[1]; + easycache_params.end_percent = values[2]; + } else { + easycache_params.enabled = false; } - params.easycache_params.enabled = true; - params.easycache_params.reuse_threshold = values[0]; - params.easycache_params.start_percent = values[1]; - params.easycache_params.end_percent = values[2]; - } else { - params.easycache_params.enabled = false; - } - if (params.n_threads <= 0) { - params.n_threads = sd_get_num_physical_cores(); - } + 
sample_params.guidance.slg.layers = skip_layers.data(); + sample_params.guidance.slg.layer_count = skip_layers.size(); + high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.data(); + high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); - if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) { - fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv, options); - exit(1); - } + if (mode == VID_GEN && video_frames <= 0) { + return false; + } - if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); - print_usage(argc, argv, options); - exit(1); - } + if (mode == VID_GEN && fps <= 0) { + return false; + } - if (params.output_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv, options); - exit(1); - } + if (sample_params.shifted_timestep < 0 || sample_params.shifted_timestep > 1000) { + return false; + } - if (params.width <= 0) { - fprintf(stderr, "error: the width must be greater than 0\n"); - exit(1); - } + if (upscale_repeats < 1) { + return false; + } - if (params.height <= 0) { - fprintf(stderr, "error: the height must be greater than 0\n"); - exit(1); - } + if (mode == UPSCALE) { + if (init_image_path.length() == 0) { + fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n"); + return false; + } + } - if (params.sample_params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - exit(1); - } + if (seed < 0) { + srand((int)time(nullptr)); + seed = rand(); + } - if (params.high_noise_sample_params.sample_steps <= 0) { - params.high_noise_sample_params.sample_steps = -1; + return true; } - if (params.strength < 0.f || params.strength > 1.f) { - fprintf(stderr, 
"error: can only work with strength in [0.0, 1.0]\n"); - exit(1); + std::string to_string() const { + char* sample_params_str = sd_sample_params_to_str(&sample_params); + char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params); + std::ostringstream oss; + oss << "SDGenerationParams {\n" + << " prompt: \"" << prompt << "\",\n" + << " negative_prompt: \"" << negative_prompt << "\",\n" + << " clip_skip: " << clip_skip << ",\n" + << " width: " << width << ",\n" + << " height: " << height << ",\n" + << " batch_count: " << batch_count << ",\n" + << " init_image_path: \"" << init_image_path << "\",\n" + << " end_image_path: \"" << end_image_path << "\",\n" + << " mask_image_path: \"" << mask_image_path << "\",\n" + << " control_image_path: \"" << control_image_path << "\",\n" + << " ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n" + << " control_video_path: \"" << control_video_path << "\",\n" + << " auto_resize_ref_image: " << (auto_resize_ref_image ? "true" : "false") << ",\n" + << " increase_ref_index: " << (increase_ref_index ? "true" : "false") << ",\n" + << " pm_id_images_dir: \"" << pm_id_images_dir << "\",\n" + << " pm_id_embed_path: \"" << pm_id_embed_path << "\",\n" + << " pm_style_strength: " << pm_style_strength << ",\n" + << " skip_layers: " << vec_to_string(skip_layers) << ",\n" + << " sample_params: " << sample_params_str << ",\n" + << " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n" + << " high_noise_sample_params: " << high_noise_sample_params_str << ",\n" + << " easycache_option: \"" << easycache_option << "\",\n" + << " easycache: " + << (easycache_params.enabled ? 
"enabled" : "disabled") + << " (threshold=" << easycache_params.reuse_threshold + << ", start=" << easycache_params.start_percent + << ", end=" << easycache_params.end_percent << "),\n" + << " moe_boundary: " << moe_boundary << ",\n" + << " video_frames: " << video_frames << ",\n" + << " fps: " << fps << ",\n" + << " vace_strength: " << vace_strength << ",\n" + << " strength: " << strength << ",\n" + << " control_strength: " << control_strength << ",\n" + << " seed: " << seed << ",\n" + << " upscale_repeats: " << upscale_repeats << ",\n" + << "}"; + free(sample_params_str); + free(high_noise_sample_params_str); + return oss.str(); } +}; - if (params.mode == VID_GEN && params.video_frames <= 0) { - fprintf(stderr, "warning: --video-frames must be at least 1\n"); - exit(1); - } +void print_usage(int argc, const char* argv[], const std::vector& options_list) { + std::cout << "Usage: " << argv[0] << " [options]\n\n"; + std::cout << "CLI Options:\n"; + options_list[0].print(); + std::cout << "\nContext Options:\n"; + options_list[1].print(); + std::cout << "\nGeneration Options:\n"; + options_list[2].print(); +} - if (params.mode == VID_GEN && params.fps <= 0) { - fprintf(stderr, "warning: --fps must be at least 1\n"); - exit(1); - } +void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextParams& ctx_params, SDGenerationParams& gen_params) { + std::vector options_vec = {cli_params.get_options(), ctx_params.get_options(), gen_params.get_options()}; - if (params.sample_params.shifted_timestep < 0 || params.sample_params.shifted_timestep > 1000) { - fprintf(stderr, "error: timestep-shift must be between 0 and 1000\n"); - exit(1); + if (!parse_options(argc, argv, options_vec)) { + print_usage(argc, argv, options_vec); + exit(cli_params.normal_exit ? 
0 : 1); } - if (params.upscale_repeats < 1) { - fprintf(stderr, "error: upscale multiplier must be at least 1\n"); + if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) { + print_usage(argc, argv, options_vec); exit(1); } - - if (params.mode == UPSCALE) { - if (params.esrgan_path.length() == 0) { - fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); - exit(1); - } - if (params.init_image_path.length() == 0) { - fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n"); - exit(1); - } - } - - if (params.seed < 0) { - srand((int)time(nullptr)); - params.seed = rand(); - } - - if (params.mode == CONVERT) { - if (params.output_path == "output.png") { - params.output_path = "output.gguf"; - } - } } static std::string sd_basename(const std::string& path) { @@ -1406,50 +1575,50 @@ static std::string sd_basename(const std::string& path) { return path; } -std::string get_image_params(SDParams params, int64_t seed) { - std::string parameter_string = params.prompt + "\n"; - if (params.negative_prompt.size() != 0) { - parameter_string += "Negative prompt: " + params.negative_prompt + "\n"; +std::string get_image_params(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) { + std::string parameter_string = gen_params.prompt + "\n"; + if (gen_params.negative_prompt.size() != 0) { + parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n"; } - parameter_string += "Steps: " + std::to_string(params.sample_params.sample_steps) + ", "; - parameter_string += "CFG scale: " + std::to_string(params.sample_params.guidance.txt_cfg) + ", "; - if (params.sample_params.guidance.slg.scale != 0 && params.skip_layers.size() != 0) { - parameter_string += "SLG scale: " + std::to_string(params.sample_params.guidance.txt_cfg) + ", "; + parameter_string += "Steps: " + 
std::to_string(gen_params.sample_params.sample_steps) + ", "; + parameter_string += "CFG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", "; + if (gen_params.sample_params.guidance.slg.scale != 0 && gen_params.skip_layers.size() != 0) { + parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", "; parameter_string += "Skip layers: ["; - for (const auto& layer : params.skip_layers) { + for (const auto& layer : gen_params.skip_layers) { parameter_string += std::to_string(layer) + ", "; } parameter_string += "], "; - parameter_string += "Skip layer start: " + std::to_string(params.sample_params.guidance.slg.layer_start) + ", "; - parameter_string += "Skip layer end: " + std::to_string(params.sample_params.guidance.slg.layer_end) + ", "; + parameter_string += "Skip layer start: " + std::to_string(gen_params.sample_params.guidance.slg.layer_start) + ", "; + parameter_string += "Skip layer end: " + std::to_string(gen_params.sample_params.guidance.slg.layer_end) + ", "; } - parameter_string += "Guidance: " + std::to_string(params.sample_params.guidance.distilled_guidance) + ", "; - parameter_string += "Eta: " + std::to_string(params.sample_params.eta) + ", "; + parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", "; + parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", "; parameter_string += "Seed: " + std::to_string(seed) + ", "; - parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; - parameter_string += "Model: " + sd_basename(params.model_path) + ", "; - parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; - if (params.sampler_rng_type != RNG_TYPE_COUNT) { - parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(params.sampler_rng_type)) + ", "; + parameter_string += "Size: " + std::to_string(gen_params.width) + "x" + 
std::to_string(gen_params.height) + ", "; + parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", "; + parameter_string += "RNG: " + std::string(sd_rng_type_name(ctx_params.rng_type)) + ", "; + if (ctx_params.sampler_rng_type != RNG_TYPE_COUNT) { + parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(ctx_params.sampler_rng_type)) + ", "; } - parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_params.sample_method)); - if (params.sample_params.scheduler != SCHEDULER_COUNT) { - parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler)); + parameter_string += "Sampler: " + std::string(sd_sample_method_name(gen_params.sample_params.sample_method)); + if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) { + parameter_string += " " + std::string(sd_scheduler_name(gen_params.sample_params.scheduler)); } parameter_string += ", "; - for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) { + for (const auto& te : {ctx_params.clip_l_path, ctx_params.clip_g_path, ctx_params.t5xxl_path, ctx_params.llm_path, ctx_params.llm_vision_path}) { if (!te.empty()) { parameter_string += "TE: " + sd_basename(te) + ", "; } } - if (!params.diffusion_model_path.empty()) { - parameter_string += "Unet: " + sd_basename(params.diffusion_model_path) + ", "; + if (!ctx_params.diffusion_model_path.empty()) { + parameter_string += "Unet: " + sd_basename(ctx_params.diffusion_model_path) + ", "; } - if (!params.vae_path.empty()) { - parameter_string += "VAE: " + sd_basename(params.vae_path) + ", "; + if (!ctx_params.vae_path.empty()) { + parameter_string += "VAE: " + sd_basename(ctx_params.vae_path) + ", "; } - if (params.clip_skip != -1) { - parameter_string += "Clip skip: " + std::to_string(params.clip_skip) + ", "; + if (gen_params.clip_skip != -1) { + parameter_string += "Clip skip: " + std::to_string(gen_params.clip_skip) + ", "; } 
parameter_string += "Version: stable-diffusion.cpp"; return parameter_string; @@ -1457,12 +1626,12 @@ std::string get_image_params(SDParams params, int64_t seed) { /* Enables Printing the log level tag in color using ANSI escape codes */ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { - SDParams* params = (SDParams*)data; + SDCliParams* cli_params = (SDCliParams*)data; int tag_color; const char* level_str; FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout; - if (!log || (!params->verbose && level <= SD_LOG_DEBUG)) { + if (!log || (!cli_params->verbose && level <= SD_LOG_DEBUG)) { return; } @@ -1489,7 +1658,7 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { break; } - if (params->color == true) { + if (cli_params->color == true) { fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str); } else { fprintf(out_stream, "[%-5s] ", level_str); @@ -1642,72 +1811,81 @@ bool load_images_from_dir(const std::string dir, void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy, void* data) { (void)step; (void)is_noisy; - SDParams* params = (SDParams*)data; + SDCliParams* cli_params = (SDCliParams*)data; // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents // unused in this app, it will either be always noisy or always denoised here if (frame_count == 1) { - stbi_write_png(params->preview_path.c_str(), image->width, image->height, image->channel, image->data, 0); + stbi_write_png(cli_params->preview_path.c_str(), image->width, image->height, image->channel, image->data, 0); } else { - create_mjpg_avi_from_sd_images(params->preview_path.c_str(), image, frame_count, params->preview_fps); + create_mjpg_avi_from_sd_images(cli_params->preview_path.c_str(), image, frame_count, cli_params->preview_fps); } } int main(int argc, const char* argv[]) { - SDParams params; - parse_args(argc, argv, params); - if (params.video_frames > 4) { - 
size_t last_dot_pos = params.preview_path.find_last_of("."); - std::string base_path = params.preview_path; + SDCliParams cli_params; + SDContextParams ctx_params; + SDGenerationParams gen_params; + + parse_args(argc, argv, cli_params, ctx_params, gen_params); + if (gen_params.video_frames > 4) { + size_t last_dot_pos = cli_params.preview_path.find_last_of("."); + std::string base_path = cli_params.preview_path; std::string file_ext = ""; if (last_dot_pos != std::string::npos) { // filename has extension - base_path = params.preview_path.substr(0, last_dot_pos); - file_ext = params.preview_path.substr(last_dot_pos); + base_path = cli_params.preview_path.substr(0, last_dot_pos); + file_ext = cli_params.preview_path.substr(last_dot_pos); std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower); } if (file_ext == ".png") { - params.preview_path = base_path + ".avi"; + cli_params.preview_path = base_path + ".avi"; } } - params.preview_fps = params.fps; - if (params.preview_method == PREVIEW_PROJ) - params.preview_fps /= 4.0f; - - params.sample_params.guidance.slg.layers = params.skip_layers.data(); - params.sample_params.guidance.slg.layer_count = params.skip_layers.size(); - params.high_noise_sample_params.guidance.slg.layers = params.high_noise_skip_layers.data(); - params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size(); - - sd_set_log_callback(sd_log_cb, (void*)¶ms); - sd_set_preview_callback(step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy, (void*)¶ms); - - if (params.verbose) { - print_params(params); + cli_params.preview_fps = gen_params.fps; + if (cli_params.preview_method == PREVIEW_PROJ) + cli_params.preview_fps /= 4; + + sd_set_log_callback(sd_log_cb, (void*)&cli_params); + sd_set_preview_callback(step_callback, + cli_params.preview_method, + cli_params.preview_interval, + !cli_params.preview_noisy, + cli_params.preview_noisy, + 
(void*)&cli_params); + + if (cli_params.verbose) { printf("%s", sd_get_system_info()); + printf("%s\n", cli_params.to_string().c_str()); + printf("%s\n", ctx_params.to_string().c_str()); + printf("%s\n", gen_params.to_string().c_str()); } - if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_type_rules.c_str()); + if (cli_params.mode == CONVERT) { + bool success = convert(ctx_params.model_path.c_str(), + ctx_params.vae_path.c_str(), + cli_params.output_path.c_str(), + ctx_params.wtype, + ctx_params.tensor_type_rules.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", - params.model_path.c_str(), - params.vae_path.c_str(), - params.output_path.c_str()); + ctx_params.model_path.c_str(), + ctx_params.vae_path.c_str(), + cli_params.output_path.c_str()); return 1; } else { printf("convert '%s'/'%s' to '%s' success\n", - params.model_path.c_str(), - params.vae_path.c_str(), - params.output_path.c_str()); + ctx_params.model_path.c_str(), + ctx_params.vae_path.c_str(), + cli_params.output_path.c_str()); return 0; } } bool vae_decode_only = true; - sd_image_t init_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; - sd_image_t end_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; - sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; - sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, nullptr}; + sd_image_t init_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; + sd_image_t end_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; + sd_image_t control_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 3, nullptr}; + sd_image_t mask_image = {(uint32_t)gen_params.width, (uint32_t)gen_params.height, 1, nullptr}; std::vector ref_images; std::vector pmid_images; 
std::vector control_frames; @@ -1734,45 +1912,45 @@ int main(int argc, const char* argv[]) { control_frames.clear(); }; - if (params.init_image_path.size() > 0) { + if (gen_params.init_image_path.size() > 0) { vae_decode_only = false; int width = 0; int height = 0; - init_image.data = load_image(params.init_image_path.c_str(), width, height, params.width, params.height); + init_image.data = load_image(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height); if (init_image.data == nullptr) { - fprintf(stderr, "load image from '%s' failed\n", params.init_image_path.c_str()); + fprintf(stderr, "load image from '%s' failed\n", gen_params.init_image_path.c_str()); release_all_resources(); return 1; } } - if (params.end_image_path.size() > 0) { + if (gen_params.end_image_path.size() > 0) { vae_decode_only = false; int width = 0; int height = 0; - end_image.data = load_image(params.end_image_path.c_str(), width, height, params.width, params.height); + end_image.data = load_image(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height); if (end_image.data == nullptr) { - fprintf(stderr, "load image from '%s' failed\n", params.end_image_path.c_str()); + fprintf(stderr, "load image from '%s' failed\n", gen_params.end_image_path.c_str()); release_all_resources(); return 1; } } - if (params.mask_image_path.size() > 0) { + if (gen_params.mask_image_path.size() > 0) { int c = 0; int width = 0; int height = 0; - mask_image.data = load_image(params.mask_image_path.c_str(), width, height, params.width, params.height, 1); + mask_image.data = load_image(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1); if (mask_image.data == nullptr) { - fprintf(stderr, "load image from '%s' failed\n", params.mask_image_path.c_str()); + fprintf(stderr, "load image from '%s' failed\n", gen_params.mask_image_path.c_str()); release_all_resources(); return 1; } } else { - mask_image.data = 
(uint8_t*)malloc(params.width * params.height); - memset(mask_image.data, 255, params.width * params.height); + mask_image.data = (uint8_t*)malloc(gen_params.width * gen_params.height); + memset(mask_image.data, 255, gen_params.width * gen_params.height); if (mask_image.data == nullptr) { fprintf(stderr, "malloc mask image failed\n"); release_all_resources(); @@ -1780,16 +1958,16 @@ int main(int argc, const char* argv[]) { } } - if (params.control_image_path.size() > 0) { + if (gen_params.control_image_path.size() > 0) { int width = 0; int height = 0; - control_image.data = load_image(params.control_image_path.c_str(), width, height, params.width, params.height); + control_image.data = load_image(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height); if (control_image.data == nullptr) { - fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); + fprintf(stderr, "load image from '%s' failed\n", gen_params.control_image_path.c_str()); release_all_resources(); return 1; } - if (params.canny_preprocess) { // apply preprocessor + if (cli_params.canny_preprocess) { // apply preprocessor preprocess_canny(control_image, 0.08f, 0.08f, @@ -1799,9 +1977,9 @@ int main(int argc, const char* argv[]) { } } - if (params.ref_image_paths.size() > 0) { + if (gen_params.ref_image_paths.size() > 0) { vae_decode_only = false; - for (auto& path : params.ref_image_paths) { + for (auto& path : gen_params.ref_image_paths) { int width = 0; int height = 0; uint8_t* image_buffer = load_image(path.c_str(), width, height); @@ -1817,78 +1995,40 @@ int main(int argc, const char* argv[]) { } } - if (!params.control_video_path.empty()) { - if (!load_images_from_dir(params.control_video_path, + if (!gen_params.control_video_path.empty()) { + if (!load_images_from_dir(gen_params.control_video_path, control_frames, - params.width, - params.height, - params.video_frames, - params.verbose)) { + gen_params.width, + gen_params.height, + 
gen_params.video_frames, + cli_params.verbose)) { release_all_resources(); return 1; } } - if (!params.pm_id_images_dir.empty()) { - if (!load_images_from_dir(params.pm_id_images_dir, + if (!gen_params.pm_id_images_dir.empty()) { + if (!load_images_from_dir(gen_params.pm_id_images_dir, pmid_images, 0, 0, 0, - params.verbose)) { + cli_params.verbose)) { release_all_resources(); return 1; } } - if (params.mode == VID_GEN) { + if (cli_params.mode == VID_GEN) { vae_decode_only = false; } - sd_ctx_params_t sd_ctx_params = { - params.model_path.c_str(), - params.clip_l_path.c_str(), - params.clip_g_path.c_str(), - params.clip_vision_path.c_str(), - params.t5xxl_path.c_str(), - params.llm_path.c_str(), - params.llm_vision_path.c_str(), - params.diffusion_model_path.c_str(), - params.high_noise_diffusion_model_path.c_str(), - params.vae_path.c_str(), - params.taesd_path.c_str(), - params.control_net_path.c_str(), - params.lora_model_dir.c_str(), - params.embedding_dir.c_str(), - params.photo_maker_path.c_str(), - params.tensor_type_rules.c_str(), - vae_decode_only, - true, - params.n_threads, - params.wtype, - params.rng_type, - params.sampler_rng_type, - params.prediction, - params.lora_apply_mode, - params.offload_params_to_cpu, - params.clip_on_cpu, - params.control_net_cpu, - params.vae_on_cpu, - params.diffusion_flash_attn, - params.taesd_preview, - params.diffusion_conv_direct, - params.vae_conv_direct, - params.force_sdxl_vae_conv_scale, - params.chroma_use_dit_mask, - params.chroma_use_t5_mask, - params.chroma_t5_mask_pad, - params.flow_shift, - }; + sd_ctx_params_t sd_ctx_params = ctx_params.to_sd_ctx_params_t(vae_decode_only, true, cli_params.taesd_preview); sd_image_t* results = nullptr; int num_results = 0; - if (params.mode == UPSCALE) { + if (cli_params.mode == UPSCALE) { num_results = 1; results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t)); if (results == nullptr) { @@ -1908,68 +2048,68 @@ int main(int argc, const char* argv[]) { return 1; } - if 
(params.sample_params.sample_method == SAMPLE_METHOD_COUNT) { - params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx); + if (gen_params.sample_params.sample_method == SAMPLE_METHOD_COUNT) { + gen_params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx); } - if (params.high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { - params.high_noise_sample_params.sample_method = sd_get_default_sample_method(sd_ctx); + if (gen_params.high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { + gen_params.high_noise_sample_params.sample_method = sd_get_default_sample_method(sd_ctx); } - if (params.sample_params.scheduler == SCHEDULER_COUNT) { - params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx); + if (gen_params.sample_params.scheduler == SCHEDULER_COUNT) { + gen_params.sample_params.scheduler = sd_get_default_scheduler(sd_ctx); } - if (params.mode == IMG_GEN) { + if (cli_params.mode == IMG_GEN) { sd_img_gen_params_t img_gen_params = { - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, + gen_params.prompt.c_str(), + gen_params.negative_prompt.c_str(), + gen_params.clip_skip, init_image, ref_images.data(), (int)ref_images.size(), - params.auto_resize_ref_image, - params.increase_ref_index, + gen_params.auto_resize_ref_image, + gen_params.increase_ref_index, mask_image, - params.width, - params.height, - params.sample_params, - params.strength, - params.seed, - params.batch_count, + gen_params.width, + gen_params.height, + gen_params.sample_params, + gen_params.strength, + gen_params.seed, + gen_params.batch_count, control_image, - params.control_strength, + gen_params.control_strength, { pmid_images.data(), (int)pmid_images.size(), - params.pm_id_embed_path.c_str(), - params.pm_style_strength, + gen_params.pm_id_embed_path.c_str(), + gen_params.pm_style_strength, }, // pm_params - params.vae_tiling_params, - params.easycache_params, + ctx_params.vae_tiling_params, + 
gen_params.easycache_params, }; results = generate_image(sd_ctx, &img_gen_params); - num_results = params.batch_count; - } else if (params.mode == VID_GEN) { + num_results = gen_params.batch_count; + } else if (cli_params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, + gen_params.prompt.c_str(), + gen_params.negative_prompt.c_str(), + gen_params.clip_skip, init_image, end_image, control_frames.data(), (int)control_frames.size(), - params.width, - params.height, - params.sample_params, - params.high_noise_sample_params, - params.moe_boundary, - params.strength, - params.seed, - params.video_frames, - params.vace_strength, - params.easycache_params, + gen_params.width, + gen_params.height, + gen_params.sample_params, + gen_params.high_noise_sample_params, + gen_params.moe_boundary, + gen_params.strength, + gen_params.seed, + gen_params.video_frames, + gen_params.vace_strength, + gen_params.easycache_params, }; results = generate_video(sd_ctx, &vid_gen_params, &num_results); @@ -1985,11 +2125,11 @@ int main(int argc, const char* argv[]) { } int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth - if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { - upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), - params.offload_params_to_cpu, - params.diffusion_conv_direct, - params.n_threads); + if (ctx_params.esrgan_path.size() > 0 && gen_params.upscale_repeats > 0) { + upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(), + ctx_params.offload_params_to_cpu, + ctx_params.diffusion_conv_direct, + ctx_params.n_threads); if (upscaler_ctx == nullptr) { printf("new_upscaler_ctx failed\n"); @@ -1999,7 +2139,7 @@ int main(int argc, const char* argv[]) { continue; } sd_image_t current_image = results[i]; - for (int u = 0; u < params.upscale_repeats; ++u) { + for (int u = 0; u < gen_params.upscale_repeats; ++u) { 
sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); if (upscaled_image.data == nullptr) { printf("upscale failed\n"); @@ -2015,7 +2155,7 @@ int main(int argc, const char* argv[]) { // create directory if not exists { - const fs::path out_path = params.output_path; + const fs::path out_path = cli_params.output_path; if (const fs::path out_dir = out_path.parent_path(); !out_dir.empty()) { std::error_code ec; fs::create_directories(out_dir, ec); // OK if already exists @@ -2031,26 +2171,26 @@ int main(int argc, const char* argv[]) { std::string file_ext; std::string file_ext_lower; bool is_jpg; - size_t last_dot_pos = params.output_path.find_last_of("."); - size_t last_slash_pos = std::min(params.output_path.find_last_of("/"), - params.output_path.find_last_of("\\")); + size_t last_dot_pos = cli_params.output_path.find_last_of("."); + size_t last_slash_pos = std::min(cli_params.output_path.find_last_of("/"), + cli_params.output_path.find_last_of("\\")); if (last_dot_pos != std::string::npos && (last_slash_pos == std::string::npos || last_dot_pos > last_slash_pos)) { // filename has extension - base_path = params.output_path.substr(0, last_dot_pos); - file_ext = file_ext_lower = params.output_path.substr(last_dot_pos); + base_path = cli_params.output_path.substr(0, last_dot_pos); + file_ext = file_ext_lower = cli_params.output_path.substr(last_dot_pos); std::transform(file_ext.begin(), file_ext.end(), file_ext_lower.begin(), ::tolower); is_jpg = (file_ext_lower == ".jpg" || file_ext_lower == ".jpeg" || file_ext_lower == ".jpe"); } else { - base_path = params.output_path; + base_path = cli_params.output_path; file_ext = file_ext_lower = ""; is_jpg = false; } - if (params.mode == VID_GEN && num_results > 1) { - std::string vid_output_path = params.output_path; + if (cli_params.mode == VID_GEN && num_results > 1) { + std::string vid_output_path = cli_params.output_path; if (file_ext_lower == ".png") { vid_output_path = base_path + ".avi"; } - 
create_mjpg_avi_from_sd_images(vid_output_path.c_str(), results, num_results, params.fps); + create_mjpg_avi_from_sd_images(vid_output_path.c_str(), results, num_results, gen_params.fps); printf("save result MJPG AVI video to '%s'\n", vid_output_path.c_str()); } else { // appending ".png" to absent or unknown extension @@ -2066,11 +2206,11 @@ int main(int argc, const char* argv[]) { std::string final_image_path = i > 0 ? base_path + "_" + std::to_string(i + 1) + file_ext : base_path + file_ext; if (is_jpg) { write_ok = stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 90, get_image_params(params, params.seed + i).c_str()); + results[i].data, 90, get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + i).c_str()); printf("save result JPEG image to '%s' (%s)\n", final_image_path.c_str(), write_ok == 0 ? "failure" : "success"); } else { write_ok = stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); + results[i].data, 0, get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + i).c_str()); printf("save result PNG image to '%s' (%s)\n", final_image_path.c_str(), write_ok == 0 ? 
"failure" : "success"); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 73065610d..11e2bbcbd 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2094,12 +2094,12 @@ class StableDiffusionGGML { } ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) { - int64_t t0 = ggml_time_ms(); - ggml_tensor* result = nullptr; + int64_t t0 = ggml_time_ms(); + ggml_tensor* result = nullptr; const int vae_scale_factor = get_vae_scale_factor(); int W = x->ne[0] / vae_scale_factor; int H = x->ne[1] / vae_scale_factor; - int C = get_latent_channel(); + int C = get_latent_channel(); if (vae_tiling_params.enabled && !encode_video) { // TODO wan2.2 vae support? int ne2; @@ -2224,8 +2224,8 @@ class StableDiffusionGGML { const int vae_scale_factor = get_vae_scale_factor(); int64_t W = x->ne[0] * vae_scale_factor; int64_t H = x->ne[1] * vae_scale_factor; - int64_t C = 3; - ggml_tensor* result = nullptr; + int64_t C = 3; + ggml_tensor* result = nullptr; if (decode_video) { int T = x->ne[2]; if (sd_version_is_wan(version)) { diff --git a/util.cpp b/util.cpp index 68250a910..4a59852e2 100644 --- a/util.cpp +++ b/util.cpp @@ -378,19 +378,19 @@ const char* sd_get_system_info() { static char buffer[1024]; std::stringstream ss; ss << "System Info: \n"; - ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl; - ss << " AVX = " << ggml_cpu_has_avx() << std::endl; - ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl; - ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl; - ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl; - ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl; - ss << " FMA = " << ggml_cpu_has_fma() << std::endl; - ss << " NEON = " << ggml_cpu_has_neon() << std::endl; - ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl; - ss << " F16C = " << ggml_cpu_has_f16c() << std::endl; - ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl; - ss << " WASM_SIMD = " << 
ggml_cpu_has_wasm_simd() << std::endl; - ss << " VSX = " << ggml_cpu_has_vsx() << std::endl; + ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; + ss << " AVX = " << ggml_cpu_has_avx() << " | "; + ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; + ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; + ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; + ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; + ss << " FMA = " << ggml_cpu_has_fma() << " | "; + ss << " NEON = " << ggml_cpu_has_neon() << " | "; + ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; + ss << " F16C = " << ggml_cpu_has_f16c() << " | "; + ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; + ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; + ss << " VSX = " << ggml_cpu_has_vsx() << " | "; snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); return buffer; } From 0c443ca5f042b8cfd89f40ee2f714a43d9b96f3c Mon Sep 17 00:00:00 2001 From: leejet Date: Wed, 3 Dec 2025 22:30:45 +0800 Subject: [PATCH 2/2] update docs --- examples/cli/README.md | 110 +++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/examples/cli/README.md b/examples/cli/README.md index f6490ea54..add5e3eb7 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -3,7 +3,21 @@ ``` usage: ./bin/sd [options] -Options: +CLI Options: + -o, --output path to write result image to (default: ./output.png) + --preview-path path to write preview image to (default: ./preview.png) + --preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at + every step) + --canny apply canny preprocessor (edge detection) + -v, --verbose print extra info + --color colors the logging tags according to level + --taesd-preview-only prevents usage of taesd for decoding the final image. 
(for use with --preview tae) + --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs + -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen + --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) + -h, --help show this help message and exit + +Context Options: -m, --model path to full model --clip_l path to the clip-l text encoder --clip_g path to the clip-g text encoder @@ -20,25 +34,52 @@ Options: --control-net path to control net model --embd-dir embeddings directory --lora-model-dir lora model directory - -i, --init-img path to the init image - --end-img path to the end image, required by flf2v --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model - --pm-id-images-dir path to PHOTOMAKER input id images dir - --pm-id-embed-path path to PHOTOMAKER v2 id embed + --upscale-model path to esrgan model. + -t, --threads number of threads to use during computation (default: -1). 
If threads <= 0, then threads will be set to the number of + CPU physical cores + --chroma-t5-mask-pad t5 mask pad size of chroma + --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) + --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) + --vae-tiling process vae in tiles to reduce memory usage + --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae + --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed + --control-net-cpu keep controlnet in cpu (for low vram) + --clip-on-cpu keep clip in cpu (for low vram) + --vae-on-cpu keep vae in cpu (for low vram) + --diffusion-fa use flash attention in the diffusion model + --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model + --vae-conv-direct use ggml_conv2d_direct in the vae model + --chroma-disable-dit-mask disable dit mask for chroma + --chroma-enable-t5-mask enable t5 mask for chroma + --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the + type of the weight file + --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) + --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng + --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] + --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights + contain any quantized parameters, the at_runtime mode will be used; otherwise, + immediately will be used. The immediately mode may have precision and + compatibility issues with quantized parameters, but it usually offers faster inference + speed and, in some cases, lower memory usage. The at_runtime mode, on the + other hand, is exactly the opposite. 
+ --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) + --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 + (overrides --vae-tile-size) + +Generation Options: + -p, --prompt the prompt to render + -n, --negative-prompt the negative prompt (default: "") + -i, --init-img path to the init image + --end-img path to the end image, required by flf2v --mask path to the mask image --control-image path to control image, control net --control-video path to control video frames, It must be a directory path. The video frames inside should be stored as images in lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, ... etc. - -o, --output path to write result image to (default: ./output.png) - -p, --prompt the prompt to render - -n, --negative-prompt the negative prompt (default: "") - --preview-path path to write preview image to (default: ./preview.png) - --upscale-model path to esrgan model. - -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of - CPU physical cores - --upscale-repeats Run the ESRGAN upscaler this many times (default: 1) + --pm-id-images-dir path to PHOTOMAKER input id images dir + --pm-id-embed-path path to PHOTOMAKER v2 id embed -H, --height image height, in pixel space (default: 512) -W, --width image width, in pixel space (default: 512) --steps number of sample steps (default: 20) @@ -46,13 +87,11 @@ Options: --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x -b, --batch-count batch count - --chroma-t5-mask-pad t5 mask pad size of chroma --video-frames video frames (default: 1) --fps fps (default: 24) --timestep-shift shift timestep for NitroFusion models (default: 0). 
recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant - --preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at - every step) + --upscale-repeats Run the ESRGAN upscaler this many times (default: 1) --cfg-scale unconditional guidance scale: (default: 7.0) --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --guidance distilled guidance scale for models with guidance input (default: 3.5) @@ -72,53 +111,18 @@ Options: --pm-style-strength --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1 - --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) --vace-strength wan vace strength - --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --vae-tiling process vae in tiles to reduce memory usage - --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae - --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed - --control-net-cpu keep controlnet in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --vae-on-cpu keep vae in cpu (for low vram) - --diffusion-fa use flash attention in the diffusion model - --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model - --vae-conv-direct use ggml_conv2d_direct in the vae model - --canny apply canny preprocessor (edge detection) - -v, --verbose print extra info - --color colors the logging tags according to level - --chroma-disable-dit-mask disable dit mask for chroma - --chroma-enable-t5-mask enable t5 mask for chroma --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 
1). --disable-auto-resize-ref-image disable auto resize of ref images - --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) - --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs - -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen - --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the - type of the weight file - --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) - --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng -s, --seed RNG seed (default: 42, use random seed for < 0) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) - --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow] - --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights - contain any quantized parameters, the at_runtime mode will be used; otherwise, - immediately will be used.The immediately mode may have precision and - compatibility issues with quantized parameters, but it usually offers faster inference - speed and, in some cases, lower memory usage. The at_runtime mode, on the - other hand, is exactly the opposite. 
+ --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, + ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete --skip-layers layers to skip for SLG steps (default: [7,8,9]) - --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, - ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) -r, --ref-image reference image for Flux Kontext models (can be used multiple times) - -h, --help show this help message and exit - --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) - --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 - (overrides --vae-tile-size) - --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) --easycache enable EasyCache for DiT models with optional "threshold,start_percent,end_percent" (default: 0.2,0.15,0.95) ```