diff --git a/README.md b/README.md index 4720dc29..8ce98137 100644 --- a/README.md +++ b/README.md @@ -282,14 +282,14 @@ usage: ./bin/sd [arguments] arguments: -h, --help show this help message and exit - -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img) + -M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen -t, --threads N number of threads to use during computation (default: -1) If threads <= 0, then threads will be set to the number of CPU physical cores -m, --model [MODEL] path to full model --diffusion-model path to the standalone diffusion model --clip_l path to the clip-l text encoder --clip_g path to the clip-g text encoder - --t5xxl path to the the t5xxl text encoder + --t5xxl path to the t5xxl text encoder --vae [VAE] path to vae --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) --control-net [CONTROL_PATH] path to control net model @@ -301,16 +301,18 @@ arguments: --upscale-repeats Run the ESRGAN upscaler this many times (default 1) --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K) If not specified, the default is the type of the weight file + --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --lora-model-dir [DIR] lora model directory -i, --init-img [IMAGE] path to the input image, required by img2img --mask [MASK] path to the mask image, required by img2img with mask --control-image [IMAGE] path to image condition, control net - -r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times) + -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) -o, --output OUTPUT path to write result image to (default: ./output.png) -p, --prompt [PROMPT] the prompt to render -n, --negative-prompt PROMPT the negative prompt (default: "") --cfg-scale SCALE unconditional guidance scale: (default: 7.0) - --guidance SCALE 
guidance scale for img2img (default: 3.5) + --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) + --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5) --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0) 0 means disabled, a value of 2.5 is nice for sd3.5 medium --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0) @@ -319,7 +321,7 @@ arguments: --skip-layer-end END SLG disabling point: (default: 0.2) SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END]) --strength STRENGTH strength for noising/unnoising (default: 0.75) - --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%) + --style-ratio STYLE-RATIO strength for keeping input identity (default: 20) --control-strength STRENGTH strength to apply Control Net (default: 0.9) 1.0 corresponds to full destruction of information in init image -H, --height H image height, in pixel space (default: 512) @@ -371,7 +373,7 @@ Using formats of different precisions will yield results of varying quality. ``` -./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4 +./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4 ```

diff --git a/docs/kontext.md b/docs/kontext.md index 51975255..69873503 100644 --- a/docs/kontext.md +++ b/docs/kontext.md @@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht For example: ``` - .\bin\Release\sd.exe -M edit -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v + .\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v ``` diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 234dad3a..5879967f 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1,13 +1,15 @@ #include #include #include +#include #include +#include #include +#include #include #include // #include "preprocessing.hpp" -#include "flux.hpp" #include "stable-diffusion.h" #define STB_IMAGE_IMPLEMENTATION @@ -22,58 +24,26 @@ #define STB_IMAGE_RESIZE_STATIC #include "stb_image_resize.h" -const char* rng_type_to_str[] = { - "std_default", - "cuda", -}; - -// Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "ipndm", - "ipndm_v", - "lcm", - "ddim_trailing", - "tcd", -}; - -// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", - "exponential", - "ays", - "gits", -}; +#define SAFE_STR(s) ((s) ? (s) : "") +#define BOOL_STR(b) ((b) ? 
"true" : "false") const char* modes_str[] = { - "txt2img", - "img2img", - "img2vid", - "edit", + "img_gen", + "vid_gen", "convert", }; -#define SD_ALL_MODES_STR "txt2img, img2img, edit, convert" +#define SD_ALL_MODES_STR "img_gen, vid_gen, convert" enum SDMode { - TXT2IMG, - IMG2IMG, - IMG2VID, - EDIT, + IMG_GEN, + VID_GEN, CONVERT, MODE_COUNT }; struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = IMG_GEN; std::string model_path; std::string clip_l_path; std::string clip_g_path; @@ -82,9 +52,9 @@ struct SDParams { std::string vae_path; std::string taesd_path; std::string esrgan_path; - std::string controlnet_path; - std::string embeddings_path; - std::string stacked_id_embeddings_path; + std::string control_net_path; + std::string embedding_dir; + std::string stacked_id_embed_dir; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; std::string tensor_type_rules; @@ -154,9 +124,9 @@ void print_params(SDParams params) { printf(" vae_path: %s\n", params.vae_path.c_str()); printf(" taesd_path: %s\n", params.taesd_path.c_str()); printf(" esrgan_path: %s\n", params.esrgan_path.c_str()); - printf(" controlnet_path: %s\n", params.controlnet_path.c_str()); - printf(" embeddings_path: %s\n", params.embeddings_path.c_str()); - printf(" stacked_id_embeddings_path: %s\n", params.stacked_id_embeddings_path.c_str()); + printf(" control_net_path: %s\n", params.control_net_path.c_str()); + printf(" embedding_dir: %s\n", params.embedding_dir.c_str()); + printf(" stacked_id_embed_dir: %s\n", params.stacked_id_embed_dir.c_str()); printf(" input_id_images_path: %s\n", params.input_id_images_path.c_str()); printf(" style ratio: %.2f\n", params.style_ratio); printf(" normalize input image : %s\n", params.normalize_input ? 
"true" : "false"); @@ -184,11 +154,11 @@ void print_params(SDParams params) { printf(" clip_skip: %d\n", params.clip_skip); printf(" width: %d\n", params.width); printf(" height: %d\n", params.height); - printf(" sample_method: %s\n", sample_method_str[params.sample_method]); - printf(" schedule: %s\n", schedule_str[params.schedule]); + printf(" sample_method: %s\n", sd_sample_method_name(params.sample_method)); + printf(" schedule: %s\n", sd_schedule_name(params.schedule)); printf(" sample_steps: %d\n", params.sample_steps); printf(" strength(img2img): %.2f\n", params.strength); - printf(" rng: %s\n", rng_type_to_str[params.rng_type]); + printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); printf(" seed: %ld\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); @@ -203,11 +173,7 @@ void print_usage(int argc, const char* argv[]) { printf("\n"); printf("arguments:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODE] run mode, one of:\n"); - printf(" txt2img: generate an image from a text prompt (default)\n"); - printf(" img2img: generate an image from a text prompt and an initial image (--init-img)\n"); - printf(" edit: modify an image (--ref-image) based on text instructions\n"); - printf(" convert: convert a model file to gguf format, optionally with quantization\n"); + printf(" -M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen\n"); printf(" -t, --threads N number of threads to use during computation (default: -1)\n"); printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); printf(" -m, --model [MODEL] path to full model\n"); @@ -246,7 +212,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); printf(" --strength 
STRENGTH strength for noising/unnoising (default: 0.75)\n"); - printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n"); + printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n"); printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); @@ -275,432 +241,344 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char** argv, SDParams& params) { +struct StringOption { + std::string short_name; + std::string long_name; + std::string desc; + std::string* target; +}; + +struct IntOption { + std::string short_name; + std::string long_name; + std::string desc; + int* target; +}; + +struct FloatOption { + std::string short_name; + std::string long_name; + std::string desc; + float* target; +}; + +struct BoolOption { + std::string short_name; + std::string long_name; + std::string desc; + bool keep_true; + bool* target; +}; + +struct ManualOption { + std::string short_name; + std::string long_name; + std::string desc; + std::function<int(int, const char**, int)> cb; +}; + +struct ArgOptions { + std::vector<StringOption> string_options; + std::vector<IntOption> int_options; + std::vector<FloatOption> float_options; + std::vector<BoolOption> bool_options; + std::vector<ManualOption> manual_options; +}; + +bool parse_options(int argc, const char** argv, ArgOptions& options) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { arg = argv[i]; - if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "-M" || arg == "--mode") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* mode_selected = argv[i]; - int mode_found = -1; - for (int d = 0; d < MODE_COUNT; d++) { - if (!strcmp(mode_selected, modes_str[d])) 
{ - mode_found = d; - } - } - if (mode_found == -1) { - fprintf(stderr, - "error: invalid mode %s, must be one of [%s]\n", - mode_selected, SD_ALL_MODES_STR); - exit(1); - } - params.mode = (SDMode)mode_found; - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.model_path = argv[i]; - } else if (arg == "--clip_l") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_l_path = argv[i]; - } else if (arg == "--clip_g") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_g_path = argv[i]; - } else if (arg == "--t5xxl") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.t5xxl_path = argv[i]; - } else if (arg == "--diffusion-model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.diffusion_model_path = argv[i]; - } else if (arg == "--vae") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.vae_path = argv[i]; - } else if (arg == "--taesd") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.taesd_path = argv[i]; - } else if (arg == "--control-net") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.controlnet_path = argv[i]; - } else if (arg == "--upscale-model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.esrgan_path = argv[i]; - } else if (arg == "--embd-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.embeddings_path = argv[i]; - } else if (arg == "--stacked-id-embd-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.stacked_id_embeddings_path = argv[i]; - } else if (arg == "--input-id-images-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.input_id_images_path = argv[i]; - } else if (arg == "--type") { - if (++i >= argc) { - invalid_arg = true; - break; - } - std::string type = argv[i]; - bool found = false; - std::string valid_types = ""; - for (size_t i = 0; i < SD_TYPE_COUNT; i++) { - auto trait 
= ggml_get_type_traits((ggml_type)i); - std::string name(trait->type_name); - if (name == "f32" || trait->to_float && trait->type_size) { - if (i) - valid_types += ", "; - valid_types += name; - if (type == name) { - if (ggml_quantize_requires_imatrix((ggml_type)i)) { - printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. A dummy imatrix will be used, expect poor quality.\n", trait->type_name); - } - params.wtype = (enum sd_type_t)i; - found = true; - break; - } - } - } - if (!found) { - fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n", - type.c_str(), - valid_types.c_str()); - exit(1); - } - } else if (arg == "--tensor-type-rules") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.tensor_type_rules = argv[i]; - } else if (arg == "--lora-model-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.lora_model_dir = argv[i]; - } else if (arg == "-i" || arg == "--init-img") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.input_path = argv[i]; - } else if (arg == "--mask") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.mask_path = argv[i]; - } else if (arg == "--control-image") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.control_image_path = argv[i]; - } else if (arg == "-o" || arg == "--output") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.output_path = argv[i]; - } else if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.prompt = argv[i]; - } else if (arg == "--upscale-repeats") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.upscale_repeats = std::stoi(argv[i]); - if (params.upscale_repeats < 1) { - fprintf(stderr, "error: upscale multiplier must be at least 1\n"); - exit(1); - } - } else if (arg == "-n" || arg == "--negative-prompt") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.negative_prompt = argv[i]; - 
} else if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.cfg_scale = std::stof(argv[i]); - } else if (arg == "--img-cfg-scale") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.img_cfg_scale = std::stof(argv[i]); - } else if (arg == "--guidance") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.guidance = std::stof(argv[i]); - } else if (arg == "--eta") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.eta = std::stof(argv[i]); - } else if (arg == "--strength") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.strength = std::stof(argv[i]); - } else if (arg == "--style-ratio") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.style_ratio = std::stof(argv[i]); - } else if (arg == "--control-strength") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.control_strength = std::stof(argv[i]); - } else if (arg == "-H" || arg == "--height") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.height = std::stoi(argv[i]); - } else if (arg == "-W" || arg == "--width") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.width = std::stoi(argv[i]); - } else if (arg == "--steps") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.sample_steps = std::stoi(argv[i]); - } else if (arg == "--clip-skip") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_skip = std::stoi(argv[i]); - } else if (arg == "--vae-tiling") { - params.vae_tiling = true; - } else if (arg == "--control-net-cpu") { - params.control_net_cpu = true; - } else if (arg == "--normalize-input") { - params.normalize_input = true; - } else if (arg == "--clip-on-cpu") { - params.clip_on_cpu = true; // will slow down get_learned_condiotion but necessary for low MEM GPUs - } else if (arg == "--vae-on-cpu") { - params.vae_on_cpu = true; // will slow down latent decoding but necessary for low MEM GPUs - } else if 
(arg == "--diffusion-fa") { - params.diffusion_flash_attn = true; // can reduce MEM significantly - } else if (arg == "--canny") { - params.canny_preprocess = true; - } else if (arg == "-b" || arg == "--batch-count") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.batch_count = std::stoi(argv[i]); - } else if (arg == "--rng") { - if (++i >= argc) { - invalid_arg = true; - break; - } - std::string rng_type_str = argv[i]; - if (rng_type_str == "std_default") { - params.rng_type = STD_DEFAULT_RNG; - } else if (rng_type_str == "cuda") { - params.rng_type = CUDA_RNG; - } else { - invalid_arg = true; - break; - } - } else if (arg == "--schedule") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* schedule_selected = argv[i]; - int schedule_found = -1; - for (int d = 0; d < N_SCHEDULES; d++) { - if (!strcmp(schedule_selected, schedule_str[d])) { - schedule_found = d; - } - } - if (schedule_found == -1) { - invalid_arg = true; - break; - } - params.schedule = (schedule_t)schedule_found; - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.seed = std::stoll(argv[i]); - } else if (arg == "--sampling-method") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; - for (int m = 0; m < N_SAMPLE_METHODS; m++) { - if (!strcmp(sample_method_selected, sample_method_str[m])) { - sample_method_found = m; + for (auto& option : options.string_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + if (++i >= argc) { + invalid_arg = true; + break; } + *option.target = std::string(argv[i]); } - if (sample_method_found == -1) { - invalid_arg = true; - break; - } - params.sample_method = (sample_method_t)sample_method_found; - } else if (arg == "-h" || arg == "--help") { - print_usage(argc, argv); - exit(0); - } else if (arg 
== "-v" || arg == "--verbose") { - params.verbose = true; - } else if (arg == "--color") { - params.color = true; - } else if (arg == "--slg-scale") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.slg_scale = std::stof(argv[i]); - } else if (arg == "--skip-layers") { - if (++i >= argc) { - invalid_arg = true; - break; - } - if (argv[i][0] != '[') { - invalid_arg = true; - break; - } - std::string layers_str = argv[i]; - while (layers_str.back() != ']') { + } + if (invalid_arg) { + break; + } + + for (auto& option : options.int_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { if (++i >= argc) { invalid_arg = true; break; } - layers_str += " " + std::string(argv[i]); + *option.target = std::stoi(argv[i]); } - layers_str = layers_str.substr(1, layers_str.size() - 2); - - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument& e) { + } + if (invalid_arg) { + break; + } + + for (auto& option : options.float_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + if (++i >= argc) { invalid_arg = true; break; } + *option.target = std::stof(argv[i]); } - params.skip_layers = layers; + } + if (invalid_arg) { + break; + } - if (invalid_arg) { - break; - } - } else if (arg == "--skip-layer-start") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.skip_layer_start = std::stof(argv[i]); - } else if (arg == "--skip-layer-end") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "-r" || arg == "--ref-image") { - if (++i >= argc) { - 
invalid_arg = true; - break; + for (auto& option : options.bool_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + if (option.keep_true) { + *option.target = true; + } else { + *option.target = false; + } } - params.ref_image_paths.push_back(argv[i]); - } else if (arg == "--chroma-disable-dit-mask") { - params.chroma_use_dit_mask = false; - } else if (arg == "--chroma-enable-t5-mask") { - params.chroma_use_t5_mask = true; - } else if (arg == "--chroma-t5-mask-pad") { - if (++i >= argc) { - invalid_arg = true; - break; + } + if (invalid_arg) { + break; + } + + for (auto& option : options.manual_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + int ret = option.cb(argc, argv, i); + if (ret < 0) { + invalid_arg = true; + break; + } + i += ret; } - params.chroma_t5_mask_pad = std::stoi(argv[i]); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - print_usage(argc, argv); - exit(1); + } + if (invalid_arg) { + break; } } if (invalid_arg) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + return false; + } + return true; +} + +void parse_args(int argc, const char** argv, SDParams& params) { + ArgOptions options; + options.string_options = { + {"-m", "--model", "", &params.model_path}, + {"", "--clip_l", "", &params.clip_l_path}, + {"", "--clip_g", "", &params.clip_g_path}, + {"", "--t5xxl", "", &params.t5xxl_path}, + {"", "--diffusion-model", "", &params.diffusion_model_path}, + {"", "--vae", "", &params.vae_path}, + {"", "--taesd", "", &params.taesd_path}, + {"", "--control-net", "", &params.control_net_path}, + {"", "--embd-dir", "", &params.embedding_dir}, + {"", "--stacked-id-embd-dir", "", &params.stacked_id_embed_dir}, + {"", "--lora-model-dir", "", &params.lora_model_dir}, + {"-i", "--init-img", "", &params.input_path}, + {"", "--tensor-type-rules", "", &params.tensor_type_rules}, + 
{"", "--input-id-images-dir", "", &params.input_id_images_path}, + {"", "--mask", "", &params.mask_path}, + {"", "--control-image", "", &params.control_image_path}, + {"-o", "--output", "", &params.output_path}, + {"-p", "--prompt", "", &params.prompt}, + {"-n", "--negative-prompt", "", &params.negative_prompt}, + + {"", "--upscale-model", "", &params.esrgan_path}, + }; + + options.int_options = { + {"-t", "--threads", "", &params.n_threads}, + {"", "--upscale-repeats", "", &params.upscale_repeats}, + {"-H", "--height", "", &params.height}, + {"-W", "--width", "", &params.width}, + {"", "--steps", "", &params.sample_steps}, + {"", "--clip-skip", "", &params.clip_skip}, + {"-b", "--batch-count", "", &params.batch_count}, + {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad}, + }; + + options.float_options = { + {"", "--cfg-scale", "", &params.cfg_scale}, + {"", "--img-cfg-scale", "", &params.img_cfg_scale}, + {"", "--guidance", "", &params.guidance}, + {"", "--eta", "", &params.eta}, + {"", "--strength", "", &params.strength}, + {"", "--style-ratio", "", &params.style_ratio}, + {"", "--control-strength", "", &params.control_strength}, + {"", "--slg-scale", "", &params.slg_scale}, + {"", "--skip-layer-start", "", &params.skip_layer_start}, + {"", "--skip-layer-end", "", &params.skip_layer_end}, + + }; + + options.bool_options = { + {"", "--vae-tiling", "", true, &params.vae_tiling}, + {"", "--control-net-cpu", "", true, &params.control_net_cpu}, + {"", "--normalize-input", "", true, &params.normalize_input}, + {"", "--clip-on-cpu", "", true, &params.clip_on_cpu}, + {"", "--vae-on-cpu", "", true, &params.vae_on_cpu}, + {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn}, + {"", "--canny", "", true, &params.canny_preprocess}, + {"-v", "--verbose", "", true, &params.verbose}, + {"", "--color", "", true, &params.color}, + {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask}, + {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask}, + }; + + auto on_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const 
char* mode = argv[index]; + if (mode != NULL) { + int mode_found = -1; + for (int i = 0; i < MODE_COUNT; i++) { + if (!strcmp(mode, modes_str[i])) { + mode_found = i; + } + } + if (mode_found == -1) { + fprintf(stderr, + "error: invalid mode %s, must be one of [%s]\n", + mode, SD_ALL_MODES_STR); + exit(1); + } + params.mode = (SDMode)mode_found; + } + return 1; + }; + + auto on_type_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.wtype = str_to_sd_type(arg); + if (params.wtype == SD_TYPE_COUNT) { + fprintf(stderr, "error: invalid weight format %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.rng_type = str_to_rng_type(arg); + if (params.rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid rng type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_schedule_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.schedule = str_to_schedule(arg); + if (params.schedule == SCHEDULE_COUNT) { + fprintf(stderr, "error: invalid schedule %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.sample_method = str_to_sample_method(arg); + if (params.sample_method == SAMPLE_METHOD_COUNT) { + fprintf(stderr, "error: invalid sample method %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_seed_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + params.seed = std::stoll(argv[index]); + return 1; + }; + + auto on_help_arg = [&](int argc, const char** argv, int index) { + print_usage(argc, argv); + exit(0); + return 0; + }; + + auto 
on_skip_layers_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string layers_str = argv[index]; + if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { + return -1; + } + + layers_str = layers_str.substr(1, layers_str.size() - 2); + + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + std::sregex_token_iterator end; + std::vector tokens(iter, end); + std::vector layers; + for (const auto& token : tokens) { + try { + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument& e) { + return -1; + } + } + params.skip_layers = layers; + return 1; + }; + + auto on_ref_image_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + params.ref_image_paths.push_back(argv[index]); + return 1; + }; + + options.manual_options = { + {"-M", "--mode", "", on_mode_arg}, + {"", "--type", "", on_type_arg}, + {"", "--rng", "", on_rng_arg}, + {"-s", "--seed", "", on_seed_arg}, + {"", "--sampling-method", "", on_sample_method_arg}, + {"", "--schedule", "", on_schedule_arg}, + {"", "--skip-layers", "", on_skip_layers_arg}, + {"-r", "--ref-image", "", on_ref_image_arg}, + {"-h", "--help", "", on_help_arg}, + }; + + if (!parse_options(argc, argv, options)) { print_usage(argc, argv); exit(1); } + if (params.n_threads <= 0) { params.n_threads = get_num_physical_cores(); } - if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) { + if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) { fprintf(stderr, "error: the following arguments are required: prompt\n"); print_usage(argc, argv); exit(1); @@ -712,18 +590,6 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } - if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img/img2vid mode, the 
following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } - - if (params.mode == EDIT && params.ref_image_paths.size() == 0) { - fprintf(stderr, "error: when using the edit mode, the following arguments are required: ref-image\n"); - print_usage(argc, argv); - exit(1); - } - if (params.output_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: output_path\n"); print_usage(argc, argv); @@ -754,6 +620,11 @@ void parse_args(int argc, const char** argv, SDParams& params) { fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); } + if (params.upscale_repeats < 1) { + fprintf(stderr, "error: upscale multiplier must be at least 1\n"); + exit(1); + } + if (params.seed < 0) { srand((int)time(NULL)); params.seed = rand(); @@ -804,8 +675,8 @@ std::string get_image_params(SDParams params, int64_t seed) { parameter_string += "Seed: " + std::to_string(seed) + ", "; parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; parameter_string += "Model: " + sd_basename(params.model_path) + ", "; - parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", "; - parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]); + parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; + parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_method)); if (params.schedule == KARRAS) { parameter_string += " karras"; } @@ -899,7 +770,7 @@ int main(int argc, const char* argv[]) { } } - if (params.mode == IMG2VID) { + if (params.mode == VID_GEN) { fprintf(stderr, "SVD support is broken, do not use it!!!\n"); return 1; } @@ -910,7 +781,7 @@ int main(int argc, const char* argv[]) { uint8_t* mask_image_buffer = NULL; std::vector ref_images; - if (params.mode == IMG2IMG || params.mode == IMG2VID) { + if (params.input_path.size() > 0) { 
vae_decode_only = false; int c = 0; @@ -960,7 +831,7 @@ int main(int argc, const char* argv[]) { free(input_image_buffer); input_image_buffer = resized_image_buffer; } - } else if (params.mode == EDIT) { + } else if (params.ref_image_paths.size() > 0) { vae_decode_only = false; for (auto& path : params.ref_image_paths) { int c = 0; @@ -993,39 +864,48 @@ int main(int argc, const char* argv[]) { } } - sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(), - params.clip_l_path.c_str(), - params.clip_g_path.c_str(), - params.t5xxl_path.c_str(), - params.diffusion_model_path.c_str(), - params.vae_path.c_str(), - params.taesd_path.c_str(), - params.controlnet_path.c_str(), - params.lora_model_dir.c_str(), - params.embeddings_path.c_str(), - params.stacked_id_embeddings_path.c_str(), - vae_decode_only, - params.vae_tiling, - true, - params.n_threads, - params.wtype, - params.rng_type, - params.schedule, - params.clip_on_cpu, - params.control_net_cpu, - params.vae_on_cpu, - params.diffusion_flash_attn, - params.chroma_use_dit_mask, - params.chroma_use_t5_mask, - params.chroma_t5_mask_pad); + sd_ctx_params_t sd_ctx_params = { + params.model_path.c_str(), + params.clip_l_path.c_str(), + params.clip_g_path.c_str(), + params.t5xxl_path.c_str(), + params.diffusion_model_path.c_str(), + params.vae_path.c_str(), + params.taesd_path.c_str(), + params.control_net_path.c_str(), + params.lora_model_dir.c_str(), + params.embedding_dir.c_str(), + params.stacked_id_embed_dir.c_str(), + vae_decode_only, + params.vae_tiling, + true, + params.n_threads, + params.wtype, + params.rng_type, + params.schedule, + params.clip_on_cpu, + params.control_net_cpu, + params.vae_on_cpu, + params.diffusion_flash_attn, + params.chroma_use_dit_mask, + params.chroma_use_t5_mask, + params.chroma_t5_mask_pad, + }; + + sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); return 1; } + sd_image_t input_image = {(uint32_t)params.width, + 
(uint32_t)params.height, + 3, + input_image_buffer}; + sd_image_t* control_image = NULL; - if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) { + if (params.control_net_path.size() > 0 && params.control_image_path.size() > 0) { int c = 0; control_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (control_image_buffer == NULL) { @@ -1061,107 +941,52 @@ int main(int argc, const char* argv[]) { mask_image_buffer}; sd_image_t* results; - if (params.mode == TXT2IMG) { - results = txt2img(sd_ctx, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - guidance_params, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str()); - } else if (params.mode == IMG2IMG || params.mode == IMG2VID) { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, - 3, - input_image_buffer}; - - if (params.mode == IMG2VID) { - results = img2vid(sd_ctx, - input_image, - params.width, - params.height, - params.video_frames, - params.motion_bucket_id, - params.fps, - params.augmentation_level, - guidance_params, - params.sample_method, - params.sample_steps, - params.strength, - params.seed); - if (results == NULL) { - printf("generate failed\n"); - free_sd_ctx(sd_ctx); - return 1; - } - size_t last = params.output_path.find_last_of("."); - std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; - for (int i = 0; i < params.video_frames; i++) { - if (results[i].data == NULL) { - continue; - } - std::string final_image_path = i > 0 ? 
dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; - stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); - printf("save result image to '%s'\n", final_image_path.c_str()); - free(results[i].data); - results[i].data = NULL; - } - free(results); - free_sd_ctx(sd_ctx); - return 0; - } else { - results = img2img(sd_ctx, - input_image, - mask_image, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - guidance_params, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.strength, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str()); - } - } else { // EDIT - results = edit(sd_ctx, - ref_images.data(), - ref_images.size(), - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - guidance_params, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str()); + int expected_num_results = 1; + if (params.mode == IMG_GEN) { + sd_img_gen_params_t img_gen_params = { + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + guidance_params, + input_image, + ref_images.data(), + ref_images.size(), + mask_image, + params.width, + params.height, + params.sample_method, + params.sample_steps, + params.eta, + params.strength, + params.seed, + params.batch_count, + control_image, + params.control_strength, + params.style_ratio, + params.normalize_input, + params.input_id_images_path.c_str(), + }; + + results = generate_image(sd_ctx, &img_gen_params); + expected_num_results = params.batch_count; + } else if 
(params.mode == VID_GEN) { + sd_vid_gen_params_t vid_gen_params = { + input_image, + params.width, + params.height, + guidance_params, + params.sample_method, + params.sample_steps, + params.strength, + params.seed, + params.video_frames, + params.motion_bucket_id, + params.fps, + params.augmentation_level, + }; + + results = generate_video(sd_ctx, &vid_gen_params); + expected_num_results = params.video_frames; } if (results == NULL) { @@ -1218,7 +1043,7 @@ int main(int argc, const char* argv[]) { dummy_name += ext; ext = ".png"; } - for (int i = 0; i < params.batch_count; i++) { + for (int i = 0; i < expected_num_results; i++) { if (results[i].data == NULL) { continue; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index c6b873fa..43b027ee 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -118,22 +118,6 @@ class StableDiffusionGGML { StableDiffusionGGML() = default; - StableDiffusionGGML(int n_threads, - bool vae_decode_only, - bool free_params_immediately, - std::string lora_model_dir, - rng_type_t rng_type) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir) { - if (rng_type == STD_DEFAULT_RNG) { - rng = std::make_shared(); - } else if (rng_type == CUDA_RNG) { - rng = std::make_shared(); - } - } - ~StableDiffusionGGML() { if (clip_backend != backend) { ggml_backend_free(clip_backend); @@ -147,27 +131,7 @@ class StableDiffusionGGML { ggml_backend_free(backend); } - bool load_from_file(const std::string& model_path, - const std::string& clip_l_path, - const std::string& clip_g_path, - const std::string& t5xxl_path, - const std::string& diffusion_model_path, - const std::string& vae_path, - const std::string control_net_path, - const std::string embeddings_path, - const std::string id_embeddings_path, - const std::string& taesd_path, - bool vae_tiling_, - ggml_type wtype, - schedule_t schedule, - bool clip_on_cpu, - bool control_net_cpu, - bool 
vae_on_cpu, - bool diffusion_flash_attn, - bool chroma_use_dit_mask, - bool chroma_use_t5_mask, - int chroma_t5_mask_pad) { - use_tiny_autoencoder = taesd_path.size() > 0; + void init_backend() { #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); backend = ggml_backend_cuda_init(0); @@ -203,62 +167,80 @@ class StableDiffusionGGML { LOG_DEBUG("Using CPU backend"); backend = ggml_backend_cpu_init(); } + } - ModelLoader model_loader; + bool init(const sd_ctx_params_t* sd_ctx_params) { + n_threads = sd_ctx_params->n_threads; + vae_decode_only = sd_ctx_params->vae_decode_only; + free_params_immediately = sd_ctx_params->free_params_immediately; + lora_model_dir = SAFE_STR(sd_ctx_params->lora_model_dir); + use_tiny_autoencoder = taesd_path.size() > 0; + vae_tiling = sd_ctx_params->vae_tiling; - vae_tiling = vae_tiling_; + if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { + rng = std::make_shared(); + } else if (sd_ctx_params->rng_type == CUDA_RNG) { + rng = std::make_shared(); + } + + init_backend(); + + ModelLoader model_loader; - if (model_path.size() > 0) { - LOG_INFO("loading model from '%s'", model_path.c_str()); - if (!model_loader.init_from_file(model_path)) { - LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { + LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); + if (!model_loader.init_from_file(sd_ctx_params->model_path)) { + LOG_ERROR("init model loader from file failed: '%s'", sd_ctx_params->model_path); } } - if (diffusion_model_path.size() > 0) { - LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str()); - if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) { - LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) > 0) { + LOG_INFO("loading diffusion model from '%s'", sd_ctx_params->diffusion_model_path); + if 
(!model_loader.init_from_file(sd_ctx_params->diffusion_model_path, "model.diffusion_model.")) { + LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->diffusion_model_path); } } bool is_unet = model_loader.model_is_unet(); - if (clip_l_path.size() > 0) { - LOG_INFO("loading clip_l from '%s'", clip_l_path.c_str()); - if (!model_loader.init_from_file(clip_l_path, is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.")) { - LOG_WARN("loading clip_l from '%s' failed", clip_l_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) { + LOG_INFO("loading clip_l from '%s'", sd_ctx_params->clip_l_path); + std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer."; + if (!model_loader.init_from_file(sd_ctx_params->clip_l_path, prefix)) { + LOG_WARN("loading clip_l from '%s' failed", sd_ctx_params->clip_l_path); } } - if (clip_g_path.size() > 0) { - LOG_INFO("loading clip_g from '%s'", clip_g_path.c_str()); - if (!model_loader.init_from_file(clip_g_path, is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.")) { - LOG_WARN("loading clip_g from '%s' failed", clip_g_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->clip_g_path)) > 0) { + LOG_INFO("loading clip_g from '%s'", sd_ctx_params->clip_g_path); + std::string prefix = is_unet ? "cond_stage_model.1.transformer." 
: "text_encoders.clip_g.transformer."; + if (!model_loader.init_from_file(sd_ctx_params->clip_g_path, prefix)) { + LOG_WARN("loading clip_g from '%s' failed", sd_ctx_params->clip_g_path); } } - if (t5xxl_path.size() > 0) { - LOG_INFO("loading t5xxl from '%s'", t5xxl_path.c_str()); - if (!model_loader.init_from_file(t5xxl_path, "text_encoders.t5xxl.transformer.")) { - LOG_WARN("loading t5xxl from '%s' failed", t5xxl_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->t5xxl_path)) > 0) { + LOG_INFO("loading t5xxl from '%s'", sd_ctx_params->t5xxl_path); + if (!model_loader.init_from_file(sd_ctx_params->t5xxl_path, "text_encoders.t5xxl.transformer.")) { + LOG_WARN("loading t5xxl from '%s' failed", sd_ctx_params->t5xxl_path); } } - if (vae_path.size() > 0) { - LOG_INFO("loading vae from '%s'", vae_path.c_str()); - if (!model_loader.init_from_file(vae_path, "vae.")) { - LOG_WARN("loading vae from '%s' failed", vae_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) { + LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path); + if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) { + LOG_WARN("loading vae from '%s' failed", sd_ctx_params->vae_path); } } version = model_loader.get_sd_version(); if (version == VERSION_COUNT) { - LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str()); + LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path)); return false; } LOG_INFO("Version: %s ", model_version_to_str[version]); + ggml_type wtype = (ggml_type)sd_ctx_params->wtype; if (wtype == GGML_TYPE_COUNT) { model_wtype = model_loader.get_sd_wtype(); if (model_wtype == GGML_TYPE_COUNT) { @@ -300,7 +282,7 @@ class StableDiffusionGGML { if (sd_version_is_sdxl(version)) { scale_factor = 0.13025f; - if (vae_path.size() == 0 && taesd_path.size() == 0) { + if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) { LOG_WARN( "!!!It looks like you are using SDXL 
model. " "If you find that the generated images are completely black, " @@ -314,6 +296,8 @@ class StableDiffusionGGML { // TODO: shift_factor } + bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; + if (version == VERSION_SVD) { clip_vision = std::make_shared(backend, model_loader.tensor_storages_types); clip_vision->alloc_params_buffer(); @@ -341,11 +325,11 @@ class StableDiffusionGGML { LOG_INFO("CLIP: Using CPU backend"); clip_backend = ggml_backend_cpu_init(); } - if (diffusion_flash_attn) { + if (sd_ctx_params->diffusion_flash_attn) { LOG_INFO("Using flash attention in the diffusion model"); } if (sd_version_is_sd3(version)) { - if (diffusion_flash_attn) { + if (sd_ctx_params->diffusion_flash_attn) { LOG_WARN("flash attention in this diffusion model is currently unsupported!"); } cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); @@ -359,18 +343,36 @@ class StableDiffusionGGML { } } if (is_chroma) { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad); + cond_stage_model = std::make_shared(clip_backend, + model_loader.tensor_storages_types, + -1, + sd_ctx_params->chroma_use_t5_mask, + sd_ctx_params->chroma_t5_mask_pad); } else { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); } - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask); + diffusion_model = std::make_shared(backend, + model_loader.tensor_storages_types, + version, + sd_ctx_params->diffusion_flash_attn, + sd_ctx_params->chroma_use_dit_mask); } else { - if (id_embeddings_path.find("v2") != std::string::npos) { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2); + if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { + cond_stage_model = std::make_shared(clip_backend, + 
model_loader.tensor_storages_types, + SAFE_STR(sd_ctx_params->embedding_dir), + version, + PM_VERSION_2); } else { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, embeddings_path, version); + cond_stage_model = std::make_shared(clip_backend, + model_loader.tensor_storages_types, + SAFE_STR(sd_ctx_params->embedding_dir), + version); } - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn); + diffusion_model = std::make_shared(backend, + model_loader.tensor_storages_types, + version, + sd_ctx_params->diffusion_flash_attn); } cond_stage_model->alloc_params_buffer(); @@ -380,23 +382,32 @@ class StableDiffusionGGML { diffusion_model->get_param_tensors(tensors); if (!use_tiny_autoencoder) { - if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { + if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("VAE Autoencoder: Using CPU backend"); vae_backend = ggml_backend_cpu_init(); } else { vae_backend = backend; } - first_stage_model = std::make_shared(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version); + first_stage_model = std::make_shared(vae_backend, + model_loader.tensor_storages_types, + "first_stage_model", + vae_decode_only, + false, + version); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { - tae_first_stage = std::make_shared(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version); + tae_first_stage = std::make_shared(backend, + model_loader.tensor_storages_types, + "decoder.layers", + vae_decode_only, + version); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); - if (control_net_path.size() > 0) { + if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { ggml_backend_t controlnet_backend = NULL; - if (control_net_cpu && !ggml_backend_is_cpu(backend)) { + if 
(sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_DEBUG("ControlNet: Using CPU backend"); controlnet_backend = ggml_backend_cpu_init(); } else { @@ -405,21 +416,21 @@ class StableDiffusionGGML { control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); } - if (id_embeddings_path.find("v2") != std::string::npos) { + if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2); LOG_INFO("using PhotoMaker Version 2"); } else { pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version); } - if (id_embeddings_path.size() > 0) { - pmid_lora = std::make_shared(backend, id_embeddings_path, ""); + if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) { + pmid_lora = std::make_shared(backend, sd_ctx_params->stacked_id_embed_dir, ""); if (!pmid_lora->load_from_file(true)) { - LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str()); + LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->stacked_id_embed_dir); return false; } - LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", id_embeddings_path.c_str()); - if (!model_loader.init_from_file(id_embeddings_path, "pmid.")) { - LOG_WARN("loading stacked ID embedding from '%s' failed", id_embeddings_path.c_str()); + LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->stacked_id_embed_dir); + if (!model_loader.init_from_file(sd_ctx_params->stacked_id_embed_dir, "pmid.")) { + LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->stacked_id_embed_dir); } else { stacked_id = true; } @@ -491,7 +502,7 @@ class StableDiffusionGGML { } size_t control_net_params_mem_size = 0; if (control_net) { - if (!control_net->load_from_file(control_net_path)) { + if 
(!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) { return false; } control_net_params_mem_size = control_net->get_params_buffer_size(); @@ -547,7 +558,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_ms(); - LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); + LOG_INFO("loading model from '%s' completed, taking %.2fs", SAFE_STR(sd_ctx_params->model_path), (t1 - t0) * 1.0f / 1000); // check is_using_v_parameterization_for_sd2 @@ -592,8 +603,8 @@ class StableDiffusionGGML { LOG_INFO("running in eps-prediction mode"); } - if (schedule != DEFAULT) { - switch (schedule) { + if (sd_ctx_params->schedule != DEFAULT) { + switch (sd_ctx_params->schedule) { case DISCRETE: LOG_INFO("running with discrete schedule"); denoiser->schedule = std::make_shared(); @@ -620,7 +631,7 @@ class StableDiffusionGGML { // Don't touch anything. break; default: - LOG_ERROR("Unknown schedule %i", schedule); + LOG_ERROR("Unknown schedule %i", sd_ctx_params->schedule); abort(); } } @@ -1185,80 +1196,301 @@ class StableDiffusionGGML { /*================================================= SD API ==================================================*/ +#define NONE_STR "NONE" + +const char* sd_type_name(enum sd_type_t type) { + return ggml_type_name((ggml_type)type); +} + +enum sd_type_t str_to_sd_type(const char* str) { + for (int i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (!strcmp(str, trait->type_name)) { + return (enum sd_type_t)i; + } + } + return SD_TYPE_COUNT; +} + +const char* rng_type_to_str[] = { + "std_default", + "cuda", +}; + +const char* sd_rng_type_name(enum rng_type_t rng_type) { + if (rng_type < RNG_TYPE_COUNT) { + return rng_type_to_str[rng_type]; + } + return NONE_STR; +} + +enum rng_type_t str_to_rng_type(const char* str) { + for (int i = 0; i < RNG_TYPE_COUNT; i++) { + if (!strcmp(str, rng_type_to_str[i])) { + return (enum rng_type_t)i; + } + } + 
return RNG_TYPE_COUNT; +} + +const char* sample_method_to_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "ipndm", + "ipndm_v", + "lcm", + "ddim_trailing", + "tcd", +}; + +const char* sd_sample_method_name(enum sample_method_t sample_method) { + if (sample_method < SAMPLE_METHOD_COUNT) { + return sample_method_to_str[sample_method]; + } + return NONE_STR; +} + +enum sample_method_t str_to_sample_method(const char* str) { + for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) { + if (!strcmp(str, sample_method_to_str[i])) { + return (enum sample_method_t)i; + } + } + return SAMPLE_METHOD_COUNT; +} + +const char* schedule_to_str[] = { + "default", + "discrete", + "karras", + "exponential", + "ays", + "gits", +}; + +const char* sd_schedule_name(enum schedule_t schedule) { + if (schedule < SCHEDULE_COUNT) { + return schedule_to_str[schedule]; + } + return NONE_STR; +} + +enum schedule_t str_to_schedule(const char* str) { + for (int i = 0; i < SCHEDULE_COUNT; i++) { + if (!strcmp(str, schedule_to_str[i])) { + return (enum schedule_t)i; + } + } + return SCHEDULE_COUNT; +} + +void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { + memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t)); + sd_ctx_params->vae_decode_only = true; + sd_ctx_params->vae_tiling = false; + sd_ctx_params->free_params_immediately = true; + sd_ctx_params->n_threads = get_num_physical_cores(); + sd_ctx_params->wtype = SD_TYPE_COUNT; + sd_ctx_params->rng_type = CUDA_RNG; + sd_ctx_params->schedule = DEFAULT; + sd_ctx_params->keep_clip_on_cpu = false; + sd_ctx_params->keep_control_net_on_cpu = false; + sd_ctx_params->keep_vae_on_cpu = false; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->chroma_use_dit_mask = true; + sd_ctx_params->chroma_use_t5_mask = false; + sd_ctx_params->chroma_t5_mask_pad = 1; +} + +char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { + char* buf = (char*)malloc(4096); + if (!buf) + return NULL; + 
buf[0] = '\0'; + + snprintf(buf + strlen(buf), 4096 - strlen(buf), + "model_path: %s\n" + "clip_l_path: %s\n" + "clip_g_path: %s\n" + "t5xxl_path: %s\n" + "diffusion_model_path: %s\n" + "vae_path: %s\n" + "taesd_path: %s\n" + "control_net_path: %s\n" + "lora_model_dir: %s\n" + "embedding_dir: %s\n" + "stacked_id_embed_dir: %s\n" + "vae_decode_only: %s\n" + "vae_tiling: %s\n" + "free_params_immediately: %s\n" + "n_threads: %d\n" + "wtype: %s\n" + "rng_type: %s\n" + "schedule: %s\n" + "keep_clip_on_cpu: %s\n" + "keep_control_net_on_cpu: %s\n" + "keep_vae_on_cpu: %s\n" + "diffusion_flash_attn: %s\n" + "chroma_use_dit_mask: %s\n" + "chroma_use_t5_mask: %s\n" + "chroma_t5_mask_pad: %d\n", + SAFE_STR(sd_ctx_params->model_path), + SAFE_STR(sd_ctx_params->clip_l_path), + SAFE_STR(sd_ctx_params->clip_g_path), + SAFE_STR(sd_ctx_params->t5xxl_path), + SAFE_STR(sd_ctx_params->diffusion_model_path), + SAFE_STR(sd_ctx_params->vae_path), + SAFE_STR(sd_ctx_params->taesd_path), + SAFE_STR(sd_ctx_params->control_net_path), + SAFE_STR(sd_ctx_params->lora_model_dir), + SAFE_STR(sd_ctx_params->embedding_dir), + SAFE_STR(sd_ctx_params->stacked_id_embed_dir), + BOOL_STR(sd_ctx_params->vae_decode_only), + BOOL_STR(sd_ctx_params->vae_tiling), + BOOL_STR(sd_ctx_params->free_params_immediately), + sd_ctx_params->n_threads, + sd_type_name(sd_ctx_params->wtype), + sd_rng_type_name(sd_ctx_params->rng_type), + sd_schedule_name(sd_ctx_params->schedule), + BOOL_STR(sd_ctx_params->keep_clip_on_cpu), + BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), + BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + BOOL_STR(sd_ctx_params->diffusion_flash_attn), + BOOL_STR(sd_ctx_params->chroma_use_dit_mask), + BOOL_STR(sd_ctx_params->chroma_use_t5_mask), + sd_ctx_params->chroma_t5_mask_pad); + + return buf; +} + +void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { + memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t)); + sd_img_gen_params->clip_skip = -1; + 
sd_img_gen_params->guidance.txt_cfg = 7.0f; + sd_img_gen_params->guidance.min_cfg = 1.0f; + sd_img_gen_params->guidance.img_cfg = INFINITY; + sd_img_gen_params->guidance.distilled_guidance = 3.5f; + sd_img_gen_params->guidance.slg.layer_count = 0; + sd_img_gen_params->guidance.slg.layer_start = 0.01f; + sd_img_gen_params->guidance.slg.layer_end = 0.2f; + sd_img_gen_params->guidance.slg.scale = 0.f; + sd_img_gen_params->ref_images_count = 0; + sd_img_gen_params->width = 512; + sd_img_gen_params->height = 512; + sd_img_gen_params->sample_method = EULER_A; + sd_img_gen_params->sample_steps = 20; + sd_img_gen_params->eta = 0.f; + sd_img_gen_params->strength = 0.75f; + sd_img_gen_params->seed = -1; + sd_img_gen_params->batch_count = 1; + sd_img_gen_params->control_strength = 0.9f; + sd_img_gen_params->style_strength = 20.f; + sd_img_gen_params->normalize_input = false; +} + +char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { + char* buf = (char*)malloc(4096); + if (!buf) + return NULL; + buf[0] = '\0'; + + snprintf(buf + strlen(buf), 4096 - strlen(buf), + "prompt: %s\n" + "negative_prompt: %s\n" + "clip_skip: %d\n" + "txt_cfg: %.2f\n" + "img_cfg: %.2f\n" + "min_cfg: %.2f\n" + "distilled_guidance: %.2f\n" + "slg.layer_count: %zu\n" + "slg.layer_start: %.2f\n" + "slg.layer_end: %.2f\n" + "slg.scale: %.2f\n" + "width: %d\n" + "height: %d\n" + "sample_method: %s\n" + "sample_steps: %d\n" + "eta: %.2f\n" + "strength: %.2f\n" + "seed: %" PRId64 + "\n" + "batch_count: %d\n" + "ref_images_count: %d\n" + "control_strength: %.2f\n" + "style_strength: %.2f\n" + "normalize_input: %s\n" + "input_id_images_path: %s\n", + SAFE_STR(sd_img_gen_params->prompt), + SAFE_STR(sd_img_gen_params->negative_prompt), + sd_img_gen_params->clip_skip, + sd_img_gen_params->guidance.txt_cfg, + sd_img_gen_params->guidance.img_cfg, + sd_img_gen_params->guidance.min_cfg, + sd_img_gen_params->guidance.distilled_guidance, + sd_img_gen_params->guidance.slg.layer_count, + 
sd_img_gen_params->guidance.slg.layer_start, + sd_img_gen_params->guidance.slg.layer_end, + sd_img_gen_params->guidance.slg.scale, + sd_img_gen_params->width, + sd_img_gen_params->height, + sd_sample_method_name(sd_img_gen_params->sample_method), + sd_img_gen_params->sample_steps, + sd_img_gen_params->eta, + sd_img_gen_params->strength, + sd_img_gen_params->seed, + sd_img_gen_params->batch_count, + sd_img_gen_params->ref_images_count, + sd_img_gen_params->control_strength, + sd_img_gen_params->style_strength, + BOOL_STR(sd_img_gen_params->normalize_input), + SAFE_STR(sd_img_gen_params->input_id_images_path)); + + return buf; +} + +void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { + memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t)); + sd_vid_gen_params->guidance.txt_cfg = 7.0f; + sd_vid_gen_params->guidance.min_cfg = 1.0f; + sd_vid_gen_params->guidance.img_cfg = INFINITY; + sd_vid_gen_params->guidance.distilled_guidance = 3.5f; + sd_vid_gen_params->guidance.slg.layer_count = 0; + sd_vid_gen_params->guidance.slg.layer_start = 0.01f; + sd_vid_gen_params->guidance.slg.layer_end = 0.2f; + sd_vid_gen_params->guidance.slg.scale = 0.f; + sd_vid_gen_params->width = 512; + sd_vid_gen_params->height = 512; + sd_vid_gen_params->sample_method = EULER_A; + sd_vid_gen_params->sample_steps = 20; + sd_vid_gen_params->strength = 0.75f; + sd_vid_gen_params->seed = -1; + sd_vid_gen_params->video_frames = 6; + sd_vid_gen_params->motion_bucket_id = 127; + sd_vid_gen_params->fps = 6; + sd_vid_gen_params->augmentation_level = 0.f; +} + struct sd_ctx_t { StableDiffusionGGML* sd = NULL; }; -sd_ctx_t* new_sd_ctx(const char* model_path_c_str, - const char* clip_l_path_c_str, - const char* clip_g_path_c_str, - const char* t5xxl_path_c_str, - const char* diffusion_model_path_c_str, - const char* vae_path_c_str, - const char* taesd_path_c_str, - const char* control_net_path_c_str, - const char* lora_model_dir_c_str, - const char* embed_dir_c_str, - const char* 
id_embed_dir_c_str, - bool vae_decode_only, - bool vae_tiling, - bool free_params_immediately, - int n_threads, - enum sd_type_t wtype, - enum rng_type_t rng_type, - enum schedule_t s, - bool keep_clip_on_cpu, - bool keep_control_net_cpu, - bool keep_vae_on_cpu, - bool diffusion_flash_attn, - bool chroma_use_dit_mask, - bool chroma_use_t5_mask, - int chroma_t5_mask_pad) { +sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } - std::string model_path(model_path_c_str); - std::string clip_l_path(clip_l_path_c_str); - std::string clip_g_path(clip_g_path_c_str); - std::string t5xxl_path(t5xxl_path_c_str); - std::string diffusion_model_path(diffusion_model_path_c_str); - std::string vae_path(vae_path_c_str); - std::string taesd_path(taesd_path_c_str); - std::string control_net_path(control_net_path_c_str); - std::string embd_path(embed_dir_c_str); - std::string id_embd_path(id_embed_dir_c_str); - std::string lora_model_dir(lora_model_dir_c_str); - - sd_ctx->sd = new StableDiffusionGGML(n_threads, - vae_decode_only, - free_params_immediately, - lora_model_dir, - rng_type); + + sd_ctx->sd = new StableDiffusionGGML(); if (sd_ctx->sd == NULL) { return NULL; } - if (!sd_ctx->sd->load_from_file(model_path, - clip_l_path, - clip_g_path, - t5xxl_path_c_str, - diffusion_model_path, - vae_path, - control_net_path, - embd_path, - id_embd_path, - taesd_path, - vae_tiling, - (ggml_type)wtype, - s, - keep_clip_on_cpu, - keep_control_net_cpu, - keep_vae_on_cpu, - diffusion_flash_attn, - chroma_use_dit_mask, - chroma_use_t5_mask, - chroma_t5_mask_pad)) { + if (!sd_ctx->sd->init(sd_ctx_params)) { delete sd_ctx->sd; sd_ctx->sd = NULL; free(sd_ctx); @@ -1275,28 +1507,28 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, - struct ggml_context* work_ctx, - ggml_tensor* init_latent, - std::string prompt, - std::string negative_prompt, - int 
clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - const std::vector& sigmas, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - std::string input_id_images_path, - std::vector ref_latents, - ggml_tensor* concat_latent = NULL, - ggml_tensor* denoise_mask = NULL) { +sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, + struct ggml_context* work_ctx, + ggml_tensor* init_latent, + std::string prompt, + std::string negative_prompt, + int clip_skip, + sd_guidance_params_t guidance, + float eta, + int width, + int height, + enum sample_method_t sample_method, + const std::vector& sigmas, + int64_t seed, + int batch_count, + const sd_image_t* control_cond, + float control_strength, + float style_ratio, + bool normalize_input, + std::string input_id_images_path, + std::vector ref_latents, + ggml_tensor* concat_latent = NULL, + ggml_tensor* denoise_mask = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1639,25 +1871,11 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, return init_latent; } -sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - const char* input_id_images_path_c_str) { - LOG_DEBUG("txt2img %dx%d", width, height); - if (sd_ctx == NULL) { +sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { + int width = sd_img_gen_params->width; + int height = sd_img_gen_params->height; + LOG_DEBUG("generate_image %dx%d", width, height); + if (sd_ctx == NULL || sd_img_gen_params == NULL) { return NULL; } @@ -1672,94 +1890,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - params.mem_size += width * height * 3 * sizeof(float); - params.mem_size *= batch_count; - params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - - struct ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return NULL; - } - - size_t t0 = ggml_time_ms(); - - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); - } - - ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height); - - sd_image_t* result_images = generate_image(sd_ctx, - work_ctx, - init_latent, - prompt_c_str, - negative_prompt_c_str, - clip_skip, - guidance, - eta, - width, - height, - 
sample_method, - sigmas, - seed, - batch_count, - control_cond, - control_strength, - style_ratio, - normalize_input, - input_id_images_path_c_str, - {}); - - size_t t1 = ggml_time_ms(); - - LOG_INFO("txt2img completed in %.2fs", (t1 - t0) * 1.0f / 1000); - - return result_images; -} - -sd_image_t* img2img(sd_ctx_t* sd_ctx, - sd_image_t init_image, - sd_image_t mask, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - const char* input_id_images_path_c_str) { - LOG_DEBUG("img2img %dx%d", width, height); - if (sd_ctx == NULL) { - return NULL; - } - - struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - if (sd_version_is_sd3(sd_ctx->sd->version)) { - params.mem_size *= 2; - } - if (sd_version_is_flux(sd_ctx->sd->version)) { - params.mem_size *= 3; - } - if (sd_ctx->sd->stacked_id) { - params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB - } params.mem_size += width * height * 3 * sizeof(float) * 3; - params.mem_size *= batch_count; + params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count; + params.mem_size *= sd_img_gen_params->batch_count; params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -1770,155 +1903,197 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, return NULL; } - size_t t0 = ggml_time_ms(); - + int64_t seed = sd_img_gen_params->seed; if (seed < 0) { srand((int)time(NULL)); seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + size_t t0 
= ggml_time_ms(); - sd_mask_to_tensor(mask.data, mask_img); + ggml_tensor* init_latent = NULL; + ggml_tensor* concat_latent = NULL; + ggml_tensor* denoise_mask = NULL; + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_img_gen_params->sample_steps); - sd_image_to_tensor(init_image.data, init_img); + if (sd_img_gen_params->init_image.data) { + LOG_INFO("IMG2IMG"); - ggml_tensor* concat_latent; - ggml_tensor* denoise_mask = NULL; + size_t t_enc = static_cast(sd_img_gen_params->sample_steps * sd_img_gen_params->strength); + if (t_enc == sd_img_gen_params->sample_steps) + t_enc--; + LOG_INFO("target t_enc is %zu steps", t_enc); + std::vector sigma_sched; + sigma_sched.assign(sigmas.begin() + sd_img_gen_params->sample_steps - t_enc - 1, sigmas.end()); + sigmas = sigma_sched; - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = 8 * 8; // flatten the whole mask - } - ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_apply_mask(init_img, mask_img, masked_img); - ggml_tensor* masked_latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - } - concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1); - for (int ix = 0; ix < masked_latent->ne[0]; ix++) { - for (int iy = 0; iy < masked_latent->ne[1]; iy++) { - int mx = ix * 8; - int my = iy * 8; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); - ggml_tensor_set_f32(concat_latent, v, ix, iy, k); - } - // "Encode" 8x8 mask chunks into a flattened 
1x64 vector, and concatenate to masked image - for (int x = 0; x < 8; x++) { - for (int y = 0; y < 8; y++) { - float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); - // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?) - // python code was using "b (h 8) (w 8) -> b (8 8) h w" - ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y); + ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + + sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img); + sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img); + + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + int64_t mask_channels = 1; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + mask_channels = 8 * 8; // flatten the whole mask + } + ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + sd_apply_mask(init_img, mask_img, masked_img); + ggml_tensor* masked_latent = NULL; + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + } + concat_latent = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + masked_latent->ne[0], + masked_latent->ne[1], + mask_channels + masked_latent->ne[2], + 1); + for (int ix = 0; ix < masked_latent->ne[0]; ix++) { + for (int iy = 0; iy < masked_latent->ne[1]; iy++) { + int mx = ix * 8; + int my = iy * 8; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + for (int k = 0; k < masked_latent->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); + ggml_tensor_set_f32(concat_latent, v, ix, iy, k); + } + // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image + for 
(int x = 0; x < 8; x++) { + for (int y = 0; y < 8; y++) { + float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); + // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?) + // python code was using "b (h 8) (w 8) -> b (8 8) h w" + ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y); + } + } + } else { + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(concat_latent, m, ix, iy, 0); + for (int k = 0; k < masked_latent->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); + ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels); } - } - } else { - float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(concat_latent, m, ix, iy, 0); - for (int k = 0; k < masked_latent->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); - ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels); } } } } - } - { - // LOG_WARN("Inpainting with a base model is not great"); - denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); - for (int ix = 0; ix < denoise_mask->ne[0]; ix++) { - for (int iy = 0; iy < denoise_mask->ne[1]; iy++) { - int mx = ix * 8; - int my = iy * 8; - float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(denoise_mask, m, ix, iy); + { + // LOG_WARN("Inpainting with a base model is not great"); + denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); + for (int ix = 0; ix < denoise_mask->ne[0]; ix++) { + for (int iy = 0; iy < denoise_mask->ne[1]; iy++) { + int mx = ix * 8; + int my = iy * 8; + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(denoise_mask, m, ix, iy); + } } } - } - ggml_tensor* init_latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, 
moments); + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + } } else { - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + LOG_INFO("TXT2IMG"); + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); + } + init_latent = generate_init_latent(sd_ctx, work_ctx, width, height); + } + + if (sd_img_gen_params->ref_images_count > 0) { + LOG_INFO("EDIT mode"); + } + + std::vector ref_latents; + for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { + ggml_tensor* img = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + sd_img_gen_params->ref_images[i].width, + sd_img_gen_params->ref_images[i].height, + 3, + 1); + sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img); + + ggml_tensor* latent = NULL; + if (sd_ctx->sd->use_tiny_autoencoder) { + latent = sd_ctx->sd->encode_first_stage(work_ctx, img); + } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) { + latent = sd_ctx->sd->encode_first_stage(work_ctx, img); + latent = ggml_view_3d(work_ctx, + latent, + latent->ne[0], + latent->ne[1], + latent->ne[2] / 2, + latent->nb[1], + latent->nb[2], + 0); + } else { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img); + latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } + ref_latents.push_back(latent); } - size_t t1 = ggml_time_ms(); - LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); - if (t_enc == sample_steps) - t_enc--; - LOG_INFO("target t_enc is %zu steps", t_enc); - std::vector sigma_sched; - sigma_sched.assign(sigmas.begin() + 
sample_steps - t_enc - 1, sigmas.end()); - - sd_image_t* result_images = generate_image(sd_ctx, - work_ctx, - init_latent, - prompt_c_str, - negative_prompt_c_str, - clip_skip, - guidance, - eta, - width, - height, - sample_method, - sigma_sched, - seed, - batch_count, - control_cond, - control_strength, - style_ratio, - normalize_input, - input_id_images_path_c_str, - {}, - concat_latent, - denoise_mask); + if (sd_img_gen_params->init_image.data != NULL || sd_img_gen_params->ref_images_count > 0) { + size_t t1 = ggml_time_ms(); + LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + } + + sd_image_t* result_images = generate_image_internal(sd_ctx, + work_ctx, + init_latent, + SAFE_STR(sd_img_gen_params->prompt), + SAFE_STR(sd_img_gen_params->negative_prompt), + sd_img_gen_params->clip_skip, + sd_img_gen_params->guidance, + sd_img_gen_params->eta, + width, + height, + sd_img_gen_params->sample_method, + sigmas, + seed, + sd_img_gen_params->batch_count, + sd_img_gen_params->control_cond, + sd_img_gen_params->control_strength, + sd_img_gen_params->style_strength, + sd_img_gen_params->normalize_input, + sd_img_gen_params->input_id_images_path, + ref_latents, + concat_latent, + denoise_mask); size_t t2 = ggml_time_ms(); - LOG_INFO("img2img completed in %.2fs", (t2 - t0) * 1.0f / 1000); + LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000); return result_images; } -SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, - sd_image_t init_image, - int width, - int height, - int video_frames, - int motion_bucket_id, - int fps, - float augmentation_level, - sd_guidance_params_t guidance, - enum sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed) { - if (sd_ctx == NULL) { +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) { + if (sd_ctx == NULL || sd_vid_gen_params == NULL) { return NULL; } + int width = sd_vid_gen_params->width; + int height = 
sd_vid_gen_params->height; LOG_INFO("img2vid %dx%d", width, height); - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_vid_gen_params->sample_steps); struct ggml_init_params params; params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB - params.mem_size += width * height * 3 * sizeof(float) * video_frames; + params.mem_size += width * height * 3 * sizeof(float) * sd_vid_gen_params->video_frames; params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -1930,6 +2105,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, return NULL; } + int64_t seed = sd_vid_gen_params->seed; if (seed < 0) { seed = (int)time(NULL); } @@ -1939,12 +2115,12 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, int64_t t0 = ggml_time_ms(); SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx, - init_image, + sd_vid_gen_params->init_image, width, height, - fps, - motion_bucket_id, - augmentation_level); + sd_vid_gen_params->fps, + sd_vid_gen_params->motion_bucket_id, + sd_vid_gen_params->augmentation_level); auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn); ggml_set_f32(uc_crossattn, 0.f); @@ -1966,13 +2142,13 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, int C = 4; int W = width / 8; int H = height / 8; - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames); + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames); ggml_set_f32(x_t, 0.f); - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames); + struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); + LOG_INFO("sampling using %s method", 
sampling_methods_str[sd_vid_gen_params->sample_method]); struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, noise, @@ -1981,9 +2157,9 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, {}, {}, 0.f, - guidance, + sd_vid_gen_params->guidance, 0.f, - sample_method, + sd_vid_gen_params->sample_method, sigmas, -1, SDCondition(NULL, NULL, NULL)); @@ -2003,13 +2179,13 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, return NULL; } - sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t)); + sd_image_t* result_images = (sd_image_t*)calloc(sd_vid_gen_params->video_frames, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } - for (size_t i = 0; i < video_frames; i++) { + for (size_t i = 0; i < sd_vid_gen_params->video_frames; i++) { auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i); result_images[i].width = width; @@ -2025,114 +2201,3 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, return result_images; } - -sd_image_t* edit(sd_ctx_t* sd_ctx, - sd_image_t* ref_images, - int ref_images_count, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - const char* input_id_images_path_c_str) { - LOG_DEBUG("edit %dx%d", width, height); - if (sd_ctx == NULL) { - return NULL; - } - if (ref_images_count <= 0) { - LOG_ERROR("ref images count should > 0"); - return NULL; - } - - struct ggml_init_params params; - params.mem_size = static_cast(30 * 1024 * 1024); // 10 MB - params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count; - params.mem_size *= batch_count; - params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u 
", params.mem_size); - - struct ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return NULL; - } - - if (seed < 0) { - srand((int)time(NULL)); - seed = rand(); - } - sd_ctx->sd->rng->manual_seed(seed); - - size_t t0 = ggml_time_ms(); - - std::vector ref_latents; - for (int i = 0; i < ref_images_count; i++) { - ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1); - sd_image_to_tensor(ref_images[i].data, img); - - ggml_tensor* latent = NULL; - if (sd_ctx->sd->use_tiny_autoencoder) { - latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) { - latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - latent = ggml_view_3d(work_ctx, - latent, - latent->ne[0], - latent->ne[1], - latent->ne[2] / 2, - latent->nb[1], - latent->nb[2], - 0); - } else { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img); - latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } - ref_latents.push_back(latent); - } - - size_t t1 = ggml_time_ms(); - LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - - ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height); - - sd_image_t* result_images = generate_image(sd_ctx, - work_ctx, - init_latent, - prompt_c_str, - negative_prompt_c_str, - clip_skip, - guidance, - eta, - width, - height, - sample_method, - sigmas, - seed, - batch_count, - control_cond, - control_strength, - style_ratio, - normalize_input, - "", - ref_latents, - NULL); - - size_t t2 = ggml_time_ms(); - - LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000); - - return result_images; -} \ No newline at end of file diff --git a/stable-diffusion.h b/stable-diffusion.h index ac50df1a..a6032592 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ 
-30,7 +30,8 @@ extern "C" { enum rng_type_t { STD_DEFAULT_RNG, - CUDA_RNG + CUDA_RNG, + RNG_TYPE_COUNT }; enum sample_method_t { @@ -46,7 +47,7 @@ enum sample_method_t { LCM, DDIM_TRAILING, TCD, - N_SAMPLE_METHODS + SAMPLE_METHOD_COUNT }; enum schedule_t { @@ -56,7 +57,7 @@ enum schedule_t { EXPONENTIAL, AYS, GITS, - N_SCHEDULES + SCHEDULE_COUNT }; // same as enum ggml_type @@ -103,8 +104,6 @@ enum sd_type_t { SD_TYPE_COUNT = 39, }; -SD_API const char* sd_type_name(enum sd_type_t type); - enum sd_log_level_t { SD_LOG_DEBUG, SD_LOG_INFO, @@ -112,13 +111,33 @@ enum sd_log_level_t { SD_LOG_ERROR }; -typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); -typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); - -SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); -SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API int32_t get_num_physical_cores(); -SD_API const char* sd_get_system_info(); +typedef struct { + const char* model_path; + const char* clip_l_path; + const char* clip_g_path; + const char* t5xxl_path; + const char* diffusion_model_path; + const char* vae_path; + const char* taesd_path; + const char* control_net_path; + const char* lora_model_dir; + const char* embedding_dir; + const char* stacked_id_embed_dir; + bool vae_decode_only; + bool vae_tiling; + bool free_params_immediately; + int n_threads; + enum sd_type_t wtype; + enum rng_type_t rng_type; + enum schedule_t schedule; + bool keep_clip_on_cpu; + bool keep_control_net_on_cpu; + bool keep_vae_on_cpu; + bool diffusion_flash_attn; + bool chroma_use_dit_mask; + bool chroma_use_t5_mask; + int chroma_t5_mask_pad; +} sd_ctx_params_t; typedef struct { uint32_t width; @@ -127,8 +146,6 @@ typedef struct { uint8_t* data; } sd_image_t; -typedef struct sd_ctx_t sd_ctx_t; - typedef struct { int* layers; size_t layer_count; @@ -145,106 +162,76 @@ typedef struct { sd_slg_params_t slg; } sd_guidance_params_t; -SD_API 
sd_ctx_t* new_sd_ctx(const char* model_path, - const char* clip_l_path, - const char* clip_g_path, - const char* t5xxl_path, - const char* diffusion_model_path, - const char* vae_path, - const char* taesd_path, - const char* control_net_path_c_str, - const char* lora_model_dir, - const char* embed_dir_c_str, - const char* stacked_id_embed_dir_c_str, - bool vae_decode_only, - bool vae_tiling, - bool free_params_immediately, - int n_threads, - enum sd_type_t wtype, - enum rng_type_t rng_type, - enum schedule_t s, - bool keep_clip_on_cpu, - bool keep_control_net_cpu, - bool keep_vae_on_cpu, - bool diffusion_flash_attn, - bool chroma_use_dit_mask, - bool chroma_use_t5_mask, - int chroma_t5_mask_pad); +typedef struct { + const char* prompt; + const char* negative_prompt; + int clip_skip; + sd_guidance_params_t guidance; + sd_image_t init_image; + sd_image_t* ref_images; + int ref_images_count; + sd_image_t mask_image; + int width; + int height; + enum sample_method_t sample_method; + int sample_steps; + float eta; + float strength; + int64_t seed; + int batch_count; + const sd_image_t* control_cond; + float control_strength; + float style_strength; + bool normalize_input; + const char* input_id_images_path; +} sd_img_gen_params_t; -SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); +typedef struct { + sd_image_t init_image; + int width; + int height; + sd_guidance_params_t guidance; + enum sample_method_t sample_method; + int sample_steps; + float strength; + int64_t seed; + int video_frames; + int motion_bucket_id; + int fps; + float augmentation_level; +} sd_vid_gen_params_t; + +typedef struct sd_ctx_t sd_ctx_t; + +typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); +typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); + +SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); +SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); +SD_API int32_t get_num_physical_cores(); +SD_API const 
char* sd_get_system_info(); -SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt, - const char* negative_prompt, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_strength, - bool normalize_input, - const char* input_id_images_path); +SD_API const char* sd_type_name(enum sd_type_t type); +SD_API enum sd_type_t str_to_sd_type(const char* str); +SD_API const char* sd_rng_type_name(enum rng_type_t rng_type); +SD_API enum rng_type_t str_to_rng_type(const char* str); +SD_API const char* sd_sample_method_name(enum sample_method_t sample_method); +SD_API enum sample_method_t str_to_sample_method(const char* str); +SD_API const char* sd_schedule_name(enum schedule_t schedule); +SD_API enum schedule_t str_to_schedule(const char* str); + +SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); +SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); -SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, - sd_image_t init_image, - sd_image_t mask_image, - const char* prompt, - const char* negative_prompt, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_strength, - bool normalize_input, - const char* input_id_images_path); +SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); +SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, - sd_image_t init_image, - int width, - int height, - int video_frames, - int motion_bucket_id, - int fps, - float augmentation_level, - sd_guidance_params_t guidance, - enum sample_method_t sample_method, - int sample_steps, - float strength, - 
int64_t seed); +SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); +SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params); -SD_API sd_image_t* edit(sd_ctx_t* sd_ctx, - sd_image_t* ref_images, - int ref_images_count, - const char* prompt, - const char* negative_prompt, - int clip_skip, - sd_guidance_params_t guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_strength, - bool normalize_input, - const char* input_id_images_path); +SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken typedef struct upscaler_ctx_t upscaler_ctx_t; @@ -254,7 +241,11 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* tensor_type_rules); +SD_API bool convert(const char* input_path, + const char* vae_path, + const char* output_path, + enum sd_type_t output_type, + const char* tensor_type_rules); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, diff --git a/util.cpp b/util.cpp index 631c1206..92bc9ef5 100644 --- a/util.cpp +++ b/util.cpp @@ -441,10 +441,6 @@ const char* sd_get_system_info() { return buffer; } -const char* sd_type_name(enum sd_type_t type) { - return ggml_type_name((ggml_type)type); -} - sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) { sd_image_f32_t converted_image; converted_image.width = image.width; diff --git a/util.h b/util.h index 14fa812e..d98c9a28 
100644 --- a/util.h +++ b/util.h @@ -7,6 +7,9 @@ #include "stable-diffusion.h" +#define SAFE_STR(s) ((s) ? (s) : "") +#define BOOL_STR(b) ((b) ? "true" : "false") + bool ends_with(const std::string& str, const std::string& ending); bool starts_with(const std::string& str, const std::string& start); bool contains(const std::string& str, const std::string& substr);