diff --git a/README.md b/README.md
index 4720dc29..8ce98137 100644
--- a/README.md
+++ b/README.md
@@ -282,14 +282,14 @@ usage: ./bin/sd [arguments]
 
 arguments:
   -h, --help                         show this help message and exit
-  -M, --mode [MODEL]                 run mode (txt2img or img2img or convert, default: txt2img)
+  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen
   -t, --threads N                    number of threads to use during computation (default: -1)
                                      If threads <= 0, then threads will be set to the number of CPU physical cores
   -m, --model [MODEL]                path to full model
   --diffusion-model                  path to the standalone diffusion model
   --clip_l                           path to the clip-l text encoder
   --clip_g                           path to the clip-g text encoder
-  --t5xxl                            path to the the t5xxl text encoder
+  --t5xxl                            path to the t5xxl text encoder
   --vae [VAE]                        path to vae
   --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
   --control-net [CONTROL_PATH]       path to control net model
@@ -301,16 +301,18 @@ arguments:
   --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
   --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
                                      If not specified, the default is the type of the weight file
+  --tensor-type-rules [EXPRESSION]   weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
   --lora-model-dir [DIR]             lora model directory
   -i, --init-img [IMAGE]             path to the input image, required by img2img
   --mask [MASK]                      path to the mask image, required by img2img with mask
   --control-image [IMAGE]            path to image condition, control net
-  -r, --ref_image [PATH]             reference image for Flux Kontext models (can be used multiple times)
+  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times)
   -o, --output OUTPUT                path to write result image to (default: ./output.png)
   -p, --prompt [PROMPT]              the prompt to render
   -n, --negative-prompt PROMPT       the negative prompt (default: "")
   --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
-  --guidance SCALE                   guidance scale for img2img (default: 3.5)
+  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
+  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)
   --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)
                                      0 means disabled, a value of 2.5 is nice for sd3.5 medium
   --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)
@@ -319,7 +321,7 @@ arguments:
   --skip-layer-end END               SLG disabling point: (default: 0.2)
                                      SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
   --strength STRENGTH                strength for noising/unnoising (default: 0.75)
-  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%)
+  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)
   --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
                                      1.0 corresponds to full destruction of information in init image
   -H, --height H                     image height, in pixel space (default: 512)
@@ -371,7 +373,7 @@ Using formats of different precisions will yield results of varying quality.
 
 
 ```
-./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
 ```
 
 <p align="center">
diff --git a/docs/kontext.md b/docs/kontext.md
index 51975255..69873503 100644
--- a/docs/kontext.md
+++ b/docs/kontext.md
@@ -27,7 +27,7 @@ You can download the preconverted gguf weights from [FLUX.1-Kontext-dev-GGUF](ht
 For example:
 
 ```
- .\bin\Release\sd.exe -M edit -r .\flux1-dev-q8_0.png --diffusion-model  ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
+ .\bin\Release\sd.exe -r .\flux1-dev-q8_0.png --diffusion-model  ..\models\flux1-kontext-dev-q8_0.gguf --vae ..\models\ae.sft --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "change 'flux.cpp' to 'kontext.cpp'" --cfg-scale 1.0 --sampling-method euler -v
 ```
 
 
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 234dad3a..5879967f 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1,13 +1,15 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
+#include <functional>
 #include <iostream>
+#include <map>
 #include <random>
+#include <regex>
 #include <string>
 #include <vector>
 
 // #include "preprocessing.hpp"
-#include "flux.hpp"
 #include "stable-diffusion.h"
 
 #define STB_IMAGE_IMPLEMENTATION
@@ -22,58 +24,26 @@
 #define STB_IMAGE_RESIZE_STATIC
 #include "stb_image_resize.h"
 
-const char* rng_type_to_str[] = {
-    "std_default",
-    "cuda",
-};
-
-// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
-const char* sample_method_str[] = {
-    "euler_a",
-    "euler",
-    "heun",
-    "dpm2",
-    "dpm++2s_a",
-    "dpm++2m",
-    "dpm++2mv2",
-    "ipndm",
-    "ipndm_v",
-    "lcm",
-    "ddim_trailing",
-    "tcd",
-};
-
-// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
-const char* schedule_str[] = {
-    "default",
-    "discrete",
-    "karras",
-    "exponential",
-    "ays",
-    "gits",
-};
+#define SAFE_STR(s) ((s) ? (s) : "")  // yields s, or an empty string literal when s is null
+#define BOOL_STR(b) ((b) ? "true" : "false")  // textual form of a boolean for printing
 
 const char* modes_str[] = {
-    "txt2img",
-    "img2img",
-    "img2vid",
-    "edit",
+    "img_gen",  // names are listed in the same order as enum SDMode; the -M handler scans indices 0..MODE_COUNT-1
+    "vid_gen",
     "convert",
 };
-#define SD_ALL_MODES_STR "txt2img, img2img, edit, convert"
+#define SD_ALL_MODES_STR "img_gen, vid_gen, convert"  // list printed in the "invalid mode" error message
 
 enum SDMode {
-    TXT2IMG,
-    IMG2IMG,
-    IMG2VID,
-    EDIT,
+    IMG_GEN,  // "img_gen" in modes_str; the default value of SDParams::mode
+    VID_GEN,  // "vid_gen" in modes_str
     CONVERT,
     MODE_COUNT
 };
 
 struct SDParams {
     int n_threads = -1;
-    SDMode mode   = TXT2IMG;
+    SDMode mode   = IMG_GEN;
     std::string model_path;
     std::string clip_l_path;
     std::string clip_g_path;
@@ -82,9 +52,9 @@ struct SDParams {
     std::string vae_path;
     std::string taesd_path;
     std::string esrgan_path;
-    std::string controlnet_path;
-    std::string embeddings_path;
-    std::string stacked_id_embeddings_path;
+    std::string control_net_path;
+    std::string embedding_dir;
+    std::string stacked_id_embed_dir;
     std::string input_id_images_path;
     sd_type_t wtype = SD_TYPE_COUNT;
     std::string tensor_type_rules;
@@ -154,9 +124,9 @@ void print_params(SDParams params) {
     printf("    vae_path:          %s\n", params.vae_path.c_str());
     printf("    taesd_path:        %s\n", params.taesd_path.c_str());
     printf("    esrgan_path:       %s\n", params.esrgan_path.c_str());
-    printf("    controlnet_path:   %s\n", params.controlnet_path.c_str());
-    printf("    embeddings_path:   %s\n", params.embeddings_path.c_str());
-    printf("    stacked_id_embeddings_path:   %s\n", params.stacked_id_embeddings_path.c_str());
+    printf("    control_net_path:   %s\n", params.control_net_path.c_str());
+    printf("    embedding_dir:   %s\n", params.embedding_dir.c_str());
+    printf("    stacked_id_embed_dir:   %s\n", params.stacked_id_embed_dir.c_str());
     printf("    input_id_images_path:   %s\n", params.input_id_images_path.c_str());
     printf("    style ratio:       %.2f\n", params.style_ratio);
     printf("    normalize input image :  %s\n", params.normalize_input ? "true" : "false");
@@ -184,11 +154,11 @@ void print_params(SDParams params) {
     printf("    clip_skip:         %d\n", params.clip_skip);
     printf("    width:             %d\n", params.width);
     printf("    height:            %d\n", params.height);
-    printf("    sample_method:     %s\n", sample_method_str[params.sample_method]);
-    printf("    schedule:          %s\n", schedule_str[params.schedule]);
+    printf("    sample_method:     %s\n", sd_sample_method_name(params.sample_method));
+    printf("    schedule:          %s\n", sd_schedule_name(params.schedule));
     printf("    sample_steps:      %d\n", params.sample_steps);
     printf("    strength(img2img): %.2f\n", params.strength);
-    printf("    rng:               %s\n", rng_type_to_str[params.rng_type]);
+    printf("    rng:               %s\n", sd_rng_type_name(params.rng_type));
     printf("    seed:              %ld\n", params.seed);
     printf("    batch_count:       %d\n", params.batch_count);
     printf("    vae_tiling:        %s\n", params.vae_tiling ? "true" : "false");
@@ -203,11 +173,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("\n");
     printf("arguments:\n");
     printf("  -h, --help                         show this help message and exit\n");
-    printf("  -M, --mode [MODE]                  run mode, one of:\n");
-    printf("                                     txt2img: generate an image from a text prompt (default)\n");
-    printf("                                     img2img: generate an image from a text prompt and an initial image (--init-img)\n");
-    printf("                                     edit:    modify an image (--ref-image) based on text instructions\n");
-    printf("                                     convert: convert a model file to gguf format, optionally with quantization\n");
+    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen\n");
     printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
     printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
     printf("  -m, --model [MODEL]                path to full model\n");
@@ -246,7 +212,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
-    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%%)\n");
+    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
     printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
     printf("                                     1.0 corresponds to full destruction of information in init image\n");
     printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
@@ -275,432 +241,344 @@ void print_usage(int argc, const char* argv[]) {
     printf("  -v, --verbose                      print extra info\n");
 }
 
-void parse_args(int argc, const char** argv, SDParams& params) {
+struct StringOption {  // command-line option whose following argument is stored as a string
+    std::string short_name;  // short flag such as "-m"; an empty name is never matched
+    std::string long_name;  // long flag such as "--model"; an empty name is never matched
+    std::string desc;  // description text; not read by parse_options
+    std::string* target;  // receives the argument that follows the flag
+};
+
+struct IntOption {  // command-line option whose following argument is parsed as an int
+    std::string short_name;  // short flag such as "-t"; an empty name is never matched
+    std::string long_name;  // long flag such as "--threads"; an empty name is never matched
+    std::string desc;  // description text; not read by parse_options
+    int* target;  // receives std::stoi() of the argument that follows the flag
+};
+
+struct FloatOption {  // command-line option whose following argument is parsed as a float
+    std::string short_name;  // short flag; an empty name is never matched
+    std::string long_name;  // long flag such as "--cfg-scale"; an empty name is never matched
+    std::string desc;  // description text; not read by parse_options
+    float* target;  // receives std::stof() of the argument that follows the flag
+};
+
+struct BoolOption {  // command-line switch that takes no value
+    std::string short_name;  // short flag such as "-v"; an empty name is never matched
+    std::string long_name;  // long flag such as "--vae-tiling"; an empty name is never matched
+    std::string desc;  // description text; not read by parse_options
+    bool keep_true;  // true: the switch sets *target to true; false: the switch sets *target to false
+    bool* target;  // setting written when the switch is present
+};
+
+struct ManualOption {  // command-line option handled by a caller-supplied callback
+    std::string short_name;  // short flag; an empty name is never matched
+    std::string long_name;  // long flag; an empty name is never matched
+    std::string desc;  // description text; not read by parse_options
+    std::function<int(int argc, const char** argv, int index)> cb;  // index is the position of the matched flag; a negative return marks the argument invalid, otherwise the result is added to the argv index
+};
+
+struct ArgOptions {  // option tables that parse_options checks for every argument, in the order declared here
+    std::vector<StringOption> string_options;  // options storing the next argument verbatim
+    std::vector<IntOption> int_options;  // options converting the next argument with std::stoi
+    std::vector<FloatOption> float_options;  // options converting the next argument with std::stof
+    std::vector<BoolOption> bool_options;  // value-less switches
+    std::vector<ManualOption> manual_options;  // options delegated to a callback
+};
+
+bool parse_options(int argc, const char** argv, ArgOptions& options) {  // matches argv[1..] against every table in options; returns false after printing an error when an option's value is missing or a callback rejects it
     bool invalid_arg = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
 
-        if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-M" || arg == "--mode") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            const char* mode_selected = argv[i];
-            int mode_found            = -1;
-            for (int d = 0; d < MODE_COUNT; d++) {
-                if (!strcmp(mode_selected, modes_str[d])) {
-                    mode_found = d;
-                }
-            }
-            if (mode_found == -1) {
-                fprintf(stderr,
-                        "error: invalid mode %s, must be one of [%s]\n",
-                        mode_selected, SD_ALL_MODES_STR);
-                exit(1);
-            }
-            params.mode = (SDMode)mode_found;
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.model_path = argv[i];
-        } else if (arg == "--clip_l") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.clip_l_path = argv[i];
-        } else if (arg == "--clip_g") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.clip_g_path = argv[i];
-        } else if (arg == "--t5xxl") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.t5xxl_path = argv[i];
-        } else if (arg == "--diffusion-model") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.diffusion_model_path = argv[i];
-        } else if (arg == "--vae") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.vae_path = argv[i];
-        } else if (arg == "--taesd") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.taesd_path = argv[i];
-        } else if (arg == "--control-net") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.controlnet_path = argv[i];
-        } else if (arg == "--upscale-model") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.esrgan_path = argv[i];
-        } else if (arg == "--embd-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.embeddings_path = argv[i];
-        } else if (arg == "--stacked-id-embd-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.stacked_id_embeddings_path = argv[i];
-        } else if (arg == "--input-id-images-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.input_id_images_path = argv[i];
-        } else if (arg == "--type") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            std::string type        = argv[i];
-            bool found              = false;
-            std::string valid_types = "";
-            for (size_t i = 0; i < SD_TYPE_COUNT; i++) {
-                auto trait = ggml_get_type_traits((ggml_type)i);
-                std::string name(trait->type_name);
-                if (name == "f32" || trait->to_float && trait->type_size) {
-                    if (i)
-                        valid_types += ", ";
-                    valid_types += name;
-                    if (type == name) {
-                        if (ggml_quantize_requires_imatrix((ggml_type)i)) {
-                            printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. A dummy imatrix will be used, expect poor quality.\n", trait->type_name);
-                        }
-                        params.wtype = (enum sd_type_t)i;
-                        found        = true;
-                        break;
-                    }
-                }
-            }
-            if (!found) {
-                fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n",
-                        type.c_str(),
-                        valid_types.c_str());
-                exit(1);
-            }
-        } else if (arg == "--tensor-type-rules") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.tensor_type_rules = argv[i];
-        } else if (arg == "--lora-model-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.lora_model_dir = argv[i];
-        } else if (arg == "-i" || arg == "--init-img") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.input_path = argv[i];
-        } else if (arg == "--mask") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.mask_path = argv[i];
-        } else if (arg == "--control-image") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.control_image_path = argv[i];
-        } else if (arg == "-o" || arg == "--output") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.output_path = argv[i];
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "--upscale-repeats") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.upscale_repeats = std::stoi(argv[i]);
-            if (params.upscale_repeats < 1) {
-                fprintf(stderr, "error: upscale multiplier must be at least 1\n");
-                exit(1);
-            }
-        } else if (arg == "-n" || arg == "--negative-prompt") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.negative_prompt = argv[i];
-        } else if (arg == "--cfg-scale") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.cfg_scale = std::stof(argv[i]);
-        } else if (arg == "--img-cfg-scale") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.img_cfg_scale = std::stof(argv[i]);
-        } else if (arg == "--guidance") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.guidance = std::stof(argv[i]);
-        } else if (arg == "--eta") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.eta = std::stof(argv[i]);
-        } else if (arg == "--strength") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.strength = std::stof(argv[i]);
-        } else if (arg == "--style-ratio") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.style_ratio = std::stof(argv[i]);
-        } else if (arg == "--control-strength") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.control_strength = std::stof(argv[i]);
-        } else if (arg == "-H" || arg == "--height") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.height = std::stoi(argv[i]);
-        } else if (arg == "-W" || arg == "--width") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.width = std::stoi(argv[i]);
-        } else if (arg == "--steps") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.sample_steps = std::stoi(argv[i]);
-        } else if (arg == "--clip-skip") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.clip_skip = std::stoi(argv[i]);
-        } else if (arg == "--vae-tiling") {
-            params.vae_tiling = true;
-        } else if (arg == "--control-net-cpu") {
-            params.control_net_cpu = true;
-        } else if (arg == "--normalize-input") {
-            params.normalize_input = true;
-        } else if (arg == "--clip-on-cpu") {
-            params.clip_on_cpu = true;  // will slow down get_learned_condiotion but necessary for low MEM GPUs
-        } else if (arg == "--vae-on-cpu") {
-            params.vae_on_cpu = true;  // will slow down latent decoding but necessary for low MEM GPUs
-        } else if (arg == "--diffusion-fa") {
-            params.diffusion_flash_attn = true;  // can reduce MEM significantly
-        } else if (arg == "--canny") {
-            params.canny_preprocess = true;
-        } else if (arg == "-b" || arg == "--batch-count") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.batch_count = std::stoi(argv[i]);
-        } else if (arg == "--rng") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            std::string rng_type_str = argv[i];
-            if (rng_type_str == "std_default") {
-                params.rng_type = STD_DEFAULT_RNG;
-            } else if (rng_type_str == "cuda") {
-                params.rng_type = CUDA_RNG;
-            } else {
-                invalid_arg = true;
-                break;
-            }
-        } else if (arg == "--schedule") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            const char* schedule_selected = argv[i];
-            int schedule_found            = -1;
-            for (int d = 0; d < N_SCHEDULES; d++) {
-                if (!strcmp(schedule_selected, schedule_str[d])) {
-                    schedule_found = d;
-                }
-            }
-            if (schedule_found == -1) {
-                invalid_arg = true;
-                break;
-            }
-            params.schedule = (schedule_t)schedule_found;
-        } else if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.seed = std::stoll(argv[i]);
-        } else if (arg == "--sampling-method") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            const char* sample_method_selected = argv[i];
-            int sample_method_found            = -1;
-            for (int m = 0; m < N_SAMPLE_METHODS; m++) {
-                if (!strcmp(sample_method_selected, sample_method_str[m])) {
-                    sample_method_found = m;
+        for (auto& option : options.string_options) {  // value options: the argument after the flag is the value
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                if (++i >= argc) {  // the flag was the last argument, so its value is missing
+                    invalid_arg = true;
+                    break;
                 }
+                *option.target = std::string(argv[i]);
             }
-            if (sample_method_found == -1) {
-                invalid_arg = true;
-                break;
-            }
-            params.sample_method = (sample_method_t)sample_method_found;
-        } else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv);
-            exit(0);
-        } else if (arg == "-v" || arg == "--verbose") {
-            params.verbose = true;
-        } else if (arg == "--color") {
-            params.color = true;
-        } else if (arg == "--slg-scale") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.slg_scale = std::stof(argv[i]);
-        } else if (arg == "--skip-layers") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            if (argv[i][0] != '[') {
-                invalid_arg = true;
-                break;
-            }
-            std::string layers_str = argv[i];
-            while (layers_str.back() != ']') {
+        }
+        if (invalid_arg) {  // the break above only left the inner loop; leave the argv loop too
+            break;
+        }
+
+        for (auto& option : options.int_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
                 if (++i >= argc) {
                     invalid_arg = true;
                     break;
                 }
-                layers_str += " " + std::string(argv[i]);
+                *option.target = std::stoi(argv[i]);  // NOTE(review): std::stoi throws on non-numeric or out-of-range text and nothing here catches it
             }
-            layers_str = layers_str.substr(1, layers_str.size() - 2);
-
-            std::regex regex("[, ]+");
-            std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
-            std::sregex_token_iterator end;
-            std::vector<std::string> tokens(iter, end);
-            std::vector<int> layers;
-            for (const auto& token : tokens) {
-                try {
-                    layers.push_back(std::stoi(token));
-                } catch (const std::invalid_argument& e) {
+        }
+        if (invalid_arg) {
+            break;
+        }
+
+        for (auto& option : options.float_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                if (++i >= argc) {
                     invalid_arg = true;
                     break;
                 }
+                *option.target = std::stof(argv[i]);  // NOTE(review): std::stof throws on non-numeric or out-of-range text and nothing here catches it
             }
-            params.skip_layers = layers;
+        }
+        if (invalid_arg) {
+            break;
+        }
 
-            if (invalid_arg) {
-                break;
-            }
-        } else if (arg == "--skip-layer-start") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.skip_layer_start = std::stof(argv[i]);
-        } else if (arg == "--skip-layer-end") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
+        for (auto& option : options.bool_options) {  // switches: no value is consumed
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                if (option.keep_true) {
+                    *option.target = true;
+                } else {
+                    *option.target = false;
+                }
             }
-            params.ref_image_paths.push_back(argv[i]);
-        } else if (arg == "--chroma-disable-dit-mask") {
-            params.chroma_use_dit_mask = false;
-        } else if (arg == "--chroma-enable-t5-mask") {
-            params.chroma_use_t5_mask = true;
-        } else if (arg == "--chroma-t5-mask-pad") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
+        }
+        if (invalid_arg) {
+            break;
+        }
+
+        for (auto& option : options.manual_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                int ret = option.cb(argc, argv, i);  // i is still the index of the flag itself
+                if (ret < 0) {
+                    invalid_arg = true;
+                    break;
+                }
+                i += ret;  // advance past whatever the callback reported as consumed
             }
-            params.chroma_t5_mask_pad = std::stoi(argv[i]);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            print_usage(argc, argv);
-            exit(1);
+        }
+        if (invalid_arg) {  // NOTE(review): an argument that matched no table reaches this point without any error being reported
+            break;
         }
     }
     if (invalid_arg) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        return false;
+    }
+    return true;
+}
+
+void parse_args(int argc, const char** argv, SDParams& params) {
+    ArgOptions options;
+    options.string_options = {
+        {"-m", "--model", "", &params.model_path},
+        {"", "--clip_l", "", &params.clip_l_path},
+        {"", "--clip_g", "", &params.clip_g_path},
+        {"", "--t5xxl", "", &params.t5xxl_path},
+        {"", "--diffusion-model", "", &params.diffusion_model_path},
+        {"", "--vae", "", &params.vae_path},
+        {"", "--taesd", "", &params.taesd_path},
+        {"", "--control-net", "", &params.control_net_path},
+        {"", "--embd-dir", "", &params.embedding_dir},
+        {"", "--stacked-id-embd-dir", "", &params.stacked_id_embed_dir},
+        {"", "--lora-model-dir", "", &params.lora_model_dir},
+        {"-i", "--init-img", "", &params.input_path},
+        {"", "--tensor-type-rules", "", &params.tensor_type_rules},
+        {"", "--input-id-images-dir", "", &params.input_id_images_path},
+        {"", "--mask", "", &params.mask_path},
+        {"", "--control-image", "", &params.control_image_path},
+        {"-o", "--output", "", &params.output_path},
+        {"-p", "--prompt", "", &params.prompt},
+        {"-n", "--negative-prompt", "", &params.negative_prompt},
+
+        {"", "--upscale-model", "", &params.esrgan_path},
+    };
+
+    options.int_options = {  // each entry: {short flag, long flag, "" (left empty here), pointer to the params field bound to the flag}
+        {"-t", "--threads", "", &params.n_threads},
+        {"", "--upscale-repeats", "", &params.upscale_repeats},
+        {"-H", "--height", "", &params.height},
+        {"-W", "--width", "", &params.width},
+        {"", "--steps", "", &params.sample_steps},
+        {"", "--clip-skip", "", &params.clip_skip},
+        {"-b", "--batch-count", "", &params.batch_count},
+        {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
+    };
+
+    options.float_options = {  // each entry: {short flag, long flag, "" (left empty here), pointer to the params field bound to the flag}
+        {"", "--cfg-scale", "", &params.cfg_scale},
+        {"", "--img-cfg-scale", "", &params.img_cfg_scale},
+        {"", "--guidance", "", &params.guidance},
+        {"", "--eta", "", &params.eta},
+        {"", "--strength", "", &params.strength},
+        {"", "--style-ratio", "", &params.style_ratio},
+        {"", "--control-strength", "", &params.control_strength},
+        {"", "--slg-scale", "", &params.slg_scale},
+        {"", "--skip-layer-start", "", &params.skip_layer_start},
+        {"", "--skip-layer-end", "", &params.skip_layer_end},
+
+    };
+
+    options.bool_options = {  // each entry: {short flag, long flag, "" (left empty here), bool value, pointer to the params field bound to the flag}
+        {"", "--vae-tiling", "", true, &params.vae_tiling},
+        {"", "--control-net-cpu", "", true, &params.control_net_cpu},
+        {"", "--normalize-input", "", true, &params.normalize_input},
+        {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
+        {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
+        {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--canny", "", true, &params.canny_preprocess},
+        {"-v", "--verbose", "", true, &params.verbose},
+        {"", "--color", "", true, &params.color},
+        {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},  // NOTE(review): only entry carrying false; presumably the switch clears the field - confirm in parse_options
+        {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
+    };
+
+    auto on_mode_arg = [&](int argc, const char** argv, int index) {  // registered below for -M/--mode; reads the value that follows the flag
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        const char* mode = argv[index];
+        if (mode != NULL) {
+            int mode_found = -1;
+            for (int i = 0; i < MODE_COUNT; i++) {
+                if (!strcmp(mode, modes_str[i])) {  // exact match against entry i of modes_str
+                    mode_found = i;
+                }
+            }
+            if (mode_found == -1) {
+                fprintf(stderr,
+                        "error: invalid mode %s, must be one of [%s]\n",
+                        mode, SD_ALL_MODES_STR);
+                exit(1);  // unlike the sibling handlers, an unknown value terminates here instead of returning -1
+            }
+            params.mode = (SDMode)mode_found;
+        }
+        return 1;
+    };
+
+    auto on_type_arg = [&](int argc, const char** argv, int index) {  // registered below for --type; returns 1 on success, -1 on error
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.wtype    = str_to_sd_type(arg);  // note: params.wtype is overwritten even when the value is rejected below
+        if (params.wtype == SD_TYPE_COUNT) {  // SD_TYPE_COUNT is treated as the "not recognised" result
+            fprintf(stderr, "error: invalid weight format %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_rng_arg = [&](int argc, const char** argv, int index) {  // registered below for --rng; returns 1 on success, -1 on error
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.rng_type = str_to_rng_type(arg);  // note: params.rng_type is overwritten even when the value is rejected below
+        if (params.rng_type == RNG_TYPE_COUNT) {  // RNG_TYPE_COUNT is treated as the "not recognised" result
+            fprintf(stderr, "error: invalid rng type %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_schedule_arg = [&](int argc, const char** argv, int index) {  // registered below for --schedule; returns 1 on success, -1 on error
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.schedule = str_to_schedule(arg);  // note: params.schedule is overwritten even when the value is rejected below
+        if (params.schedule == SCHEDULE_COUNT) {  // SCHEDULE_COUNT is treated as the "not recognised" result
+            fprintf(stderr, "error: invalid schedule %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_sample_method_arg = [&](int argc, const char** argv, int index) {  // registered below for --sampling-method; returns 1 on success, -1 on error
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        const char* arg      = argv[index];
+        params.sample_method = str_to_sample_method(arg);  // note: params.sample_method is overwritten even when the value is rejected below
+        if (params.sample_method == SAMPLE_METHOD_COUNT) {  // SAMPLE_METHOD_COUNT is treated as the "not recognised" result
+            fprintf(stderr, "error: invalid sample method %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_seed_arg = [&](int argc, const char** argv, int index) {  // registered below for -s/--seed; returns 1 on success, -1 when the value is missing
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        params.seed = std::stoll(argv[index]);  // NOTE(review): std::stoll throws on non-numeric or out-of-range text and nothing in this lambda catches it
+        return 1;
+    };
+
+    auto on_help_arg = [&](int argc, const char** argv, int index) {  // registered below for -h/--help; prints usage and terminates the process
+        print_usage(argc, argv);
+        exit(0);
+        return 0;  // placed after exit(0); keeps an int-returning shape like the other handlers
+    };
+
+    auto on_skip_layers_arg = [&](int argc, const char** argv, int index) {  // registered below for --skip-layers; parses "[a, b, ...]" into params.skip_layers, returns 1 on success and -1 on error
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        std::string layers_str = argv[index];
+        if (layers_str.size() < 2 || layers_str.front() != '[' || layers_str.back() != ']') {  // size check first: an empty value must not be indexed at size() - 1
+            return -1;
+        }
+        // Drop the surrounding brackets; size() >= 2 is guaranteed by the check above.
+        layers_str = layers_str.substr(1, layers_str.size() - 2);
+        // Split on any run of commas and/or spaces (submatch -1 selects the text between separators).
+        std::regex regex("[, ]+");
+        std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
+        std::sregex_token_iterator end;
+        std::vector<std::string> tokens(iter, end);
+        std::vector<int> layers;
+        for (const auto& token : tokens) {
+            try {
+                layers.push_back(std::stoi(token));
+            } catch (const std::exception&) {  // std::invalid_argument (not a number) or std::out_of_range (does not fit in int)
+                return -1;
+            }
+        }
+        params.skip_layers = layers;  // assigned only after every token parsed, so a rejected value leaves the field untouched
+        return 1;
+    };
+
+    auto on_ref_image_arg = [&](int argc, const char** argv, int index) {  // registered below for -r/--ref-image; returns 1 on success, -1 when the value is missing
+        if (++index >= argc) {  // the flag was the last argument, so there is no value to read
+            return -1;
+        }
+        params.ref_image_paths.push_back(argv[index]);  // appends, so each occurrence of the flag adds one more path
+        return 1;
+    };
+
+    options.manual_options = {  // each entry: {short flag, long flag, "" (left empty here), handler lambda defined above}
+        {"-M", "--mode", "", on_mode_arg},
+        {"", "--type", "", on_type_arg},
+        {"", "--rng", "", on_rng_arg},
+        {"-s", "--seed", "", on_seed_arg},
+        {"", "--sampling-method", "", on_sample_method_arg},
+        {"", "--schedule", "", on_schedule_arg},
+        {"", "--skip-layers", "", on_skip_layers_arg},
+        {"-r", "--ref-image", "", on_ref_image_arg},
+        {"-h", "--help", "", on_help_arg},
+    };
+
+    if (!parse_options(argc, argv, options)) {
         print_usage(argc, argv);
         exit(1);
     }
+
     if (params.n_threads <= 0) {
         params.n_threads = get_num_physical_cores();
     }
 
-    if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
+    if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: prompt\n");
         print_usage(argc, argv);
         exit(1);
@@ -712,18 +590,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
-    if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
-        fprintf(stderr, "error: when using the img2img/img2vid mode, the following arguments are required: init-img\n");
-        print_usage(argc, argv);
-        exit(1);
-    }
-
-    if (params.mode == EDIT && params.ref_image_paths.size() == 0) {
-        fprintf(stderr, "error: when using the edit mode, the following arguments are required: ref-image\n");
-        print_usage(argc, argv);
-        exit(1);
-    }
-
     if (params.output_path.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: output_path\n");
         print_usage(argc, argv);
@@ -754,6 +620,11 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n");
     }
 
+    if (params.upscale_repeats < 1) {
+        fprintf(stderr, "error: upscale multiplier must be at least 1\n");
+        exit(1);
+    }
+
     if (params.seed < 0) {
         srand((int)time(NULL));
         params.seed = rand();
@@ -804,8 +675,8 @@ std::string get_image_params(SDParams params, int64_t seed) {
     parameter_string += "Seed: " + std::to_string(seed) + ", ";
     parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
     parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
-    parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
-    parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
+    parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", ";
+    parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_method));
     if (params.schedule == KARRAS) {
         parameter_string += " karras";
     }
@@ -899,7 +770,7 @@ int main(int argc, const char* argv[]) {
         }
     }
 
-    if (params.mode == IMG2VID) {
+    if (params.mode == VID_GEN) {
         fprintf(stderr, "SVD support is broken, do not use it!!!\n");
         return 1;
     }
@@ -910,7 +781,7 @@ int main(int argc, const char* argv[]) {
     uint8_t* mask_image_buffer    = NULL;
     std::vector<sd_image_t> ref_images;
 
-    if (params.mode == IMG2IMG || params.mode == IMG2VID) {
+    if (params.input_path.size() > 0) {
         vae_decode_only = false;
 
         int c              = 0;
@@ -960,7 +831,7 @@ int main(int argc, const char* argv[]) {
             free(input_image_buffer);
             input_image_buffer = resized_image_buffer;
         }
-    } else if (params.mode == EDIT) {
+    } else if (params.ref_image_paths.size() > 0) {
         vae_decode_only = false;
         for (auto& path : params.ref_image_paths) {
             int c                 = 0;
@@ -993,39 +864,48 @@ int main(int argc, const char* argv[]) {
         }
     }
 
-    sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
-                                  params.clip_l_path.c_str(),
-                                  params.clip_g_path.c_str(),
-                                  params.t5xxl_path.c_str(),
-                                  params.diffusion_model_path.c_str(),
-                                  params.vae_path.c_str(),
-                                  params.taesd_path.c_str(),
-                                  params.controlnet_path.c_str(),
-                                  params.lora_model_dir.c_str(),
-                                  params.embeddings_path.c_str(),
-                                  params.stacked_id_embeddings_path.c_str(),
-                                  vae_decode_only,
-                                  params.vae_tiling,
-                                  true,
-                                  params.n_threads,
-                                  params.wtype,
-                                  params.rng_type,
-                                  params.schedule,
-                                  params.clip_on_cpu,
-                                  params.control_net_cpu,
-                                  params.vae_on_cpu,
-                                  params.diffusion_flash_attn,
-                                  params.chroma_use_dit_mask,
-                                  params.chroma_use_t5_mask,
-                                  params.chroma_t5_mask_pad);
+    sd_ctx_params_t sd_ctx_params = {
+        params.model_path.c_str(),
+        params.clip_l_path.c_str(),
+        params.clip_g_path.c_str(),
+        params.t5xxl_path.c_str(),
+        params.diffusion_model_path.c_str(),
+        params.vae_path.c_str(),
+        params.taesd_path.c_str(),
+        params.control_net_path.c_str(),
+        params.lora_model_dir.c_str(),
+        params.embedding_dir.c_str(),
+        params.stacked_id_embed_dir.c_str(),
+        vae_decode_only,
+        params.vae_tiling,
+        true,
+        params.n_threads,
+        params.wtype,
+        params.rng_type,
+        params.schedule,
+        params.clip_on_cpu,
+        params.control_net_cpu,
+        params.vae_on_cpu,
+        params.diffusion_flash_attn,
+        params.chroma_use_dit_mask,
+        params.chroma_use_t5_mask,
+        params.chroma_t5_mask_pad,
+    };
+
+    sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
 
     if (sd_ctx == NULL) {
         printf("new_sd_ctx_t failed\n");
         return 1;
     }
 
+    sd_image_t input_image = {(uint32_t)params.width,
+                              (uint32_t)params.height,
+                              3,
+                              input_image_buffer};
+
     sd_image_t* control_image = NULL;
-    if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
+    if (params.control_net_path.size() > 0 && params.control_image_path.size() > 0) {
         int c                = 0;
         control_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
         if (control_image_buffer == NULL) {
@@ -1061,107 +941,52 @@ int main(int argc, const char* argv[]) {
                              mask_image_buffer};
 
     sd_image_t* results;
-    if (params.mode == TXT2IMG) {
-        results = txt2img(sd_ctx,
-                          params.prompt.c_str(),
-                          params.negative_prompt.c_str(),
-                          params.clip_skip,
-                          guidance_params,
-                          params.eta,
-                          params.width,
-                          params.height,
-                          params.sample_method,
-                          params.sample_steps,
-                          params.seed,
-                          params.batch_count,
-                          control_image,
-                          params.control_strength,
-                          params.style_ratio,
-                          params.normalize_input,
-                          params.input_id_images_path.c_str());
-    } else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
-        sd_image_t input_image = {(uint32_t)params.width,
-                                  (uint32_t)params.height,
-                                  3,
-                                  input_image_buffer};
-
-        if (params.mode == IMG2VID) {
-            results = img2vid(sd_ctx,
-                              input_image,
-                              params.width,
-                              params.height,
-                              params.video_frames,
-                              params.motion_bucket_id,
-                              params.fps,
-                              params.augmentation_level,
-                              guidance_params,
-                              params.sample_method,
-                              params.sample_steps,
-                              params.strength,
-                              params.seed);
-            if (results == NULL) {
-                printf("generate failed\n");
-                free_sd_ctx(sd_ctx);
-                return 1;
-            }
-            size_t last            = params.output_path.find_last_of(".");
-            std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
-            for (int i = 0; i < params.video_frames; i++) {
-                if (results[i].data == NULL) {
-                    continue;
-                }
-                std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
-                stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
-                               results[i].data, 0, get_image_params(params, params.seed + i).c_str());
-                printf("save result image to '%s'\n", final_image_path.c_str());
-                free(results[i].data);
-                results[i].data = NULL;
-            }
-            free(results);
-            free_sd_ctx(sd_ctx);
-            return 0;
-        } else {
-            results = img2img(sd_ctx,
-                              input_image,
-                              mask_image,
-                              params.prompt.c_str(),
-                              params.negative_prompt.c_str(),
-                              params.clip_skip,
-                              guidance_params,
-                              params.eta,
-                              params.width,
-                              params.height,
-                              params.sample_method,
-                              params.sample_steps,
-                              params.strength,
-                              params.seed,
-                              params.batch_count,
-                              control_image,
-                              params.control_strength,
-                              params.style_ratio,
-                              params.normalize_input,
-                              params.input_id_images_path.c_str());
-        }
-    } else {  // EDIT
-        results = edit(sd_ctx,
-                       ref_images.data(),
-                       ref_images.size(),
-                       params.prompt.c_str(),
-                       params.negative_prompt.c_str(),
-                       params.clip_skip,
-                       guidance_params,
-                       params.eta,
-                       params.width,
-                       params.height,
-                       params.sample_method,
-                       params.sample_steps,
-                       params.seed,
-                       params.batch_count,
-                       control_image,
-                       params.control_strength,
-                       params.style_ratio,
-                       params.normalize_input,
-                       params.input_id_images_path.c_str());
+    int expected_num_results = 1;
+    if (params.mode == IMG_GEN) {
+        sd_img_gen_params_t img_gen_params = {
+            params.prompt.c_str(),
+            params.negative_prompt.c_str(),
+            params.clip_skip,
+            guidance_params,
+            input_image,
+            ref_images.data(),
+            ref_images.size(),
+            mask_image,
+            params.width,
+            params.height,
+            params.sample_method,
+            params.sample_steps,
+            params.eta,
+            params.strength,
+            params.seed,
+            params.batch_count,
+            control_image,
+            params.control_strength,
+            params.style_ratio,
+            params.normalize_input,
+            params.input_id_images_path.c_str(),
+        };
+
+        results              = generate_image(sd_ctx, &img_gen_params);
+        expected_num_results = params.batch_count;
+    } else if (params.mode == VID_GEN) {
+        sd_vid_gen_params_t vid_gen_params = {
+            input_image,
+            params.width,
+            params.height,
+            guidance_params,
+            params.sample_method,
+            params.sample_steps,
+            params.strength,
+            params.seed,
+            params.video_frames,
+            params.motion_bucket_id,
+            params.fps,
+            params.augmentation_level,
+        };
+
+        results              = generate_video(sd_ctx, &vid_gen_params);
+        expected_num_results = params.video_frames;
     }
 
     if (results == NULL) {
@@ -1218,7 +1043,7 @@ int main(int argc, const char* argv[]) {
         dummy_name += ext;
         ext = ".png";
     }
-    for (int i = 0; i < params.batch_count; i++) {
+    for (int i = 0; i < expected_num_results; i++) {
         if (results[i].data == NULL) {
             continue;
         }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index c6b873fa..43b027ee 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -118,22 +118,6 @@ class StableDiffusionGGML {
 
     StableDiffusionGGML() = default;
 
-    StableDiffusionGGML(int n_threads,
-                        bool vae_decode_only,
-                        bool free_params_immediately,
-                        std::string lora_model_dir,
-                        rng_type_t rng_type)
-        : n_threads(n_threads),
-          vae_decode_only(vae_decode_only),
-          free_params_immediately(free_params_immediately),
-          lora_model_dir(lora_model_dir) {
-        if (rng_type == STD_DEFAULT_RNG) {
-            rng = std::make_shared<STDDefaultRNG>();
-        } else if (rng_type == CUDA_RNG) {
-            rng = std::make_shared<PhiloxRNG>();
-        }
-    }
-
     ~StableDiffusionGGML() {
         if (clip_backend != backend) {
             ggml_backend_free(clip_backend);
@@ -147,27 +131,7 @@ class StableDiffusionGGML {
         ggml_backend_free(backend);
     }
 
-    bool load_from_file(const std::string& model_path,
-                        const std::string& clip_l_path,
-                        const std::string& clip_g_path,
-                        const std::string& t5xxl_path,
-                        const std::string& diffusion_model_path,
-                        const std::string& vae_path,
-                        const std::string control_net_path,
-                        const std::string embeddings_path,
-                        const std::string id_embeddings_path,
-                        const std::string& taesd_path,
-                        bool vae_tiling_,
-                        ggml_type wtype,
-                        schedule_t schedule,
-                        bool clip_on_cpu,
-                        bool control_net_cpu,
-                        bool vae_on_cpu,
-                        bool diffusion_flash_attn,
-                        bool chroma_use_dit_mask,
-                        bool chroma_use_t5_mask,
-                        int chroma_t5_mask_pad) {
-        use_tiny_autoencoder = taesd_path.size() > 0;
+    void init_backend() {
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
@@ -203,62 +167,80 @@ class StableDiffusionGGML {
             LOG_DEBUG("Using CPU backend");
             backend = ggml_backend_cpu_init();
         }
+    }
 
-        ModelLoader model_loader;
+    bool init(const sd_ctx_params_t* sd_ctx_params) {
+        n_threads               = sd_ctx_params->n_threads;
+        vae_decode_only         = sd_ctx_params->vae_decode_only;
+        free_params_immediately = sd_ctx_params->free_params_immediately;
+        lora_model_dir          = SAFE_STR(sd_ctx_params->lora_model_dir);
+        use_tiny_autoencoder    = taesd_path.size() > 0;
+        vae_tiling              = sd_ctx_params->vae_tiling;
 
-        vae_tiling = vae_tiling_;
+        if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
+            rng = std::make_shared<STDDefaultRNG>();
+        } else if (sd_ctx_params->rng_type == CUDA_RNG) {
+            rng = std::make_shared<PhiloxRNG>();
+        }
+
+        init_backend();
+
+        ModelLoader model_loader;
 
-        if (model_path.size() > 0) {
-            LOG_INFO("loading model from '%s'", model_path.c_str());
-            if (!model_loader.init_from_file(model_path)) {
-                LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
+            LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
+            if (!model_loader.init_from_file(sd_ctx_params->model_path)) {
+                LOG_ERROR("init model loader from file failed: '%s'", sd_ctx_params->model_path);
             }
         }
 
-        if (diffusion_model_path.size() > 0) {
-            LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str());
-            if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) {
-                LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) > 0) {
+            LOG_INFO("loading diffusion model from '%s'", sd_ctx_params->diffusion_model_path);
+            if (!model_loader.init_from_file(sd_ctx_params->diffusion_model_path, "model.diffusion_model.")) {
+                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->diffusion_model_path);
             }
         }
 
         bool is_unet = model_loader.model_is_unet();
 
-        if (clip_l_path.size() > 0) {
-            LOG_INFO("loading clip_l from '%s'", clip_l_path.c_str());
-            if (!model_loader.init_from_file(clip_l_path, is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.")) {
-                LOG_WARN("loading clip_l from '%s' failed", clip_l_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) {
+            LOG_INFO("loading clip_l from '%s'", sd_ctx_params->clip_l_path);
+            std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.";
+            if (!model_loader.init_from_file(sd_ctx_params->clip_l_path, prefix)) {
+                LOG_WARN("loading clip_l from '%s' failed", sd_ctx_params->clip_l_path);
             }
         }
 
-        if (clip_g_path.size() > 0) {
-            LOG_INFO("loading clip_g from '%s'", clip_g_path.c_str());
-            if (!model_loader.init_from_file(clip_g_path, is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.")) {
-                LOG_WARN("loading clip_g from '%s' failed", clip_g_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->clip_g_path)) > 0) {
+            LOG_INFO("loading clip_g from '%s'", sd_ctx_params->clip_g_path);
+            std::string prefix = is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.";
+            if (!model_loader.init_from_file(sd_ctx_params->clip_g_path, prefix)) {
+                LOG_WARN("loading clip_g from '%s' failed", sd_ctx_params->clip_g_path);
             }
         }
 
-        if (t5xxl_path.size() > 0) {
-            LOG_INFO("loading t5xxl from '%s'", t5xxl_path.c_str());
-            if (!model_loader.init_from_file(t5xxl_path, "text_encoders.t5xxl.transformer.")) {
-                LOG_WARN("loading t5xxl from '%s' failed", t5xxl_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->t5xxl_path)) > 0) {
+            LOG_INFO("loading t5xxl from '%s'", sd_ctx_params->t5xxl_path);
+            if (!model_loader.init_from_file(sd_ctx_params->t5xxl_path, "text_encoders.t5xxl.transformer.")) {
+                LOG_WARN("loading t5xxl from '%s' failed", sd_ctx_params->t5xxl_path);
             }
         }
 
-        if (vae_path.size() > 0) {
-            LOG_INFO("loading vae from '%s'", vae_path.c_str());
-            if (!model_loader.init_from_file(vae_path, "vae.")) {
-                LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) {
+            LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path);
+            if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) {
+                LOG_WARN("loading vae from '%s' failed", sd_ctx_params->vae_path);
             }
         }
 
         version = model_loader.get_sd_version();
         if (version == VERSION_COUNT) {
-            LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str());
+            LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
             return false;
         }
 
         LOG_INFO("Version: %s ", model_version_to_str[version]);
+        ggml_type wtype = (ggml_type)sd_ctx_params->wtype;
         if (wtype == GGML_TYPE_COUNT) {
             model_wtype = model_loader.get_sd_wtype();
             if (model_wtype == GGML_TYPE_COUNT) {
@@ -300,7 +282,7 @@ class StableDiffusionGGML {
 
         if (sd_version_is_sdxl(version)) {
             scale_factor = 0.13025f;
-            if (vae_path.size() == 0 && taesd_path.size() == 0) {
+            if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && strlen(SAFE_STR(sd_ctx_params->taesd_path)) == 0) {
                 LOG_WARN(
                     "!!!It looks like you are using SDXL model. "
                     "If you find that the generated images are completely black, "
@@ -314,6 +296,8 @@ class StableDiffusionGGML {
             // TODO: shift_factor
         }
 
+        bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
+
         if (version == VERSION_SVD) {
             clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types);
             clip_vision->alloc_params_buffer();
@@ -341,11 +325,11 @@ class StableDiffusionGGML {
                 LOG_INFO("CLIP: Using CPU backend");
                 clip_backend = ggml_backend_cpu_init();
             }
-            if (diffusion_flash_attn) {
+            if (sd_ctx_params->diffusion_flash_attn) {
                 LOG_INFO("Using flash attention in the diffusion model");
             }
             if (sd_version_is_sd3(version)) {
-                if (diffusion_flash_attn) {
+                if (sd_ctx_params->diffusion_flash_attn) {
                     LOG_WARN("flash attention in this diffusion model is currently unsupported!");
                 }
                 cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
@@ -359,18 +343,36 @@ class StableDiffusionGGML {
                     }
                 }
                 if (is_chroma) {
-                    cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad);
+                    cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
+                                                                            model_loader.tensor_storages_types,
+                                                                            -1,
+                                                                            sd_ctx_params->chroma_use_t5_mask,
+                                                                            sd_ctx_params->chroma_t5_mask_pad);
                 } else {
                     cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
                 }
-                diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask);
+                diffusion_model = std::make_shared<FluxModel>(backend,
+                                                              model_loader.tensor_storages_types,
+                                                              version,
+                                                              sd_ctx_params->diffusion_flash_attn,
+                                                              sd_ctx_params->chroma_use_dit_mask);
             } else {
-                if (id_embeddings_path.find("v2") != std::string::npos) {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
+                if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                           model_loader.tensor_storages_types,
+                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
+                                                                                           version,
+                                                                                           PM_VERSION_2);
                 } else {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                           model_loader.tensor_storages_types,
+                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
+                                                                                           version);
                 }
-                diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
+                diffusion_model = std::make_shared<UNetModel>(backend,
+                                                              model_loader.tensor_storages_types,
+                                                              version,
+                                                              sd_ctx_params->diffusion_flash_attn);
             }
 
             cond_stage_model->alloc_params_buffer();
@@ -380,23 +382,32 @@ class StableDiffusionGGML {
             diffusion_model->get_param_tensors(tensors);
 
             if (!use_tiny_autoencoder) {
-                if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
+                if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_INFO("VAE Autoencoder: Using CPU backend");
                     vae_backend = ggml_backend_cpu_init();
                 } else {
                     vae_backend = backend;
                 }
-                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
+                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
+                                                                    model_loader.tensor_storages_types,
+                                                                    "first_stage_model",
+                                                                    vae_decode_only,
+                                                                    false,
+                                                                    version);
                 first_stage_model->alloc_params_buffer();
                 first_stage_model->get_param_tensors(tensors, "first_stage_model");
             } else {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version);
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend,
+                                                                    model_loader.tensor_storages_types,
+                                                                    "decoder.layers",
+                                                                    vae_decode_only,
+                                                                    version);
             }
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
-            if (control_net_path.size() > 0) {
+            if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
                 ggml_backend_t controlnet_backend = NULL;
-                if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
+                if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_DEBUG("ControlNet: Using CPU backend");
                     controlnet_backend = ggml_backend_cpu_init();
                 } else {
@@ -405,21 +416,21 @@ class StableDiffusionGGML {
                 control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
             }
 
-            if (id_embeddings_path.find("v2") != std::string::npos) {
+            if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
                 pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2);
                 LOG_INFO("using PhotoMaker Version 2");
             } else {
                 pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version);
             }
-            if (id_embeddings_path.size() > 0) {
-                pmid_lora = std::make_shared<LoraModel>(backend, id_embeddings_path, "");
+            if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) {
+                pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->stacked_id_embed_dir, "");
                 if (!pmid_lora->load_from_file(true)) {
-                    LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str());
+                    LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->stacked_id_embed_dir);
                     return false;
                 }
-                LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", id_embeddings_path.c_str());
-                if (!model_loader.init_from_file(id_embeddings_path, "pmid.")) {
-                    LOG_WARN("loading stacked ID embedding from '%s' failed", id_embeddings_path.c_str());
+                LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->stacked_id_embed_dir);
+                if (!model_loader.init_from_file(sd_ctx_params->stacked_id_embed_dir, "pmid.")) {
+                    LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->stacked_id_embed_dir);
                 } else {
                     stacked_id = true;
                 }
@@ -491,7 +502,7 @@ class StableDiffusionGGML {
             }
             size_t control_net_params_mem_size = 0;
             if (control_net) {
-                if (!control_net->load_from_file(control_net_path)) {
+                if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
                     return false;
                 }
                 control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -547,7 +558,7 @@ class StableDiffusionGGML {
         }
 
         int64_t t1 = ggml_time_ms();
-        LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
+        LOG_INFO("loading model from '%s' completed, taking %.2fs", SAFE_STR(sd_ctx_params->model_path), (t1 - t0) * 1.0f / 1000);
 
         // check is_using_v_parameterization_for_sd2
 
@@ -592,8 +603,8 @@ class StableDiffusionGGML {
             LOG_INFO("running in eps-prediction mode");
         }
 
-        if (schedule != DEFAULT) {
-            switch (schedule) {
+        if (sd_ctx_params->schedule != DEFAULT) {
+            switch (sd_ctx_params->schedule) {
                 case DISCRETE:
                     LOG_INFO("running with discrete schedule");
                     denoiser->schedule = std::make_shared<DiscreteSchedule>();
@@ -620,7 +631,7 @@ class StableDiffusionGGML {
                     // Don't touch anything.
                     break;
                 default:
-                    LOG_ERROR("Unknown schedule %i", schedule);
+                    LOG_ERROR("Unknown schedule %i", sd_ctx_params->schedule);
                     abort();
             }
         }
@@ -1185,80 +1196,301 @@ class StableDiffusionGGML {
 
 /*================================================= SD API ==================================================*/
 
+#define NONE_STR "NONE"
+
+const char* sd_type_name(enum sd_type_t type) {  // thin wrapper: the name comes from ggml_type_name()
+    return ggml_type_name(static_cast<ggml_type>(type));
+}
+
+enum sd_type_t str_to_sd_type(const char* str) {  // SD_TYPE_COUNT means "NULL or unknown type name"
+    for (int i = 0; str != NULL && i < SD_TYPE_COUNT; i++) {  // a NULL str never reaches strcmp
+        const auto* traits = ggml_get_type_traits((ggml_type)i);  // type_name is NULL-checked below
+        if (traits->type_name != NULL && strcmp(str, traits->type_name) == 0) {
+            return (enum sd_type_t)i;
+        }
+    }
+    return SD_TYPE_COUNT;
+}
+
+const char* rng_type_to_str[] = {  // indexed by enum rng_type_t value; needs RNG_TYPE_COUNT entries
+    "std_default",
+    "cuda",
+};
+
+// Returns the table name for `rng_type`, or NONE_STR for values outside [0, RNG_TYPE_COUNT).
+const char* sd_rng_type_name(enum rng_type_t rng_type) {
+    // Checked as int so a negative value is rejected even where the enum type is signed.
+    const int idx = (int)rng_type;
+    return (idx >= 0 && idx < RNG_TYPE_COUNT) ? rng_type_to_str[idx] : NONE_STR;
+}
+
+enum rng_type_t str_to_rng_type(const char* str) {  // RNG_TYPE_COUNT means "NULL or unknown name"
+    for (int i = 0; str != NULL && i < RNG_TYPE_COUNT; i++) {  // a NULL str never reaches strcmp
+        if (strcmp(str, rng_type_to_str[i]) == 0) {
+            return (enum rng_type_t)i;
+        }
+    }
+    return RNG_TYPE_COUNT;
+}
+
+const char* sample_method_to_str[] = {  // indexed by enum sample_method_t value; needs SAMPLE_METHOD_COUNT entries
+    "euler_a",
+    "euler",
+    "heun",
+    "dpm2",
+    "dpm++2s_a",
+    "dpm++2m",
+    "dpm++2mv2",
+    "ipndm",
+    "ipndm_v",
+    "lcm",
+    "ddim_trailing",
+    "tcd",
+};
+
+// Returns the table name for `sample_method`, or NONE_STR for values outside [0, SAMPLE_METHOD_COUNT).
+const char* sd_sample_method_name(enum sample_method_t sample_method) {
+    // Checked as int so a negative value is rejected even where the enum type is signed.
+    const int idx = (int)sample_method;
+    return (idx >= 0 && idx < SAMPLE_METHOD_COUNT) ? sample_method_to_str[idx] : NONE_STR;
+}
+
+enum sample_method_t str_to_sample_method(const char* str) {  // SAMPLE_METHOD_COUNT means "NULL or unknown name"
+    for (int i = 0; str != NULL && i < SAMPLE_METHOD_COUNT; i++) {  // a NULL str never reaches strcmp
+        if (strcmp(str, sample_method_to_str[i]) == 0) {
+            return (enum sample_method_t)i;
+        }
+    }
+    return SAMPLE_METHOD_COUNT;
+}
+
+const char* schedule_to_str[] = {  // indexed by enum schedule_t value; needs SCHEDULE_COUNT entries
+    "default",
+    "discrete",
+    "karras",
+    "exponential",
+    "ays",
+    "gits",
+};
+
+// Returns the table name for `schedule`, or NONE_STR for values outside [0, SCHEDULE_COUNT).
+const char* sd_schedule_name(enum schedule_t schedule) {
+    // Checked as int so a negative value is rejected even where the enum type is signed.
+    const int idx = (int)schedule;
+    return (idx >= 0 && idx < SCHEDULE_COUNT) ? schedule_to_str[idx] : NONE_STR;
+}
+
+enum schedule_t str_to_schedule(const char* str) {  // SCHEDULE_COUNT means "NULL or unknown name"
+    for (int i = 0; str != NULL && i < SCHEDULE_COUNT; i++) {  // a NULL str never reaches strcmp
+        if (strcmp(str, schedule_to_str[i]) == 0) {
+            return (enum schedule_t)i;
+        }
+    }
+    return SCHEDULE_COUNT;
+}
+
+void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
+    // Start from all-zero, then spell out each default (including the flags that stay false).
+    memset((void*)sd_ctx_params, 0, sizeof(*sd_ctx_params));
+    sd_ctx_params_t& p        = *sd_ctx_params;
+    p.vae_decode_only         = true;
+    p.vae_tiling              = false;
+    p.free_params_immediately = true;
+    p.n_threads               = get_num_physical_cores();
+    p.wtype                   = SD_TYPE_COUNT;
+    p.rng_type                = CUDA_RNG;
+    p.schedule                = DEFAULT;
+    p.keep_clip_on_cpu = p.keep_control_net_on_cpu = p.keep_vae_on_cpu = false;
+    p.diffusion_flash_attn    = false;
+    p.chroma_use_dit_mask     = true;
+    p.chroma_use_t5_mask      = false;
+    p.chroma_t5_mask_pad      = 1;
+}
+
+char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {  // result is malloc'd; NULL if malloc fails
+    const size_t capacity    = 4096;  // snprintf cuts the text off at capacity - 1 characters
+    const sd_ctx_params_t& p = *sd_ctx_params;
+    char* text               = (char*)malloc(capacity);
+    if (text == NULL)
+        return NULL;
+    text[0] = '\0';
+    snprintf(text, capacity,
+             "model_path: %s\n"
+             "clip_l_path: %s\n"
+             "clip_g_path: %s\n"
+             "t5xxl_path: %s\n"
+             "diffusion_model_path: %s\n"
+             "vae_path: %s\n"
+             "taesd_path: %s\n"
+             "control_net_path: %s\n"
+             "lora_model_dir: %s\n"
+             "embedding_dir: %s\n"
+             "stacked_id_embed_dir: %s\n"
+             "vae_decode_only: %s\n"
+             "vae_tiling: %s\n"
+             "free_params_immediately: %s\n"
+             "n_threads: %d\n"
+             "wtype: %s\n"
+             "rng_type: %s\n"
+             "schedule: %s\n"
+             "keep_clip_on_cpu: %s\n"
+             "keep_control_net_on_cpu: %s\n"
+             "keep_vae_on_cpu: %s\n"
+             "diffusion_flash_attn: %s\n"
+             "chroma_use_dit_mask: %s\n"
+             "chroma_use_t5_mask: %s\n"
+             "chroma_t5_mask_pad: %d\n",
+             SAFE_STR(p.model_path),
+             SAFE_STR(p.clip_l_path),
+             SAFE_STR(p.clip_g_path),
+             SAFE_STR(p.t5xxl_path),
+             SAFE_STR(p.diffusion_model_path),
+             SAFE_STR(p.vae_path),
+             SAFE_STR(p.taesd_path),
+             SAFE_STR(p.control_net_path),
+             SAFE_STR(p.lora_model_dir),
+             SAFE_STR(p.embedding_dir),
+             SAFE_STR(p.stacked_id_embed_dir),
+             BOOL_STR(p.vae_decode_only),
+             BOOL_STR(p.vae_tiling),
+             BOOL_STR(p.free_params_immediately),
+             p.n_threads,
+             sd_type_name(p.wtype),
+             sd_rng_type_name(p.rng_type),
+             sd_schedule_name(p.schedule),
+             BOOL_STR(p.keep_clip_on_cpu),
+             BOOL_STR(p.keep_control_net_on_cpu),
+             BOOL_STR(p.keep_vae_on_cpu),
+             BOOL_STR(p.diffusion_flash_attn),
+             BOOL_STR(p.chroma_use_dit_mask),
+             BOOL_STR(p.chroma_use_t5_mask),
+             p.chroma_t5_mask_pad);
+    return text;
+}
+
+void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
+    memset((void*)sd_img_gen_params, 0, sizeof(*sd_img_gen_params));  // all-zero base, defaults follow
+    sd_img_gen_params_t& p        = *sd_img_gen_params;
+    p.clip_skip                   = -1;
+    p.guidance.txt_cfg            = 7.0f;
+    p.guidance.min_cfg            = 1.0f;
+    p.guidance.img_cfg            = INFINITY;
+    p.guidance.distilled_guidance = 3.5f;
+    p.guidance.slg.layer_count    = 0;
+    p.guidance.slg.layer_start    = 0.01f;
+    p.guidance.slg.layer_end      = 0.2f;
+    p.guidance.slg.scale          = 0.f;
+    p.ref_images_count            = 0;
+    p.width = p.height            = 512;
+    p.sample_method               = EULER_A;
+    p.sample_steps                = 20;
+    p.eta                         = 0.f;
+    p.strength                    = 0.75f;
+    p.seed                        = -1;
+    p.batch_count                 = 1;
+    p.control_strength            = 0.9f;
+    p.style_strength              = 20.f;
+    p.normalize_input             = false;
+}
+
+char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {  // result is malloc'd; NULL if malloc fails
+    const size_t capacity        = 4096;  // snprintf cuts the text off at capacity - 1 characters
+    const sd_img_gen_params_t& p = *sd_img_gen_params;
+    char* text                   = (char*)malloc(capacity);
+    if (text == NULL)
+        return NULL;
+    text[0] = '\0';
+    snprintf(text, capacity,
+             "prompt: %s\n"
+             "negative_prompt: %s\n"
+             "clip_skip: %d\n"
+             "txt_cfg: %.2f\n"
+             "img_cfg: %.2f\n"
+             "min_cfg: %.2f\n"
+             "distilled_guidance: %.2f\n"
+             "slg.layer_count: %zu\n"
+             "slg.layer_start: %.2f\n"
+             "slg.layer_end: %.2f\n"
+             "slg.scale: %.2f\n"
+             "width: %d\n"
+             "height: %d\n"
+             "sample_method: %s\n"
+             "sample_steps: %d\n"
+             "eta: %.2f\n"
+             "strength: %.2f\n"
+             "seed: %" PRId64
+             "\n"
+             "batch_count: %d\n"
+             "ref_images_count: %d\n"
+             "control_strength: %.2f\n"
+             "style_strength: %.2f\n"
+             "normalize_input: %s\n"
+             "input_id_images_path: %s\n",
+             SAFE_STR(p.prompt),
+             SAFE_STR(p.negative_prompt),
+             p.clip_skip,
+             p.guidance.txt_cfg,
+             p.guidance.img_cfg,
+             p.guidance.min_cfg,
+             p.guidance.distilled_guidance,
+             p.guidance.slg.layer_count,
+             p.guidance.slg.layer_start,
+             p.guidance.slg.layer_end,
+             p.guidance.slg.scale,
+             p.width,
+             p.height,
+             sd_sample_method_name(p.sample_method),
+             p.sample_steps,
+             p.eta,
+             p.strength,
+             p.seed,
+             p.batch_count,
+             p.ref_images_count,
+             p.control_strength,
+             p.style_strength,
+             BOOL_STR(p.normalize_input),
+             SAFE_STR(p.input_id_images_path));
+    return text;
+}
+
+void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
+    memset((void*)sd_vid_gen_params, 0, sizeof(*sd_vid_gen_params));  // all-zero base, defaults follow
+    sd_vid_gen_params_t& p        = *sd_vid_gen_params;
+    p.guidance.txt_cfg            = 7.0f;
+    p.guidance.min_cfg            = 1.0f;
+    p.guidance.img_cfg            = INFINITY;
+    p.guidance.distilled_guidance = 3.5f;
+    p.guidance.slg.layer_count    = 0;
+    p.guidance.slg.layer_start    = 0.01f;
+    p.guidance.slg.layer_end      = 0.2f;
+    p.guidance.slg.scale          = 0.f;
+    p.width = p.height            = 512;
+    p.sample_method               = EULER_A;
+    p.sample_steps                = 20;
+    p.strength                    = 0.75f;
+    p.seed                        = -1;
+    p.video_frames                = 6;
+    p.motion_bucket_id            = 127;
+    p.fps                         = 6;
+    p.augmentation_level          = 0.f;
+}
+
 struct sd_ctx_t {
     StableDiffusionGGML* sd = NULL;
 };
 
-sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
-                     const char* clip_l_path_c_str,
-                     const char* clip_g_path_c_str,
-                     const char* t5xxl_path_c_str,
-                     const char* diffusion_model_path_c_str,
-                     const char* vae_path_c_str,
-                     const char* taesd_path_c_str,
-                     const char* control_net_path_c_str,
-                     const char* lora_model_dir_c_str,
-                     const char* embed_dir_c_str,
-                     const char* id_embed_dir_c_str,
-                     bool vae_decode_only,
-                     bool vae_tiling,
-                     bool free_params_immediately,
-                     int n_threads,
-                     enum sd_type_t wtype,
-                     enum rng_type_t rng_type,
-                     enum schedule_t s,
-                     bool keep_clip_on_cpu,
-                     bool keep_control_net_cpu,
-                     bool keep_vae_on_cpu,
-                     bool diffusion_flash_attn,
-                     bool chroma_use_dit_mask,
-                     bool chroma_use_t5_mask,
-                     int chroma_t5_mask_pad) {
+sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
     }
-    std::string model_path(model_path_c_str);
-    std::string clip_l_path(clip_l_path_c_str);
-    std::string clip_g_path(clip_g_path_c_str);
-    std::string t5xxl_path(t5xxl_path_c_str);
-    std::string diffusion_model_path(diffusion_model_path_c_str);
-    std::string vae_path(vae_path_c_str);
-    std::string taesd_path(taesd_path_c_str);
-    std::string control_net_path(control_net_path_c_str);
-    std::string embd_path(embed_dir_c_str);
-    std::string id_embd_path(id_embed_dir_c_str);
-    std::string lora_model_dir(lora_model_dir_c_str);
-
-    sd_ctx->sd = new StableDiffusionGGML(n_threads,
-                                         vae_decode_only,
-                                         free_params_immediately,
-                                         lora_model_dir,
-                                         rng_type);
+
+    sd_ctx->sd = new StableDiffusionGGML();
     if (sd_ctx->sd == NULL) {
         return NULL;
     }
 
-    if (!sd_ctx->sd->load_from_file(model_path,
-                                    clip_l_path,
-                                    clip_g_path,
-                                    t5xxl_path_c_str,
-                                    diffusion_model_path,
-                                    vae_path,
-                                    control_net_path,
-                                    embd_path,
-                                    id_embd_path,
-                                    taesd_path,
-                                    vae_tiling,
-                                    (ggml_type)wtype,
-                                    s,
-                                    keep_clip_on_cpu,
-                                    keep_control_net_cpu,
-                                    keep_vae_on_cpu,
-                                    diffusion_flash_attn,
-                                    chroma_use_dit_mask,
-                                    chroma_use_t5_mask,
-                                    chroma_t5_mask_pad)) {
+    if (!sd_ctx->sd->init(sd_ctx_params)) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
         free(sd_ctx);
@@ -1275,28 +1507,28 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
     free(sd_ctx);
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx,
-                           struct ggml_context* work_ctx,
-                           ggml_tensor* init_latent,
-                           std::string prompt,
-                           std::string negative_prompt,
-                           int clip_skip,
-                           sd_guidance_params_t guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           const std::vector<float>& sigmas,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_ratio,
-                           bool normalize_input,
-                           std::string input_id_images_path,
-                           std::vector<ggml_tensor*> ref_latents,
-                           ggml_tensor* concat_latent = NULL,
-                           ggml_tensor* denoise_mask  = NULL) {
+sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
+                                    struct ggml_context* work_ctx,
+                                    ggml_tensor* init_latent,
+                                    std::string prompt,
+                                    std::string negative_prompt,
+                                    int clip_skip,
+                                    sd_guidance_params_t guidance,
+                                    float eta,
+                                    int width,
+                                    int height,
+                                    enum sample_method_t sample_method,
+                                    const std::vector<float>& sigmas,
+                                    int64_t seed,
+                                    int batch_count,
+                                    const sd_image_t* control_cond,
+                                    float control_strength,
+                                    float style_ratio,
+                                    bool normalize_input,
+                                    std::string input_id_images_path,
+                                    std::vector<ggml_tensor*> ref_latents,
+                                    ggml_tensor* concat_latent = NULL,
+                                    ggml_tensor* denoise_mask  = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1639,25 +1871,11 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
     return init_latent;
 }
 
-sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
-                    int clip_skip,
-                    sd_guidance_params_t guidance,
-                    float eta,
-                    int width,
-                    int height,
-                    enum sample_method_t sample_method,
-                    int sample_steps,
-                    int64_t seed,
-                    int batch_count,
-                    const sd_image_t* control_cond,
-                    float control_strength,
-                    float style_ratio,
-                    bool normalize_input,
-                    const char* input_id_images_path_c_str) {
-    LOG_DEBUG("txt2img %dx%d", width, height);
-    if (sd_ctx == NULL) {
+sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
+    if (sd_ctx == NULL || sd_img_gen_params == NULL) {  // reject NULL before the first dereference
         return NULL;
     }
+    int width  = sd_img_gen_params->width;  // safe: sd_img_gen_params was validated above
+    int height = sd_img_gen_params->height;
+    LOG_DEBUG("generate_image %dx%d", width, height);
 
@@ -1672,94 +1890,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
     if (sd_ctx->sd->stacked_id) {
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
-    params.mem_size += width * height * 3 * sizeof(float);
-    params.mem_size *= batch_count;
-    params.mem_buffer = NULL;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
-
-    struct ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return NULL;
-    }
-
-    size_t t0 = ggml_time_ms();
-
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-
-    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
-    }
-
-    ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
-
-    sd_image_t* result_images = generate_image(sd_ctx,
-                                               work_ctx,
-                                               init_latent,
-                                               prompt_c_str,
-                                               negative_prompt_c_str,
-                                               clip_skip,
-                                               guidance,
-                                               eta,
-                                               width,
-                                               height,
-                                               sample_method,
-                                               sigmas,
-                                               seed,
-                                               batch_count,
-                                               control_cond,
-                                               control_strength,
-                                               style_ratio,
-                                               normalize_input,
-                                               input_id_images_path_c_str,
-                                               {});
-
-    size_t t1 = ggml_time_ms();
-
-    LOG_INFO("txt2img completed in %.2fs", (t1 - t0) * 1.0f / 1000);
-
-    return result_images;
-}
-
-sd_image_t* img2img(sd_ctx_t* sd_ctx,
-                    sd_image_t init_image,
-                    sd_image_t mask,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
-                    int clip_skip,
-                    sd_guidance_params_t guidance,
-                    float eta,
-                    int width,
-                    int height,
-                    sample_method_t sample_method,
-                    int sample_steps,
-                    float strength,
-                    int64_t seed,
-                    int batch_count,
-                    const sd_image_t* control_cond,
-                    float control_strength,
-                    float style_ratio,
-                    bool normalize_input,
-                    const char* input_id_images_path_c_str) {
-    LOG_DEBUG("img2img %dx%d", width, height);
-    if (sd_ctx == NULL) {
-        return NULL;
-    }
-
-    struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        params.mem_size *= 2;
-    }
-    if (sd_version_is_flux(sd_ctx->sd->version)) {
-        params.mem_size *= 3;
-    }
-    if (sd_ctx->sd->stacked_id) {
-        params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    }
     params.mem_size += width * height * 3 * sizeof(float) * 3;
-    params.mem_size *= batch_count;
+    params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count;
+    params.mem_size *= sd_img_gen_params->batch_count;
     params.mem_buffer = NULL;
     params.no_alloc   = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);
@@ -1770,155 +1903,197 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
-    size_t t0 = ggml_time_ms();
-
+    int64_t seed = sd_img_gen_params->seed;
     if (seed < 0) {
         srand((int)time(NULL));
         seed = rand();
     }
     sd_ctx->sd->rng->manual_seed(seed);
 
-    ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-    ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+    size_t t0 = ggml_time_ms();
 
-    sd_mask_to_tensor(mask.data, mask_img);
+    ggml_tensor* init_latent   = NULL;
+    ggml_tensor* concat_latent = NULL;
+    ggml_tensor* denoise_mask  = NULL;
+    std::vector<float> sigmas  = sd_ctx->sd->denoiser->get_sigmas(sd_img_gen_params->sample_steps);
 
-    sd_image_to_tensor(init_image.data, init_img);
+    if (sd_img_gen_params->init_image.data) {
+        LOG_INFO("IMG2IMG");
 
-    ggml_tensor* concat_latent;
-    ggml_tensor* denoise_mask = NULL;
+        size_t t_enc = static_cast<size_t>(sd_img_gen_params->sample_steps * sd_img_gen_params->strength);
+        if (t_enc == sd_img_gen_params->sample_steps)
+            t_enc--;
+        LOG_INFO("target t_enc is %zu steps", t_enc);
+        std::vector<float> sigma_sched;
+        sigma_sched.assign(sigmas.begin() + sd_img_gen_params->sample_steps - t_enc - 1, sigmas.end());
+        sigmas = sigma_sched;
 
-    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        int64_t mask_channels = 1;
-        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-            mask_channels = 8 * 8;  // flatten the whole mask
-        }
-        ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        sd_apply_mask(init_img, mask_img, masked_img);
-        ggml_tensor* masked_latent = NULL;
-        if (!sd_ctx->sd->use_tiny_autoencoder) {
-            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-            masked_latent        = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-        } else {
-            masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-        }
-        concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], mask_channels + masked_latent->ne[2], 1);
-        for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
-                int mx = ix * 8;
-                int my = iy * 8;
-                if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                    for (int k = 0; k < masked_latent->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
-                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
-                    }
-                    // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
-                    for (int x = 0; x < 8; x++) {
-                        for (int y = 0; y < 8; y++) {
-                            float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
-                            // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
-                            // python code was using "b (h 8) (w 8) -> b (8 8) h w"
-                            ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
+        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+        ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+
+        sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img);
+        sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img);
+
+        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+            int64_t mask_channels = 1;
+            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                mask_channels = 8 * 8;  // flatten the whole mask
+            }
+            ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+            sd_apply_mask(init_img, mask_img, masked_img);
+            ggml_tensor* masked_latent = NULL;
+            if (!sd_ctx->sd->use_tiny_autoencoder) {
+                ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+                masked_latent        = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+            } else {
+                masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+            }
+            concat_latent = ggml_new_tensor_4d(work_ctx,
+                                               GGML_TYPE_F32,
+                                               masked_latent->ne[0],
+                                               masked_latent->ne[1],
+                                               mask_channels + masked_latent->ne[2],
+                                               1);
+            for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
+                for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
+                    int mx = ix * 8;
+                    int my = iy * 8;
+                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                        for (int k = 0; k < masked_latent->ne[2]; k++) {
+                            float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
+                        }
+                        // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
+                        for (int x = 0; x < 8; x++) {
+                            for (int y = 0; y < 8; y++) {
+                                float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
+                                // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
+                                // python code was using "b (h 8) (w 8) -> b (8 8) h w"
+                                ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
+                            }
+                        }
+                    } else {
+                        float m = ggml_tensor_get_f32(mask_img, mx, my);
+                        ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
+                        for (int k = 0; k < masked_latent->ne[2]; k++) {
+                            float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                         }
-                    }
-                } else {
-                    float m = ggml_tensor_get_f32(mask_img, mx, my);
-                    ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
-                    for (int k = 0; k < masked_latent->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
-                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                     }
                 }
             }
         }
-    }
 
-    {
-        // LOG_WARN("Inpainting with a base model is not great");
-        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
-        for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
-            for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
-                int mx  = ix * 8;
-                int my  = iy * 8;
-                float m = ggml_tensor_get_f32(mask_img, mx, my);
-                ggml_tensor_set_f32(denoise_mask, m, ix, iy);
+        {
+            // LOG_WARN("Inpainting with a base model is not great");
+            denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+            for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
+                for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
+                    int mx  = ix * 8;
+                    int my  = iy * 8;
+                    float m = ggml_tensor_get_f32(mask_img, mx, my);
+                    ggml_tensor_set_f32(denoise_mask, m, ix, iy);
+                }
             }
         }
-    }
 
-    ggml_tensor* init_latent = NULL;
-    if (!sd_ctx->sd->use_tiny_autoencoder) {
-        ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-        init_latent          = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+            init_latent          = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        } else {
+            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+        }
     } else {
-        init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+        LOG_INFO("TXT2IMG");
+        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+            LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
+        }
+        init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
+    }
+
+    if (sd_img_gen_params->ref_images_count > 0) {
+        LOG_INFO("EDIT mode");
+    }
+
+    std::vector<struct ggml_tensor*> ref_latents;
+    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+        ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
+                                              GGML_TYPE_F32,
+                                              sd_img_gen_params->ref_images[i].width,
+                                              sd_img_gen_params->ref_images[i].height,
+                                              3,
+                                              1);
+        sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img);
+
+        ggml_tensor* latent = NULL;
+        if (sd_ctx->sd->use_tiny_autoencoder) {
+            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+        } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) {
+            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+            latent = ggml_view_3d(work_ctx,
+                                  latent,
+                                  latent->ne[0],
+                                  latent->ne[1],
+                                  latent->ne[2] / 2,
+                                  latent->nb[1],
+                                  latent->nb[2],
+                                  0);
+        } else {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
+            latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        }
+        ref_latents.push_back(latent);
     }
 
-    size_t t1 = ggml_time_ms();
-    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-    size_t t_enc              = static_cast<size_t>(sample_steps * strength);
-    if (t_enc == sample_steps)
-        t_enc--;
-    LOG_INFO("target t_enc is %zu steps", t_enc);
-    std::vector<float> sigma_sched;
-    sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
-
-    sd_image_t* result_images = generate_image(sd_ctx,
-                                               work_ctx,
-                                               init_latent,
-                                               prompt_c_str,
-                                               negative_prompt_c_str,
-                                               clip_skip,
-                                               guidance,
-                                               eta,
-                                               width,
-                                               height,
-                                               sample_method,
-                                               sigma_sched,
-                                               seed,
-                                               batch_count,
-                                               control_cond,
-                                               control_strength,
-                                               style_ratio,
-                                               normalize_input,
-                                               input_id_images_path_c_str,
-                                               {},
-                                               concat_latent,
-                                               denoise_mask);
+    if (sd_img_gen_params->init_image.data != NULL || sd_img_gen_params->ref_images_count > 0) {
+        size_t t1 = ggml_time_ms();
+        LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+    }
+
+    sd_image_t* result_images = generate_image_internal(sd_ctx,
+                                                        work_ctx,
+                                                        init_latent,
+                                                        SAFE_STR(sd_img_gen_params->prompt),
+                                                        SAFE_STR(sd_img_gen_params->negative_prompt),
+                                                        sd_img_gen_params->clip_skip,
+                                                        sd_img_gen_params->guidance,
+                                                        sd_img_gen_params->eta,
+                                                        width,
+                                                        height,
+                                                        sd_img_gen_params->sample_method,
+                                                        sigmas,
+                                                        seed,
+                                                        sd_img_gen_params->batch_count,
+                                                        sd_img_gen_params->control_cond,
+                                                        sd_img_gen_params->control_strength,
+                                                        sd_img_gen_params->style_strength,
+                                                        sd_img_gen_params->normalize_input,
+                                                        sd_img_gen_params->input_id_images_path,
+                                                        ref_latents,
+                                                        concat_latent,
+                                                        denoise_mask);
 
     size_t t2 = ggml_time_ms();
 
-    LOG_INFO("img2img completed in %.2fs", (t2 - t0) * 1.0f / 1000);
+    LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000);
 
     return result_images;
 }
 
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           sd_guidance_params_t guidance,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed) {
-    if (sd_ctx == NULL) {
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
+    if (sd_ctx == NULL || sd_vid_gen_params == NULL) {
         return NULL;
     }
 
+    int width  = sd_vid_gen_params->width;
+    int height = sd_vid_gen_params->height;
     LOG_INFO("img2vid %dx%d", width, height);
 
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_vid_gen_params->sample_steps);
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
-    params.mem_size += width * height * 3 * sizeof(float) * video_frames;
+    params.mem_size += width * height * 3 * sizeof(float) * sd_vid_gen_params->video_frames;
     params.mem_buffer = NULL;
     params.no_alloc   = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);
@@ -1930,6 +2105,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
+    int64_t seed = sd_vid_gen_params->seed;
     if (seed < 0) {
         seed = (int)time(NULL);
     }
@@ -1939,12 +2115,12 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
     int64_t t0 = ggml_time_ms();
 
     SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx,
-                                                     init_image,
+                                                     sd_vid_gen_params->init_image,
                                                      width,
                                                      height,
-                                                     fps,
-                                                     motion_bucket_id,
-                                                     augmentation_level);
+                                                     sd_vid_gen_params->fps,
+                                                     sd_vid_gen_params->motion_bucket_id,
+                                                     sd_vid_gen_params->augmentation_level);
 
     auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn);
     ggml_set_f32(uc_crossattn, 0.f);
@@ -1966,13 +2142,13 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
     int C                   = 4;
     int W                   = width / 8;
     int H                   = height / 8;
-    struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
+    struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames);
     ggml_set_f32(x_t, 0.f);
 
-    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
+    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames);
     ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
 
-    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
+    LOG_INFO("sampling using %s method", sampling_methods_str[sd_vid_gen_params->sample_method]);
     struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                  x_t,
                                                  noise,
@@ -1981,9 +2157,9 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                                                  {},
                                                  {},
                                                  0.f,
-                                                 guidance,
+                                                 sd_vid_gen_params->guidance,
                                                  0.f,
-                                                 sample_method,
+                                                 sd_vid_gen_params->sample_method,
                                                  sigmas,
                                                  -1,
                                                  SDCondition(NULL, NULL, NULL));
@@ -2003,13 +2179,13 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
-    sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t));
+    sd_image_t* result_images = (sd_image_t*)calloc(sd_vid_gen_params->video_frames, sizeof(sd_image_t));
     if (result_images == NULL) {
         ggml_free(work_ctx);
         return NULL;
     }
 
-    for (size_t i = 0; i < video_frames; i++) {
+    for (size_t i = 0; i < sd_vid_gen_params->video_frames; i++) {
         auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i);
 
         result_images[i].width   = width;
@@ -2025,114 +2201,3 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
 
     return result_images;
 }
-
-sd_image_t* edit(sd_ctx_t* sd_ctx,
-                 sd_image_t* ref_images,
-                 int ref_images_count,
-                 const char* prompt_c_str,
-                 const char* negative_prompt_c_str,
-                 int clip_skip,
-                 sd_guidance_params_t guidance,
-                 float eta,
-                 int width,
-                 int height,
-                 enum sample_method_t sample_method,
-                 int sample_steps,
-                 int64_t seed,
-                 int batch_count,
-                 const sd_image_t* control_cond,
-                 float control_strength,
-                 float style_ratio,
-                 bool normalize_input,
-                 const char* input_id_images_path_c_str) {
-    LOG_DEBUG("edit %dx%d", width, height);
-    if (sd_ctx == NULL) {
-        return NULL;
-    }
-    if (ref_images_count <= 0) {
-        LOG_ERROR("ref images count should > 0");
-        return NULL;
-    }
-
-    struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(30 * 1024 * 1024);  // 10 MB
-    params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count;
-    params.mem_size *= batch_count;
-    params.mem_buffer = NULL;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
-
-    struct ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return NULL;
-    }
-
-    if (seed < 0) {
-        srand((int)time(NULL));
-        seed = rand();
-    }
-    sd_ctx->sd->rng->manual_seed(seed);
-
-    size_t t0 = ggml_time_ms();
-
-    std::vector<struct ggml_tensor*> ref_latents;
-    for (int i = 0; i < ref_images_count; i++) {
-        ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1);
-        sd_image_to_tensor(ref_images[i].data, img);
-
-        ggml_tensor* latent = NULL;
-        if (sd_ctx->sd->use_tiny_autoencoder) {
-            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
-        } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) {
-            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
-            latent = ggml_view_3d(work_ctx,
-                                  latent,
-                                  latent->ne[0],
-                                  latent->ne[1],
-                                  latent->ne[2] / 2,
-                                  latent->nb[1],
-                                  latent->nb[2],
-                                  0);
-        } else {
-            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
-            latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-        }
-        ref_latents.push_back(latent);
-    }
-
-    size_t t1 = ggml_time_ms();
-    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-
-    ggml_tensor* init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
-
-    sd_image_t* result_images = generate_image(sd_ctx,
-                                               work_ctx,
-                                               init_latent,
-                                               prompt_c_str,
-                                               negative_prompt_c_str,
-                                               clip_skip,
-                                               guidance,
-                                               eta,
-                                               width,
-                                               height,
-                                               sample_method,
-                                               sigmas,
-                                               seed,
-                                               batch_count,
-                                               control_cond,
-                                               control_strength,
-                                               style_ratio,
-                                               normalize_input,
-                                               "",
-                                               ref_latents,
-                                               NULL);
-
-    size_t t2 = ggml_time_ms();
-
-    LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000);
-
-    return result_images;
-}
\ No newline at end of file
diff --git a/stable-diffusion.h b/stable-diffusion.h
index ac50df1a..a6032592 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -30,7 +30,8 @@ extern "C" {
 
 enum rng_type_t {
     STD_DEFAULT_RNG,
-    CUDA_RNG
+    CUDA_RNG,
+    RNG_TYPE_COUNT
 };
 
 enum sample_method_t {
@@ -46,7 +47,7 @@ enum sample_method_t {
     LCM,
     DDIM_TRAILING,
     TCD,
-    N_SAMPLE_METHODS
+    SAMPLE_METHOD_COUNT
 };
 
 enum schedule_t {
@@ -56,7 +57,7 @@ enum schedule_t {
     EXPONENTIAL,
     AYS,
     GITS,
-    N_SCHEDULES
+    SCHEDULE_COUNT
 };
 
 // same as enum ggml_type
@@ -103,8 +104,6 @@ enum sd_type_t {
     SD_TYPE_COUNT = 39,
 };
 
-SD_API const char* sd_type_name(enum sd_type_t type);
-
 enum sd_log_level_t {
     SD_LOG_DEBUG,
     SD_LOG_INFO,
@@ -112,13 +111,33 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
-typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
-typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
-
-SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
-SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API int32_t get_num_physical_cores();
-SD_API const char* sd_get_system_info();
+typedef struct {
+    const char* model_path;
+    const char* clip_l_path;
+    const char* clip_g_path;
+    const char* t5xxl_path;
+    const char* diffusion_model_path;
+    const char* vae_path;
+    const char* taesd_path;
+    const char* control_net_path;
+    const char* lora_model_dir;
+    const char* embedding_dir;
+    const char* stacked_id_embed_dir;
+    bool vae_decode_only;
+    bool vae_tiling;
+    bool free_params_immediately;
+    int n_threads;
+    enum sd_type_t wtype;
+    enum rng_type_t rng_type;
+    enum schedule_t schedule;
+    bool keep_clip_on_cpu;
+    bool keep_control_net_on_cpu;
+    bool keep_vae_on_cpu;
+    bool diffusion_flash_attn;
+    bool chroma_use_dit_mask;
+    bool chroma_use_t5_mask;
+    int chroma_t5_mask_pad;
+} sd_ctx_params_t;
 
 typedef struct {
     uint32_t width;
@@ -127,8 +146,6 @@ typedef struct {
     uint8_t* data;
 } sd_image_t;
 
-typedef struct sd_ctx_t sd_ctx_t;
-
 typedef struct {
     int* layers;
     size_t layer_count;
@@ -145,106 +162,76 @@ typedef struct {
     sd_slg_params_t slg;
 } sd_guidance_params_t;
 
-SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
-                            const char* clip_l_path,
-                            const char* clip_g_path,
-                            const char* t5xxl_path,
-                            const char* diffusion_model_path,
-                            const char* vae_path,
-                            const char* taesd_path,
-                            const char* control_net_path_c_str,
-                            const char* lora_model_dir,
-                            const char* embed_dir_c_str,
-                            const char* stacked_id_embed_dir_c_str,
-                            bool vae_decode_only,
-                            bool vae_tiling,
-                            bool free_params_immediately,
-                            int n_threads,
-                            enum sd_type_t wtype,
-                            enum rng_type_t rng_type,
-                            enum schedule_t s,
-                            bool keep_clip_on_cpu,
-                            bool keep_control_net_cpu,
-                            bool keep_vae_on_cpu,
-                            bool diffusion_flash_attn,
-                            bool chroma_use_dit_mask,
-                            bool chroma_use_t5_mask,
-                            int chroma_t5_mask_pad);
+typedef struct {
+    const char* prompt;
+    const char* negative_prompt;
+    int clip_skip;
+    sd_guidance_params_t guidance;
+    sd_image_t init_image;
+    sd_image_t* ref_images;
+    int ref_images_count;
+    sd_image_t mask_image;
+    int width;
+    int height;
+    enum sample_method_t sample_method;
+    int sample_steps;
+    float eta;
+    float strength;
+    int64_t seed;
+    int batch_count;
+    const sd_image_t* control_cond;
+    float control_strength;
+    float style_strength;
+    bool normalize_input;
+    const char* input_id_images_path;
+} sd_img_gen_params_t;
 
-SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
+typedef struct {
+    sd_image_t init_image;
+    int width;
+    int height;
+    sd_guidance_params_t guidance;
+    enum sample_method_t sample_method;
+    int sample_steps;
+    float strength;
+    int64_t seed;
+    int video_frames;
+    int motion_bucket_id;
+    int fps;
+    float augmentation_level;
+} sd_vid_gen_params_t;
+
+typedef struct sd_ctx_t sd_ctx_t;
+
+typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
+typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
+
+SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
+SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
+SD_API int32_t get_num_physical_cores();
+SD_API const char* sd_get_system_info();
 
-SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           sd_guidance_params_t guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path);
+SD_API const char* sd_type_name(enum sd_type_t type);
+SD_API enum sd_type_t str_to_sd_type(const char* str);
+SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
+SD_API enum rng_type_t str_to_rng_type(const char* str);
+SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
+SD_API enum sample_method_t str_to_sample_method(const char* str);
+SD_API const char* sd_schedule_name(enum schedule_t schedule);
+SD_API enum schedule_t str_to_schedule(const char* str);
+
+SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
+SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 
-SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           sd_image_t mask_image,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           sd_guidance_params_t guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path);
+SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
+SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           sd_guidance_params_t guidance,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed);
+SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
+SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
 
-SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
-                        sd_image_t* ref_images,
-                        int ref_images_count,
-                        const char* prompt,
-                        const char* negative_prompt,
-                        int clip_skip,
-                        sd_guidance_params_t guidance,
-                        float eta,
-                        int width,
-                        int height,
-                        enum sample_method_t sample_method,
-                        int sample_steps,
-                        int64_t seed,
-                        int batch_count,
-                        const sd_image_t* control_cond,
-                        float control_strength,
-                        float style_strength,
-                        bool normalize_input,
-                        const char* input_id_images_path);
+SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params);  // broken
 
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
@@ -254,7 +241,11 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
 
-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* tensor_type_rules);
+SD_API bool convert(const char* input_path,
+                    const char* vae_path,
+                    const char* output_path,
+                    enum sd_type_t output_type,
+                    const char* tensor_type_rules);
 
 SD_API uint8_t* preprocess_canny(uint8_t* img,
                                  int width,
diff --git a/util.cpp b/util.cpp
index 631c1206..92bc9ef5 100644
--- a/util.cpp
+++ b/util.cpp
@@ -441,10 +441,6 @@ const char* sd_get_system_info() {
     return buffer;
 }
 
-const char* sd_type_name(enum sd_type_t type) {
-    return ggml_type_name((ggml_type)type);
-}
-
 sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
     sd_image_f32_t converted_image;
     converted_image.width   = image.width;
diff --git a/util.h b/util.h
index 14fa812e..d98c9a28 100644
--- a/util.h
+++ b/util.h
@@ -7,6 +7,9 @@
 
 #include "stable-diffusion.h"
 
+#define SAFE_STR(s) ((s) ? (s) : "")
+#define BOOL_STR(b) ((b) ? "true" : "false")
+
 bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);