leejet · leejet · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025
diff --git a/README.md b/README.md
@@ -299,9 +299,6 @@ arguments:
   --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
   --control-net [CONTROL_PATH]       path to control net model
   --embd-dir [EMBEDDING_PATH]        path to embeddings
-  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings
-  --input-id-images-dir [DIR]        path to PHOTOMAKER input id images dir
-  --normalize-input                  normalize PHOTOMAKER input id images
   --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
   --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
   --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
@@ -348,7 +345,6 @@ arguments:
   --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)
                                      SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
   --strength STRENGTH                strength for noising/unnoising (default: 0.75)
-  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)
   --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
                                      1.0 corresponds to full destruction of information in init image
   -H, --height H                     image height, in pixel space (default: 512)
@@ -383,6 +379,11 @@ arguments:
                                      only enabled if `--high-noise-steps` is set to -1
   --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)
   --vace-strength                    wan vace strength
+  --photo-maker                      path to PHOTOMAKER model
+  --pm-id-images-dir [DIR]           path to PHOTOMAKER input id images dir
+  --pm-id-embed-path [PATH]          path to PHOTOMAKER v2 id embed
+  --pm-style-strength                strength for keeping PHOTOMAKER input identity (default: 20)
+  --normalize-input                  normalize PHOTOMAKER input id images
   -v, --verbose                      print extra info
 ```
 

diff --git a/docs/photo_maker.md b/docs/photo_maker.md
@@ -6,16 +6,15 @@ You can use [PhotoMaker](https://github.com/TencentARC/PhotoMaker) to personaliz
 
 Download PhotoMaker model file (in safetensor format) [here](https://huggingface.co/bssrdf/PhotoMaker). The official release of the model file (in .bin format) does not work with ```stablediffusion.cpp```.
 
-- Specify the PhotoMaker model path using the `--stacked-id-embd-dir PATH` parameter.
-- Specify the input images path using the `--input-id-images-dir PATH` parameter.
-  - input images **must** have the same width and height for preprocessing (to be improved)
+- Specify the PhotoMaker model path using the `--photo-maker PATH` parameter.
+- Specify the input images path using the `--pm-id-images-dir PATH` parameter.
 
 In prompt, make sure you have a class word followed by the trigger word ```"img"``` (hard-coded for now). The class word could be one of ```"man, woman, girl, boy"```. If input ID images contain asian faces, add ```Asian``` before the class
 word.
 
 Another PhotoMaker specific parameter:
 
-- ```--style-ratio  (0-100)%```: default is 20 and 10-20 typically gets good results. Lower ratio means more faithfully following input ID (not necessarily better quality).
+- ```--pm-style-strength  (0-100)%```: default is 20; values of 10-20 typically give good results. A lower value follows the input ID more faithfully (though not necessarily with better quality).
 
 Other parameters recommended for running Photomaker:
 
@@ -28,7 +27,7 @@ If on low memory GPUs (<= 8GB), recommend running with ```--vae-on-cpu``` option
 Example:
 
 ```bash
-bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors  --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0  --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
+bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors  --vae ../models/sdxl_vae.safetensors --photo-maker ../models/photomaker-v1.safetensors --pm-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0  --sampling-method euler -H 1024 -W 1024 --pm-style-strength 10 --vae-on-cpu --steps 50
 ```
 
 ## PhotoMaker Version 2

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -66,8 +66,6 @@ struct SDParams {
     std::string esrgan_path;
     std::string control_net_path;
     std::string embedding_dir;
-    std::string stacked_id_embed_dir;
-    std::string input_id_images_path;
     sd_type_t wtype = SD_TYPE_COUNT;
     std::string tensor_type_rules;
     std::string lora_model_dir;
@@ -82,11 +80,10 @@ struct SDParams {
 
     std::string prompt;
     std::string negative_prompt;
-    float style_ratio = 20.f;
-    int clip_skip     = -1;  // <= 0 represents unspecified
-    int width         = 512;
-    int height        = 512;
-    int batch_count   = 1;
+    int clip_skip   = -1;  // <= 0 represents unspecified
+    int width       = 512;
+    int height      = 512;
+    int batch_count = 1;
 
     std::vector<int> skip_layers = {7, 8, 9};
     sd_sample_params_t sample_params;
@@ -116,6 +113,12 @@ struct SDParams {
     bool color                 = false;
     int upscale_repeats        = 1;
 
+    // Photo Maker
+    std::string photo_maker_path;
+    std::string pm_id_images_dir;
+    std::string pm_id_embed_path;
+    float pm_style_strength = 20.f;
+
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask  = false;
     int chroma_t5_mask_pad   = 1;
@@ -149,9 +152,10 @@ void print_params(SDParams params) {
     printf("    esrgan_path:                       %s\n", params.esrgan_path.c_str());
     printf("    control_net_path:                  %s\n", params.control_net_path.c_str());
     printf("    embedding_dir:                     %s\n", params.embedding_dir.c_str());
-    printf("    stacked_id_embed_dir:              %s\n", params.stacked_id_embed_dir.c_str());
-    printf("    input_id_images_path:              %s\n", params.input_id_images_path.c_str());
-    printf("    style ratio:                       %.2f\n", params.style_ratio);
+    printf("    photo_maker_path:                  %s\n", params.photo_maker_path.c_str());
+    printf("    pm_id_images_dir:                  %s\n", params.pm_id_images_dir.c_str());
+    printf("    pm_id_embed_path:                  %s\n", params.pm_id_embed_path.c_str());
+    printf("    pm_style_strength:                 %.2f\n", params.pm_style_strength);
     printf("    normalize input image:             %s\n", params.normalize_input ? "true" : "false");
     printf("    output_path:                       %s\n", params.output_path.c_str());
     printf("    init_image_path:                   %s\n", params.init_image_path.c_str());
@@ -217,9 +221,6 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
     printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
-    printf("  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings\n");
-    printf("  --input-id-images-dir [DIR]        path to PHOTOMAKER input id images dir\n");
-    printf("  --normalize-input                  normalize PHOTOMAKER input id images\n");
     printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
     printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
     printf("  --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n");
@@ -266,7 +267,6 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
-    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
     printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
     printf("                                     1.0 corresponds to full destruction of information in init image\n");
     printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
@@ -301,6 +301,11 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     only enabled if `--high-noise-steps` is set to -1\n");
     printf("  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)\n");
     printf("  --vace-strength                    wan vace strength\n");
+    printf("  --photo-maker                      path to PHOTOMAKER model\n");
+    printf("  --pm-id-images-dir [DIR]           path to PHOTOMAKER input id images dir\n");
+    printf("  --pm-id-embed-path [PATH]          path to PHOTOMAKER v2 id embed\n");
+    printf("  --pm-style-strength                strength for keeping PHOTOMAKER input identity (default: 20)\n");
+    printf("  --normalize-input                  normalize PHOTOMAKER input id images\n");
     printf("  -v, --verbose                      print extra info\n");
 }
 
@@ -487,12 +492,13 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--taesd", "", &params.taesd_path},
         {"", "--control-net", "", &params.control_net_path},
         {"", "--embd-dir", "", &params.embedding_dir},
-        {"", "--stacked-id-embd-dir", "", &params.stacked_id_embed_dir},
         {"", "--lora-model-dir", "", &params.lora_model_dir},
         {"-i", "--init-img", "", &params.init_image_path},
         {"", "--end-img", "", &params.end_image_path},
         {"", "--tensor-type-rules", "", &params.tensor_type_rules},
-        {"", "--input-id-images-dir", "", &params.input_id_images_path},
+        {"", "--photo-maker", "", &params.photo_maker_path},
+        {"", "--pm-id-images-dir", "", &params.pm_id_images_dir},
+        {"", "--pm-id-embed-path", "", &params.pm_id_embed_path},
         {"", "--mask", "", &params.mask_image_path},
         {"", "--control-image", "", &params.control_image_path},
         {"", "--control-video", "", &params.control_video_path},
@@ -532,7 +538,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--high-noise-skip-layer-end", "", &params.high_noise_sample_params.guidance.slg.layer_end},
         {"", "--high-noise-eta", "", &params.high_noise_sample_params.eta},
         {"", "--strength", "", &params.strength},
-        {"", "--style-ratio", "", &params.style_ratio},
+        {"", "--pm-style-strength", "", &params.pm_style_strength},
         {"", "--control-strength", "", &params.control_strength},
         {"", "--moe-boundary", "", &params.moe_boundary},
         {"", "--flow-shift", "", &params.flow_shift},
@@ -1075,14 +1081,58 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte
                      STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
                      STBIR_FILTER_BOX, STBIR_FILTER_BOX,
                      STBIR_COLORSPACE_SRGB, nullptr);
-
-        // Save resized result
+        width  = resized_width;
+        height = resized_height;
         free(image_buffer);
         image_buffer = resized_image_buffer;
     }
     return image_buffer;
 }
 
+// Appends every .jpg/.jpeg/.png/.bmp regular file found directly in `dir` to `images` as a 3-channel
+// sd_image_t, loading each through load_image() (which receives expected_width/expected_height) in
+// sorted path order and stopping once images.size() reaches max_image_num (when > 0). Returns false
+// if `dir` is not a directory or a load fails; buffers appended before the failure stay in `images`.
+bool load_images_from_dir(const std::string& dir,
+                          std::vector<sd_image_t>& images,
+                          int expected_width  = 0,
+                          int expected_height = 0,
+                          int max_image_num   = 0,
+                          bool verbose        = false) {
+    if (!fs::exists(dir) || !fs::is_directory(dir)) {
+        fprintf(stderr, "'%s' is not a valid directory\n", dir.c_str());
+        return false;
+    }
+    std::vector<std::string> paths;
+    for (const auto& entry : fs::directory_iterator(dir)) {
+        std::string ext = entry.path().extension().string();
+        // tolower() expects an unsigned char value; passing a negative raw char is undefined behavior.
+        std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) { return (char)::tolower(c); });
+        if (entry.is_regular_file() && (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp")) {
+            paths.push_back(entry.path().string());
+        }
+    }
+    // directory_iterator visits entries in unspecified order; sort so images load deterministically.
+    std::sort(paths.begin(), paths.end());
+    for (const std::string& path : paths) {
+        if (verbose) {
+            printf("load image %zu from '%s'\n", images.size(), path.c_str());
+        }
+        int width             = 0;
+        int height            = 0;
+        uint8_t* image_buffer = load_image(path.c_str(), width, height, expected_width, expected_height);
+        if (image_buffer == nullptr) {
+            fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+            return false;
+        }
+        images.push_back({(uint32_t)width, (uint32_t)height, 3, image_buffer});
+        if (max_image_num > 0 && images.size() >= (size_t)max_image_num) {
+            break;
+        }
+    }
+    return true;
+}
+
 int main(int argc, const char* argv[]) {
     SDParams params;
     parse_args(argc, argv, params);
@@ -1122,21 +1172,27 @@ int main(int argc, const char* argv[]) {
     sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL};
     sd_image_t mask_image    = {(uint32_t)params.width, (uint32_t)params.height, 1, NULL};
     std::vector<sd_image_t> ref_images;
+    std::vector<sd_image_t> pmid_images;
     std::vector<sd_image_t> control_frames;
 
     auto release_all_resources = [&]() {
         free(init_image.data);
         free(end_image.data);
         free(control_image.data);
         free(mask_image.data);
-        for (auto ref_image : ref_images) {
-            free(ref_image.data);
-            ref_image.data = NULL;
+        for (auto image : ref_images) {
+            free(image.data);
+            image.data = NULL;
         }
         ref_images.clear();
-        for (auto frame : control_frames) {
-            free(frame.data);
-            frame.data = NULL;
+        for (auto image : pmid_images) {
+            free(image.data);
+            image.data = NULL;
+        }
+        pmid_images.clear();
+        for (auto image : control_frames) {
+            free(image.data);
+            image.data = NULL;
         }
         control_frames.clear();
     };
@@ -1225,44 +1281,26 @@ int main(int argc, const char* argv[]) {
     }
 
     if (!params.control_video_path.empty()) {
-        std::string dir = params.control_video_path;
-
-        if (!fs::exists(dir) || !fs::is_directory(dir)) {
-            fprintf(stderr, "'%s' is not a valid directory\n", dir.c_str());
+        if (!load_images_from_dir(params.control_video_path,
+                                  control_frames,
+                                  params.width,
+                                  params.height,
+                                  params.video_frames,
+                                  params.verbose)) {
             release_all_resources();
             return 1;
         }
+    }
 
-        for (const auto& entry : fs::directory_iterator(dir)) {
-            if (!entry.is_regular_file())
-                continue;
-
-            std::string path = entry.path().string();
-            std::string ext  = entry.path().extension().string();
-            std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
-
-            if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp") {
-                if (params.verbose) {
-                    printf("load control frame %zu from '%s'\n", control_frames.size(), path.c_str());
-                }
-                int width             = 0;
-                int height            = 0;
-                uint8_t* image_buffer = load_image(path.c_str(), width, height, params.width, params.height);
-                if (image_buffer == NULL) {
-                    fprintf(stderr, "load image from '%s' failed\n", path.c_str());
-                    release_all_resources();
-                    return 1;
-                }
-
-                control_frames.push_back({(uint32_t)params.width,
-                                          (uint32_t)params.height,
-                                          3,
-                                          image_buffer});
-
-                if (control_frames.size() >= params.video_frames) {
-                    break;
-                }
-            }
+    if (!params.pm_id_images_dir.empty()) {
+        if (!load_images_from_dir(params.pm_id_images_dir,
+                                  pmid_images,
+                                  0,
+                                  0,
+                                  0,
+                                  params.verbose)) {
+            release_all_resources();
+            return 1;
         }
     }
 
@@ -1283,7 +1321,7 @@ int main(int argc, const char* argv[]) {
         params.control_net_path.c_str(),
         params.lora_model_dir.c_str(),
         params.embedding_dir.c_str(),
-        params.stacked_id_embed_dir.c_str(),
+        params.photo_maker_path.c_str(),
         vae_decode_only,
         true,
         params.n_threads,
@@ -1334,9 +1372,13 @@ int main(int argc, const char* argv[]) {
             params.batch_count,
             control_image,
             params.control_strength,
-            params.style_ratio,
             params.normalize_input,
-            params.input_id_images_path.c_str(),
+            {
+                pmid_images.data(),
+                (int)pmid_images.size(),
+                params.pm_id_embed_path.c_str(),
+                params.pm_style_strength,
+            },  // pm_params
             params.vae_tiling_params,
         };