leejet · leejet · May 31, 2026 · May 28, 2026 · May 31, 2026 · May 31, 2026
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ API and command-line option may change frequently.***
 
 ## 🔥Important News
 
+* **2026/05/31** 🚀 stable-diffusion.cpp now supports **PiD**
 * **2026/05/27** 🚀 stable-diffusion.cpp now supports **Lens**
 * **2026/05/17** 🚀 stable-diffusion.cpp now supports **LTX-2.3**
 * **2026/04/11** 🚀 stable-diffusion.cpp now uses a brand-new embedded web UI.  
@@ -42,6 +43,7 @@ API and command-line option may change frequently.***
     - [Chroma](./docs/chroma.md)
     - [Chroma1-Radiance](./docs/chroma_radiance.md)
     - [Qwen Image](./docs/qwen_image.md)
+    - [PiD](./docs/pid.md)
     - [LongCat Image](./docs/longcat_image.md)
     - [Z-Image](./docs/z_image.md)
     - [Ovis-Image](./docs/ovis_image.md)

diff --git a/assets/pid/example.png b/assets/pid/example.png
diff --git a/docs/pid.md b/docs/pid.md
@@ -0,0 +1,39 @@
+# How to Use
+
+PiD is NVIDIA's Pixel Diffusion Decoder. It replaces the usual VAE decode or decode-then-upscale path with a pixel-space diffusion decoder conditioned on a
+source latent and text prompt.
+
+In stable-diffusion.cpp, PiD currently runs as an image edit pipeline: you provide a reference image with `-r`/`--ref-image`, the image is encoded with a matching VAE, and the PiD diffusion model then decodes/upscales the resulting latent directly to RGB.
+
+## Download weights
+
+- Download PiD
+    - safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/diffusion_models
+- Download Gemma 2 2B
+    - safetensors: https://huggingface.co/Comfy-Org/PixelDiT/tree/main/text_encoders
+- Download the VAE that matches the PiD checkpoint backbone
+    - safetensors: https://huggingface.co/nvidia/PiD/tree/main/checkpoints
+    - Flux / Z-Image PiD: use the Flux VAE and pass `--vae-format flux`
+    - SD3 PiD: use the SD3 VAE and pass `--vae-format sd3`
+    - Flux.2 PiD: use the Flux.2 VAE and pass `--vae-format flux2`
+
+Check the official PiD model card before use. At the time of the initial PiD release, the official weights were published under the NSCLv1 non-commercial license.
+
+## Examples
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model ..\..\ComfyUI\models\diffusion_models\pid_flux1_512_to_2048_4step_bf16.safetensors --llm "..\..\ComfyUI\models\text_encoders\gemma_2_2b_it_elm_bf16.safetensors" --vae ..\..\ComfyUI\models\vae\ae.sft --vae-format flux --cfg-scale 1.0  -p "a lovely cat" -r ..\assets\ernie_image\turbo_example.png --diffusion-fa -v --steps 4 -H 2048 -W 2048 --rng cpu
+```
+
+Before:
+
+<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />
+
+After:
+<img width="1024" alt="PiD example" src="../assets/pid/example.png" />
+
+## Notes
+
+- `-r`/`--ref-image` is required. PiD uses the first reference image as the source latent condition.
+- `--vae-format` should match the VAE latent layout used by the PiD checkpoint. This is important when using standalone VAE files because the PiD diffusion
+  checkpoint alone does not identify the VAE format.
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -35,6 +35,22 @@ const char* const modes_str[] = {
     "metadata",
 };
 
+static sd_vae_format_t str_to_vae_format(const std::string& value) {
+    // Translate a --vae-format name into its enum value; SD_VAE_FORMAT_COUNT marks an unrecognized name.
+    static const struct { const char* name; sd_vae_format_t format; } known_formats[] = {
+        {"auto", SD_VAE_FORMAT_AUTO},
+        {"flux", SD_VAE_FORMAT_FLUX},
+        {"sd3", SD_VAE_FORMAT_SD3},
+        {"flux2", SD_VAE_FORMAT_FLUX2},
+    };
+    for (const auto& entry : known_formats) {
+        if (value == entry.name) {
+            return entry.format;
+        }
+    }
+    return SD_VAE_FORMAT_COUNT;
+}
+
 #if defined(_WIN32)
 static std::string utf16_to_utf8(const std::wstring& wstr) {
     if (wstr.empty())
@@ -348,6 +364,10 @@ ArgOptions SDContextParams::get_options() {
          "--vae",
          "path to standalone vae model",
          &vae_path},
+        {"",
+         "--vae-format",
+         "VAE latent format override: auto, flux, sd3, or flux2 (default: auto)",
+         &vae_format},
         {"",
          "--audio-vae",
          "path to standalone LTX audio vae model",
@@ -639,6 +659,11 @@ bool SDContextParams::validate(SDMode mode) {
         }
     }
 
+    if (str_to_vae_format(vae_format) == SD_VAE_FORMAT_COUNT) {
+        LOG_ERROR("error: vae_format must be 'auto', 'flux', 'sd3', or 'flux2'");
+        return false;
+    }
+
     return true;
 }
 
@@ -679,6 +704,7 @@ std::string SDContextParams::to_string() const {
         << "  high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
         << "  embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
         << "  vae_path: \"" << vae_path << "\",\n"
+        << "  vae_format: \"" << vae_format << "\",\n"
         << "  audio_vae_path: \"" << audio_vae_path << "\",\n"
         << "  taesd_path: \"" << taesd_path << "\",\n"
         << "  esrgan_path: \"" << esrgan_path << "\",\n"
@@ -772,6 +798,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         chroma_use_t5_mask,
         chroma_t5_mask_pad,
         qwen_image_zero_cond_t,
+        str_to_vae_format(vae_format),
         max_vram,
         backend.c_str(),
         params_backend.c_str(),

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -94,6 +94,7 @@ struct SDContextParams {
     std::string high_noise_diffusion_model_path;
     std::string embeddings_connectors_path;
     std::string vae_path;
+    std::string vae_format = "auto";
     std::string audio_vae_path;
     std::string taesd_path;
     std::string esrgan_path;

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -168,6 +168,14 @@ typedef struct {
     const char* path;
 } sd_embedding_t;
 
+enum sd_vae_format_t {
+    SD_VAE_FORMAT_AUTO = -1,  // "auto" on the CLI (the --vae-format default): no explicit format override
+    SD_VAE_FORMAT_FLUX,       // "flux" on the CLI
+    SD_VAE_FORMAT_SD3,        // "sd3" on the CLI
+    SD_VAE_FORMAT_FLUX2,      // "flux2" on the CLI
+    SD_VAE_FORMAT_COUNT,      // number of concrete formats; the CLI parser also returns it for unrecognized names
+};
+
 typedef struct {
     const char* model_path;
     const char* clip_l_path;
@@ -212,6 +220,7 @@ typedef struct {
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
+    enum sd_vae_format_t vae_format;
     float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
     const char* backend;
     const char* params_backend;

diff --git a/src/conditioner.hpp b/src/conditioner.hpp
@@ -1171,7 +1171,6 @@ struct FluxCLIPEmbedder : public Conditioner {
         return true;
     }
 
-
     void free_params_buffer() override {
         if (clip_l) {
             clip_l->free_params_buffer();
@@ -1601,8 +1600,8 @@ struct AnimaConditioner : public Conditioner {
 
     bool alloc_params_buffer() override {
         if (!llm->alloc_params_buffer()) {
-                return false;
-            }
+            return false;
+        }
         return true;
     }
 
@@ -1719,13 +1718,17 @@ struct LLMEmbedder : public Conditioner {
             arch = LLM::LLMArch::MINISTRAL_3_3B;
         } else if (sd_version_is_lens(version)) {
             arch = LLM::LLMArch::GPT_OSS_20B;
+        } else if (sd_version_is_pid(version)) {
+            arch = LLM::LLMArch::GEMMA2_2B;
         } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) {
             arch = LLM::LLMArch::QWEN3;
         }
         if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2 || arch == LLM::LLMArch::MINISTRAL_3_3B) {
             tokenizer = std::make_shared<MistralTokenizer>();
         } else if (arch == LLM::LLMArch::GPT_OSS_20B) {
             tokenizer = std::make_shared<GPTOSSTokenizer>();
+        } else if (arch == LLM::LLMArch::GEMMA2_2B) {
+            tokenizer = std::make_shared<Gemma2Tokenizer>();
         } else {
             tokenizer = std::make_shared<Qwen2Tokenizer>();
         }
@@ -1743,7 +1746,7 @@ struct LLMEmbedder : public Conditioner {
 
     bool alloc_params_buffer() override {
         if (!llm->alloc_params_buffer()) {
-                return false;
+            return false;
         }
         return true;
     }
@@ -1847,12 +1850,16 @@ struct LLMEmbedder : public Conditioner {
         sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
         sd::Tensor<float> attention_mask;
         if (!mask.empty()) {
-            attention_mask = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
+            attention_mask                     = sd::Tensor<float>({static_cast<int64_t>(mask.size()), static_cast<int64_t>(mask.size())});
+            const float masked_attention_value = -std::numeric_limits<float>::max() / 4.0f;
             for (size_t i1 = 0; i1 < mask.size(); ++i1) {
                 for (size_t i0 = 0; i0 < mask.size(); ++i0) {
                     float value = 0.0f;
-                    if (mask[i0] == 0.0f || i0 > i1) {
-                        value = -INFINITY;
+                    if (mask[i0] == 0.0f) {
+                        value += masked_attention_value;
+                    }
+                    if (i0 > i1) {
+                        value += masked_attention_value;
                     }
                     attention_mask[static_cast<int64_t>(i0 + mask.size() * i1)] = value;
                 }
@@ -2126,6 +2133,53 @@ struct LLMEmbedder : public Conditioner {
             prompt_attn_range.second = static_cast<int>(prompt.size());
 
             prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        } else if (sd_version_is_pid(version)) {
+            constexpr int pixeldit_max_length = 300;
+            const std::string chi_prompt =
+                "Given a user prompt, generate an \"Enhanced prompt\" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:\n"
+                "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.\n"
+                "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n"
+                "Here are examples of how to transform or refine prompts:\n"
+                "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.\n"
+                "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n"
+                "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:\n"
+                "User Prompt: ";
+            auto chi_tokens       = std::get<0>(tokenize(chi_prompt, {0, 0}));
+            size_t num_chi_tokens = chi_tokens.size();
+            max_length            = (int)num_chi_tokens + pixeldit_max_length - 2;
+            min_length            = max_length;
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += " " + conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            auto hidden_states = encode_prompt(n_threads,
+                                               prompt,
+                                               prompt_attn_range,
+                                               min_length,
+                                               0,
+                                               image_embeds,
+                                               out_layers,
+                                               0,
+                                               false,
+                                               max_length);
+            GGML_ASSERT(!hidden_states.empty());
+
+            if (hidden_states.shape()[1] > pixeldit_max_length) {
+                auto bos      = sd::ops::slice(hidden_states, 1, 0, 1);
+                auto tail     = sd::ops::slice(hidden_states,
+                                               1,
+                                               hidden_states.shape()[1] - (pixeldit_max_length - 1),
+                                               hidden_states.shape()[1]);
+                hidden_states = sd::ops::concat(bos, tail, 1);
+            }
+
+            int64_t t1 = ggml_time_ms();
+            LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+
+            SDCondition result;
+            result.c_crossattn = std::move(hidden_states);
+            return result;
         } else {
             GGML_ABORT("unknown version %d", version);
         }
@@ -2268,10 +2322,10 @@ struct LTXAVEmbedder : public Conditioner {
 
     bool alloc_params_buffer() override {
         if (!llm->alloc_params_buffer()) {
-                return false;
+            return false;
         }
         if (!projector->alloc_params_buffer()) {
-                return false;
+            return false;
         }
         return true;
     }

diff --git a/src/llm.hpp b/src/llm.hpp
@@ -37,6 +37,7 @@ namespace LLM {
         MISTRAL_SMALL_3_2,
         MINISTRAL_3_3B,
         GEMMA3_12B,
+        GEMMA2_2B,
         GPT_OSS_20B,
         ARCH_COUNT,
     };
@@ -48,6 +49,7 @@ namespace LLM {
         "mistral_small3.2",
         "ministral3.3b",
         "gemma3_12b",
+        "gemma2_2b",
         "gpt_oss_20b",
     };
 
@@ -900,6 +902,33 @@ namespace LLM {
                                                  1.f,
                                                  32.f,
                                                  1.f);
+            } else if (arch == LLMArch::GEMMA2_2B) {
+                q = ggml_rope_ext(ctx->ggml_ctx,
+                                  q,
+                                  input_pos,
+                                  nullptr,
+                                  head_dim,
+                                  GGML_ROPE_TYPE_NEOX,
+                                  8192,
+                                  10000.f,
+                                  1.f,
+                                  0.f,
+                                  1.f,
+                                  32.f,
+                                  1.f);
+                k = ggml_rope_ext(ctx->ggml_ctx,
+                                  k,
+                                  input_pos,
+                                  nullptr,
+                                  head_dim,
+                                  GGML_ROPE_TYPE_NEOX,
+                                  8192,
+                                  10000.f,
+                                  1.f,
+                                  0.f,
+                                  1.f,
+                                  32.f,
+                                  1.f);
             } else if (arch == LLMArch::QWEN3_VL) {
                 int sections[4] = {24, 20, 20, 0};
                 q               = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_IMROPE, 262144, 5000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@@ -957,10 +986,18 @@ namespace LLM {
             : arch(params.arch),
               sliding_attention(0) {
             if (params.arch == LLMArch::GEMMA3_12B) {
-                post_attention_norm_name = "post_attention_norm";
-                post_ffw_norm_name       = "post_ffw_norm";
+                post_attention_norm_name = "post_attention_norm";       // attn_post_norm
+                pre_ffw_norm_name        = "post_attention_layernorm";  // ffn_norm
+                post_ffw_norm_name       = "post_ffw_norm";             // ffn_post_norm
+            } else if (params.arch == LLMArch::GEMMA2_2B) {
+                post_attention_norm_name = "post_attention_layernorm";  // ffn_norm
+                pre_ffw_norm_name        = "pre_feedforward_layernorm";
+                post_ffw_norm_name       = "post_feedforward_layernorm";
+            } else if (params.arch == LLMArch::GPT_OSS_20B) {
+                pre_ffw_norm_name = "post_attention_norm";  // attn_post_norm
+            } else {
+                pre_ffw_norm_name = "post_attention_layernorm";  // ffn_norm
             }
-            pre_ffw_norm_name = params.arch == LLMArch::GPT_OSS_20B ? "post_attention_norm" : "post_attention_layernorm";
 
             blocks["self_attn"] = std::make_shared<Attention>(params);
             if (params.arch == LLMArch::GPT_OSS_20B) {
@@ -1447,6 +1484,21 @@ namespace LLM {
                 params.rope_thetas             = {1000000.f, 10000.f};
                 params.rope_scales             = {8.f, 1.f};
                 params.sliding_attention       = {1024, 1024, 1024, 1024, 1024, 0};
+            } else if (arch == LLMArch::GEMMA2_2B) {
+                params.head_dim                = 256;
+                params.num_heads               = 8;
+                params.num_kv_heads            = 4;
+                params.qkv_bias                = false;
+                params.qk_norm                 = false;
+                params.rms_norm_eps            = 1e-6f;
+                params.rms_norm_add            = true;
+                params.normalize_input         = true;
+                params.max_position_embeddings = 8192;
+                params.mlp_activation          = MLPActivation::GELU_TANH;
+                params.hidden_size             = 2304;
+                params.intermediate_size       = 9216;
+                params.num_layers              = 26;
+                params.vocab_size              = 256000;
             } else if (arch == LLMArch::GPT_OSS_20B) {
                 params.head_dim                = 64;
                 params.num_heads               = 64;
@@ -1585,6 +1637,7 @@ namespace LLM {
                 params.arch == LLMArch::MINISTRAL_3_3B ||
                 params.arch == LLMArch::QWEN3 ||
                 params.arch == LLMArch::GEMMA3_12B ||
+                params.arch == LLMArch::GEMMA2_2B ||
                 params.arch == LLMArch::GPT_OSS_20B) {
                 input_pos_vec.resize(n_tokens);
                 for (int i = 0; i < n_tokens; ++i) {

diff --git a/src/lora.hpp b/src/lora.hpp
@@ -91,7 +91,6 @@ struct LoraModel : public GGMLRunner {
             return false;
         }
 
-
         dry_run = false;
         model_loader.load_tensors(on_new_tensor_cb, n_threads);
-Original file line number
+Diff line change
@@ Expand Up / @@ -91,7 +91,6 @@ struct LoraModel : public GGMLRunner { @@
                 return false;
             }
             dry_run = false;
             model_loader.load_tensors(on_new_tensor_cb, n_threads);
@@ Expand Down @@