leejet · pwilkin · Apr 23, 2026 · Apr 24, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -265,6 +265,7 @@ target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)
 
 if (SD_BUILD_EXAMPLES)
     add_subdirectory(examples)
+    add_subdirectory(tests/ltx_parity)
 endif()
 
 set(SD_PUBLIC_HEADERS include/stable-diffusion.h)

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -319,6 +319,10 @@ ArgOptions SDContextParams::get_options() {
          "--qwen2vl_vision",
          "alias of --llm_vision. Deprecated.",
          &llm_vision_path},
+        {"",
+         "--gemma-tokenizer",
+         "path to Gemma's tokenizer.json (HF format). Required for LTX-2 text conditioning.",
+         &gemma_tokenizer_path},
         {"",
          "--diffusion-model",
          "path to the standalone diffusion model",
@@ -376,6 +380,25 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-t5-mask-pad",
          "t5 mask pad size of chroma",
          &chroma_t5_mask_pad},
+        {"",
+         "--fit-target",
+         "auto-fit: MiB of free memory to leave on each GPU (default: 512)",
+         &auto_fit_target_mb},
+        {"",
+         "--fit-compute-reserve-dit",
+         "auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
+         "(default: 2048, 0 keeps the built-in default)",
+         &auto_fit_compute_reserve_dit_mb},
+        {"",
+         "--fit-compute-reserve-vae",
+         "auto-fit: MiB reserved on the VAE's GPU for its compute buffer "
+         "(default: 1024, 0 keeps the built-in default)",
+         &auto_fit_compute_reserve_vae_mb},
+        {"",
+         "--fit-compute-reserve-cond",
+         "auto-fit: MiB reserved on the conditioner's GPU for its compute "
+         "buffer (default: 512, 0 keeps the built-in default)",
+         &auto_fit_compute_reserve_cond_mb},
     };
 
     options.float_options = {};
@@ -445,6 +468,16 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-enable-t5-mask",
          "enable t5 mask for chroma",
          true, &chroma_use_t5_mask},
+        {"",
+         "--auto-fit",
+         "automatically pick DiT/VAE/Conditioner device placements based on "
+         "free GPU memory (priority: DiT+compute > VAE > Conditioner; "
+         "overflow goes to CPU or DiT-params-offload mode)",
+         true, &auto_fit},
+        {"",
+         "--fit-dry-run",
+         "auto-fit: print the computed plan and exit without loading models",
+         true, &auto_fit_dry_run},
     };
 
     auto on_type_arg = [&](int argc, const char** argv, int index) {
@@ -638,6 +671,7 @@ std::string SDContextParams::to_string() const {
         << "  t5xxl_path: \"" << t5xxl_path << "\",\n"
         << "  llm_path: \"" << llm_path << "\",\n"
         << "  llm_vision_path: \"" << llm_vision_path << "\",\n"
+        << "  gemma_tokenizer_path: \"" << gemma_tokenizer_path << "\",\n"
         << "  diffusion_model_path: \"" << diffusion_model_path << "\",\n"
         << "  high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
         << "  vae_path: \"" << vae_path << "\",\n"
@@ -693,6 +727,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         t5xxl_path.c_str(),
         llm_path.c_str(),
         llm_vision_path.c_str(),
+        gemma_tokenizer_path.c_str(),
         diffusion_model_path.c_str(),
         high_noise_diffusion_model_path.c_str(),
         vae_path.c_str(),
@@ -727,6 +762,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         chroma_use_t5_mask,
         chroma_t5_mask_pad,
         qwen_image_zero_cond_t,
+        auto_fit,
+        auto_fit_target_mb,
+        auto_fit_dry_run,
+        auto_fit_compute_reserve_dit_mb,
+        auto_fit_compute_reserve_vae_mb,
+        auto_fit_compute_reserve_cond_mb,
     };
     return sd_ctx_params;
 }
@@ -2012,6 +2053,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
     params.strength                 = strength;
     params.seed                     = seed;
     params.video_frames             = video_frames;
+    params.fps                      = static_cast<float>(fps);
     params.vace_strength            = vace_strength;
     params.vae_tiling_params        = vae_tiling_params;
     params.cache                    = cache_params;

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -90,6 +90,7 @@ struct SDContextParams {
     std::string t5xxl_path;
     std::string llm_path;
     std::string llm_vision_path;
+    std::string gemma_tokenizer_path;
     std::string diffusion_model_path;
     std::string high_noise_diffusion_model_path;
     std::string vae_path;
@@ -127,6 +128,14 @@ struct SDContextParams {
 
     bool qwen_image_zero_cond_t = false;
 
+    // Auto-fit: pick DiT/VAE/Conditioner device placements from free GPU memory.
+    bool auto_fit                         = false;  // --auto-fit
+    int  auto_fit_target_mb               = 512;  // --fit-target: MiB of free memory to leave on each GPU
+    bool auto_fit_dry_run                 = false;  // --fit-dry-run: print the computed plan and exit without loading models
+    int  auto_fit_compute_reserve_dit_mb  = 0;  // --fit-compute-reserve-dit: MiB for the DiT compute buffer; 0 = use built-in default
+    int  auto_fit_compute_reserve_vae_mb  = 0;  // --fit-compute-reserve-vae: MiB for the VAE compute buffer; 0 = use built-in default
+    int  auto_fit_compute_reserve_cond_mb = 0;  // --fit-compute-reserve-cond: MiB for the conditioner compute buffer; 0 = use built-in default
+
     prediction_t prediction           = PREDICTION_COUNT;
     lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
 

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -76,6 +76,7 @@ enum prediction_t {
     FLOW_PRED,
     FLUX_FLOW_PRED,
     FLUX2_FLOW_PRED,
+    LTX2_FLOW_PRED,
     PREDICTION_COUNT
 };
 
@@ -169,6 +170,11 @@ typedef struct {
     const char* t5xxl_path;
     const char* llm_path;
     const char* llm_vision_path;
+    // Path to a HuggingFace-format tokenizer.json file. Currently only read by the
+    // LTX-2 Gemma 3 conditioner, which requires Gemma's tokenizer for BPE + metaspace
+    // encoding of prompts. If empty for LTX-2, the conditioner aborts with a clear
+    // message. Non-LTX-2 pipelines ignore this field.
+    const char* gemma_tokenizer_path;
     const char* diffusion_model_path;
     const char* high_noise_diffusion_model_path;
     const char* vae_path;
@@ -203,6 +209,20 @@ typedef struct {
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
+
+    // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
+    // When `auto_fit` is true, the CLI placement overrides (env vars,
+    // keep_*_on_cpu) are ignored and the plan is computed automatically.
+    // `auto_fit_target_mb` is the free memory, in MiB, to leave per GPU (default 512).
+    // `auto_fit_dry_run` prints the plan and aborts init before loading models.
+    // `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
+    // per-component compute-buffer reserve, in MiB; 0 means use the built-in default.
+    bool auto_fit;
+    int  auto_fit_target_mb;
+    bool auto_fit_dry_run;
+    int  auto_fit_compute_reserve_dit_mb;
+    int  auto_fit_compute_reserve_vae_mb;
+    int  auto_fit_compute_reserve_cond_mb;
 } sd_ctx_params_t;
 
 typedef struct {
@@ -332,6 +352,12 @@ typedef struct {
     float strength;
     int64_t seed;
     int video_frames;
+    // Output video fps. Carried through to models that use it for temporal
+    // positional embeddings — LTX-2's RoPE divides the time axis by fps
+    // (ltx_core/tools.py::VideoLatentTools.create_initial_state), so the
+    // default 24 on LTXRunner silently produces wrong positions at any
+    // other target fps. 0 means "don't override runner default".
+    float fps;
     float vace_strength;
     sd_tiling_params_t vae_tiling_params;
     sd_cache_params_t cache;