Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(tests/ltx_parity)
endif()

set(SD_PUBLIC_HEADERS include/stable-diffusion.h)
Expand Down
42 changes: 42 additions & 0 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,10 @@ ArgOptions SDContextParams::get_options() {
"--qwen2vl_vision",
"alias of --llm_vision. Deprecated.",
&llm_vision_path},
{"",
"--gemma-tokenizer",
"path to Gemma's tokenizer.json (HF format). Required for LTX-2 text conditioning.",
&gemma_tokenizer_path},
{"",
"--diffusion-model",
"path to the standalone diffusion model",
Expand Down Expand Up @@ -376,6 +380,25 @@ ArgOptions SDContextParams::get_options() {
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&chroma_t5_mask_pad},
{"",
"--fit-target",
"auto-fit: MiB of free memory to leave on each GPU (default: 512)",
&auto_fit_target_mb},
{"",
"--fit-compute-reserve-dit",
"auto-fit: MiB reserved on the DiT's GPU for its compute buffer "
"(default: 2048, 0 keeps the built-in default)",
&auto_fit_compute_reserve_dit_mb},
{"",
"--fit-compute-reserve-vae",
"auto-fit: MiB reserved on the VAE's GPU for its compute buffer "
"(default: 1024, 0 keeps the built-in default)",
&auto_fit_compute_reserve_vae_mb},
{"",
"--fit-compute-reserve-cond",
"auto-fit: MiB reserved on the conditioner's GPU for its compute "
"buffer (default: 512, 0 keeps the built-in default)",
&auto_fit_compute_reserve_cond_mb},
};

options.float_options = {};
Expand Down Expand Up @@ -445,6 +468,16 @@ ArgOptions SDContextParams::get_options() {
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &chroma_use_t5_mask},
{"",
"--auto-fit",
"automatically pick DiT/VAE/Conditioner device placements based on "
"free GPU memory (priority: DiT+compute > VAE > Conditioner; "
"overflow goes to CPU or DiT-params-offload mode)",
true, &auto_fit},
{"",
"--fit-dry-run",
"auto-fit: print the computed plan and exit without loading models",
true, &auto_fit_dry_run},
};

auto on_type_arg = [&](int argc, const char** argv, int index) {
Expand Down Expand Up @@ -638,6 +671,7 @@ std::string SDContextParams::to_string() const {
<< " t5xxl_path: \"" << t5xxl_path << "\",\n"
<< " llm_path: \"" << llm_path << "\",\n"
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
<< " gemma_tokenizer_path: \"" << gemma_tokenizer_path << "\",\n"
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
<< " vae_path: \"" << vae_path << "\",\n"
Expand Down Expand Up @@ -693,6 +727,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
t5xxl_path.c_str(),
llm_path.c_str(),
llm_vision_path.c_str(),
gemma_tokenizer_path.c_str(),
diffusion_model_path.c_str(),
high_noise_diffusion_model_path.c_str(),
vae_path.c_str(),
Expand Down Expand Up @@ -727,6 +762,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
chroma_use_t5_mask,
chroma_t5_mask_pad,
qwen_image_zero_cond_t,
auto_fit,
auto_fit_target_mb,
auto_fit_dry_run,
auto_fit_compute_reserve_dit_mb,
auto_fit_compute_reserve_vae_mb,
auto_fit_compute_reserve_cond_mb,
};
return sd_ctx_params;
}
Expand Down Expand Up @@ -2012,6 +2053,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
params.strength = strength;
params.seed = seed;
params.video_frames = video_frames;
params.fps = static_cast<float>(fps);
params.vace_strength = vace_strength;
params.vae_tiling_params = vae_tiling_params;
params.cache = cache_params;
Expand Down
9 changes: 9 additions & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ struct SDContextParams {
std::string t5xxl_path;
std::string llm_path;
std::string llm_vision_path;
std::string gemma_tokenizer_path;
std::string diffusion_model_path;
std::string high_noise_diffusion_model_path;
std::string vae_path;
Expand Down Expand Up @@ -127,6 +128,14 @@ struct SDContextParams {

bool qwen_image_zero_cond_t = false;

// Auto-fit: pick DiT/VAE/Conditioner device placements from free GPU memory.
bool auto_fit = false;
int auto_fit_target_mb = 512;
bool auto_fit_dry_run = false;
int auto_fit_compute_reserve_dit_mb = 0; // 0 = use header default
int auto_fit_compute_reserve_vae_mb = 0;
int auto_fit_compute_reserve_cond_mb = 0;

prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;

Expand Down
26 changes: 26 additions & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ enum prediction_t {
FLOW_PRED,
FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
LTX2_FLOW_PRED,
PREDICTION_COUNT
};

Expand Down Expand Up @@ -169,6 +170,11 @@ typedef struct {
const char* t5xxl_path;
const char* llm_path;
const char* llm_vision_path;
// Path to a HuggingFace-format tokenizer.json file. Currently only read by the
// LTX-2 Gemma 3 conditioner, which requires Gemma's tokenizer for BPE + metaspace
// encoding of prompts. If empty for LTX-2, the conditioner aborts with a clear
// message. Non-LTX-2 pipelines ignore this field.
const char* gemma_tokenizer_path;
const char* diffusion_model_path;
const char* high_noise_diffusion_model_path;
const char* vae_path;
Expand Down Expand Up @@ -203,6 +209,20 @@ typedef struct {
bool chroma_use_t5_mask;
int chroma_t5_mask_pad;
bool qwen_image_zero_cond_t;

// Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
// When `auto_fit` is true, the CLI placement overrides (env vars,
// keep_*_on_cpu) are ignored and the plan is computed automatically.
// `auto_fit_target_mb` is the amount of memory, in MiB, to leave free on
// each GPU (default 512).
// `auto_fit_dry_run` prints the computed plan and aborts init before any
// model is loaded.
// `auto_fit_compute_reserve_{dit,vae,cond}_mb` let the user tune the
// per-component compute-buffer reserve, in MiB; 0 means use the built-in
// default.
bool auto_fit;
int auto_fit_target_mb;
bool auto_fit_dry_run;
int auto_fit_compute_reserve_dit_mb;
int auto_fit_compute_reserve_vae_mb;
int auto_fit_compute_reserve_cond_mb;
} sd_ctx_params_t;

typedef struct {
Expand Down Expand Up @@ -332,6 +352,12 @@ typedef struct {
float strength;
int64_t seed;
int video_frames;
// Output video fps. Carried through to models that use it for temporal
// positional embeddings. LTX-2's RoPE divides the time axis by fps
// (ltx_core/tools.py::VideoLatentTools.create_initial_state), so
// LTXRunner's default of 24 silently produces wrong positions whenever
// the target fps is anything else. 0 means "don't override the runner
// default".
float fps;
float vace_strength;
sd_tiling_params_t vae_tiling_params;
sd_cache_params_t cache;
Expand Down
Loading