From 73b1ad5828eeb8cd9b650e26d7f8f7229a289a2d Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Thu, 14 Aug 2025 11:12:53 -0300 Subject: [PATCH 1/3] refactor: remove clip_skip persistent attribute All handlers are constructed with the default clip_skip value, and it is always set during inference time, so there isn't much point in keeping it as a persistent attribute. Instead, just propagate the parameter value down from get_learned_condition*. --- clip.hpp | 37 +++++++++--------------- conditioner.hpp | 67 +++++++++++++++----------------------------- stable-diffusion.cpp | 2 -- 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/clip.hpp b/clip.hpp index f92c9c2fa..9a8c271ff 100644 --- a/clip.hpp +++ b/clip.hpp @@ -678,12 +678,10 @@ class CLIPTextModel : public GGMLBlock { int32_t n_head = 12; int32_t n_layer = 12; // num_hidden_layers int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14 - int32_t clip_skip = -1; bool with_final_ln = true; CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, - bool with_final_ln = true, - int clip_skip_value = -1) + bool with_final_ln = true) : version(version), with_final_ln(with_final_ln) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1024; @@ -696,20 +694,12 @@ class CLIPTextModel : public GGMLBlock { n_head = 20; n_layer = 32; } - set_clip_skip(clip_skip_value); blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token)); blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size)); } - void set_clip_skip(int skip) { - if (skip <= 0) { - skip = -1; - } - clip_skip = skip; - } - struct ggml_tensor* get_token_embed_weight() { auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); return embeddings->get_token_embed_weight(); @@ -720,7 +710,8 @@ class CLIPTextModel : public GGMLBlock { struct ggml_tensor* input_ids, struct ggml_tensor* tkn_embeddings, size_t max_token_idx = 0, - bool return_pooled = false) { + bool return_pooled = false, + int clip_skip = -1) { // input_ids: [N, n_token] auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); @@ -888,9 +879,8 @@ struct CLIPTextModelRunner : public GGMLRunner { const String2GGMLType& tensor_types, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, - bool with_final_ln = true, - int clip_skip_value = -1) - : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) { + bool with_final_ln = true) + : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln) { model.init(params_ctx, tensor_types, prefix); } @@ -898,10 +888,6 @@ struct CLIPTextModelRunner : public GGMLRunner { return "clip"; } - void set_clip_skip(int clip_skip) { - model.set_clip_skip(clip_skip); - } - void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } @@ -911,7 +897,8 @@ struct CLIPTextModelRunner : public GGMLRunner { struct ggml_tensor* input_ids, struct ggml_tensor* embeddings, size_t max_token_idx = 0, - bool return_pooled = false) { + bool return_pooled = false, + int clip_skip = -1) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; if (input_ids->ne[0] > model.n_token) { @@ -919,14 +906,15 @@ struct CLIPTextModelRunner : public GGMLRunner { input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token); } - return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled); + return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); } struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, int num_custom_embeddings = 0, void* custom_embeddings_data = NULL, size_t max_token_idx = 0, - bool return_pooled = false) { + bool return_pooled = false, + int clip_skip = -1) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); @@ -945,7 +933,7 @@ struct CLIPTextModelRunner : public GGMLRunner { embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1); } - struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled); + struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); ggml_build_forward_expand(gf, hidden_states); @@ -958,10 +946,11 @@ struct CLIPTextModelRunner : public GGMLRunner { void* custom_embeddings_data, size_t max_token_idx, bool return_pooled, + int clip_skip, ggml_tensor** output, ggml_context* output_ctx = NULL) { auto get_graph = [&]() -> struct ggml_cgraph* { - return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled); + return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); } diff --git a/conditioner.hpp b/conditioner.hpp index cfd2b4ca7..d53de57ee 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -61,8 +61,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { const String2GGMLType& tensor_types, const std::string& embd_dir, SDVersion version = VERSION_SD1, - PMVersion pv = PM_VERSION_1, - int clip_skip = -1) + PMVersion pv = PM_VERSION_1) : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { if (sd_version_is_sd1(version)) { text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); @@ -72,20 +71,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 1; - if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) { - clip_skip = 2; - } - } - text_model->set_clip_skip(clip_skip); - if (sd_version_is_sdxl(version)) { - text_model2->set_clip_skip(clip_skip); - } } void get_param_tensors(std::map& tensors) { @@ -412,7 +397,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int height, int adm_in_channels = -1, bool zero_out_masked = false) { - set_clip_skip(clip_skip); int64_t t0 = ggml_time_ms(); struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size] struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] @@ -421,6 +405,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { struct ggml_tensor* pooled = NULL; std::vector hidden_states_vec; + if (clip_skip <= 0) { + clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1; + } + size_t chunk_len = 77; size_t chunk_count = tokens.size() / chunk_len; for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { @@ -455,6 +443,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, false, + clip_skip, &chunk_hidden_states1, work_ctx); if (sd_version_is_sdxl(version)) { @@ -464,6 +453,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, false, + clip_skip, &chunk_hidden_states2, work_ctx); // concat chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0); @@ -475,6 +465,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, true, + clip_skip, &pooled, work_ctx); } @@ -669,21 +660,11 @@ struct SD3CLIPEmbedder : public Conditioner { SD3CLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, - const String2GGMLType& tensor_types = {}, - int clip_skip = -1) + const String2GGMLType& tensor_types = {}) : clip_g_tokenizer(0) { clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 2; - } - clip_l->set_clip_skip(clip_skip); - clip_g->set_clip_skip(clip_skip); } void get_param_tensors(std::map& tensors) { @@ -780,7 +761,6 @@ struct SD3CLIPEmbedder : public Conditioner { std::vector, std::vector>> token_and_weights, int clip_skip, bool zero_out_masked = false) { - set_clip_skip(clip_skip); auto& clip_l_tokens = token_and_weights[0].first; auto& clip_l_weights = token_and_weights[0].second; auto& clip_g_tokens = token_and_weights[1].first; @@ -788,6 +768,10 @@ struct SD3CLIPEmbedder : public Conditioner { auto& t5_tokens = token_and_weights[2].first; auto& t5_weights = token_and_weights[2].second; + if (clip_skip <= 0) { + clip_skip = 2; + } + int64_t t0 = ggml_time_ms(); struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096] struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096] @@ -818,6 +802,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, false, + clip_skip, &chunk_hidden_states_l, work_ctx); { @@ -845,6 +830,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, true, + clip_skip, &pooled_l, work_ctx); } @@ -866,6 +852,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, false, + clip_skip, &chunk_hidden_states_g, work_ctx); @@ -894,6 +881,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, true, + clip_skip, &pooled_g, work_ctx); } @@ -1017,18 +1005,9 @@ struct FluxCLIPEmbedder : public Conditioner { FluxCLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, - const String2GGMLType& tensor_types = {}, - int clip_skip = -1) { + const String2GGMLType& tensor_types = {}) { clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 2; - } - clip_l->set_clip_skip(clip_skip); } void get_param_tensors(std::map& tensors) { @@ -1109,12 +1088,15 @@ struct FluxCLIPEmbedder : public Conditioner { std::vector, std::vector>> token_and_weights, int clip_skip, bool zero_out_masked = false) { - set_clip_skip(clip_skip); auto& clip_l_tokens = token_and_weights[0].first; auto& clip_l_weights = token_and_weights[0].second; auto& t5_tokens = token_and_weights[1].first; auto& t5_weights = token_and_weights[1].second; + if (clip_skip <= 0) { + clip_skip = 2; + } + int64_t t0 = ggml_time_ms(); struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] @@ -1143,6 +1125,7 @@ struct FluxCLIPEmbedder : public Conditioner { NULL, max_token_idx, true, + clip_skip, &pooled, work_ctx); } @@ -1241,7 +1224,6 @@ struct T5CLIPEmbedder : public Conditioner { T5CLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}, - int clip_skip = -1, bool use_mask = false, int mask_pad = 1, bool is_umt5 = false) @@ -1249,9 +1231,6 @@ struct T5CLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); } - void set_clip_skip(int clip_skip) { - } - void get_param_tensors(std::map& tensors) { t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index db4e07cb0..5f9dec009 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -373,7 +373,6 @@ class StableDiffusionGGML { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, model_loader.tensor_storages_types, - -1, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else { @@ -391,7 +390,6 @@ class StableDiffusionGGML { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, model_loader.tensor_storages_types, - -1, true, 1, true); From 489069c70306e33c8f2a7bca6e88d258319b7464 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Thu, 14 Aug 2025 11:50:00 -0300 Subject: [PATCH 2/3] feat: reduce CLIP memory usage with no embeddings The CLIP weights need to be converted to f32 for textual inversions (fbd42b6fc16d14fbd362993fa1d083740a05f113), but that increases the amount of allocated VRAM even when embeddings aren't being used. --- clip.hpp | 22 ++++++++++++++++------ conditioner.hpp | 9 +++++---- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/clip.hpp b/clip.hpp index 9a8c271ff..15e8c818a 100644 --- a/clip.hpp +++ b/clip.hpp @@ -548,9 +548,15 @@ class CLIPEmbeddings : public GGMLBlock { int64_t embed_dim; int64_t vocab_size; int64_t num_positions; + bool force_clip_f32; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { enum ggml_type token_wtype = GGML_TYPE_F32; + if (!force_clip_f32) { + auto tensor_type = tensor_types.find(prefix + "token_embedding.weight"); + if (tensor_type != tensor_types.end()) + token_wtype = tensor_type->second; + } enum ggml_type position_wtype = GGML_TYPE_F32; params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size); @@ -560,10 +566,12 @@ class CLIPEmbeddings : public GGMLBlock { public: CLIPEmbeddings(int64_t embed_dim, int64_t vocab_size = 49408, - int64_t num_positions = 77) + int64_t num_positions = 77, + bool force_clip_f32 = false) : embed_dim(embed_dim), vocab_size(vocab_size), - num_positions(num_positions) { + num_positions(num_positions), + force_clip_f32(force_clip_f32) { } struct ggml_tensor* get_token_embed_weight() { @@ -681,7 +689,8 @@ class CLIPTextModel : public GGMLBlock { bool with_final_ln = true; CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, - bool with_final_ln = true) + bool with_final_ln = true, + bool force_clip_f32 = false) : version(version), with_final_ln(with_final_ln) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1024; @@ -695,7 +704,7 @@ class CLIPTextModel : public GGMLBlock { n_layer = 32; } - blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token)); + blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32)); blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size)); } @@ -879,8 +888,9 @@ struct CLIPTextModelRunner : public GGMLRunner { const String2GGMLType& tensor_types, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, - bool with_final_ln = true) - : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln) { + bool with_final_ln = true, + bool force_clip_f32 = false) + : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) { model.init(params_ctx, tensor_types, prefix); } diff --git a/conditioner.hpp b/conditioner.hpp index d53de57ee..bda99dfc2 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -63,13 +63,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { SDVersion version = VERSION_SD1, PMVersion pv = PM_VERSION_1) : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { + bool force_clip_f32 = embd_dir.size() > 0; if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); + text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); } } From 0218169092294d489be96056153f5cc58cd7b45e Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 14 Sep 2025 12:07:20 +0800 Subject: [PATCH 3/3] format code --- clip.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clip.hpp b/clip.hpp index 15e8c818a..bde8a78a5 100644 --- a/clip.hpp +++ b/clip.hpp @@ -551,7 +551,7 @@ class CLIPEmbeddings : public GGMLBlock { bool force_clip_f32; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { - enum ggml_type token_wtype = GGML_TYPE_F32; + enum ggml_type token_wtype = GGML_TYPE_F32; if (!force_clip_f32) { auto tensor_type = tensor_types.find(prefix + "token_embedding.weight"); if (tensor_type != tensor_types.end())