From 73b1ad5828eeb8cd9b650e26d7f8f7229a289a2d Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:12:53 -0300
Subject: [PATCH 1/3] refactor: remove clip_skip persistent attribute

All handlers are constructed with the default clip_skip value, and
it is always set during inference time, so there isn't much point
in keeping it as a persistent attribute. Instead, just propagate
the parameter value down from get_learned_condition*.
---
 clip.hpp             | 37 +++++++++---------------
 conditioner.hpp      | 67 +++++++++++++++-----------------------------
 stable-diffusion.cpp |  2 --
 3 files changed, 36 insertions(+), 70 deletions(-)
diff --git a/clip.hpp b/clip.hpp
index f92c9c2fa..9a8c271ff 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -678,12 +678,10 @@ class CLIPTextModel : public GGMLBlock {
     int32_t n_head            = 12;
     int32_t n_layer           = 12;    // num_hidden_layers
     int32_t projection_dim    = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip         = -1;
     bool with_final_ln        = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln  = true,
-                  int clip_skip_value = -1)
+                  bool with_final_ln  = true)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size       = 1024;
@@ -696,20 +694,12 @@ class CLIPTextModel : public GGMLBlock {
             n_head            = 20;
             n_layer           = 32;
         }
-        set_clip_skip(clip_skip_value);
 
         blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
         blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -720,7 +710,8 @@ class CLIPTextModel : public GGMLBlock {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
         // input_ids: [N, n_token]
         auto embeddings       = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder          = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -888,9 +879,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln  = true,
-                        int clip_skip_value = -1)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
+                        bool with_final_ln  = true)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
@@ -898,10 +888,6 @@ struct CLIPTextModelRunner : public GGMLRunner {
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -911,7 +897,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
         size_t N       = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
@@ -919,14 +906,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings    = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx         = 0,
-                                    bool return_pooled           = false) {
+                                    bool return_pooled           = false,
+                                    int clip_skip                = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -945,7 +933,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -958,10 +946,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index cfd2b4ca7..d53de57ee 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -61,8 +61,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv      = PM_VERSION_1,
-                                      int clip_skip     = -1)
+                                      PMVersion pv      = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
@@ -72,20 +71,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
-        }
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -412,7 +397,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                              int height,
                                              int adm_in_channels  = -1,
                                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         int64_t t0                               = ggml_time_ms();
         struct ggml_tensor* hidden_states        = NULL;  // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states  = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +405,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         struct ggml_tensor* pooled               = NULL;
         std::vector<float> hidden_states_vec;
 
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         size_t chunk_len   = 77;
         size_t chunk_count = tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -455,6 +443,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                     token_embed_custom.data(),
                                     max_token_idx,
                                     false,
+                                    clip_skip,
                                     &chunk_hidden_states1,
                                     work_ctx);
                 if (sd_version_is_sdxl(version)) {
@@ -464,6 +453,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          token_embed_custom.data(),
                                          max_token_idx,
                                          false,
+                                         clip_skip,
                                          &chunk_hidden_states2, work_ctx);
                     // concat
                     chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -475,6 +465,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                              token_embed_custom.data(),
                                              max_token_idx,
                                              true,
+                                             clip_skip,
                                              &pooled,
                                              work_ctx);
                     }
@@ -669,21 +660,11 @@ struct SD3CLIPEmbedder : public Conditioner {
 
     SD3CLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip                       = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -780,7 +761,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                              int clip_skip,
                                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens  = token_and_weights[1].first;
@@ -788,6 +768,10 @@ struct SD3CLIPEmbedder : public Conditioner {
         auto& t5_tokens      = token_and_weights[2].first;
         auto& t5_weights     = token_and_weights[2].second;
 
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0                                 = ggml_time_ms();
         struct ggml_tensor* hidden_states          = NULL;  // [N, n_token*2, 4096]
         struct ggml_tensor* chunk_hidden_states    = NULL;  // [n_token*2, 4096]
@@ -818,6 +802,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_l,
                                 work_ctx);
                 {
@@ -845,6 +830,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_l,
                                     work_ctx);
                 }
@@ -866,6 +852,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_g,
                                 work_ctx);
 
@@ -894,6 +881,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_g,
                                     work_ctx);
                 }
@@ -1017,18 +1005,9 @@ struct FluxCLIPEmbedder : public Conditioner {
 
     FluxCLIPEmbedder(ggml_backend_t backend,
                      bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip                       = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1109,12 +1088,15 @@ struct FluxCLIPEmbedder : public Conditioner {
                                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                              int clip_skip,
                                              bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens      = token_and_weights[1].first;
         auto& t5_weights     = token_and_weights[1].second;
 
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0                              = ggml_time_ms();
         struct ggml_tensor* hidden_states       = NULL;  // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, 4096]
@@ -1143,6 +1125,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled,
                                 work_ctx);
             }
@@ -1241,7 +1224,6 @@ struct T5CLIPEmbedder : public Conditioner {
     T5CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
-                   int clip_skip                       = -1,
                    bool use_mask                       = false,
                    int mask_pad                        = 1,
                    bool is_umt5                        = false)
@@ -1249,9 +1231,6 @@ struct T5CLIPEmbedder : public Conditioner {
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index db4e07cb0..5f9dec009 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -373,7 +373,6 @@ class StableDiffusionGGML {
                     cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                         offload_params_to_cpu,
                                                                         model_loader.tensor_storages_types,
-                                                                        -1,
                                                                         sd_ctx_params->chroma_use_t5_mask,
                                                                         sd_ctx_params->chroma_t5_mask_pad);
                 } else {
@@ -391,7 +390,6 @@ class StableDiffusionGGML {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,
-                                                                    -1,
                                                                     true,
                                                                     1,
                                                                     true);

From 489069c70306e33c8f2a7bca6e88d258319b7464 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:50:00 -0300
Subject: [PATCH 2/3] feat: reduce CLIP memory usage with no embeddings

The CLIP weights need to be converted to f32 for textual inversions
(fbd42b6fc16d14fbd362993fa1d083740a05f113), but that increases the
amount of allocated VRAM even when embeddings aren't being used.
---
 clip.hpp        | 22 ++++++++++++++++------
 conditioner.hpp |  9 +++++----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 9a8c271ff..15e8c818a 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -548,9 +548,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype    = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -681,7 +689,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln        = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln  = true)
+                  bool with_final_ln  = true,
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size       = 1024;
@@ -695,7 +704,7 @@ class CLIPTextModel : public GGMLBlock {
             n_layer           = 32;
         }
 
-        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
@@ -879,8 +888,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln  = true)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln) {
+                        bool with_final_ln  = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
diff --git a/conditioner.hpp b/conditioner.hpp
index d53de57ee..bda99dfc2 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -63,13 +63,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv      = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 

From 0218169092294d489be96056153f5cc58cd7b45e Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Sun, 14 Sep 2025 12:07:20 +0800
Subject: [PATCH 3/3] format code

---
 clip.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clip.hpp b/clip.hpp
index 15e8c818a..bde8a78a5 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -551,7 +551,7 @@ class CLIPEmbeddings : public GGMLBlock {
     bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
-        enum ggml_type token_wtype    = GGML_TYPE_F32;
+        enum ggml_type token_wtype = GGML_TYPE_F32;
         if (!force_clip_f32) {
             auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
             if (tensor_type != tensor_types.end())