From d5fe4e81bd447124836ecfb47d794f8768665b9f Mon Sep 17 00:00:00 2001 From: frob Date: Sat, 26 Apr 2025 10:10:20 +0200 Subject: [PATCH 1/4] grammar : handle maxItems == 0 in JSON schema (#13117) Co-authored-by: Richard Lyons --- common/json-schema-to-grammar.cpp | 3 +++ examples/json_schema_to_grammar.py | 3 +++ .../public_legacy/json-schema-to-grammar.mjs | 3 +++ tests/test-json-schema-to-grammar.cpp | 16 ++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 9067982257120..5b3059c2f774f 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -16,6 +16,9 @@ using json = nlohmann::ordered_json; static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); + if (max_items == 0) { + return ""; + } if (min_items == 0 && max_items == 1) { return item_rule + "?"; } diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 55f94c0b0a864..ed379585546c2 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -10,6 +10,9 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): + if max_items == 0: + return "" + if min_items == 0 and max_items == 1: return f'{item_rule}?' diff --git a/examples/server/public_legacy/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs index f767ce7b72008..b12bf2ab0909a 100644 --- a/examples/server/public_legacy/json-schema-to-grammar.mjs +++ b/examples/server/public_legacy/json-schema-to-grammar.mjs @@ -2,6 +2,9 @@ const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { + if (maxItems == 0) { + return ''; + } if (minItems === 0 && maxItems === 1) { return `${itemRule}?`; } diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index e35134f3cb063..38cf01d6d8dfb 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -597,6 +597,22 @@ static void test_all(const std::string & lang, std::function Date: Sat, 26 Apr 2025 22:05:31 +0800 Subject: [PATCH 2/4] ggml: move fp16/bf16 conversion optimizations to CPU backend + export conversion APIs (#13107) * ggml: dynamic x86_64 feature detection for FP32 <-> FP16/BF16 conversion * move fp converter to ggml-cpu * Switch ggml_compute_forward_get_rows_f16/bf16 to new ggml_cpu_fp16/bf16_to_fp32 --- ggml/include/ggml-cpu.h | 5 ++ ggml/src/ggml-cpu/ggml-cpu.c | 91 +++++++++++++++++++++++++++++++++++- ggml/src/ggml-cpu/ops.cpp | 4 +- ggml/src/ggml.c | 51 ++------------------ 4 files changed, 101 insertions(+), 50 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index f5e11f1e10002..de77a875ec533 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -133,6 +133,11 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index dbad8f61a1e92..64405449e2467 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_F16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, @@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_K, }, [GGML_TYPE_BF16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m512 x_vec = _mm512_loadu_ps(x + i); + __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm256_storeu_si256((__m256i *)(y + i), y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for (; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + +void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i)); + __m512 y_vec = _mm512_cvtph_ps(x_vec); + _mm512_storeu_ps(y + i, y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i)); + __m256 y_vec = _mm256_cvtph_ps(x_vec); + _mm256_storeu_ps(y + i, y_vec); + } + for (; i + 3 < n; i += 4) { + __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i)); + __m128 y_vec = _mm_cvtph_ps(x_vec); + _mm_storeu_ps(y + i, y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) { + int64_t i = 0; + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_BF16(x[i]); + } +} + +void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__AVX2__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } +#endif + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } +#endif + for (; i < n; i++) { + y[i] = GGML_BF16_TO_FP32(x[i]); + } +} int ggml_cpu_has_avx(void) { #if defined(__AVX__) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3c2adb217267b..7413192b746b6 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_fp16_to_fp32_row( + ggml_cpu_fp16_to_fp32( (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_bf16_to_fp32_row( + ggml_cpu_bf16_to_fp32( (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2a39dc7bfd125..7654ae1779b1d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4,6 +4,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-threading.h" +#include "ggml-cpu.h" #include "ggml.h" // FIXME: required here for quantization functions @@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } -// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library -// currently, the ggml_cpu_has_* functions are entirely compile-time void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { - int64_t i = 0; -#if defined(__F16C__) - //if (ggml_cpu_has_f16c()) { - for (; i + 7 < n; i += 8) { - __m256 x_vec = _mm256_loadu_ps(x + i); - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storeu_si128((__m128i *)(y + i), y_vec); - } - for(; i + 3 < n; i += 4) { - __m128 x_vec = _mm_loadu_ps(x + i); - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storel_epi64((__m128i *)(y + i), y_vec); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } } void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { - int64_t i = 0; -#if defined(__AVX512F__) - //if (ggml_cpu_has_avx512()) { - for (; i + 16 <= n; i += 16) { - _mm512_storeu_ps(y + i, - _mm512_castsi512_ps( - _mm512_slli_epi32( - _mm512_cvtepu16_epi32( - _mm256_loadu_si256( - (const __m256i *)(x + i))), - 16))); - } - //} -#endif -#if defined(__AVX2__) - //if (ggml_cpu_has_avx2()) { - for (; i + 8 <= n; i += 8) { - _mm256_storeu_ps(y + i, - _mm256_castsi256_ps( - _mm256_slli_epi32( - _mm256_cvtepu16_epi32( - _mm_loadu_si128( - (const __m128i *)(x + i))), - 16))); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_BF16_TO_FP32(x[i]); } } From 4753791e70acd4d4e02f2098f14a03df26c992bd Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 26 Apr 2025 22:39:47 +0200 Subject: [PATCH 3/4] clip : improve projector naming (#13118) * clip : improve projector naming * no more kv has_llava_projector * rm unused kv * rm more unused --- examples/llava/clip-impl.h | 27 +- examples/llava/clip.cpp | 512 ++++++++++++++++--------------------- examples/llava/clip.h | 2 - 3 files changed, 235 insertions(+), 306 deletions(-) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 53ac381304765..16d0a8efc56ae 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -17,22 +17,15 @@ #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" -#define KEY_HAS_GLM_PROJ "clip.has_glm_projector" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" -#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" #define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_SILU "clip.use_silu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_EMBD "clip.vision.embedding_length" +#define KEY_N_FF "clip.vision.feed_forward_length" +#define KEY_N_BLOCK "clip.vision.block_count" +#define KEY_N_HEAD "clip.vision.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.vision.projection_dim" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" @@ -96,9 +89,9 @@ enum projector_type { PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, - PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_MINICPMV, PROJECTOR_TYPE_GLM_EDGE, - PROJECTOR_TYPE_MERGER, + PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, @@ -109,9 +102,9 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_RESAMPLER, "resampler"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index da8a590f0e563..e8c01c68a9779 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -308,13 +308,8 @@ struct clip_vision_model { }; struct clip_ctx { - bool has_text_encoder = false; - bool has_vision_encoder = false; bool has_llava_projector = false; - bool has_minicpmv_projector = false; - bool has_glm_projector = false; - bool has_qwen2vl_merger = false; - int minicpmv_version = 2; + int minicpmv_version = 0; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -373,23 +368,20 @@ struct clip_ctx { } }; -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; + int image_size_width = img.nx; + int image_size_height = img.ny; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), @@ -621,15 +613,14 @@ static ggml_tensor * build_rope_2d( return cur; } -static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL); - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 - int image_size_width = imgs.entries[0]->nx; - int image_size_height = imgs.entries[0]->ny; + int image_size_width = img.nx; + int image_size_height = img.ny; const int patch_size = hparams.patch_size; const int n_patches_x = image_size_width / patch_size; @@ -772,18 +763,14 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i } static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); - return nullptr; - } - const auto & model = ctx->vision_model; const auto & hparams = model.hparams; const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { + + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); image_size_width = load_image_size.width; image_size_height = load_image_size.height; @@ -792,7 +779,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } - else if (ctx->has_qwen2vl_merger) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { // use the image's native resolution when image is avaible if (is_inf) { // if (imgs->data->nx && imgs->data->ny) { @@ -800,12 +788,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } + const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int patches_w = image_size_width / patch_size; const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; + const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; @@ -814,7 +803,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im const int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } @@ -835,8 +826,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - if (ctx->has_qwen2vl_merger) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); GGML_ASSERT(image_size_height % (patch_size * 2) == 0); auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); @@ -865,29 +856,26 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * embeddings = inp; struct ggml_tensor * pos_embed = nullptr; - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); ggml_set_name(positions, "positions"); ggml_set_input(positions); - if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding + if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); } - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; if (ctx->minicpmv_version == 2) { @@ -941,7 +929,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { Q = ggml_rope_multi( ctx0, Q, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -953,7 +941,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { K = ggml_rope_multi( ctx0, K, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -1218,106 +1206,98 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } } // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + int hidden_size = 4096; + const int d_head = 128; + int n_head = hidden_size/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + hidden_size = 4096; + n_head = hidden_size/d_head; + num_query = 96; } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + else if (ctx->minicpmv_version == 3) { + hidden_size = 3584; + n_head = hidden_size/d_head; + num_query = 64; } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); + else if (ctx->minicpmv_version == 4) { + hidden_size = 3584; + n_head = hidden_size/d_head; + num_query = 64; } - { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); } - else { - GGML_ASSERT(false); + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); } + // glm projector - else if (ctx->has_glm_projector) { - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - //GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } else { - GGML_ABORT("fatal error"); + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); + embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + struct ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_silu_inplace(ctx0, embeddings); + embeddings = ggml_mul(ctx0, embeddings,x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); } } - else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); @@ -1343,11 +1323,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: { - res = clip_image_build_graph_siglip(ctx, imgs); + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]); } break; case PROJECTOR_TYPE_PIXTRAL: { - res = clip_image_build_graph_pixtral(ctx, imgs); + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]); } break; default: { @@ -1419,8 +1401,8 @@ struct clip_model_loader { auto & hparams = ctx_clip.vision_model.hparams; // projector type + std::string proj_type; { - std::string proj_type; get_string(KEY_PROJ_TYPE, proj_type, false); if (!proj_type.empty()) { ctx_clip.proj_type = clip_projector_type_from_string(proj_type); @@ -1432,33 +1414,27 @@ struct clip_model_loader { // other hparams { - get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false); - get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false); - GGML_ASSERT(ctx_clip.has_vision_encoder); - GGML_ASSERT(!ctx_clip.has_text_encoder); - - // legacy keys, use KEY_PROJ_TYPE instead - get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false); - get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false); get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); - get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false); - get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false); - // !!! do NOT extend the list above, use KEY_PROJ_TYPE instead get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); - get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); - get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); - get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer); - get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim); - get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps); - get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_u32(KEY_N_EMBD, hparams.hidden_size); + get_u32(KEY_N_HEAD, hparams.n_head); + get_u32(KEY_N_FF, hparams.n_intermediate); + get_u32(KEY_N_BLOCK, hparams.n_layer); + get_u32(KEY_PROJ_DIM, hparams.projection_dim); + get_f32(KEY_LAYER_NORM_EPS, hparams.eps); + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP + || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM + || ctx_clip.proj_type == PROJECTOR_TYPE_LDP + || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2; + { std::string mm_patch_merge_type; get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); @@ -1491,32 +1467,56 @@ struct clip_model_loader { for (auto & layer : vision_feature_layer) { hparams.vision_feature_layer.insert(layer); } + // Calculate the deepest feature layer based on hparams and projector type - ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip); + // NOTE: This is only used by build_graph_legacy() + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int n_layer = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV + || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) { + n_layer += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; + } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + if (ctx_clip.minicpmv_version == 0) { + ctx_clip.minicpmv_version = 2; // default to 2 if not set + } + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + hparams.rope_theta = 10000.0f; + } break; + default: + break; + } - LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder); - LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder); - LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector); - LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector); + LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); - LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } - - // model-specific params - switch (ctx_clip.proj_type) { - case PROJECTOR_TYPE_IDEFICS3: - { - get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); - } break; - case PROJECTOR_TYPE_PIXTRAL: - { - hparams.rope_theta = 10000.0f; - } break; - default: - break; - } } void load_tensors() { @@ -1569,9 +1569,6 @@ struct clip_model_loader { vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - if (vision_model.patch_embeddings_1 == nullptr) { - ctx_clip.has_qwen2vl_merger = false; - } vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); @@ -1669,7 +1666,7 @@ struct clip_model_loader { vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); } break; - case PROJECTOR_TYPE_RESAMPLER: + case PROJECTOR_TYPE_MINICPMV: { // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); @@ -1702,7 +1699,7 @@ struct clip_model_loader { vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); } break; - case PROJECTOR_TYPE_MERGER: + case PROJECTOR_TYPE_QWEN2VL: { vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); @@ -2479,11 +2476,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_size original_size{img->nx, img->ny}; bool pad_to_square = true; auto & params = ctx->vision_model.hparams; @@ -2504,7 +2496,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } return true; } - else if (ctx->has_qwen2vl_merger) { + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { clip_image_u8 resized; auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; @@ -2518,7 +2510,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); return true; } - else if (ctx->has_glm_projector + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; @@ -2646,7 +2638,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { if (ctx->minicpmv_version == 2) { n_patches = 96; } @@ -2656,7 +2648,10 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i else if (ctx->minicpmv_version == 4) { n_patches = 64; } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + else { + GGML_ABORT("Unknown minicpmv version"); + } + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); @@ -2761,11 +2756,6 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co } bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_f32_batch imgs; clip_image_f32_ptr img_copy(clip_image_f32_init()); *img_copy = *img; @@ -2776,20 +2766,11 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; - - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector) { - GGML_ASSERT(batch_size == 1); // TODO: support multiple images - } - if (ctx->has_minicpmv_projector) { - GGML_ASSERT(batch_size == 1); - } - if (ctx->has_glm_projector) { + + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } @@ -2799,21 +2780,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs - const auto & model = ctx->vision_model; + const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - // TODO @ngxson : this is ugly, need to refactor later - bool support_dynamic_size = ctx->has_minicpmv_projector - || ctx->has_qwen2vl_merger - || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL; + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (support_dynamic_size) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 1 : 0); @@ -2839,14 +2811,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs.entries.size(); i++) { const int nx = imgs.entries[i]->nx; const int ny = imgs.entries[i]->ny; - - if (ctx->has_glm_projector - || ctx->has_llava_projector - || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - GGML_ASSERT(nx == image_size && ny == image_size); - } - const int n = nx * ny; for (int b = 0; b < batch_size; b++) { @@ -2864,13 +2828,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); } - if (ctx->has_minicpmv_projector) { + + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { { // inspired from siglip: // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + std::vector pos_data(ggml_nelements(positions)); + int * data = pos_data.data(); int bucket_coords_h[1024]; int bucket_coords_w[1024]; for (int i = 0; i < pos_h; i++){ @@ -2881,11 +2847,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } for (int i = 0, id = 0; i < pos_h; i++){ for (int j = 0; j < pos_w; j++){ - positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; } } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); } { @@ -2903,30 +2868,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else if (ctx->minicpmv_version == 4) { embed_dim = 3584; } + else { + GGML_ABORT("Unknown minicpmv version"); + } + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;i < pos_w * pos_h; ++i){ - for(int j=0; j < embed_dim; ++j){ - pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; + std::vector pos_data(ggml_nelements(pos_embed)); + float * data = pos_data.data(); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + data[i * embed_dim + j] = pos_embed_t[i][j]; } } - ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); - free(pos_embed_data); + ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed)); } } else { - if (model.class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + // non-minicpmv models - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } - - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); const int pw = image_size_width / patch_size; @@ -2978,6 +2941,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); } else { + // llava and other models struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); int* positions_data = (int*)malloc(ggml_nbytes(positions)); @@ -2987,7 +2951,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); - if (!ctx->has_glm_projector) { + if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) { struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); // The patches vector is used to get rows to index into the embeds with; // we should skip dim 0 only if we have CLS to avoid going out of bounds @@ -3166,7 +3130,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_2_b->ne[0]; case PROJECTOR_TYPE_MLP_NORM: return ctx->vision_model.mm_3_b->ne[0]; - case PROJECTOR_TYPE_RESAMPLER: + case PROJECTOR_TYPE_MINICPMV: if (ctx->minicpmv_version == 2) { return 4096; } else if (ctx->minicpmv_version == 3) { @@ -3174,36 +3138,33 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } else if (ctx->minicpmv_version == 4) { return 3584; } - break; // Should not happen if version is valid + GGML_ABORT("Unknown minicpmv version"); case PROJECTOR_TYPE_GLM_EDGE: return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - case PROJECTOR_TYPE_MERGER: + case PROJECTOR_TYPE_QWEN2VL: return ctx->vision_model.mm_1_b->ne[0]; case PROJECTOR_TYPE_GEMMA3: return ctx->vision_model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->vision_model.projection->ne[1]; default: - break; // Fall through to throw + GGML_ABORT("Unknown projector type"); } - - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { return ctx->minicpmv_version; } return 0; } bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->has_glm_projector; + return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->has_qwen2vl_merger; + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL; } bool clip_is_llava(const struct clip_ctx * ctx) { @@ -3214,29 +3175,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; } -// Determine the number of encoder layers to iterate over -int get_deepest_feature_layer(const struct clip_ctx * ctx) { - // Get the index of the second to last layer; this is the - // default for models that have a llava projector - const auto & hparams = ctx->vision_model.hparams; - int n_layer = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - // Handle other projectors; incrementing here indicates that we - // should use the last encoder layer for the vision features. - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; -} - bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 5fc45d3e23904..6ba42ad892146 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -114,8 +114,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); -CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); - CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); From 2d451c80590b9ac250322769ac13d3b4870dbcf7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 26 Apr 2025 22:58:12 +0200 Subject: [PATCH 4/4] common : add common_remote_get_content (#13123) * common : add common_remote_get_content * support max size and timeout * add tests --- common/arg.cpp | 105 ++++++++++++++++++++++++++------------ common/arg.h | 9 ++++ tests/test-arg-parser.cpp | 47 +++++++++++++++++ 3 files changed, 127 insertions(+), 34 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0657553e4e9cf..de173159f4a76 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -162,6 +162,10 @@ struct common_hf_file_res { #ifdef LLAMA_USE_CURL +bool common_has_curl() { + return true; +} + #ifdef __linux__ #include #elif defined(_WIN32) @@ -527,64 +531,89 @@ static bool common_download_model( return true; } -/** - * Allow getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. - */ -static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { - auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); - } - - // fetch model info from Hugging Face Hub API +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); curl_slist_ptr http_headers; - std::string res_str; + std::vector res_buffer; - std::string model_endpoint = get_model_endpoint(); - - std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag; curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - static_cast(data)->append((char * ) ptr, size * nmemb); + auto data_vec = static_cast *>(data); + data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb); return size * nmemb; }; curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer); #if defined(_WIN32) curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); #endif - if (!bearer_token.empty()) { - std::string auth_header = "Authorization: Bearer " + bearer_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + if (params.timeout > 0) { + curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout); + } + if (params.max_size > 0) { + curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size); } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + for (const auto & header : params.headers) { + http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str()); + } curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); CURLcode res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to HF API"); + std::string error_msg = curl_easy_strerror(res); + throw std::runtime_error("error: cannot make GET request: " + error_msg); } long res_code; - std::string ggufFile = ""; - std::string mmprojFile = ""; curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + + return { res_code, std::move(res_buffer) }; +} + +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag; + + // headers + std::vector headers; + headers.push_back("Accept: application/json"); + if (!bearer_token.empty()) { + headers.push_back("Authorization: Bearer " + bearer_token); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + // User-Agent header is already set in common_remote_get_content, no need to set it here + + // make the request + common_remote_params params; + params.headers = headers; + auto res = common_remote_get_content(url, params); + long res_code = res.first; + std::string res_str(res.second.data(), res.second.size()); + std::string ggufFile; + std::string mmprojFile; + if (res_code == 200) { // extract ggufFile.rfilename in json, using regex { @@ -618,6 +647,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_ #else +bool common_has_curl() { + return false; +} + static bool common_download_file_single(const std::string &, const std::string &, const std::string &) { LOG_ERR("error: built without CURL, cannot download model from internet\n"); return false; @@ -640,6 +673,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s return {}; } +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { + throw std::runtime_error("error: built without CURL, cannot download model from the internet"); +} + #endif // LLAMA_USE_CURL // diff --git a/common/arg.h b/common/arg.h index 49ab8667b1052..70bea100fd4f2 100644 --- a/common/arg.h +++ b/common/arg.h @@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e // function to be used by test-arg-parser common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +bool common_has_curl(); + +struct common_remote_params { + std::vector headers; + long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout + long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB +}; +// get remote file content, returns +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 537fc63a4c975..21dbd5404222f 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -126,6 +126,53 @@ int main(void) { assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 + if (common_has_curl()) { + printf("test-arg-parser: test curl-related functions\n\n"); + const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md"; + const char * BAD_URL = "https://www.google.com/404"; + const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin"; + + { + printf("test-arg-parser: test good URL\n\n"); + auto res = common_remote_get_content(GOOD_URL, {}); + assert(res.first == 200); + assert(res.second.size() > 0); + std::string str(res.second.data(), res.second.size()); + assert(str.find("llama.cpp") != std::string::npos); + } + + { + printf("test-arg-parser: test bad URL\n\n"); + auto res = common_remote_get_content(BAD_URL, {}); + assert(res.first == 404); + } + + { + printf("test-arg-parser: test max size error\n"); + common_remote_params params; + params.max_size = 1; + try { + common_remote_get_content(GOOD_URL, params); + assert(false && "it should throw an error"); + } catch (std::exception & e) { + printf(" expected error: %s\n\n", e.what()); + } + } + + { + printf("test-arg-parser: test timeout error\n"); + common_remote_params params; + params.timeout = 1; + try { + common_remote_get_content(BIG_FILE, params); + assert(false && "it should throw an error"); + } catch (std::exception & e) { + printf(" expected error: %s\n\n", e.what()); + } + } + } else { + printf("test-arg-parser: no curl, skipping curl-related functions\n"); + } printf("test-arg-parser: all tests OK\n\n"); }