diff --git a/README.md b/README.md
index 1785493c3e2b0..42c0eb633ef5d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Hot topics
 
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)); `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 5a78216e44fa4..0786594296e94 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1133,8 +1133,6 @@ struct test {
         "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
         "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
         "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
-        "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "use_mmap",
-        "embeddings",   "n_prompt",     "n_gen",          "n_depth",    "test_time",    "avg_ns",
         "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
         "use_mmap",     "embeddings",   "n_prompt",       "n_gen",      "n_depth",      "test_time",    "avg_ns",
         "stddev_ns",    "avg_ts",       "stddev_ts",
diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index 6409b4f5e6cd0..27b6d27e5cac3 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -64,13 +64,7 @@ endif()
 add_executable(llama-llava-cli deprecation-warning.cpp)
 add_executable(llama-gemma3-cli deprecation-warning.cpp)
 add_executable(llama-minicpmv-cli deprecation-warning.cpp)
-
-set(TARGET llama-qwen2vl-cli)
-add_executable(${TARGET} qwen2vl-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
 
 set(TARGET llama-mtmd-cli)
 add_executable(${TARGET} mtmd-cli.cpp)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index a5eb55f4d412d..ad3e7df1d8a3a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2825,15 +2825,18 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }
 
+// deprecated
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    const int32_t nx = ctx->vision_model.hparams.image_size;
+    const int32_t ny = ctx->vision_model.hparams.image_size;
+    return clip_embd_nbytes_by_img(ctx, nx, ny);
 }
 
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
     clip_image_f32 img;
     img.nx = img_w;
     img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2863,14 +2866,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
+// deprecated
 int clip_n_patches(const struct clip_ctx * ctx) {
     clip_image_f32 img;
     img.nx = ctx->vision_model.hparams.image_size;
     img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+    return clip_n_output_tokens(ctx, &img);
 }
 
+// deprecated
 int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_n_output_tokens(ctx, img);
+}
+
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    const int n_total = clip_n_output_tokens(ctx, img);
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
+    }
+    return n_total;
+}
+
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
+    }
+    return 1;
+}
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 6ba42ad892146..0a53bd8eb78e1 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
 
 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
+GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
+    "use clip_n_output_tokens instead");
+GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
+    "use clip_n_output_tokens instead");
+
+CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
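For context, a minimal sketch of how a caller might consume the new token-count API when sizing an embedding buffer. The `alloc_image_embd` helper is hypothetical (not part of this diff) and assumes a loaded `clip_ctx` and a preprocessed `clip_image_f32`:

```cpp
// Hypothetical helper (illustrative only): size the output buffer from the new
// clip_n_output_tokens*() API instead of the deprecated clip_n_patches*() functions.
#include <cstdio>
#include <vector>

#include "clip.h"

std::vector<float> alloc_image_embd(clip_ctx * ctx, clip_image_f32 * img) {
    const int n_tokens = clip_n_output_tokens(ctx, img); // replaces clip_n_patches_by_img()
    const int n_embd   = clip_n_mmproj_embd(ctx);        // embedding dim of the text model
    // For M-RoPE models (Qwen2-VL / Qwen2.5-VL) the token grid is exposed separately;
    // for other models, x is the total token count and y is 1:
    const int nx = clip_n_output_tokens_x(ctx, img);
    const int ny = clip_n_output_tokens_y(ctx, img);
    printf("image -> %d tokens (%d x %d)\n", n_tokens, nx, ny);
    return std::vector<float>((size_t) n_tokens * n_embd);
}
```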
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 03a22cbb4c205..c00d16aefff10 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }
 
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                     image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                     image_embd_v[i],
                     clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-                n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+                n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
             }
             *n_img_pos = n_img_pos_out;
             for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");
@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
 
         int n_img_pos_out;
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
         *n_img_pos = n_img_pos_out;
 
         for (size_t i = 0; i < image_embd_v.size(); i++) {
diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp
index 250e8c9a9e871..4d857ca64e0b4 100644
--- a/examples/llava/mtmd-cli.cpp
+++ b/examples/llava/mtmd-cli.cpp
@@ -136,39 +136,6 @@ struct mtmd_cli_context {
     }
 };
 
-struct decode_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
     llama_tokens generated_tokens;
     for (int i = 0; i < n_predict; i++) {
@@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_pos(chunks);
 
     return 0;
 }
@@ -371,6 +338,7 @@ int main(int argc, char ** argv) {
         }
     }
     if (g_is_interrupted) LOG("\nInterrupted by user\n");
+    LOG("\n\n");
     llama_perf_context_print(ctx.lctx);
     return g_is_interrupted ? 130 : 0;
 }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index f95f0503569f9..7081fd7352bb7 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -40,11 +40,14 @@ struct mtmd_context {
     llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
     llama_token tok_row_end     = LLAMA_TOKEN_NULL; // end of row
 
+    bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
             const llama_model * text_model,
             const mtmd_context_params & ctx_params) :
+        text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
         image_marker (ctx_params.image_marker)
@@ -56,9 +59,8 @@ struct mtmd_context {
         if (!ctx_clip) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
-        this->text_model = text_model;
 
-        GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
+        use_mrope = clip_is_qwen2vl(ctx_clip);
 
         int minicpmv_version = clip_is_minicpmv(ctx_clip);
         if (minicpmv_version == 2) {
@@ -126,6 +128,7 @@ struct mtmd_image_tokens_data {
 struct mtmd_image_tokens {
     uint32_t nx; // number of tokens in x direction
     uint32_t ny; // number of tokens in y direction
+    bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
     uint32_t n_tokens() const { return nx * ny; }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -202,6 +205,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
 
+    else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+        marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
+    }
+
     // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
 
     std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
@@ -226,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             for (auto & entry : batch_f32.entries) {
                 mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-                image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
+                image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
                 image_tokens->ny = 1;
                 image_tokens->batch_f32.entries.push_back(std::move(entry));
                 image_tokens->id = id;
@@ -322,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         } else {
             size_t n_tokens = 0;
             for (const auto & entry : batch_f32.entries) {
-                n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
+                n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
             }
 
             mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = n_tokens;
-            image_tokens->ny = 1; // TODO
+            if (ctx->use_mrope) {
+                // for Qwen2VL, we need this information for M-RoPE decoding positions
+                image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
+                image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
+                image_tokens->use_mrope_pos = true;
+            } else {
+                // other models, we only need the total number of tokens
+                image_tokens->nx = n_tokens;
+                image_tokens->ny = 1;
+            }
             image_tokens->batch_f32 = std::move(batch_f32);
             image_tokens->id = bitmaps[i_img].id; // optional
@@ -372,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
     return image_tokens->id;
 }
 
+llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
+    if (image_tokens->use_mrope_pos) {
+        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+    }
+    return image_tokens->n_tokens();
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -389,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
     const auto & entries = image_tokens->batch_f32.entries;
     for (size_t i = 0; i < entries.size(); i++) {
-        int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
+        int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
         ok = clip_image_encode(
             ctx->ctx_clip,
             ctx->n_threads,
@@ -417,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            n_tokens += chunk.tokens_text.size();
         } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            n_tokens += chunk.tokens_image->n_tokens();
+            n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
         } else {
             GGML_ASSERT(false && "chunk type not supported");
         }
@@ -425,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
     return n_tokens;
 }
 
+llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
+    llama_pos n_pos = 0;
+    for (auto & chunk : chunks) {
+        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            n_pos += chunk.tokens_text.size();
+        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
+        } else {
+            GGML_ASSERT(false && "chunk type not supported");
+        }
+    }
+    return n_pos;
+}
+
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
 struct decode_embd_batch {
+    int n_pos_per_embd;
+    int n_mmproj_embd;
     std::vector<llama_pos>      pos;
+    std::vector<llama_pos>      pos_view; // used by mrope
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id>   seq_id_0;
     std::vector<llama_seq_id *> seq_ids;
     std::vector<int8_t>         logits;
     llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
+    decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+        pos     .resize(n_tokens * n_pos_per_embd);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
         logits  .resize(n_tokens);
         seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
         seq_ids [n_tokens] = nullptr;
         batch = {
             /*n_tokens       =*/ n_tokens,
@@ -451,13 +492,64 @@ struct decode_embd_batch {
             /*seq_id         =*/ seq_ids.data(),
             /*logits         =*/ logits.data(),
         };
-        for (int i = 0; i < n_tokens; i++) {
+    }
+
+    void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
+        seq_id_0[0] = seq_id;
+        for (int i = 0; i < batch.n_tokens; i++) {
             batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
         }
     }
+
+    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+        GGML_ASSERT(n_pos_per_embd == 4);
+        seq_id_0[0] = seq_id;
+        for (int y = 0; y < ny; y++) {
+            for (int x = 0; x < nx; x++) {
+                int i = y * nx + x;
+                pos[i                     ] = pos_0;
+                pos[i + batch.n_tokens    ] = pos_0 + y;
+                pos[i + batch.n_tokens * 2] = pos_0 + x;
+                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            }
+        }
+        for (int i = 0; i < batch.n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+
+    llama_batch get_view(int offset, int n_tokens) {
+        llama_pos * pos_ptr;
+        pos_view.clear();
+        pos_view.resize(n_tokens * n_pos_per_embd);
+        if (n_pos_per_embd > 1) {
+            // mrope
+            // for example, with layout of src: 1234...1234...1234...1234...
+            //       offset 2 will give us dst: 34...34...34...34...
+            for (int i = 0; i < n_pos_per_embd; i++) {
+                auto src = pos.begin() + i * batch.n_tokens + offset;
+                pos_view.insert(pos_view.end(), src, src + n_tokens);
+            }
+            pos_ptr = pos_view.data();
+        } else {
+            // normal
+            pos_ptr = pos.data() + offset;
+        }
+        return {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ batch.embd     + offset * n_mmproj_embd,
+            /*pos            =*/ pos_ptr,
+            /*n_seq_id       =*/ batch.n_seq_id + offset,
+            /*seq_id         =*/ batch.seq_id   + offset,
+            /*logits         =*/ batch.logits   + offset,
+        };
+    }
 };
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
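The sectioned position layout written by `set_position_mrope()` and sliced by `get_view()` can be illustrated standalone. The following sketch uses made-up sizes and reproduces the `1234... -> 34...` example from the comment above (the `llama_pos` alias is an assumption mirroring llama.h's typedef):

```cpp
// Standalone sketch of the sectioned M-RoPE position layout: `pos` holds
// n_pos_per_embd sections of n_tokens entries each, so a batch view starting at
// `offset` must copy a sub-range out of every section.
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t; // assumption: matches llama.h's typedef

int main() {
    const int n_tokens = 4, n_pos_per_embd = 4, offset = 2, n_view = 2;
    // full layout for a hypothetical 2x2 image at pos_0 = 10:
    // [ t t t t | y y y y | x x x x | 0 0 0 0 ]
    std::vector<llama_pos> pos = {10,10,10,10, 10,10,11,11, 10,11,10,11, 0,0,0,0};
    std::vector<llama_pos> view;
    for (int d = 0; d < n_pos_per_embd; d++) {
        auto src = pos.begin() + d * n_tokens + offset;
        view.insert(view.end(), src, src + n_view); // slice each section
    }
    for (llama_pos p : view) printf("%d ", p); // prints: 10 10 11 11 10 11 0 0
    printf("\n");
    return 0;
}
```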
@@ -470,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
 
     for (auto & chunk : chunks) {
         bool is_last = &chunk == &chunks.back();
@@ -517,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             int32_t i_batch = 0;
             int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
             float * embd = mtmd_get_output_embd(ctx);
+            decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+            const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
+            const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
+
+            if (mtmd_decode_use_mrope(ctx)) {
+                batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
+            } else {
+                batch_embd.set_position_normal(n_past, seq_id);
+            }
 
             if (mtmd_decode_use_non_causal(ctx)) {
                 llama_set_causal_attn(lctx, false);
@@ -524,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             }
 
             while (i_batch < n_img_batches) { // split into batches
-                int32_t pos_offset = i_batch*n_batch;
-                int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
-                float * embd_batch = embd + pos_offset*n_mmproj_embd;
-                decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
+                int pos_offset = i_batch*n_batch;
+                int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+                llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
 
-                printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+                LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
 
                 int64_t t1 = ggml_time_ms();
-                ret = llama_decode(lctx, batch_img.batch);
+                ret = llama_decode(lctx, batch_embd_view);
                 if (ret != 0) {
                     LOG_ERR("failed to decode image\n");
                     llama_set_causal_attn(lctx, true); // restore causal attn
@@ -545,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 }
 
                 i_batch++;
-                n_past += n_tokens_batch;
             }
 
+            // for mrope, one image is one single **temporal** position
+            n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
+
             if (mtmd_decode_use_non_causal(ctx)) {
                 llama_set_causal_attn(lctx, true);
             }
@@ -595,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     return false;
 }
 
+bool mtmd_decode_use_mrope(mtmd_context * ctx) {
+    return ctx->use_mrope;
+}
+
 void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
     mtmd_image_tokens_free(val);
 }
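The resulting `n_past` bookkeeping is worth spelling out: KV-cache cells are still consumed per token (what `mtmd_helper_get_n_tokens()` counts), but under M-RoPE an image advances the temporal position by exactly one (what `mtmd_helper_get_n_pos()` counts). A standalone sketch with made-up token counts:

```cpp
// Sketch of the position-vs-token distinction this diff introduces.
// All numbers below are hypothetical.
#include <cstdio>

int main() {
    const int n_text_tokens  = 12;   // hypothetical text chunk
    const int n_image_tokens = 256;  // hypothetical image chunk (16 x 16 grid)
    const bool use_mrope     = true; // Qwen2-VL / Qwen2.5-VL

    int n_kv_cells = n_text_tokens + n_image_tokens;                   // mtmd_helper_get_n_tokens()
    int n_past     = n_text_tokens + (use_mrope ? 1 : n_image_tokens); // mtmd_helper_get_n_pos()

    printf("kv cells used: %d, next position (n_past): %d\n", n_kv_cells, n_past);
    return 0;
}
```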
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
index 78be192dd6eb6..6805e5e4816c3 100644
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -102,6 +102,7 @@ MTMD_API size_t      mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
 MTMD_API size_t      mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
 MTMD_API size_t      mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
 MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos   mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
 MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 
 // returns 0 on success
@@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // whether we need to set non-causal mask before llama_decode
 MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 
+// whether the current model uses M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
 //
 // helper functions (can be implemented based on other functions)
 //
 
-// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
 MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 
+// helper to count the total number of positions from a list of chunks, useful to keep track of n_past
+MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
+
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-test.cpp
similarity index 99%
rename from examples/llava/qwen2vl-cli.cpp
rename to examples/llava/qwen2vl-test.cpp
index 1e54851ea07a0..7f9e3dca885c6 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-test.cpp
@@ -27,6 +27,8 @@
 #include
 #include
 
+// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
+// IT IS NOT PRODUCTION CODE
 static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                      int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh
index 4002f9d531bd2..75604315cfeba 100755
--- a/examples/llava/tests.sh
+++ b/examples/llava/tests.sh
@@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
 add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
 add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
-add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
-add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
+add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
 
 # to test the big models, run: ./tests.sh big
 add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index aba2f27f9b564..b497959fd8689 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
+    // if the assistant message appears at the end of the list, we do not add the end-of-turn token
+    // for example, this can be useful to modify the reasoning process in reasoning models
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    common_chat_msg last_message;
+    if (prefill_assistant_message) {
+        last_message = inputs.messages.back();
+        inputs.messages.pop_back();
+
+        /* sanity check, max one assistant message at the end of the list */
+        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
+            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+        }
+
+        inputs.extract_reasoning = false;
+        inputs.add_generation_prompt = true;
+    }
+
     // Apply chat template to the list of messages
     auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
+    /* Append assistant prefilled message */
+    if (prefill_assistant_message) {
+        chat_params.prompt += last_message.content;
+    }
+
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
     if (!chat_params.grammar.empty()) {
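A standalone sketch of the prefill behavior added above, using a toy ChatML-style template in place of `common_chat_templates_apply()`; all names, roles, and strings here are illustrative:

```cpp
// Minimal sketch (made-up template, not the server's actual code path): pop a
// trailing assistant message, render the prompt with a generation prefix, then
// append the assistant content so generation continues it without an end-of-turn.
#include <cstdio>
#include <string>
#include <vector>

struct msg { std::string role, content; };

int main() {
    std::vector<msg> messages = {
        {"user",      "Prove that 1 + 1 = 2."},
        {"assistant", "<think>Okay, the user wants"}, // partial message to continue from
    };

    bool prefill = !messages.empty() && messages.back().role == "assistant";
    msg last;
    if (prefill) { last = messages.back(); messages.pop_back(); }

    std::string prompt;
    for (const auto & m : messages) {
        prompt += "<|im_start|>" + m.role + "\n" + m.content + "<|im_end|>\n";
    }
    prompt += "<|im_start|>assistant\n"; // add_generation_prompt = true
    if (prefill) prompt += last.content; // no end-of-turn token: model resumes here

    printf("%s", prompt.c_str());
    return 0;
}
```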
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index a224ec0e12ddb..c6dec4276b36d 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -1,6 +1,8 @@
 #include "convert.cuh"
 #include "dequantize.cuh"
 
+#include <cstdint>
+
 #define CUDA_Q8_0_NE_ALIGN 2048
 
 template
@@ -570,30 +572,46 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
 }
 
 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(
+        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t s01, const int64_t s02, const int64_t s03) {
+    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (i >= k) {
+    if (i00 >= ne00) {
         return;
     }
 
+    const int64_t i01 = blockIdx.y;
+    const int64_t i02 = blockIdx.z % ne02;
+    const int64_t i03 = blockIdx.z / ne02;
+
     const src_t * x = (const src_t *) vx;
 
-    y[i] = float(x[i]);
+    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
+    y[iy] = float(x[ix]);
 }
 
 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+static void convert_unary_cuda(const void * vx, dst_t * y,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
+        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+}
+
+template <typename src_t, typename dst_t>
+static void convert_unary_cont_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    convert_unary_cuda(vx, y, k, 1, 1, 1, k, k, k, stream);
 }
 
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
         default:
             return nullptr;
     }
@@ -643,9 +661,9 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
+            return convert_unary_cont_cuda<float>;
         case GGML_TYPE_BF16:
-            return convert_unary_cuda<nv_bfloat16>;
+            return convert_unary_cont_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
@@ -692,7 +710,18 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_IQ3_S:
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
+            return convert_unary_cont_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cont_cuda<nv_bfloat16>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float>;
         case GGML_TYPE_BF16:
             return convert_unary_cuda<nv_bfloat16>;
         default:
             return nullptr;
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index 411a13cf12667..b65b98e08e7e2 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
 
 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half>  to_fp16_cuda_t;
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
 to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
 
 to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+
+// TODO more general support for non-contiguous inputs
+
+template<typename T>
+using to_t_nc_cuda_t = void (*)(const void * x, T * y,
+    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
+    int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
+
+typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
+to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
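The kernel's indexing can be checked on the host. This sketch mirrors `convert_unary`'s gather from an element-strided source into a contiguous destination, in plain C++ rather than CUDA, with made-up shapes; note the contiguous case degenerates to `s01 = ne00`, `s02 = ne00*ne01`, `s03 = ne00*ne01*ne02`, matching `convert_unary_cont_cuda`:

```cpp
// Host-side mirror of the convert_unary indexing above (illustrative only).
// Strides s01/s02/s03 are in elements, as in the kernel.
#include <cstdint>
#include <cstdio>

template <typename src_t, typename dst_t>
void convert_nc_host(const src_t * x, dst_t * y,
                     int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                     int64_t s01, int64_t s02, int64_t s03) {
    for (int64_t i03 = 0; i03 < ne03; i03++)
    for (int64_t i02 = 0; i02 < ne02; i02++)
    for (int64_t i01 = 0; i01 < ne01; i01++)
    for (int64_t i00 = 0; i00 < ne00; i00++) {
        const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;        // strided read
        const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00; // contiguous write
        y[iy] = (dst_t) x[ix];
    }
}

int main() {
    // 2 x 3 source stored with a row stride of 4 elements (padded), packed into 6 floats
    const float x[8] = {1, 2, 3, 0, 4, 5, 6, 0};
    float y[6];
    convert_nc_host(x, y, /*ne00=*/3, /*ne01=*/2, /*ne02=*/1, /*ne03=*/1,
                    /*s01=*/4, /*s02=*/8, /*s03=*/8);
    for (float v : y) printf("%g ", v); // prints: 1 2 3 4 5 6
    printf("\n");
    return 0;
}
```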
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 19b9ce7231aa2..fba8cb6565bae 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1720,15 +1720,15 @@ static __global__ void k_compute_batched_ptrs(
         size_t  nb12, size_t  nb13,
         size_t  nbd2, size_t  nbd3,
         int64_t r2,   int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
+    const int64_t i03 = i13 / r3;
+    const int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
@@ -1742,6 +1742,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
+    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
+    // As long as dst is contiguous this does not matter though.
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
@@ -1750,21 +1754,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 
     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
 
-    void  * src0_ddq = src0->data;
-    half  * src0_f16 = (half  *) src0_ddq;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf  = (float *) dst->data;
+    const half * src0_f16 = (const half *) src0->data;
+    float      * dst_ddf  = (float      *) dst->data;
 
-    // convert src1 to fp16
+    const half * src1_f16 = (const half *) src1->data;
+    const size_t ts_src1 = ggml_type_size(src1->type);
+    GGML_ASSERT(nb10 == ts_src1);
+    int64_t s11 = nb11 / ts_src1;
+    int64_t s12 = nb12 / ts_src1;
+    int64_t s13 = nb13 / ts_src1;
     ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+
+    // convert src1 to fp16
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+
+        to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
+
+        src1_f16 = src1_f16_alloc.get();
+        s11 = ne10;
+        s12 = ne11*s11;
+        s13 = ne12*s12;
     }
-    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
     char * dst_t;
@@ -1824,13 +1838,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 int i02 = i12 / r2;
 
                 CUBLAS_CHECK(
-                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                    ne01, ne11, ne10,
-                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
-                           (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
-                    beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                    cu_compute_type,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                    ne01, ne11, ne10,
+                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F,   nb01/sizeof(half),
+                                          src1_f16 + i13*s13  + i12*s12,  CUDA_R_16F,   s11,
+                    beta,  (      char *)    dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
+                    cu_compute_type,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
         }
     }
@@ -1841,15 +1855,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00, // strideA
-                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10, // strideB
-                beta,  (      char *)    dst_t, cu_data_type, ne01,      nb2/nb0,   // strideC
+                alpha, src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00, // strideA
+                       src1_f16, CUDA_R_16F,   s11,       s12,       // strideB
+                beta,     dst_t, cu_data_type, ne0,       ne1*ne0,   // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
+        const int64_t ne23 = ne12*ne13;
 
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
@@ -1861,8 +1875,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
-                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
+                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
+                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -1871,8 +1885,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   s11,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -1936,7 +1950,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
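A small sketch of the stride bookkeeping this hunk introduces: ggml's `nb*` strides are in bytes, cuBLAS wants element strides, and after `src1` is converted into a packed fp16 buffer the strides collapse to the contiguous case. All values below are made up:

```cpp
// Illustrative only: byte-to-element stride conversion and the post-conversion
// contiguous strides, mirroring the s11/s12/s13 logic above.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical f32 src1 of shape ne10=128, ne11=32, ne12=8
    const int64_t ne10 = 128, ne11 = 32, ne12 = 8;
    const size_t  ts   = 4; // ggml_type_size(GGML_TYPE_F32)
    const size_t  nb10 = ts, nb11 = nb10*ne10, nb12 = nb11*ne11, nb13 = nb12*ne12;

    assert(nb10 == ts); // innermost dim must be contiguous, as the GGML_ASSERT above requires
    int64_t s11 = nb11 / ts, s12 = nb12 / ts, s13 = nb13 / ts; // element strides for the kernel

    // after convert_unary packs the data into a contiguous fp16 buffer:
    s11 = ne10; s12 = ne11*s11; s13 = ne12*s12;
    printf("s11=%lld s12=%lld s13=%lld\n", (long long) s11, (long long) s12, (long long) s13);
    return 0;
}
```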
diff --git a/include/llama.h b/include/llama.h
index a13350e15be6a..06c56395c139f 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1232,6 +1232,7 @@ extern "C" {
                 "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 2706ea2635444..fabb9ca237653 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -55,13 +55,16 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        if (ubatch->token && n_pos_per_embd > 1) {
+        if (ubatch->token && n_pos_per_embd == 4) {
             // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
-            // the other dimensions are all 0, they are unused for text tokens
-            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
+            // the first 3 dims are the same, and the 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
             // copy the first dimension
             for (int i = 0; i < n_tokens; ++i) {
-                pos_data[i] = ubatch->pos[i];
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
             }
             ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
         } else {
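The 1D-to-4D expansion in `llm_graph_input_pos::set_input()` can be shown standalone: for M-RoPE text tokens, the first three sections carry the same position and the fourth stays zero. A sketch with hypothetical positions (the `llama_pos` alias mirrors llama.h's typedef):

```cpp
// Standalone sketch of the 1D -> 4D position expansion for M-RoPE text tokens
// (illustrative; not the graph code itself).
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t; // assumption: matches llama.h

int main() {
    const std::vector<llama_pos> pos_1d = {5, 6, 7}; // hypothetical text positions
    const int n_tokens = (int) pos_1d.size();
    std::vector<llama_pos> pos_data(n_tokens * 4);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[               i] = pos_1d[i];
        pos_data[    n_tokens + i] = pos_1d[i];
        pos_data[2 * n_tokens + i] = pos_1d[i];
        pos_data[3 * n_tokens + i] = 0;
    }
    for (llama_pos p : pos_data) printf("%d ", p); // prints: 5 6 7 5 6 7 5 6 7 0 0 0
    printf("\n");
    return 0;
}
```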
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2ec55d55a37be..822e2bb2cf018 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_770M:           return "770M";
         case LLM_TYPE_780M:           return "780M";
         case LLM_TYPE_0_5B:           return "0.5B";
+        case LLM_TYPE_0_6B:           return "0.6B";
         case LLM_TYPE_1B:             return "1B";
         case LLM_TYPE_1_3B:           return "1.3B";
         case LLM_TYPE_1_4B:           return "1.4B";
         case LLM_TYPE_1_5B:           return "1.5B";
         case LLM_TYPE_1_6B:           return "1.6B";
+        case LLM_TYPE_1_7B:           return "1.7B";
         case LLM_TYPE_1_8B:           return "1.8B";
         case LLM_TYPE_2B:             return "2B";
         case LLM_TYPE_2_8B:           return "2.8B";
@@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_15B:            return "15B";
         case LLM_TYPE_16B:            return "16B";
         case LLM_TYPE_20B:            return "20B";
+        case LLM_TYPE_27B:            return "27B";
         case LLM_TYPE_30B:            return "30B";
         case LLM_TYPE_32B:            return "32B";
         case LLM_TYPE_34B:            return "34B";
@@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_65B:            return "65B";
         case LLM_TYPE_70B:            return "70B";
         case LLM_TYPE_236B:           return "236B";
+        case LLM_TYPE_290B:           return "290B";
         case LLM_TYPE_314B:           return "314B";
         case LLM_TYPE_671B:           return "671B";
         case LLM_TYPE_SMALL:          return "0.1B";
@@ -88,10 +92,10 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16x3_8B:        return "16x3.8B";
         case LLM_TYPE_10B_128x3_66B:  return "10B+128x3.66B";
         case LLM_TYPE_57B_A14B:       return "57B.A14B";
-        case LLM_TYPE_27B:            return "27B";
-        case LLM_TYPE_290B:           return "290B";
         case LLM_TYPE_17B_16E:        return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E:       return "17Bx128E (Maverick)";
+        case LLM_TYPE_30B_A3B:        return "30B.A3B";
+        case LLM_TYPE_235B_A22B:      return "235B.A22B";
         default:                      return "?B";
     }
 }
@@ -793,6 +797,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
+                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -802,6 +810,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -10185,7 +10195,6 @@ struct llm_build_deepseek2 : public llm_graph_context {
 
                 // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
                 ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
-                ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32);
                 cb(q_nope_absorbed, "q_nope_absorbed", il);
 
                 // {kv_lora_rank, n_head, n_tokens}
diff --git a/src/llama-model.h b/src/llama-model.h
index fd82d106ccda8..95eca00266a4b 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -39,11 +39,13 @@ enum llm_type {
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
@@ -62,6 +64,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -70,6 +73,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -84,10 +88,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 
 struct llama_layer_posnet {
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index d14979850285d..c0a5f9340d585 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
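With this change, a non-positive `k` leaves the candidate array untouched (a no-op) instead of being clamped to the vocabulary size, and a positive `k` truncates `cur_p->size` after the partial sort. A minimal usage sketch against the public API:

```cpp
// Sketch of the revised top-k contract (usage only; behavior described above).
#include <cstdio>
#include "llama.h"

int main() {
    // with k <= 0 the sampler is now a no-op, so chaining it unconditionally is safe:
    llama_sampler * noop  = llama_sampler_init_top_k(0);  // keeps all candidates
    llama_sampler * top40 = llama_sampler_init_top_k(40); // keeps the 40 most probable
    llama_sampler_free(noop);
    llama_sampler_free(top40);
    printf("top-k samplers created and freed\n");
    return 0;
}
```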