diff --git a/common/arg.cpp b/common/arg.cpp
index 62eec8337e033..5ed5a23903332 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2848,15 +2848,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2955,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
diff --git a/common/chat.cpp b/common/chat.cpp
index 78af5eafa40c3..adfe51db5a770 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -133,6 +133,7 @@ struct templates_params {
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -573,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init(
     return tmpls;
 }
 
-std::string common_chat_format_name(common_chat_format format) {
+const char * common_chat_format_name(common_chat_format format) {
     switch (format) {
         case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
         case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -591,6 +592,15 @@ std::string common_chat_format_name(common_chat_format format) {
     }
 }
 
+const char * common_reasoning_format_name(common_reasoning_format format) {
+    switch (format) {
+        case COMMON_REASONING_FORMAT_NONE: return "none";
+        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+        default:
+            throw
std::runtime_error("Unknown reasoning format"); + } +} + static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { std::string arguments; if (builder.is_partial()) { @@ -918,7 +928,13 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {}); data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; if (string_ends_with(data.prompt, "<|START_THINKING|>")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += "<|END_THINKING|>"; + } else { + data.thinking_forced_open = true; + } + } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) { + data.prompt += "<|START_THINKING|><|END_THINKING|>"; } data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; @@ -1186,7 +1202,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ data.prompt = prompt; data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; if (string_ends_with(data.prompt, "\n")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } } if (inputs.tools.is_array() && !inputs.tools.empty()) { @@ -1460,104 +1480,114 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + json additional_context = { + {"enable_thinking", inputs.enable_thinking}, + }; + + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt, additional_context); data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; if (string_ends_with(data.prompt, "\n")) { - data.thinking_forced_open = true; + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } } - // (content)?({"name": "foo", "arguments": {"a": 1}})* - data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - std::vector tool_call_alts; - std::vector escaped_names; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool.at("function"); - std::string name = function.at("name"); - auto parameters = function.at("parameters"); - builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { - {"type", "object"}, - {"properties", json { - {"name", json {{"const", name}}}, - {"arguments", parameters}, - }}, - {"required", json::array({"name", "arguments"})}, - })); - tool_call_alts.push_back(builder.add_rule( - name + "-function-tag", - "\"\" space " + - builder.add_schema(name + "-args", parameters) + " " - "\"\" space")); + if (!inputs.tools.is_null()) { + // (content)?({"name": "foo", "arguments": {"a": 1}})* + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + std::vector tool_call_alts; + std::vector escaped_names; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + tool_call_alts.push_back(builder.add_rule( + name + "-function-tag", + "\"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - "", + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + " alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + (inputs.parallel_tool_calls ? 
"(" + tool_call + ")+" : tool_call)); + // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, - " tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( + "(\\s*" + "(?:" + "||||)?" + "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" + ")" + ")[\\s\\S]*" + ), }); - escaped_names.push_back(escaped_name); - }); - auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); - std::vector alt_tags { - any_tool_call, - "\"\" space " + any_tool_call + " \"\"", - // The rest is just to accommodate common "good bad" outputs. - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - }; - auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); - tool_call_alts.push_back(wrappable_tool_call); - tool_call_alts.push_back( - "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); - auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); - builder.add_rule("root", - std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + - (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); - // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, - // If thinking_forced_open, then we capture the tag in the grammar, - // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) - std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( - "(\\s*" - "(?:" - "||||)?" 
- "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" - ")" - ")[\\s\\S]*" - ), + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; }); - data.preserved_tokens = { - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "```", - "```json", - "```xml", - }; - }); + } return data; } @@ -1669,6 +1699,7 @@ static common_chat_params common_chat_templates_apply_jinja( params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; + params.enable_thinking = inputs.enable_thinking; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { @@ -1702,7 +1733,7 @@ static common_chat_params common_chat_templates_apply_jinja( } // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) - if (src.find("") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) { + if (src.find("") != std::string::npos && params.json_schema.is_null()) { return common_chat_params_init_hermes_2_pro(tmpl, params); } @@ -1821,7 +1852,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) { } static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) { - LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str()); + LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str()); switch (format) { case COMMON_CHAT_FORMAT_CONTENT_ONLY: @@ -1858,7 +1889,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form common_chat_parse_command_r7b(builder); break; default: - throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); + throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format)); } builder.finish(); } diff --git a/common/chat.h b/common/chat.h index ce926777ebe91..3e2cbbaae3369 100644 --- a/common/chat.h +++ b/common/chat.h @@ -123,6 +123,7 @@ struct common_chat_templates_inputs { common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -181,7 +182,8 @@ std::string common_chat_format_example( const struct common_chat_templates * tmpls, bool use_jinja); -std::string common_chat_format_name(common_chat_format format); +const char* common_chat_format_name(common_chat_format format); +const char* common_reasoning_format_name(common_reasoning_format format); common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/common/common.cpp b/common/common.cpp index eb16055ea6448..2afa9b2d641d4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -849,7 +849,7 @@ std::string fs_get_cache_directory() { if (getenv("LLAMA_CACHE")) { cache_directory = std::getenv("LLAMA_CACHE"); } else { -#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) +#if defined(__linux__) || defined(__FreeBSD__) || 
defined(_AIX) || defined(__OpenBSD__) if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); } else { diff --git a/common/common.h b/common/common.h index f0c52c314b744..92b9533fc2948 100644 --- a/common/common.h +++ b/common/common.h @@ -368,6 +368,7 @@ struct common_params { bool use_jinja = false; // NOLINT bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + int reasoning_budget = -1; bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response std::vector api_keys; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 123083b915412..91af508a2fb28 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2643,7 +2643,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM") +@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration") class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 @@ -2667,8 +2667,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = f"model.{name}" # map to Qwen2ForCausalLM tensors if "language_model." in name: name = name.replace("language_model.", "") # for InternVL - if name.startswith("mlp") or name.startswith("vision_model"): - # skip visual tensors + if name.startswith("mlp") or name.startswith("multi_modal_projector") \ + or name.startswith("vision_model") or name.startswith("audio_tower"): + # skip vision and audio tensors return [] yield from super().modify_tensors(data_torch, name, bid) @@ -5993,11 +5994,11 @@ class UltravoxModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument") + raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument") -@ModelBase.register("UltravoxModel") -class UltravoxAudioModel(MmprojModel): +@ModelBase.register("Qwen2AudioForConditionalGeneration") +class WhisperEncoderModel(MmprojModel): has_vision_encoder = False # no vision encoder has_audio_encoder = True @@ -6009,10 +6010,9 @@ def __init__(self, *args, **kwargs): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A) self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) - self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) def tensor_force_quant(self, name, new_name, bid, n_dims): del bid, new_name, n_dims # unused @@ -6023,6 +6023,10 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + if name.startswith("language_model."): + # skip language model tensors + return [] + # prevent clash naming with vision tensors if name.startswith("multi_modal_projector"): name = "audio." 
+ name @@ -6033,6 +6037,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] + +@ModelBase.register("UltravoxModel") +class UltravoxWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) + ###### CONVERSION LOGIC ###### diff --git a/docs/multimodal.md b/docs/multimodal.md index ffcbbd774bd82..3a0994a279ae8 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -33,7 +33,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload ## Pre-quantized models -These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org +These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server` @@ -81,6 +81,10 @@ NOTE: some models may require large context window, for example: `-c 8192` # Llama 4 Scout (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF + +# Moondream2 20250414 version +(tool_name) -hf ggml-org/moondream2-20250414-GGUF + ``` **Audio models**: @@ -89,4 +93,8 @@ NOTE: some models may require large context window, for example: `-c 8192` # Ultravox 0.5 (tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF (tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF + +# Qwen2-Audio and SeaLLM-Audio +# note: no pre-quantized GGUF this model, as they have very poor result +# ref: https://github.com/ggml-org/llama.cpp/pull/13760 ``` diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index aaa94176f168c..0a9d3a927c23a 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -1,74 +1,93 @@ #include "binbcast.hpp" -#include #include #include #include -#include "dpct/helper.hpp" #include "ggml.h" -template -static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, - dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) { - auto element_id = it.get_global_id(0); - auto global_range = it.get_global_range(0); - for (; element_id < num_elements; element_id += global_range) { - auto src0_float_val = sycl::vec(src0[element_id]).template convert(); - auto src1_float_val = sycl::vec(src1[element_id]).template convert(); - float dst_val = bin_op(src0_float_val[0], src1_float_val[0]); - auto val_to_store = sycl::vec(dst_val).template convert(); - dst[element_id] = val_to_store; +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + 
item_ct1.get_local_id(0)) / + ne3; + const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) % + ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; + i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); } } -template -static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13, - int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10, - int s11, int s12, int s13, std::size_t num_dst_elements, - const sycl::nd_item<1> & item_ct1) { - auto calculate_logical_index = - [](const std::array & dims, std::size_t element_id) __attribute__((always_inline))->std::array { - std::array logical_index; -#pragma unroll(4) - for (int i = 3; i >= 0; i--) { - logical_index[i] = element_id % dims[i]; - element_id /= dims[i]; - } - return logical_index; - }; - - auto calculate_index = [](const std::array & dims, const std::array & strides, - const std::array & indices) __attribute__((always_inline)) - ->std::size_t { - std::size_t index = 0; -#pragma unroll(4) - for (int i = 0; i < 4; i++) { - auto index_i = indices[i]; - if (indices[i] >= dims[i]) { - index_i = indices[i] % dims[i]; - } - index += strides[i] * index_i; - } - return index; - }; - - auto element_id = item_ct1.get_global_id(0); - for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) { - auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id); - auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index); - auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index); - auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index); - auto src0_float_val = sycl::vec(src0[src_0_index]).template convert(); - auto src1_float_val = sycl::vec(src1[src_1_index]).template convert(); - float dst_val = bin_op(src0_float_val[0], src1_float_val[0]); - auto val_to_store = sycl::vec(dst_val).template convert(); - dst[dst_index] = val_to_store; +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + 
i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); } -template struct bin_bcast_sycl { + +template +struct bin_bcast_sycl { template void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11, @@ -77,73 +96,165 @@ template struct bin_bcast_sycl { const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0, const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous, const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) { - auto check_bcast_required = [](const std::array & src_dims, - const std::array & dst_dims) -> bool { + int nr0 = ne10 / ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne[] = {ne0, ne1, ne2, ne3}; + int64_t cne0[] = {ne00, ne01, ne02, ne03}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb[] = {nb0, nb1, nb2, nb3}; + size_t cnb0[] = {nb00, nb01, nb02, nb03}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) { for (int i = 0; i < 4; i++) { - if (dst_dims[i] > src_dims[i]) { - return true; + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb, cne); + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne); + collapse(cne0); + collapse(cne1); } } - return false; - }; - - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - GGML_ASSERT(nb0 % sizeof(dst_t) == 0); - GGML_ASSERT(nb1 % sizeof(dst_t) == 0); - GGML_ASSERT(nb2 % sizeof(dst_t) == 0); - GGML_ASSERT(nb3 % sizeof(dst_t) == 0); - - GGML_ASSERT(nb00 % sizeof(src0_t) == 0); - GGML_ASSERT(nb01 % sizeof(src0_t) == 0); - GGML_ASSERT(nb02 % sizeof(src0_t) == 0); - GGML_ASSERT(nb03 % sizeof(src0_t) == 0); - - GGML_ASSERT(nb10 % sizeof(src1_t) == 0); - GGML_ASSERT(nb11 % sizeof(src1_t) == 0); - GGML_ASSERT(nb12 % sizeof(src1_t) == 0); - GGML_ASSERT(nb13 % sizeof(src1_t) == 0); - - // dst strides in number of elements - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); - - // src1 strides in number of elements - size_t s10 = nb10 / sizeof(src0_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); - - // src0 strides in number of elements - size_t s00 = nb00 / sizeof(src0_t); - size_t s01 = nb01 / sizeof(src0_t); - size_t s02 = nb02 / sizeof(src0_t); - size_t s03 = nb03 / sizeof(src0_t); - - std::size_t num_dst_elements = static_cast(ne0) * static_cast(ne1) * - static_cast(ne2) * static_cast(ne3); - std::size_t local_range = 256; - std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range; - - bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) || - 
check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 }); - bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous; - - if (! needs_broadcasting && all_contiguous) { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) { - k_bin_bcast_contiguous(src0_dd, src1_dd, dst_dd, num_dst_elements, it); - }); - }); - } else { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) { - k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1, - s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it); - }); - }); + } + { + int64_t ne0 = cne[0]; + int64_t ne1 = cne[1]; + int64_t ne2 = cne[2]; + int64_t ne3 = cne[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb[0]; + size_t nb1 = cnb[1]; + size_t nb2 = cnb[2]; + size_t nb3 = cnb[3]; + + size_t nb00 = cnb0[0]; + size_t nb01 = cnb0[1]; + size_t nb02 = cnb0[2]; + size_t nb03 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + size_t s00 = nb00 / sizeof(src0_t); + size_t s01 = nb01 / sizeof(src0_t); + size_t s02 = nb02 / sizeof(src0_t); + size_t s03 = nb03 / sizeof(src0_t); + + GGML_UNUSED(s00); + + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); + GGML_ASSERT(nb1 % sizeof(dst_t) == 0); + GGML_ASSERT(nb2 % sizeof(dst_t) == 0); + GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + + GGML_ASSERT(nb00 % sizeof(src0_t) == 0); + GGML_ASSERT(nb01 % sizeof(src0_t) == 0); + GGML_ASSERT(nb02 % sizeof(src0_t) == 0); + GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + + GGML_ASSERT(nb10 % sizeof(src1_t) == 0); + GGML_ASSERT(nb11 % sizeof(src1_t) == 0); + GGML_ASSERT(nb12 % sizeof(src1_t) == 0); + GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + block_dims[2] = std::min(hne0, block_size); + block_dims[1] = std::min( + ne1, block_size / (unsigned int)block_dims[2]); + block_dims[0] = std::min( + std::min( + ne2 * ne3, block_size / (unsigned int)block_dims[2] / + (unsigned int)block_dims[1]), + 64U); + + sycl::range<3> block_nums( + (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, + s03, s11, s12, s13, item_ct1); + }); + } + } else { + /* + DPCT1049:16: The work-group size passed to the SYCL 
kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, + ne2, ne3, ne10, ne11, ne12, ne13, + s1, s2, s3, s01, s02, s03, s11, s12, s13, + item_ct1); + }); + } } } }; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 58de45dfddb85..c6255d6867a15 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -546,6 +546,7 @@ class MODEL_TENSOR(IntEnum): A_ENC_FFN_GATE = auto() A_ENC_FFN_DOWN = auto() A_MMPROJ = auto() + A_MMPROJ_FC = auto() A_MM_NORM_PRE = auto() A_MM_NORM_MID = auto() @@ -825,6 +826,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate", MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}", + MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid", } @@ -885,6 +887,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_ENC_FFN_GATE, MODEL_TENSOR.A_ENC_FFN_DOWN, MODEL_TENSOR.A_MMPROJ, + MODEL_TENSOR.A_MMPROJ_FC, MODEL_TENSOR.A_MM_NORM_PRE, MODEL_TENSOR.A_MM_NORM_MID, ], @@ -2256,6 +2259,7 @@ class VisionProjectorType: QWEN25VL = "qwen2.5vl_merger" ULTRAVOX = "ultravox" INTERNVL = "internvl" + QWEN2A = "qwen2a" # audio # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 91a95ea48b42d..4a0615b656812 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1165,6 +1165,10 @@ class TensorNameMap: "audio.multi_modal_projector.linear_{bid}", # ultravox ), + MODEL_TENSOR.A_MMPROJ_FC: ( + "audio.multi_modal_projector.linear", # qwen2audio + ), + MODEL_TENSOR.A_MM_NORM_PRE: ( "audio.multi_modal_projector.ln_pre", # ultravox ), diff --git a/include/llama.h b/include/llama.h index 52cd7a5a037ef..eafab7323d9bf 100644 --- a/include/llama.h +++ b/include/llama.h @@ -471,6 +471,7 @@ extern "C" { LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); + LLAMA_API size_t llama_max_parallel_sequences(void); LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); diff --git a/models/ggml-vocab-nomic-bert-moe.gguf b/models/ggml-vocab-nomic-bert-moe.gguf new file mode 100644 index 0000000000000..b6f4d9441113b Binary files /dev/null and b/models/ggml-vocab-nomic-bert-moe.gguf differ diff --git a/models/ggml-vocab-nomic-bert-moe.gguf.inp b/models/ggml-vocab-nomic-bert-moe.gguf.inp new file mode 100644 index 0000000000000..9baf7d77ae6b5 --- /dev/null +++ b/models/ggml-vocab-nomic-bert-moe.gguf.inp @@ -0,0 +1,112 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! 
+__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +!!!!!! +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ +Cửa Việt +__ggml_vocab_test__ + discards +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-nomic-bert-moe.gguf.out b/models/ggml-vocab-nomic-bert-moe.gguf.out new file mode 100644 index 0000000000000..c31c61092cb3d --- /dev/null +++ b/models/ggml-vocab-nomic-bert-moe.gguf.out @@ -0,0 +1,46 @@ + 17 297 201 78660 21775 + 72805 4097 56 + + + + + + + + + + 35378 8999 + 35378 8999 + 35378 6661 + 35378 6661 + 35378 6661 38 + 35378 4 8999 38 + 35378 4 8999 38 + 903 83 6 3 5 238 6366 + 148 7709 1019 361 458 134362 104 7 71 420 1132 + 14271 29 117152 + 6 149561 78270 48967 64254 7616 81705 + 6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 15 191 538 28 121505 450 1556 6863 10002 47 1098 16 + 35378 + 35378 + 35378 + 35378 + 35378 + 35378 35378 + 15 + 2203 + 242 1615 + 35378 4 113 25 5584 38 11249 621 398 6 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087 + 6 90827 + 138 + 3912 + 6 66000 + 138 66000 + 3912 66000 + 6 66000 66000 + 138 66000 66000 + 3912 66000 66000 + 6 66000 66000 66000 + 199152 3763 + 17116 99397 + 6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 6 3 138 3912 6 66000 138 66000 3912 66000 6 66000 66000 138 66000 66000 3912 66000 66000 80308 1031 5 363 138 27 363 6 149561 78270 48967 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087 6 110405 1369 69112 69112 69112 14271 29 117152 5106 4765 4765 1135 164721 164721 164721 58 58 58 58 2551 90827 32 85908 87 25 272 2809 242 18 18345 764 25 7 2685 4 242 11766 398 9077 32 242 594 959 9077 87 25 1181 3249 442 4 242 397 398 1884 3060 26156 32 1401 25 26455 10 25 141 866 diff --git a/models/templates/Qwen-Qwen3-0.6B.jinja b/models/templates/Qwen-Qwen3-0.6B.jinja new file mode 100644 index 0000000000000..699ff8df401fe --- /dev/null +++ b/models/templates/Qwen-Qwen3-0.6B.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each 
function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/models/templates/README.md b/models/templates/README.md index b8655be9fce95..35b6386dd0649 100644 --- a/models/templates/README.md +++ b/models/templates/README.md @@ -20,4 +20,5 @@ These templates can be updated with the following commands: ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja ./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja +./scripts/get_chat_template.py Qwen/Qwen3-0.6B > 
models/templates/Qwen-Qwen3-0.6B.jinja ``` \ No newline at end of file diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 85b4324b699e6..98ecb7c8249ce 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -25,7 +25,11 @@ llama_context::llama_context( const auto & hparams = model.hparams; - cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_seq_max = std::max(1u, params.n_seq_max); + if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) { + throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES)); + } + cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index 28369be365252..f7b36590fe3e3 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -1 +1,5 @@ #include "llama-cparams.h" + +size_t llama_max_parallel_sequences(void) { + return LLAMA_MAX_PARALLEL_SEQUENCES; +} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 246fa5777deea..2871031ef0961 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -4,6 +4,8 @@ #include +#define LLAMA_MAX_PARALLEL_SEQUENCES 64 + struct llama_cparams { uint32_t n_ctx; // context size used during inference uint32_t n_batch; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index a2624d71589b5..ae2d2684f8cba 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -65,8 +65,6 @@ llama_kv_cache_unified::llama_kv_cache_unified( }; head = 0; - size = kv_size; - used = 0; cells.resize(kv_size); @@ -138,13 +136,9 @@ llama_kv_cache_unified::llama_kv_cache_unified( } void llama_kv_cache_unified::clear() { - for (uint32_t i = 0; i < size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - } + cells.reset(); head = 0; - used = 0; for (auto & buf : bufs) { ggml_backend_buffer_clear(buf.get(), 0); @@ -152,7 +146,7 @@ void llama_kv_cache_unified::clear() { } bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; + uint32_t new_head = cells.size(); if (p0 < 0) { p0 = 0; @@ -162,33 +156,20 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1 = std::numeric_limits::max(); } - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } - if (new_head == size) { - new_head = i; - } + if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) { + if (new_head == cells.size()) { + new_head = i; } } } // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { + if (new_head != cells.size() && new_head < head) { head = new_head; } @@ -208,49 +189,40 @@ void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id p1 = std::numeric_limits::max(); } - // otherwise, this is the KV of a Transformer-like model - head = 0; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { - cells[i].seq_id.insert(seq_id_dst); + if (cells.seq_has(i, seq_id_src)) { + cells.seq_add(i, seq_id_dst); } } } void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; + uint32_t new_head = cells.size(); - for (uint32_t i = 0; i < size; ++i) { - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ + for (uint32_t i = 0; i < cells.size(); ++i) { + if (cells.seq_keep(i, seq_id)) { + if (new_head == cells.size()) { new_head = i; } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); } } // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { + if (new_head != cells.size() && new_head < head) { head = new_head; } } -void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { +void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + if (shift == 0) { return; } - uint32_t new_head = size; + uint32_t new_head = cells.size(); if (p0 < 0) { p0 = 0; @@ -260,25 +232,19 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po p1 = std::numeric_limits::max(); } - // If there is no range then return early to avoid looping over the + // If there is no range then return early to avoid looping over all cells. if (p0 == p1) { return; } - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - cells[i].pos += delta; - cells[i].delta += delta; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } - if (cells[i].pos < 0) { - if (!cells[i].is_empty()) { - used--; - } - cells[i].pos = -1; - cells[i].seq_id.clear(); - if (new_head == size) { + if (cells.seq_has(i, seq_id)) { + if (cells.pos_add(i, shift)) { + if (new_head == cells.size()) { new_head = i; } } @@ -287,7 +253,7 @@ void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_po // If we freed up a slot, set head to it so searching can start there. // Otherwise we just start the next search from the beginning. - head = new_head != size ? new_head : 0; + head = new_head != cells.size() ? 
new_head : 0; } void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { @@ -308,15 +274,13 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po return; } - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } - { - llama_pos p_old = cells[i].pos; - cells[i].pos /= d; - cells[i].delta += cells[i].pos - p_old; - } + if (cells.seq_has(i, seq_id)) { + cells.pos_div(i, d); } } } @@ -324,9 +288,9 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { llama_pos result = std::numeric_limits::max(); - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::min(result, cells[i].pos); + for (uint32_t i = 0; i < cells.size(); ++i) { + if (cells.seq_has(i, seq_id)) { + result = std::min(result, cells.pos_get(i)); } } @@ -340,9 +304,9 @@ llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { llama_pos result = -1; - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); + for (uint32_t i = 0; i < cells.size(); ++i) { + if (cells.seq_has(i, seq_id)) { + result = std::max(result, cells.pos_get(i)); } } @@ -350,25 +314,15 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { } void llama_kv_cache_unified::restore() { - for (const auto & [id, cell] : recovery.cells) { - // TODO: move to new `struct kv_cells` - const bool is_empty0 = cells[id].is_empty(); - const bool is_empty1 = cell.is_empty(); - - if (!is_empty0 && is_empty1) { - used--; - } else if (is_empty0 && !is_empty1) { - used++; - } - - cells[id] = cell; + for (auto & state : recovery.states) { + cells.set(state.i, state.cells); } recovery.clear(); } void llama_kv_cache_unified::commit() { - if (recovery.cells.empty()) { + if (recovery.states.empty()) { LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n", __func__, "https://github.com/ggml-org/llama.cpp/pull/13194"); return; @@ -382,7 +336,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { auto * sched = lctx.get_sched(); - if (has_shift) { + if (cells.get_has_shift()) { if (!get_can_shift()) { GGML_ABORT("The current KV cache / model configuration does not support K-shift"); } @@ -406,13 +360,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { need_reserve = true; } - { - has_shift = false; - - for (uint32_t i = 0; i < size; ++i) { - cells[i].delta = 0; - } - } + cells.reset_shift(); } if (do_defrag) { @@ -443,7 +391,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { void llama_kv_cache_unified::defrag_sched(float thold) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f; + const float fragmentation = n >= 2048 ? 
std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > thold) { @@ -454,7 +402,7 @@ void llama_kv_cache_unified::defrag_sched(float thold) { } void llama_kv_cache_unified::set_full() { - n = size; + n = cells.size(); // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views. @@ -478,14 +426,14 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it - if (head > used + 2*ubatch.n_tokens) { + if (head > cells.get_used() + 2*ubatch.n_tokens) { head = 0; } // otherwise, one cell per token. - if (n_tokens > size) { - LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); + if (n_tokens > cells.size()) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size()); return false; } @@ -498,10 +446,10 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { std::string ss; if (n_swa > 0) { for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos == -1) { + if (cells.is_empty(i)) { ss += '.'; } else { - ss += std::to_string(*cells[i].seq_id.begin()); + ss += 'x'; } if (i%256 == 255) { ss += '\n'; @@ -515,15 +463,16 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { uint32_t n_tested = 0; while (true) { - if (head + n_tokens > size) { - n_tested += size - head; + if (head + n_tokens > cells.size()) { + n_tested += cells.size() - head; head = 0; continue; } bool found = true; for (uint32_t i = 0; i < n_tokens; i++) { - if (cells[head + i].pos >= 0) { + // TODO: improve to accept cells that are masked by the SWA + if (!cells.is_empty(head + i)) { found = false; head += i + 1; n_tested += i + 1; @@ -535,31 +484,27 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { break; } - if (n_tested >= size) { + if (n_tested >= cells.size()) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return false; } } - for (uint32_t i = 0; i < n_tokens; ++i) { - // remember the original state - if (recovery.cells.find(head + i) == recovery.cells.end()) { - recovery.cells[head + i] = cells[head + i]; - } + // store the old state of the cells in the recovery stack + recovery.states.push_back({head, cells.cp(head, n_tokens)}); - cells[head + i].pos = ubatch.pos[i]; + for (uint32_t i = 0; i < n_tokens; ++i) { + cells.pos_set(head + i, ubatch.pos[i]); for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) { - cells[head + i].seq_id.insert(ubatch.seq_id[i][j]); + cells.seq_add(head + i, ubatch.seq_id[i][j]); } } - used += n_tokens; - // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad))); + n = std::min(cells.size(), std::max(n_pad, GGML_PAD(cell_max(), n_pad))); #ifdef FIND_SLOT_DEBUG LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa); @@ -577,7 +522,7 @@ uint32_t llama_kv_cache_unified::get_n() const { } uint32_t llama_kv_cache_unified::get_size() const { - return size; + return cells.size(); } ggml_tensor * 
llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const { @@ -661,30 +606,19 @@ void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llam int n_attended = 0; - for (uint32_t i = 0; i < size; ++i) { - const llama_pos p0 = cells[i].pos; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.seq_has(i, seq_id)) { + continue; + } + + const llama_pos p0 = cells.pos_get(i); if (p0 <= pmin && !is_masked_swa(p0, pmin)) { n_attended++; } if (is_masked_swa(p0, pmax)) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - } + cells.seq_rm(i, seq_id); } } @@ -723,25 +657,31 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j]; for (int i = 0; i < n_kv; ++i) { - const llama_pos p0 = cells[i].pos; + float f = 0.0f; bool masked = false; - // mask the token if not the same sequence - masked = masked || (!cells[i].has_seq_id(seq_id)); + if (cells.is_empty(i)) { + masked = true; + } else { + const llama_pos p0 = cells.pos_get(i); - // mask future tokens - masked = masked || (causal_attn && p0 > p1); + // mask the token if not the same sequence + masked = masked || (!cells.seq_has(i, seq_id)); - // apply SWA if any - masked = masked || (is_masked_swa(p0, p1)); + // mask future tokens + masked = masked || (causal_attn && p0 > p1); - float f = 0.0f; + // apply SWA if any + masked = masked || (is_masked_swa(p0, p1)); + + if (!masked && hparams.use_alibi) { + f = -std::abs(p0 - p1); + } + } if (masked) { f = -INFINITY; - } else if (hparams.use_alibi) { - f = -std::abs(p0 - p1); } data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; @@ -765,8 +705,8 @@ void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { int32_t * data = (int32_t *) dst->data; - for (uint32_t i = 0; i < size; ++i) { - data[i] = cells[i].delta; + for (uint32_t i = 0; i < cells.size(); ++i) { + data[i] = cells.is_empty(i) ? 0 : cells.get_shift(i); } } @@ -783,7 +723,10 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false); + // the position when the cells is empty is irrelevant - it will be masked out later in the attention + const llama_pos p0 = cells.is_empty(i) ? 
-1 : cells.pos_get(i); + + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(p0, ubatch->pos[j], hparams.n_rel_attn_bkts, false); } } } @@ -910,7 +853,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( ggml_tensor * k = ggml_view_3d(ctx, layer.k, - n_embd_head_k, n_head_kv, size, + n_embd_head_k, n_head_kv, cells.size(), ggml_row_size(layer.k->type, n_embd_head_k), ggml_row_size(layer.k->type, n_embd_k_gqa), 0); @@ -1050,12 +993,12 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( } else { view_v_src = ggml_view_2d(ctx, layer.v, nm, n_embd_v_gqa, - ggml_row_size(layer.v->type, size), + ggml_row_size(layer.v->type, cells.size()), ggml_row_size(layer.v->type, i)); view_v_dst = ggml_view_2d(ctx, layer.v, nm, n_embd_v_gqa, - ggml_row_size(layer.v->type, size), + ggml_row_size(layer.v->type, cells.size()), ggml_row_size(layer.v->type, id)); } @@ -1076,7 +1019,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { const uint32_t n_layer = layers.size(); const uint32_t n_kv = cell_max(); - const uint32_t n_used = used; + const uint32_t n_used = cells.get_used(); assert(n_used <= n_kv); @@ -1104,9 +1047,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { ids.resize(n_kv, n_kv); for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = cells[i0]; - - if (!cell0.is_empty()) { + if (!cells.is_empty(i0)) { ids[i0] = i0; continue; @@ -1117,7 +1058,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { uint32_t nh = 1; // determine the size of the hole - while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { + while (i0 + nh < n_used && cells.is_empty(i0 + nh)) { nh++; } @@ -1126,9 +1067,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // starting from the end, find nh non-empty cells for (; is > i0; --is) { - const auto & cell1 = cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { + if (cells.is_empty(is) || ids[is] != n_kv) { continue; } @@ -1155,9 +1094,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { // go back and move the nf cells to the hole for (; i1 < n_kv; ++i1) { - auto & cell1 = cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { + if (cells.is_empty(i1) || ids[i1] != n_kv) { if (n_moves == max_moves) { stop = true; break; @@ -1171,10 +1108,8 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { ids[i1] = i0 + nf; // move the cell meta data - cells[i0 + nf] = cell1; + cells.mv(i1, i0 + nf); - // clear the old cell and move the head there - cell1 = kv_cell(); head = n_used; if (!cont) { @@ -1210,10 +1145,8 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { } uint32_t llama_kv_cache_unified::cell_max() const { - for (uint32_t i = size; i > 0; --i) { - const kv_cell & cell = cells[i - 1]; - - if (cell.pos >= 0 && !cell.is_empty()) { + for (uint32_t i = cells.size(); i > 0; --i) { + if (!cells.is_empty(i - 1)) { return i; } } @@ -1222,9 +1155,7 @@ uint32_t llama_kv_cache_unified::cell_max() const { } bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const { - if (p0 < 0) { - return true; - } + assert(p0 >= 0 && p1 >= 0); switch (swa_type) { case LLAMA_SWA_TYPE_NONE: @@ -1255,23 +1186,24 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq // Count the number of cells with the specified seq_id // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = size; - for (uint32_t i = 0; i < size; ++i) { - 
const auto & cell = cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + uint32_t cell_range_begin = cells.size(); + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) { ++cell_count; - if (cell_range_begin == size) { + if (cell_range_begin == cells.size()) { cell_range_begin = i; } } else { - if (cell_range_begin != size) { + if (cell_range_begin != cells.size()) { cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = size; + cell_range_begin = cells.size(); } } } - if (cell_range_begin != size) { - cell_ranges.emplace_back(cell_range_begin, size); + + if (cell_range_begin != cells.size()) { + cell_ranges.emplace_back(cell_range_begin, cells.size()); } // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count @@ -1308,17 +1240,24 @@ void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_i void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { for (const auto & range : cell_ranges) { for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + std::vector seq_ids; + + for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) { + if (cur == seq_id || seq_id == -1) { + if (cells.seq_has(i, cur)) { + seq_ids.push_back(cur); + } + } + } + + const llama_pos pos = cells.pos_get(i); + const uint32_t n_seq_id = seq_ids.size(); io.write(&pos, sizeof(pos)); io.write(&n_seq_id, sizeof(n_seq_id)); - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - io.write(&seq_id, sizeof(seq_id)); - } + for (const auto & seq_id : seq_ids) { + io.write(&seq_id, sizeof(seq_id)); } } } @@ -1379,7 +1318,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std:: } } else { // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = size; + const uint32_t kv_size = cells.size(); for (const auto & layer : layers) { const uint32_t il = layer.il; @@ -1429,14 +1368,20 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell io.read_to(&pos, sizeof(pos)); io.read_to(&n_seq_id, sizeof(n_seq_id)); - if (n_seq_id != 0) { + if (n_seq_id != 1) { LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); return false; } - batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i] = &dest_seq_id; + // read the sequence id, but directly discard it - we will use dest_seq_id instead + { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + } + + batch.pos[i] = pos; + batch.n_seq_id[i] = n_seq_id; + batch.seq_id[i] = &dest_seq_id; } if (!find_slot(batch)) { @@ -1448,15 +1393,15 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells - GGML_ASSERT(head + cell_count <= size); - GGML_ASSERT(cells[head].pos == batch.pos[0]); - GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); - GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + GGML_ASSERT(head + cell_count <= cells.size()); + GGML_ASSERT(cells.pos_get(head) == batch.pos[0]); + 
GGML_ASSERT(cells.pos_get(head + cell_count - 1) == batch.pos[cell_count - 1]); + GGML_ASSERT(cells.seq_has(head, dest_seq_id)); + GGML_ASSERT(cells.seq_has(head + cell_count - 1, dest_seq_id)); } else { // whole KV cache restore - if (cell_count > size) { + if (cell_count > cells.size()) { LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); return false; } @@ -1464,15 +1409,13 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell clear(); for (uint32_t i = 0; i < cell_count; ++i) { - kv_cell & cell = cells[i]; - llama_pos pos; uint32_t n_seq_id; io.read_to(&pos, sizeof(pos)); io.read_to(&n_seq_id, sizeof(n_seq_id)); - cell.pos = pos; + cells.pos_set(i, pos); for (uint32_t j = 0; j < n_seq_id; ++j) { llama_seq_id seq_id; @@ -1483,12 +1426,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell return false; } - cell.seq_id.insert(seq_id); + cells.seq_add(i, seq_id); } } head = 0; - used = cell_count; } return true; @@ -1505,8 +1447,8 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size()); return false; } - if (cell_count > size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + if (cell_count > cells.size()) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, cells.size()); return false; } if (this->v_trans != (bool) v_trans) { @@ -1609,7 +1551,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell if (cell_count) { // For each row in the transposed matrix, read the values for the whole cell range for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (head + j * size) * v_size_el; + const size_t dst_offset = (head + j * cells.size()) * v_size_el; ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); } } @@ -1689,9 +1631,9 @@ void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) { kv_swa ->seq_keep(seq_id); } -void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - kv_base->seq_add(seq_id, p0, p1, delta); - kv_swa ->seq_add(seq_id, p0, p1, delta); +void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + kv_base->seq_add(seq_id, p0, p1, shift); + kv_swa ->seq_add(seq_id, p0, p1, shift); } void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { @@ -2063,8 +2005,8 @@ void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) { } } -void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { +void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + if (shift == 0) { return; } @@ -2087,7 +2029,7 @@ void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_ if (tail_id >= 0) { kv_cell & cell = cells[tail_id]; if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; + cell.pos += shift; } } } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 191a1090a1252..86a96820e2420 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -4,6 +4,7 @@ #include "llama-io.h" #include "llama-graph.h" #include "llama-memory.h" 
+#include "llama-kv-cells.h" #include "ggml-cpp.h" @@ -35,6 +36,7 @@ struct llama_kv_cache : public llama_memory_i { virtual void defrag_sched(float thold) = 0; // simulate full cache, used for allocating worst-case compute buffers + // TODO: remove virtual void set_full() = 0; // @@ -42,7 +44,7 @@ struct llama_kv_cache : public llama_memory_i { // // ============================================================================================================= - // TODO: refactor and simplify this + // TODO: refactor and simplify this [TAG: KV_API] virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0; @@ -121,7 +123,7 @@ class llama_kv_cache_unified : public llama_kv_cache { bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; llama_pos seq_pos_min(llama_seq_id seq_id) const override; @@ -159,7 +161,7 @@ class llama_kv_cache_unified : public llama_kv_cache { // llama_kv_cache_unified specific API // - uint32_t get_n() const; + uint32_t get_n() const; uint32_t get_size() const; // get views of the current state of the cache @@ -180,26 +182,6 @@ class llama_kv_cache_unified : public llama_kv_cache { const llama_model & model; const llama_hparams & hparams; - struct kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - - // TODO: replace with bitset uint64_t - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } - - bool is_empty() const { - return seq_id.empty(); - } - - bool is_same_seq(const kv_cell & other) const { - return seq_id == other.seq_id; - } - }; - struct kv_layer { // layer index in the model // note: can be different from the layer index in the KV cache @@ -209,15 +191,13 @@ class llama_kv_cache_unified : public llama_kv_cache { ggml_tensor * v; }; - bool has_shift = false; bool do_defrag = false; bool v_trans = true; // the value tensor is transposed uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot()) - uint32_t size = 0; // total number of cells, shared across all sequences - uint32_t used = 0; // used cells (i.e. 
at least one seq_id) (TODO: add `struct kv_cells` and keep track automaticallt) // computed before each graph build + // TODO: cells should start to maintain this value dynamically based on the edits uint32_t n = 0; const uint32_t n_seq_max = 1; @@ -233,19 +213,29 @@ class llama_kv_cache_unified : public llama_kv_cache { std::vector ctxs; std::vector bufs; - std::vector cells; // TODO: replace with `struct kv_cells` + llama_kv_cells_unified cells; + std::vector layers; // model layer id -> KV cache layer id std::unordered_map map_layer_ids; // recovery information used to restore the KV cells to their original state in case of a failure + // TODO: do not store as a state in the llama_kv_cache object, instead return upon batch preparation + // to achieve that, first need to refactor the llama_kv_cache interface [TAG: KV_API] struct { void clear() { - cells.clear(); + states.clear(); } - std::unordered_map cells; + struct state { + uint32_t i; + + llama_kv_cells_unified cells; + }; + + // stack with the partial states before each ubatch + std::vector states; } recovery; // defrag @@ -257,6 +247,7 @@ class llama_kv_cache_unified : public llama_kv_cache { bool defrag_prepare(int32_t n_max_nodes); // find how many cells are currently in use + // TODO: optimize uint32_t cell_max() const; size_t total_size() const; @@ -325,7 +316,7 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache { bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; llama_pos seq_pos_min(llama_seq_id seq_id) const override; @@ -431,7 +422,7 @@ class llama_kv_cache_recurrent : public llama_kv_cache { bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; llama_pos seq_pos_min(llama_seq_id seq_id) const override; diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h new file mode 100644 index 0000000000000..138545533ed22 --- /dev/null +++ b/src/llama-kv-cells.h @@ -0,0 +1,273 @@ +#pragma once + +#include "llama.h" +#include "llama-cparams.h" + +#include +#include +#include + +// meta information about KV cells that can be part of multiple sequences at the same time +// TODO: add unit tests +class llama_kv_cells_unified { +public: + void reset() { + for (uint32_t i = 0; i < pos.size(); ++i) { + pos[i] = -1; + shift[i] = 0; + seq[i].reset(); + } + + used = 0; + has_shift = false; + } + + void reset_shift() { + has_shift = false; + + for (uint32_t i = 0; i < shift.size(); ++i) { + shift[i] = 0; + } + } + + uint32_t size() const { + return pos.size(); + } + + void resize(uint32_t n) { + pos.resize(n); + shift.resize(n); + seq.resize(n); + + reset(); + } + + bool is_empty(uint32_t i) const { + assert(i < pos.size()); + assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0); + + return 
pos[i] == -1; + } + + uint32_t get_used() const { + return used; + } + + bool get_has_shift() const { + return has_shift; + } + + // move cell isrc to idst (used during defrag) + void mv(uint32_t isrc, uint32_t idst) { + assert(isrc < pos.size()); + assert(idst < pos.size()); + + pos [idst] = pos [isrc]; + shift[idst] = shift[isrc]; + seq [idst] = seq [isrc]; + + pos [isrc] = -1; + shift[isrc] = 0; + seq [isrc].reset(); + } + + // copy the state of cells [i, i + n) (used for save/restore the state of the cells) + llama_kv_cells_unified cp(uint32_t i, uint32_t n) const { + assert(i + n <= pos.size()); + + llama_kv_cells_unified res; + + res.resize(n); + + for (uint32_t j = 0; j < n; ++j) { + res.pos[j] = pos[i + j]; + res.seq[j] = seq[i + j]; + + assert(shift[i + j] == 0); + } + + return res; + } + + // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells) + void set(uint32_t i, const llama_kv_cells_unified & other) { + assert(i + other.pos.size() <= pos.size()); + + for (uint32_t j = 0; j < other.pos.size(); ++j) { + if (pos[i + j] == -1 && other.pos[j] != -1) { + used++; + } + + if (pos[i + j] != -1 && other.pos[j] == -1) { + used--; + } + + pos[i + j] = other.pos[j]; + seq[i + j] = other.seq[j]; + + assert(shift[i + j] == 0); + } + } + + // note: call only if the cell has seq_id + // return true if the cell becomes empty + bool seq_rm(uint32_t i, llama_seq_id seq_id) { + assert(i < pos.size()); + assert(seq[i].test(seq_id)); + assert(pos[i] != -1); + assert(seq_id >= 0); + + seq[i].reset(seq_id); + + if (seq[i].none()) { + pos[i] = -1; + + used--; + + return true; + } + + return false; + } + + // return true if the cell becomes empty (i.e. it did not contain seq_id before the call) + bool seq_keep(uint32_t i, llama_seq_id seq_id) { + assert(i < pos.size()); + + if (seq[i].test(seq_id)) { + seq[i].reset(); + seq[i].set(seq_id); + + return false; + } + + if (seq[i].any()) { + seq[i].reset(); + pos[i] = -1; + + used--; + + return true; + } + + assert(pos[i] == -1); + + return false; + } + + bool seq_has(uint32_t i, llama_seq_id seq_id) const { + assert(i < pos.size()); + assert(seq_id >= 0); + + return seq[i].test(seq_id); + } + + // note: call only if the cell is not empty and the seq_id is not in the cell + void seq_add(uint32_t i, llama_seq_id seq_id) { + assert(i < pos.size()); + assert(pos[i] != -1); + assert(!seq[i].test(seq_id)); + + seq[i].set(seq_id); + } + + // note: call only if the cell is not empty + llama_pos pos_get(uint32_t i) const { + assert(i < pos.size()); + assert(pos[i] != -1); + + return pos[i]; + } + + // note: call only if the cell is not empty + llama_pos get_shift(uint32_t i) const { + assert(i < pos.size()); + assert(pos[i] != -1); + + return shift[i]; + } + + // check if a cell is not empty and its position is within [p0, p1) + bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const { + assert(i < pos.size()); + + return pos[i] >= p0 && pos[i] < p1; + } + + // set the position of an empty cell + // does not modify "has_shift" + // note: call only if the cell is empty + void pos_set(uint32_t i, llama_pos p) { + assert(i < pos.size()); + assert(pos[i] == -1); + + pos[i] = p; + used++; + } + + // pos[i] = pos[i] + d + // sets "has_shift" to true + // note: call only if the cell is not empty + bool pos_add(uint32_t i, llama_pos d) { + assert(i < pos.size()); + assert(pos[i] != -1); + + pos[i] += d; + shift[i] += d; + + has_shift = true; + + if (pos[i] < 0) { + pos[i] = -1; + seq[i].reset(); + + used--; + + return true; + } 
+ + return false; + } + + // pos[i] = pos[i] / d + // sets "has_shift" to true + // note: call only if the cell is not empty + void pos_div(uint32_t i, int d) { + assert(i < pos.size()); + assert(pos[i] != -1); + + const llama_pos p_old = pos[i]; + + pos[i] /= d; + shift[i] += p_old - pos[i]; + + has_shift = true; + } + +private: + uint32_t used = 0; // used cells (i.e. pos[i] != -1, allowed to not have any seq_id) + + bool has_shift = false; + + std::vector pos; + + // this array accumulates any applied shifts to the pos array since the last reset_shift() call + // this is used to queue multiple updates to the pos array, which in the end can be applied in one go: + // + // cells.pos_add(x, shift_x); + // cells.pos_div(y, shift_y); + // ... + // + // if (cells.has_shift()) { + // for (int i = 0; i < n; ++i) { + // auto shift_i = cells.get_shift(i); + // ... + // } + // cells.reset_shift(); + // } + // + std::vector shift; + + std::vector> seq; +}; + diff --git a/src/llama-memory.h b/src/llama-memory.h index c2571edc715e1..a2d250434affa 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -22,7 +22,7 @@ class llama_memory_i { virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; virtual void seq_keep(llama_seq_id seq_id) = 0; - virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0; + virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0; virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 81b052e1b1a47..e99f5309f9904 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2489,7 +2489,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 00466b9ba02ec..62a9f5842bca8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -92,6 +92,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS 
${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index dfcdce350ba86..fb048022a06c4 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -737,14 +737,14 @@ static void test_template_output_parsers() { auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); } { auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); assert_equals( COMMON_CHAT_FORMAT_HERMES_2_PRO, diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 15ec3db906477..27ce8c43f678c 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -107,6 +107,7 @@ // ultravox #define TN_CONV1D "a.conv1d.%d.%s" #define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s" +#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer #define TN_MM_NORM_PRE "mm.a.norm_pre.%s" #define TN_MM_NORM_MID "mm.a.norm_mid.%s" @@ -128,6 +129,7 @@ enum projector_type { PROJECTOR_TYPE_ULTRAVOX, PROJECTOR_TYPE_INTERNVL, PROJECTOR_TYPE_LLAMA4, + PROJECTOR_TYPE_QWEN2A, PROJECTOR_TYPE_UNKNOWN, }; @@ -145,6 +147,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, { PROJECTOR_TYPE_INTERNVL, "internvl"}, { PROJECTOR_TYPE_LLAMA4, "llama4"}, + { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 34c4ce84275fa..6205dad5ae262 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -254,7 +254,9 @@ struct clip_vision_model { ggml_tensor * post_ln_w; ggml_tensor * post_ln_b; - ggml_tensor * projection; + ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; // LLaVA projection ggml_tensor * mm_input_norm_w = nullptr; @@ -1471,48 +1473,58 @@ struct clip_graph { cb(cur, "after_transformer", -1); - // StackAudioFrames - // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py - { - int64_t stride = n_embd * hparams.proj_stack_factor; - int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); - int64_t pad = padded_len - ggml_nelements(cur); - if (pad > 0) { - cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); - cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + { + int64_t stride = n_embd * hparams.proj_stack_factor; + int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); + int64_t pad = padded_len - ggml_nelements(cur); + if (pad > 0) { + cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + cur = 
ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); } - cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, - ggml_row_size(cur->type, stride), 0); - } - cb(cur, "after_stacked", -1); + cb(cur, "after_stacked", -1); - // UltravoxProjector - { - // pre-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + // UltravoxProjector + { + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); - // ffn in - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); - // swiglu - { - int64_t split_point = cur->ne[0] / 2; - ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); - ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + // swiglu + { + int64_t split_point = cur->ne[0] / 2; + ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + x1 = ggml_silu(ctx0, x1); + cur = ggml_mul(ctx0, x0, x1); + } - // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half - x1 = ggml_silu(ctx0, x1); - cur = ggml_mul(ctx0, x0, x1); + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); } - // mid-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); - // ffn out - cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + } else { + GGML_ABORT("%s: unknown projector type", __func__); } cb(cur, "projected", -1); @@ -1655,6 +1667,17 @@ struct clip_graph { inpL = cur; } + // TODO @ngxson : find a way to move this outside + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; + } + // post-layernorm if (model.post_ln_w) { inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); @@ -1952,6 +1975,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 res = graph.build_llama4(); } break; case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: { res = graph.build_whisper_enc(); } break; @@ -2186,8 +2210,10 @@ struct clip_model_loader { }; } break; case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: { - get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor); + bool require_stack = ctx_clip.proj_type == PROJECTOR_TYPE_ULTRAVOX; + get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack); if (hparams.n_mel_bins != 128) { throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__)); } @@ -2266,7 +2292,7 @@ struct clip_model_loader { return cur; }; - auto & vision_model = ctx_clip.vision_model; + auto & vision_model = ctx_clip.vision_model; // TODO: rename 
this to just "model" vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false); @@ -2463,6 +2489,15 @@ struct clip_model_loader { vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight")); } break; + case PROJECTOR_TYPE_QWEN2A: + { + vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + vision_model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight")); + vision_model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias")); + } break; case PROJECTOR_TYPE_INTERNVL: { vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); @@ -3450,6 +3485,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor; const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor); n_patches = n_len / proj_stack_factor / 2; + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) { + // divide by 2 because of whisper + // another divide by 2 because of nn.AvgPool1d(2, stride=2) + n_patches = img->nx / 4; } return n_patches; @@ -3850,6 +3889,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_ULTRAVOX: { // do nothing @@ -3910,7 +3950,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int n_tokens_out = embeddings->ne[1]; const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); if (n_tokens_out != expected_n_tokens_out) { - LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); GGML_ABORT("Invalid number of output tokens"); } @@ -3955,6 +3995,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_3_w->ne[1]; case PROJECTOR_TYPE_LLAMA4: return ctx->vision_model.mm_model_proj->ne[1]; + case PROJECTOR_TYPE_QWEN2A: + return ctx->vision_model.mm_fc_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } @@ -3991,6 +4033,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.has_audio; } +bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { + return ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type == PROJECTOR_TYPE_QWEN2A; +} + bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 94267131628a6..5abfcd1a3c418 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -4,6 +4,8 @@ #include #include +// !!! Internal header, to be used by mtmd only !!! 
+ struct clip_ctx; struct clip_image_size { @@ -99,3 +101,4 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); +bool clip_has_whisper_encoder(const struct clip_ctx * ctx); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d3f3cf3a061de..c3be91265f331 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -146,6 +146,13 @@ struct mtmd_context { throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname)); } + if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) { + throw std::runtime_error(string_format( + "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n" + "hint: you may be using wrong mmproj\n", + llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip))); + } + has_vision = clip_has_vision_encoder(ctx_clip); has_audio = clip_has_audio_encoder(ctx_clip); use_mrope = clip_is_qwen2vl(ctx_clip); @@ -196,7 +203,7 @@ struct mtmd_context { ov_img_first = false; // overview image is last } - if (proj == PROJECTOR_TYPE_ULTRAVOX) { + if (clip_has_whisper_encoder(ctx_clip)) { // TODO @ngxson : check if model n_mel is 128 or 80 w_filters = whisper_precalc_filters::get_128_bins(); } @@ -208,7 +215,7 @@ struct mtmd_context { } if (has_audio) { LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n" - " https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__); + " https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__); } } @@ -327,6 +334,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx, marker_modified = "" + ctx->media_marker + ""; string_replace_all(prompt_modified, ctx->media_marker, marker_modified); + } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { + // <|audio_bos|> ... (embeddings) ... 
<|audio_eos|> + marker_modified = "<|audio_bos|>" + ctx->media_marker + "<|audio_eos|>"; + string_replace_all(prompt_modified, ctx->media_marker, marker_modified); + } // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 2c722b012ea05..b53f215a2fafd 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -203,6 +203,8 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk); // get output embeddings from the last encode pass +// the reading size (in bytes) is equal to: +// llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); ///////////////////////////////////////// diff --git a/tools/rpc/rpc-server.cpp b/tools/rpc/rpc-server.cpp index 581c74018c877..b4b7c47489524 100644 --- a/tools/rpc/rpc-server.cpp +++ b/tools/rpc/rpc-server.cpp @@ -111,7 +111,7 @@ static std::string fs_get_cache_directory() { if (getenv("LLAMA_CACHE")) { cache_directory = std::getenv("LLAMA_CACHE"); } else { -#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) +#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); } else { diff --git a/tools/server/README.md b/tools/server/README.md index 0b84966ae86d7..06533c172e530 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -173,7 +173,8 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled
(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 3f1d3f31dcbf9..8d4e392ff3315 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9f0b0ffaa6e1e..07b613122e321 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -178,7 +178,7 @@ struct slot_params { {"grammar_triggers", grammar_triggers}, {"preserved_tokens", sampling.preserved_tokens}, {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")}, + {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, {"samplers", samplers}, @@ -357,7 +357,7 @@ struct server_task { auto it = data.find("chat_format"); if (it != data.end()) { params.oaicompat_chat_syntax.format = static_cast(it->get()); - SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format).c_str()); + SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format)); } else { params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; } @@ -2089,6 +2089,7 @@ struct server_context { /* common_chat_templates */ chat_templates.get(), /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, + /* enable_thinking */ params_base.reasoning_budget != 0, }; } diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py index cf9f96a7fbc52..c53eda5b88445 100644 --- a/tools/server/tests/unit/test_template.py +++ b/tools/server/tests/unit/test_template.py @@ -25,6 +25,40 @@ def create_server(): server.n_slots = 1 +@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) +@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [ + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "\n"), + ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "\n"), + + ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"), + ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n\n\n\n\n"), + + ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n\n"), + ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n\n"), + + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"), + ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"), +]) +def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]): + global server + server.jinja = True + server.reasoning_budget = reasoning_budget + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + + res = server.make_request("POST", "/apply-template", data={ + "messages": [ + {"role": "user", "content": "What is today?"}, + ], + "tools": tools, + }) + assert res.status_code == 200 + prompt = res.body["prompt"] + + assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'" + + @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]]) 
@pytest.mark.parametrize("template_name,format", [ ("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"), @@ -47,3 +81,28 @@ def test_date_inside_prompt(template_name: str, format: str, tools: list[dict]): today_str = datetime.date.today().strftime(format) assert today_str in prompt, f"Expected today's date ({today_str}) in content ({prompt})" + + +@pytest.mark.parametrize("add_generation_prompt", [False, True]) +@pytest.mark.parametrize("template_name,expected_generation_prompt", [ + ("meta-llama-Llama-3.3-70B-Instruct", "<|start_header_id|>assistant<|end_header_id|>"), +]) +def test_add_generation_prompt(template_name: str, expected_generation_prompt: str, add_generation_prompt: bool): + global server + server.jinja = True + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + + res = server.make_request("POST", "/apply-template", data={ + "messages": [ + {"role": "user", "content": "What is today?"}, + ], + "add_generation_prompt": add_generation_prompt, + }) + assert res.status_code == 200 + prompt = res.body["prompt"] + + if add_generation_prompt: + assert expected_generation_prompt in prompt, f"Expected generation prompt ({expected_generation_prompt}) in content ({prompt})" + else: + assert expected_generation_prompt not in prompt, f"Did not expect generation prompt ({expected_generation_prompt}) in content ({prompt})" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index b480801b17abb..11672f515df1d 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -84,7 +84,8 @@ class ServerProcess: draft_max: int | None = None no_webui: bool | None = None jinja: bool | None = None - reasoning_format: Literal['deepseek', 'none'] | None = None + reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None + reasoning_budget: int | None = None chat_template: str | None = None chat_template_file: str | None = None server_path: str | None = None @@ -191,6 +192,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.append("--jinja") if self.reasoning_format is not None: server_args.extend(("--reasoning-format", self.reasoning_format)) + if self.reasoning_budget is not None: + server_args.extend(("--reasoning-budget", self.reasoning_budget)) if self.chat_template: server_args.extend(["--chat-template", self.chat_template]) if self.chat_template_file: diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 91efcfef06772..fc9f7071e3fcb 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -568,6 +568,7 @@ struct oaicompat_parser_options { common_chat_templates * tmpls; bool allow_image; bool allow_audio; + bool enable_thinking = true; }; // used by /chat/completions endpoint @@ -731,7 +732,9 @@ static json oaicompat_chat_params_parse( inputs.grammar = grammar; inputs.use_jinja = opt.use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); inputs.reasoning_format = opt.reasoning_format; + inputs.enable_thinking = opt.enable_thinking; if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); } diff --git a/tools/server/webui/src/components/useChatExtraContext.tsx b/tools/server/webui/src/components/useChatExtraContext.tsx index 42765524067e2..6f07012907335 100644 --- 
a/tools/server/webui/src/components/useChatExtraContext.tsx +++ b/tools/server/webui/src/components/useChatExtraContext.tsx @@ -46,8 +46,11 @@ export function useChatExtraContext(): ChatExtraContextApi { try { for (const file of files) { const mimeType = file.type; - if (file.size > 10 * 1024 * 1024) { - toast.error('File is too large. Maximum size is 10MB.'); + + // this limit is only to prevent accidental uploads of huge files + // it can potentially crash the browser because we read the file as base64 + if (file.size > 500 * 1024 * 1024) { + toast.error('File is too large. Maximum size is 500MB.'); break; }