2 changes: 1 addition & 1 deletion .devops/cann.Dockerfile
@@ -4,7 +4,7 @@
 
 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
 
 # ==============================================================================
 # BUILD STAGE
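For reference, a minimal sketch of building this image after the base bump; the `llama-cpp-cann` tag is a placeholder, and `CHIP_TYPE` takes the same values the CI matrix uses:

```sh
# Build against the default 910b base (CANN 8.5.0)
docker build -f .devops/cann.Dockerfile -t llama-cpp-cann:910b .

# Or target 310p by overriding the build arg
docker build -f .devops/cann.Dockerfile --build-arg CHIP_TYPE=310p -t llama-cpp-cann:310p .
```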
2 changes: 1 addition & 1 deletion .devops/llama-cli-cann.Dockerfile
@@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
+ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
 
 FROM ascendai/cann:$ASCEND_VERSION AS build
 
2 changes: 2 additions & 0 deletions .devops/nix/package.nix
@@ -41,6 +41,7 @@
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
   enableStatic ? effectiveStdenv.hostPlatform.isStatic,
   precompileMetalShaders ? false,
+  useWebUi ? true,
 }:
 
 let
@@ -164,6 +165,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   cmakeFlags =
     [
       (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
       (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
       (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
       (cmakeBool "GGML_NATIVE" false)
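A sketch of how a downstream consumer might flip the new `useWebUi` argument off; the flake attribute path below is an assumption for illustration, not something this diff pins down:

```sh
# Hypothetical override: build the package without the embedded Web UI,
# assuming the derivation is produced via callPackage so .override exists
nix build --impure --expr \
  '(builtins.getFlake (toString ./.)).packages.${builtins.currentSystem}.default.override { useWebUi = false; }'
```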
2 changes: 1 addition & 1 deletion .github/workflows/build-cann.yml
@@ -63,7 +63,7 @@ jobs:
       - name: Set container image
         id: cann-image
         run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
           echo "image=${image}" >> "${GITHUB_OUTPUT}"
 
       - name: Pull container image
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -907,7 +907,7 @@ jobs:
       - name: Set container image
         id: cann-image
         run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
           echo "image=${image}" >> "${GITHUB_OUTPUT}"
 
       - name: Pull container image
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -108,6 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
 option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
 option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
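Since the new option defaults to `ON`, existing builds are unchanged; a headless server build would disable it at configure time, roughly:

```sh
# Configure and build llama-server without the embedded Web UI
cmake -B build -DLLAMA_BUILD_SERVER=ON -DLLAMA_BUILD_WEBUI=OFF
cmake --build build --target llama-server
```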
11 changes: 10 additions & 1 deletion common/arg.cpp
@@ -1079,7 +1079,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.verbose_prompt = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--display-prompt"},
         {"--no-display-prompt"},
@@ -2843,6 +2843,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.webui_mcp_proxy = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
+    add_opt(common_arg(
+        {"--tools"}, "TOOL1,TOOL2,...",
+        "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
+        "specify \"all\" to enable all tools\n"
+        "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
+        [](common_params & params, const std::string & value) {
+            params.server_tools = parse_csv_row(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
     add_opt(common_arg(
         {"--webui"},
         {"--no-webui"},
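Going by the help text above, usage would look roughly like this (`model.gguf` is a placeholder); the same list can also come from the `LLAMA_ARG_TOOLS` environment variable:

```sh
# Enable only read-only tools; per the warning above, avoid exec_shell_command,
# write_file, edit_file and apply_diff in untrusted environments
llama-server -m model.gguf --tools read_file,file_glob_search,grep_search

# Equivalent via the environment, enabling everything
LLAMA_ARG_TOOLS=all llama-server -m model.gguf
```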
3 changes: 3 additions & 0 deletions common/common.h
@@ -613,6 +613,9 @@ struct common_params {
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
+    // enable built-in tools
+    std::vector<std::string> server_tools;
+
     // router server configs
     std::string models_dir = ""; // directory containing models for the router server
     std::string models_preset = ""; // directory containing model presets for the router server
23 changes: 12 additions & 11 deletions common/reasoning-budget.cpp
@@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
             break;
         }
         case REASONING_BUDGET_FORCING:
-            // force_pos is advanced in apply(), not here.
-            // This ensures the first forced token isn't skipped when the sampler
-            // is initialized directly in FORCING state (e.g. COUNTING + budget=0)
+            ctx->force_pos++;
+            if (ctx->force_pos >= ctx->forced_tokens.size()) {
+                ctx->state = REASONING_BUDGET_DONE;
+                LOG_INF("reasoning-budget: forced sequence complete, done\n");
+            }
             break;
         case REASONING_BUDGET_DONE:
             break;
@@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
             cur_p->data[i].logit = -INFINITY;
         }
     }
-
-    // advance to next forced token (done here rather than in accept so that
-    // the first forced token isn't skipped when starting in FORCING state)
-    ctx->force_pos++;
-    if (ctx->force_pos >= ctx->forced_tokens.size()) {
-        ctx->state = REASONING_BUDGET_DONE;
-        LOG_INF("reasoning-budget: forced sequence complete, done\n");
-    }
 }
 
 static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
@@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init(
     common_reasoning_budget_state initial_state) {
     return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
 }
+
+common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
+    if (!smpl) {
+        return REASONING_BUDGET_IDLE;
+    }
+    return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
+}
2 changes: 2 additions & 0 deletions common/reasoning-budget.h
@@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init(
     const std::vector<llama_token> & forced_tokens,
     int32_t budget,
     common_reasoning_budget_state initial_state);
+
+common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
56 changes: 46 additions & 10 deletions common/sampling.cpp
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <cctype>
+#include <climits>
 #include <cmath>
 #include <cstring>
 #include <unordered_map>
@@ -109,6 +110,7 @@ struct common_sampler {
     common_params_sampling params;
 
     struct llama_sampler * grmr;
+    struct llama_sampler * rbudget;
     struct llama_sampler * chain;
 
     ring_buffer<llama_token> prev;
@@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
     lparams.no_perf = params.no_perf;
 
     llama_sampler * grmr = nullptr;
+    llama_sampler * rbudget = nullptr;
     llama_sampler * chain = llama_sampler_chain_init(lparams);
 
     std::vector<llama_sampler *> samplers;
@@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         }
     }
 
-    if (grmr) {
+    if (grmr && !params.grammar_lazy) {
         try {
             for (const auto & token : prefill_tokens) {
                 llama_sampler_accept(grmr, token);
@@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         }
     }
 
-    // reasoning budget sampler — added first so it can force tokens before other samplers
-    if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
-        samplers.push_back(common_reasoning_budget_init(
+    // reasoning budget sampler
+    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
+        rbudget = common_reasoning_budget_init(
             vocab,
             params.reasoning_budget_start,
             params.reasoning_budget_end,
             params.reasoning_budget_forced,
-            params.reasoning_budget_tokens,
-            prefill_tokens));
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
+            prefill_tokens);
     }
 
     if (params.has_logit_bias()) {
@@ -383,6 +386,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
     auto * result = new common_sampler {
         /* .params = */ params,
         /* .grmr = */ grmr,
+        /* .rbudget = */ rbudget,
         /* .chain = */ chain,
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
@@ -398,18 +402,39 @@ void common_sampler_free(struct common_sampler * gsmpl) {
     }
 
     llama_sampler_free(gsmpl->grmr);
+    llama_sampler_free(gsmpl->rbudget);
     llama_sampler_free(gsmpl->chain);
 
     delete gsmpl;
 }
+
+static bool grammar_should_apply(struct common_sampler * gsmpl) {
+    if (!gsmpl->grmr) {
+        return false;
+    }
+    if (!gsmpl->rbudget) {
+        return true;
+    }
+    if (gsmpl->params.grammar_lazy) {
+        // if grammar is lazy, only apply when reasoning budget is not active
+        const auto state = common_reasoning_budget_get_state(gsmpl->rbudget);
+        return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE;
+    }
+    return true;
+}
+
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     if (!gsmpl) {
         return;
     }
 
     const auto tm = gsmpl->tm();
 
+    // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
+    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
+
+    llama_sampler_accept(gsmpl->rbudget, token);
 
     if (gsmpl->grmr && accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -431,6 +456,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params = */ gsmpl->params,
         /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
+        /* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
         /* .chain = */ llama_sampler_clone(gsmpl->chain),
         /* .prev = */ gsmpl->prev,
         /* .cur = */ gsmpl->cur,
@@ -500,6 +526,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     llama_token id = LLAMA_TOKEN_NULL;
 
     auto & grmr = gsmpl->grmr;
+    auto & rbudget = gsmpl->rbudget;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
@@ -511,7 +538,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     if (id != LLAMA_TOKEN_NULL) {
         LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
 
-        GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+        GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
+        GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
 
         // TODO: simplify
         gsmpl->cur.resize(1);
@@ -524,15 +552,18 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 
     gsmpl->set_logits(ctx, idx);
 
-    if (grammar_first) {
+    // apply reasoning budget first
+    llama_sampler_apply(rbudget, &cur_p);
+
+    if (grammar_first && grammar_should_apply(gsmpl)) {
         llama_sampler_apply(grmr, &cur_p);
     }
 
     llama_sampler_apply(chain, &cur_p);
 
     id = cur_p.data[cur_p.selected].id;
 
-    if (grammar_first) {
+    if (grammar_first || !grammar_should_apply(gsmpl)) {
         return id;
     }
 
@@ -553,7 +584,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
     gsmpl->set_logits(ctx, idx);
 
-    llama_sampler_apply(grmr, &cur_p);
+    llama_sampler_apply(rbudget, &cur_p);
+
+    if (grammar_should_apply(gsmpl)) {
+        llama_sampler_apply(grmr, &cur_p);
+    }
+
     llama_sampler_apply(chain, &cur_p);
 
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
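Net effect of `grammar_should_apply()`: a lazy grammar is suspended while the reasoning-budget sampler is counting or forcing tokens, and resumes once the state returns to IDLE or DONE. Assuming the existing `--grammar-file` and `--reasoning-budget` flags wire into these samplers (not confirmed by this diff), a combined run might look like:

```sh
# Hypothetical: cap reasoning at 256 tokens while constraining the final
# answer with a lazy grammar; flag semantics here are assumptions
llama-cli -m model.gguf --reasoning-budget 256 --grammar-file answer.gbnf -p "..."
```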
2 changes: 2 additions & 0 deletions convert_hf_to_gguf.py
@@ -7150,6 +7150,8 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
             return gguf.GGMLQuantizationType.F32
         if ".rel_pos_h" in name or '.rel_pos_w' in name:
             return gguf.GGMLQuantizationType.F32
+        if ".neck." in name or ".net_" in name:
+            return gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
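The new check keeps tensors whose names contain `.neck.` or `.net_` in F32 even when converting to a lower-precision output type. A typical run where `tensor_force_quant` takes effect (paths are placeholders):

```sh
# Convert to an F16 GGUF; tensors matched above are still written as F32
python convert_hf_to_gguf.py /path/to/hf-model --outtype f16 --outfile model-f16.gguf
```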