Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--embd-output-format"}, "FORMAT",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
[](common_params & params, const std::string & value) {
params.embd_out = value;
}
Expand Down
22 changes: 19 additions & 3 deletions common/json-schema-to-grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,10 @@ class SchemaConverter {
}

std::string _resolve_ref(const std::string & ref) {
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
auto it = ref.find('#');
std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
_refs_being_resolved.insert(ref);
json resolved = _refs[ref];
Expand Down Expand Up @@ -774,11 +777,24 @@ class SchemaConverter {
std::vector<std::string> tokens = string_split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) {
if (target.is_object() && target.contains(sel)) {
target = target[sel];
} else if (target.is_array()) {
size_t sel_index;
try {
sel_index = std::stoul(sel);
} catch (const std::invalid_argument & e) {
sel_index = target.size();
}
if (sel_index >= target.size()) {
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
return;
}
target = target[sel_index];
} else {
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
return;
}
target = target[sel];
}
_refs[ref] = target;
}
Expand Down
1 change: 1 addition & 0 deletions examples/embedding/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ The above command will output space-separated float values.
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json' | openai style |
| 'json+' | add cosine similarity matrix |
| 'raw' | plain text output |

### --embd-separator $"string"$
| $"string"$ | |
Expand Down
25 changes: 25 additions & 0 deletions examples/embedding/embedding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
}

// plain, pipe-friendly output: one embedding per line
static void print_raw_embeddings(const float * emb,
int n_embd_count,
int n_embd,
const llama_model * model,
enum llama_pooling_type pooling_type,
int embd_normalize) {
const uint32_t n_cls_out = llama_model_n_cls_out(model);
const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;

for (int j = 0; j < n_embd_count; ++j) {
for (int i = 0; i < cols; ++i) {
if (embd_normalize == 0) {
LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
} else {
LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
}
}
LOG("\n");
}
}

int main(int argc, char ** argv) {
common_params params;

Expand Down Expand Up @@ -372,6 +395,8 @@ int main(int argc, char ** argv) {
}

if (notArray) LOG("\n}\n");
} else if (params.embd_out == "raw") {
print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
}

LOG("\n");
Expand Down
16 changes: 13 additions & 3 deletions examples/json_schema_to_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,17 @@ def visit(n: dict):
raise ValueError(f'Unsupported ref {ref}')

for sel in ref.split('#')[-1].split('/')[1:]:
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]
assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}'
if isinstance(target, list):
try:
sel_index = int(sel)
except ValueError:
raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}')
assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel_index]
else:
assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]

self._refs[ref] = target
else:
Expand Down Expand Up @@ -547,7 +556,8 @@ def join_seq():


def _resolve_ref(self, ref):
ref_name = ref.split('/')[-1]
ref_fragment = ref.split('#')[-1]
ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment)
if ref_name not in self._rules and ref not in self._refs_being_resolved:
self._refs_being_resolved.add(ref)
resolved = self._refs[ref]
Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2234,7 +2234,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
ACL_MEM_MALLOC_HUGE_FIRST));

acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
theta_scale_ne, theta_scale_nb, 1);

float start = 0;
float step = 1;
Expand All @@ -2251,7 +2251,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne,
theta_scale_nb, GGML_MAX_DIMS);
theta_scale_nb, 1);
float zero_value = 0, one_value = 1;
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
aclScalar * low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
Expand Down
21 changes: 16 additions & 5 deletions ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,30 @@
GGML_ABORT("CANN error");
}

// Thread-local variable to record the current device of this thread.
thread_local int g_current_cann_device = -1;

/**
* @brief Sets the device to be used by CANN.
* @brief Set the CANN device to be used.
*
* @param device The device ID to set.
* @param device The target device ID to set.
*/
void ggml_cann_set_device(const int32_t device) {
int current_device = -1;
aclrtGetDevice(&current_device);
// int current_device = -1;
// Note: In some CANN versions, if no device has been set yet,
// aclrtGetDevice(&current_device) may return 0 by default.
// aclrtGetDevice(&current_device);

if (device == current_device) {
// If the current device is already the target one, no need to switch.
if (device == g_current_cann_device) {
return;
}

// Switch to the new device.
ACL_CHECK(aclrtSetDevice(device));

// Update the global device record.
g_current_cann_device = device;
}

/**
Expand Down
11 changes: 11 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/set.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
#include "ggml.h"
Expand Down Expand Up @@ -2416,6 +2417,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SET_ROWS:
ggml_cuda_op_set_rows(ctx, dst);
break;
case GGML_OP_SET:
ggml_cuda_op_set(ctx, dst);
break;
case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst);
break;
Expand Down Expand Up @@ -3842,6 +3846,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
op->src[0]->type == GGML_TYPE_F32 &&
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
case GGML_OP_SET:
{
const ggml_type t = op->type;
return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
t == op->src[0]->type &&
t == op->src[1]->type;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/mmvf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ static __global__ void mul_mat_vec_f(
}

dst[tid*stride_col_dst + row] = value;

if constexpr (!has_fusion) {
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, glu_op, gate_x, x_bias, gate_bias, sumf_gate);
}
}

template<typename T, typename type_acc, int ncols_dst, int block_size>
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/mmvq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,10 @@ static __global__ void mul_mat_vec_q(
dst[j*stride_col_dst + threadIdx.x] = result;
}
}

if constexpr (!has_fusion) {
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate);
}
}

static std::pair<dim3, dim3> calc_launch_params(
Expand Down
39 changes: 39 additions & 0 deletions ggml/src/ggml-cuda/set.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#include "set.cuh"
#include "cpy.cuh"

void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];

GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32));
GGML_ASSERT(src1->type == src0->type);
GGML_ASSERT(dst ->type == src0->type);

GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));

const size_t nb1 = ((int32_t *) dst->op_params)[0];
const size_t nb2 = ((int32_t *) dst->op_params)[1];
const size_t nb3 = ((int32_t *) dst->op_params)[2];
const size_t offset = ((int32_t *) dst->op_params)[3];
const bool inplace= (bool) ((int32_t *) dst->op_params)[4];

if (!inplace) {
ggml_cuda_cpy(ctx, src0, dst);
}

ggml_tensor dst_view = *dst;
dst_view.data = (void *)((char *)dst->data + offset);
dst_view.ne[0] = src1->ne[0];
dst_view.ne[1] = src1->ne[1];
dst_view.ne[2] = src1->ne[2];
dst_view.ne[3] = src1->ne[3];

dst_view.nb[0] = ggml_element_size(dst);
dst_view.nb[1] = nb1;
dst_view.nb[2] = nb2;
dst_view.nb[3] = nb3;

ggml_cuda_cpy(ctx, src1, &dst_view);
}
7 changes: 7 additions & 0 deletions ggml/src/ggml-cuda/set.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#pragma once

#include "common.cuh"

#define CUDA_SET_BLOCK_SIZE 256

void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
9 changes: 6 additions & 3 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ static inline void hex_format_op_names(char * str, const struct ggml_tensor * t)
// ** backend sessions

struct ggml_hexagon_session {
ggml_hexagon_session(int dev_id) noexcept(false);
ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
~ggml_hexagon_session() noexcept(true);

void allocate(int dev_id) noexcept(false);
Expand Down Expand Up @@ -1631,10 +1631,13 @@ void ggml_hexagon_session::release() noexcept(true) {
}
}

ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) {
ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
buffer_type.context = nullptr;
repack_buffer_type.context = nullptr;

buffer_type.device = dev;
repack_buffer_type.device = dev;

try {
allocate(dev_id);

Expand Down Expand Up @@ -3628,7 +3631,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
devices[i].iface = ggml_backend_hexagon_device_i;
devices[i].reg = reg;
try {
devices[i].context = new ggml_hexagon_session(i);
devices[i].context = new ggml_hexagon_session(i, &devices[i]);
} catch (std::exception const &exc) {
GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
devices[i].context = nullptr;
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-sycl/backend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "roll.hpp"
#include "rope.hpp"
#include "set_rows.hpp"
#include "ssm_conv.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#include "wkv.hpp"
Expand Down
7 changes: 7 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include "ggml-sycl/getrows.hpp"
#include "ggml-sycl/repeat_back.hpp"
#include "ggml-sycl/quantize.hpp"
#include "ggml-sycl/ssm_conv.hpp"
#include "ggml.h"

static bool g_sycl_loaded = false;
Expand Down Expand Up @@ -3921,6 +3922,8 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_GATED_LINEAR_ATTN:
ggml_sycl_op_gated_linear_attn(ctx, dst);
break;
case GGML_OP_SSM_CONV:
ggml_sycl_ssm_conv(ctx, dst);
case GGML_OP_ROLL:
ggml_sycl_roll(ctx, dst);
break;
Expand Down Expand Up @@ -4602,6 +4605,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_RWKV_WKV7:
case GGML_OP_GATED_LINEAR_ATTN:
return true;
case GGML_OP_SSM_CONV:
return op->type == GGML_TYPE_F32 &&
op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_ROLL:
return op->type == GGML_TYPE_F32;
case GGML_OP_ARANGE:
Expand Down
Loading