Merged
Changes from all commits
24 commits
0fc16b4
kv-cache : split implementation in separate sources (#13920)
ggerganov Jun 1, 2025
c046217
parallel : fix n_junk == 0 (#13952)
ggerganov Jun 1, 2025
8726392
readme : update bindings (#13950)
ddh0 Jun 1, 2025
fedf034
ggml : Print backtrace on uncaught C++ exceptions (ggml/1232)
danielzgtg May 28, 2025
6eba72b
ggml : install dynamic backends (ggml/1240)
rgerganov May 29, 2025
a7b8d35
sync : whisper.cpp (ggml/1250)
ggerganov May 29, 2025
af6f91d
ggml : remove ggml_graph_import and ggml_graph_export declarations (g…
rgerganov May 30, 2025
d337252
cmake : Fix broken CMake error messages (ggml/1252)
dg0yt May 31, 2025
108009f
vulkan : Remove unexpected ; (ggml/1253)
dg0yt May 31, 2025
f3a4b16
sync : ggml
ggerganov Jun 1, 2025
e57bb87
ggml: check if non-native endian model is being loaded (#13943)
taronaeo Jun 1, 2025
c496fe0
convert : fix vocab padding code for bert models (#13954)
CISC Jun 1, 2025
5e1c3ae
convert : fix nomic-bert-moe mask token (#13757)
CISC Jun 1, 2025
7675c55
gguf: fix failure on version == 0 (#13956)
JohannesGaessler Jun 1, 2025
663445b
sycl: quantize and reorder the input to q8_1 when reorder is enabled …
AD2605 Jun 2, 2025
093e3f1
cmake : Handle mixed-case 'Power' strings in POWER CPU detection (#13…
shalinib-ibm Jun 2, 2025
bfd3227
mtmd : fix memory leak in mtmd_helper_eval_chunk_single (#13961)
ngxson Jun 2, 2025
c9bbc77
`server`: update deepseek reasoning format (pass reasoning_content as…
ochafik Jun 2, 2025
5582c49
gemma : more consistent attention scaling for v2 and v3 (#13951)
ggerganov Jun 2, 2025
ea394d7
metal : use F32 accumulators in FA kernels (#13975)
ggerganov Jun 2, 2025
3637576
server : disable speculative decoding for SWA models (#13970)
ggerganov Jun 2, 2025
bfb1e01
OpenCL: Add concat, tsembd, upscale, tanh, pad and repeat (#13840)
rmatif Jun 2, 2025
71e74a3
opencl: add `backend_synchronize` (#13939)
lhez Jun 2, 2025
7c054e2
Merge branch 'layla-build' into merge
l3utterfly Jun 3, 2025
1 change: 1 addition & 0 deletions README.md
@@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Bindings</summary>

+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
1 change: 1 addition & 0 deletions common/arg.cpp
@@ -2869,6 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: deepseek)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else { throw std::invalid_argument("invalid value"); }
}
15 changes: 8 additions & 7 deletions common/chat.cpp
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const

std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
std::vector<common_chat_msg_diff> diffs;
-// if (previous_msg.reasoning_content != current.reasoning_content) {
-//     auto & diff = diffs.emplace_back();
-//     diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
-// }
+if (previous_msg.reasoning_content != new_msg.reasoning_content) {
+    auto & diff = diffs.emplace_back();
+    diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
+}
if (previous_msg.content != new_msg.content) {
auto & diff = diffs.emplace_back();
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t

template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
-// if (!diff.reasoning_content_delta.empty()) {
-//     delta["reasoning_content"] = msg.reasoning_content;
-// }
+if (!diff.reasoning_content_delta.empty()) {
+    delta["reasoning_content"] = diff.reasoning_content_delta;
+}
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
switch (format) {
case COMMON_REASONING_FORMAT_NONE: return "none";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
default:
throw std::runtime_error("Unknown reasoning format");
}
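The uncommented diff logic only fires when the new message strictly extends the previous one, and the delta is the appended tail. A minimal self-contained sketch of that suffix semantics (this string_diff is a stand-in consistent with its use above; the real helper lives elsewhere in common/ and may differ in detail):

    #include <cassert>
    #include <string>

    // stand-in with the suffix semantics the streaming path relies on:
    // cur must extend prev, and the delta is the newly appended tail
    static std::string string_diff(const std::string & prev, const std::string & cur) {
        assert(cur.compare(0, prev.size(), prev) == 0);
        return cur.substr(prev.size());
    }

    int main() {
        // with the change above, this suffix is now emitted as
        // delta["reasoning_content"] instead of being dropped
        assert(string_diff("Let me think", "Let me think step by step") == " step by step");
        return 0;
    }
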
2 changes: 1 addition & 1 deletion common/chat.h
@@ -70,7 +70,7 @@ struct common_chat_msg {
};

struct common_chat_msg_diff {
-// std::string reasoning_content_delta;
+std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
3 changes: 2 additions & 1 deletion common/common.h
@@ -215,7 +215,8 @@ struct common_params_vocoder {

enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

struct common_params {
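For contrast, illustrative stream payloads under each format. These shapes are inferred from the two enum comments above and the `server: update deepseek reasoning format` commit; the exact field layout is an assumption, not copied from the PR:

    --reasoning-format deepseek (thinking streamed as its own delta field):
    {"delta": {"reasoning_content": "Let me work this out..."}}
    {"delta": {"content": "The answer is 4."}}

    --reasoning-format deepseek-legacy (stream mode leaves the tags inline):
    {"delta": {"content": "<think>Let me work this out...</think>The answer is 4."}}
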
55 changes: 27 additions & 28 deletions convert_hf_to_gguf.py
@@ -3814,7 +3814,7 @@ def _xlmroberta_set_vocab(self) -> None:
remove_whitespaces = tokenizer.clean_up_tokenization_spaces
precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])

vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ def _xlmroberta_set_vocab(self) -> None:
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))

-vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())

tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
@@ -3857,33 +3857,26 @@ def _xlmroberta_set_vocab(self) -> None:
unk_token = tokenizer_config_json.get("unk_token")
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))

-for token_id in range(vocab_size):
-    piece = tokenizer._convert_id_to_token(token_id)
-    text = piece.encode("utf-8")
-    score = tokenizer_json["model"]["vocab"][token_id][1]
-
-    toktype = SentencePieceTokenTypes.NORMAL
-    if token_id == unk_token_id:
-        toktype = SentencePieceTokenTypes.UNKNOWN
-    elif token_id in tokenizer.all_special_ids:
-        toktype = SentencePieceTokenTypes.CONTROL
-    elif token_id in added_vocab.values():
-        toktype = SentencePieceTokenTypes.USER_DEFINED
-    # No reliable way to detect this, but jina doesn't have any
-    # elif tokenizer.IsByte(token_id):
-    #     toktype = SentencePieceTokenTypes.BYTE
-
-    tokens[token_id] = text
-    scores[token_id] = score
-    toktypes[token_id] = toktype
-
-if vocab_size > len(tokens):
-    pad_count = vocab_size - len(tokens)
-    logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-    for i in range(1, pad_count + 1):
-        tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-        scores.append(-1000.0)
-        toktypes.append(SentencePieceTokenTypes.UNUSED)
+for token_id in range(tokenizer.vocab_size):
+    if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+        text = piece.encode("utf-8")
+        score = tokenizer_json["model"]["vocab"][token_id][1]
+
+        toktype = SentencePieceTokenTypes.NORMAL
+        if token_id == unk_token_id:
+            toktype = SentencePieceTokenTypes.UNKNOWN
+        elif token_id in tokenizer.all_special_ids:
+            toktype = SentencePieceTokenTypes.CONTROL
+        elif token_id in added_vocab.values():
+            toktype = SentencePieceTokenTypes.USER_DEFINED
+        # No reliable way to detect this, but jina doesn't have any
+        # elif tokenizer.IsByte(token_id):
+        #     toktype = SentencePieceTokenTypes.BYTE
+
+        tokens[token_id] = text
+        scores[token_id] = score
+        toktypes[token_id] = toktype

if isinstance(tokenizer, SentencePieceProcessor):
# realign tokens (see HF tokenizer code)
@@ -3896,6 +3889,12 @@ def _xlmroberta_set_vocab(self) -> None:
SentencePieceTokenTypes.UNKNOWN,
] + toktypes[3:-1]

+if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+    # Add mask token missing from sentencepiece.bpe.model
+    tokens[250001] = b'<mask>'
+    scores[250001] = 0.0
+    toktypes[250001] = SentencePieceTokenTypes.CONTROL

self.gguf_writer.add_tokenizer_model("t5")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
4 changes: 2 additions & 2 deletions examples/parallel/parallel.cpp
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
common_params params;

params.n_predict = 128;
-params.n_junk = 0;
+params.n_junk = 1;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
const bool is_pp_shared = params.is_pp_shared;

// extra text to insert in each client's prompt in order to make it larger
-const int32_t n_junk = params.n_junk;
+const int32_t n_junk = std::max(1, params.n_junk);

// init llama.cpp
llama_backend_init();
3 changes: 0 additions & 3 deletions ggml/include/ggml.h
@@ -2095,9 +2095,6 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);

-GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);

// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

2 changes: 2 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -196,6 +196,7 @@ add_library(ggml-base
../include/ggml-opt.h
../include/gguf.h
ggml.c
+ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-opt.cpp
@@ -234,6 +235,7 @@ function(ggml_add_backend_library backend)
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
+install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})
1 change: 0 additions & 1 deletion ggml/src/ggml-blas/CMakeLists.txt
@@ -96,5 +96,4 @@ else()
message(ERROR "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct GGML_BLAS_VENDOR")
-endif()
endif()
3 changes: 2 additions & 1 deletion ggml/src/ggml-cpu/CMakeLists.txt
@@ -345,7 +345,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
endif()

-string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
+string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

if (EXTRACTED_NUMBER GREATER_EQUAL 10)
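The fix uppercases the prtconf output before matching, so "Power10", "POWER10" and "power10" all parse. A standalone C++ mirror of the same logic, useful for sanity-checking the regex against mixed-case strings (the real detection stays in CMake; this is only illustrative):

    #include <algorithm>
    #include <cctype>
    #include <cstdio>
    #include <regex>
    #include <string>

    // mirror of the fixed CMake logic: uppercase first, then match
    static int power_version(std::string s) {
        std::transform(s.begin(), s.end(), s.begin(),
                       [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
        std::smatch m;
        static const std::regex re("POWER *([0-9]+)");
        return std::regex_search(s, m, re) ? std::stoi(m[1].str()) : 0;
    }

    int main() {
        std::printf("%d\n", power_version("Implementation : Power10")); // prints 10
        return 0;
    }
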
2 changes: 2 additions & 0 deletions ggml/src/ggml-impl.h
@@ -32,6 +32,8 @@
extern "C" {
#endif

+void ggml_print_backtrace(void);

#ifndef MIN
# define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
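This declaration exposes the existing backtrace printer to C++ code; per the commit title, it is wired to uncaught C++ exceptions (hence the new ggml.cpp in the CMake change above). A minimal sketch of that terminate-handler technique, assuming only std::set_terminate; the actual handler in ggml.cpp may chain the previous handler or differ in detail:

    #include <cstdlib>
    #include <exception>

    extern "C" void ggml_print_backtrace(void); // the declaration added above

    [[noreturn]] static void ggml_on_terminate() {
        ggml_print_backtrace(); // dump the stack before dying
        std::abort();           // preserve the fatal default behavior
    }

    // registered once, e.g. from a static initializer:
    static const bool backtrace_handler_installed = [] {
        std::set_terminate(ggml_on_terminate);
        return true;
    }();

    int main() {
        throw 42; // uncaught: the handler runs and prints a backtrace
    }
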
8 changes: 5 additions & 3 deletions ggml/src/ggml-metal/ggml-metal.m
@@ -4766,14 +4766,16 @@ static bool ggml_metal_encode_node(
GGML_ASSERT(nqptg % 8 == 0);
GGML_ASSERT(ncpsg % 32 == 0);

+const int is_q = ggml_is_quantized(src1->type) ? 1 : 0;

// 2*(2*ncpsg + nqptg)*(nsg)
// ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
//
// 16*32*(nsg)
// the shared memory needed for the simdgroups to load the KV cache
// each thread loads (dequantizes) 16 head elements, there are 32 threads in the SG
//
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))

int64_t nsgmax = 2;

@@ -4810,9 +4812,9 @@
// and store the soft_max values and the mask
//
// ne00*(nsg)
-// each simdgroup has a full f16 head vector in shared mem to accumulate results
+// each simdgroup has a full f32 head vector in shared mem to accumulate results
//
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*ne20*(nsg))*(sizeof(float)/2), 16))

int64_t nsgmax = 2;
while (true) {
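For a sense of scale, a worked evaluation of the updated non-vec FATTN_SMEM formula under assumed typical sizes (ne00 = 128, nqptg = 8, ncpsg = 32, nsg = 4, quantized KV so is_q = 1; these parameters are illustrative, not taken from the PR). The 2*ne00 term is the part that grew when the accumulators moved to F32:

    #include <cstdio>

    // arithmetic equivalent of ggml's GGML_PAD(x, n) for positive n
    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main() {
        const int ne00 = 128, nqptg = 8, ncpsg = 32, nsg = 4, is_q = 1;
        // the updated formula above; sizeof(float)/2 == 2 bytes per half
        const int smem = GGML_PAD((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*nsg)
                                 + is_q*(16*32*nsg)) * 2, 16);
        std::printf("%d\n", smem); // 17408 bytes of threadgroup memory
        return 0;
    }
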