diff --git a/context/llama_server_context.h b/context/llama_server_context.h
index 9bc00f074..720b22e1e 100644
--- a/context/llama_server_context.h
+++ b/context/llama_server_context.h
@@ -1,15 +1,15 @@
+#include
+#include
 #include
 #include
-#include
-#include
 
 // External
 #include "clip.h"
 #include "common.h"
 #include "llama.h"
-#include "utils/json.hpp"
-#include "stb_image.h"
 #include "llava.h"
+#include "stb_image.h"
+#include "utils/json.hpp"
 
 #if defined(_WIN32)
 #define NOMINMAX
@@ -532,7 +532,8 @@ struct llama_server_context {
 
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == nullptr) {
-      LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
+      LOG_ERROR_LLAMA("llama.cpp unable to load model",
+                      {{"model", params.model}});
       return false;
     }
 
@@ -585,7 +586,11 @@ struct llama_server_context {
     try {
       batch = llama_batch_init(n_ctx, 0, params.n_parallel);
     } catch (const std::exception& e) {
-      LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata" , {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
+      LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata",
+                      {{"exception", e.what()},
+                       {"n_tokens_alloc", n_ctx},
+                       {"embd", 0},
+                       {"n_seq_max", params.n_parallel}});
     }
 
     // empty system prompt
@@ -1244,19 +1249,35 @@ struct llama_server_context {
     res.stop = true;
 
     const int n_embd = llama_n_embd(model);
-    if (!params.embedding) {
-      LOG_WARNING_LLAMA("embedding disabled",
-                        {
-                            {"params.embedding", params.embedding},
-                        });
-      res.result_json = json{
-          {"embedding", std::vector<float>(n_embd, 0.0f)},
-      };
-    } else {
-      const float* data = llama_get_embeddings(ctx);
-      std::vector<float> embedding(data, data + n_embd);
+
+    std::vector<float> embd_res(n_embd, 0.0f);
+
+    for (int i = 0; i < batch.n_tokens; ++i) {
+      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+        continue;
+      }
+
+      const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+      if (embd == NULL) {
+        embd = llama_get_embeddings_ith(ctx, i);
+      }
+
+      if (embd == NULL) {
+        LOG_ERROR << "failed to get embeddings "
+                  << "token: " << batch.token[i]
+                  << ", seq_id: " << batch.seq_id[i][0];
+
+        res.result_json = json{
+            {"embedding", std::vector<float>(n_embd, 0.0f)},
+        };
+
+        continue;
+      }
+
+      llama_embd_normalize(embd, embd_res.data(), n_embd);
+
       res.result_json = json{
-          {"embedding", embedding},
+          {"embedding", embd_res},
       };
     }
     queue_results.push_back(res);
@@ -1380,7 +1401,7 @@ struct llama_server_context {
         std::vector<llama_token> append_tokens =
             tokenize(json_prompt, false);  // has next image
         for (int i = 0; i < (int)append_tokens.size(); ++i) {
-          llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true);
+          llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id + 1}, true);
           slot.n_past += 1;
         }
       }
@@ -1523,27 +1544,28 @@ struct llama_server_context {
 
     for (llama_client_slot& slot : slots) {
       if (slot.is_processing() &&
-          slot.cache_tokens.size() >= (size_t)slot.n_ctx) {
+          (int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
         // Shift context
-        const int n_left = slot.n_past - slot.params.n_keep - 1;
+        const int n_keep = slot.params.n_keep + add_bos_token;
+        const int n_left = (int)system_tokens.size() + slot.n_past - n_keep;
         const int n_discard = n_left / 2;
 
         LOG_TEE(
             "slot %d: context shift - n_keep = %d, n_left = %d, n_discard "
             "= %d\n",
             slot.id, slot.params.n_keep, n_left, n_discard);
-        llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
-                              slot.params.n_keep + n_discard + 1);
-        llama_kv_cache_seq_add(ctx, slot.id,
-                               slot.params.n_keep + 1 + n_discard,
-                               slot.n_past, -n_discard);
-
-        for (size_t i = slot.params.n_keep + 1 + n_discard;
-             i < slot.cache_tokens.size(); i++) {
-          slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-        }
+        llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_discard);
+        llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard,
+                               system_tokens.size() + slot.n_past, -n_discard);
+
+        if (slot.params.cache_prompt) {
+          for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size();
+               i++) {
+            slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+          }
 
-        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+          slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+        }
 
         slot.n_past -= n_discard;
 
@@ -1557,6 +1579,9 @@ struct llama_server_context {
       }
     }
 
+    // start populating the batch for this iteration
+    llama_batch_clear(batch);
+
     // decode any currently ongoing sequences
     for (auto& slot : slots) {
       // release the slot
@@ -1578,14 +1603,15 @@ struct llama_server_context {
      slot.i_batch = batch.n_tokens;
 
      llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past,
-                      {slot.id}, true);
+                      {slot.id + 1}, true);
 
      slot.n_decoded += 1;
      slot.n_past += 1;
    }
 
    // process in chunks of params.n_batch
-    int32_t n_batch = params.n_batch;
+    int32_t n_batch = llama_n_batch(ctx);
+    int32_t n_ubatch = llama_n_ubatch(ctx);
 
    // assign workload to the slots
    if (params.cont_batching || batch.n_tokens == 0) {
@@ -1641,8 +1667,7 @@ struct llama_server_context {
        } else {
          prompt_tokens = tokenize(
              slot.prompt,
-              system_prompt.empty() &&
-                  add_bos_token);  // add BOS if there isn't system prompt
+              system_prompt.empty());  // add BOS if there isn't system prompt
        }
 
        slot.num_prompt_tokens = prompt_tokens.size();
@@ -1738,9 +1763,11 @@ struct llama_server_context {
          std::vector<llama_token> prefix_tokens = has_images ?
              tokenize(slot.images[0].prefix_prompt, add_bos_token) :
              prompt_tokens;
-          for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) {
+          for (;
+               slot.n_past < slot.num_prompt_tokens && batch.n_tokens < n_batch;
+               ++slot.n_past) {
            llama_batch_add(batch, prefix_tokens[slot.n_past],
-                            system_tokens.size() + slot.n_past, {slot.id},
+                            system_tokens.size() + slot.n_past, {slot.id + 1},
                            false);
          }
 
@@ -1803,7 +1830,8 @@ struct llama_server_context {
      }
 
      for (auto& slot : slots) {
-        if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) {
+        if (slot.state != PROCESSING || slot.i_batch < (int)i ||
+            slot.i_batch >= (int)(i + n_tokens)) {
          continue;
        }
 
@@ -1812,7 +1840,7 @@ struct llama_server_context {
        send_embedding(slot);
        slot.release();
        slot.i_batch = -1;
-        return true;
+        continue;
      }
 
      completion_token_output result;
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 6dff90090..14cf17ab1 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -154,14 +154,14 @@ llamaCPP::~llamaCPP() {
   StopBackgroundTask();
 }
 
-void llamaCPP::WarmupModel() {
+void llamaCPP::WarmupModel(bool is_embedding) {
   json pseudo;
 
   LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
-  const int task_id = llama.request_completion(pseudo, false, false, -1);
+  const int task_id = llama.request_completion(pseudo, false, is_embedding, -1);
   std::string completion_text;
   task_result result = llama.next_result(task_id);
   if (!result.error && result.stop) {
@@ -624,7 +624,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
 
   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
-  params.embedding = jsonBody->get("embedding", true).asBool();
+  params.embedding = jsonBody->get("embedding", false).asBool();
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
   params.n_batch = jsonBody->get("n_batch", 512).asInt();
   params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
@@ -681,7 +681,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
 
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::BackgroundTask, this);
-  WarmupModel();
+  WarmupModel(params.embedding);
   return true;
 }
 
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index a00d25538..5b8d16e10 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController,
   void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
                      std::function<void(const HttpResponsePtr&)>& callback);
   bool CheckModelLoaded(std::function<void(const HttpResponsePtr&)>& callback);
-  void WarmupModel();
+  void WarmupModel(bool is_embedding);
   void BackgroundTask();
   void StopBackgroundTask();
 };