From df7d27d9583c975be1046c1f49252bc6528bf2ba Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Sat, 13 Apr 2024 05:08:08 +0700 Subject: [PATCH 1/7] fix: embed, log for testing --- context/llama_server_context.h | 185 ++++++++++++++++++++------------- controllers/llamaCPP.cc | 4 +- 2 files changed, 113 insertions(+), 76 deletions(-) diff --git a/context/llama_server_context.h b/context/llama_server_context.h index 0aedc494c..5727897ca 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -1,15 +1,15 @@ +#include +#include #include #include -#include -#include // External #include "clip.h" #include "common.h" #include "llama.h" -#include "utils/json.hpp" -#include "stb_image.h" #include "llava.h" +#include "stb_image.h" +#include "utils/json.hpp" #if defined(_WIN32) #define NOMINMAX @@ -443,20 +443,22 @@ struct llama_client_slot { } void print_timings() const { - LOG_TEE("\n"); - LOG_TEE( - "%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per " - "token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, - t_prompt_processing / num_prompt_tokens_processed, - 1e3 / t_prompt_processing * num_prompt_tokens_processed); - LOG_TEE( - "%s: eval time = %10.2f ms / %5d runs (%8.2f ms per " - "token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded, t_token_generation / n_decoded, - 1e3 / t_token_generation * n_decoded); - LOG_TEE("%s: total time = %10.2f ms\n", __func__, - t_prompt_processing + t_token_generation); + LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing + << "ms / " << num_prompt_tokens_processed << " tokens (" + << t_prompt_processing / num_prompt_tokens_processed + << " ms per " + "token, " + << 1e3 / t_prompt_processing * num_prompt_tokens_processed + << " tokens per second)"; + LOG_DEBUG << __func__ << ": eval time = " << t_token_generation + << " ms / " << n_decoded << " runs (" + << t_token_generation / n_decoded + << " ms per " + "token, " + << 1e3 / t_token_generation * n_decoded + << " tokens per second)\n"; + LOG_DEBUG << __func__ << ": total time = " + << t_prompt_processing + t_token_generation << " ms"; } }; @@ -516,7 +518,7 @@ struct llama_server_context { params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_TEE("Multi Modal Mode Enabled"); + LOG_DEBUG << "Multi Modal Mode Enabled"; clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); if (clp_ctx == nullptr) { LOG_ERROR_LLAMA("unable to load clip model", @@ -532,7 +534,8 @@ struct llama_server_context { std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr) { - LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}}); + LOG_ERROR_LLAMA("llama.cpp unable to load model", + {{"model", params.model}}); return false; } @@ -540,11 +543,13 @@ struct llama_server_context { const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); const int n_embd_llm = llama_n_embd(model); if (n_embd_clip != n_embd_llm) { - LOG_TEE( - "%s: embedding dim of the multimodal projector (%d) is not " - "equal to that of LLaMA (%d). Make sure that you use the " - "correct mmproj file.\n", - __func__, n_embd_clip, n_embd_llm); + LOG_DEBUG << __func__ << ": embedding dim of the multimodal projector (" + << n_embd_clip + << ") is not " + "equal to that of LLaMA (" + << n_embd_llm + << "). 
Make sure that you use the " + "correct mmproj file."; llama_free(ctx); llama_free_model(model); return false; @@ -570,7 +575,7 @@ struct llama_server_context { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_TEE("Available slots:\n"); + LOG_INFO << "Available slots: "; for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; @@ -578,14 +583,18 @@ struct llama_server_context { slot.n_ctx = n_ctx_slot; slot.reset(); - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_INFO << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; slots.push_back(slot); } try { batch = llama_batch_init(n_ctx, 0, params.n_parallel); } catch (const std::exception& e) { - LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata" , {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}}); + LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", + {{"exception", e.what()}, + {"n_tokens_alloc", n_ctx}, + {"embd", 0}, + {"n_seq_max", params.n_parallel}}); } // empty system prompt @@ -797,11 +806,11 @@ struct llama_server_context { img_sl.img_data = clip_image_u8_init(); if (!clip_image_load_from_bytes( image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, - img_sl.id); + LOG_DEBUG << "slot " << slot->id + << " - failed to load image [id: " << img_sl.id << "]"; return false; } - LOG_TEE("slot %i - loaded image\n", slot->id); + LOG_INFO << "slot " << slot->id << " - loaded image"; img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -832,12 +841,13 @@ struct llama_server_context { } } if (!found) { - LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); + LOG_INFO << "ERROR: Image with id: " << img_id + << ", not found.\n"; slot->images.clear(); return false; } } catch (const std::invalid_argument& e) { - LOG_TEE("Invalid image number id in prompt\n"); + LOG_INFO << "Invalid image number id in prompt"; slot->images.clear(); return false; } @@ -860,7 +870,8 @@ struct llama_server_context { all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_INFO << "slot " << slot->id + << " is processing [task id: " << slot->task_id << "]"; return true; } @@ -882,7 +893,7 @@ struct llama_server_context { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_WARN << __func__ << ": llama_decode() failed"; return; } @@ -891,7 +902,7 @@ struct llama_server_context { llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); } - LOG_TEE("system prompt updated\n"); + LOG_INFO << "system prompt updated"; system_need_update = false; } @@ -1055,7 +1066,7 @@ struct llama_server_context { if (!llava_image_embed_make_with_clip_img( clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { - LOG_TEE("Error processing the given image"); + LOG_DEBUG << "Error processing the given image"; return false; } @@ -1253,12 +1264,37 @@ struct llama_server_context { {"embedding", std::vector(n_embd, 0.0f)}, }; } else { - const float* data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); + std::vector embd_res(n_embd, 0.0f); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) 
{ + LOG_ERROR << "failed to get embeddings" + << " token " << batch.token[i] << ", seq_id " + << batch.seq_id[i][0]; + + res.result_json = json{ + {"embedding", std::vector(n_embd, 0.0f)}, + }; + + continue; + } + + llama_embd_normalize(embd, embd_res.data(), n_embd); + } res.result_json = json{ - {"embedding", embedding}, + {"embedding", embd_res}, }; } + queue_results.push_back(res); condition_results.notify_all(); } @@ -1341,7 +1377,7 @@ struct llama_server_context { 0, // unused }; if (llama_decode(ctx, batch_view)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_INFO << __func__ << " : failed to eval\n"; return false; } } @@ -1361,7 +1397,7 @@ struct llama_server_context { 0, }; if (llama_decode(ctx, batch_img)) { - LOG_TEE("%s : failed to eval image\n", __func__); + LOG_INFO << __func__ << " : failed to eval image"; return false; } slot.n_past += n_eval; @@ -1429,7 +1465,7 @@ struct llama_server_context { llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { - LOG_TEE("slot unavailable\n"); + LOG_INFO << "slot unavailable"; // send error result send_error(task, "slot unavailable"); return; @@ -1499,7 +1535,7 @@ struct llama_server_context { // update the system prompt wait until all slots are idle state if (system_need_update && all_slots_are_idle) { - LOG_TEE("updating system prompt\n"); + LOG_DEBUG << "updating system prompt"; update_system_prompt(); } @@ -1507,10 +1543,9 @@ struct llama_server_context { if (all_slots_are_idle) { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE( - "all slots are idle and system prompt is empty, clear the KV " - "cache\n"); - kv_cache_clear(); + LOG_INFO + << "all slots are idle and system prompt is empty, clear the KV " + "cache"; } // std::this_thread::sleep_for(std::chrono::milliseconds(5)); // TODO: Need to implement queueing using CV for better performance @@ -1528,15 +1563,13 @@ struct llama_server_context { const int n_left = slot.n_past - slot.params.n_keep - 1; const int n_discard = n_left / 2; - LOG_TEE( - "slot %d: context shift - n_keep = %d, n_left = %d, n_discard " - "= %d\n", - slot.id, slot.params.n_keep, n_left, n_discard); + LOG_INFO << "slot " << slot.id + << " context shift - n_keep = " << slot.params.n_keep + << ", n_left = " << n_left << ", n_discard: " << n_discard; llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, slot.id, - slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); + llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, + slot.n_past, -n_discard); for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1565,8 +1598,8 @@ struct llama_server_context { slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, - (int)slot.cache_tokens.size()); + LOG_INFO << "slot " << slot.id << " released (" + << (int)slot.cache_tokens.size() << " tokens in cache)"; continue; } @@ -1699,12 +1732,13 @@ struct llama_server_context { slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", - slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG_INFO << "slot " << slot.id << " : in cache: " << slot.n_past + << " tokens | to process: " + << slot.num_prompt_tokens_processed << " tokens"; } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, - (int)system_tokens.size() + slot.n_past); + 
LOG_INFO << "slot " << slot.id << " : kv cache rm - [" + << (int)system_tokens.size() + slot.n_past << ", end)"; llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1713,10 +1747,9 @@ struct llama_server_context { if (slot.n_past == slot.num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE( - "slot %d : we have to evaluate at least 1 token to " - "generate logits\n", - slot.id); + LOG_INFO << "slot " << slot.id + << " : we have to evaluate at least 1 token to " + "generate logits"; slot.n_past--; } @@ -1745,7 +1778,7 @@ struct llama_server_context { } if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG_INFO << "failed processing images"; return false; } @@ -1785,15 +1818,17 @@ struct llama_server_context { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it // via the context size - LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", - __func__, n_batch, ret); + LOG_INFO << __func__ + << " : failed to decode the batch, n_batch = " << n_batch + << ", ret = " << ret; return false; } - LOG_TEE( - "%s : failed to find free space in the KV cache, retrying with " - "smaller n_batch = %d\n", - __func__, n_batch / 2); + LOG_INFO + << __func__ + << " : failed to find free space in the KV cache, retrying with " + "smaller n_batch = " + << n_batch / 2; // retry with half the batch size to try to find a free slot in the KV // cache @@ -1870,7 +1905,8 @@ static void server_print_usage(const char* argv0, const gpt_params& params, " -tb N, --threads-batch N number of threads to use during batch " "and prompt processing (default: same as --threads)\n"); printf( - " -c N, --ctx-size N size of the prompt context (default: %d)\n", + " -c N, --ctx-size N size of the prompt context (default: " + "%d)\n", params.n_ctx); printf(" --rope-scaling {none,linear,yarn}\n"); printf( @@ -1988,7 +2024,8 @@ static void server_print_usage(const char* argv0, const gpt_params& params, "dynamic batching) (default: disabled)\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf( - " set a file to load a system prompt (initial " + " set a file to load a system prompt " + "(initial " "prompt of all slots), this is useful for chat applications.\n"); printf( " --mmproj MMPROJ_FILE path to a multimodal projector file for " diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index e19a7f32b..951b325b5 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -147,7 +147,7 @@ llamaCPP::llamaCPP() : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, "llamaCPP")) { // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for + // log_disable(); // Disable the log to file feature, reduce bloat for // target // system () }; @@ -706,7 +706,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { LOG_INFO << "Started background task here!"; backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); - WarmupModel(); + // WarmupModel(); return true; } From ff67a5be05ff575df1b3b89c35f8ff82380d7154 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Sat, 13 Apr 2024 14:32:29 +0700 Subject: [PATCH 2/7] fix: warmup model --- controllers/llamaCPP.cc | 24 +++++++++++++++--------- controllers/llamaCPP.h | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 951b325b5..0b36e02e0 100644 --- 
a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -145,25 +145,25 @@ std::string create_return_json(const std::string& id, const std::string& model, llamaCPP::llamaCPP() : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, - "llamaCPP")) { - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () -}; + "llamaCPP")){ + // Some default values for now below + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () + }; llamaCPP::~llamaCPP() { StopBackgroundTask(); } -void llamaCPP::WarmupModel() { +void llamaCPP::WarmupModel(bool is_embedded_model) { json pseudo; LOG_INFO << "Warm-up model"; pseudo["prompt"] = "Hello"; pseudo["n_predict"] = 2; pseudo["stream"] = false; - const int task_id = llama.request_completion(pseudo, false, false, -1); + const int task_id = llama.request_completion(pseudo, false, is_embedded_model, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { @@ -606,6 +606,10 @@ void llamaCPP::LoadModel( bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { gpt_params params; + // For model like nomic-embed-text-v1.5.f16.gguf, etc, we need to warmup model with flag embedding = true. + // So we use this variable to differentiate with other models + // TODO: in case embedded model only, we should reject completion request from user? + bool is_embedded_model = false; // By default will setting based on number of handlers if (jsonBody) { if (!jsonBody->operator[]("mmproj").isNull()) { @@ -649,6 +653,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); + is_embedded_model = + !(*jsonBody)["embedding"].isNull() && (*jsonBody)["embedding"].asBool(); params.embedding = jsonBody->get("embedding", true).asBool(); // Check if n_parallel exists in jsonBody, if not, set to drogon_thread params.n_batch = jsonBody->get("n_batch", 512).asInt(); @@ -706,7 +712,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { LOG_INFO << "Started background task here!"; backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); - // WarmupModel(); + WarmupModel(is_embedded_model); return true; } diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 900786c79..914caff60 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController, void EmbeddingImpl(std::shared_ptr jsonBody, std::function&& callback); bool CheckModelLoaded(const std::function& callback); - void WarmupModel(); + void WarmupModel(bool is_embedded_model); void BackgroundTask(); void StopBackgroundTask(); }; From 4c0879482db1a83eab5e2a532d81639d627445e0 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Sat, 13 Apr 2024 14:35:44 +0700 Subject: [PATCH 3/7] fix: change log level --- context/llama_server_context.h | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/context/llama_server_context.h b/context/llama_server_context.h index 5727897ca..841ec1108 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -575,7 +575,7 @@ struct llama_server_context { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO << "Available slots: "; + LOG_DEBUG << "Available slots: "; for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; 
@@ -583,7 +583,7 @@ struct llama_server_context { slot.n_ctx = n_ctx_slot; slot.reset(); - LOG_INFO << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; + LOG_DEBUG << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; slots.push_back(slot); } @@ -810,7 +810,7 @@ struct llama_server_context { << " - failed to load image [id: " << img_sl.id << "]"; return false; } - LOG_INFO << "slot " << slot->id << " - loaded image"; + LOG_DEBUG << "slot " << slot->id << " - loaded image"; img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -841,13 +841,13 @@ struct llama_server_context { } } if (!found) { - LOG_INFO << "ERROR: Image with id: " << img_id + LOG_DEBUG << "ERROR: Image with id: " << img_id << ", not found.\n"; slot->images.clear(); return false; } } catch (const std::invalid_argument& e) { - LOG_INFO << "Invalid image number id in prompt"; + LOG_DEBUG << "Invalid image number id in prompt"; slot->images.clear(); return false; } @@ -870,7 +870,7 @@ struct llama_server_context { all_slots_are_idle = false; - LOG_INFO << "slot " << slot->id + LOG_DEBUG << "slot " << slot->id << " is processing [task id: " << slot->task_id << "]"; return true; @@ -902,7 +902,7 @@ struct llama_server_context { llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); } - LOG_INFO << "system prompt updated"; + LOG_DEBUG << "system prompt updated"; system_need_update = false; } @@ -1377,7 +1377,7 @@ struct llama_server_context { 0, // unused }; if (llama_decode(ctx, batch_view)) { - LOG_INFO << __func__ << " : failed to eval\n"; + LOG_DEBUG << __func__ << " : failed to eval\n"; return false; } } @@ -1397,7 +1397,7 @@ struct llama_server_context { 0, }; if (llama_decode(ctx, batch_img)) { - LOG_INFO << __func__ << " : failed to eval image"; + LOG_DEBUG << __func__ << " : failed to eval image"; return false; } slot.n_past += n_eval; @@ -1465,7 +1465,7 @@ struct llama_server_context { llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { - LOG_INFO << "slot unavailable"; + LOG_DEBUG << "slot unavailable"; // send error result send_error(task, "slot unavailable"); return; @@ -1543,7 +1543,7 @@ struct llama_server_context { if (all_slots_are_idle) { if (system_prompt.empty() && clean_kv_cache) { - LOG_INFO + LOG_DEBUG << "all slots are idle and system prompt is empty, clear the KV " "cache"; } @@ -1563,7 +1563,7 @@ struct llama_server_context { const int n_left = slot.n_past - slot.params.n_keep - 1; const int n_discard = n_left / 2; - LOG_INFO << "slot " << slot.id + LOG_DEBUG << "slot " << slot.id << " context shift - n_keep = " << slot.params.n_keep << ", n_left = " << n_left << ", n_discard: " << n_discard; llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, @@ -1598,7 +1598,7 @@ struct llama_server_context { slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_INFO << "slot " << slot.id << " released (" + LOG_DEBUG << "slot " << slot.id << " released (" << (int)slot.cache_tokens.size() << " tokens in cache)"; continue; @@ -1732,12 +1732,12 @@ struct llama_server_context { slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - LOG_INFO << "slot " << slot.id << " : in cache: " << slot.n_past + LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past << " tokens | to process: " << slot.num_prompt_tokens_processed << " tokens"; } - LOG_INFO << "slot " << slot.id << " : kv cache rm - [" + LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" << (int)system_tokens.size() + slot.n_past << ", 
end)"; llama_kv_cache_seq_rm(ctx, slot.id, @@ -1747,7 +1747,7 @@ struct llama_server_context { if (slot.n_past == slot.num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. - LOG_INFO << "slot " << slot.id + LOG_DEBUG << "slot " << slot.id << " : we have to evaluate at least 1 token to " "generate logits"; slot.n_past--; @@ -1778,7 +1778,7 @@ struct llama_server_context { } if (has_images && !ingest_images(slot, n_batch)) { - LOG_INFO << "failed processing images"; + LOG_DEBUG << "failed processing images"; return false; } @@ -1818,13 +1818,13 @@ struct llama_server_context { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it // via the context size - LOG_INFO << __func__ + LOG_DEBUG << __func__ << " : failed to decode the batch, n_batch = " << n_batch << ", ret = " << ret; return false; } - LOG_INFO + LOG_DEBUG << __func__ << " : failed to find free space in the KV cache, retrying with " "smaller n_batch = " From 30abf3a4cfc39e589b253c476ef60e51bb035c6c Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 09:43:39 +0700 Subject: [PATCH 4/7] fix: restore kv_cache_clear --- context/llama_server_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/context/llama_server_context.h b/context/llama_server_context.h index 841ec1108..a4d8f7a89 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -1546,6 +1546,7 @@ struct llama_server_context { LOG_DEBUG << "all slots are idle and system prompt is empty, clear the KV " "cache"; + kv_cache_clear(); } // std::this_thread::sleep_for(std::chrono::milliseconds(5)); // TODO: Need to implement queueing using CV for better performance From a2cb6ba161f0c853b357cdabe61598706cbb074f Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 16:36:47 +0700 Subject: [PATCH 5/7] fix: log_disable --- controllers/llamaCPP.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 0b36e02e0..c40d7b284 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -145,12 +145,12 @@ std::string create_return_json(const std::string& id, const std::string& model, llamaCPP::llamaCPP() : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, - "llamaCPP")){ - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () - }; + "llamaCPP")) { + // Some default values for now below + log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () +}; llamaCPP::~llamaCPP() { StopBackgroundTask(); @@ -163,7 +163,8 @@ void llamaCPP::WarmupModel(bool is_embedded_model) { pseudo["prompt"] = "Hello"; pseudo["n_predict"] = 2; pseudo["stream"] = false; - const int task_id = llama.request_completion(pseudo, false, is_embedded_model, -1); + const int task_id = + llama.request_completion(pseudo, false, is_embedded_model, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { From 833866106e06fe9478ecb4776156ccb94ba3c254 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 17:33:03 +0700 Subject: [PATCH 6/7] fix: use params.embedding to decide warmup or not --- context/llama_server_context.h | 77 ++++++++++++++----------------- controllers/llamaCPP.cc | 82 +++++++++++++++++----------------- controllers/llamaCPP.h | 2 +- 3 files changed, 77 insertions(+), 84 deletions(-) 
diff --git a/context/llama_server_context.h b/context/llama_server_context.h index a4d8f7a89..3839e4b3f 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -842,7 +842,7 @@ struct llama_server_context { } if (!found) { LOG_DEBUG << "ERROR: Image with id: " << img_id - << ", not found.\n"; + << ", not found.\n"; slot->images.clear(); return false; } @@ -871,7 +871,7 @@ struct llama_server_context { all_slots_are_idle = false; LOG_DEBUG << "slot " << slot->id - << " is processing [task id: " << slot->task_id << "]"; + << " is processing [task id: " << slot->task_id << "]"; return true; } @@ -1255,45 +1255,36 @@ struct llama_server_context { res.stop = true; const int n_embd = llama_n_embd(model); - if (!params.embedding) { - LOG_WARNING_LLAMA("embedding disabled", - { - {"params.embedding", params.embedding}, - }); - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - } else { - std::vector embd_res(n_embd, 0.0f); - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } + std::vector embd_res(n_embd, 0.0f); - const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } - if (embd == NULL) { - LOG_ERROR << "failed to get embeddings" - << " token " << batch.token[i] << ", seq_id " - << batch.seq_id[i][0]; + const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; + if (embd == NULL) { + LOG_ERROR << "failed to get embeddings" + << " token " << batch.token[i] << ", seq_id " + << batch.seq_id[i][0]; - continue; - } + res.result_json = json{ + {"embedding", std::vector(n_embd, 0.0f)}, + }; - llama_embd_normalize(embd, embd_res.data(), n_embd); + continue; } - res.result_json = json{ - {"embedding", embd_res}, - }; + + llama_embd_normalize(embd, embd_res.data(), n_embd); } + res.result_json = json{ + {"embedding", embd_res}, + }; queue_results.push_back(res); condition_results.notify_all(); @@ -1565,8 +1556,8 @@ struct llama_server_context { const int n_discard = n_left / 2; LOG_DEBUG << "slot " << slot.id - << " context shift - n_keep = " << slot.params.n_keep - << ", n_left = " << n_left << ", n_discard: " << n_discard; + << " context shift - n_keep = " << slot.params.n_keep + << ", n_left = " << n_left << ", n_discard: " << n_discard; llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, @@ -1600,7 +1591,7 @@ struct llama_server_context { slot.t_last_used = ggml_time_us(); LOG_DEBUG << "slot " << slot.id << " released (" - << (int)slot.cache_tokens.size() << " tokens in cache)"; + << (int)slot.cache_tokens.size() << " tokens in cache)"; continue; } @@ -1734,12 +1725,12 @@ struct llama_server_context { slot.num_prompt_tokens - slot.n_past; LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past - << " tokens | to process: " - << slot.num_prompt_tokens_processed << " tokens"; + << " tokens | to process: " + << slot.num_prompt_tokens_processed << " tokens"; } LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" - << (int)system_tokens.size() + slot.n_past << ", end)"; + << (int)system_tokens.size() + 
slot.n_past << ", end)"; llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1749,8 +1740,8 @@ struct llama_server_context { if (slot.n_past == slot.num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. LOG_DEBUG << "slot " << slot.id - << " : we have to evaluate at least 1 token to " - "generate logits"; + << " : we have to evaluate at least 1 token to " + "generate logits"; slot.n_past--; } @@ -1820,8 +1811,8 @@ struct llama_server_context { // if you get here, it means the KV cache is full - try increasing it // via the context size LOG_DEBUG << __func__ - << " : failed to decode the batch, n_batch = " << n_batch - << ", ret = " << ret; + << " : failed to decode the batch, n_batch = " << n_batch + << ", ret = " << ret; return false; } diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index c40d7b284..3ff0e899a 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -156,7 +156,7 @@ llamaCPP::~llamaCPP() { StopBackgroundTask(); } -void llamaCPP::WarmupModel(bool is_embedded_model) { +void llamaCPP::WarmupModel() { json pseudo; LOG_INFO << "Warm-up model"; @@ -164,7 +164,7 @@ void llamaCPP::WarmupModel(bool is_embedded_model) { pseudo["n_predict"] = 2; pseudo["stream"] = false; const int task_id = - llama.request_completion(pseudo, false, is_embedded_model, -1); + llama.request_completion(pseudo, false, false, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { @@ -405,36 +405,37 @@ void llamaCPP::InferenceImpl( return 0; }; // Queued task - state->instance->queue->runTaskInQueue( - [cb = std::move(callback), state, data, chunked_content_provider, request_id]() { - state->task_id = - state->instance->llama.request_completion(data, false, false, -1); - - // Start streaming response - auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, - "chat_completions.txt"); - cb(resp); - - int retries = 0; - - // Since this is an async task, we will wait for the task to be - // completed - while (state->inference_status != FINISHED && retries < 10) { - // Should wait chunked_content_provider lambda to be called within - // 3s - if (state->inference_status == PENDING) { - retries += 1; - } - if (state->inference_status != RUNNING) - LOG_INFO_REQUEST(request_id) - << "Wait for task to be released:" << state->task_id; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOG_INFO_REQUEST(request_id) << "Task completed, release it"; - // Request completed, release it - state->instance->llama.request_cancel(state->task_id); - LOG_INFO_REQUEST(request_id) << "Inference completed"; - }); + state->instance->queue->runTaskInQueue([cb = std::move(callback), state, + data, chunked_content_provider, + request_id]() { + state->task_id = + state->instance->llama.request_completion(data, false, false, -1); + + // Start streaming response + auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, + "chat_completions.txt"); + cb(resp); + + int retries = 0; + + // Since this is an async task, we will wait for the task to be + // completed + while (state->inference_status != FINISHED && retries < 10) { + // Should wait chunked_content_provider lambda to be called within + // 3s + if (state->inference_status == PENDING) { + retries += 1; + } + if (state->inference_status != RUNNING) + LOG_INFO_REQUEST(request_id) + << "Wait for task to be released:" << state->task_id; + 
std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + LOG_INFO_REQUEST(request_id) << "Task completed, release it"; + // Request completed, release it + state->instance->llama.request_cancel(state->task_id); + LOG_INFO_REQUEST(request_id) << "Inference completed"; + }); } else { queue->runTaskInQueue( [this, request_id, cb = std::move(callback), d = std::move(data)]() { @@ -607,10 +608,7 @@ void llamaCPP::LoadModel( bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { gpt_params params; - // For model like nomic-embed-text-v1.5.f16.gguf, etc, we need to warmup model with flag embedding = true. - // So we use this variable to differentiate with other models - // TODO: in case embedded model only, we should reject completion request from user? - bool is_embedded_model = false; + // By default will setting based on number of handlers if (jsonBody) { if (!jsonBody->operator[]("mmproj").isNull()) { @@ -654,9 +652,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); - is_embedded_model = - !(*jsonBody)["embedding"].isNull() && (*jsonBody)["embedding"].asBool(); - params.embedding = jsonBody->get("embedding", true).asBool(); + params.embedding = jsonBody->get("embedding", false).asBool(); // Check if n_parallel exists in jsonBody, if not, set to drogon_thread params.n_batch = jsonBody->get("n_batch", 512).asInt(); params.n_parallel = jsonBody->get("n_parallel", 1).asInt(); @@ -713,7 +709,13 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { LOG_INFO << "Started background task here!"; backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); - WarmupModel(is_embedded_model); + + // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model. + // So we use this variable to differentiate with other models + // TODO: in case embedded model only, we should reject completion request from user? 
+ if (!params.embedding) { + WarmupModel(); + } return true; } diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 914caff60..900786c79 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController, void EmbeddingImpl(std::shared_ptr jsonBody, std::function&& callback); bool CheckModelLoaded(const std::function& callback); - void WarmupModel(bool is_embedded_model); + void WarmupModel(); void BackgroundTask(); void StopBackgroundTask(); }; From c7bb799548c983178db944bca8f9fd3108fc7a2f Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 18:19:45 +0700 Subject: [PATCH 7/7] fix: add model_type --- README.md | 1 + controllers/llamaCPP.cc | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 565e399dd..92fda215e 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ Table of parameters |`grp_attn_w`|Integer|Group attention width in self-extend| |`mlock`|Boolean|Prevent system swapping of the model to disk in macOS| |`grammar_file`| String |You can constrain the sampling using GBNF grammars by providing path to a grammar file| +|`model_type` | String | Model type we want to use: llm or embedding, default value is llm| ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal ```zsh diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 3ff0e899a..3e3015c2a 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -163,8 +163,7 @@ void llamaCPP::WarmupModel() { pseudo["prompt"] = "Hello"; pseudo["n_predict"] = 2; pseudo["stream"] = false; - const int task_id = - llama.request_completion(pseudo, false, false, -1); + const int task_id = llama.request_completion(pseudo, false, false, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { @@ -608,7 +607,7 @@ void llamaCPP::LoadModel( bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { gpt_params params; - + std::string model_type; // By default will setting based on number of handlers if (jsonBody) { if (!jsonBody->operator[]("mmproj").isNull()) { @@ -652,7 +651,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); - params.embedding = jsonBody->get("embedding", false).asBool(); + params.embedding = jsonBody->get("embedding", true).asBool(); + model_type = jsonBody->get("model_type", "llm").asString(); // Check if n_parallel exists in jsonBody, if not, set to drogon_thread params.n_batch = jsonBody->get("n_batch", 512).asInt(); params.n_parallel = jsonBody->get("n_parallel", 1).asInt(); @@ -713,7 +713,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model. // So we use this variable to differentiate with other models // TODO: in case embedded model only, we should reject completion request from user? - if (!params.embedding) { + if (model_type == "llm") { WarmupModel(); } return true;
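
For reference, below is a minimal sketch of the JSON body that the patched `LoadModelImpl()` would parse after the last commit, assuming the same jsoncpp API already used in `llamaCPP.cc`. Only keys visible in the hunks above (`ctx_len`, `ngl`, `n_batch`, `n_parallel`, `embedding`, `model_type`) are included; the model-path key is not shown in this patch series and is deliberately omitted, so treat this as an illustration of the new `model_type` gate rather than a complete load request.

```cpp
// Illustrative sketch only: constructs the JSON body that the patched
// LoadModelImpl() reads. Keys mirror the ones visible in the hunks above;
// the model-path key is not shown in these patches and is left out.
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value body;
  body["ctx_len"] = 2048;            // -> params.n_ctx
  body["ngl"] = 100;                 // -> params.n_gpu_layers
  body["n_batch"] = 512;             // -> params.n_batch
  body["n_parallel"] = 1;            // -> params.n_parallel (number of slots)
  body["embedding"] = true;          // defaults to true after the final commit
  body["model_type"] = "embedding";  // "llm" (the default) would trigger WarmupModel()

  // With model_type != "llm", the patched code skips warm-up, which is the
  // stated intent for embedding-only models such as nomic-embed-text.
  std::cout << body.toStyledString();
  return 0;
}
```

With `model_type` set to `"llm"` (or omitted, since that is the documented default in the README), the same body would instead cause `WarmupModel()` to run a two-token completion before the server accepts requests.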