From a4c0b8ec83195a7a1afed2b32e625b8a4e981525 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 11 Dec 2023 16:01:51 +0700
Subject: [PATCH 1/4] number of chats completed

---
 controllers/llamaCPP.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index f480f7062..e1e3a6309 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -1909,5 +1909,6 @@ class llamaCPP : public drogon::HttpController {
   std::string pre_prompt;
   int repeat_last_n;
   bool caching_enabled;
+  std::atomic no_of_chats = 0;
 };
 }; // namespace inferences

From 2f179bc80d17cc05a5c1c3ad5853b6db453155d3 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 11 Dec 2023 16:49:42 +0700
Subject: [PATCH 2/4] load model with clean_cache_threshold

---
 controllers/llamaCPP.cc | 11 +++++++++++
 controllers/llamaCPP.h  |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index d9aa26716..dec8716a2 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include

 using namespace inferences;
 using json = nlohmann::json;
@@ -177,6 +178,14 @@ void llamaCPP::chatCompletion(
   // To set default value
   if (jsonBody) {
+    // Increase number of chats received and clean the prompt
+    no_of_chats++;
+    if (no_of_chats % clean_cache_threshold == 0) {
+      LOG_INFO << "Clean cache threshold reached!";
+      llama.kv_cache_clear();
+      LOG_INFO << "Cache cleaned";
+    }
+
     // Default values to enable auto caching
     data["cache_prompt"] = caching_enabled;
     data["n_keep"] = -1;
@@ -390,6 +399,8 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
           .asInt();
   params.cont_batching = jsonBody.get("cont_batching", false).asBool();
+  this->clean_cache_threshold =
+      jsonBody.get("clean_cache_threshold", 5).asInt();
   this->caching_enabled = jsonBody.get("caching_enabled", false).asBool();
   this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index e1e3a6309..2528056fd 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -1909,6 +1909,7 @@ class llamaCPP : public drogon::HttpController {
   std::string pre_prompt;
   int repeat_last_n;
   bool caching_enabled;
-  std::atomic no_of_chats = 0;
+  std::atomic no_of_chats = 0;
+  int clean_cache_threshold;
 };
 }; // namespace inferences

From 4d96e59cd972ba479e0661efe7a78cc8d3ec85e9 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 11 Dec 2023 16:50:49 +0700
Subject: [PATCH 3/4] redundant

---
 controllers/llamaCPP.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index dec8716a2..84c323da1 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include

 using namespace inferences;
 using json = nlohmann::json;

From 5aa10f9cc8d96c3b04270ac5c1eff39c9dd8d6fe Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 11 Dec 2023 16:52:10 +0700
Subject: [PATCH 4/4] clean cache threshold docs update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 3272fb249..759932e17 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ Table of parameters
 | `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
 | `n_batch` | Integer | The batch size for prompt eval step |
 | `caching_enabled` | Boolean | To enable prompt caching or not |
+| `clean_cache_threshold` | Integer | Number of chats that will trigger clean cache action |

 ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
 ```zsh
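
A minimal usage sketch of the new parameter: `loadModelImpl` reads `clean_cache_threshold` from the load-model request body with a default of 5, and `chatCompletion` then calls `llama.kv_cache_clear()` once every that many chats. The endpoint path and model path below are assumptions for illustration and are not taken from these patches; only `caching_enabled` and `clean_cache_threshold` come from the diffs above.

```zsh
# Hypothetical request: endpoint and model path are placeholders, not confirmed by this patch set.
# Sets the KV cache to be cleared after every 10 chat completions (default is 5 if omitted).
curl http://localhost:3928/inferences/llamacpp/loadmodel \
  -H 'Content-Type: application/json' \
  -d '{
    "llama_model_path": "/path/to/model.gguf",
    "caching_enabled": true,
    "clean_cache_threshold": 10
  }'
```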