diff --git a/README.md b/README.md
index 3272fb249..759932e17 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ Table of parameters
 | `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
 | `n_batch` | Integer | The batch size for prompt eval step |
 | `caching_enabled` | Boolean | To enable prompt caching or not |
+| `clean_cache_threshold` | Integer | Number of chats received before the KV cache is cleared |
 
 ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
 ```zsh
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index d9aa26716..84c323da1 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -177,6 +177,14 @@ void llamaCPP::chatCompletion(
 
   // To set default value
   if (jsonBody) {
+    // Count the chats received and clear the KV cache once the threshold is reached
+    no_of_chats++;
+    if (clean_cache_threshold > 0 && no_of_chats % clean_cache_threshold == 0) {
+      LOG_INFO << "Clean cache threshold reached!";
+      llama.kv_cache_clear();
+      LOG_INFO << "Cache cleaned";
+    }
+
     // Default values to enable auto caching
     data["cache_prompt"] = caching_enabled;
     data["n_keep"] = -1;
@@ -390,6 +398,8 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
             .asInt();
   params.cont_batching = jsonBody.get("cont_batching", false).asBool();
 
+  this->clean_cache_threshold =
+      jsonBody.get("clean_cache_threshold", 5).asInt();
   this->caching_enabled = jsonBody.get("caching_enabled", false).asBool();
   this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index f480f7062..2528056fd 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -1909,5 +1909,7 @@ class llamaCPP : public drogon::HttpController {
   std::string pre_prompt;
   int repeat_last_n;
   bool caching_enabled;
+  std::atomic<int> no_of_chats = 0;
+  int clean_cache_threshold;
 };
 };  // namespace inferences
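A minimal usage sketch follows, assuming the default port 3928 mentioned in the README and a `loadmodel`-style endpoint; the endpoint path, model path, and threshold value here are illustrative, while `caching_enabled` and `clean_cache_threshold` map directly to the fields read in `loadModelImpl` above.

```zsh
# Hypothetical load request -- adjust the endpoint path and model path to your setup.
curl http://localhost:3928/inferences/llamacpp/loadmodel \
  -H 'Content-Type: application/json' \
  -d '{
        "llama_model_path": "/path/to/model.gguf",
        "caching_enabled": true,
        "clean_cache_threshold": 10
      }'
```

With this configuration, every tenth chat completion request would clear the KV cache via `llama.kv_cache_clear()`; omitting `clean_cache_threshold` falls back to the default of 5.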