1 change: 1 addition & 0 deletions README.md
@@ -109,6 +109,7 @@ Table of parameters
| `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
| `n_batch` | Integer | The batch size for prompt eval step |
| `caching_enabled` | Boolean | To enable prompt caching or not |
| `clean_cache_threshold` | Integer | Number of chats after which the prompt cache is cleared (see the example below) |

***OPTIONAL***: You can run Nitro on a different port (for example 5000 instead of 3928) by starting it manually from the terminal.
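As a usage sketch of the new parameter: the request below loads a model with prompt caching enabled and clears the cache every 10 chats. The `/inferences/llamacpp/loadmodel` endpoint and the `llama_model_path` field are assumed from the existing load-model API and are not part of this change; adjust them to your setup.

```zsh
# Hypothetical example: enable prompt caching and clear the KV cache every 10 chats.
# Endpoint path and model path are assumptions; adjust to your setup.
curl http://localhost:3928/inferences/llamacpp/loadmodel \
  -H 'Content-Type: application/json' \
  -d '{
    "llama_model_path": "/path/to/model.gguf",
    "caching_enabled": true,
    "clean_cache_threshold": 10
  }'
```

With these settings, the server clears llama.cpp's KV cache after every 10th chat completion, as implemented in `chatCompletion` below.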
10 changes: 10 additions & 0 deletions controllers/llamaCPP.cc
@@ -177,6 +177,14 @@ void llamaCPP::chatCompletion(
// To set default value

if (jsonBody) {
// Increment the chat counter and clear the KV cache once the threshold is reached
no_of_chats++;
if (no_of_chats % clean_cache_threshold == 0) {
LOG_INFO << "Clean cache threshold reached!";
llama.kv_cache_clear();
LOG_INFO << "Cache cleaned";
}

// Default values to enable auto caching
data["cache_prompt"] = caching_enabled;
data["n_keep"] = -1;
@@ -390,6 +398,8 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
.asInt();
params.cont_batching = jsonBody.get("cont_batching", false).asBool();

this->clean_cache_threshold =
jsonBody.get("clean_cache_threshold", 5).asInt();
this->caching_enabled = jsonBody.get("caching_enabled", false).asBool();
this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
2 changes: 2 additions & 0 deletions controllers/llamaCPP.h
@@ -1909,5 +1909,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
std::string pre_prompt;
int repeat_last_n;
bool caching_enabled;
std::atomic<int> no_of_chats = 0;
int clean_cache_threshold;
};
}; // namespace inferences