From d1133c1a72cadbd7fc87da5c4d4b2ceebd2f4a14 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 15 Apr 2024 10:19:24 +0700
Subject: [PATCH 1/5] will always be cleaning cache

---
 context/llama_server_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/context/llama_server_context.h b/context/llama_server_context.h
index 9bc00f074..0aedc494c 100644
--- a/context/llama_server_context.h
+++ b/context/llama_server_context.h
@@ -471,7 +471,7 @@ struct llama_server_context {
   llama_batch batch;

   bool multimodal = false;
-  bool clean_kv_cache = false;
+  bool clean_kv_cache = true;
   bool all_slots_are_idle = false;
   bool add_bos_token = true;

From 91792ddc8fa74c31209c0c3b1e0f02e2f50d1a47 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 15 Apr 2024 11:50:26 +0700
Subject: [PATCH 2/5] turn off caching

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 7e616f9b2..caca80b46 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -655,7 +655,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) {
   params.cont_batching = jsonBody->get("cont_batching", false).asBool();
   this->clean_cache_threshold =
       jsonBody->get("clean_cache_threshold", 5).asInt();
-  this->caching_enabled = jsonBody->get("caching_enabled", true).asBool();
+  this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
   this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =

From 9a46624a103570654c363492dda6b2926c376d0d Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 15 Apr 2024 12:52:55 +0700
Subject: [PATCH 3/5] force no caching

---
 controllers/llamaCPP.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index caca80b46..e7984eb5c 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -205,7 +205,8 @@ void llamaCPP::InferenceImpl(
   }

   // Default values to enable auto caching
-  data["cache_prompt"] = caching_enabled;
+  //data["cache_prompt"] = caching_enabled;
+  data["cache_prompt"] = false;
   data["n_keep"] = -1;

   // Passing load value

From e2bc67aac12ecf20c25187f82722b87d29c796f8 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 15 Apr 2024 15:41:51 +0700
Subject: [PATCH 4/5] hotfix: turn off current cache logic

---
 controllers/llamaCPP.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index e7984eb5c..12a2471c8 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -197,12 +197,12 @@ void llamaCPP::InferenceImpl(
   // To set default value

   // Increase number of chats received and clean the prompt
-  no_of_chats++;
-  if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
-    llama.kv_cache_clear();
-    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
-  }
+  //no_of_chats++;
+  //if (no_of_chats % clean_cache_threshold == 0) {
+  //  LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
+  //  llama.kv_cache_clear();
+  //  LOG_INFO_REQUEST(request_id) << "Cache cleaned";
+  //}

   // Default values to enable auto caching
   //data["cache_prompt"] = caching_enabled;
@@ -656,7 +656,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) {
   params.cont_batching = jsonBody->get("cont_batching", false).asBool();
   this->clean_cache_threshold =
      jsonBody->get("clean_cache_threshold", 5).asInt();
-  this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
+  this->caching_enabled = jsonBody->get("caching_enabled", true).asBool();
   this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =

From 1276fbca109b0156e890f8430a9271ee5467774c Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 15 Apr 2024 15:49:43 +0700
Subject: [PATCH 5/5] turn off all kinds of caching

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 12a2471c8..8efe3a464 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -656,7 +656,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) {
   params.cont_batching = jsonBody->get("cont_batching", false).asBool();
   this->clean_cache_threshold =
       jsonBody->get("clean_cache_threshold", 5).asInt();
-  this->caching_enabled = jsonBody->get("caching_enabled", true).asBool();
+  this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
   this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =
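
Taken together, these five patches disable prompt caching regardless of client input: clean_kv_cache now defaults to true in llama_server_context, InferenceImpl hard-codes data["cache_prompt"] = false, the threshold-based llama.kv_cache_clear() block is commented out, and LoadModelImpl defaults caching_enabled to false. For illustration only, a minimal sketch of the JSON load-model body that LoadModelImpl parses, limited to the keys visible in these hunks; the values simply mirror the in-code defaults, the system_prompt default is truncated in the context lines and is omitted, and a real request would also carry fields not shown in these diffs (such as the model path):

    {
      "caching_enabled": false,
      "clean_cache_threshold": 5,
      "cont_batching": false,
      "user_prompt": "USER: ",
      "ai_prompt": "ASSISTANT: "
    }

Note that clean_cache_threshold is still parsed but has no effect while the cache-cleaning block in InferenceImpl remains commented out.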