context/llama_server_context.h (3 changes: 1 addition & 2 deletions)

@@ -471,7 +471,7 @@ struct llama_server_context {
     llama_batch batch;
 
     bool multimodal = false;
-    bool clean_kv_cache = true;
+    bool clean_kv_cache = false;
     bool all_slots_are_idle = false;
     bool add_bos_token = true;
 
@@ -860,7 +860,6 @@ struct llama_server_context {
     void kv_cache_clear() {
        // clear the entire KV cache
        llama_kv_cache_clear(ctx);
-       clean_kv_cache = false;
     }
 
     void update_system_prompt() {
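Taken together, the two hunks above change when the KV cache gets wiped: clean_kv_cache now starts out false, and kv_cache_clear() no longer flips it back after a clear. A minimal standalone sketch of the likely effect, assuming an upstream llama.cpp-style gate in the slot-update loop (that gate is not part of this diff, so treat it as an assumption; all names here are stubs):

// Hypothetical stub, not the real server: models how clean_kv_cache gates
// cache clearing after this change. The actual clear is simplified to a print.
#include <iostream>
#include <string>

struct server_sketch {
  bool clean_kv_cache = false;  // new default from this diff
  bool all_slots_are_idle = true;
  std::string system_prompt;

  void kv_cache_clear() {
    // clear the entire KV cache (stubbed out); note the flag is no longer
    // reset here, matching the deletion above
    std::cout << "KV cache cleared\n";
  }

  void update_slots() {
    // assumed upstream-style gate: idle slots clear only when asked to
    if (all_slots_are_idle && system_prompt.empty() && clean_kv_cache) {
      kv_cache_clear();
    }
  }
};

int main() {
  server_sketch s;
  s.update_slots();         // no output: the cache survives for prompt reuse
  s.clean_kv_cache = true;  // something explicitly requests a wipe
  s.update_slots();         // prints "KV cache cleared"
}

Under the old defaults, the first idle pass would have cleared the cache once and reset the flag; with this change the cache is left intact unless a caller opts in, which is what prompt caching needs.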
controllers/llamaCPP.cc (2 changes: 1 addition & 1 deletion)

@@ -615,7 +615,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   params.cont_batching = jsonBody->get("cont_batching", false).asBool();
   this->clean_cache_threshold =
       jsonBody->get("clean_cache_threshold", 5).asInt();
-  this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
+  this->caching_enabled = jsonBody->get("caching_enabled", true).asBool();
   this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
   this->system_prompt =
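This companion change flips the load-time default so prompt caching is on unless the client disables it. A small sketch of the lookup semantics, using jsoncpp's Json::Value::get() exactly as the file above does (the driver code is illustrative only):

// Illustrative only: shows what jsonBody->get("caching_enabled", true)
// returns with and without the key present in the load-model request body.
#include <json/json.h>
#include <iostream>
#include <memory>

int main() {
  auto jsonBody = std::make_shared<Json::Value>(Json::objectValue);

  // Key absent: get() falls back to the new default, so caching is enabled.
  bool caching_enabled = jsonBody->get("caching_enabled", true).asBool();
  std::cout << std::boolalpha << caching_enabled << "\n";  // true

  // Clients wanting the old behaviour must now opt out explicitly.
  (*jsonBody)["caching_enabled"] = false;
  caching_enabled = jsonBody->get("caching_enabled", true).asBool();
  std::cout << caching_enabled << "\n";  // false
}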