diff --git a/context/llama_server_context.h b/context/llama_server_context.h
index 9bc00f074..0aedc494c 100644
--- a/context/llama_server_context.h
+++ b/context/llama_server_context.h
@@ -471,7 +471,7 @@ struct llama_server_context {
   llama_batch batch;
 
   bool multimodal = false;
-  bool clean_kv_cache = false;
+  bool clean_kv_cache = true;
   bool all_slots_are_idle = false;
   bool add_bos_token = true;
 
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 7e616f9b2..8efe3a464 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -197,15 +197,16 @@ void llamaCPP::InferenceImpl(
   // To set default value
 
   // Increase number of chats received and clean the prompt
-  no_of_chats++;
-  if (no_of_chats % clean_cache_threshold == 0) {
-    LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
-    llama.kv_cache_clear();
-    LOG_INFO_REQUEST(request_id) << "Cache cleaned";
-  }
+  //no_of_chats++;
+  //if (no_of_chats % clean_cache_threshold == 0) {
+  //  LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!";
+  //  llama.kv_cache_clear();
+  //  LOG_INFO_REQUEST(request_id) << "Cache cleaned";
+  //}
 
   // Default values to enable auto caching
-  data["cache_prompt"] = caching_enabled;
+  //data["cache_prompt"] = caching_enabled;
+  data["cache_prompt"] = false;
   data["n_keep"] = -1;
 
   // Passing load value
@@ -655,7 +656,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
     params.cont_batching = jsonBody->get("cont_batching", false).asBool();
     this->clean_cache_threshold =
        jsonBody->get("clean_cache_threshold", 5).asInt();
-    this->caching_enabled = jsonBody->get("caching_enabled", true).asBool();
+    this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
    this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
    this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
    this->system_prompt =