diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index da28c1bcb..474e675c6 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -187,16 +187,8 @@ void llamaCPP::chatCompletion(
   data["presence_penalty"] = (*jsonBody).get("presence_penalty", 0).asFloat();
   const Json::Value &messages = (*jsonBody)["messages"];
 
-  if (!(*jsonBody)["grammar_file"].isNull()) {
-    std::string grammar_file = (*jsonBody)["grammar_file"].asString();
-    std::ifstream file(grammar_file);
-    if (!file) {
-      LOG_ERROR << "Grammar file not found";
-    } else {
-      std::stringstream grammarBuf;
-      grammarBuf << file.rdbuf();
-      data["grammar"] = grammarBuf.str();
-    }
+  if (!grammar_file_content.empty()) {
+    data["grammar"] = grammar_file_content;
   };
 
   if (!llama.multimodal) {
@@ -514,6 +506,19 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
   if (!jsonBody["mlock"].isNull()) {
     params.use_mlock = jsonBody["mlock"].asBool();
   }
+
+  if (!jsonBody["grammar_file"].isNull()) {
+    std::string grammar_file = jsonBody["grammar_file"].asString();
+    std::ifstream file(grammar_file);
+    if (!file) {
+      LOG_ERROR << "Grammar file not found";
+    } else {
+      std::stringstream grammarBuf;
+      grammarBuf << file.rdbuf();
+      grammar_file_content = grammarBuf.str();
+    }
+  };
+
   params.model = jsonBody["llama_model_path"].asString();
   params.n_gpu_layers = jsonBody.get("ngl", 100).asInt();
   params.n_ctx = jsonBody.get("ctx_len", 2048).asInt();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 234f798ab..a9581da21 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2576,5 +2576,6 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   int clean_cache_threshold;
   std::atomic<bool> single_queue_is_busy; // This value only used under the
                                           // condition n_parallel is 1
+  std::string grammar_file_content;
 };
 }; // namespace inferences
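
Note on the resulting behavior: the grammar file is now read from disk once, in
loadModelImpl, and cached in the new grammar_file_content member. chatCompletion
attaches the cached grammar to every request instead of reopening the file per
call, and a per-request "grammar_file" key is no longer consulted, so switching
grammars requires reloading the model. Below is a minimal sketch of a
load-request body that exercises the new path, built with the jsoncpp API the
controller already uses; makeLoadRequest and both file paths are hypothetical
placeholders, not part of the patch:

    #include <json/json.h> // jsoncpp, as used by the controller

    // Sketch only: builds the JSON body that loadModelImpl reads above.
    Json::Value makeLoadRequest() {
      Json::Value body;
      body["llama_model_path"] = "/models/model.gguf"; // placeholder path
      body["ngl"] = 100;                               // optional; defaults to 100
      body["ctx_len"] = 2048;                          // optional; defaults to 2048
      body["grammar_file"] = "/grammars/json.gbnf";    // placeholder; read once at
                                                       // load time, then cached
      return body;
    }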