diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 24e10d9be..49fc290ff 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -344,3 +344,16 @@ void llamaCPP::loadModel(
   warmupModel();
   callback(resp);
 }
+
+void inferences::llamaCPP::unloadModel(const HttpRequestPtr &req, std::function<void(const HttpResponsePtr &)> &&callback)
+{
+  Json::Value jsonResp;
+  jsonResp["message"] = "No model loaded";
+  if (model_loaded) {
+    llama.unloadModel();
+    model_loaded = false;
+    jsonResp["message"] = "Model unloaded successfully";
+  }
+  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+  callback(resp);
+}
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 2b188867a..67b5d979c 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -260,6 +260,13 @@ struct llama_server_context {
     return true;
   }
 
+  void unloadModel() {
+    llama_free(ctx);
+    llama_free_model(model);
+    ctx = nullptr;
+    model = nullptr;
+  }
+
   std::vector<llama_token> tokenize(const json &json_prompt,
                                     bool add_bos) const {
     // If `add_bos` is true, we only add BOS, when json_prompt is a string,
@@ -1272,6 +1279,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
   METHOD_ADD(llamaCPP::embedding, "embedding", Post);
   METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
+  METHOD_ADD(llamaCPP::unloadModel, "unloadmodel", Delete);
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
@@ -1282,6 +1290,9 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
                  std::function<void(const HttpResponsePtr &)> &&callback);
   void warmupModel();
 
+  void unloadModel(const HttpRequestPtr &req,
+                   std::function<void(const HttpResponsePtr &)> &&callback);
+
 private:
   llama_server_context llama;
   bool model_loaded = false;
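
To exercise the new route, a DELETE request against the unloadmodel path should suffice; the example below assumes Nitro's default port (3928) and the /inferences/llamacpp prefix that Drogon derives from the inferences::llamaCPP namespace, so adjust both if your deployment differs:

    curl -X DELETE http://localhost:3928/inferences/llamacpp/unloadmodel

The handler replies "Model unloaded successfully" when a model was resident and "No model loaded" otherwise; because model_loaded is reset to false, a subsequent POST to loadmodel can bring up a different model in the same process.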