@@ -187,6 +187,16 @@ void llamaCPP::ChatCompletion(
 void llamaCPP::InferenceImpl(
     inferences::ChatCompletionRequest&& completion,
     std::function<void(const HttpResponsePtr&)>&& callback) {
+  if (llama.model_type == ModelType::EMBEDDING) {
+    LOG_WARN << "Not support completion for embedding model";
+    Json::Value jsonResp;
+    jsonResp["message"] =
+        "Not support completion for embedding model";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k400BadRequest);
+    callback(resp);
+    return;
+  }
   std::string formatted_output = pre_prompt;
   int request_id = ++no_of_requests;
   LOG_INFO_REQUEST(request_id) << "Generating reponse for inference request";
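
For reference, a caller that posts a completion request while an embedding model is loaded now receives HTTP 400 carrying the JSON body built above. A minimal standalone sketch of that body using jsoncpp follows; the exact wire formatting produced by nitro_utils::nitroHttpJsonResponse is not shown in this diff, so treat the serialization details as an assumption.

    // Sketch only: reproduces the payload the handler attaches to the
    // k400BadRequest response; how nitroHttpJsonResponse serializes it is assumed.
    #include <json/json.h>
    #include <iostream>

    int main() {
      Json::Value jsonResp;
      jsonResp["message"] = "Not support completion for embedding model";
      std::cout << jsonResp.toStyledString() << std::endl;  // prints the JSON object with the message field
      return 0;
    }
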
@@ -653,6 +663,11 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
   params.embedding = jsonBody->get("embedding", true).asBool();
   model_type = jsonBody->get("model_type", "llm").asString();
+  if (model_type == "llm") {
+    llama.model_type = ModelType::LLM;
+  } else {
+    llama.model_type = ModelType::EMBEDDING;
+  }
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
   params.n_batch = jsonBody->get("n_batch", 512).asInt();
   params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
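
The ModelType enum and the model_type member on the llama server context are referenced here but declared elsewhere; a minimal sketch of what this hunk presupposes is shown below. The enumerator names come from the diff, while the struct name and the default value are assumptions.

    // Assumed declarations, not taken from this diff: only ModelType::LLM,
    // ModelType::EMBEDDING, and the model_type member are confirmed by the code above.
    enum class ModelType { LLM = 0, EMBEDDING };

    struct ServerContextSketch {
      // Defaulting to LLM keeps the existing behavior (warmup + completions)
      // unless the load request explicitly selects an embedding model.
      ModelType model_type = ModelType::LLM;
    };
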
@@ -712,8 +727,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
 
   // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model.
   // So we use this variable to differentiate with other models
-  // TODO: in case embedded model only, we should reject completion request from user?
-  if (model_type == "llm") {
+  if (llama.model_type == ModelType::LLM) {
     WarmupModel();
   }
   return true;
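
As a usage note, the warmup gate above is driven entirely by the "model_type" field parsed in LoadModelImpl: any value other than "llm" selects ModelType::EMBEDDING, which skips WarmupModel() and makes completion requests return HTTP 400. A hedged sketch of a load-model body that triggers that path follows, built with jsoncpp and limited to the fields visible in this diff; a real request also needs the model path and other options, which are omitted here.

    // Sketch of a load request for an embedding-only model (fields beyond those
    // shown in this diff, such as the model path, are intentionally left out).
    #include <json/json.h>
    #include <iostream>

    int main() {
      Json::Value body;
      body["ctx_len"] = 2048;            // -> params.n_ctx
      body["embedding"] = true;          // -> params.embedding
      body["model_type"] = "embedding";  // anything other than "llm" -> ModelType::EMBEDDING
      std::cout << body.toStyledString() << std::endl;
      return 0;
    }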