From 876790f57a959608c9a44ea5ca95a5e9ff2c7ec1 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 16 Apr 2024 17:44:39 +0700
Subject: [PATCH 1/3] fix: e2e testing for linux and mac

---
 .github/scripts/e2e-test-llama-linux-and-mac.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/scripts/e2e-test-llama-linux-and-mac.sh b/.github/scripts/e2e-test-llama-linux-and-mac.sh
index 5b7b9771d..7d888e08f 100644
--- a/.github/scripts/e2e-test-llama-linux-and-mac.sh
+++ b/.github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -65,14 +65,12 @@ fi
 response2=$(
     curl --connect-timeout 60 -o /tmp/completion-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \
         --header 'Content-Type: application/json' \
-        --header 'Accept: text/event-stream' \
-        --header 'Access-Control-Allow-Origin: *' \
         --data '{
             "messages": [
                 {"content": "Hello there", "role": "assistant"},
                 {"content": "Write a long and sad story for me", "role": "user"}
             ],
-            "stream": true,
+            "stream": false,
             "model": "gpt-3.5-turbo",
             "max_tokens": 50,
             "stop": ["hello"],

From 849f7b4abd465f0d0773d87f37fefe0bcf4c48e9 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 16 Apr 2024 19:30:26 +0700
Subject: [PATCH 2/3] fix: thread hang if unload model when still receiving
 stream data

---
 controllers/llamaCPP.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index e4f33a27e..69284d6e9 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -190,8 +190,7 @@ void llamaCPP::InferenceImpl(
   if (llama.model_type == ModelType::EMBEDDING) {
     LOG_WARN << "Not support completion for embedding model";
     Json::Value jsonResp;
-    jsonResp["message"] =
-        "Not support completion for embedding model";
+    jsonResp["message"] = "Not support completion for embedding model";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     resp->setStatusCode(drogon::k400BadRequest);
     callback(resp);
@@ -429,7 +428,8 @@ void llamaCPP::InferenceImpl(
 
   // Since this is an async task, we will wait for the task to be
   // completed
-  while (state->inference_status != FINISHED && retries < 10) {
+  while (state->inference_status != FINISHED && retries < 10 &&
+         state->instance->llama.model_loaded_external) {
     // Should wait chunked_content_provider lambda to be called within
     // 3s
     if (state->inference_status == PENDING) {
@@ -748,9 +748,10 @@ void llamaCPP::StopBackgroundTask() {
   if (llama.model_loaded_external) {
     llama.model_loaded_external = false;
     llama.condition_tasks.notify_one();
-    LOG_INFO << "Background task stopped! ";
+    LOG_INFO << "Stopping background task! ";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
     }
+    LOG_INFO << "Background task stopped! ";
   }
 }

From 9608a0c905ff84c3ce85e475d6d0555d2b9240b7 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 16 Apr 2024 19:30:49 +0700
Subject: [PATCH 3/3] Revert "fix: e2e testing for linux and mac"

This reverts commit 876790f57a959608c9a44ea5ca95a5e9ff2c7ec1.
---
 .github/scripts/e2e-test-llama-linux-and-mac.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/e2e-test-llama-linux-and-mac.sh b/.github/scripts/e2e-test-llama-linux-and-mac.sh
index 7d888e08f..5b7b9771d 100644
--- a/.github/scripts/e2e-test-llama-linux-and-mac.sh
+++ b/.github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -65,12 +65,14 @@ fi
 response2=$(
     curl --connect-timeout 60 -o /tmp/completion-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \
         --header 'Content-Type: application/json' \
+        --header 'Accept: text/event-stream' \
+        --header 'Access-Control-Allow-Origin: *' \
         --data '{
             "messages": [
                 {"content": "Hello there", "role": "assistant"},
                 {"content": "Write a long and sad story for me", "role": "user"}
             ],
-            "stream": false,
+            "stream": true,
             "model": "gpt-3.5-turbo",
             "max_tokens": 50,
             "stop": ["hello"],
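
Notes on the series: PATCH 1/3 temporarily switched the e2e chat-completion
test to a non-streaming request and PATCH 3/3 reverts it exactly, so the net
change is PATCH 2/3's shutdown fix. That fix follows a common pattern: a
polling wait loop must also observe the unload flag, and the unloader must
clear the flag before waking and joining the worker; otherwise a request
still draining stream data can keep its thread spinning forever. Below is a
minimal, self-contained C++ sketch of that pattern; the names (Worker,
model_loaded, WaitForCompletion, Stop) are illustrative stand-ins, not
nitro's actual API.

// Sketch only: the shutdown-aware wait pattern from PATCH 2/3.
// All names below are hypothetical, not nitro's actual code.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <thread>

enum class Status { PENDING, RUNNING, FINISHED };

struct Worker {
  std::atomic<bool> model_loaded{true};           // cleared by Stop()
  std::atomic<Status> inference_status{Status::PENDING};
  std::condition_variable condition_tasks;        // would wake a task loop (not shown)
  std::thread background_thread;

  // Before the fix, this loop checked only the status and the retry
  // budget, so an unload during streaming left it spinning. The extra
  // model_loaded check is the essence of the patch.
  void WaitForCompletion() {
    int retries = 0;
    while (inference_status != Status::FINISHED && retries < 10 &&
           model_loaded) {
      if (inference_status == Status::PENDING) {
        ++retries;                                // task not picked up yet
      }
      std::this_thread::sleep_for(std::chrono::milliseconds(300));
    }
  }

  // Mirrors StopBackgroundTask(): clear the flag first, wake any sleeper,
  // then join the background thread.
  void Stop() {
    if (model_loaded.exchange(false)) {
      condition_tasks.notify_one();
      std::cout << "Stopping background task!\n";
      if (background_thread.joinable()) background_thread.join();
      std::cout << "Background task stopped!\n";
    }
  }
};

Logging both before and after join(), as the patch does, makes a shutdown
hang visible: if only "Stopping background task!" ever appears in the log,
the join is stuck and the wait loop is not observing the unload flag.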