From 50c03c94f284e64dfdbad90d82fe97ac4ea414ce Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 8 Apr 2024 10:58:07 +0700
Subject: [PATCH 1/4] feat: add sample code for unit tests

---
 CMakeLists.txt                   |  3 +-
 controllers/llamaCPP.cc          |  2 ++
 models/chat_completion_request.h |  4 +--
 nitro_deps/CMakeLists.txt        | 17 +++++++++-
 test/CMakeLists.txt              |  2 ++
 test/models/CMakeLists.txt       | 14 +++++++++
 test/models/main.cc              |  9 ++++++
 test/models/test_models.cc       | 53 ++++++++++++++++++++++++++++++++
 8 files changed, 100 insertions(+), 4 deletions(-)
 create mode 100644 test/CMakeLists.txt
 create mode 100644 test/models/CMakeLists.txt
 create mode 100644 test/models/main.cc
 create mode 100644 test/models/test_models.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3aa6b716d..0415d668f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,7 @@ add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}")
 add_subdirectory(llama.cpp/examples/llava)
 add_subdirectory(llama.cpp)
 add_subdirectory(whisper.cpp)
+add_subdirectory(test)
 
 add_executable(${PROJECT_NAME} main.cc)
 
@@ -102,4 +103,4 @@ target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC})
 # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC})
 # ##############################################################################
 # uncomment the following line for dynamically loading views set_property(TARGET
-# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON)
+# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON)
\ No newline at end of file
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 14cf17ab1..bcd2cab9c 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -582,8 +582,10 @@ void llamaCPP::LoadModel(
 
 bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   gpt_params params;
+  LOG_INFO << "Start loading model";
   // By default will setting based on number of handlers
   if (jsonBody) {
+    LOG_DEBUG << "Start parsing jsonBody";
     if (!jsonBody->operator[]("mmproj").isNull()) {
       LOG_INFO << "MMPROJ FILE detected, multi-model enabled!";
       params.mmproj = jsonBody->operator[]("mmproj").asString();
diff --git a/models/chat_completion_request.h b/models/chat_completion_request.h
index bd802d67e..f4fd087f5 100644
--- a/models/chat_completion_request.h
+++ b/models/chat_completion_request.h
@@ -5,8 +5,8 @@ namespace inferences {
 struct ChatCompletionRequest {
   bool stream = false;
   int max_tokens = 500;
-  float top_p = 0.95;
-  float temperature = 0.8;
+  float top_p = 0.95f;
+  float temperature = 0.8f;
   float frequency_penalty = 0;
   float presence_penalty = 0;
   Json::Value stop = Json::Value(Json::arrayValue);
diff --git a/nitro_deps/CMakeLists.txt b/nitro_deps/CMakeLists.txt
index 03ab352db..4982816f5 100644
--- a/nitro_deps/CMakeLists.txt
+++ b/nitro_deps/CMakeLists.txt
@@ -79,8 +79,23 @@ ExternalProject_Add(
   -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
 )
 
-# Fix trantor cmakelists to link c-ares on Windows
+# Download and install GoogleTest
+ExternalProject_Add(
+  gtest
+  GIT_REPOSITORY https://github.com/google/googletest
+  GIT_TAG v1.14.0
+  CMAKE_ARGS
+  -DCMAKE_BUILD_TYPE=release
+  -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH}
+  -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+
 if(WIN32)
+  # Fix dynamic link for gtest
+  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+  # Fix trantor cmakelists to link c-ares on Windows
   set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/nitro_deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt)
   ExternalProject_Add_Step(drogon trantor_custom_target
     COMMAND ${CMAKE_COMMAND} -E echo add_definitions(-DCARES_STATICLIB) >> ${TRANTOR_CMAKE_FILE}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 000000000..efeea693d
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,2 @@
+
+add_subdirectory(models)
\ No newline at end of file
diff --git a/test/models/CMakeLists.txt b/test/models/CMakeLists.txt
new file mode 100644
index 000000000..5bb101c3d
--- /dev/null
+++ b/test/models/CMakeLists.txt
@@ -0,0 +1,14 @@
+file(GLOB SRCS *.cc)
+project(test-models)
+
+add_executable(${PROJECT_NAME} ${SRCS})
+
+find_package(Drogon CONFIG REQUIRED)
+find_package(GTest CONFIG REQUIRED)
+
+target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon GTest::gtest GTest::gmock
+                      ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)
+
+add_test(NAME ${PROJECT_NAME}
+         COMMAND ${PROJECT_NAME})
\ No newline at end of file
diff --git a/test/models/main.cc b/test/models/main.cc
new file mode 100644
index 000000000..e2ba5955d
--- /dev/null
+++ b/test/models/main.cc
@@ -0,0 +1,9 @@
+#include "gtest/gtest.h"
+#include
+#include
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}
\ No newline at end of file
diff --git a/test/models/test_models.cc b/test/models/test_models.cc
new file mode 100644
index 000000000..ac27e97dd
--- /dev/null
+++ b/test/models/test_models.cc
@@ -0,0 +1,53 @@
+#include "gtest/gtest.h"
+#include "models/chat_completion_request.h"
+
+using inferences::ChatCompletionRequest;
+
+class ModelTest : public ::testing::Test {
+};
+
+
+TEST_F(ModelTest, should_parse_request) {
+  {
+    Json::Value data;
+    auto req = drogon::HttpRequest::newHttpJsonRequest(data);
+
+    auto res =
+        drogon::fromRequest<ChatCompletionRequest>(*req.get());
+
+    EXPECT_EQ(res.stream, false);
+    EXPECT_EQ(res.max_tokens, 500);
+    EXPECT_EQ(res.top_p, 0.95f);
+    EXPECT_EQ(res.temperature, 0.8f);
+    EXPECT_EQ(res.frequency_penalty, 0);
+    EXPECT_EQ(res.presence_penalty, 0);
+    EXPECT_EQ(res.stop, Json::Value{});
+    EXPECT_EQ(res.messages, Json::Value{});
+  }
+
+  {
+    Json::Value data;
+    data["stream"] = true;
+    data["max_tokens"] = 400;
+    data["top_p"] = 0.8;
+    data["temperature"] = 0.7;
+    data["frequency_penalty"] = 0.1;
+    data["presence_penalty"] = 0.2;
+    data["messages"] = "message";
+    data["stop"] = "stop";
+
+    auto req = drogon::HttpRequest::newHttpJsonRequest(data);
+
+    auto res =
+        drogon::fromRequest<ChatCompletionRequest>(*req.get());
+
+    EXPECT_EQ(res.stream, true);
+    EXPECT_EQ(res.max_tokens, 400);
+    EXPECT_EQ(res.top_p, 0.8f);
+    EXPECT_EQ(res.temperature, 0.7f);
+    EXPECT_EQ(res.frequency_penalty, 0.1f);
+    EXPECT_EQ(res.presence_penalty, 0.2f);
+    EXPECT_EQ(res.stop, Json::Value{"stop"});
+    EXPECT_EQ(res.messages, Json::Value{"message"});
+  }
+}

From 0739694d234cd522d0118e87f9a817daa0a288ca Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 8 Apr 2024 13:34:19 +0700
Subject: [PATCH 2/4] bug: trim the leading space

---
 CMakeLists.txt                             |  2 +-
 controllers/llamaCPP.cc                    | 50 +++++++++++++++-------
 nitro_deps/CMakeLists.txt                  |  2 +-
 test/CMakeLists.txt                        |  2 +-
 test/{models => components}/CMakeLists.txt |  4 +-
 test/{models => components}/main.cc        |  2 +-
 test/{models => components}/test_models.cc |  0
 test/components/test_nitro_utils.cc        | 41 ++++++++++++++++++
 utils/nitro_utils.h                        |  8 +++-
 9 files changed, 88 insertions(+), 23 deletions(-)
 rename test/{models => components}/CMakeLists.txt (87%)
 rename test/{models => components}/main.cc (99%)
 rename test/{models => components}/test_models.cc (100%)
 create mode 100644 test/components/test_nitro_utils.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0415d668f..eba4fee0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,4 +103,4 @@ target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC})
 # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC})
 # ##############################################################################
 # uncomment the following line for dynamically loading views set_property(TARGET
-# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON)
\ No newline at end of file
+# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON)
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index bcd2cab9c..2e86e8e72 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -3,8 +3,8 @@
 #include
 #include
 #include "log.h"
-#include "utils/nitro_utils.h"
 #include "utils/logging_utils.h"
+#include "utils/nitro_utils.h"
 
 // External
 #include "common.h"
@@ -208,7 +208,8 @@ void llamaCPP::InferenceImpl(
 
   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id)
+      << "Stop words:" << completion.stop.toStyledString();
 
   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -267,7 +268,8 @@ void llamaCPP::InferenceImpl(
           auto image_url = content_piece["image_url"]["url"].asString();
           std::string base64_image_data;
           if (image_url.find("http") != std::string::npos) {
-            LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
+            LOG_INFO_REQUEST(request_id)
+                << "Remote image detected but not supported yet";
           } else if (image_url.find("data:image") != std::string::npos) {
             LOG_INFO_REQUEST(request_id) << "Base64 image detected";
             base64_image_data = nitro_utils::extractBase64(image_url);
@@ -328,8 +330,10 @@ void llamaCPP::InferenceImpl(
   if (is_streamed) {
     LOG_INFO_REQUEST(request_id) << "Streamed, waiting for respone";
     auto state = create_inference_state(this);
+    bool is_first_token = true;
     auto chunked_content_provider =
-        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [state, data, request_id, &is_first_token](
+            char* pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
         state->inference_status = RUNNING;
       } else if (state->inference_status == FINISHED) {
@@ -337,20 +341,23 @@ void llamaCPP::InferenceImpl(
       }
 
       if (!pBuffer) {
-        LOG_WARN_REQUEST(request_id) "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id)
+        "Connection closed or buffer is null. Reset context";
         state->inference_status = FINISHED;
         return 0;
       }
 
       if (state->inference_status == EOS) {
        LOG_INFO_REQUEST(request_id) << "End of result";
+        is_first_token = true;
        const std::string str =
            "data: " +
            create_return_json(nitro_utils::generate_random_string(20), "_", "",
                               "stop") +
            "\n\n" + "data: [DONE]" + "\n\n";
-        LOG_VERBOSE("data stream", {{"request_id": request_id}, {"to_send", str}});
+        LOG_VERBOSE("data stream",
+                    {{"request_id": request_id}, {"to_send", str}});
        std::size_t nRead = std::min(str.size(), nBuffSize);
        memcpy(pBuffer, str.data(), nRead);
        state->inference_status = FINISHED;
@@ -359,7 +366,13 @@ void llamaCPP::InferenceImpl(
       task_result result = state->instance->llama.next_result(state->task_id);
       if (!result.error) {
-        const std::string to_send = result.result_json["content"];
+        std::string to_send = result.result_json["content"];
+
+        // trim the leading space if it is the first token
+        if (std::exchange(is_first_token, false)) {
+          nitro_utils::ltrim(to_send);
+        }
+
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_",
@@ -410,7 +423,8 @@ void llamaCPP::InferenceImpl(
           retries += 1;
         }
         if (state->inference_status != RUNNING)
-          LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
+          LOG_INFO_REQUEST(request_id)
+              << "Wait for task to be released:" << state->task_id;
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
       }
       LOG_INFO_REQUEST(request_id) << "Task completed, release it";
@@ -428,8 +442,10 @@ void llamaCPP::InferenceImpl(
     if (!result.error && result.stop) {
       int prompt_tokens = result.result_json["tokens_evaluated"];
       int predicted_tokens = result.result_json["tokens_predicted"];
+      std::string to_send = result.result_json["content"];
+      nitro_utils::ltrim(to_send);
       respData = create_full_return_json(nitro_utils::generate_random_string(20),
-                                         "_", result.result_json["content"], "_",
+                                         "_", to_send, "_",
                                          prompt_tokens, predicted_tokens);
     } else {
       respData["message"] = "Internal error during inference";
@@ -463,7 +479,8 @@ void llamaCPP::EmbeddingImpl(
 
   // Queue embedding task
   auto state = create_inference_state(this);
-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback,
+                                          request_id]() {
     Json::Value responseData(Json::arrayValue);
 
     if (jsonBody->isMember("input")) {
@@ -535,7 +552,7 @@ void llamaCPP::ModelStatus(
 
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     callback(resp);
     LOG_INFO << "Model status responded";
-  } 
+  }
 }
@@ -545,10 +562,12 @@ void llamaCPP::LoadModel(
   if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) {
     LOG_ERROR << "AVX2 is not supported by your processor";
     Json::Value jsonResp;
-    jsonResp["message"] = "AVX2 is not supported by your processor, please download and replace the correct Nitro asset version";
+    jsonResp["message"] =
+        "AVX2 is not supported by your processor, please download and replace "
+        "the correct Nitro asset version";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp); 
+    callback(resp);
     return;
   }
@@ -582,10 +601,8 @@ void llamaCPP::LoadModel(
 
 bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   gpt_params params;
-  LOG_INFO << "Start loading model";
   // By default will setting based on number of handlers
   if (jsonBody) {
-    LOG_DEBUG << "Start parsing jsonBody";
     if (!jsonBody->operator[]("mmproj").isNull()) {
       LOG_INFO << "MMPROJ FILE detected, multi-model enabled!";
       params.mmproj = jsonBody->operator[]("mmproj").asString();
@@ -617,7 +634,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   if (model_path.isNull()) {
     LOG_ERROR << "Missing model path in request";
   } else {
-    if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+    if (std::filesystem::exists(
+            std::filesystem::path(model_path.asString()))) {
       params.model = model_path.asString();
     } else {
       LOG_ERROR << "Could not find model in path " << model_path.asString();
diff --git a/nitro_deps/CMakeLists.txt b/nitro_deps/CMakeLists.txt
index 4982816f5..745def57b 100644
--- a/nitro_deps/CMakeLists.txt
+++ b/nitro_deps/CMakeLists.txt
@@ -83,7 +83,7 @@ ExternalProject_Add(
   gtest
   GIT_REPOSITORY https://github.com/google/googletest
-  GIT_TAG v1.14.0 
+  GIT_TAG v1.14.0
   CMAKE_ARGS
   -DCMAKE_BUILD_TYPE=release
   -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index efeea693d..5cc48fbb9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,2 +1,2 @@
 
-add_subdirectory(models)
\ No newline at end of file
+add_subdirectory(components)
diff --git a/test/models/CMakeLists.txt b/test/components/CMakeLists.txt
similarity index 87%
rename from test/models/CMakeLists.txt
rename to test/components/CMakeLists.txt
index 5bb101c3d..e18ae93e3 100644
--- a/test/models/CMakeLists.txt
+++ b/test/components/CMakeLists.txt
@@ -1,5 +1,5 @@
 file(GLOB SRCS *.cc)
-project(test-models)
+project(test-components)
 
 add_executable(${PROJECT_NAME} ${SRCS})
 
@@ -11,4 +11,4 @@ target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon GTest::gtest GTest:
 target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)
 
 add_test(NAME ${PROJECT_NAME}
-         COMMAND ${PROJECT_NAME})
\ No newline at end of file
+         COMMAND ${PROJECT_NAME})
diff --git a/test/models/main.cc b/test/components/main.cc
similarity index 99%
rename from test/models/main.cc
rename to test/components/main.cc
index e2ba5955d..0fe7f3f26 100644
--- a/test/models/main.cc
+++ b/test/components/main.cc
@@ -6,4 +6,4 @@ int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
   int ret = RUN_ALL_TESTS();
   return ret;
-}
\ No newline at end of file
+}
diff --git a/test/models/test_models.cc b/test/components/test_models.cc
similarity index 100%
rename from test/models/test_models.cc
rename to test/components/test_models.cc
diff --git a/test/components/test_nitro_utils.cc b/test/components/test_nitro_utils.cc
new file mode 100644
index 000000000..adf3e976b
--- /dev/null
+++ b/test/components/test_nitro_utils.cc
@@ -0,0 +1,41 @@
+#include "gtest/gtest.h"
+#include "utils/nitro_utils.h"
+
+class NitroUtilTest : public ::testing::Test {
+};
+
+TEST_F(NitroUtilTest, left_trim) {
+  {
+    std::string empty;
+    nitro_utils::ltrim(empty);
+    EXPECT_EQ(empty, "");
+  }
+
+  {
+    std::string s = "abc";
+    std::string expected = "abc";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+
+  {
+    std::string s = " abc";
+    std::string expected = "abc";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+
+  {
+    std::string s = "1 abc 2 ";
+    std::string expected = "1 abc 2 ";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+
+  {
+    std::string s = " |abc";
+    std::string expected = "|abc";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+}
diff --git a/utils/nitro_utils.h b/utils/nitro_utils.h
index 3957af6eb..c1087b345 100644
--- a/utils/nitro_utils.h
+++ b/utils/nitro_utils.h
@@ -165,7 +165,7 @@ inline std::string generate_random_string(std::size_t length) {
 
   std::random_device rd;
   std::mt19937 generator(rd());
-  std::uniform_int_distribution<> distribution(0, characters.size() - 1);
+  std::uniform_int_distribution<> distribution(0, static_cast<int>(characters.size()) - 1);
 
   std::string random_string(length, '\0');
   std::generate_n(random_string.begin(), length,
@@ -276,4 +276,10 @@ inline drogon::HttpResponsePtr nitroStreamResponse(
   return resp;
 }
 
+inline void ltrim(std::string& s) {
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+            return !std::isspace(ch);
+          }));
+};
+
 } // namespace nitro_utils

From 922e4605b98af0f6da155665ef3af0941017ea1d Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 9 Apr 2024 09:51:05 +0700
Subject: [PATCH 3/4] cmake: fix linking issue for windows

---
 nitro_deps/CMakeLists.txt      | 4 +---
 test/components/CMakeLists.txt | 2 ++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nitro_deps/CMakeLists.txt b/nitro_deps/CMakeLists.txt
index 745def57b..29b9b0186 100644
--- a/nitro_deps/CMakeLists.txt
+++ b/nitro_deps/CMakeLists.txt
@@ -85,6 +85,7 @@ ExternalProject_Add(
   GIT_REPOSITORY https://github.com/google/googletest
   GIT_TAG v1.14.0
   CMAKE_ARGS
+  -Dgtest_force_shared_crt=ON
   -DCMAKE_BUILD_TYPE=release
   -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH}
   -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
@@ -92,9 +93,6 @@ ExternalProject_Add(
 
 
 if(WIN32)
-  # Fix dynamic link for gtest
-  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
   # Fix trantor cmakelists to link c-ares on Windows
   set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/nitro_deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt)
   ExternalProject_Add_Step(drogon trantor_custom_target
diff --git a/test/components/CMakeLists.txt b/test/components/CMakeLists.txt
index e18ae93e3..71a44012b 100644
--- a/test/components/CMakeLists.txt
+++ b/test/components/CMakeLists.txt
@@ -1,6 +1,8 @@
 file(GLOB SRCS *.cc)
 project(test-components)
 
+enable_testing()
+
 add_executable(${PROJECT_NAME} ${SRCS})
 
 find_package(Drogon CONFIG REQUIRED)

From 3280f0ec1ea39adf64de099a57ddfad50750ecb7 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 12 Apr 2024 08:30:12 +0700
Subject: [PATCH 4/4] fix: move is_first_token to state

---
 controllers/llamaCPP.cc | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 2e86e8e72..5998ed38f 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -29,6 +29,8 @@ struct inferenceState {
   int task_id;
   InferenceStatus inference_status = PENDING;
   llamaCPP* instance;
+  // Check if we receive the first token, set it to false after receiving
+  bool is_first_token = true;
 
   inferenceState(llamaCPP* inst) : instance(inst) {}
 };
@@ -330,10 +332,10 @@ void llamaCPP::InferenceImpl(
   if (is_streamed) {
     LOG_INFO_REQUEST(request_id) << "Streamed, waiting for respone";
     auto state = create_inference_state(this);
-    bool is_first_token = true;
-    auto chunked_content_provider =
-        [state, data, request_id, &is_first_token](
-            char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+
+    auto chunked_content_provider = [state, data, request_id](
+                                        char* pBuffer,
+                                        std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
         state->inference_status = RUNNING;
       } else if (state->inference_status == FINISHED) {
@@ -349,7 +351,6 @@ void llamaCPP::InferenceImpl(
 
       if (state->inference_status == EOS) {
         LOG_INFO_REQUEST(request_id) << "End of result";
-        is_first_token = true;
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_", "",
@@ -369,7 +370,7 @@ void llamaCPP::InferenceImpl(
         std::string to_send = result.result_json["content"];
 
         // trim the leading space if it is the first token
-        if (std::exchange(is_first_token, false)) {
+        if (std::exchange(state->is_first_token, false)) {
           nitro_utils::ltrim(to_send);
         }
@@ -444,9 +445,9 @@ void llamaCPP::InferenceImpl(
       int predicted_tokens = result.result_json["tokens_predicted"];
       std::string to_send = result.result_json["content"];
       nitro_utils::ltrim(to_send);
-      respData = create_full_return_json(nitro_utils::generate_random_string(20),
-                                         "_", to_send, "_",
-                                         prompt_tokens, predicted_tokens);
+      respData = create_full_return_json(
+          nitro_utils::generate_random_string(20), "_", to_send, "_",
+          prompt_tokens, predicted_tokens);
     } else {
       respData["message"] = "Internal error during inference";
       LOG_ERROR_REQUEST(request_id) << "Error during inference";
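
The standalone sketch below is not part of the patch series; it only illustrates the first-token trimming pattern the series introduces (PATCH 2 adds nitro_utils::ltrim, PATCH 4 moves the is_first_token flag into inferenceState). The ltrim body mirrors the helper added to utils/nitro_utils.h; the surrounding main, the sample chunks, and the local is_first_token flag are invented for illustration only.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Mirrors the helper added to utils/nitro_utils.h: erase leading whitespace in place.
void ltrim(std::string& s) {
  s.erase(s.begin(), std::find_if(s.begin(), s.end(),
                                  [](unsigned char ch) { return !std::isspace(ch); }));
}

int main() {
  // Hypothetical streamed chunks; the first generated token often carries a leading space.
  std::vector<std::string> chunks = {" Hello", " world", "!"};
  bool is_first_token = true;  // in the patch this flag lives in inferenceState

  for (auto& chunk : chunks) {
    // std::exchange returns the previous value and clears the flag, so only the
    // very first chunk is trimmed -- the same pattern used in llamaCPP.cc.
    if (std::exchange(is_first_token, false)) {
      ltrim(chunk);
    }
    std::cout << chunk;
  }
  std::cout << "\n";  // prints: Hello world!
}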