
Commit f64a90f

fix: responses from /chat/completions endpoint contain a leading space in the content (#488)
Co-authored-by: vansangpfiev <sang@jan.ai>
1 parent 4419a4d commit f64a90f

Showing 10 changed files with 183 additions and 21 deletions.

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -58,6 +58,7 @@ add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}")
 add_subdirectory(llama.cpp/examples/llava)
 add_subdirectory(llama.cpp)
 add_subdirectory(whisper.cpp)
+add_subdirectory(test)
 
 add_executable(${PROJECT_NAME} main.cc)
 
controllers/llamaCPP.cc

Lines changed: 38 additions & 17 deletions

@@ -3,8 +3,8 @@
 #include <fstream>
 #include <iostream>
 #include "log.h"
-#include "utils/nitro_utils.h"
 #include "utils/logging_utils.h"
+#include "utils/nitro_utils.h"
 
 // External
 #include "common.h"
@@ -29,6 +29,8 @@ struct inferenceState {
   int task_id;
   InferenceStatus inference_status = PENDING;
   llamaCPP* instance;
+  // Check if we receive the first token, set it to false after receiving
+  bool is_first_token = true;
 
   inferenceState(llamaCPP* inst) : instance(inst) {}
 };
@@ -208,7 +210,8 @@ void llamaCPP::InferenceImpl(
 
   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id)
+      << "Stop words:" << completion.stop.toStyledString();
 
   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -267,7 +270,8 @@ void llamaCPP::InferenceImpl(
         auto image_url = content_piece["image_url"]["url"].asString();
         std::string base64_image_data;
         if (image_url.find("http") != std::string::npos) {
-          LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
+          LOG_INFO_REQUEST(request_id)
+              << "Remote image detected but not supported yet";
         } else if (image_url.find("data:image") != std::string::npos) {
           LOG_INFO_REQUEST(request_id) << "Base64 image detected";
           base64_image_data = nitro_utils::extractBase64(image_url);
@@ -328,16 +332,19 @@ void llamaCPP::InferenceImpl(
   if (is_streamed) {
     LOG_INFO_REQUEST(request_id) << "Streamed, waiting for respone";
     auto state = create_inference_state(this);
-    auto chunked_content_provider =
-        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+
+    auto chunked_content_provider = [state, data, request_id](
+                                        char* pBuffer,
+                                        std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
        state->inference_status = RUNNING;
      } else if (state->inference_status == FINISHED) {
        return 0;
      }
 
      if (!pBuffer) {
-        LOG_WARN_REQUEST(request_id) "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id)
+        "Connection closed or buffer is null. Reset context";
        state->inference_status = FINISHED;
        return 0;
      }
@@ -350,7 +357,8 @@ void llamaCPP::InferenceImpl(
                "stop") +
            "\n\n" + "data: [DONE]" + "\n\n";
 
-        LOG_VERBOSE("data stream", {{"request_id": request_id}, {"to_send", str}});
+        LOG_VERBOSE("data stream",
+                    {{"request_id": request_id}, {"to_send", str}});
        std::size_t nRead = std::min(str.size(), nBuffSize);
        memcpy(pBuffer, str.data(), nRead);
        state->inference_status = FINISHED;
@@ -359,7 +367,13 @@ void llamaCPP::InferenceImpl(
 
      task_result result = state->instance->llama.next_result(state->task_id);
      if (!result.error) {
-        const std::string to_send = result.result_json["content"];
+        std::string to_send = result.result_json["content"];
+
+        // trim the leading space if it is the first token
+        if (std::exchange(state->is_first_token, false)) {
+          nitro_utils::ltrim(to_send);
+        }
+
        const std::string str =
            "data: " +
            create_return_json(nitro_utils::generate_random_string(20), "_",
@@ -410,7 +424,8 @@ void llamaCPP::InferenceImpl(
          retries += 1;
        }
        if (state->inference_status != RUNNING)
-          LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
+          LOG_INFO_REQUEST(request_id)
+              << "Wait for task to be released:" << state->task_id;
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
      }
      LOG_INFO_REQUEST(request_id) << "Task completed, release it";
@@ -428,9 +443,11 @@ void llamaCPP::InferenceImpl(
    if (!result.error && result.stop) {
      int prompt_tokens = result.result_json["tokens_evaluated"];
      int predicted_tokens = result.result_json["tokens_predicted"];
-      respData = create_full_return_json(nitro_utils::generate_random_string(20),
-                                         "_", result.result_json["content"], "_",
-                                         prompt_tokens, predicted_tokens);
+      std::string to_send = result.result_json["content"];
+      nitro_utils::ltrim(to_send);
+      respData = create_full_return_json(
+          nitro_utils::generate_random_string(20), "_", to_send, "_",
+          prompt_tokens, predicted_tokens);
    } else {
      respData["message"] = "Internal error during inference";
      LOG_ERROR_REQUEST(request_id) << "Error during inference";
@@ -463,7 +480,8 @@ void llamaCPP::EmbeddingImpl(
  // Queue embedding task
  auto state = create_inference_state(this);
 
-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback,
+                                          request_id]() {
    Json::Value responseData(Json::arrayValue);
 
    if (jsonBody->isMember("input")) {
@@ -535,7 +553,7 @@ void llamaCPP::ModelStatus(
  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
  callback(resp);
  LOG_INFO << "Model status responded";
-  }
+  }
 }
 
 void llamaCPP::LoadModel(
@@ -545,10 +563,12 @@ void llamaCPP::LoadModel(
  if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) {
    LOG_ERROR << "AVX2 is not supported by your processor";
    Json::Value jsonResp;
-    jsonResp["message"] = "AVX2 is not supported by your processor, please download and replace the correct Nitro asset version";
+    jsonResp["message"] =
+        "AVX2 is not supported by your processor, please download and replace "
+        "the correct Nitro asset version";
    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
    resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
+    callback(resp);
    return;
  }
 
@@ -615,7 +635,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
  if (model_path.isNull()) {
    LOG_ERROR << "Missing model path in request";
  } else {
-    if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+    if (std::filesystem::exists(
+            std::filesystem::path(model_path.asString()))) {
      params.model = model_path.asString();
    } else {
      LOG_ERROR << "Could not find model in path " << model_path.asString();

models/chat_completion_request.h

Lines changed: 2 additions & 2 deletions

@@ -5,8 +5,8 @@ namespace inferences {
 struct ChatCompletionRequest {
   bool stream = false;
   int max_tokens = 500;
-  float top_p = 0.95;
-  float temperature = 0.8;
+  float top_p = 0.95f;
+  float temperature = 0.8f;
   float frequency_penalty = 0;
   float presence_penalty = 0;
   Json::Value stop = Json::Value(Json::arrayValue);
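
A small aside on the f suffixes: 0.95 and 0.8 are double literals, so initializing a float member from them goes through an implicit double-to-float narrowing (which compilers may flag under -Wconversion/-Wfloat-conversion), whereas 0.95f and 0.8f are single-precision literals that need no conversion. An illustrative two-liner, not taken from the commit:

  float top_p_old = 0.95;   // double literal, implicitly narrowed to float
  float top_p_new = 0.95f;  // float literal, no conversion needed

It also keeps the defaults textually in line with the EXPECT_EQ(res.top_p, 0.95f) checks in the new test/components/test_models.cc below.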

nitro_deps/CMakeLists.txt

Lines changed: 14 additions & 1 deletion

@@ -79,8 +79,21 @@ ExternalProject_Add(
   -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
 )
 
-# Fix trantor cmakelists to link c-ares on Windows
+# Download and install GoogleTest
+ExternalProject_Add(
+  gtest
+  GIT_REPOSITORY https://github.com/google/googletest
+  GIT_TAG v1.14.0
+  CMAKE_ARGS
+    -Dgtest_force_shared_crt=ON
+    -DCMAKE_BUILD_TYPE=release
+    -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH}
+    -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH}
+)
+
+
 if(WIN32)
+  # Fix trantor cmakelists to link c-ares on Windows
   set(TRANTOR_CMAKE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/nitro_deps/drogon-prefix/src/drogon/trantor/CMakeLists.txt)
   ExternalProject_Add_Step(drogon trantor_custom_target
     COMMAND ${CMAKE_COMMAND} -E echo add_definitions(-DCARES_STATICLIB) >> ${TRANTOR_CMAKE_FILE}

test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+
+add_subdirectory(components)

test/components/CMakeLists.txt

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+file(GLOB SRCS *.cc)
+project(test-components)
+
+enable_testing()
+
+add_executable(${PROJECT_NAME} ${SRCS})
+
+find_package(Drogon CONFIG REQUIRED)
+find_package(GTest CONFIG REQUIRED)
+
+target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon GTest::gtest GTest::gmock
+                      ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)
+
+add_test(NAME ${PROJECT_NAME}
+         COMMAND ${PROJECT_NAME})

test/components/main.cc

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+#include "gtest/gtest.h"
+#include <drogon/HttpAppFramework.h>
+#include <drogon/drogon.h>
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}

test/components/test_models.cc

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+#include "gtest/gtest.h"
+#include "models/chat_completion_request.h"
+
+using inferences::ChatCompletionRequest;
+
+class ModelTest : public ::testing::Test {
+};
+
+
+TEST_F(ModelTest, should_parse_request) {
+  {
+    Json::Value data;
+    auto req = drogon::HttpRequest::newHttpJsonRequest(data);
+
+    auto res =
+        drogon::fromRequest<inferences::ChatCompletionRequest>(*req.get());
+
+    EXPECT_EQ(res.stream, false);
+    EXPECT_EQ(res.max_tokens, 500);
+    EXPECT_EQ(res.top_p, 0.95f);
+    EXPECT_EQ(res.temperature, 0.8f);
+    EXPECT_EQ(res.frequency_penalty, 0);
+    EXPECT_EQ(res.presence_penalty, 0);
+    EXPECT_EQ(res.stop, Json::Value{});
+    EXPECT_EQ(res.messages, Json::Value{});
+  }
+
+  {
+    Json::Value data;
+    data["stream"] = true;
+    data["max_tokens"] = 400;
+    data["top_p"] = 0.8;
+    data["temperature"] = 0.7;
+    data["frequency_penalty"] = 0.1;
+    data["presence_penalty"] = 0.2;
+    data["messages"] = "message";
+    data["stop"] = "stop";
+
+    auto req = drogon::HttpRequest::newHttpJsonRequest(data);
+
+    auto res =
+        drogon::fromRequest<inferences::ChatCompletionRequest>(*req.get());
+
+    EXPECT_EQ(res.stream, true);
+    EXPECT_EQ(res.max_tokens, 400);
+    EXPECT_EQ(res.top_p, 0.8f);
+    EXPECT_EQ(res.temperature, 0.7f);
+    EXPECT_EQ(res.frequency_penalty, 0.1f);
+    EXPECT_EQ(res.presence_penalty, 0.2f);
+    EXPECT_EQ(res.stop, Json::Value{"stop"});
+    EXPECT_EQ(res.messages, Json::Value{"message"});
+  }
+}

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+#include "gtest/gtest.h"
+#include "utils/nitro_utils.h"
+
+class NitroUtilTest : public ::testing::Test {
+};
+
+TEST_F(NitroUtilTest, left_trim) {
+  {
+    std::string empty;
+    nitro_utils::ltrim(empty);
+    EXPECT_EQ(empty, "");
+  }
+
+  {
+    std::string s = "abc";
+    std::string expected = "abc";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+
+  {
+    std::string s = " abc";
+    std::string expected = "abc";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+
+  {
+    std::string s = "1 abc 2 ";
+    std::string expected = "1 abc 2 ";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+
+  {
+    std::string s = " |abc";
+    std::string expected = "|abc";
+    nitro_utils::ltrim(s);
+    EXPECT_EQ(s, expected);
+  }
+}

utils/nitro_utils.h

Lines changed: 7 additions & 1 deletion

@@ -165,7 +165,7 @@ inline std::string generate_random_string(std::size_t length) {
   std::random_device rd;
   std::mt19937 generator(rd());
 
-  std::uniform_int_distribution<> distribution(0, characters.size() - 1);
+  std::uniform_int_distribution<> distribution(0, static_cast<int>(characters.size()) - 1);
 
   std::string random_string(length, '\0');
   std::generate_n(random_string.begin(), length,
@@ -276,4 +276,10 @@ inline drogon::HttpResponsePtr nitroStreamResponse(
   return resp;
 }
 
+inline void ltrim(std::string& s) {
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+            return !std::isspace(ch);
+          }));
+};
+
 } // namespace nitro_utils
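
Worth noting about the new helper: ltrim mutates its argument in place and strips every leading character for which std::isspace returns true (spaces, tabs, newlines), not just a single space. A quick illustrative call with a made-up string:

  std::string s = " \t\nanswer";
  nitro_utils::ltrim(s);  // s == "answer": all leading whitespace removed

That is broader than the one leading space the /chat/completions fix targets, and it is the behaviour the left_trim unit test above pins down for the empty, already-trimmed, and leading-space cases.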
