 #include <fstream>
 #include <iostream>
 #include "log.h"
-#include "utils/nitro_utils.h"
 #include "utils/logging_utils.h"
+#include "utils/nitro_utils.h"
 
 // External
 #include "common.h"
@@ -210,7 +210,8 @@ void llamaCPP::InferenceImpl(
 
   // Passing load value
   data["repeat_last_n"] = this->repeat_last_n;
-  LOG_INFO_REQUEST(request_id) << "Stop words:" << completion.stop.toStyledString();
+  LOG_INFO_REQUEST(request_id)
+      << "Stop words:" << completion.stop.toStyledString();
 
   data["stream"] = completion.stream;
   data["n_predict"] = completion.max_tokens;
@@ -269,7 +270,8 @@ void llamaCPP::InferenceImpl(
           auto image_url = content_piece["image_url"]["url"].asString();
           std::string base64_image_data;
           if (image_url.find("http") != std::string::npos) {
-            LOG_INFO_REQUEST(request_id) << "Remote image detected but not supported yet";
+            LOG_INFO_REQUEST(request_id)
+                << "Remote image detected but not supported yet";
           } else if (image_url.find("data:image") != std::string::npos) {
             LOG_INFO_REQUEST(request_id) << "Base64 image detected";
             base64_image_data = nitro_utils::extractBase64(image_url);
@@ -330,29 +332,34 @@ void llamaCPP::InferenceImpl(
   if (is_streamed) {
     LOG_INFO_REQUEST(request_id) << "Streamed, waiting for respone";
     auto state = create_inference_state(this);
+    bool is_first_token = true;
     auto chunked_content_provider =
-        [state, data, request_id](char* pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [state, data, request_id, &is_first_token](
+            char* pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (state->inference_status == PENDING) {
         state->inference_status = RUNNING;
       } else if (state->inference_status == FINISHED) {
         return 0;
       }
 
       if (!pBuffer) {
-        LOG_WARN_REQUEST(request_id) "Connection closed or buffer is null. Reset context";
+        LOG_WARN_REQUEST(request_id)
+        "Connection closed or buffer is null. Reset context";
         state->inference_status = FINISHED;
         return 0;
       }
 
       if (state->inference_status == EOS) {
         LOG_INFO_REQUEST(request_id) << "End of result";
+        is_first_token = true;
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_", "",
                                "stop") +
             "\n\n" + "data: [DONE]" + "\n\n";
 
-        LOG_VERBOSE("data stream", {{"request_id": request_id}, {"to_send", str}});
+        LOG_VERBOSE("data stream",
+                    {{"request_id": request_id}, {"to_send", str}});
         std::size_t nRead = std::min(str.size(), nBuffSize);
         memcpy(pBuffer, str.data(), nRead);
         state->inference_status = FINISHED;
@@ -361,7 +368,13 @@ void llamaCPP::InferenceImpl(
 
       task_result result = state->instance->llama.next_result(state->task_id);
       if (!result.error) {
-        const std::string to_send = result.result_json["content"];
+        std::string to_send = result.result_json["content"];
+
+        // trim the leading space if it is the first token
+        if (std::exchange(is_first_token, false)) {
+          nitro_utils::ltrim(to_send);
+        }
+
         const std::string str =
             "data: " +
             create_return_json(nitro_utils::generate_random_string(20), "_",
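An aside on the std::exchange idiom introduced above: std::exchange(is_first_token, false) returns the flag's previous value while storing false, so the leading-space trim runs only for the very first streamed token. A minimal, self-contained illustration (generic C++, not project code; note that std::exchange lives in <utility>):

    #include <cassert>
    #include <utility>

    int main() {
      bool is_first_token = true;
      // First chunk: std::exchange returns the old value (true) and clears the flag.
      assert(std::exchange(is_first_token, false));
      // Later chunks: the flag is already false, so the trim branch is skipped.
      assert(!std::exchange(is_first_token, false));
      return 0;
    }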
@@ -412,7 +425,8 @@ void llamaCPP::InferenceImpl(
         retries += 1;
       }
       if (state->inference_status != RUNNING)
-        LOG_INFO_REQUEST(request_id) << "Wait for task to be released:" << state->task_id;
+        LOG_INFO_REQUEST(request_id)
+            << "Wait for task to be released:" << state->task_id;
       std::this_thread::sleep_for(std::chrono::milliseconds(100));
     }
     LOG_INFO_REQUEST(request_id) << "Task completed, release it";
@@ -431,10 +445,12 @@ void llamaCPP::InferenceImpl(
     if (!result.error && result.stop) {
       int prompt_tokens = result.result_json["tokens_evaluated"];
       int predicted_tokens = result.result_json["tokens_predicted"];
-      std::string full_return =
-          create_full_return_json(nitro_utils::generate_random_string(20),
-                                  "_", result.result_json["content"], "_",
-                                  prompt_tokens, predicted_tokens);
+      std::string to_send = result.result_json["content"];
+      nitro_utils::ltrim(to_send);
+      std::string full_return = create_full_return_json(
+          nitro_utils::generate_random_string(20), "_", to_send, "_",
+          prompt_tokens, predicted_tokens);
+
       resp->setBody(full_return);
     } else {
       respData["message"] = "Internal error during inference";
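Both the streamed and non-streamed paths now left-trim the response content with nitro_utils::ltrim, whose definition is not part of this diff. The call sites suggest an in-place trim of leading whitespace; a minimal sketch of such a helper (an assumption, the actual nitro_utils implementation may differ):

    #include <cctype>
    #include <string>

    // Hypothetical stand-in for nitro_utils::ltrim: erase leading whitespace in place.
    inline void ltrim(std::string& s) {
      std::size_t i = 0;
      while (i < s.size() && std::isspace(static_cast<unsigned char>(s[i]))) {
        ++i;
      }
      s.erase(0, i);
    }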
@@ -468,7 +484,8 @@ void llamaCPP::EmbeddingImpl(
   // Queue embedding task
   auto state = create_inference_state(this);
 
-  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, request_id]() {
+  state->instance->queue->runTaskInQueue([this, state, jsonBody, callback,
+                                          request_id]() {
     Json::Value responseData(Json::arrayValue);
 
     if (jsonBody->isMember("input")) {
@@ -542,7 +559,7 @@ void llamaCPP::ModelStatus(
   auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
   callback(resp);
   LOG_INFO << "Model status responded";
-  }
+  }
 }
 
 void llamaCPP::LoadModel(
@@ -552,10 +569,12 @@ void llamaCPP::LoadModel(
   if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) {
     LOG_ERROR << "AVX2 is not supported by your processor";
     Json::Value jsonResp;
-    jsonResp["message"] = "AVX2 is not supported by your processor, please download and replace the correct Nitro asset version";
+    jsonResp["message"] =
+        "AVX2 is not supported by your processor, please download and replace "
+        "the correct Nitro asset version";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
+    callback(resp);
     return;
   }
 
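The guard above pairs a runtime CPU check (nitro_utils::isAVX2Supported) with ggml's build-time report (ggml_cpu_has_avx2), so an AVX2 build fails fast with an HTTP 500 instead of dying on an illegal instruction when the host CPU lacks AVX2. The runtime side can be as small as the sketch below, which uses the GCC/Clang builtin; the actual nitro_utils helper may be implemented differently (e.g. via raw CPUID):

    // Hypothetical runtime probe; checks the executing CPU, not the compile flags.
    inline bool isAVX2Supported() {
    #if defined(__x86_64__) || defined(__i386__)
      return __builtin_cpu_supports("avx2");
    #else
      return false;  // no AVX2 to probe on non-x86 targets
    #endif
    }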
@@ -589,10 +608,8 @@ void llamaCPP::LoadModel(
 
 bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   gpt_params params;
-  LOG_INFO << "Start loading model";
   // By default will setting based on number of handlers
   if (jsonBody) {
-    LOG_DEBUG << "Start parsing jsonBody";
     if (!jsonBody->operator[]("mmproj").isNull()) {
       LOG_INFO << "MMPROJ FILE detected, multi-model enabled!";
       params.mmproj = jsonBody->operator[]("mmproj").asString();
@@ -624,7 +641,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
     if (model_path.isNull()) {
       LOG_ERROR << "Missing model path in request";
     } else {
-      if (std::filesystem::exists(std::filesystem::path(model_path.asString()))) {
+      if (std::filesystem::exists(
+              std::filesystem::path(model_path.asString()))) {
         params.model = model_path.asString();
       } else {
         LOG_ERROR << "Could not find model in path " << model_path.asString();