12 changes: 6 additions & 6 deletions controllers/llamaCPP.cc
@@ -22,8 +22,9 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP *instance) {
 // --------------------------------------------
 
 // Function to check if the model is loaded
-void check_model_loaded(llama_server_context &llama, const HttpRequestPtr &req,
-                        std::function<void(const HttpResponsePtr &)> &callback) {
+void check_model_loaded(
+    llama_server_context &llama, const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &callback) {
   if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
@@ -299,13 +300,9 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  int task_id;
-
-  LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = create_inference_state(this);
-    state->task_id = task_id;
     auto chunked_content_provider =
         [state, data](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
           if (!state->is_streaming) {
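Note on the streaming branch above: the chunked_content_provider lambda follows Drogon's stream-callback shape (fill pBuffer, return the number of bytes written, return 0 to finish), with per-request state carried in an inferenceState. A minimal sketch of that contract, assuming an inferenceState that has only the task_id and is_streaming members visible in this diff; the placeholder chunk and helper name are illustrative, not from the PR:

// Sketch only -- illustrates the provider contract used by the streaming
// branch. Fields beyond task_id/is_streaming and the placeholder chunk are
// assumptions, not taken from this PR.
#include <algorithm>
#include <cstring>
#include <memory>
#include <string>

struct inferenceState {
  int task_id = -1;           // filled from llama.request_completion(...)
  bool is_streaming = false;  // flipped off when the stream is done
};

auto make_chunked_content_provider(std::shared_ptr<inferenceState> state) {
  // Drogon calls this repeatedly; returning 0 signals the end of the body.
  return [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
    if (!state->is_streaming) {
      return 0;  // nothing (more) to stream
    }
    const std::string chunk = "data: {}\n\n";  // placeholder SSE-style chunk
    std::size_t n = std::min(chunk.size(), nBuffSize);
    std::memcpy(pBuffer, chunk.data(), n);
    state->is_streaming = false;  // sketch: emit one chunk, then stop
    return n;
  };
}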
@@ -386,9 +383,12 @@ void llamaCPP::chatCompletion(
   } else {
     Json::Value respData;
     auto resp = nitro_utils::nitroHttpResponse();
+    int task_id = llama.request_completion(data, false, false, -1);
+    LOG_INFO << "sent the non stream, waiting for respone";
     if (!json_value(data, "stream", false)) {
       std::string completion_text;
       task_result result = llama.next_result(task_id);
+      LOG_INFO << "Here is the result:" << result.error;
       if (!result.error && result.stop) {
         int prompt_tokens = result.result_json["tokens_evaluated"];
         int predicted_tokens = result.result_json["tokens_predicted"];
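For the non-streaming branch, the change boils down to a blocking submit-then-wait pattern on the llama_server_context: request_completion enqueues the prompt and returns a task id, next_result blocks until that task finishes, and the token counts are read from result_json. A rough sketch of that flow, reusing only the calls and keys visible in this diff; the helper name, the json alias, and the error message are illustrative:

// Sketch of the blocking completion flow from the hunk above. The
// request_completion/next_result calls and the result_json keys are taken
// from the diff; the surrounding helper is hypothetical.
static Json::Value run_blocking_completion(llama_server_context &llama,
                                           const json &data) {
  // Enqueue the request; arguments mirror the call added in this PR.
  int task_id = llama.request_completion(data, false, false, -1);

  // Block until the final result for this task is available.
  task_result result = llama.next_result(task_id);

  Json::Value respData;
  if (!result.error && result.stop) {
    int prompt_tokens = result.result_json["tokens_evaluated"];
    int predicted_tokens = result.result_json["tokens_predicted"];
    respData["usage"]["prompt_tokens"] = prompt_tokens;
    respData["usage"]["completion_tokens"] = predicted_tokens;
    respData["usage"]["total_tokens"] = prompt_tokens + predicted_tokens;
  } else {
    respData["message"] = "Failed to produce a completion";
  }
  return respData;
}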