This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit b47f6e9

Revert "fix: use params.embedding to decide warmup or not"
This reverts commit 8338661.
1 parent: 8338661 · commit: b47f6e9

3 files changed (+84, -77 lines)


context/llama_server_context.h

Lines changed: 43 additions & 34 deletions
@@ -842,7 +842,7 @@ struct llama_server_context {
       }
       if (!found) {
         LOG_DEBUG << "ERROR: Image with id: " << img_id
-            << ", not found.\n";
+                  << ", not found.\n";
         slot->images.clear();
         return false;
       }
@@ -871,7 +871,7 @@ struct llama_server_context {
     all_slots_are_idle = false;

     LOG_DEBUG << "slot " << slot->id
-        << " is processing [task id: " << slot->task_id << "]";
+              << " is processing [task id: " << slot->task_id << "]";

     return true;
   }
@@ -1255,36 +1255,45 @@ struct llama_server_context {
     res.stop = true;

     const int n_embd = llama_n_embd(model);
+    if (!params.embedding) {
+      LOG_WARNING_LLAMA("embedding disabled",
+                        {
+                            {"params.embedding", params.embedding},
+                        });
+      res.result_json = json{
+          {"embedding", std::vector<float>(n_embd, 0.0f)},
+      };
+    } else {
+      std::vector<float> embd_res(n_embd, 0.0f);

-    std::vector<float> embd_res(n_embd, 0.0f);
+      for (int i = 0; i < batch.n_tokens; ++i) {
+        if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+          continue;
+        }

-    for (int i = 0; i < batch.n_tokens; ++i) {
-      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
-        continue;
-      }
+        const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+        if (embd == NULL) {
+          embd = llama_get_embeddings_ith(ctx, i);
+        }

-      const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-      if (embd == NULL) {
-        embd = llama_get_embeddings_ith(ctx, i);
-      }
+        if (embd == NULL) {
+          LOG_ERROR << "failed to get embeddings"
+                    << " token " << batch.token[i] << ", seq_id "
+                    << batch.seq_id[i][0];

-      if (embd == NULL) {
-        LOG_ERROR << "failed to get embeddings"
-                  << " token " << batch.token[i] << ", seq_id "
-                  << batch.seq_id[i][0];
+          res.result_json = json{
+              {"embedding", std::vector<float>(n_embd, 0.0f)},
+          };

-        res.result_json = json{
-            {"embedding", std::vector<float>(n_embd, 0.0f)},
-        };
+          continue;
+        }

-        continue;
+        llama_embd_normalize(embd, embd_res.data(), n_embd);
       }
-
-      llama_embd_normalize(embd, embd_res.data(), n_embd);
+      res.result_json = json{
+          {"embedding", embd_res},
+      };
     }
-    res.result_json = json{
-        {"embedding", embd_res},
-    };

     queue_results.push_back(res);
     condition_results.notify_all();
@@ -1556,8 +1565,8 @@ struct llama_server_context {
         const int n_discard = n_left / 2;

         LOG_DEBUG << "slot " << slot.id
-            << " context shift - n_keep = " << slot.params.n_keep
-            << ", n_left = " << n_left << ", n_discard: " << n_discard;
+                  << " context shift - n_keep = " << slot.params.n_keep
+                  << ", n_left = " << n_left << ", n_discard: " << n_discard;
         llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
                               slot.params.n_keep + n_discard + 1);
         llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard,
@@ -1591,7 +1600,7 @@ struct llama_server_context {
         slot.t_last_used = ggml_time_us();

         LOG_DEBUG << "slot " << slot.id << " released ("
-            << (int)slot.cache_tokens.size() << " tokens in cache)";
+                  << (int)slot.cache_tokens.size() << " tokens in cache)";

         continue;
       }
@@ -1725,12 +1734,12 @@ struct llama_server_context {
               slot.num_prompt_tokens - slot.n_past;

           LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
-              << " tokens | to process: "
-              << slot.num_prompt_tokens_processed << " tokens";
+                    << " tokens | to process: "
+                    << slot.num_prompt_tokens_processed << " tokens";
         }

         LOG_DEBUG << "slot " << slot.id << " : kv cache rm - ["
-            << (int)system_tokens.size() + slot.n_past << ", end)";
+                  << (int)system_tokens.size() + slot.n_past << ", end)";

         llama_kv_cache_seq_rm(ctx, slot.id,
                               system_tokens.size() + slot.n_past, -1);
@@ -1740,8 +1749,8 @@ struct llama_server_context {
         if (slot.n_past == slot.num_prompt_tokens) {
           // we have to evaluate at least 1 token to generate logits.
           LOG_DEBUG << "slot " << slot.id
-              << " : we have to evaluate at least 1 token to "
-                 "generate logits";
+                    << " : we have to evaluate at least 1 token to "
+                       "generate logits";
           slot.n_past--;
         }

@@ -1811,8 +1820,8 @@ struct llama_server_context {
       // if you get here, it means the KV cache is full - try increasing it
      // via the context size
       LOG_DEBUG << __func__
-          << " : failed to decode the batch, n_batch = " << n_batch
-          << ", ret = " << ret;
+                << " : failed to decode the batch, n_batch = " << n_batch
+                << ", ret = " << ret;
       return false;
     }

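The @@ -1255,36 +1255,45 @@ hunk above restores the guard around the embedding result: when params.embedding is disabled the slot answers with an all-zero vector of size n_embd, otherwise the per-sequence embeddings are fetched and normalized before being placed into result_json. Below is a minimal, self-contained C++ sketch of that decision, not the server code itself; the l2_normalize helper is a stand-in for llama.cpp's llama_embd_normalize, which to the best of my knowledge performs L2 normalization by default.

// Sketch only: mirrors the restored control flow, not llama_server_context itself.
#include <cmath>
#include <cstdio>
#include <vector>

// Stand-in for llama_embd_normalize(): plain L2 normalization.
static void l2_normalize(const float* in, float* out, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i) sum += static_cast<double>(in[i]) * in[i];
  const double norm = std::sqrt(sum);
  for (int i = 0; i < n; ++i)
    out[i] = norm > 0.0 ? static_cast<float>(in[i] / norm) : 0.0f;
}

// Embedding payload for one slot: zeros when embeddings are disabled or
// unavailable (the !params.embedding branch), a normalized vector otherwise.
static std::vector<float> embedding_result(bool embedding_enabled,
                                           const float* raw, int n_embd) {
  std::vector<float> res(n_embd, 0.0f);  // default: zero vector
  if (!embedding_enabled || raw == nullptr) return res;
  l2_normalize(raw, res.data(), n_embd);
  return res;
}

int main() {
  const float raw[4] = {3.0f, 0.0f, 4.0f, 0.0f};
  const auto on = embedding_result(true, raw, 4);
  const auto off = embedding_result(false, raw, 4);
  std::printf("enabled:  %.2f %.2f %.2f %.2f\n", on[0], on[1], on[2], on[3]);
  std::printf("disabled: %.2f %.2f %.2f %.2f\n", off[0], off[1], off[2], off[3]);
  return 0;
}

Running it prints 0.60 0.00 0.80 0.00 for the enabled case and all zeros for the disabled case.
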
controllers/llamaCPP.cc

Lines changed: 40 additions & 42 deletions
@@ -156,15 +156,15 @@ llamaCPP::~llamaCPP() {
   StopBackgroundTask();
 }

-void llamaCPP::WarmupModel() {
+void llamaCPP::WarmupModel(bool is_embedded_model) {
   json pseudo;

   LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
   const int task_id =
-      llama.request_completion(pseudo, false, false, -1);
+      llama.request_completion(pseudo, false, is_embedded_model, -1);
   std::string completion_text;
   task_result result = llama.next_result(task_id);
   if (!result.error && result.stop) {
@@ -405,37 +405,36 @@ void llamaCPP::InferenceImpl(
       return 0;
     };
     // Queued task
-    state->instance->queue->runTaskInQueue([cb = std::move(callback), state,
-                                            data, chunked_content_provider,
-                                            request_id]() {
-      state->task_id =
-          state->instance->llama.request_completion(data, false, false, -1);
-
-      // Start streaming response
-      auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
-                                                   "chat_completions.txt");
-      cb(resp);
-
-      int retries = 0;
-
-      // Since this is an async task, we will wait for the task to be
-      // completed
-      while (state->inference_status != FINISHED && retries < 10) {
-        // Should wait chunked_content_provider lambda to be called within
-        // 3s
-        if (state->inference_status == PENDING) {
-          retries += 1;
-        }
-        if (state->inference_status != RUNNING)
-          LOG_INFO_REQUEST(request_id)
-              << "Wait for task to be released:" << state->task_id;
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      }
-      LOG_INFO_REQUEST(request_id) << "Task completed, release it";
-      // Request completed, release it
-      state->instance->llama.request_cancel(state->task_id);
-      LOG_INFO_REQUEST(request_id) << "Inference completed";
-    });
+    state->instance->queue->runTaskInQueue(
+        [cb = std::move(callback), state, data, chunked_content_provider, request_id]() {
+          state->task_id =
+              state->instance->llama.request_completion(data, false, false, -1);
+
+          // Start streaming response
+          auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
+                                                       "chat_completions.txt");
+          cb(resp);
+
+          int retries = 0;
+
+          // Since this is an async task, we will wait for the task to be
+          // completed
+          while (state->inference_status != FINISHED && retries < 10) {
+            // Should wait chunked_content_provider lambda to be called within
+            // 3s
+            if (state->inference_status == PENDING) {
+              retries += 1;
+            }
+            if (state->inference_status != RUNNING)
+              LOG_INFO_REQUEST(request_id)
+                  << "Wait for task to be released:" << state->task_id;
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+          }
+          LOG_INFO_REQUEST(request_id) << "Task completed, release it";
+          // Request completed, release it
+          state->instance->llama.request_cancel(state->task_id);
+          LOG_INFO_REQUEST(request_id) << "Inference completed";
+        });
   } else {
     queue->runTaskInQueue(
         [this, request_id, cb = std::move(callback), d = std::move(data)]() {
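
The queued task in the hunk above starts the streaming response and then waits for completion by polling the inference status every 100 ms, counting a retry only while the status is still PENDING and giving up after 10 such checks; the task is then released with request_cancel. Below is a stand-alone sketch of that poll-with-retries pattern; Status and wait_until_finished are illustrative names, not the controller's own types.

// Sketch of the poll-with-retries wait used by the queued inference task.
// Status and wait_until_finished are illustrative, not the controller's API.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

enum class Status { PENDING, RUNNING, FINISHED };

// Poll `status` every 100 ms; count a retry only while it is still PENDING,
// and stop once it is FINISHED or the retry budget is spent.
static void wait_until_finished(const std::atomic<Status>& status,
                                int max_retries = 10) {
  int retries = 0;
  while (status.load() != Status::FINISHED && retries < max_retries) {
    if (status.load() == Status::PENDING) retries += 1;
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
}

int main() {
  std::atomic<Status> status{Status::PENDING};
  std::thread worker([&status] {
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    status = Status::RUNNING;
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    status = Status::FINISHED;
  });
  wait_until_finished(status);
  std::puts(status.load() == Status::FINISHED ? "finished" : "gave up");
  worker.join();
  return 0;
}
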
@@ -608,7 +607,10 @@ void llamaCPP::LoadModel(

 bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   gpt_params params;
-
+  // For model like nomic-embed-text-v1.5.f16.gguf, etc, we need to warmup model with flag embedding = true.
+  // So we use this variable to differentiate with other models
+  // TODO: in case embedded model only, we should reject completion request from user?
+  bool is_embedded_model = false;
   // By default will setting based on number of handlers
   if (jsonBody) {
     if (!jsonBody->operator[]("mmproj").isNull()) {
@@ -652,7 +654,9 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

     params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
     params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
-    params.embedding = jsonBody->get("embedding", false).asBool();
+    is_embedded_model =
+        !(*jsonBody)["embedding"].isNull() && (*jsonBody)["embedding"].asBool();
+    params.embedding = jsonBody->get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
     params.n_batch = jsonBody->get("n_batch", 512).asInt();
     params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
@@ -709,13 +713,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::BackgroundTask, this);
-
-  // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model.
-  // So we use this variable to differentiate with other models
-  // TODO: in case embedded model only, we should reject completion request from user?
-  if (!params.embedding) {
-    WarmupModel();
-  }
+  WarmupModel(is_embedded_model);
   return true;
 }

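The controllers/llamaCPP.cc hunks above restore the flow in which the "embedding" field of the load-model request drives warm-up: is_embedded_model is true only when the field is present and true, params.embedding now defaults to true, and WarmupModel(is_embedded_model) is always called, so embedding models are warmed up with the embedding flag set on the pseudo request. A small sketch of that derivation using jsoncpp's Json::Value follows; warm_up is a hypothetical stand-in for llamaCPP::WarmupModel.

// Sketch: how the "embedding" field of the load-model body maps to the
// warm-up call. warm_up() is a stand-in, not llamaCPP::WarmupModel itself.
#include <cstdio>
#include <json/json.h>  // jsoncpp, the library behind Json::Value here

static void warm_up(bool is_embedded_model) {
  // The real warm-up sends a pseudo completion ("Hello", n_predict = 2)
  // and forwards this flag as the embedding argument of request_completion.
  std::printf("warm-up with embedding flag = %s\n",
              is_embedded_model ? "true" : "false");
}

int main() {
  Json::Value jsonBody;
  jsonBody["embedding"] = true;  // e.g. a nomic-embed-text style model

  // Mirrors the restored LoadModelImpl logic:
  const bool is_embedded_model =
      !jsonBody["embedding"].isNull() && jsonBody["embedding"].asBool();
  const bool embedding_param = jsonBody.get("embedding", true).asBool();

  std::printf("params.embedding = %s\n", embedding_param ? "true" : "false");
  warm_up(is_embedded_model);
  return 0;
}
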
controllers/llamaCPP.h

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
   void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
                      std::function<void(const HttpResponsePtr&)>&& callback);
   bool CheckModelLoaded(const std::function<void(const HttpResponsePtr&)>& callback);
-  void WarmupModel();
+  void WarmupModel(bool is_embedded_model);
   void BackgroundTask();
   void StopBackgroundTask();
 };
