This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 002cbc4

Revert "fix: make embedding work again" (#502)
1 parent f64a90f commit 002cbc4

3 files changed, 45 insertions(+), 73 deletions(-)
context/llama_server_context.h

Lines changed: 40 additions & 68 deletions
@@ -1,15 +1,15 @@
-#include <mutex>
-#include <set>
 #include <string>
 #include <vector>
+#include <set>
+#include <mutex>

 // External
 #include "clip.h"
 #include "common.h"
 #include "llama.h"
-#include "llava.h"
-#include "stb_image.h"
 #include "utils/json.hpp"
+#include "stb_image.h"
+#include "llava.h"

 #if defined(_WIN32)
 #define NOMINMAX
@@ -532,8 +532,7 @@ struct llama_server_context {

   std::tie(model, ctx) = llama_init_from_gpt_params(params);
   if (model == nullptr) {
-    LOG_ERROR_LLAMA("llama.cpp unable to load model",
-                    {{"model", params.model}});
+    LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
     return false;
   }

@@ -586,11 +585,7 @@ struct llama_server_context {
   try {
     batch = llama_batch_init(n_ctx, 0, params.n_parallel);
   } catch (const std::exception& e) {
-    LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata",
-                    {{"exception", e.what()},
-                     {"n_tokens_alloc", n_ctx},
-                     {"embd", 0},
-                     {"n_seq_max", params.n_parallel}});
+    LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata" , {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
   }

   // empty system prompt
@@ -1249,35 +1244,19 @@ struct llama_server_context {
     res.stop = true;

     const int n_embd = llama_n_embd(model);
-
-    std::vector<float> embd_res(n_embd, 0.0f);
-
-    for (int i = 0; i < batch.n_tokens; ++i) {
-      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
-        continue;
-      }
-
-      const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-      if (embd == NULL) {
-        embd = llama_get_embeddings_ith(ctx, i);
-      }
-
-      if (embd == NULL) {
-        LOG_ERROR << "failed to get embeddings "
-                  << "token: " << batch.token[i]
-                  << ", seq_id: " << batch.seq_id[i][0];
-
-        res.result_json = json{
-            {"embedding", std::vector<float>(n_embd, 0.0f)},
-        };
-
-        continue;
-      }
-
-      llama_embd_normalize(embd, embd_res.data(), n_embd);
-
+    if (!params.embedding) {
+      LOG_WARNING_LLAMA("embedding disabled",
+                        {
+                            {"params.embedding", params.embedding},
+                        });
       res.result_json = json{
-          {"embedding", embd_res},
+          {"embedding", std::vector<float>(n_embd, 0.0f)},
+      };
+    } else {
+      const float* data = llama_get_embeddings(ctx);
+      std::vector<float> embedding(data, data + n_embd);
+      res.result_json = json{
+          {"embedding", embedding},
       };
     }
     queue_results.push_back(res);
@@ -1401,7 +1380,7 @@ struct llama_server_context {
     std::vector<llama_token> append_tokens =
         tokenize(json_prompt, false); // has next image
     for (int i = 0; i < (int)append_tokens.size(); ++i) {
-      llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id + 1}, true);
+      llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true);
       slot.n_past += 1;
     }
   }
@@ -1544,29 +1523,28 @@ struct llama_server_context {

   for (llama_client_slot& slot : slots) {
     if (slot.is_processing() &&
-        (int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+        slot.cache_tokens.size() >= (size_t)slot.n_ctx) {
       // Shift context
-      const int n_keep = slot.params.n_keep + add_bos_token;
-      const int n_left = (int)system_tokens.size() + slot.n_past - n_keep;
+      const int n_left = slot.n_past - slot.params.n_keep - 1;
       const int n_discard = n_left / 2;

       LOG_TEE(
           "slot %d: context shift - n_keep = %d, n_left = %d, n_discard "
           "= %d\n",
           slot.id, slot.params.n_keep, n_left, n_discard);
-      llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_discard);
-      llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard,
-                             system_tokens.size() + slot.n_past, -n_discard);
-
-      if (slot.params.cache_prompt) {
-        for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size();
-             i++) {
-          slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-        }
-
-        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+      llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
+                            slot.params.n_keep + n_discard + 1);
+      llama_kv_cache_seq_add(ctx, slot.id,
+                             slot.params.n_keep + 1 + n_discard,
+                             slot.n_past, -n_discard);
+
+      for (size_t i = slot.params.n_keep + 1 + n_discard;
+           i < slot.cache_tokens.size(); i++) {
+        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
       }

+      slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+
       slot.n_past -= n_discard;

       slot.truncated = true;
@@ -1579,9 +1557,6 @@ struct llama_server_context {
     }
   }

-  // start populating the batch for this iteration
-  llama_batch_clear(batch);
-
   // decode any currently ongoing sequences
   for (auto& slot : slots) {
     // release the slot
@@ -1603,15 +1578,14 @@ struct llama_server_context {
     slot.i_batch = batch.n_tokens;

     llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past,
-                    {slot.id + 1}, true);
+                    {slot.id}, true);

     slot.n_decoded += 1;
     slot.n_past += 1;
   }

   // process in chunks of params.n_batch
-  int32_t n_batch = llama_n_batch(ctx);
-  int32_t n_ubatch = llama_n_ubatch(ctx);
+  int32_t n_batch = params.n_batch;

   // assign workload to the slots
   if (params.cont_batching || batch.n_tokens == 0) {
@@ -1667,7 +1641,8 @@ struct llama_server_context {
       } else {
         prompt_tokens = tokenize(
             slot.prompt,
-            system_prompt.empty()); // add BOS if there isn't system prompt
+            system_prompt.empty() &&
+                add_bos_token); // add BOS if there isn't system prompt
       }

       slot.num_prompt_tokens = prompt_tokens.size();
@@ -1763,11 +1738,9 @@ struct llama_server_context {
       std::vector<llama_token> prefix_tokens =
           has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token)
                      : prompt_tokens;
-      for (;
-           slot.n_past < slot.num_prompt_tokens && batch.n_tokens < n_batch;
-           ++slot.n_past) {
+      for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) {
         llama_batch_add(batch, prefix_tokens[slot.n_past],
-                        system_tokens.size() + slot.n_past, {slot.id + 1},
+                        system_tokens.size() + slot.n_past, {slot.id},
                         false);
       }

@@ -1830,8 +1803,7 @@ struct llama_server_context {
     }

     for (auto& slot : slots) {
-      if (slot.state != PROCESSING || slot.i_batch < (int)i ||
-          slot.i_batch >= (int)(i + n_tokens)) {
+      if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) {
        continue;
       }

@@ -1840,7 +1812,7 @@ struct llama_server_context {
       send_embedding(slot);
       slot.release();
       slot.i_batch = -1;
-      continue;
+      return true;
     }

     completion_token_output result;
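
The hunk at old lines 1249-1278 is the heart of this revert: the removed code walked the decoded batch and returned a normalized per-sequence embedding (llama_get_embeddings_seq, falling back to llama_get_embeddings_ith, then llama_embd_normalize), while the restored code copies the raw context-level buffer from llama_get_embeddings, or zeros when params.embedding is off. The sketch below condenses that contrast into one helper; it reuses only the calls visible in the diff, and the function name extract_embedding and its parameters are illustrative, not part of this repository.

// Hypothetical sketch contrasting the two embedding paths touched by this
// revert. Assumes llama.cpp's "llama.h" and the llama_embd_normalize helper
// from "common.h"; it is not code from this project.
#include <vector>

#include "common.h"
#include "llama.h"

std::vector<float> extract_embedding(llama_context* ctx, const llama_model* model,
                                     llama_seq_id seq_id, bool per_sequence) {
  const int n_embd = llama_n_embd(model);

  if (per_sequence) {
    // Behaviour removed by the revert: per-sequence embedding, normalized.
    const float* embd = llama_get_embeddings_seq(ctx, seq_id);
    if (embd == nullptr) {
      return std::vector<float>(n_embd, 0.0f);  // nothing stored for this sequence
    }
    std::vector<float> out(n_embd, 0.0f);
    llama_embd_normalize(embd, out.data(), n_embd);
    return out;
  }

  // Behaviour restored by the revert: raw, unnormalized context-level buffer
  // (requires the context to have been created with params.embedding = true).
  const float* data = llama_get_embeddings(ctx);
  return std::vector<float>(data, data + n_embd);
}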

controllers/llamaCPP.cc

Lines changed: 4 additions & 4 deletions
@@ -156,14 +156,14 @@ llamaCPP::~llamaCPP() {
   StopBackgroundTask();
 }

-void llamaCPP::WarmupModel(bool is_embedding) {
+void llamaCPP::WarmupModel() {
   json pseudo;

   LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
-  const int task_id = llama.request_completion(pseudo, false, is_embedding, -1);
+  const int task_id = llama.request_completion(pseudo, false, false, -1);
   std::string completion_text;
   task_result result = llama.next_result(task_id);
   if (!result.error && result.stop) {
@@ -645,7 +645,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
-  params.embedding = jsonBody->get("embedding", false).asBool();
+  params.embedding = jsonBody->get("embedding", true).asBool();
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
   params.n_batch = jsonBody->get("n_batch", 512).asInt();
   params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
@@ -702,7 +702,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::BackgroundTask, this);
-  WarmupModel(params.embedding);
+  WarmupModel();
   return true;
 }

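
In the controller, the revert flips the default of the "embedding" load-model option back to true and drops the warm-up flag, so embeddings are captured unless a client explicitly opts out. Below is a minimal sketch of a request body that does opt out, assuming the same jsoncpp Json::Value that LoadModelImpl parses; the helper function and its name are illustrative only.

// Illustrative only: builds a load-model body for LoadModelImpl above.
// After this revert, omitting "embedding" is equivalent to sending true.
#include <memory>

#include <json/json.h>  // jsoncpp's Json::Value, as used by the controller

std::shared_ptr<Json::Value> make_load_model_body() {
  auto body = std::make_shared<Json::Value>();
  (*body)["ctx_len"] = 2048;     // same default the controller falls back to
  (*body)["ngl"] = 100;          // GPU layers, default taken from the diff
  (*body)["embedding"] = false;  // explicit opt-out; the default is now true
  return body;
}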

controllers/llamaCPP.h

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
   void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
                      std::function<void(const HttpResponsePtr&)>& callback);
   bool CheckModelLoaded(std::function<void(const HttpResponsePtr&)>& callback);
-  void WarmupModel(bool is_embedding);
+  void WarmupModel();
   void BackgroundTask();
   void StopBackgroundTask();
 };
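
The header change mirrors the controller edit above: WarmupModel no longer takes an embedding flag, so warm-up is always issued as a plain two-token completion. A sketch of the payload it builds follows, assuming the nlohmann json header already pulled in via utils/json.hpp; the free function is illustrative, since in the project this logic lives inside llamaCPP::WarmupModel.

// Illustrative sketch of the warm-up payload after this revert: always a
// plain completion, never an embedding request.
#include "utils/json.hpp"  // nlohmann::json, as included by llama_server_context.h

using json = nlohmann::json;

json make_warmup_request() {
  json pseudo;
  pseudo["prompt"] = "Hello";
  pseudo["n_predict"] = 2;
  pseudo["stream"] = false;
  // Handed to llama.request_completion(pseudo, false, false, -1); the third
  // argument is the embedding flag this revert hard-codes to false.
  return pseudo;
}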
