@@ -1,15 +1,15 @@
-#include <mutex>
-#include <set>
 #include <string>
 #include <vector>
+#include <set>
+#include <mutex>
 
 // External
 #include "clip.h"
 #include "common.h"
 #include "llama.h"
-#include "llava.h"
-#include "stb_image.h"
 #include "utils/json.hpp"
+#include "stb_image.h"
+#include "llava.h"
 
 #if defined(_WIN32)
 #define NOMINMAX
@@ -532,8 +532,7 @@ struct llama_server_context {
 
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr) {
-      LOG_ERROR_LLAMA("llama.cpp unable to load model",
-                      {{"model", params.model}});
+      LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
      return false;
    }
 
@@ -586,11 +585,7 @@ struct llama_server_context {
    try {
      batch = llama_batch_init(n_ctx, 0, params.n_parallel);
    } catch (const std::exception& e) {
-      LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata",
-                      {{"exception", e.what()},
-                       {"n_tokens_alloc", n_ctx},
-                       {"embd", 0},
-                       {"n_seq_max", params.n_parallel}});
+      LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
    }
 
    // empty system prompt
@@ -1249,35 +1244,19 @@ struct llama_server_context {
    res.stop = true;
 
    const int n_embd = llama_n_embd(model);
-
-    std::vector<float> embd_res(n_embd, 0.0f);
-
-    for (int i = 0; i < batch.n_tokens; ++i) {
-      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
-        continue;
-      }
-
-      const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-      if (embd == NULL) {
-        embd = llama_get_embeddings_ith(ctx, i);
-      }
-
-      if (embd == NULL) {
-        LOG_ERROR << "failed to get embeddings"
-                  << " token: " << batch.token[i]
-                  << ", seq_id: " << batch.seq_id[i][0];
-
-        res.result_json = json{
-            {"embedding", std::vector<float>(n_embd, 0.0f)},
-        };
-
-        continue;
-      }
-
-      llama_embd_normalize(embd, embd_res.data(), n_embd);
-
+    if (!params.embedding) {
+      LOG_WARNING_LLAMA("embedding disabled",
+                        {
+                            {"params.embedding", params.embedding},
+                        });
      res.result_json = json{
-          {"embedding", embd_res},
+          {"embedding", std::vector<float>(n_embd, 0.0f)},
+      };
+    } else {
+      const float* data = llama_get_embeddings(ctx);
+      std::vector<float> embedding(data, data + n_embd);
+      res.result_json = json{
+          {"embedding", embedding},
      };
    }
    queue_results.push_back(res);
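
Note: the rewritten branch above serializes the raw buffer returned by llama_get_embeddings() as-is, while the deleted loop ran each per-sequence embedding through llama_embd_normalize() before building the JSON. As a reference for what that dropped step did, here is a minimal standalone sketch of the L2 normalization (plain C++, no llama.cpp dependency; the two-element vector is a made-up stand-in for an n_embd-sized buffer):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // L2-normalize "inp" into "out", mirroring what the removed
    // llama_embd_normalize() call did for a sequence embedding.
    static void embd_normalize(const float* inp, float* out, int n) {
      double sum = 0.0;
      for (int i = 0; i < n; i++) sum += (double)inp[i] * inp[i];
      const double norm = sum > 0.0 ? std::sqrt(sum) : 0.0;
      for (int i = 0; i < n; i++) {
        out[i] = norm > 0.0 ? (float)(inp[i] / norm) : 0.0f;
      }
    }

    int main() {
      // Stand-in for the pointer returned by llama_get_embeddings(ctx).
      std::vector<float> raw = {3.0f, 4.0f};
      std::vector<float> normalized(raw.size());
      embd_normalize(raw.data(), normalized.data(), (int)raw.size());
      printf("%.2f %.2f\n", normalized[0], normalized[1]);  // prints 0.60 0.80
      return 0;
    }
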
@@ -1401,7 +1380,7 @@ struct llama_server_context {
      std::vector<llama_token> append_tokens =
          tokenize(json_prompt, false);  // has next image
      for (int i = 0; i < (int)append_tokens.size(); ++i) {
-        llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id + 1}, true);
+        llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true);
        slot.n_past += 1;
      }
    }
@@ -1544,29 +1523,28 @@ struct llama_server_context {
 
    for (llama_client_slot& slot : slots) {
      if (slot.is_processing() &&
-          (int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+          slot.cache_tokens.size() >= (size_t)slot.n_ctx) {
        // Shift context
-        const int n_keep = slot.params.n_keep + add_bos_token;
-        const int n_left = (int)system_tokens.size() + slot.n_past - n_keep;
+        const int n_left = slot.n_past - slot.params.n_keep - 1;
        const int n_discard = n_left / 2;
 
        LOG_TEE(
            "slot %d: context shift - n_keep = %d, n_left = %d, n_discard "
            "= %d\n",
            slot.id, slot.params.n_keep, n_left, n_discard);
-        llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_discard);
-        llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard,
-                               system_tokens.size() + slot.n_past, -n_discard);
-
-        if (slot.params.cache_prompt) {
-          for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size();
-               i++) {
-            slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-          }
-
-          slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+        llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
+                              slot.params.n_keep + n_discard + 1);
+        llama_kv_cache_seq_add(ctx, slot.id,
+                               slot.params.n_keep + 1 + n_discard,
+                               slot.n_past, -n_discard);
+
+        for (size_t i = slot.params.n_keep + 1 + n_discard;
+             i < slot.cache_tokens.size(); i++) {
+          slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
        }
 
+        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+
        slot.n_past -= n_discard;
 
        slot.truncated = true;
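
Note: the context-shift rewrite above keeps the first n_keep + 1 positions, discards half of the remaining tokens, and slides the cached tokens down by n_discard before trimming the tail. A small self-contained sketch of just that arithmetic and compaction (made-up token values and n_keep, no llama.cpp KV-cache calls):

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical slot state: 16 cached tokens, keep the first n_keep + 1.
      std::vector<int> cache_tokens;
      for (int i = 0; i < 16; i++) cache_tokens.push_back(i);
      int n_past = (int)cache_tokens.size();
      const int n_keep = 3;

      // Same arithmetic as the hunk above: drop half of what is not kept.
      const int n_left = n_past - n_keep - 1;
      const int n_discard = n_left / 2;

      // Compact the token cache: entries after the discarded window slide
      // down by n_discard, then the tail is trimmed.
      for (size_t i = n_keep + 1 + n_discard; i < cache_tokens.size(); i++) {
        cache_tokens[i - n_discard] = cache_tokens[i];
      }
      cache_tokens.resize(cache_tokens.size() - n_discard);
      n_past -= n_discard;

      printf("n_left=%d n_discard=%d n_past=%d\n", n_left, n_discard, n_past);
      for (int t : cache_tokens) printf("%d ", t);  // 0 1 2 3 10 11 12 13 14 15
      printf("\n");
      return 0;
    }
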
@@ -1579,9 +1557,6 @@ struct llama_server_context {
      }
    }
 
-    // start populating the batch for this iteration
-    llama_batch_clear(batch);
-
    // decode any currently ongoing sequences
    for (auto& slot : slots) {
      // release the slot
@@ -1603,15 +1578,14 @@ struct llama_server_context {
      slot.i_batch = batch.n_tokens;
 
      llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past,
-                      {slot.id + 1}, true);
+                      {slot.id}, true);
 
      slot.n_decoded += 1;
      slot.n_past += 1;
    }
 
    // process in chunks of params.n_batch
-    int32_t n_batch = llama_n_batch(ctx);
-    int32_t n_ubatch = llama_n_ubatch(ctx);
+    int32_t n_batch = params.n_batch;
 
    // assign workload to the slots
    if (params.cont_batching || batch.n_tokens == 0) {
@@ -1667,7 +1641,8 @@ struct llama_server_context {
        } else {
          prompt_tokens = tokenize(
              slot.prompt,
-              system_prompt.empty());  // add BOS if there isn't system prompt
+              system_prompt.empty() &&
+                  add_bos_token);  // add BOS if there isn't system prompt
        }
 
        slot.num_prompt_tokens = prompt_tokens.size();
@@ -1763,11 +1738,9 @@ struct llama_server_context {
        std::vector<llama_token> prefix_tokens =
            has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token)
                       : prompt_tokens;
-        for (;
-             slot.n_past < slot.num_prompt_tokens && batch.n_tokens < n_batch;
-             ++slot.n_past) {
+        for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) {
          llama_batch_add(batch, prefix_tokens[slot.n_past],
-                          system_tokens.size() + slot.n_past, {slot.id + 1},
+                          system_tokens.size() + slot.n_past, {slot.id},
                          false);
        }
 
@@ -1830,8 +1803,7 @@ struct llama_server_context {
      }
 
      for (auto& slot : slots) {
-        if (slot.state != PROCESSING || slot.i_batch < (int)i ||
-            slot.i_batch >= (int)(i + n_tokens)) {
+        if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) {
          continue;
        }
 
@@ -1840,7 +1812,7 @@ struct llama_server_context {
        if (slot.embedding) {
          send_embedding(slot);
          slot.release();
-          continue;
+          return true;
        }
 
        completion_token_output result;