@@ -842,7 +842,7 @@ struct llama_server_context {
         }
         if (!found) {
           LOG_DEBUG << "ERROR: Image with id: " << img_id
-                    << ", not found.\n";
+                    << ", not found.\n";
           slot->images.clear();
           return false;
         }
@@ -871,7 +871,7 @@ struct llama_server_context {
     all_slots_are_idle = false;
 
     LOG_DEBUG << "slot " << slot->id
-              << " is processing [task id: " << slot->task_id << "]";
+              << " is processing [task id: " << slot->task_id << "]";
 
     return true;
   }
@@ -1255,36 +1255,45 @@ struct llama_server_context {
     res.stop = true;
 
     const int n_embd = llama_n_embd(model);
+    if (!params.embedding) {
+      LOG_WARNING_LLAMA("embedding disabled",
+                        {
+                            {"params.embedding", params.embedding},
+                        });
+      res.result_json = json{
+          {"embedding", std::vector<float>(n_embd, 0.0f)},
+      };
+    } else {
+      std::vector<float> embd_res(n_embd, 0.0f);
 
-    std::vector<float> embd_res(n_embd, 0.0f);
+      for (int i = 0; i < batch.n_tokens; ++i) {
+        if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+          continue;
+        }
 
-    for (int i = 0; i < batch.n_tokens; ++i) {
-      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
-        continue;
-      }
+        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+        if (embd == NULL) {
+          embd = llama_get_embeddings_ith(ctx, i);
+        }
 
-      const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-      if (embd == NULL) {
-        embd = llama_get_embeddings_ith(ctx, i);
-      }
+        if (embd == NULL) {
+          LOG_ERROR << "failed to get embeddings"
+                    << " token " << batch.token[i] << ", seq_id "
+                    << batch.seq_id[i][0];
 
-      if (embd == NULL) {
-        LOG_ERROR << "failed to get embeddings"
-                  << " token " << batch.token[i] << ", seq_id "
-                  << batch.seq_id[i][0];
+          res.result_json = json{
+              {"embedding", std::vector<float>(n_embd, 0.0f)},
+          };
 
-        res.result_json = json{
-            {"embedding", std::vector<float>(n_embd, 0.0f)},
-        };
+          continue;
+        }
 
-        continue;
+        llama_embd_normalize(embd, embd_res.data(), n_embd);
       }
-
-      llama_embd_normalize(embd, embd_res.data(), n_embd);
+      res.result_json = json{
+          {"embedding", embd_res},
+      };
     }
-    res.result_json = json{
-        {"embedding", embd_res},
-    };
 
     queue_results.push_back(res);
     condition_results.notify_all();
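The new branch above only computes embeddings when `params.embedding` is enabled; inside the loop it prefers the pooled per-sequence embedding, falls back to the per-token one, and normalizes the result. Below is a minimal standalone sketch of that retrieve-and-normalize pattern, assuming the llama.cpp C API already used in the hunk (`llama_get_embeddings_seq`, `llama_get_embeddings_ith`) plus the `llama_embd_normalize` helper from llama.cpp's common library; the function name `extract_embedding` is illustrative and not part of the patch.

```cpp
// Sketch only - mirrors the retrieval logic in the hunk above; it is not
// code from the patch.
#include <vector>

#include "common.h"  // llama_embd_normalize (llama.cpp common library)
#include "llama.h"   // llama_context, llama_batch, llama_get_embeddings_*

static std::vector<float> extract_embedding(llama_context * ctx,
                                            const llama_batch & batch,
                                            llama_seq_id slot_id, int n_embd) {
  std::vector<float> out(n_embd, 0.0f);
  for (int i = 0; i < batch.n_tokens; ++i) {
    // Only the token marked for output that belongs to this slot's sequence
    // carries an embedding.
    if (!batch.logits[i] || batch.seq_id[i][0] != slot_id) {
      continue;
    }
    // Prefer the pooled per-sequence embedding; fall back to the per-token one.
    const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
    if (embd == NULL) {
      embd = llama_get_embeddings_ith(ctx, i);
    }
    if (embd != NULL) {
      llama_embd_normalize(embd, out.data(), n_embd);  // L2-normalize into out
    }
  }
  return out;  // stays all-zero if no embedding could be retrieved
}
```

Returning an all-zero vector when nothing can be retrieved mirrors the patch's choice of reporting a zero embedding instead of failing the request.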
@@ -1556,8 +1565,8 @@ struct llama_server_context {
         const int n_discard = n_left / 2;
 
         LOG_DEBUG << "slot " << slot.id
-                  << " context shift - n_keep = " << slot.params.n_keep
-                  << ", n_left = " << n_left << ", n_discard: " << n_discard;
+                  << " context shift - n_keep = " << slot.params.n_keep
+                  << ", n_left = " << n_left << ", n_discard: " << n_discard;
         llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
                               slot.params.n_keep + n_discard + 1);
         llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard,
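The shift above evicts the oldest half of the tokens not protected by `n_keep` and slides the rest down so generation can continue. A small sketch of that arithmetic using the two KV-cache calls from the hunk; the definition of `n_left` and the trailing arguments of `llama_kv_cache_seq_add` fall outside the diff context, so the values used here (`n_past - n_keep - 1`, end position `n_past`, delta `-n_discard`) are assumptions modeled on the upstream llama.cpp server example.

```cpp
// Sketch only - the context-shift arithmetic from the hunk above; the n_left
// formula and the seq_add end/delta arguments are assumptions, not patch code.
#include "llama.h"

static void shift_context(llama_context * ctx, llama_seq_id seq_id,
                          int n_keep, int n_past) {
  const int n_left    = n_past - n_keep - 1;  // tokens eligible for eviction
  const int n_discard = n_left / 2;           // drop the oldest half of them

  // Evict positions [n_keep + 1, n_keep + n_discard + 1) ...
  llama_kv_cache_seq_rm(ctx, seq_id, n_keep + 1, n_keep + n_discard + 1);
  // ... then slide the surviving positions down by n_discard so the sequence
  // stays contiguous and generation can continue.
  llama_kv_cache_seq_add(ctx, seq_id, n_keep + 1 + n_discard, n_past,
                         -n_discard);
}
```

For example, with `n_keep = 4` and `n_past = 100`, `n_left = 95` and `n_discard = 47`: positions 5..51 are removed and positions 52..99 slide down to 5..52.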
@@ -1591,7 +1600,7 @@ struct llama_server_context {
         slot.t_last_used = ggml_time_us();
 
         LOG_DEBUG << "slot " << slot.id << " released ("
-                  << (int)slot.cache_tokens.size() << " tokens in cache)";
+                  << (int)slot.cache_tokens.size() << " tokens in cache)";
 
         continue;
       }
@@ -1725,12 +1734,12 @@ struct llama_server_context {
               slot.num_prompt_tokens - slot.n_past;
 
           LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
-                    << " tokens | to process: "
-                    << slot.num_prompt_tokens_processed << " tokens";
+                    << " tokens | to process: "
+                    << slot.num_prompt_tokens_processed << " tokens";
         }
 
         LOG_DEBUG << "slot " << slot.id << " : kv cache rm - ["
-                  << (int)system_tokens.size() + slot.n_past << ", end)";
+                  << (int)system_tokens.size() + slot.n_past << ", end)";
 
         llama_kv_cache_seq_rm(ctx, slot.id,
                               system_tokens.size() + slot.n_past, -1);
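The `llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1)` call above keeps the reusable prefix and discards everything after it (a negative end position means "to the end of the sequence" in the llama.cpp API). How `slot.n_past` is obtained is outside this hunk; below is a hedged sketch of the usual longest-common-prefix computation, with `common_prefix_len` as an illustrative name rather than a function from the patch.

```cpp
// Sketch only - an assumed helper, not code from the patch: n_past is
// typically the length of the longest shared prefix between the tokens
// already in the KV cache and the new prompt.
#include <cstddef>
#include <vector>

#include "llama.h"  // llama_token

static size_t common_prefix_len(const std::vector<llama_token> & cached,
                                const std::vector<llama_token> & prompt) {
  size_t n = 0;
  while (n < cached.size() && n < prompt.size() && cached[n] == prompt[n]) {
    ++n;  // every matching leading token can stay in the KV cache
  }
  return n;
}
```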
@@ -1740,8 +1749,8 @@ struct llama_server_context {
         if (slot.n_past == slot.num_prompt_tokens) {
           // we have to evaluate at least 1 token to generate logits.
           LOG_DEBUG << "slot " << slot.id
-                    << " : we have to evaluate at least 1 token to "
-                       "generate logits";
+                    << " : we have to evaluate at least 1 token to "
+                       "generate logits";
           slot.n_past--;
         }
 
@@ -1811,8 +1820,8 @@ struct llama_server_context {
         // if you get here, it means the KV cache is full - try increasing it
         // via the context size
         LOG_DEBUG << __func__
-                  << " : failed to decode the batch, n_batch = " << n_batch
-                  << ", ret = " << ret;
+                  << " : failed to decode the batch, n_batch = " << n_batch
+                  << ", ret = " << ret;
         return false;
       }
 
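For reference, the failure branch above fires when `llama_decode` cannot place the batch in the KV cache. A small sketch of how the return value is commonly interpreted, based on the `llama.h` documentation (0 = success, 1 = no free KV slot, negative = error); the halve-and-retry idea follows the upstream llama.cpp server example and is only an assumption about this codebase.

```cpp
// Sketch only - how the llama_decode() return value checked above is usually
// interpreted, per the llama.h docs; not code from the patch.
#include "llama.h"

enum class decode_status { ok, retry_smaller_batch, fatal };

static decode_status classify_decode(llama_context * ctx,
                                     const llama_batch & batch_view,
                                     int n_batch) {
  const int ret = llama_decode(ctx, batch_view);
  if (ret == 0) {
    return decode_status::ok;  // batch decoded successfully
  }
  if (ret > 0 && n_batch > 1) {
    // ret == 1: no free KV-cache slot for this batch; a smaller batch may fit.
    return decode_status::retry_smaller_batch;
  }
  // ret < 0, or the batch is already minimal: the KV cache is genuinely full
  // (or decoding failed) - this is the branch logged above.
  return decode_status::fatal;
}
```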