From 75998d99428debc97ed9e6992d188979eb8539b2 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Mon, 11 Sep 2023 10:09:56 +0000 Subject: [PATCH 01/14] inital align beam_search commit Signed-off-by: Yu, Zhentao --- .../runtime/graph/application/pybind_gptj.cpp | 157 +++++------ .../graph/models/model_utils/model_utils.cpp | 256 +++++++++++++----- .../graph/models/model_utils/model_utils.h | 27 +- 3 files changed, 291 insertions(+), 149 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index 50db74ddaf0..8318ef2b8a2 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -225,85 +225,86 @@ int main(int argc, char* argv[]) { for (auto gptj_in_all : ctxs) { auto res = eval_gptj_char( gptj_in_all, - //"she opened the door and see", + // "she opened the door and see", + "Once upon a time", // "A spaceship lands on the moon", - "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing " - "on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. " - "There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went " - "right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. " - "Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of " - "enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I " - "could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that " - "evolution is unassisted. 
It happens through several seemingly random mutations over time, with the most apt " - "permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a " - "species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. " - "That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve " - "something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by " - "creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that " - "rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to " - "create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call " - "it evolution. This is a problem, of course, every other contestant also had to face. And judging by the " - "entries submitted, not many managed to work around it. I'd say the only real solution was through the use of " - "artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this " - "is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed " - "myself to pick whatever I thought would work out. My initial idea was to create something where humanity " - "tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had " - "this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space " - "Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next " - "inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are " - "you the borg, or fighting the Borg? 
The third and final idea came to me through my girlfriend, who somehow " - "gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it " - "sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey " - "(who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it " - "involved into the idea of having individual pieces of pasta flying around and trying to evolve until they " - "became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti " - "Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: " - "you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, " - "each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through " - "a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', " - "which are debited from your credits (you start with a number of credits). Once spawned, your pastas start " - "flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game " - "is having your pasta conquer all the plates on the table). But they are really autonomous, so after being " - "spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other " - "people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other " - "pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. " - "It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. 
If " - "pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, " - "until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every " - "plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, " - "Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and " - "Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge " - "with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing " - "for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It " - "made the livestream pretty effortless and the features are awesome, even for the free version. It was great " - "to have some of my friends watch me, and then interact with them and random people through chat. It was also " - "good knowing that I was also recording a local version of the files, so I could make a timelapse video later. " - "Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if " - "someone was watching over my shoulder. It made me realize that sometimes I spend too much time in seemingly " - "inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll " - "probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty " - "much spend half of the time writing a line and the other half fixing the crazy characters in it. My own " - "stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the " - "cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on " - "my second monitor all the time. 
It's actually a bit sad, because if I could, I could have spent the whole " - "weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, " - "when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended " - "up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but " - "I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how " - "much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at " - "the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They " - "think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too much, " - "you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area " - "of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to " - "indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color " - "denotes the team owner. But it's something that fits so well into the design that people never realized it, " - "until they were told about it. In the end, the idea of going with a full physical metaphor is one that should " - "be done with care. Things that are very important risk becoming background noise, unless the player knows its " - "importance. Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up " - "adding it at the bottom to indicate your credits and bases owned, as well as the hideous " - "out-of-place-and-still-not-obvious 'Call Waiter' button. 
But in hindsight, I should have gone with a simple " - "HUD from the start, especially one that indicated each team's colors and general state of the game without " - "the need for zooming in and out. Development Development went fast.", + // "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing " + // "on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. " + // "There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went " + // "right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. " + // "Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of " + // "enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I " + // "could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that " + // "evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt " + // "permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a " + // "species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. " + // "That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve " + // "something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by " + // "creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that " + // "rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to " + // "create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call " + // "it evolution. 
This is a problem, of course, every other contestant also had to face. And judging by the " + // "entries submitted, not many managed to work around it. I'd say the only real solution was through the use of " + // "artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this " + // "is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed " + // "myself to pick whatever I thought would work out. My initial idea was to create something where humanity " + // "tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had " + // "this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space " + // "Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next " + // "inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are " + // "you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow " + // "gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it " + // "sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey " + // "(who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it " + // "involved into the idea of having individual pieces of pasta flying around and trying to evolve until they " + // "became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti " + // "Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: " + // "you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, " + // "each with their own plate. 
Your plate can spawn little pieces of pasta. You do so by 'ordering' them through " + // "a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', " + // "which are debited from your credits (you start with a number of credits). Once spawned, your pastas start " + // "flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game " + // "is having your pasta conquer all the plates on the table). But they are really autonomous, so after being " + // "spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other " + // "people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other " + // "pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. " + // "It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If " + // "pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, " + // "until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every " + // "plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, " + // "Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and " + // "Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge " + // "with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing " + // "for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It " + // "made the livestream pretty effortless and the features are awesome, even for the free version. 
It was great " + // "to have some of my friends watch me, and then interact with them and random people through chat. It was also " + // "good knowing that I was also recording a local version of the files, so I could make a timelapse video later. " + // "Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if " + // "someone was watching over my shoulder. It made me realize that sometimes I spend too much time in seemingly " + // "inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll " + // "probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty " + // "much spend half of the time writing a line and the other half fixing the crazy characters in it. My own " + // "stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the " + // "cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on " + // "my second monitor all the time. It's actually a bit sad, because if I could, I could have spent the whole " + // "weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, " + // "when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended " + // "up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but " + // "I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how " + // "much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at " + // "the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They " + // "think it's some kind of prop, so they never actually read its details. 
Plus, if you're zoomed out too much, " + // "you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area " + // "of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to " + // "indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color " + // "denotes the team owner. But it's something that fits so well into the design that people never realized it, " + // "until they were told about it. In the end, the idea of going with a full physical metaphor is one that should " + // "be done with care. Things that are very important risk becoming background noise, unless the player knows its " + // "importance. Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up " + // "adding it at the bottom to indicate your credits and bases owned, as well as the hideous " + // "out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple " + // "HUD from the start, especially one that indicated each team's colors and general state of the game without " + // "the need for zooming in and out. 
Development Development went fast.", 128, 40, 1.0, 0.8, 2048); std::cout << res << std::endl; exit_gptj(gptj_in_all); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index b1ac55eb4ae..e0de6ad608e 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -1937,6 +1937,7 @@ struct logits_info { const model_context* const ctx = nullptr; // (batch, seq_len * vocab_size) const float* const logits = nullptr; + std::vector> next_token_scores; // (input_prompt_bs* beam_size, n_vocab) const int batch_size; const int32_t n_vocab; // last seq_len indice @@ -1969,13 +1970,43 @@ struct logits_info { normalizers[i] = 1.0f / std::accumulate(logits + i * bs_stride + offset, logits + i * bs_stride + offset + n_vocab, 0.0f, sum_exp{max_ls[i]}); } + next_token_scores.reserve(batch_size * n_vocab); + next_token_scores.resize(batch_size); } model_token_data get_token_data(const int& batch_idx, const int32_t& token_idx) const { return {token_idx, *(logits + batch_idx * bs_stride + offset + token_idx), 0.0f}; } - // Return top k token_data by logit. 
(batch, top_k) + float probability_from_logit(const int& batch_idx, const float& logit) { + return normalizers[batch_idx] * std::exp(logit - max_ls[batch_idx]); + } + + float log_probability_from_logit(const int& batch_idx, const float& logit) { + return std::log(probability_from_logit(batch_idx, logit)); + } + + void compute_log_softmax_logits() { +#pragma omp parallel for + for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for + for (int j = 0; j < n_vocab; ++j) { + float score = log_probability_from_logit(i, *(logits + i * bs_stride + offset + j)); + next_token_scores[i].push_back(score); + } + } + } + + // token score + pre tokens score + void compute_next_token_scores(const std::vector& beams) { + MODEL_ASSERT(batch_size == beams.size()); +#pragma omp parallel for + for (int i = 0; i < batch_size; ++i) { + std::for_each(next_token_scores[i].begin(), next_token_scores[i].end(), [&](float& s) { s += beams[i].score; }); + } + } + + // Return top k token_data by score. (batch, top_k) std::vector> top_k(const int& k) { std::vector> min_heap(batch_size); // min-heap by logit int tk = std::min(k, n_vocab); @@ -1999,35 +2030,81 @@ struct logits_info { } return min_heap; } +}; - float probability_from_logit(const int& batch_idx, const float& logit) { - return normalizers[batch_idx] * std::exp(logit - max_ls[batch_idx]); - } - - float log_probability_from_logit(const int& batch_idx, const float& logit) { - return std::log(probability_from_logit(batch_idx, logit)); +// Return top k token_data by score. 
(prompt_bs * 2 * num_beam) +std::vector beam_top_k(const model_context* ctx, const std::vector>& token_scores, + const std::vector& num_beams, const std::vector beam_indices, + const int& sample_scale, const int& dim) { + MODEL_ASSERT(dim == -1); // raise unimplemented error + MODEL_ASSERT(token_scores.size() == ctx->batch_size); // prompt bs * num_beam + MODEL_ASSERT(token_scores[0].size() == ctx->model.hparams.n_vocab); + const int request_bs = 1; // TODO ctx->request_running_num + MODEL_ASSERT(num_beams.size() == request_bs); + std::vector res; + res.reserve(sample_scale * std::accumulate(num_beams.begin(), num_beams.end(), 0)); + std::vector min_heap; + const uint32_t n_vocab = ctx->model.hparams.n_vocab; + size_t row_off = 0; + auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; +#pragma omp parallel for + for (int i = 0; i < request_bs; ++i) { + const int num_beam = num_beams[i]; + const int sample_k = sample_scale * num_beam; + min_heap.clear(); + min_heap.resize(sample_k); +#pragma omp parallel for + for (int j = 0; j < num_beam; ++j) { +#pragma omp parallel for + for (int n = 0; n < n_vocab; ++n) { + if (min_heap.size() < sample_k) { + min_heap.push_back(beam_top_k_res({n, token_scores[row_off + j][n], beam_indices[row_off + j]})); + } else if (min_heap.size() == sample_k) { + std::make_heap(min_heap.begin(), min_heap.end(), comp); + } else { + beam_top_k_res nr({n, token_scores[row_off + j][n], beam_indices[row_off + j]}); + if (min_heap.front().score < nr.score) { + std::pop_heap(min_heap.begin(), min_heap.end(), comp); + min_heap.back().id = nr.id; + min_heap.back().score = nr.score; + min_heap.back().beam_idx = nr.beam_idx; + std::push_heap(min_heap.begin(), min_heap.end(), comp); + } + } + } + } + row_off += i * num_beam; + for (const auto b : min_heap) { + res.push_back(b); + } } -}; + return res; +} -void logits_processor::min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& 
eos_token_id) { +void logits_processor::min_new_tokens_logits_process(const uint32_t& cur_len, + std::vector>& token_scores, + const model_vocab::id& eos_token_id) { MODEL_ASSERT(ctx->generation_conf.min_new_tokens >= 0); if (ctx->generation_conf.min_new_tokens == 0 || ctx->generation_conf.min_new_tokens <= cur_len) { return; } else { - int batch_size = ctx->batch_size; - size_t offset = ctx->logits.size() / ctx->batch_size - ctx->model.hparams.n_vocab; - size_t bs_stride = ctx->logits.size() / ctx->batch_size; + // batch_size (input_ptompt_bs * beam_size, n_vocab) + MODEL_ASSERT(token_scores.size() == ctx->batch_size); + MODEL_ASSERT(token_scores[0].size() == ctx->model.hparams.n_vocab); + int batch_size = token_scores.size(); + uint32_t n_vocab = token_scores[0].size(); for (int i = 0; i < batch_size; ++i) { // forbidden to choose eos_token if cur_len < min_new_tokens - *(model_get_logits(ctx) + i * bs_stride + offset + eos_token_id) = 0.0f; + token_scores[i][eos_token_id] = NEG_INF; } } } -void logits_processor::process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { +void logits_processor::process(const uint32_t& cur_len, std::vector>& token_scores, + const model_vocab::id& eos_token_id) { MODEL_ASSERT(model_get_logits(ctx) != nullptr); if (min_new_tokens > 0) { - min_new_tokens_logits_process(cur_len, eos_token_id); + min_new_tokens_logits_process(cur_len, token_scores, eos_token_id); } } @@ -2117,6 +2194,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { int record = 0; int batch_size = 0; uint32_t cur_len = 0; + std::vector beam_indices; for (int i = 0; i < beam_size; ++i) { // is done or not if (!cur_beams[i].eos()) { @@ -2130,6 +2208,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { embd_inp.push_back(cur_beams[i].token_ids.back()); infer_beam_ids[i] = record++; batch_size++; + beam_indices.push_back(i); } } // DEBUG @@ -2154,11 +2233,14 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { 
} #endif - lp.process(cur_len, 50256); // TODO ctx->model.eos_id; logits_info li(ctx); - // sample 2 - const int sample_num = 2; - std::vector> next_tokens = li.top_k(sample_num); + li.compute_log_softmax_logits(); + lp.process(cur_len, li.next_token_scores, 50256); // TODO ctx->model.eos_id; + li.compute_next_token_scores(cur_beams); + const int sample_scale = 2; + std::vector next_tokens = + beam_top_k(ctx, li.next_token_scores, {batch_size}, beam_indices, sample_scale); + // std::vector> next_tokens = li.top_k(sample_num); // DEBUG #if 0 for (int k = 0; k < next_tokens.size(); ++k) { @@ -2169,46 +2251,73 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } } #endif - MODEL_ASSERT(next_tokens.size() == batch_size); + MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); + MODEL_ASSERT(next_beams.empty()); for (int i = 0; i < beam_size; ++i) { beam b = cur_beams[i]; if (b.eos()) { - // b is at end-of-sentence, so just copy it to next_beams if its - // probability is high enough. 
- if (next_beams.size() < beam_size) { - next_beams.push_back(b); - if (next_beams.size() == beam_size) { - std::make_heap(next_beams.begin(), next_beams.end(), comp); - } - } else if (next_beams.front().score < b.score) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = b; - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } else { - int j = 0; - if (next_beams.size() < beam_size) { - for (; next_beams.size() < beam_size && j < sample_num; ++j) { - beam next_beam = b; - next_beam.token_ids.push_back(next_tokens[infer_beam_ids[i]][j].id); - next_beam.score += li.log_probability_from_logit(infer_beam_ids[i], next_tokens[infer_beam_ids[i]][j].logit); - next_beams.push_back(std::move(next_beam)); - } - std::make_heap(next_beams.begin(), next_beams.end(), comp); + printf("eos \n"); + if (b.score != 100) { + b.eos_score = b.score; + b.score = 100; } - for (; j < sample_num; ++j) { - float const next_score = - b.score + li.log_probability_from_logit(infer_beam_ids[i], next_tokens[infer_beam_ids[i]][j].logit); - if (next_beams.front().score < next_score) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = b; - next_beams.back().token_ids.push_back(next_tokens[infer_beam_ids[i]][j].id); - next_beams.back().score = next_score; - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } - } - } + next_beams.push_back(std::move(b)); + } + } + if (next_beams.size() < beam_size) { + std::sort(next_tokens.begin(), next_tokens.end(), + [](beam_top_k_res& a, beam_top_k_res& b) { return a.score < b.score; }); + int add_num = beam_size - next_beams.size(); + for (int j = 0; j < add_num; ++j) { + beam next_beam = cur_beams[next_tokens[j].beam_idx]; + next_beam.token_ids.push_back(next_tokens[j].id); + next_beam.score = next_tokens[j].score; + next_beams.push_back(std::move(next_beam)); + } + } + // for (int i = 0; i < beam_size; ++i) { + // beam b = cur_beams[i]; + // if (b.eos()) { + // // 
b is at end-of-sentence, so just copy it to next_beams if its + // // probability is high enough. + // if (next_beams.size() < beam_size) { + // if (b.score != 100) { + // b.eos_score = b.score; + // b.score = 100; + // } + // next_beams.push_back(b); + // if (next_beams.size() == beam_size) { + // std::make_heap(next_beams.begin(), next_beams.end(), comp); + // } + // } else if (next_beams.front().score < b.score) { + // std::pop_heap(next_beams.begin(), next_beams.end(), comp); + // next_beams.back() = b; + // std::push_heap(next_beams.begin(), next_beams.end(), comp); + // } + // } else { + // int j = 0; + // if (next_beams.size() < beam_size) { + // for (; next_beams.size() < beam_size && j < sample_num; ++j) { + // beam next_beam = b; + // next_beam.token_ids.push_back(next_tokens[infer_beam_ids[i]][j].id); + // next_beam.score += li.log_probability_from_logit(infer_beam_ids[i], + // next_tokens[infer_beam_ids[i]][j].logit); next_beams.push_back(std::move(next_beam)); + // } + // std::make_heap(next_beams.begin(), next_beams.end(), comp); + // } + // for (; j < sample_num; ++j) { + // float const next_score = + // b.score + li.log_probability_from_logit(infer_beam_ids[i], next_tokens[infer_beam_ids[i]][j].logit); + // if (next_beams.front().score < next_score) { + // std::pop_heap(next_beams.begin(), next_beams.end(), comp); + // next_beams.back() = b; + // next_beams.back().token_ids.push_back(next_tokens[infer_beam_ids[i]][j].id); + // next_beams.back().score = next_score; + // std::push_heap(next_beams.begin(), next_beams.end(), comp); + // } + // } + // } + // } std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); } @@ -2327,27 +2436,28 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c if (kv_reorder == nullptr) { kv_reorder = std::make_shared(ctx); } - for (int n = 0; n < max_new_tokens && !eos(top_beam()) && !std::all_of(cur_beams.begin(), cur_beams.end(), eos); - ++n) { + 
for (int n = 0; n < max_new_tokens && !std::all_of(cur_beams.begin(), cur_beams.end(), eos); ++n) { // first step if (n_past == 0) { model_eval(ctx, embd.data(), n_tokens, n_past, num_threads); n_past += n_tokens; kv_reorder->update(n_past, n_tokens); - lp.process(0, 50256); // TODO ctx->model.eos_id; logits_info li(ctx); - std::vector> next_tokens = li.top_k(beam_size); - MODEL_ASSERT(next_tokens.size() == 1); + li.compute_log_softmax_logits(); + lp.process(0, li.next_token_scores, 50256); // TODO ctx->model.eos_id; + li.compute_next_token_scores({cur_beams[0]}); + std::vector next_tokens = beam_top_k(ctx, li.next_token_scores, {1}, {0}, beam_size); + MODEL_ASSERT(next_tokens.size() == beam_size); cur_beams.clear(); for (int i = 0; i < beam_size; ++i) { beam b; b.ctx = ctx; - b.token_ids.push_back(next_tokens[0][i].id); - b.score = li.log_probability_from_logit(0, next_tokens[0][i].logit); + b.token_ids.push_back(next_tokens[i].id); + b.score = next_tokens[i].score; b.infer_bs_id = i; cur_beams.push_back(b); } - beam_score_length_penalize(); + // beam_score_length_penalize(); } else { fill_next_beams_by_top_probabilities(); std::unordered_map kv_reorder_indices = update_kv_cache_reorder_indices(); @@ -2355,20 +2465,34 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c kv_reorder->update(n_past, n_tokens, kv_reorder_indices, next_beams); cur_beams.swap(next_beams); next_beams.clear(); - beam_score_length_penalize(); + // beam_score_length_penalize(); } #if 0 // DEBUG: print current beams for this iteration printf("\n\nCurrent beams:\n"); - for (size_t j = 0; j < beams.size(); ++j) { + for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); - beams[j].print(); + cur_beams[j].print(); fflush(stdout); } #endif } + for (auto& b : cur_beams) { + if (b.eos()) { + b.score = b.eos_score; + } + } + beam_score_length_penalize(); const beam& top_b = top_beam(); +#if 1 // DEBUG: print current beams for this iteration + printf("\n\nCurrent 
beams:\n"); + for (size_t j = 0; j < cur_beams.size(); ++j) { + printf("beams[%d]: ", j); + cur_beams[j].print(); + fflush(stdout); + } +#endif #if 0 // DEBUG: print final beam result printf("\n\nFinal beam:\n"); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index f7456bc0593..aaae83b1175 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "application/common.h" #include "models/model_utils/model_config.h" @@ -259,19 +260,33 @@ MODEL_API const char* model_print_system_info(void); #endif /* beam search utils */ +#define NEG_INF -std::numeric_limits::max() + +typedef struct beam_top_k_res { + model_token id; // token id + float score; // score of the token + int beam_idx; // token in which beam +} beam_top_k_res; + +MODEL_API std::vector beam_top_k(const model_context* ctx, + const std::vector>& token_scores, + const std::vector& num_beams, const std::vector beam_indices, + const int& sample_scale = 2, const int& dim = -1); + struct beam { const model_context* ctx = nullptr; std::vector token_ids; // Cumulative beam score (log-softmax here) float score; + float eos_score; // record inference batch indice int infer_bs_id; // end-of-text const bool eos() const { return !token_ids.empty() && token_ids.back() == 50256; } // TODO ctx->vocab.eos_id void print() const { - printf("score: %0.6f, eos: %d, tokens: ", score, eos()); + printf("length: %d, score: %0.6f, eos: %d, tokens:\n", token_ids.size(), score, eos()); for (const auto& id : token_ids) { - printf("%s", model_token_to_str(ctx, id)); + printf("%d: %s, ", id, model_token_to_str(ctx, id)); } printf("\n"); } @@ -284,8 +299,10 @@ class logits_processor { explicit 
logits_processor(model_context* lctx) : ctx(lctx), min_new_tokens(lctx->generation_conf.min_new_tokens) {} ~logits_processor() {} - void process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); - void min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); + void process(const uint32_t& cur_len, std::vector>& token_scores, + const model_vocab::id& eos_token_id); + void min_new_tokens_logits_process(const uint32_t& cur_len, std::vector>& token_scores, + const model_vocab::id& eos_token_id); private: model_context* ctx = nullptr; @@ -324,7 +341,7 @@ class beam_search_flow { explicit beam_search_flow(model_context* lctx) : ctx(lctx), beam_size(lctx->beam_size), lp(logits_processor(lctx)) { cur_beams.reserve(beam_size); next_beams.reserve(beam_size); - cur_beams.push_back({ctx, {}, 1.0f}); + cur_beams.push_back({ctx, {}, 0.0f}); } ~beam_search_flow() {} From cfc71d7e0c96b80173f336308038fd76b34c5cb2 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Wed, 13 Sep 2023 03:09:20 +0000 Subject: [PATCH 02/14] fix garbled output strings Signed-off-by: Yu, Zhentao --- .../graph/models/model_utils/model_utils.cpp | 112 +++++++++--------- 1 file changed, 55 insertions(+), 57 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index e0de6ad608e..a9ca28e7876 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -1970,7 +1970,6 @@ struct logits_info { normalizers[i] = 1.0f / std::accumulate(logits + i * bs_stride + offset, logits + i * bs_stride + offset + n_vocab, 0.0f, sum_exp{max_ls[i]}); } - next_token_scores.reserve(batch_size * n_vocab); next_token_scores.resize(batch_size); } @@ -1989,6 +1988,7 @@ struct logits_info { void 
compute_log_softmax_logits() { #pragma omp parallel for for (int i = 0; i < batch_size; ++i) { + next_token_scores[i].reserve(n_vocab); #pragma omp parallel for for (int j = 0; j < n_vocab; ++j) { float score = log_probability_from_logit(i, *(logits + i * bs_stride + offset + j)); @@ -1998,38 +1998,38 @@ struct logits_info { } // token score + pre tokens score - void compute_next_token_scores(const std::vector& beams) { - MODEL_ASSERT(batch_size == beams.size()); + void compute_next_token_scores(const std::vector& beams_score) { + MODEL_ASSERT(batch_size == beams_score.size()); #pragma omp parallel for for (int i = 0; i < batch_size; ++i) { - std::for_each(next_token_scores[i].begin(), next_token_scores[i].end(), [&](float& s) { s += beams[i].score; }); + std::for_each(next_token_scores[i].begin(), next_token_scores[i].end(), [&](float& s) { s += beams_score[i]; }); } } // Return top k token_data by score. (batch, top_k) - std::vector> top_k(const int& k) { - std::vector> min_heap(batch_size); // min-heap by logit - int tk = std::min(k, n_vocab); - // min_heap.reserve(batch_size * tk); - for (int idx = 0; idx < batch_size; ++idx) { - for (int32_t token_idx = 0; token_idx < tk; ++token_idx) { - min_heap[idx].push_back(get_token_data(idx, token_idx)); - } - } - auto comp = [](const model_token_data& a, const model_token_data& b) { return a.logit > b.logit; }; - for (int idx = 0; idx < batch_size; ++idx) { - std::make_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); - for (int32_t token_idx = tk; token_idx < n_vocab; ++token_idx) { - if (min_heap[idx].front().logit < get_token_data(idx, token_idx).logit) { - std::pop_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); - min_heap[idx].back().id = token_idx; - min_heap[idx].back().logit = get_token_data(idx, token_idx).logit; - std::push_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); - } - } - } - return min_heap; - } + // std::vector> top_k(const int& k) { + // std::vector> min_heap(batch_size); // 
min-heap by logit + // int tk = std::min(k, n_vocab); + // // min_heap.reserve(batch_size * tk); + // for (int idx = 0; idx < batch_size; ++idx) { + // for (int32_t token_idx = 0; token_idx < tk; ++token_idx) { + // min_heap[idx].push_back(get_token_data(idx, token_idx)); + // } + // } + // auto comp = [](const model_token_data& a, const model_token_data& b) { return a.logit > b.logit; }; + // for (int idx = 0; idx < batch_size; ++idx) { + // std::make_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); + // for (int32_t token_idx = tk; token_idx < n_vocab; ++token_idx) { + // if (min_heap[idx].front().logit < get_token_data(idx, token_idx).logit) { + // std::pop_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); + // min_heap[idx].back().id = token_idx; + // min_heap[idx].back().logit = get_token_data(idx, token_idx).logit; + // std::push_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); + // } + // } + // } + // return min_heap; + // } }; // Return top k token_data by score. 
(prompt_bs * 2 * num_beam) @@ -2047,29 +2047,27 @@ std::vector beam_top_k(const model_context* ctx, const std::vect const uint32_t n_vocab = ctx->model.hparams.n_vocab; size_t row_off = 0; auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; -#pragma omp parallel for for (int i = 0; i < request_bs; ++i) { const int num_beam = num_beams[i]; const int sample_k = sample_scale * num_beam; min_heap.clear(); - min_heap.resize(sample_k); -#pragma omp parallel for + min_heap.reserve(sample_k); for (int j = 0; j < num_beam; ++j) { -#pragma omp parallel for - for (int n = 0; n < n_vocab; ++n) { - if (min_heap.size() < sample_k) { - min_heap.push_back(beam_top_k_res({n, token_scores[row_off + j][n], beam_indices[row_off + j]})); - } else if (min_heap.size() == sample_k) { - std::make_heap(min_heap.begin(), min_heap.end(), comp); - } else { - beam_top_k_res nr({n, token_scores[row_off + j][n], beam_indices[row_off + j]}); - if (min_heap.front().score < nr.score) { - std::pop_heap(min_heap.begin(), min_heap.end(), comp); - min_heap.back().id = nr.id; - min_heap.back().score = nr.score; - min_heap.back().beam_idx = nr.beam_idx; - std::push_heap(min_heap.begin(), min_heap.end(), comp); - } + int n = 0; + for (; min_heap.size() < sample_k; ++n) { + min_heap.push_back(beam_top_k_res({n, token_scores[row_off + j][n], beam_indices[row_off + j]})); + } + if (min_heap.size() == sample_k) { + std::make_heap(min_heap.begin(), min_heap.end(), comp); + } + for (; n < n_vocab; ++n) { + beam_top_k_res nr({n, token_scores[row_off + j][n], beam_indices[row_off + j]}); + if (min_heap.front().score < nr.score) { + std::pop_heap(min_heap.begin(), min_heap.end(), comp); + min_heap.back().id = nr.id; + min_heap.back().score = nr.score; + min_heap.back().beam_idx = nr.beam_idx; + std::push_heap(min_heap.begin(), min_heap.end(), comp); } } } @@ -2195,6 +2193,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { int batch_size = 0; uint32_t 
cur_len = 0; std::vector beam_indices; + std::vector beams_score; for (int i = 0; i < beam_size; ++i) { // is done or not if (!cur_beams[i].eos()) { @@ -2209,13 +2208,14 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { infer_beam_ids[i] = record++; batch_size++; beam_indices.push_back(i); + beams_score.push_back(cur_beams[i].score); } } // DEBUG -#if 0 +#if 1 printf("====================== \n"); for (auto kk : embd_inp) { - printf("%s \n", (ctx->vocab.id_to_token.at(kk).tok).c_str()); + printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); } #endif ctx->batch_size = batch_size; @@ -2236,19 +2236,17 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { logits_info li(ctx); li.compute_log_softmax_logits(); lp.process(cur_len, li.next_token_scores, 50256); // TODO ctx->model.eos_id; - li.compute_next_token_scores(cur_beams); + li.compute_next_token_scores(beams_score); const int sample_scale = 2; std::vector next_tokens = beam_top_k(ctx, li.next_token_scores, {batch_size}, beam_indices, sample_scale); // std::vector> next_tokens = li.top_k(sample_num); // DEBUG -#if 0 - for (int k = 0; k < next_tokens.size(); ++k) { - printf("====================== \n"); - for (auto kk : next_tokens[k]) { - printf("%s, l: %3.6f, p: %0.6f \n", (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.logit, - li.log_probability_from_logit(k, kk.logit)); - } +#if 1 + printf("====================== \n"); + for (auto kk : next_tokens) { + printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, + kk.beam_idx); } #endif MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); @@ -2445,7 +2443,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c logits_info li(ctx); li.compute_log_softmax_logits(); lp.process(0, li.next_token_scores, 50256); // TODO ctx->model.eos_id; - li.compute_next_token_scores({cur_beams[0]}); + li.compute_next_token_scores({0.0f}); std::vector 
next_tokens = beam_top_k(ctx, li.next_token_scores, {1}, {0}, beam_size); MODEL_ASSERT(next_tokens.size() == beam_size); cur_beams.clear(); @@ -2468,7 +2466,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c // beam_score_length_penalize(); } -#if 0 // DEBUG: print current beams for this iteration +#if 1 // DEBUG: print current beams for this iteration printf("\n\nCurrent beams:\n"); for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); From 49739fc8b59b1a34e82172cd815f430c3251e493 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Wed, 13 Sep 2023 10:05:59 +0000 Subject: [PATCH 03/14] fix beam_top_k Signed-off-by: Yu, Zhentao --- .../llm/runtime/graph/models/gptj/gptj.cpp | 4 ++-- .../graph/models/model_utils/model_utils.cpp | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp index 5960217860d..995e4f8d3e5 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/gptj/gptj.cpp @@ -225,14 +225,14 @@ static bool gptj_model_eval_internal(model_context& lctx, const model_token* tok std::vector v_bs(batch_size); for (int i = 0; i < batch_size; ++i) { if (run_mha_fp16) { - // batch K + // batch V Vcur_bs[i] = ne_view_4d(ctx0, Vcur, n_embd / n_head, n_head, N, 1, ne_element_size(Vcur) * n_embd / n_head, ne_element_size(Vcur) * n_embd, ne_element_size(Vcur) * n_embd * N, i * ne_element_size(Vcur) * n_embd * N); v_bs[i] = ne_view_1d(ctx0, kv_self.v, n_embd * N * 1, (ne_element_size(kv_self.v) * n_embd) * (il * n_ctx * kv_n_ctx_block + n_past) + i * n_ctx * n_embd * ne_element_size(kv_self.v)); - // batch V + // batch K Kcur_bs[i] = ne_permute(ctx0, ne_reshape_4d(ctx0, ne_view_2d(ctx0, Kcur, n_embd, N, ne_element_size(Kcur) * n_embd, diff --git 
a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index a9ca28e7876..151bd7be98b 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2054,12 +2054,13 @@ std::vector beam_top_k(const model_context* ctx, const std::vect min_heap.reserve(sample_k); for (int j = 0; j < num_beam; ++j) { int n = 0; - for (; min_heap.size() < sample_k; ++n) { - min_heap.push_back(beam_top_k_res({n, token_scores[row_off + j][n], beam_indices[row_off + j]})); - } - if (min_heap.size() == sample_k) { + if (j == 0) { // init heap + for (; n < sample_k; ++n) { + min_heap.push_back(beam_top_k_res({n, token_scores[row_off + j][n], beam_indices[row_off + j]})); + } std::make_heap(min_heap.begin(), min_heap.end(), comp); } + MODEL_ASSERT(min_heap.size() == sample_k); for (; n < n_vocab; ++n) { beam_top_k_res nr({n, token_scores[row_off + j][n], beam_indices[row_off + j]}); if (min_heap.front().score < nr.score) { @@ -2072,6 +2073,8 @@ std::vector beam_top_k(const model_context* ctx, const std::vect } } row_off += i * num_beam; + std::sort(min_heap.begin(), min_heap.end(), + [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }); for (const auto b : min_heap) { res.push_back(b); } @@ -2142,11 +2145,12 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // next setp for (auto it : kv_reorder_indices) { if (it.first != it.second) { + printf("%d: %d \n", it.first, it.second); uint32_t len = next_beams[it.first].token_ids.size() - 1; // last token in beam is for next step inference MODEL_ASSERT(len == n_past - n_prompt_tokens); size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * n_embd; - size_t input_token_offset_v = n_prompt_tokens * 
ne_element_size(ctx->model.kv_self.v); + size_t input_token_offset_v = 0; //n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); if (len + n_prompt_tokens > n_ctx) { // all token hidden states cache should be updated input_token_offset_k = 0; @@ -2174,7 +2178,7 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + it.second * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + n_ctx * ne_element_size(ctx->model.kv_self.v) + input_token_offset_v), - ne_element_size(ctx->model.kv_self.v) * len); + ne_element_size(ctx->model.kv_self.v) * n_past); } } } @@ -2246,7 +2250,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { printf("====================== \n"); for (auto kk : next_tokens) { printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, - kk.beam_idx); + kk.beam_idx); } #endif MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); @@ -2263,8 +2267,6 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } } if (next_beams.size() < beam_size) { - std::sort(next_tokens.begin(), next_tokens.end(), - [](beam_top_k_res& a, beam_top_k_res& b) { return a.score < b.score; }); int add_num = beam_size - next_beams.size(); for (int j = 0; j < add_num; ++j) { beam next_beam = cur_beams[next_tokens[j].beam_idx]; @@ -2336,7 +2338,7 @@ std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() MODEL_ASSERT(next_beams.size() == beam_size); MODEL_ASSERT(cur_beams.size() == beam_size); // DEBUG -#if 0 +#if 1 printf("cur_beams: "); for (int i = 0; i < beam_size; ++i) { printf("%d, ", cur_beams[i].infer_bs_id); @@ -2379,7 +2381,7 @@ std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() } // beams should be ordered by batch id std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); -#if 0 // 
DEBUG +#if 1 // DEBUG printf("cpy_final_bs_ids: "); for (int i = 0; i < beam_size; ++i) { printf("%d, ", cpy_final_bs_ids[i]); From 2e023f17159d8a9621eb5068deff54f067d5cd3f Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Thu, 14 Sep 2023 05:19:13 +0000 Subject: [PATCH 04/14] fix v cache memcpy Signed-off-by: Yu, Zhentao --- .../graph/models/model_utils/model_utils.cpp | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 151bd7be98b..51ea9d8fd9c 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2145,12 +2145,11 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // next setp for (auto it : kv_reorder_indices) { if (it.first != it.second) { - printf("%d: %d \n", it.first, it.second); uint32_t len = next_beams[it.first].token_ids.size() - 1; // last token in beam is for next step inference MODEL_ASSERT(len == n_past - n_prompt_tokens); size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * n_embd; - size_t input_token_offset_v = 0; //n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); + size_t input_token_offset_v = n_prompt_tokens * ne_element_size(ctx->model.kv_self.v); if (len + n_prompt_tokens > n_ctx) { // all token hidden states cache should be updated input_token_offset_k = 0; @@ -2177,8 +2176,8 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t static_cast(ctx->model.kv_self.v->data) + (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + it.second * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + - n_ctx * ne_element_size(ctx->model.kv_self.v) + input_token_offset_v), 
- ne_element_size(ctx->model.kv_self.v) * n_past); + n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), + ne_element_size(ctx->model.kv_self.v) * len); } } } @@ -2216,7 +2215,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } } // DEBUG -#if 1 +#if 0 printf("====================== \n"); for (auto kk : embd_inp) { printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); @@ -2246,7 +2245,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { beam_top_k(ctx, li.next_token_scores, {batch_size}, beam_indices, sample_scale); // std::vector> next_tokens = li.top_k(sample_num); // DEBUG -#if 1 +#if 0 printf("====================== \n"); for (auto kk : next_tokens) { printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, @@ -2258,7 +2257,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { for (int i = 0; i < beam_size; ++i) { beam b = cur_beams[i]; if (b.eos()) { - printf("eos \n"); + // printf("eos \n"); if (b.score != 100) { b.eos_score = b.score; b.score = 100; @@ -2338,7 +2337,7 @@ std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() MODEL_ASSERT(next_beams.size() == beam_size); MODEL_ASSERT(cur_beams.size() == beam_size); // DEBUG -#if 1 +#if 0 printf("cur_beams: "); for (int i = 0; i < beam_size; ++i) { printf("%d, ", cur_beams[i].infer_bs_id); @@ -2381,7 +2380,7 @@ std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() } // beams should be ordered by batch id std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); -#if 1 // DEBUG +#if 0 // DEBUG printf("cpy_final_bs_ids: "); for (int i = 0; i < beam_size; ++i) { printf("%d, ", cpy_final_bs_ids[i]); @@ -2468,7 +2467,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c // beam_score_length_penalize(); } -#if 1 // DEBUG: print current beams for this iteration +#if 0 
// DEBUG: print current beams for this iteration printf("\n\nCurrent beams:\n"); for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); @@ -2485,7 +2484,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c } beam_score_length_penalize(); const beam& top_b = top_beam(); -#if 1 // DEBUG: print current beams for this iteration +#if 0 // DEBUG: print current beams for this iteration printf("\n\nCurrent beams:\n"); for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); From 179092bf52b7b79203ea1dbce5e7274065afa119 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Fri, 15 Sep 2023 02:35:00 +0000 Subject: [PATCH 05/14] fix kv cache memcpy order Signed-off-by: Yu, Zhentao --- .../graph/models/model_utils/model_utils.cpp | 128 +++++++++++------- .../graph/models/model_utils/model_utils.h | 4 +- 2 files changed, 81 insertions(+), 51 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 51ea9d8fd9c..5df09212786 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -1992,7 +1992,7 @@ struct logits_info { #pragma omp parallel for for (int j = 0; j < n_vocab; ++j) { float score = log_probability_from_logit(i, *(logits + i * bs_stride + offset + j)); - next_token_scores[i].push_back(score); + next_token_scores[i].push_back(std::move(score)); } } } @@ -2006,33 +2006,39 @@ struct logits_info { } } - // Return top k token_data by score. 
(batch, top_k) - // std::vector> top_k(const int& k) { - // std::vector> min_heap(batch_size); // min-heap by logit - // int tk = std::min(k, n_vocab); - // // min_heap.reserve(batch_size * tk); - // for (int idx = 0; idx < batch_size; ++idx) { - // for (int32_t token_idx = 0; token_idx < tk; ++token_idx) { - // min_heap[idx].push_back(get_token_data(idx, token_idx)); - // } - // } - // auto comp = [](const model_token_data& a, const model_token_data& b) { return a.logit > b.logit; }; - // for (int idx = 0; idx < batch_size; ++idx) { - // std::make_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); - // for (int32_t token_idx = tk; token_idx < n_vocab; ++token_idx) { - // if (min_heap[idx].front().logit < get_token_data(idx, token_idx).logit) { - // std::pop_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); - // min_heap[idx].back().id = token_idx; - // min_heap[idx].back().logit = get_token_data(idx, token_idx).logit; - // std::push_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); - // } - // } - // } - // return min_heap; - // } + // Return top k token_data by logit in n_vocab dim. (request_bs*num_beam, top_k) + // each beam gives top_k results --> + prev_scores --> from (num_beam * top_k) sort num_beam + // however, huggingface transformers repo implements like this: + // log_softmax(num_beam*n_vocab) -- > + prev_scores --> sort num_beam + // huggingface outputs text with better quality but computing all log_softmax brings overhead + // we keep this `logits_top_k` for further acceleration if needed ( + // quality & latency tradeoff, or sample num k = beam_size? 
) + std::vector> logits_top_k(const int& k) { + std::vector> min_heap(batch_size); // min-heap by logit + int tk = std::min(k, n_vocab); + // min_heap.reserve(batch_size * tk); + for (int idx = 0; idx < batch_size; ++idx) { + for (int32_t token_idx = 0; token_idx < tk; ++token_idx) { + min_heap[idx].push_back(get_token_data(idx, token_idx)); + } + } + auto comp = [](const model_token_data& a, const model_token_data& b) { return a.logit > b.logit; }; + for (int idx = 0; idx < batch_size; ++idx) { + std::make_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); + for (int32_t token_idx = tk; token_idx < n_vocab; ++token_idx) { + if (min_heap[idx].front().logit < get_token_data(idx, token_idx).logit) { + std::pop_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); + min_heap[idx].back().id = token_idx; + min_heap[idx].back().logit = get_token_data(idx, token_idx).logit; + std::push_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); + } + } + } + return min_heap; + } }; -// Return top k token_data by score. (prompt_bs * 2 * num_beam) +// Return top k token_data by score. 
(prompt_bs * sample_scale * num_beam) std::vector beam_top_k(const model_context* ctx, const std::vector>& token_scores, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale, const int& dim) { @@ -2111,7 +2117,7 @@ void logits_processor::process(const uint32_t& cur_len, std::vector& kv_reorder_indices, + const std::vector>& kv_reorder_indices, const std::vector& next_beams) { // first step if (n_past == n_prompt_tokens) { @@ -2143,9 +2149,12 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t } } else if (n_past > n_prompt_tokens) { // next setp - for (auto it : kv_reorder_indices) { - if (it.first != it.second) { - uint32_t len = next_beams[it.first].token_ids.size() - 1; + for (auto t : kv_reorder_indices) { + int cur_id = std::get<0>(t); + int cpy_id = std::get<1>(t); + if (cur_id != cpy_id) { + // printf("it.first: %d, it.second: %d \n", cur_id, cpy_id); + uint32_t len = next_beams[cur_id].token_ids.size() - 1; // last token in beam is for next step inference MODEL_ASSERT(len == n_past - n_prompt_tokens); size_t input_token_offset_k = n_prompt_tokens * ne_element_size(ctx->model.kv_self.k) * n_embd; @@ -2161,21 +2170,21 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // [n_embd, N] memcpy(static_cast(ctx->model.kv_self.k->data) + (i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - it.first * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + + cur_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd) + input_token_offset_k, static_cast(ctx->model.kv_self.k->data) + i * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd * kv_n_ctx_block + - it.second * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + input_token_offset_k, + cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.k) * n_embd + input_token_offset_k, ne_element_size(ctx->model.kv_self.k) * n_embd * len); // [N, n_embd] for (int k = 0; k < n_embd; 
++k) { memcpy(static_cast(ctx->model.kv_self.v->data) + (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - it.first * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + + cur_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), static_cast(ctx->model.kv_self.v->data) + (i * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd * kv_n_ctx_block + - it.second * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + + cpy_id * n_ctx * ne_element_size(ctx->model.kv_self.v) * n_embd + n_ctx * ne_element_size(ctx->model.kv_self.v) * k + input_token_offset_v), ne_element_size(ctx->model.kv_self.v) * len); } @@ -2257,7 +2266,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { for (int i = 0; i < beam_size; ++i) { beam b = cur_beams[i]; if (b.eos()) { - // printf("eos \n"); + // printf("---------------------eos-----------------------> \n"); if (b.score != 100) { b.eos_score = b.score; b.score = 100; @@ -2321,7 +2330,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } // get kv cache reorder indices, -// k: dst_beam batch idx, v: src_beam batch idx +// idx_0: dst_beam batch idx, idx_1: src_beam batch idx // for copy predicted past token kv cache // for example: // - c @@ -2331,9 +2340,9 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { // - f | - ad // b -| ---------->| // - g -// kv_cache_reorder_indices = {0:0, 1:0} +// kv_cache_reorder_indices = {{0,0}, {1,0}} // if kv_cache_reorder_indices = {0:0, 1:1}, then do not need reorder (cpy) -std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() { +std::vector> beam_search_flow::update_kv_cache_reorder_indices() { MODEL_ASSERT(next_beams.size() == beam_size); MODEL_ASSERT(cur_beams.size() == beam_size); // DEBUG @@ -2349,7 +2358,7 @@ std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() } printf("\n"); #endif - 
std::unordered_map kv_reorder_indices; + std::vector> kv_reorder_indices; kv_reorder_indices.reserve(beam_size); // shuffle beams which are early stopped (eos) // keep them behind beams which have non-eos @@ -2374,12 +2383,41 @@ std::unordered_map beam_search_flow::update_kv_cache_reorder_indices() // update indices and batch ids for (int i = 0; i < beam_size; ++i) { - kv_reorder_indices[i] = cpy_final_bs_ids[i]; // update infer_bs_id before next beam generation next_beams[nb_shuffle_ids[i]].infer_bs_id = i; } // beams should be ordered by batch id std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); + + // we arrange beams by inference batch indice rather score for memcpy time reduction + // so there will be 2 circumstances (ignore no memcpy : 0,1,2,3 --> 0,1,2,3) + // 1. cpoy former beams into latter beams, like: 0,1,2,3 --> 0,0,0,1 + // 2. copy latter beams into former beams, like: 0,1,2,3 -- > 1,2,2,3 + // kv cache memcpy happens in itself which would cause memory dislocation if follows wrong order + // so we give the contrary order to beams vector indice, which is: + // if 1, copy from tail + // if 2, copy from head + bool cpy_from_head = true; + int dst_idx_sum = 0; + int src_idx_sum = 0; + for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { + dst_idx_sum += i; + src_idx_sum += cpy_final_bs_ids[i]; + if (src_idx_sum < dst_idx_sum) { + cpy_from_head = false; + break; + } + } + if (cpy_from_head) { + for (int i = 0; i < cpy_final_bs_ids.size(); ++i) { + kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); + } + } else { + for (int i = cpy_final_bs_ids.size() - 1; i >=0; --i) { + kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); + } + } + #if 0 // DEBUG printf("cpy_final_bs_ids: "); for (int i = 0; i < beam_size; ++i) { @@ -2459,7 +2497,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c // beam_score_length_penalize(); } else { fill_next_beams_by_top_probabilities(); - 
std::unordered_map kv_reorder_indices = update_kv_cache_reorder_indices(); + std::vector> kv_reorder_indices = update_kv_cache_reorder_indices(); n_past += 1; kv_reorder->update(n_past, n_tokens, kv_reorder_indices, next_beams); cur_beams.swap(next_beams); @@ -2484,14 +2522,6 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c } beam_score_length_penalize(); const beam& top_b = top_beam(); -#if 0 // DEBUG: print current beams for this iteration - printf("\n\nCurrent beams:\n"); - for (size_t j = 0; j < cur_beams.size(); ++j) { - printf("beams[%d]: ", j); - cur_beams[j].print(); - fflush(stdout); - } -#endif #if 0 // DEBUG: print final beam result printf("\n\nFinal beam:\n"); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index aaae83b1175..74b9ce1f773 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -324,7 +324,7 @@ class beam_search_kv_cache_reorder { ~beam_search_kv_cache_reorder() {} virtual void update(const uint32_t& n_past, const uint32_t& n_prompt_tokens, - const std::unordered_map& kv_reorder_indices = {}, + const std::vector>& kv_reorder_indices = {}, const std::vector& next_beams = {}); private: @@ -350,7 +350,7 @@ class beam_search_flow { private: void fill_next_beams_by_top_probabilities(); - std::unordered_map update_kv_cache_reorder_indices(); + std::vector> update_kv_cache_reorder_indices(); void beam_score_length_penalize(); const beam& top_beam(); From 774e20301fe11185f1b2207326a352a5a46b44fd Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Fri, 15 Sep 2023 05:53:19 +0000 Subject: [PATCH 06/14] rm eos_score modification Signed-off-by: Yu, Zhentao --- .../graph/models/model_utils/model_utils.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 
deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 5df09212786..2dddcd8c583 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2224,7 +2224,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } } // DEBUG -#if 0 +#if 1 printf("====================== \n"); for (auto kk : embd_inp) { printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); @@ -2254,7 +2254,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { beam_top_k(ctx, li.next_token_scores, {batch_size}, beam_indices, sample_scale); // std::vector> next_tokens = li.top_k(sample_num); // DEBUG -#if 0 +#if 1 printf("====================== \n"); for (auto kk : next_tokens) { printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, @@ -2266,11 +2266,11 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { for (int i = 0; i < beam_size; ++i) { beam b = cur_beams[i]; if (b.eos()) { - // printf("---------------------eos-----------------------> \n"); - if (b.score != 100) { - b.eos_score = b.score; - b.score = 100; - } + printf("---------------------eos-----------------------> \n"); + // if (b.score != 100) { + // b.eos_score = b.score; + // b.score = 100; + // } next_beams.push_back(std::move(b)); } } @@ -2505,7 +2505,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c // beam_score_length_penalize(); } -#if 0 // DEBUG: print current beams for this iteration +#if 1 // DEBUG: print current beams for this iteration printf("\n\nCurrent beams:\n"); for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); @@ -2515,11 +2515,11 @@ std::vector beam_search_flow::loop(const 
model_token* tokens_inp, c #endif } - for (auto& b : cur_beams) { - if (b.eos()) { - b.score = b.eos_score; - } - } + // for (auto& b : cur_beams) { + // if (b.eos()) { + // b.score = b.eos_score; + // } + // } beam_score_length_penalize(); const beam& top_b = top_beam(); From 056a752be7c7d196352132981dfce24256adf20e Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Fri, 15 Sep 2023 09:32:11 +0000 Subject: [PATCH 07/14] add beam_hypotheses and early_stopping Signed-off-by: Yu, Zhentao --- .../graph/models/model_utils/model_types.h | 2 +- .../graph/models/model_utils/model_utils.cpp | 79 +++++++++---------- .../graph/models/model_utils/model_utils.h | 57 ++++++++++++- 3 files changed, 94 insertions(+), 44 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h index 809fea7cd34..a3e672fbcec 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h @@ -230,7 +230,7 @@ struct generation_config { // likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while // `length_penalty` < 0.0 encourages shorter sequences. 
(default = 1.0) float length_penalty = 1.0f; - bool do_early_stopping = false; // TODO + bool do_early_stopping = true; }; class beam_search_kv_cache_reorder; // forward declaration diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 2dddcd8c583..9887abbb7a3 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2200,28 +2200,26 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t void beam_search_flow::fill_next_beams_by_top_probabilities() { auto const comp = [](const beam& a, const beam& b) { return a.score > b.score; }; std::vector embd_inp; - std::vector infer_beam_ids(beam_size); + // std::vector infer_beam_ids(beam_size); int record = 0; int batch_size = 0; uint32_t cur_len = 0; std::vector beam_indices; std::vector beams_score; for (int i = 0; i < beam_size; ++i) { - // is done or not - if (!cur_beams[i].eos()) { - if (cur_len != 0) { - MODEL_ASSERT(cur_len == cur_beams[i].token_ids.size()); - } else { - cur_len = cur_beams[i].token_ids.size(); - } - // (batch, 1) - // ordered by infer_bs_id - embd_inp.push_back(cur_beams[i].token_ids.back()); - infer_beam_ids[i] = record++; - batch_size++; - beam_indices.push_back(i); - beams_score.push_back(cur_beams[i].score); + MODEL_ASSERT(!cur_beams[i].eos()); + if (cur_len != 0) { + MODEL_ASSERT(cur_len == cur_beams[i].token_ids.size()); + } else { + cur_len = cur_beams[i].token_ids.size(); } + // (batch, 1) + // ordered by infer_bs_id + embd_inp.push_back(cur_beams[i].token_ids.back()); + // infer_beam_ids[i] = record++; + batch_size++; + beam_indices.push_back(i); + beams_score.push_back(cur_beams[i].score); } // DEBUG #if 1 @@ -2263,26 +2261,20 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() 
{ #endif MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); MODEL_ASSERT(next_beams.empty()); - for (int i = 0; i < beam_size; ++i) { - beam b = cur_beams[i]; - if (b.eos()) { - printf("---------------------eos-----------------------> \n"); - // if (b.score != 100) { - // b.eos_score = b.score; - // b.score = 100; - // } - next_beams.push_back(std::move(b)); - } - } - if (next_beams.size() < beam_size) { - int add_num = beam_size - next_beams.size(); - for (int j = 0; j < add_num; ++j) { - beam next_beam = cur_beams[next_tokens[j].beam_idx]; - next_beam.token_ids.push_back(next_tokens[j].id); - next_beam.score = next_tokens[j].score; + for (int i = 0; i < next_tokens.size(); ++i) { + if (next_tokens[i].id == 50256) { // TODO ctx->model_vocab.eos_id + beam_hypos[0].add(cur_beams[next_tokens[i].beam_idx]); + } else { + beam next_beam = cur_beams[next_tokens[i].beam_idx]; + next_beam.token_ids.push_back(next_tokens[i].id); + next_beam.score = next_tokens[i].score; next_beams.push_back(std::move(next_beam)); } + if (next_beams.size() == beam_size) { + break; + } } + // for (int i = 0; i < beam_size; ++i) { // beam b = cur_beams[i]; // if (b.eos()) { @@ -2413,7 +2405,7 @@ std::vector> beam_search_flow::update_kv_cache_reorder_indi kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); } } else { - for (int i = cpy_final_bs_ids.size() - 1; i >=0; --i) { + for (int i = cpy_final_bs_ids.size() - 1; i >= 0; --i) { kv_reorder_indices.push_back({i, cpy_final_bs_ids[i]}); } } @@ -2446,8 +2438,10 @@ void beam_search_flow::beam_score_length_penalize() { // Return beam with highest probability. 
const beam& beam_search_flow::top_beam() { - auto const by_score = [](beam const& a, beam const& b) { return a.score < b.score; }; - return *std::max_element(cur_beams.begin(), cur_beams.end(), by_score); + for (const auto b : cur_beams) { + beam_hypos[0].add(b); + } + return beam_hypos[0].top1(); } // TODO batch_size = 1 only @@ -2473,7 +2467,8 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c if (kv_reorder == nullptr) { kv_reorder = std::make_shared(ctx); } - for (int n = 0; n < max_new_tokens && !std::all_of(cur_beams.begin(), cur_beams.end(), eos); ++n) { + beam_hypos.push_back(beam_hypotheses(ctx)); // TODO ctx->request_running_bs; + for (int n = 0; n < max_new_tokens; ++n) { // first step if (n_past == 0) { model_eval(ctx, embd.data(), n_tokens, n_past, num_threads); @@ -2513,14 +2508,14 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c fflush(stdout); } #endif + + auto const done = [](const beam_hypotheses& bh) { return bh.is_done(); }; + if (std::all_of(beam_hypos.begin(), beam_hypos.end(), done)) { + break; + } } - // for (auto& b : cur_beams) { - // if (b.eos()) { - // b.score = b.eos_score; - // } - // } - beam_score_length_penalize(); + // beam_score_length_penalize(); const beam& top_b = top_beam(); #if 0 // DEBUG: print final beam result diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index 74b9ce1f773..ce0ac12fd81 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -278,7 +278,6 @@ struct beam { std::vector token_ids; // Cumulative beam score (log-softmax here) float score; - float eos_score; // record inference batch indice int infer_bs_id; // end-of-text @@ -292,6 +291,61 @@ struct beam { } }; +struct beam_hypotheses { + const 
model_context* const ctx = nullptr; + const int num_beams; + const float length_penalty = 1.0f; + const bool early_stopping = false; + std::vector beams; + + beam_hypotheses(model_context* lctx) + : ctx(lctx), + num_beams(lctx->beam_size), + length_penalty(lctx->generation_conf.length_penalty), + early_stopping(lctx->generation_conf.do_early_stopping) { + beams.reserve(lctx->beam_size); + } + + int len() { return beams.size(); } + + void add(beam b) { + auto comp = [](const beam& a, const beam& b) { return a.score > b.score; }; + int cur_len = b.eos() ? b.token_ids.size() - 1 : b.token_ids.size(); + float score = b.score / std::pow(cur_len, length_penalty); + b.score = score; + if (beams.size() < num_beams) { + beams.push_back(std::move(b)); + if (beams.size() == num_beams) { + std::make_heap(beams.begin(), beams.end(), comp); + } + } else { + MODEL_ASSERT(beams.size() == num_beams); + if (beams.front().score > b.score) { + return; + } + std::pop_heap(beams.begin(), beams.end(), comp); + beams.back() = b; + std::push_heap(beams.begin(), beams.end(), comp); + } + } + + const bool is_done() const { + if (beams.size() < num_beams) { + return false; + } + // stop as soon as at least `num_beams` hypotheses are finished + if (early_stopping) { + return true; + } + return false; + } + + const beam& top1() const { + auto const by_score = [](beam const& a, beam const& b) { return a.score < b.score; }; + return *std::max_element(beams.begin(), beams.end(), by_score); + } +}; + struct logits_info; class logits_processor { @@ -358,6 +412,7 @@ class beam_search_flow { const int beam_size; std::vector cur_beams; std::vector next_beams; + std::vector beam_hypos; size_t n_past = 0; int num_threads = 4; // default by 4 logits_processor lp; From 4951748efa0dbba88598431443e220128beb33df Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Mon, 18 Sep 2023 08:16:40 +0000 Subject: [PATCH 08/14] log_softmax reduction and align early_stopping Signed-off-by: Yu, Zhentao --- 
.../runtime/graph/application/pybind_gptj.cpp | 166 ++++++----- .../graph/models/model_utils/model_types.h | 2 +- .../graph/models/model_utils/model_utils.cpp | 277 ++++++++++-------- .../graph/models/model_utils/model_utils.h | 34 ++- 4 files changed, 263 insertions(+), 216 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index 8318ef2b8a2..ff84982ff8e 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -50,7 +50,8 @@ bool gptj_model_eval_ids(model_context* ctx, model_token* tokens, size_t n_eval, extern "C" { void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, float temp, float repeat_penalty, bool perplexity, int n_ctx, const char* model_file, bool beam_search = false, int beam_size = 4, - int batch_size = 1, int n_threads = 56, int min_new_tokens = 0, float length_penalty = 1.0) { + int batch_size = 1, int n_threads = 56, int min_new_tokens = 0, float length_penalty = 1.0, + bool do_early_stopping = false) { gpt_params params; params.n_threads = n_threads; params.seed = seed; @@ -80,6 +81,7 @@ void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, fl } ctx->generation_conf.min_new_tokens = min_new_tokens; ctx->generation_conf.length_penalty = length_penalty; + ctx->generation_conf.do_early_stopping = do_early_stopping; return (void*)ctx; } @@ -220,91 +222,95 @@ int main(int argc, char* argv[]) { return 1; } - auto gptj_in_all_bs = init_gptj(1234, 32, 32, 40, 1.0, 0.8, 1.02, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0); + auto gptj_in_all_bs = + init_gptj(1234, 32, 32, 40, 1.0, 0.8, 1.02, false, 2048, argv[1], true, 4, 1, 56, 30, 1.0, true); std::vector ctxs = {gptj_in_all_bs}; for (auto gptj_in_all : ctxs) { auto res = eval_gptj_char( gptj_in_all, - 
// "she opened the door and see", - "Once upon a time", + "she opened the door and see", + // "Once upon a time", // "A spaceship lands on the moon", - // "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing " - // "on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. " - // "There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went " - // "right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. " - // "Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of " - // "enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I " - // "could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that " - // "evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt " - // "permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a " - // "species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. " - // "That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve " - // "something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by " - // "creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that " - // "rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to " - // "create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call " - // "it evolution. This is a problem, of course, every other contestant also had to face. 
And judging by the " - // "entries submitted, not many managed to work around it. I'd say the only real solution was through the use of " - // "artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. Alas, this " - // "is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed " - // "myself to pick whatever I thought would work out. My initial idea was to create something where humanity " - // "tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had " - // "this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space " - // "Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next " - // "inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are " - // "you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow " - // "gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it " - // "sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey " - // "(who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it " - // "involved into the idea of having individual pieces of pasta flying around and trying to evolve until they " - // "became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti " - // "Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: " - // "you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, " - // "each with their own plate. Your plate can spawn little pieces of pasta. 
You do so by 'ordering' them through " - // "a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', " - // "which are debited from your credits (you start with a number of credits). Once spawned, your pastas start " - // "flying around. Their instinct is to fly to other plates, in order to conquer them (the objective of the game " - // "is having your pasta conquer all the plates on the table). But they are really autonomous, so after being " - // "spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other " - // "people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other " - // "pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. " - // "It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If " - // "pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, " - // "until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every " - // "plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, " - // "Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and " - // "Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge " - // "with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing " - // "for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It " - // "made the livestream pretty effortless and the features are awesome, even for the free version. It was great " - // "to have some of my friends watch me, and then interact with them and random people through chat. 
It was also " - // "good knowing that I was also recording a local version of the files, so I could make a timelapse video later. " - // "Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if " - // "someone was watching over my shoulder. It made me realize that sometimes I spend too much time in seemingly " - // "inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll " - // "probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty " - // "much spend half of the time writing a line and the other half fixing the crazy characters in it. My own " - // "stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the " - // "cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on " - // "my second monitor all the time. It's actually a bit sad, because if I could, I could have spent the whole " - // "weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, " - // "when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended " - // "up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but " + // "What is the meaning of life?", + // "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing + // " "on the web works, but you have to simulate multiple touch for table moving and that can be a bit + // confusing. " "There is a lot I'd like to talk about. I will go through every topic, insted of making the + // typical what went " "right/wrong list. Concept Working over the theme was probably one of the hardest tasks + // which I had to face. 
" "Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - + // something with a lot of " "enemies/actors, simple graphics, maybe set in space, controlled from a top-down + // view. I was confident that I " "could fit any theme around it. In the end, the problem with a theme like + // 'Evolution' in a game is that " "evolution is unassisted. It happens through several seemingly random + // mutations over time, with the most apt " "permutation surviving. This genetic car simulator is, in my + // opinion, a great example of actual evolution of a " "species facing a challenge. But is it a game? In a game, + // you need to control something to reach an objective. " "That control goes against what evolution is supposed + // to be like. If you allow the user to pick how to evolve " "something, it's not evolution anymore - it's the + // equivalent of intelligent design, the fable invented by " "creationists to combat the idea of evolution. + // Being agnostic and a Pastafarian, that's not something that " "rubbed me the right way. Hence, my biggest + // dillema when deciding what to create was not with what I wanted to " "create, but with what I did not. I + // didn't want to create an 'intelligent design' simulator and wrongly call " "it evolution. This is a problem, + // of course, every other contestant also had to face. And judging by the " "entries submitted, not many managed + // to work around it. I'd say the only real solution was through the use of " "artificial selection, somehow. So + // far, I haven't seen any entry using this at its core gameplay. Alas, this " "is just a fun competition and + // after a while I decided not to be as strict with the game idea, and allowed " "myself to pick whatever I + // thought would work out. My initial idea was to create something where humanity " "tried to evolve to a next + // level, but had some kind of foe trying to stop them from doing so. 
I kind of had " "this image of human souls + // flying in space towards a monolith or a space baby (all based in 2001: A Space " "Odyssey of course) but I + // couldn't think of compelling (read: serious) mechanics for that. Borgs were my next " "inspiration, as their + // whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are " "you the borg, or + // fighting the Borg? The third and final idea came to me through my girlfriend, who somehow " "gave me the idea + // of making something about the evolution of Pasta. The more I thought about it the more it " "sounded like it + // would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey " + // "(who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as + // it " "involved into the idea of having individual pieces of pasta flying around and trying to evolve until + // they " "became all-powerful. A secondary idea here was that the game would work to explain how the Flying + // Spaghetti " "Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less + // into this: " "you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests + // at the table, " "each with their own plate. Your plate can spawn little pieces of pasta. You do so by + // 'ordering' them through " "a menu. Some pastas are better than others; some are faster, some are stronger. + // They have varying 'costs', " "which are debited from your credits (you start with a number of credits). Once + // spawned, your pastas start " "flying around. Their instinct is to fly to other plates, in order to conquer + // them (the objective of the game " "is having your pasta conquer all the plates on the table). But they are + // really autonomous, so after being " "spawned, you have no control over your pasta (think DotA or LoL creeps). 
+ // Your pasta doesn't like other " "people's pasta, so if they meet, they shoot sauce at each other until one + // dies. You get credits for other " "pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it + // starts conquering it for its team. " "It takes around 10 seconds for a plate to be conquered; less if more + // pasta from the same team are around. If " "pasta from other team are around, though, they get locked down in + // their attempt, unable to conquer the plate, " "until one of them die (think Battlefield's standard 'Conquest' + // mode). You get points every second for every " "plate you own. Over time, the concept also evolved to use an + // Italian bistro as its main scenario. Carlos, " "Carlos' Bistro's founder and owner Setup No major changes + // were made from my work setup. I used FDT and " "Starling creating an Adobe AIR (ActionScript) project, all + // tools or frameworks I already had some knowledge " "with. One big change for me was that I livestreamed my + // work through a twitch.tv account. This was a new thing " "for me. As recommended by Roushey, I used a program + // called XSplit and I got to say, it is pretty amazing. It " "made the livestream pretty effortless and the + // features are awesome, even for the free version. It was great " "to have some of my friends watch me, and + // then interact with them and random people through chat. It was also " "good knowing that I was also recording + // a local version of the files, so I could make a timelapse video later. " "Knowing the video was being + // recorded also made me a lot more self-conscious about my computer use, as if " "someone was watching over my + // shoulder. It made me realize that sometimes I spend too much time in seemingly " "inane tasks (I ended up + // wasting the longest time just to get some text alignment the way I wanted - it'll " "probably drive someone + // crazy if they watch it) and that I do way too many typos where writing code. 
I pretty " "much spend half of + // the time writing a line and the other half fixing the crazy characters in it. My own " "stream was probably + // boring to watch since I was coding for the most time. But livestreaming is one of the " "cool things to do as + // a spectator too. It was great seeing other people working - I had a few tabs opened on " "my second monitor + // all the time. It's actually a bit sad, because if I could, I could have spent the whole " "weekend just + // watching other people working! But I had to do my own work, so I'd only do it once in a while, " "when + // resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended " "up + // going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but " // "I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how " - // "much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at " - // "the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They " - // "think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too much, " - // "you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area " - // "of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to " - // "indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color " - // "denotes the team owner. But it's something that fits so well into the design that people never realized it, " - // "until they were told about it. In the end, the idea of going with a full physical metaphor is one that should " - // "be done with care. Things that are very important risk becoming background noise, unless the player knows its " - // "importance. 
Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up " - // "adding it at the bottom to indicate your credits and bases owned, as well as the hideous " - // "out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple " - // "HUD from the start, especially one that indicated each team's colors and general state of the game without " - // "the need for zooming in and out. Development Development went fast.", + // "much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at + // " "the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They + // " "think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too + // much, " "you can't actually read it, so it's hard to know what's going on with the game until you zoom in to + // the area " "of a specific plate. One other solution that didn't turn out to be as perfect as I thought was + // how to " "indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its + // color " "denotes the team owner. But it's something that fits so well into the design that people never + // realized it, " "until they were told about it. In the end, the idea of going with a full physical metaphor is + // one that should " "be done with care. Things that are very important risk becoming background noise, unless + // the player knows its " "importance. Originally, I wanted to avoid any kind of heads-up display in my game. In + // the end, I ended up " "adding it at the bottom to indicate your credits and bases owned, as well as the + // hideous " "out-of-place-and-still-not-obvious 'Call Waiter' button. 
But in hindsight, I should have gone with + // a simple " "HUD from the start, especially one that indicated each team's colors and general state of the + // game without " "the need for zooming in and out. Development Development went fast.", 128, 40, 1.0, 0.8, 2048); std::cout << res << std::endl; exit_gptj(gptj_in_all); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h index a3e672fbcec..581feaff785 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h @@ -230,7 +230,7 @@ struct generation_config { // likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while // `length_penalty` < 0.0 encourages shorter sequences. (default = 1.0) float length_penalty = 1.0f; - bool do_early_stopping = true; + bool do_early_stopping = false; }; class beam_search_kv_cache_reorder; // forward declaration diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 9887abbb7a3..c9d9631d89d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -1973,8 +1973,8 @@ struct logits_info { next_token_scores.resize(batch_size); } - model_token_data get_token_data(const int& batch_idx, const int32_t& token_idx) const { - return {token_idx, *(logits + batch_idx * bs_stride + offset + token_idx), 0.0f}; + beam_top_k_res get_token_data(const int& batch_idx, const int32_t& token_idx) const { + return {token_idx, *(logits + batch_idx * bs_stride + offset + token_idx), -1}; } float probability_from_logit(const int& batch_idx, 
const float& logit) { @@ -1985,51 +1985,44 @@ struct logits_info { return std::log(probability_from_logit(batch_idx, logit)); } - void compute_log_softmax_logits() { -#pragma omp parallel for - for (int i = 0; i < batch_size; ++i) { - next_token_scores[i].reserve(n_vocab); -#pragma omp parallel for - for (int j = 0; j < n_vocab; ++j) { - float score = log_probability_from_logit(i, *(logits + i * bs_stride + offset + j)); - next_token_scores[i].push_back(std::move(score)); - } - } - } - - // token score + pre tokens score - void compute_next_token_scores(const std::vector& beams_score) { - MODEL_ASSERT(batch_size == beams_score.size()); -#pragma omp parallel for - for (int i = 0; i < batch_size; ++i) { - std::for_each(next_token_scores[i].begin(), next_token_scores[i].end(), [&](float& s) { s += beams_score[i]; }); - } - } - - // Return top k token_data by logit in n_vocab dim. (request_bs*num_beam, top_k) - // each beam gives top_k results --> + prev_scores --> from (num_beam * top_k) sort num_beam - // however, huggingface transformers repo implements like this: - // log_softmax(num_beam*n_vocab) -- > + prev_scores --> sort num_beam - // huggingface outputs text with better quality but computing all log_softmax brings overhead - // we keep this `logits_top_k` for further acceleration if needed ( - // quality & latency tradeoff, or sample num k = beam_size? 
) - std::vector> logits_top_k(const int& k) { - std::vector> min_heap(batch_size); // min-heap by logit +// void compute_log_softmax_logits() { +// #pragma omp parallel for +// for (int i = 0; i < batch_size; ++i) { +// next_token_scores[i].reserve(n_vocab); +// #pragma omp parallel for +// for (int j = 0; j < n_vocab; ++j) { +// float score = log_probability_from_logit(i, *(logits + i * bs_stride + offset + j)); +// next_token_scores[i].push_back(std::move(score)); +// } +// } +// } + +// // token score + pre tokens score +// void compute_next_token_scores(const std::vector& beams_score) { +// MODEL_ASSERT(batch_size == beams_score.size()); +// #pragma omp parallel for +// for (int i = 0; i < batch_size; ++i) { +// std::for_each(next_token_scores[i].begin(), next_token_scores[i].end(), [&](float& s) { s += beams_score[i]; }); +// } +// } + + // Return top k token_data by raw logit in n_vocab dim. (request_bs*num_beam, top_k) + std::vector> vocab_top_k(const int& k) { + std::vector> min_heap(batch_size); // min-heap by logit int tk = std::min(k, n_vocab); - // min_heap.reserve(batch_size * tk); for (int idx = 0; idx < batch_size; ++idx) { for (int32_t token_idx = 0; token_idx < tk; ++token_idx) { min_heap[idx].push_back(get_token_data(idx, token_idx)); } } - auto comp = [](const model_token_data& a, const model_token_data& b) { return a.logit > b.logit; }; + auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; for (int idx = 0; idx < batch_size; ++idx) { std::make_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); for (int32_t token_idx = tk; token_idx < n_vocab; ++token_idx) { - if (min_heap[idx].front().logit < get_token_data(idx, token_idx).logit) { + if (min_heap[idx].front().score < get_token_data(idx, token_idx).score) { std::pop_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); min_heap[idx].back().id = token_idx; - min_heap[idx].back().logit = get_token_data(idx, token_idx).logit; + 
min_heap[idx].back().score = get_token_data(idx, token_idx).score; std::push_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); } } @@ -2038,80 +2031,25 @@ struct logits_info { } }; -// Return top k token_data by score. (prompt_bs * sample_scale * num_beam) -std::vector beam_top_k(const model_context* ctx, const std::vector>& token_scores, - const std::vector& num_beams, const std::vector beam_indices, - const int& sample_scale, const int& dim) { - MODEL_ASSERT(dim == -1); // raise unimplemented error - MODEL_ASSERT(token_scores.size() == ctx->batch_size); // prompt bs * num_beam - MODEL_ASSERT(token_scores[0].size() == ctx->model.hparams.n_vocab); - const int request_bs = 1; // TODO ctx->request_running_num - MODEL_ASSERT(num_beams.size() == request_bs); - std::vector res; - res.reserve(sample_scale * std::accumulate(num_beams.begin(), num_beams.end(), 0)); - std::vector min_heap; - const uint32_t n_vocab = ctx->model.hparams.n_vocab; - size_t row_off = 0; - auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; - for (int i = 0; i < request_bs; ++i) { - const int num_beam = num_beams[i]; - const int sample_k = sample_scale * num_beam; - min_heap.clear(); - min_heap.reserve(sample_k); - for (int j = 0; j < num_beam; ++j) { - int n = 0; - if (j == 0) { // init heap - for (; n < sample_k; ++n) { - min_heap.push_back(beam_top_k_res({n, token_scores[row_off + j][n], beam_indices[row_off + j]})); - } - std::make_heap(min_heap.begin(), min_heap.end(), comp); - } - MODEL_ASSERT(min_heap.size() == sample_k); - for (; n < n_vocab; ++n) { - beam_top_k_res nr({n, token_scores[row_off + j][n], beam_indices[row_off + j]}); - if (min_heap.front().score < nr.score) { - std::pop_heap(min_heap.begin(), min_heap.end(), comp); - min_heap.back().id = nr.id; - min_heap.back().score = nr.score; - min_heap.back().beam_idx = nr.beam_idx; - std::push_heap(min_heap.begin(), min_heap.end(), comp); - } - } - } - row_off += i * num_beam; - 
std::sort(min_heap.begin(), min_heap.end(), - [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }); - for (const auto b : min_heap) { - res.push_back(b); - } - } - return res; -} - -void logits_processor::min_new_tokens_logits_process(const uint32_t& cur_len, - std::vector>& token_scores, - const model_vocab::id& eos_token_id) { +void logits_processor::min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { MODEL_ASSERT(ctx->generation_conf.min_new_tokens >= 0); if (ctx->generation_conf.min_new_tokens == 0 || ctx->generation_conf.min_new_tokens <= cur_len) { return; } else { - // batch_size (input_ptompt_bs * beam_size, n_vocab) - MODEL_ASSERT(token_scores.size() == ctx->batch_size); - MODEL_ASSERT(token_scores[0].size() == ctx->model.hparams.n_vocab); - int batch_size = token_scores.size(); - uint32_t n_vocab = token_scores[0].size(); + int batch_size = ctx->batch_size; + size_t offset = ctx->logits.size() / ctx->batch_size - ctx->model.hparams.n_vocab; + size_t bs_stride = ctx->logits.size() / ctx->batch_size; for (int i = 0; i < batch_size; ++i) { // forbidden to choose eos_token if cur_len < min_new_tokens - token_scores[i][eos_token_id] = NEG_INF; + *(model_get_logits(ctx) + i * bs_stride + offset + eos_token_id) = NEG_INF; } } } -void logits_processor::process(const uint32_t& cur_len, std::vector>& token_scores, - const model_vocab::id& eos_token_id) { +void logits_processor::process(const uint32_t& cur_len, const model_vocab::id& eos_token_id) { MODEL_ASSERT(model_get_logits(ctx) != nullptr); if (min_new_tokens > 0) { - min_new_tokens_logits_process(cur_len, token_scores, eos_token_id); + min_new_tokens_logits_process(cur_len, eos_token_id); } } @@ -2196,6 +2134,79 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t } } +// Return top k token_data by score. 
(prompt_bs * sample_scale * num_beam) +// each beam gives top_k results --> + prev_scores --> from (num_beam * top_k) sort num_beam +// for example, huggingface transformers repo implements like this: +// log_softmax(num_beam*n_vocab) -- > + prev_scores --> sort num_beam +// it's straightforward but computing all log_softmax brings overhead +// we sample top_k logits for each beam, than compute scores in these logits positions +// then we sample top_k results among all beams. +// this approach will accelerate sampling speed by log_softmax times reduction +std::vector beam_search_flow::beam_top_k(model_context* ctx, const uint32_t& cur_len, + const std::vector& beams_score, + const std::vector& num_beams, + const std::vector beam_indices, const int& sample_scale, + const int& dim) { + MODEL_ASSERT(dim == -1); // raise unimplemented error + const int request_bs = 1; // TODO ctx->request_running_num + logits_info li(ctx); + lp.process(cur_len, ctx->vocab.eos_token_id); + const int raw_k = sample_scale * beam_size; + // raw logits top_k + std::vector> raw_top_k = li.vocab_top_k(raw_k); + MODEL_ASSERT(raw_top_k.size() == ctx->batch_size); // request_bs * num_beam + MODEL_ASSERT(raw_top_k[0].size() == raw_k); + MODEL_ASSERT(beams_score.size() == ctx->batch_size); + // compute score: log_softmax + prev_score +#pragma omp parallel for + for (int i = 0; i < ctx->batch_size; ++i) { + std::for_each(raw_top_k[i].begin(), raw_top_k[i].end(), + [&](beam_top_k_res& r) { r.score = li.log_probability_from_logit(i, r.score) + beams_score[i]; }); + } + MODEL_ASSERT(num_beams.size() == request_bs); + std::vector res; + res.reserve(sample_scale * std::accumulate(num_beams.begin(), num_beams.end(), 0)); + std::vector min_heap; + const uint32_t n_vocab = ctx->model.hparams.n_vocab; + size_t row_off = 0; + auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; + for (int i = 0; i < request_bs; ++i) { + const int num_beam = num_beams[i]; + const int 
sample_k = sample_scale * num_beam; + MODEL_ASSERT(raw_k >= sample_k); + min_heap.clear(); + min_heap.reserve(sample_k); + for (int j = 0; j < num_beam; ++j) { + int n = 0; + if (j == 0) { // init heap + for (; n < sample_k; ++n) { + min_heap.push_back(beam_top_k_res( + {raw_top_k[row_off + j][n].id, raw_top_k[row_off + j][n].score, beam_indices[row_off + j]})); + } + std::make_heap(min_heap.begin(), min_heap.end(), comp); + } + MODEL_ASSERT(min_heap.size() == sample_k); + for (; n < raw_k; ++n) { + beam_top_k_res nr({raw_top_k[row_off + j][n].id, raw_top_k[row_off + j][n].score, beam_indices[row_off + j]}); + if (min_heap.front().score < nr.score) { + std::pop_heap(min_heap.begin(), min_heap.end(), comp); + min_heap.back().id = nr.id; + min_heap.back().score = nr.score; + min_heap.back().beam_idx = nr.beam_idx; + std::push_heap(min_heap.begin(), min_heap.end(), comp); + } + } + } + row_off += i * num_beam; + std::sort(min_heap.begin(), min_heap.end(), + [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }); + for (const auto b : min_heap) { + res.push_back(b); + } + } + return res; +} + // TODO debug info unify (function ptr?) 
void beam_search_flow::fill_next_beams_by_top_probabilities() { auto const comp = [](const beam& a, const beam& b) { return a.score > b.score; }; @@ -2222,7 +2233,7 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { beams_score.push_back(cur_beams[i].score); } // DEBUG -#if 1 +#if 0 printf("====================== \n"); for (auto kk : embd_inp) { printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); @@ -2243,16 +2254,16 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } #endif - logits_info li(ctx); - li.compute_log_softmax_logits(); - lp.process(cur_len, li.next_token_scores, 50256); // TODO ctx->model.eos_id; - li.compute_next_token_scores(beams_score); + // logits_info li(ctx); + // li.compute_log_softmax_logits(); + // lp.process(cur_len, li.next_token_scores, 50256); // TODO ctx->model.eos_id; + // li.compute_next_token_scores(beams_score); const int sample_scale = 2; std::vector next_tokens = - beam_top_k(ctx, li.next_token_scores, {batch_size}, beam_indices, sample_scale); + beam_top_k(ctx, cur_len, beams_score, {batch_size}, beam_indices, sample_scale); // std::vector> next_tokens = li.top_k(sample_num); // DEBUG -#if 1 +#if 0 printf("====================== \n"); for (auto kk : next_tokens) { printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, @@ -2262,8 +2273,15 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); MODEL_ASSERT(next_beams.empty()); for (int i = 0; i < next_tokens.size(); ++i) { - if (next_tokens[i].id == 50256) { // TODO ctx->model_vocab.eos_id - beam_hypos[0].add(cur_beams[next_tokens[i].beam_idx]); + if (next_tokens[i].id == ctx->vocab.eos_token_id) { + // if beam_token does not belong to top num_beams tokens, it should not be added + bool is_beam_token_worse_than_top_num_beams = i >= beam_size ? 
true: false; + if (is_beam_token_worse_than_top_num_beams) { + continue; + } + // update score with eos next token + cur_beams[next_tokens[i].beam_idx].score = next_tokens[i].score; + beam_hypos[0].add(cur_beams[next_tokens[i].beam_idx], n_prompt_tokens); } else { beam next_beam = cur_beams[next_tokens[i].beam_idx]; next_beam.token_ids.push_back(next_tokens[i].id); @@ -2437,9 +2455,19 @@ void beam_search_flow::beam_score_length_penalize() { } // Return beam with highest probability. -const beam& beam_search_flow::top_beam() { - for (const auto b : cur_beams) { - beam_hypos[0].add(b); +const beam& beam_search_flow::finalize() { + // printf("\n before: \n"); + // for (auto b : beam_hypos[0].beams) { + // b.print(); + // } + if (!requests_done[0]) { + for (const auto b : cur_beams) { + beam_hypos[0].add(b, n_prompt_tokens); + } + // printf("\n after: \n"); + // for (auto b : beam_hypos[0].beams) { + // b.print(); + // } } return beam_hypos[0].top1(); } @@ -2453,6 +2481,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c return std::vector(); } num_threads = n_threads; + n_prompt_tokens = n_tokens; std::vector beam_search_response; std::vector embd(tokens_inp, tokens_inp + n_tokens); @@ -2468,17 +2497,18 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c kv_reorder = std::make_shared(ctx); } beam_hypos.push_back(beam_hypotheses(ctx)); // TODO ctx->request_running_bs; + requests_done.push_back(false); for (int n = 0; n < max_new_tokens; ++n) { // first step if (n_past == 0) { model_eval(ctx, embd.data(), n_tokens, n_past, num_threads); n_past += n_tokens; kv_reorder->update(n_past, n_tokens); - logits_info li(ctx); - li.compute_log_softmax_logits(); - lp.process(0, li.next_token_scores, 50256); // TODO ctx->model.eos_id; - li.compute_next_token_scores({0.0f}); - std::vector next_tokens = beam_top_k(ctx, li.next_token_scores, {1}, {0}, beam_size); + // logits_info li(ctx); + // li.compute_log_softmax_logits(); + // lp.process(0, 
li.next_token_scores, 50256); // TODO ctx->model.eos_id; + // li.compute_next_token_scores({0.0f}); + std::vector next_tokens = beam_top_k(ctx, 0, {0.0f}, {1}, {0}, beam_size); MODEL_ASSERT(next_tokens.size() == beam_size); cur_beams.clear(); for (int i = 0; i < beam_size; ++i) { @@ -2500,7 +2530,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c // beam_score_length_penalize(); } -#if 1 // DEBUG: print current beams for this iteration +#if 0 // DEBUG: print current beams for this iteration printf("\n\nCurrent beams:\n"); for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); @@ -2509,14 +2539,23 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c } #endif - auto const done = [](const beam_hypotheses& bh) { return bh.is_done(); }; - if (std::all_of(beam_hypos.begin(), beam_hypos.end(), done)) { + // check if done + for (int h = 0; h < beam_hypos.size(); ++h) { + if (requests_done[h]) { + continue; + } + if (beam_hypos[h].is_done()) { + requests_done[h] = true; + } + } + auto const done_or_not = [](const bool& flag) { return flag; }; + if (std::all_of(requests_done.begin(), requests_done.end(), done_or_not)) { break; } } // beam_score_length_penalize(); - const beam& top_b = top_beam(); + const beam& top_b = finalize(); #if 0 // DEBUG: print final beam result printf("\n\nFinal beam:\n"); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index ce0ac12fd81..fc8247e51ce 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -265,14 +265,9 @@ MODEL_API const char* model_print_system_info(void); typedef struct beam_top_k_res { model_token id; // token id float score; // score of the token - int beam_idx; // token in which beam + int beam_idx; // 
token in which beam (-1 means unknown) } beam_top_k_res; -MODEL_API std::vector beam_top_k(const model_context* ctx, - const std::vector>& token_scores, - const std::vector& num_beams, const std::vector beam_indices, - const int& sample_scale = 2, const int& dim = -1); - struct beam { const model_context* ctx = nullptr; std::vector token_ids; @@ -281,7 +276,7 @@ struct beam { // record inference batch indice int infer_bs_id; // end-of-text - const bool eos() const { return !token_ids.empty() && token_ids.back() == 50256; } // TODO ctx->vocab.eos_id + const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; } void print() const { printf("length: %d, score: %0.6f, eos: %d, tokens:\n", token_ids.size(), score, eos()); for (const auto& id : token_ids) { @@ -308,11 +303,15 @@ struct beam_hypotheses { int len() { return beams.size(); } - void add(beam b) { + void add(beam b, const uint32_t& n_prompt_tokens) { auto comp = [](const beam& a, const beam& b) { return a.score > b.score; }; - int cur_len = b.eos() ? b.token_ids.size() - 1 : b.token_ids.size(); - float score = b.score / std::pow(cur_len, length_penalty); + uint32_t cur_len = b.eos() ? 
b.token_ids.size() - 1 : b.token_ids.size(); + float score = b.score / std::pow(cur_len + n_prompt_tokens, length_penalty); + // printf("===============beam hypos add =================== \n"); + // printf("origin score: %12.6f, cur_len: %d \n", b.score, cur_len+n_prompt_tokens); b.score = score; + // b.print(); + // printf("=========================\n"); if (beams.size() < num_beams) { beams.push_back(std::move(b)); if (beams.size() == num_beams) { @@ -353,10 +352,8 @@ class logits_processor { explicit logits_processor(model_context* lctx) : ctx(lctx), min_new_tokens(lctx->generation_conf.min_new_tokens) {} ~logits_processor() {} - void process(const uint32_t& cur_len, std::vector>& token_scores, - const model_vocab::id& eos_token_id); - void min_new_tokens_logits_process(const uint32_t& cur_len, std::vector>& token_scores, - const model_vocab::id& eos_token_id); + void process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); + void min_new_tokens_logits_process(const uint32_t& cur_len, const model_vocab::id& eos_token_id); private: model_context* ctx = nullptr; @@ -403,17 +400,22 @@ class beam_search_flow { std::vector loop(const model_token* tokens_inp, const int& n_tokens, const int& n_threads); private: + std::vector beam_top_k(model_context* ctx, const uint32_t& cur_len, const std::vector& beams_score, + const std::vector& num_beams, const std::vector beam_indices, + const int& sample_scale = 2, const int& dim = -1); void fill_next_beams_by_top_probabilities(); std::vector> update_kv_cache_reorder_indices(); void beam_score_length_penalize(); - const beam& top_beam(); + const beam& finalize(); model_context* ctx = nullptr; const int beam_size; std::vector cur_beams; std::vector next_beams; std::vector beam_hypos; - size_t n_past = 0; + std::vector requests_done; + uint32_t n_past = 0; + uint32_t n_prompt_tokens = 0; int num_threads = 4; // default by 4 logits_processor lp; std::shared_ptr kv_reorder; From 3f38281405fe47731fbf554b45f4afa528a96d17 
Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Tue, 19 Sep 2023 03:20:42 +0000 Subject: [PATCH 09/14] clean code and add debug info Signed-off-by: Yu, Zhentao --- .../graph/models/model_utils/model_utils.cpp | 140 ++++-------------- .../graph/models/model_utils/model_utils.h | 18 ++- 2 files changed, 42 insertions(+), 116 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index c9d9631d89d..418fea7c676 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -1973,7 +1973,7 @@ struct logits_info { next_token_scores.resize(batch_size); } - beam_top_k_res get_token_data(const int& batch_idx, const int32_t& token_idx) const { + beam_next_token get_token_data(const int& batch_idx, const int32_t& token_idx) const { return {token_idx, *(logits + batch_idx * bs_stride + offset + token_idx), -1}; } @@ -1985,37 +1985,16 @@ struct logits_info { return std::log(probability_from_logit(batch_idx, logit)); } -// void compute_log_softmax_logits() { -// #pragma omp parallel for -// for (int i = 0; i < batch_size; ++i) { -// next_token_scores[i].reserve(n_vocab); -// #pragma omp parallel for -// for (int j = 0; j < n_vocab; ++j) { -// float score = log_probability_from_logit(i, *(logits + i * bs_stride + offset + j)); -// next_token_scores[i].push_back(std::move(score)); -// } -// } -// } - -// // token score + pre tokens score -// void compute_next_token_scores(const std::vector& beams_score) { -// MODEL_ASSERT(batch_size == beams_score.size()); -// #pragma omp parallel for -// for (int i = 0; i < batch_size; ++i) { -// std::for_each(next_token_scores[i].begin(), next_token_scores[i].end(), [&](float& s) { s += beams_score[i]; }); -// } -// } - // Return top k token_data by raw logit in n_vocab 
dim. (request_bs*num_beam, top_k) - std::vector> vocab_top_k(const int& k) { - std::vector> min_heap(batch_size); // min-heap by logit + std::vector> vocab_top_k(const int& k) { + std::vector> min_heap(batch_size); // min-heap by logit int tk = std::min(k, n_vocab); for (int idx = 0; idx < batch_size; ++idx) { for (int32_t token_idx = 0; token_idx < tk; ++token_idx) { min_heap[idx].push_back(get_token_data(idx, token_idx)); } } - auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; + auto comp = [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }; for (int idx = 0; idx < batch_size; ++idx) { std::make_heap(min_heap[idx].begin(), min_heap[idx].end(), comp); for (int32_t token_idx = tk; token_idx < n_vocab; ++token_idx) { @@ -2091,7 +2070,6 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t int cur_id = std::get<0>(t); int cpy_id = std::get<1>(t); if (cur_id != cpy_id) { - // printf("it.first: %d, it.second: %d \n", cur_id, cpy_id); uint32_t len = next_beams[cur_id].token_ids.size() - 1; // last token in beam is for next step inference MODEL_ASSERT(len == n_past - n_prompt_tokens); @@ -2142,7 +2120,7 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // we sample top_k logits for each beam, than compute scores in these logits positions // then we sample top_k results among all beams. 
// this approach will accelerate sampling speed by log_softmax times reduction -std::vector beam_search_flow::beam_top_k(model_context* ctx, const uint32_t& cur_len, +std::vector beam_search_flow::beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, const std::vector& beams_score, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale, @@ -2153,7 +2131,7 @@ std::vector beam_search_flow::beam_top_k(model_context* ctx, con lp.process(cur_len, ctx->vocab.eos_token_id); const int raw_k = sample_scale * beam_size; // raw logits top_k - std::vector> raw_top_k = li.vocab_top_k(raw_k); + std::vector> raw_top_k = li.vocab_top_k(raw_k); MODEL_ASSERT(raw_top_k.size() == ctx->batch_size); // request_bs * num_beam MODEL_ASSERT(raw_top_k[0].size() == raw_k); MODEL_ASSERT(beams_score.size() == ctx->batch_size); @@ -2161,15 +2139,15 @@ std::vector beam_search_flow::beam_top_k(model_context* ctx, con #pragma omp parallel for for (int i = 0; i < ctx->batch_size; ++i) { std::for_each(raw_top_k[i].begin(), raw_top_k[i].end(), - [&](beam_top_k_res& r) { r.score = li.log_probability_from_logit(i, r.score) + beams_score[i]; }); + [&](beam_next_token& r) { r.score = li.log_probability_from_logit(i, r.score) + beams_score[i]; }); } MODEL_ASSERT(num_beams.size() == request_bs); - std::vector res; + std::vector res; res.reserve(sample_scale * std::accumulate(num_beams.begin(), num_beams.end(), 0)); - std::vector min_heap; + std::vector min_heap; const uint32_t n_vocab = ctx->model.hparams.n_vocab; size_t row_off = 0; - auto comp = [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }; + auto comp = [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }; for (int i = 0; i < request_bs; ++i) { const int num_beam = num_beams[i]; const int sample_k = sample_scale * num_beam; @@ -2180,14 +2158,14 @@ std::vector beam_search_flow::beam_top_k(model_context* ctx, con int n = 0; if (j == 0) { // 
init heap for (; n < sample_k; ++n) { - min_heap.push_back(beam_top_k_res( + min_heap.push_back(beam_next_token( {raw_top_k[row_off + j][n].id, raw_top_k[row_off + j][n].score, beam_indices[row_off + j]})); } std::make_heap(min_heap.begin(), min_heap.end(), comp); } MODEL_ASSERT(min_heap.size() == sample_k); for (; n < raw_k; ++n) { - beam_top_k_res nr({raw_top_k[row_off + j][n].id, raw_top_k[row_off + j][n].score, beam_indices[row_off + j]}); + beam_next_token nr({raw_top_k[row_off + j][n].id, raw_top_k[row_off + j][n].score, beam_indices[row_off + j]}); if (min_heap.front().score < nr.score) { std::pop_heap(min_heap.begin(), min_heap.end(), comp); min_heap.back().id = nr.id; @@ -2199,7 +2177,7 @@ std::vector beam_search_flow::beam_top_k(model_context* ctx, con } row_off += i * num_beam; std::sort(min_heap.begin(), min_heap.end(), - [](const beam_top_k_res& a, const beam_top_k_res& b) { return a.score > b.score; }); + [](const beam_next_token& a, const beam_next_token& b) { return a.score > b.score; }); for (const auto b : min_heap) { res.push_back(b); } @@ -2211,7 +2189,6 @@ std::vector beam_search_flow::beam_top_k(model_context* ctx, con void beam_search_flow::fill_next_beams_by_top_probabilities() { auto const comp = [](const beam& a, const beam& b) { return a.score > b.score; }; std::vector embd_inp; - // std::vector infer_beam_ids(beam_size); int record = 0; int batch_size = 0; uint32_t cur_len = 0; @@ -2227,7 +2204,6 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { // (batch, 1) // ordered by infer_bs_id embd_inp.push_back(cur_beams[i].token_ids.back()); - // infer_beam_ids[i] = record++; batch_size++; beam_indices.push_back(i); beams_score.push_back(cur_beams[i].score); @@ -2254,14 +2230,10 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } #endif - // logits_info li(ctx); - // li.compute_log_softmax_logits(); - // lp.process(cur_len, li.next_token_scores, 50256); // TODO ctx->model.eos_id; - // 
li.compute_next_token_scores(beams_score); const int sample_scale = 2; - std::vector next_tokens = - beam_top_k(ctx, cur_len, beams_score, {batch_size}, beam_indices, sample_scale); - // std::vector> next_tokens = li.top_k(sample_num); + std::vector next_tokens = + beam_top_k_next_tokens(ctx, cur_len, beams_score, {batch_size}, beam_indices, sample_scale); + // DEBUG #if 0 printf("====================== \n"); @@ -2293,49 +2265,6 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { } } - // for (int i = 0; i < beam_size; ++i) { - // beam b = cur_beams[i]; - // if (b.eos()) { - // // b is at end-of-sentence, so just copy it to next_beams if its - // // probability is high enough. - // if (next_beams.size() < beam_size) { - // if (b.score != 100) { - // b.eos_score = b.score; - // b.score = 100; - // } - // next_beams.push_back(b); - // if (next_beams.size() == beam_size) { - // std::make_heap(next_beams.begin(), next_beams.end(), comp); - // } - // } else if (next_beams.front().score < b.score) { - // std::pop_heap(next_beams.begin(), next_beams.end(), comp); - // next_beams.back() = b; - // std::push_heap(next_beams.begin(), next_beams.end(), comp); - // } - // } else { - // int j = 0; - // if (next_beams.size() < beam_size) { - // for (; next_beams.size() < beam_size && j < sample_num; ++j) { - // beam next_beam = b; - // next_beam.token_ids.push_back(next_tokens[infer_beam_ids[i]][j].id); - // next_beam.score += li.log_probability_from_logit(infer_beam_ids[i], - // next_tokens[infer_beam_ids[i]][j].logit); next_beams.push_back(std::move(next_beam)); - // } - // std::make_heap(next_beams.begin(), next_beams.end(), comp); - // } - // for (; j < sample_num; ++j) { - // float const next_score = - // b.score + li.log_probability_from_logit(infer_beam_ids[i], next_tokens[infer_beam_ids[i]][j].logit); - // if (next_beams.front().score < next_score) { - // std::pop_heap(next_beams.begin(), next_beams.end(), comp); - // next_beams.back() = b; - // 
next_beams.back().token_ids.push_back(next_tokens[infer_beam_ids[i]][j].id); - // next_beams.back().score = next_score; - // std::push_heap(next_beams.begin(), next_beams.end(), comp); - // } - // } - // } - // } std::sort(next_beams.begin(), next_beams.end(), [](beam& a, beam& b) { return a.infer_bs_id < b.infer_bs_id; }); } @@ -2405,8 +2334,8 @@ std::vector> beam_search_flow::update_kv_cache_reorder_indi // 2. copy latter beams into former beams, like: 0,1,2,3 -- > 1,2,2,3 // kv cache memcpy happens in itself which would cause memory dislocation if follows wrong order // so we give the contrary order to beams vector indice, which is: - // if 1, copy from tail - // if 2, copy from head + // if 1, copy order is from tail to head + // if 2, copy order is from head to tail bool cpy_from_head = true; int dst_idx_sum = 0; int src_idx_sum = 0; @@ -2456,18 +2385,22 @@ void beam_search_flow::beam_score_length_penalize() { // Return beam with highest probability. const beam& beam_search_flow::finalize() { - // printf("\n before: \n"); - // for (auto b : beam_hypos[0].beams) { - // b.print(); - // } +#if 0 + printf("\n finalize before: \n"); + for (auto b : beam_hypos[0].beams) { + b.print(); + } +#endif if (!requests_done[0]) { for (const auto b : cur_beams) { beam_hypos[0].add(b, n_prompt_tokens); } - // printf("\n after: \n"); - // for (auto b : beam_hypos[0].beams) { - // b.print(); - // } +#if 0 + printf("\n finalize after: \n"); + for (auto b : beam_hypos[0].beams) { + b.print(); + } +#endif } return beam_hypos[0].top1(); } @@ -2488,9 +2421,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c ctx->batch_size = 1; const uint32_t max_new_tokens = ctx->generation_conf.max_new_tokens; - // Loop while there are any beams that have not yet reached end-of-sentence. - // If the top beam is at end-of-sentence, then finish since all other - // beam score can only decrease. + // Loop ends in: 1. all requests done; or 2. 
reach max_new_tokens length auto const eos = [](const beam& b) { return b.eos(); }; kv_reorder = ctx->bs_kv_reorder; if (kv_reorder == nullptr) { @@ -2504,11 +2435,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c model_eval(ctx, embd.data(), n_tokens, n_past, num_threads); n_past += n_tokens; kv_reorder->update(n_past, n_tokens); - // logits_info li(ctx); - // li.compute_log_softmax_logits(); - // lp.process(0, li.next_token_scores, 50256); // TODO ctx->model.eos_id; - // li.compute_next_token_scores({0.0f}); - std::vector next_tokens = beam_top_k(ctx, 0, {0.0f}, {1}, {0}, beam_size); + std::vector next_tokens = beam_top_k_next_tokens(ctx, 0, {0.0f}, {1}, {0}, beam_size); MODEL_ASSERT(next_tokens.size() == beam_size); cur_beams.clear(); for (int i = 0; i < beam_size; ++i) { @@ -2519,7 +2446,6 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c b.infer_bs_id = i; cur_beams.push_back(b); } - // beam_score_length_penalize(); } else { fill_next_beams_by_top_probabilities(); std::vector> kv_reorder_indices = update_kv_cache_reorder_indices(); @@ -2527,7 +2453,6 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c kv_reorder->update(n_past, n_tokens, kv_reorder_indices, next_beams); cur_beams.swap(next_beams); next_beams.clear(); - // beam_score_length_penalize(); } #if 0 // DEBUG: print current beams for this iteration @@ -2554,7 +2479,6 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c } } - // beam_score_length_penalize(); const beam& top_b = finalize(); #if 0 // DEBUG: print final beam result diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index fc8247e51ce..0c89eada311 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h 
@@ -262,11 +262,11 @@ MODEL_API const char* model_print_system_info(void); /* beam search utils */ #define NEG_INF -std::numeric_limits::max() -typedef struct beam_top_k_res { +typedef struct beam_next_token { model_token id; // token id float score; // score of the token int beam_idx; // token in which beam (-1 means unknown) -} beam_top_k_res; +} beam_next_token; struct beam { const model_context* ctx = nullptr; @@ -278,7 +278,7 @@ struct beam { // end-of-text const bool eos() const { return !token_ids.empty() && token_ids.back() == ctx->vocab.eos_token_id; } void print() const { - printf("length: %d, score: %0.6f, eos: %d, tokens:\n", token_ids.size(), score, eos()); + printf("length: %d, score: %12.6f, eos: %d, tokens:\n", token_ids.size(), score, eos()); for (const auto& id : token_ids) { printf("%d: %s, ", id, model_token_to_str(ctx, id)); } @@ -307,11 +307,13 @@ struct beam_hypotheses { auto comp = [](const beam& a, const beam& b) { return a.score > b.score; }; uint32_t cur_len = b.eos() ? 
b.token_ids.size() - 1 : b.token_ids.size(); float score = b.score / std::pow(cur_len + n_prompt_tokens, length_penalty); - // printf("===============beam hypos add =================== \n"); - // printf("origin score: %12.6f, cur_len: %d \n", b.score, cur_len+n_prompt_tokens); +#if 0 + printf("=============== beam hypos add =================== \n"); + b.print(); + printf("origin score: %12.6f, new score: %12.f, sentence_len: %d \n", b.score, score, cur_len + n_prompt_tokens); + printf("================================================== \n"); +#endif b.score = score; - // b.print(); - // printf("=========================\n"); if (beams.size() < num_beams) { beams.push_back(std::move(b)); if (beams.size() == num_beams) { @@ -400,7 +402,7 @@ class beam_search_flow { std::vector loop(const model_token* tokens_inp, const int& n_tokens, const int& n_threads); private: - std::vector beam_top_k(model_context* ctx, const uint32_t& cur_len, const std::vector& beams_score, + std::vector beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, const std::vector& beams_score, const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale = 2, const int& dim = -1); void fill_next_beams_by_top_probabilities(); From 0415add51b20b7e8be964b683a3ff0eddf6761e3 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Tue, 19 Sep 2023 03:31:13 +0000 Subject: [PATCH 10/14] clang-format Signed-off-by: Yu, Zhentao --- .../runtime/graph/application/pybind_gptj.cpp | 159 +++++++++--------- .../graph/models/model_utils/model_utils.cpp | 12 +- .../graph/models/model_utils/model_utils.h | 8 +- 3 files changed, 90 insertions(+), 89 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index ff84982ff8e..3dccba770f5 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++ 
b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -163,6 +163,7 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa for (auto id : embd) { res += model_token_to_str(lctx, id); } + // std::cout << res << std::endl; } else { std::vector logits; for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) { @@ -228,89 +229,87 @@ int main(int argc, char* argv[]) { for (auto gptj_in_all : ctxs) { auto res = eval_gptj_char( gptj_in_all, - "she opened the door and see", + // "she opened the door and see", // "Once upon a time", // "A spaceship lands on the moon", // "What is the meaning of life?", - // "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing - // " "on the web works, but you have to simulate multiple touch for table moving and that can be a bit - // confusing. " "There is a lot I'd like to talk about. I will go through every topic, insted of making the - // typical what went " "right/wrong list. Concept Working over the theme was probably one of the hardest tasks - // which I had to face. " "Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - - // something with a lot of " "enemies/actors, simple graphics, maybe set in space, controlled from a top-down - // view. I was confident that I " "could fit any theme around it. In the end, the problem with a theme like - // 'Evolution' in a game is that " "evolution is unassisted. It happens through several seemingly random - // mutations over time, with the most apt " "permutation surviving. This genetic car simulator is, in my - // opinion, a great example of actual evolution of a " "species facing a challenge. But is it a game? In a game, - // you need to control something to reach an objective. " "That control goes against what evolution is supposed - // to be like. 
If you allow the user to pick how to evolve " "something, it's not evolution anymore - it's the - // equivalent of intelligent design, the fable invented by " "creationists to combat the idea of evolution. - // Being agnostic and a Pastafarian, that's not something that " "rubbed me the right way. Hence, my biggest - // dillema when deciding what to create was not with what I wanted to " "create, but with what I did not. I - // didn't want to create an 'intelligent design' simulator and wrongly call " "it evolution. This is a problem, - // of course, every other contestant also had to face. And judging by the " "entries submitted, not many managed - // to work around it. I'd say the only real solution was through the use of " "artificial selection, somehow. So - // far, I haven't seen any entry using this at its core gameplay. Alas, this " "is just a fun competition and - // after a while I decided not to be as strict with the game idea, and allowed " "myself to pick whatever I - // thought would work out. My initial idea was to create something where humanity " "tried to evolve to a next - // level, but had some kind of foe trying to stop them from doing so. I kind of had " "this image of human souls - // flying in space towards a monolith or a space baby (all based in 2001: A Space " "Odyssey of course) but I - // couldn't think of compelling (read: serious) mechanics for that. Borgs were my next " "inspiration, as their - // whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are " "you the borg, or - // fighting the Borg? The third and final idea came to me through my girlfriend, who somehow " "gave me the idea - // of making something about the evolution of Pasta. The more I thought about it the more it " "sounded like it - // would work, so I decided to go with it. 
Conversations with my inspiring co-worker Roushey " - // "(who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as - // it " "involved into the idea of having individual pieces of pasta flying around and trying to evolve until - // they " "became all-powerful. A secondary idea here was that the game would work to explain how the Flying - // Spaghetti " "Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less - // into this: " "you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests - // at the table, " "each with their own plate. Your plate can spawn little pieces of pasta. You do so by - // 'ordering' them through " "a menu. Some pastas are better than others; some are faster, some are stronger. - // They have varying 'costs', " "which are debited from your credits (you start with a number of credits). Once - // spawned, your pastas start " "flying around. Their instinct is to fly to other plates, in order to conquer - // them (the objective of the game " "is having your pasta conquer all the plates on the table). But they are - // really autonomous, so after being " "spawned, you have no control over your pasta (think DotA or LoL creeps). - // Your pasta doesn't like other " "people's pasta, so if they meet, they shoot sauce at each other until one - // dies. You get credits for other " "pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it - // starts conquering it for its team. " "It takes around 10 seconds for a plate to be conquered; less if more - // pasta from the same team are around. If " "pasta from other team are around, though, they get locked down in - // their attempt, unable to conquer the plate, " "until one of them die (think Battlefield's standard 'Conquest' - // mode). You get points every second for every " "plate you own. 
Over time, the concept also evolved to use an - // Italian bistro as its main scenario. Carlos, " "Carlos' Bistro's founder and owner Setup No major changes - // were made from my work setup. I used FDT and " "Starling creating an Adobe AIR (ActionScript) project, all - // tools or frameworks I already had some knowledge " "with. One big change for me was that I livestreamed my - // work through a twitch.tv account. This was a new thing " "for me. As recommended by Roushey, I used a program - // called XSplit and I got to say, it is pretty amazing. It " "made the livestream pretty effortless and the - // features are awesome, even for the free version. It was great " "to have some of my friends watch me, and - // then interact with them and random people through chat. It was also " "good knowing that I was also recording - // a local version of the files, so I could make a timelapse video later. " "Knowing the video was being - // recorded also made me a lot more self-conscious about my computer use, as if " "someone was watching over my - // shoulder. It made me realize that sometimes I spend too much time in seemingly " "inane tasks (I ended up - // wasting the longest time just to get some text alignment the way I wanted - it'll " "probably drive someone - // crazy if they watch it) and that I do way too many typos where writing code. I pretty " "much spend half of - // the time writing a line and the other half fixing the crazy characters in it. My own " "stream was probably - // boring to watch since I was coding for the most time. But livestreaming is one of the " "cool things to do as - // a spectator too. It was great seeing other people working - I had a few tabs opened on " "my second monitor - // all the time. It's actually a bit sad, because if I could, I could have spent the whole " "weekend just - // watching other people working! But I had to do my own work, so I'd only do it once in a while, " "when - // resting for a bit. 
Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended " "up - // going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but " - // "I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how " - // "much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at - // " "the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They - // " "think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too - // much, " "you can't actually read it, so it's hard to know what's going on with the game until you zoom in to - // the area " "of a specific plate. One other solution that didn't turn out to be as perfect as I thought was - // how to " "indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its - // color " "denotes the team owner. But it's something that fits so well into the design that people never - // realized it, " "until they were told about it. In the end, the idea of going with a full physical metaphor is - // one that should " "be done with care. Things that are very important risk becoming background noise, unless - // the player knows its " "importance. Originally, I wanted to avoid any kind of heads-up display in my game. In - // the end, I ended up " "adding it at the bottom to indicate your credits and bases owned, as well as the - // hideous " "out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with - // a simple " "HUD from the start, especially one that indicated each team's colors and general state of the - // game without " "the need for zooming in and out. Development Development went fast.", + "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. 
Playing " + "on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. " + "There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went " + "right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. " + "Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of " + "enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I " + "could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that " + "evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt " + "permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a " + "species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. " + "That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve " + "something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by " + "creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that " + "rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to " + "create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call " + "it evolution. This is a problem, of course, every other contestant also had to face. And judging by the " + "entries submitted, not many managed to work around it. I'd say the only real solution was through the use of " + "artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. 
Alas, this " + "is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed " + "myself to pick whatever I thought would work out. My initial idea was to create something where humanity " + "tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had " + "this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space " + "Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next " + "inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are " + "you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow " + "gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it " + "sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey " + "(who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it " + "involved into the idea of having individual pieces of pasta flying around and trying to evolve until they " + "became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti " + "Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: " + "you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, " + "each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through " + "a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', " + "which are debited from your credits (you start with a number of credits). Once spawned, your pastas start " + "flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game " + "is having your pasta conquer all the plates on the table). But they are really autonomous, so after being " + "spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other " + "people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other " + "pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. " + "It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If " + "pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, " + "until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every " + "plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, " + "Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and " + "Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge " + "with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing " + "for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It " + "made the livestream pretty effortless and the features are awesome, even for the free version. It was great " + "to have some of my friends watch me, and then interact with them and random people through chat. It was also " + "good knowing that I was also recording a local version of the files, so I could make a timelapse video later. " + "Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if " + "someone was watching over my shoulder. 
It made me realize that sometimes I spend too much time in seemingly " + "inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll " + "probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty " + "much spend half of the time writing a line and the other half fixing the crazy characters in it. My own " + "stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the " + "cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on " + "my second monitor all the time. It's actually a bit sad, because if I could, I could have spent the whole " + "weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, " + "when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended " + "up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but " + "I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how " + "much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at " + "the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They " + "think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too much, " + "you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area " + "of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to " + "indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color " + "denotes the team owner. 
But it's something that fits so well into the design that people never realized it, " + "until they were told about it. In the end, the idea of going with a full physical metaphor is one that should " + "be done with care. Things that are very important risk becoming background noise, unless the player knows its " + "importance. Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up " + "adding it at the bottom to indicate your credits and bases owned, as well as the hideous " + "out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple " + "HUD from the start, especially one that indicated each team's colors and general state of the game without " + "the need for zooming in and out. Development Development went fast.", 128, 40, 1.0, 0.8, 2048); std::cout << res << std::endl; exit_gptj(gptj_in_all); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 418fea7c676..3d659181f29 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2121,10 +2121,10 @@ void beam_search_kv_cache_reorder::update(const uint32_t& n_past, const uint32_t // then we sample top_k results among all beams. 
// this approach will accelerate sampling speed by log_softmax times reduction std::vector beam_search_flow::beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, - const std::vector& beams_score, - const std::vector& num_beams, - const std::vector beam_indices, const int& sample_scale, - const int& dim) { + const std::vector& beams_score, + const std::vector& num_beams, + const std::vector beam_indices, + const int& sample_scale, const int& dim) { MODEL_ASSERT(dim == -1); // raise unimplemented error const int request_bs = 1; // TODO ctx->request_running_num logits_info li(ctx); @@ -2246,8 +2246,8 @@ void beam_search_flow::fill_next_beams_by_top_probabilities() { MODEL_ASSERT(next_beams.empty()); for (int i = 0; i < next_tokens.size(); ++i) { if (next_tokens[i].id == ctx->vocab.eos_token_id) { - // if beam_token does not belong to top num_beams tokens, it should not be added - bool is_beam_token_worse_than_top_num_beams = i >= beam_size ? true: false; + // if beam_token does not belong to top num_beams tokens, it should not be added + bool is_beam_token_worse_than_top_num_beams = i >= beam_size ? 
true : false; if (is_beam_token_worse_than_top_num_beams) { continue; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index 0c89eada311..21f35f573a1 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -402,9 +402,11 @@ class beam_search_flow { std::vector loop(const model_token* tokens_inp, const int& n_tokens, const int& n_threads); private: - std::vector beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, const std::vector& beams_score, - const std::vector& num_beams, const std::vector beam_indices, - const int& sample_scale = 2, const int& dim = -1); + std::vector beam_top_k_next_tokens(model_context* ctx, const uint32_t& cur_len, + const std::vector& beams_score, + const std::vector& num_beams, + const std::vector beam_indices, const int& sample_scale = 2, + const int& dim = -1); void fill_next_beams_by_top_probabilities(); std::vector> update_kv_cache_reorder_indices(); void beam_score_length_penalize(); From ce44e486f7c1ad25a69ebfe0ef4980884c959110 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Tue, 19 Sep 2023 09:07:11 +0000 Subject: [PATCH 11/14] rename function and rm useless code Signed-off-by: Yu, Zhentao --- .../llm/runtime/graph/application/pybind_gptj.cpp | 2 +- .../llm/runtime/graph/models/model_utils/model_utils.cpp | 8 +++----- .../llm/runtime/graph/models/model_utils/model_utils.h | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index 3dccba770f5..49c6c159333 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++
b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -163,7 +163,6 @@ char* eval_gptj_char(void* ctx, const char* prom, int n_predict, int top_k, floa for (auto id : embd) { res += model_token_to_str(lctx, id); } - // std::cout << res << std::endl; } else { std::vector logits; for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) { @@ -231,6 +230,7 @@ int main(int argc, char* argv[]) { gptj_in_all, // "she opened the door and see", // "Once upon a time", + // "Tell me 10 things about jazz music", // "A spaceship lands on the moon", // "What is the meaning of life?", "2017: It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing " diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 3d659181f29..41103e82ddb 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -1935,9 +1935,8 @@ std::vector>& model_internal_get_tenso // A struct for calculating logits-related info. 
struct logits_info { const model_context* const ctx = nullptr; - // (batch, seq_len * vocab_size) + // (batch, seq_len * vocab_size) batch = input_prompt_bs* beam_size const float* const logits = nullptr; - std::vector> next_token_scores; // (input_prompt_bs* beam_size, n_vocab) const int batch_size; const int32_t n_vocab; // last seq_len indice @@ -1970,7 +1969,6 @@ struct logits_info { normalizers[i] = 1.0f / std::accumulate(logits + i * bs_stride + offset, logits + i * bs_stride + offset + n_vocab, 0.0f, sum_exp{max_ls[i]}); } - next_token_scores.resize(batch_size); } beam_next_token get_token_data(const int& batch_idx, const int32_t& token_idx) const { @@ -2186,7 +2184,7 @@ std::vector beam_search_flow::beam_top_k_next_tokens(model_cont } // TODO debug info unify (function ptr?) -void beam_search_flow::fill_next_beams_by_top_probabilities() { +void beam_search_flow::fill_next_beams_by_top_scores() { auto const comp = [](const beam& a, const beam& b) { return a.score > b.score; }; std::vector embd_inp; int record = 0; @@ -2447,7 +2445,7 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c cur_beams.push_back(b); } } else { - fill_next_beams_by_top_probabilities(); + fill_next_beams_by_top_scores(); std::vector> kv_reorder_indices = update_kv_cache_reorder_indices(); n_past += 1; kv_reorder->update(n_past, n_tokens, kv_reorder_indices, next_beams); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index 21f35f573a1..764d9d4bec2 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -407,7 +407,7 @@ class beam_search_flow { const std::vector& num_beams, const std::vector beam_indices, const int& sample_scale = 2, const int& dim = -1); - void fill_next_beams_by_top_probabilities(); + 
void fill_next_beams_by_top_scores(); std::vector> update_kv_cache_reorder_indices(); void beam_score_length_penalize(); const beam& finalize(); From e861f24e39b136edaa87f7e40fec7e6f98712d48 Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Thu, 21 Sep 2023 03:29:06 +0000 Subject: [PATCH 12/14] pybind_gptj kv_cache uses fp16 dtype by default Signed-off-by: Yu, Zhentao --- .../llm/runtime/graph/application/pybind_gptj.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp index 49c6c159333..79de24aa495 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/application/pybind_gptj.cpp @@ -69,6 +69,7 @@ void* init_gptj(int seed, int n_predict, int n_batch, int top_k, float top_p, fl params.batch_size = batch_size; params.beam_search = beam_search; params.beam_size = beam_size; + params.memory_type = KV_MEM_TYPE_F16; // TODO MEMORY_AUTO for MHA // params.use_mmap = false; // params.use_mlock= true; model_init_backend(); From a226cc50e03429fd902c94ac34f94996359f421e Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Thu, 21 Sep 2023 06:22:25 +0000 Subject: [PATCH 13/14] add beam_search verbose macro Signed-off-by: Yu, Zhentao --- .../llm/runtime/graph/CMakeLists.txt | 4 ++ .../graph/models/model_utils/model_utils.cpp | 66 ++++++++++++------- .../graph/models/model_utils/model_utils.h | 8 +-- 3 files changed, 49 insertions(+), 29 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt b/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt index 27b4ff63452..39ada4b27a5 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt +++ b/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt @@ -83,6 +83,10 @@ option(NE_PROFILING "neural_engine: 
use Profiling" if (NE_PROFILING) add_compile_definitions(NE_PERF) endif() +option(NE_BEAM_SEARCH_VERBOSE "neural_engine: print beam search processing log" OFF) +if (NE_BEAM_SEARCH_VERBOSE) + add_compile_definitions(NE_BEAM_SEARCH_VERBOSE_ON) +endif() option(NE_GELU_VEC "neural_engine: enable vec in gelu" ON) if (NE_GELU_VEC) add_compile_definitions(NE_GELU_USE_VEC) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index 41103e82ddb..e21d4d01b1e 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2207,38 +2207,31 @@ void beam_search_flow::fill_next_beams_by_top_scores() { beams_score.push_back(cur_beams[i].score); } // DEBUG -#if 0 - printf("====================== \n"); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("========================================================================================= \n"); + printf("next_tokens for inference: \n"); for (auto kk : embd_inp) { printf("%d: %s \n", kk, (ctx->vocab.id_to_token.at(kk).tok).c_str()); } + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif ctx->batch_size = batch_size; int n_tokens = 1; model_eval(ctx, embd_inp.data(), n_tokens, n_past, num_threads); - // DEBUG -#if 0 - size_t bs_stride = n_tokens * ctx->model.hparams.n_vocab; - for (int k = 0; k < batch_size; ++k) { - printf("====================== \n"); - for (int kk = 0; kk < 10; ++kk) { - printf("%4.5f \n", model_get_logits(ctx) + k * bs_stride + kk); - } - } -#endif const int sample_scale = 2; std::vector next_tokens = beam_top_k_next_tokens(ctx, cur_len, beams_score, {batch_size}, beam_indices, sample_scale); // DEBUG -#if 0 - printf("====================== \n"); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("top_k 
next_tokens: \n"); for (auto kk : next_tokens) { printf("%d: %s, score: %10.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), kk.score, kk.beam_idx); } + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif MODEL_ASSERT(next_tokens.size() == batch_size * sample_scale); MODEL_ASSERT(next_beams.empty()); @@ -2283,7 +2276,8 @@ std::vector> beam_search_flow::update_kv_cache_reorder_indi MODEL_ASSERT(next_beams.size() == beam_size); MODEL_ASSERT(cur_beams.size() == beam_size); // DEBUG -#if 0 +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("kv cache update indices info: \n"); printf("cur_beams: "); for (int i = 0; i < beam_size; ++i) { printf("%d, ", cur_beams[i].infer_bs_id); @@ -2355,7 +2349,8 @@ std::vector> beam_search_flow::update_kv_cache_reorder_indi } } -#if 0 // DEBUG + // DEBUG +#ifdef NE_BEAM_SEARCH_VERBOSE_ON printf("cpy_final_bs_ids: "); for (int i = 0; i < beam_size; ++i) { printf("%d, ", cpy_final_bs_ids[i]); @@ -2371,6 +2366,7 @@ std::vector> beam_search_flow::update_kv_cache_reorder_indi printf("%d, ", next_beams[i].infer_bs_id); } printf("\n"); + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif return kv_reorder_indices; } @@ -2383,21 +2379,26 @@ void beam_search_flow::beam_score_length_penalize() { // Return beam with highest probability. 
const beam& beam_search_flow::finalize() { -#if 0 - printf("\n finalize before: \n"); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("========================================================================================= \n"); + printf("finalize: \n"); + printf("before: \n"); for (auto b : beam_hypos[0].beams) { b.print(); } + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif if (!requests_done[0]) { for (const auto b : cur_beams) { beam_hypos[0].add(b, n_prompt_tokens); } -#if 0 - printf("\n finalize after: \n"); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("after (adding more beams from outside): \n"); for (auto b : beam_hypos[0].beams) { b.print(); } + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); + printf("========================================================================================= \n"); #endif } return beam_hypos[0].top1(); @@ -2436,6 +2437,16 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c std::vector next_tokens = beam_top_k_next_tokens(ctx, 0, {0.0f}, {1}, {0}, beam_size); MODEL_ASSERT(next_tokens.size() == beam_size); cur_beams.clear(); + // DEBUG +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("========================================================================================== \n"); + printf("top_k next_tokens: \n"); + for (auto kk : next_tokens) { + printf("%d: %s, score: %12.6f, beam_idx: %d \n", kk.id, (ctx->vocab.id_to_token.at(kk.id).tok).c_str(), + kk.score, kk.beam_idx); + } + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); +#endif for (int i = 0; i < beam_size; ++i) { beam b; b.ctx = ctx; @@ -2453,13 +2464,15 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c next_beams.clear(); } -#if 0 // DEBUG: print current beams for this iteration - printf("\n\nCurrent beams:\n"); + // DEBUG: print current beams for this iteration 
+#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("current beams:\n"); for (size_t j = 0; j < cur_beams.size(); ++j) { printf("beams[%d]: ", j); cur_beams[j].print(); fflush(stdout); } + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif // check if done @@ -2479,9 +2492,12 @@ std::vector beam_search_flow::loop(const model_token* tokens_inp, c const beam& top_b = finalize(); -#if 0 // DEBUG: print final beam result - printf("\n\nFinal beam:\n"); - top_b.print(); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON // DEBUG: print final beam result + printf("========================================================================================= \n"); + printf("final beam:\n"); + top_b.print(); + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); + printf("========================================================================================= \n"); #endif beam_search_response.clear(); diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index 764d9d4bec2..69e08cbf096 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -307,11 +307,11 @@ struct beam_hypotheses { auto comp = [](const beam& a, const beam& b) { return a.score > b.score; }; uint32_t cur_len = b.eos() ? 
b.token_ids.size() - 1 : b.token_ids.size(); float score = b.score / std::pow(cur_len + n_prompt_tokens, length_penalty); -#if 0 - printf("=============== beam hypos add =================== \n"); +#ifdef NE_BEAM_SEARCH_VERBOSE_ON + printf("add beam hypos: \n"); b.print(); - printf("origin score: %12.6f, new score: %12.f, sentence_len: %d \n", b.score, score, cur_len + n_prompt_tokens); - printf("================================================== \n"); + printf("origin_score: %12.6f, new_score: %12.6f, sentence_len: %d \n", b.score, score, cur_len + n_prompt_tokens); + printf("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ \n"); #endif b.score = score; if (beams.size() < num_beams) { From 3f0ebb7203ac7bd7bc3cd40230359d54b3ba246c Mon Sep 17 00:00:00 2001 From: "Yu, Zhentao" Date: Thu, 21 Sep 2023 06:52:00 +0000 Subject: [PATCH 14/14] rm duplicated function Signed-off-by: Yu, Zhentao --- .../llm/runtime/graph/models/model_utils/model_utils.cpp | 6 ------ .../llm/runtime/graph/models/model_utils/model_utils.h | 1 - 2 files changed, 7 deletions(-) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp index e21d4d01b1e..53f85d2a36a 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp @@ -2371,12 +2371,6 @@ std::vector> beam_search_flow::update_kv_cache_reorder_indi return kv_reorder_indices; } -void beam_search_flow::beam_score_length_penalize() { - float length_penalty = ctx->generation_conf.length_penalty; - std::for_each(cur_beams.begin(), cur_beams.end(), - [&](beam& b) { b.score /= std::pow(b.token_ids.size(), length_penalty); }); -} - // Return beam with highest probability. 
const beam& beam_search_flow::finalize() { #ifdef NE_BEAM_SEARCH_VERBOSE_ON diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h index 69e08cbf096..1fb02cfae1e 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.h @@ -409,7 +409,6 @@ class beam_search_flow { const int& dim = -1); void fill_next_beams_by_top_scores(); std::vector> update_kv_cache_reorder_indices(); - void beam_score_length_penalize(); const beam& finalize(); model_context* ctx = nullptr;