[CPP Graph] Enable llama2-70b (#213)
intellinjun committed Sep 26, 2023
1 parent 2ddc96b commit f5df027
Showing 6 changed files with 146 additions and 65 deletions.
@@ -39,6 +39,7 @@ We support the following models:
### 1. Build LLM Runtime
Linux
```shell
+ git submodule update --init --recursive
mkdir build
cd build
cmake .. -G Ninja
@@ -74,6 +74,8 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_embd / hparams.n_head;
+ const int n_head_kv = hparams.n_head_kv;
+ const int n_embd_gqa = n_embd / (n_head / n_head_kv);

auto& mem_per_token = lctx.mem_per_token;
auto& buf_compute = lctx.buf_compute;
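
Note: `n_head_kv` and `n_embd_gqa` are the grouped-query-attention (GQA) dimensions this change introduces. A minimal standalone sketch of the arithmetic, using assumed llama2-70b-like hyperparameters (not taken from this diff):

```cpp
#include <cstdio>

int main() {
  // Assumed llama2-70b-like hyperparameters (illustration only).
  const int n_embd = 8192;   // hidden size
  const int n_head = 64;     // query heads
  const int n_head_kv = 8;   // key/value heads (GQA)

  const int head_dim = n_embd / n_head;    // 128
  const int n_gqa = n_head / n_head_kv;    // 8 query heads share one KV head
  const int n_embd_gqa = n_embd / n_gqa;   // 1024, i.e. n_head_kv * head_dim

  printf("head_dim=%d n_gqa=%d n_embd_gqa=%d\n", head_dim, n_gqa, n_embd_gqa);
  return 0;
}
```
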
@@ -118,7 +120,6 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
memcpy(embd->data, tokens, N * ne_element_size(embd));

struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd);
-
for (int il = 0; il < n_layer; ++il) {
struct ne_tensor* inpSA = inpL;

@@ -136,7 +137,8 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
ne_tensor *Qcur, *Kcur, *Vcur;
if (jblas_fusion_QKV_f32f32_support(model.layers[il].attn[0]->data, model.layers[il].attn[1]->data,
model.layers[il].attn[2]->data, N, model.layers[il].attn[0]->ne[1],
- model.layers[il].attn[0]->ne[0])) { // fused execution of QKV
+ model.layers[il].attn[0]->ne[0]) &&
+ n_head == n_head_kv) { // fused execution of QKV
struct ne_tensor* QKVcur =
ne_mul_qkv(ctx0, model.layers[il].attn[0], model.layers[il].attn[1], model.layers[il].attn[2], cur);
Qcur = ne_rope_inplace(
@@ -152,28 +154,28 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
Vcur = ne_transpose(
ctx0, ne_reshape_2d(ctx0, ne_view_1d(ctx0, QKVcur, N * n_embd, 2 * N * n_embd * ne_element_size(QKVcur)),
n_embd, N));
-
} else {
Qcur = ne_rope_inplace(
ctx0, ne_reshape_3d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[0], cur), n_embd / n_head, n_head, N),
- n_past, n_rot, 0, 0);
+ n_past, n_embd / n_head, 0, 0);
Kcur = ne_rope_inplace(
- ctx0, ne_reshape_3d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), n_embd / n_head, n_head, N),
+ ctx0,
+ ne_reshape_3d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[1], cur), n_embd_gqa / n_head_kv, n_head_kv, N),
n_past, n_rot, 0, 0);
- Vcur = ne_transpose(ctx0, ne_reshape_2d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), n_embd, N));
+ Vcur = ne_transpose(ctx0, ne_reshape_2d(ctx0, ne_mul_mat(ctx0, model.layers[il].attn[2], cur), n_embd_gqa, N));
}
ne_set_name(Qcur, "Qcur");
ne_set_name(Kcur, "Kcur");
ne_set_name(Vcur, "Vcur");
// self-attention
- if (!run_mha_reordered) {
+ if (!run_mha_reordered || n_head != n_head_kv) {
// store key and value to memory
{
- struct ne_tensor* k =
- ne_view_1d(ctx0, kv_self.k, N * n_embd, (ne_element_size(kv_self.k) * n_embd) * (il * n_ctx + n_past));
+ struct ne_tensor* k = ne_view_1d(ctx0, kv_self.k, N * n_embd_gqa,
+ (ne_element_size(kv_self.k) * n_embd_gqa) * (il * n_ctx + n_past));
struct ne_tensor* v =
- ne_view_2d(ctx0, kv_self.v, N, n_embd, (n_ctx)*ne_element_size(kv_self.v),
- (il * n_ctx) * ne_element_size(kv_self.v) * n_embd + n_past * ne_element_size(kv_self.v));
+ ne_view_2d(ctx0, kv_self.v, N, n_embd_gqa, (n_ctx)*ne_element_size(kv_self.v),
+ (il * n_ctx) * ne_element_size(kv_self.v) * n_embd_gqa + n_past * ne_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ne_build_forward_expand(&gf, ne_cpy(ctx0, Kcur, k));
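
Note: the KV-cache writes above address one layer's rows at a stride of `n_embd_gqa` elements per cached token (previously `n_embd`). A rough sketch of the same offset arithmetic with assumed sizes (context length, element size, and indices below are illustrative, not from this diff):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed values (illustration only).
  const size_t n_embd_gqa = 1024;  // GQA K/V width, llama2-70b-like
  const size_t n_ctx = 2048;       // context length
  const size_t esize = 2;          // bytes per element, e.g. fp16
  const size_t il = 3;             // layer index
  const size_t n_past = 17;        // tokens already cached
  const size_t N = 4;              // new tokens in this step

  // 1D K view: N * n_embd_gqa elements, placed after this layer's existing rows.
  const size_t k_offset = esize * n_embd_gqa * (il * n_ctx + n_past);
  // 2D V view: V is stored transposed, one n_ctx-slot row per channel, so the
  // write starts n_past slots into this layer's block of n_embd_gqa rows.
  const size_t v_offset = il * n_ctx * esize * n_embd_gqa + n_past * esize;

  printf("K write offset: %zu bytes (%zu bytes written)\n", k_offset, esize * n_embd_gqa * N);
  printf("V write offset: %zu bytes (row stride %zu bytes)\n", v_offset, n_ctx * esize);
  return 0;
}
```
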
@@ -185,9 +187,9 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to

struct ne_tensor* K = ne_permute(ctx0,
ne_reshape_3d(ctx0,
- ne_view_1d(ctx0, kv_self.k, (n_past + N) * n_embd,
- il * n_ctx * ne_element_size(kv_self.k) * n_embd),
- n_embd / n_head, n_head, n_past + N),
+ ne_view_1d(ctx0, kv_self.k, (n_past + N) * n_embd_gqa,
+ il * n_ctx * ne_element_size(kv_self.k) * n_embd_gqa),
+ n_embd_gqa / n_head_kv, n_head_kv, n_past + N),
0, 2, 1, 3);
ne_set_name(K, "K");

@@ -212,9 +214,10 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to
ne_set_name(KQ_soft_max, "KQ_soft_max");

// split cached V into n_head heads
- struct ne_tensor* V = ne_view_3d(
- ctx0, kv_self.v, n_past + N, n_embd / n_head, n_head, n_ctx * ne_element_size(kv_self.v),
- n_ctx * ne_element_size(kv_self.v) * n_embd / n_head, il * n_ctx * ne_element_size(kv_self.v) * n_embd);
+ struct ne_tensor* V =
+ ne_view_3d(ctx0, kv_self.v, n_past + N, n_embd_gqa / n_head_kv, n_head_kv, n_ctx * ne_element_size(kv_self.v),
+ n_ctx * ne_element_size(kv_self.v) * n_embd_gqa / n_head_kv,
+ n_ctx * ne_element_size(kv_self.v) * n_embd_gqa * il);
ne_set_name(V, "V");

struct ne_tensor* KQV = ne_mul_mat(ctx0, V, KQ_soft_max);
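
Note: with GQA the cached K/V hold only `n_head_kv` heads, so each KV head serves a group of `n_head / n_head_kv` query heads during the attention above. A small sketch of that grouping with assumed llama2-70b-like head counts:

```cpp
#include <cstdio>

int main() {
  // Assumed llama2-70b-like head counts (illustration only).
  const int n_head = 64;
  const int n_head_kv = 8;
  const int group = n_head / n_head_kv;  // query heads per KV head

  for (int kv = 0; kv < n_head_kv; ++kv) {
    printf("kv head %d serves query heads %2d..%2d\n", kv, kv * group, (kv + 1) * group - 1);
  }
  return 0;
}
```
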
@@ -322,7 +325,6 @@ static bool llama_model_eval_internal(model_context& lctx, const model_token* to

// used at the end to optionally extract the embeddings
struct ne_tensor* embeddings = NULL;
-
// norm
{
inpL = ne_rms_norm(ctx0, inpL);
@@ -35,7 +35,7 @@ static const model_scratch llama_mem_req(int n_layers) {
case 60:
return {512ull * MB, 512ull * MB, 3124ull * MB};
case 80:
- return {1024ull * MB, 1024ull * MB, 5120ull * MB};
+ return {2048ull * MB, 2048ull * MB, 10240ull * MB};
default:
MODEL_ASSERT(false);
}
@@ -45,7 +45,7 @@ class Llama : public IModel {
private:
model_archs arch = MODEL_LLAMA;
std::unique_ptr<model_model_loader> ml;
- uint32_t n_layer, n_embd, n_ff, n_vocab;
+ uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv;
int n_ctx, n_gpu_layer;
bool use_mmap, use_mlock, vocab_only;
model_scratch scratch;
@@ -66,19 +66,22 @@ void Llama::init(const char* path_model, model_context& lctx, int n_ctx_, int n_
model_file_version file_version = ml->file_loaders.at(0)->file_version;
auto& hparams = model.hparams;
hparams.n_ctx = n_ctx;
- n_ff = ((2 * (4 * hparams.n_embd) / 3 + hparams.n_mult - 1) / hparams.n_mult) * hparams.n_mult;
+ n_ff = hparams.ffn_hidden_size;
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
+ fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
+ n_head_kv = hparams.n_head_kv;
+ n_head = hparams.n_head;
scratch = llama_mem_req(n_layer);
model.scratchs = scratch;
}
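
Note: `n_ff` is now taken from `hparams.ffn_hidden_size` instead of being re-derived from `n_embd` and `n_mult`; llama2-70b's FFN width (28672) is not what the old rounding produces. A sketch of the old derivation with assumed llama-7b-like values, where it does reproduce the stored width:

```cpp
#include <cstdint>
#include <cstdio>

// Shape of the derivation the runtime used before this change.
static uint32_t n_ff_from_mult(uint32_t n_embd, uint32_t n_mult) {
  return ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;
}

int main() {
  // Assumed llama-7b-like values (illustration only): yields 11008.
  printf("n_ff(4096, 256) = %u\n", n_ff_from_mult(4096, 256));
  // llama2-70b stores ffn_hidden_size = 28672, which this rounding does not
  // recover from n_embd = 8192, hence reading it from the model file directly.
  return 0;
}
```
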
@@ -87,7 +90,6 @@ void Llama::init(const char* path_model, model_context& lctx, int n_ctx_, int n_
void Llama::load(model_context& lctx, model_progress_callback progress_callback, void* progress_callback_user_data) {
auto& model = lctx.model;
auto& ctx = model.ctx;
-
size_t ctx_size;
size_t mmapped_size;
ml->calc_sizes(&ctx_size, &mmapped_size);
@@ -132,8 +134,14 @@ void Llama::load(model_context& lctx, model_progress_callback progress_callback,

// qkv GEMM
layer.attn[0] = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
- layer.attn[1] = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
- layer.attn[2] = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ if (n_head != n_head_kv) { // In order to distinguish whether it is llama2-70B or not.
+ layer.attn[1] = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd / n_head_kv}, backend);
+ layer.attn[2] = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd / n_head_kv}, backend);
+ } else {
+ layer.attn[1] = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+ layer.attn[2] = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+ }
+
layer.attn[3] = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

// ffn norm
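
Note: for a GQA model the wk/wv projections are narrower than wq. With assumed llama2-70b-like hyperparameters, the shapes requested above work out as follows (a sketch, not the loader itself):

```cpp
#include <cstdio>

int main() {
  // Assumed llama2-70b-like hyperparameters (illustration only).
  const int n_embd = 8192;
  const int n_head = 64;
  const int n_head_kv = 8;
  const int head_dim = n_embd / n_head;       // 128

  const int kv_width = n_head_kv * head_dim;  // 1024: width of the K/V projections
  // The loader requests {n_embd, n_embd / n_head_kv} for wk/wv; with these head
  // counts n_embd / n_head_kv = 8192 / 8 = 1024, which coincides with kv_width.
  printf("wq: %d x %d\n", n_embd, n_embd);
  printf("wk/wv: %d x %d\n", n_embd, kv_width);
  return 0;
}
```
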
@@ -57,14 +57,14 @@

static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_cache& cache, const ne_type wtype,
const int batch_size, const int beam_size) {
- const int n_head_kv = std::max(hparams.n_head_kv, 1U);
+ const int n_head_kv = 1U;
int32_t k_size, v_size;
get_batch_kv_elements_from_gpt_params(hparams, wtype, &k_size, &v_size);

- const int64_t n_elements_k = n_head_kv * hparams.n_layer * batch_size * beam_size * k_size;
- const int64_t n_elements_v = n_head_kv * hparams.n_layer * batch_size * beam_size * v_size;
+ const int64_t n_elements_k = hparams.n_layer * batch_size * beam_size * k_size;
+ const int64_t n_elements_v = hparams.n_layer * batch_size * beam_size * v_size;
const auto wsize = wtype == NE_TYPE_JBLAS ? 1 : ne_type_size(wtype);
- NE_ASSERT(wtype != NE_TYPE_JBLAS || n_head_kv == 1);
+ NE_ASSERT(wtype != NE_TYPE_JBLAS);

cache.buf.resize((n_elements_k + n_elements_v) * wsize + 2u * MB);

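Note: the `n_head_kv` factor is dropped here because the per-layer element counts returned by `get_batch_kv_elements_from_gpt_params` are assumed to already reflect the GQA K/V width. As a rough illustration of why that width matters for cache size (all numbers below are assumptions, e.g. k_size roughly n_ctx * n_embd_gqa and fp16 storage):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed llama2-70b-like sizes (illustration only).
  const uint64_t n_layer = 80;
  const uint64_t n_ctx = 2048;
  const uint64_t n_embd = 8192;      // full hidden size (MHA-style K/V rows)
  const uint64_t n_embd_gqa = 1024;  // GQA K/V width
  const uint64_t esize = 2;          // bytes per element, e.g. fp16

  // K + V for one batch and one beam.
  const uint64_t mha_bytes = 2 * n_layer * n_ctx * n_embd * esize;      // ~5 GiB
  const uint64_t gqa_bytes = 2 * n_layer * n_ctx * n_embd_gqa * esize;  // ~640 MiB

  printf("MHA-style cache: %llu MiB\n", (unsigned long long)(mha_bytes >> 20));
  printf("GQA cache:       %llu MiB\n", (unsigned long long)(gqa_bytes >> 20));
  return 0;
}
```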
