[CPP Graph] add opt cpp graph and chat application (#133)
zhentaoyu committed Sep 6, 2023
1 parent 1d2b4f0 commit 578162a
Showing 19 changed files with 941 additions and 13 deletions.
@@ -24,6 +24,7 @@ We support the following models:
|[MPT-7B](https://huggingface.co/mosaicml/mpt-7b), [MPT-30B](https://huggingface.co/mosaicml/mpt-30b)| ✅ | ✅ |
|[Falcon-7B](https://huggingface.co/tiiuae/falcon-7b), [Falcon-40B](https://huggingface.co/tiiuae/falcon-40b)| ✅ | ✅ |
|[BLOOM-7B](https://huggingface.co/bigscience/bloomz-7b1)| ✅ | ✅ |
+|[OPT-125m](https://huggingface.co/facebook/opt-125m), [OPT-350m](https://huggingface.co/facebook/opt-350m), [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b), [OPT-13B](https://huggingface.co/facebook/opt-13b)| ✅ | ✅ |

### Code generation models
| model name | INT8 | INT4|
@@ -45,7 +46,6 @@ ninja
### 2. Convert LLM
LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert a model yourself by following the steps below:


```bash
# download fp32 model (e.g., LLAMA2) from Hugging Face
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
# ...
```
@@ -59,6 +59,7 @@ compile_quant(quant_dolly quant_model.cpp dolly gptneox)
compile_quant(quant_llama quant_model.cpp llama llama)
compile_quant(quant_mpt quant_model.cpp mpt mpt)
compile_quant(quant_starcoder quant_model.cpp starcoder starcoder)
+compile_quant(quant_opt quant_model.cpp opt opt)
compile_quant(quant_bloom quant_model.cpp bloom bloom)

# all models running
@@ -80,4 +81,5 @@ compile_run(run_dolly main_run.cpp dolly gptneox)
compile_run(run_llama main_run.cpp llama llama)
compile_run(run_mpt main_run.cpp mpt mpt)
compile_run(run_starcoder main_run.cpp starcoder starcoder)
+compile_run(run_opt main_run.cpp opt opt)
compile_run(run_bloom main_run.cpp bloom bloom)
@@ -19,4 +19,5 @@ add_subdirectory(mpt)
add_subdirectory(gptneox)
add_subdirectory(starcoder)
add_subdirectory(falcon)
-add_subdirectory(bloom)
+add_subdirectory(opt)
+add_subdirectory(bloom)
@@ -211,9 +211,13 @@ struct model_file_loader {
    file.read_raw(&hparams.alibi_bias_max, sizeof(float));
    file.read_raw(&hparams.clip_qkv, sizeof(float));
    hparams.par_res = file.read_u32();
+    hparams.word_embed_proj_dim = file.read_u32();
+    hparams.do_layer_norm_before = bool(file.read_u32());
  }
  void read_vocab() {
    vocab.id_to_token.resize(hparams.n_vocab);
+    file.read_raw(&vocab.bos_token_id, sizeof(model_vocab::id));
+    file.read_raw(&vocab.eos_token_id, sizeof(model_vocab::id));

    for (uint32_t i = 0; i < hparams.n_vocab; i++) {
      uint32_t len = file.read_u32();
@@ -230,6 +234,7 @@ struct model_file_loader {
      tok_score.tok = std::move(word);
      tok_score.score = score;
    }
+
  }
  void read_tensor_metadata(size_t file_idx, model_load_tensors_map& tensors_map) {
    while (file.tell() < file.size) {
@@ -316,12 +321,16 @@ struct model_file_saver {
    file.write_raw(&hparams.alibi_bias_max, sizeof(float));
    file.write_raw(&hparams.clip_qkv, sizeof(float));
    file.write_u32(hparams.par_res);
+    file.write_u32(hparams.word_embed_proj_dim);
+    file.write_u32(static_cast<int>(hparams.do_layer_norm_before));
  }
  void write_vocab() {
    if (any_file_loader->file_version == MODEL_FILE_VERSION_NE) {
      fprintf(stderr, "model.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
    }
    uint32_t n_vocab = any_file_loader->hparams.n_vocab;
+    file.write_raw(&(any_file_loader->vocab.bos_token_id), sizeof(model_vocab::id));
+    file.write_raw(&(any_file_loader->vocab.eos_token_id), sizeof(model_vocab::id));
    for (uint32_t i = 0; i < n_vocab; i++) {
      const auto& token_score = any_file_loader->vocab.id_to_token.at(i);
      file.write_u32((uint32_t)token_score.tok.size());
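
The reads in `read_hparams`/`read_vocab` above and the writes here must stay in byte-for-byte sync, since the format has no field tags. Below is a stand-alone sketch of the round-trip for the four fields this commit adds, using plain `fstream` rather than the repo's `model_file` helpers; the widths follow the hunks, while `model_vocab::id` being a 32-bit int and the opt-350m-style example values are assumptions:

```cpp
#include <cstdint>
#include <cstdio>
#include <fstream>

// stand-ins for the four new fields; widths follow the diff above
// (u32 for the hparams pair; model_vocab::id assumed to be int32_t)
struct opt_extras {
  uint32_t word_embed_proj_dim;
  uint32_t do_layer_norm_before;  // serialized as u32, used as a bool
  int32_t bos_token_id;
  int32_t eos_token_id;
};

template <typename T>
static void write_pod(std::ofstream& f, const T& v) {
  f.write(reinterpret_cast<const char*>(&v), sizeof(T));
}

template <typename T>
static void read_pod(std::ifstream& f, T& v) {
  f.read(reinterpret_cast<char*>(&v), sizeof(T));
}

int main() {
  // example values: 512/false resemble opt-350m, 2/2 its bos/eos ids
  opt_extras out = {512u, 0u, 2, 2};
  {
    std::ofstream f("extras.bin", std::ios::binary);
    write_pod(f, out.word_embed_proj_dim);  // same order as write_hparams
    write_pod(f, out.do_layer_norm_before);
    write_pod(f, out.bos_token_id);         // same order as write_vocab
    write_pod(f, out.eos_token_id);
  }
  opt_extras in = {0u, 0u, -1, -1};
  std::ifstream f("extras.bin", std::ios::binary);
  read_pod(f, in.word_embed_proj_dim);      // mirrors read_hparams
  read_pod(f, in.do_layer_norm_before);
  read_pod(f, in.bos_token_id);             // mirrors read_vocab
  read_pod(f, in.eos_token_id);
  std::printf("proj_dim=%u ln_before=%u bos=%d eos=%d\n",
              in.word_embed_proj_dim, in.do_layer_norm_before,
              in.bos_token_id, in.eos_token_id);
  return 0;
}
```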
@@ -410,7 +419,11 @@ struct model_model_loader {
    if (it == tensors_map.name_to_idx.end()) {
      it = tensors_map.name_to_idx.find("transformer.word_embeddings.weight");
      if (it == tensors_map.name_to_idx.end()) {
-        throw std::string("missing tok_embeddings.weight");
+        it = tensors_map.name_to_idx.find("model.decoder.embed_tokens.weight");
+        if (it != tensors_map.name_to_idx.end()) return 1;  // hacky solution for OPT loading
+        if (it == tensors_map.name_to_idx.end()) {
+          throw std::string("missing tok_embeddings.weight");
+        }
      }
    }
  }
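
As the comment admits, the OPT fallback is wedged into an already deeply nested lookup. The name-probing part flattens naturally into a loop over candidate tensor names; here is a sketch of that refactor, illustrative only: it is not part of the commit, and the surrounding function also derives its return value from the tensor it finds, which is omitted here:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include <unordered_map>

// stand-in for tensors_map.name_to_idx: tensor name -> index in the file
typedef std::unordered_map<std::string, size_t> name_index;

// probe the token-embedding tensor under each known name; the three
// candidates are the ones visible in the hunk above
size_t find_tok_embeddings(const name_index& name_to_idx) {
  static const char* const candidates[] = {
      "tok_embeddings.weight",               // llama-style
      "transformer.word_embeddings.weight",  // bloom-style
      "model.decoder.embed_tokens.weight",   // opt-style (new in this commit)
  };
  for (const char* const name : candidates) {
    name_index::const_iterator it = name_to_idx.find(name);
    if (it != name_to_idx.end()) return it->second;
  }
  throw std::runtime_error("missing tok_embeddings.weight");
}

int main() {
  name_index idx;
  idx["model.decoder.embed_tokens.weight"] = 7;
  std::printf("found at index %zu\n", find_tok_embeddings(idx));
  return 0;
}
```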
@@ -42,9 +42,9 @@
#include "models/model_utils/util.h"

#define MODEL_MAX_NORM 4
-#define MODEL_MAX_ATTN 4
+#define MODEL_MAX_ATTN 8
#define MODEL_MAX_FFN 6
-#define MODEL_MAX_OTHERS 6
+#define MODEL_MAX_OTHERS 7

#define MODEL_USE_SCRATCH
#define MODEL_MAX_SCRATCH_BUFFERS 16
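
A plausible reading of the bumped limits (an assumption, since the OPT graph file itself is not shown in this excerpt): OPT's attention block carries eight tensors per layer because each of the q/k/v/out projections ships with a bias, which is exactly what `MODEL_MAX_ATTN 8` leaves room for:

```cpp
// assumed tensor layout, mirroring the HF OPT checkpoint naming: every
// attention projection has both a weight and a bias, so 8 tensors per layer
static const char* const kOptAttnTensors[] = {
    "self_attn.q_proj.weight",   "self_attn.q_proj.bias",
    "self_attn.k_proj.weight",   "self_attn.k_proj.bias",
    "self_attn.v_proj.weight",   "self_attn.v_proj.bias",
    "self_attn.out_proj.weight", "self_attn.out_proj.bias",
};
static_assert(sizeof(kOptAttnTensors) / sizeof(kOptAttnTensors[0]) == 8,
              "MODEL_MAX_ATTN must cover every attention tensor");
```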
@@ -64,8 +64,10 @@
#ifdef __cplusplus
extern "C" {
#endif
-enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_GPTJ, MODEL_MPT, MODEL_GPTNEOX, MODEL_STARCODER, MODEL_FALCON,
-                   MODEL_BLOOM };
+enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_GPTJ, MODEL_MPT, MODEL_GPTNEOX, MODEL_STARCODER, MODEL_FALCON,
+                   MODEL_OPT, MODEL_BLOOM };

static const size_t MB = 1024 * 1024;

@@ -101,10 +103,12 @@ struct model_hparams {
  uint32_t n_layer = 32;
  uint32_t n_rot = 64;
  enum ne_ftype ftype = NE_FTYPE_MOSTLY_F16;
-  int32_t max_seq_len = 0;   // for mpt
-  float alibi_bias_max = 0;  // for mpt
-  float clip_qkv = 0;        // for mpt
-  int32_t par_res = 1;       // for neox 1 = true, 0 = false
+  int32_t max_seq_len = 0;            // for mpt
+  float alibi_bias_max = 0;           // for mpt
+  float clip_qkv = 0;                 // for mpt
+  int32_t par_res = 1;                // for neox 1 = true, 0 = false
+  uint32_t word_embed_proj_dim = 0;   // for opt
+  bool do_layer_norm_before = false;  // for opt

  bool operator!=(const model_hparams& other) const {
    return static_cast<bool>(memcmp(this, &other, sizeof(model_hparams)));
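
Both new fields track quirks of the OPT family as described by the Hugging Face reference implementation: opt-350m embeds tokens at `word_embed_proj_dim` (512) and projects up to the hidden size (1024), and it is also the size that applies layer norm after the block rather than before (`do_layer_norm_before = false`). A schematic of how a graph builder might branch on them; the names and structure are illustrative, not this repo's kernels:

```cpp
#include <cstdint>
#include <cstdio>

struct hp_view {
  uint32_t n_embd;               // hidden size
  uint32_t word_embed_proj_dim;  // embedding width; smaller on opt-350m
  bool do_layer_norm_before;     // pre-LN (most sizes) vs post-LN (opt-350m)
};

void describe(const hp_view& hp) {
  if (hp.word_embed_proj_dim != hp.n_embd) {
    // a real graph would insert project_in/project_out matmuls here
    std::printf("embeddings projected %u -> %u\n", hp.word_embed_proj_dim, hp.n_embd);
  }
  std::puts(hp.do_layer_norm_before ? "pre-LN" : "post-LN");
}

int main() {
  hp_view opt_350m = {1024u, 512u, false};  // values from the HF opt-350m config
  describe(opt_350m);
  return 0;
}
```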
@@ -186,6 +190,8 @@ struct model_vocab {

  std::unordered_map<token, id> token_to_id;
  std::vector<token_score> id_to_token;
+  id bos_token_id = -1;  // The default value is -1
+  id eos_token_id = -1;  // The default value is -1
};

// reference: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
@@ -350,7 +356,7 @@ class model_name_to_arch {
  model_name_to_arch() {}
  // update this table if a new cpp model is added
  std::unordered_map<std::string, model_archs> name2arch_ = {
-      {"unknown", MODEL_UNKNOWN}, {"llama", MODEL_LLAMA}, {"gptj", MODEL_GPTJ}, {"mpt", MODEL_MPT},
+      {"unknown", MODEL_UNKNOWN}, {"llama", MODEL_LLAMA}, {"gptj", MODEL_GPTJ}, {"mpt", MODEL_MPT}, {"opt", MODEL_OPT},
      {"gptneox", MODEL_GPTNEOX}, {"dolly", MODEL_GPTNEOX}, {"starcoder", MODEL_STARCODER}, {"falcon", MODEL_FALCON},
      {"bloom", MODEL_BLOOM},
  };
@@ -286,7 +286,7 @@ static std::vector<model_vocab::id> model_tokenize(const model_vocab& vocab, con
  }

  if (bos) {
-    output.push_back(model_token_bos());
+    output.push_back(vocab.bos_token_id);
  }

  tokenizer.tokenize(text, output);
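
Beyond tokenization, carrying the ids in `model_vocab` lets generation stop on the model's own EOS id instead of a hard-coded constant. A minimal sketch follows; the sampler callback and the loop are hypothetical, not this file's API, and the OPT ids in `main` come from the HF config:

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

typedef int32_t vocab_id;  // assumption: model_vocab::id is a 32-bit int

struct vocab_ids {
  vocab_id bos_token_id;
  vocab_id eos_token_id;
};

// prepend the model's BOS, then collect tokens until its EOS appears
std::vector<vocab_id> generate(const vocab_ids& vocab,
                               const std::function<vocab_id()>& sample_next,
                               std::size_t max_tokens) {
  std::vector<vocab_id> out;
  out.push_back(vocab.bos_token_id);
  for (std::size_t i = 0; i < max_tokens; ++i) {
    vocab_id tok = sample_next();
    if (tok == vocab.eos_token_id) break;  // per-model id, e.g. 2 for OPT
    out.push_back(tok);
  }
  return out;
}

int main() {
  vocab_ids v = {2, 2};  // OPT-style ids (from the HF config)
  std::vector<vocab_id> ids = generate(v, [] { return (vocab_id)2; }, 16);
  return ids.size() == 1 ? 0 : 1;  // first sample is EOS, so only BOS remains
}
```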
@@ -21,12 +21,15 @@
enum mpt_model {
  MPT_UNKNOWN,
  MPT_7B,
+  MPT_30B,
};

static const model_scratch mpt_mem_req(int n_layers) {
  switch (n_layers) {
    case 32:
      return {2048ull * MB, 2048ull * MB, 4096ull * MB, 3072ull * MB};
+    case 48:
+      return {4096ull * MB, 4096ull * MB, 8192ull * MB, 6144ull * MB};
    // TODO(hengyu): add more variants besides 6B
    default:
      MODEL_ASSERT(false);
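
For reference, MPT-30B has 48 transformer layers, so the new `case 48` selects the doubled scratch sizes. A stand-alone restatement of the same lookup follows; `model_scratch`'s real field names are not visible in this excerpt, so the ones below are assumptions:

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>

static const uint64_t MB = 1024ull * 1024ull;

struct scratch_sizes {  // stand-in for model_scratch
  uint64_t scratch0, scratch1, eval, kv;
};

// 32 layers -> MPT-7B, 48 layers -> MPT-30B (layer count as variant proxy)
scratch_sizes mpt_mem_req(int n_layers) {
  switch (n_layers) {
    case 32:
      return {2048ull * MB, 2048ull * MB, 4096ull * MB, 3072ull * MB};
    case 48:
      return {4096ull * MB, 4096ull * MB, 8192ull * MB, 6144ull * MB};
    default:
      throw std::runtime_error("unsupported MPT layer count");
  }
}

int main() {
  std::printf("MPT-30B: %llu MB eval scratch\n",
              (unsigned long long)(mpt_mem_req(48).eval / MB));
  return 0;
}
```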
@@ -0,0 +1,19 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(TARGET opt)
add_library_w_warning(${TARGET} opt.cpp opt_utils.cpp ${MODEL_UTILS_SOURCE})
target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC ne_layers ${LLAMA_EXTRA_LIBS} jblas::jblas)
