[CPP Graph] add opt cpp graph and chat application (#133)
zhentaoyu committed Sep 6, 2023
1 parent 1d2b4f0 commit 578162a
Showing 19 changed files with 941 additions and 13 deletions.
@@ -24,6 +24,7 @@ We support the following models:
|[MPT-7B](https://huggingface.co/mosaicml/mpt-7b), [MPT-30B](https://huggingface.co/mosaicml/mpt-30b)| ✅ | ✅ |
|[Falcon-7B](https://huggingface.co/tiiuae/falcon-7b), [Falcon-40B](https://huggingface.co/tiiuae/falcon-40b)| ✅ | ✅ |
|[BLOOM-7B](https://huggingface.co/bigscience/bloomz-7b1)| ✅ | ✅ |
+|[OPT-125m](https://huggingface.co/facebook/opt-125m), [OPT-350m](https://huggingface.co/facebook/opt-350m), [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b), [OPT-13B](https://huggingface.co/facebook/opt-13b)| ✅ | ✅ |

### Code generation models
| model name | INT8 | INT4|
@@ -45,7 +46,6 @@ ninja
### 2. Convert LLM
LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert a model yourself by following the steps below:


```bash
# download fp32 model (e.g., LLAMA2) from Hugging Face
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
# ...
```
@@ -59,6 +59,7 @@ compile_quant(quant_dolly quant_model.cpp dolly gptneox)
compile_quant(quant_llama quant_model.cpp llama llama)
compile_quant(quant_mpt quant_model.cpp mpt mpt)
compile_quant(quant_starcoder quant_model.cpp starcoder starcoder)
+compile_quant(quant_opt quant_model.cpp opt opt)
compile_quant(quant_bloom quant_model.cpp bloom bloom)

# all models running
@@ -80,4 +81,5 @@ compile_run(run_dolly main_run.cpp dolly gptneox)
compile_run(run_llama main_run.cpp llama llama)
compile_run(run_mpt main_run.cpp mpt mpt)
compile_run(run_starcoder main_run.cpp starcoder starcoder)
+compile_run(run_opt main_run.cpp opt opt)
compile_run(run_bloom main_run.cpp bloom bloom)
@@ -19,4 +19,5 @@ add_subdirectory(mpt)
add_subdirectory(gptneox)
add_subdirectory(starcoder)
add_subdirectory(falcon)
-add_subdirectory(bloom)
+add_subdirectory(opt)
+add_subdirectory(bloom)
@@ -211,9 +211,13 @@ struct model_file_loader {
    file.read_raw(&hparams.alibi_bias_max, sizeof(float));
    file.read_raw(&hparams.clip_qkv, sizeof(float));
    hparams.par_res = file.read_u32();
+    hparams.word_embed_proj_dim = file.read_u32();
+    hparams.do_layer_norm_before = bool(file.read_u32());
  }
  void read_vocab() {
    vocab.id_to_token.resize(hparams.n_vocab);
+    file.read_raw(&vocab.bos_token_id, sizeof(model_vocab::id));
+    file.read_raw(&vocab.eos_token_id, sizeof(model_vocab::id));

    for (uint32_t i = 0; i < hparams.n_vocab; i++) {
      uint32_t len = file.read_u32();
@@ -230,6 +234,7 @@ struct model_file_loader {
      tok_score.tok = std::move(word);
      tok_score.score = score;
    }
+
  }
  void read_tensor_metadata(size_t file_idx, model_load_tensors_map& tensors_map) {
    while (file.tell() < file.size) {
@@ -316,12 +321,16 @@ struct model_file_saver {
    file.write_raw(&hparams.alibi_bias_max, sizeof(float));
    file.write_raw(&hparams.clip_qkv, sizeof(float));
    file.write_u32(hparams.par_res);
+    file.write_u32(hparams.word_embed_proj_dim);
+    file.write_u32(static_cast<int>(hparams.do_layer_norm_before));
  }
  void write_vocab() {
    if (any_file_loader->file_version == MODEL_FILE_VERSION_NE) {
      fprintf(stderr, "model.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
    }
    uint32_t n_vocab = any_file_loader->hparams.n_vocab;
+    file.write_raw(&(any_file_loader->vocab.bos_token_id), sizeof(model_vocab::id));
+    file.write_raw(&(any_file_loader->vocab.eos_token_id), sizeof(model_vocab::id));
    for (uint32_t i = 0; i < n_vocab; i++) {
      const auto& token_score = any_file_loader->vocab.id_to_token.at(i);
      file.write_u32((uint32_t)token_score.tok.size());
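
The reads in `read_hparams`/`read_vocab` above and the writes here must stay in byte-for-byte sync, since the format has no field tags. Below is a stand-alone sketch of the round-trip for the four fields this commit adds, using plain `fstream` rather than the repo's `model_file` helpers; the widths follow the hunks, while `model_vocab::id` being a 32-bit int and the opt-350m-style example values are assumptions:

```cpp
#include <cstdint>
#include <cstdio>
#include <fstream>

// stand-ins for the four new fields; widths follow the diff above
// (u32 for the hparams pair; model_vocab::id assumed to be int32_t)
struct opt_extras {
  uint32_t word_embed_proj_dim;
  uint32_t do_layer_norm_before;  // serialized as u32, used as a bool
  int32_t bos_token_id;
  int32_t eos_token_id;
};

template <typename T>
static void write_pod(std::ofstream& f, const T& v) {
  f.write(reinterpret_cast<const char*>(&v), sizeof(T));
}

template <typename T>
static void read_pod(std::ifstream& f, T& v) {
  f.read(reinterpret_cast<char*>(&v), sizeof(T));
}

int main() {
  // example values: 512/false resemble opt-350m, 2/2 its bos/eos ids
  opt_extras out = {512u, 0u, 2, 2};
  {
    std::ofstream f("extras.bin", std::ios::binary);
    write_pod(f, out.word_embed_proj_dim);  // same order as write_hparams
    write_pod(f, out.do_layer_norm_before);
    write_pod(f, out.bos_token_id);         // same order as write_vocab
    write_pod(f, out.eos_token_id);
  }
  opt_extras in = {0u, 0u, -1, -1};
  std::ifstream f("extras.bin", std::ios::binary);
  read_pod(f, in.word_embed_proj_dim);      // mirrors read_hparams
  read_pod(f, in.do_layer_norm_before);
  read_pod(f, in.bos_token_id);             // mirrors read_vocab
  read_pod(f, in.eos_token_id);
  std::printf("proj_dim=%u ln_before=%u bos=%d eos=%d\n",
              in.word_embed_proj_dim, in.do_layer_norm_before,
              in.bos_token_id, in.eos_token_id);
  return 0;
}
```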
@@ -410,7 +419,11 @@ struct model_model_loader {
    if (it == tensors_map.name_to_idx.end()) {
      it = tensors_map.name_to_idx.find("transformer.word_embeddings.weight");
      if (it == tensors_map.name_to_idx.end()) {
-        throw std::string("missing tok_embeddings.weight");
+        it = tensors_map.name_to_idx.find("model.decoder.embed_tokens.weight");
+        if (it != tensors_map.name_to_idx.end()) return 1;  // hacky solution for OPT loading
+        if (it == tensors_map.name_to_idx.end()) {
+          throw std::string("missing tok_embeddings.weight");
+        }
      }
    }
  }
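
As the comment admits, the OPT fallback is wedged into an already deeply nested lookup. The name-probing part flattens naturally into a loop over candidate tensor names; here is a sketch of that refactor, illustrative only: it is not part of the commit, and the surrounding function also derives its return value from the tensor it finds, which is omitted here:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include <unordered_map>

// stand-in for tensors_map.name_to_idx: tensor name -> index in the file
typedef std::unordered_map<std::string, size_t> name_index;

// probe the token-embedding tensor under each known name; the three
// candidates are the ones visible in the hunk above
size_t find_tok_embeddings(const name_index& name_to_idx) {
  static const char* const candidates[] = {
      "tok_embeddings.weight",               // llama-style
      "transformer.word_embeddings.weight",  // bloom-style
      "model.decoder.embed_tokens.weight",   // opt-style (new in this commit)
  };
  for (const char* const name : candidates) {
    name_index::const_iterator it = name_to_idx.find(name);
    if (it != name_to_idx.end()) return it->second;
  }
  throw std::runtime_error("missing tok_embeddings.weight");
}

int main() {
  name_index idx;
  idx["model.decoder.embed_tokens.weight"] = 7;
  std::printf("found at index %zu\n", find_tok_embeddings(idx));
  return 0;
}
```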
@@ -42,9 +42,9 @@
#include "models/model_utils/util.h"

#define MODEL_MAX_NORM 4
-#define MODEL_MAX_ATTN 4
+#define MODEL_MAX_ATTN 8
#define MODEL_MAX_FFN 6
-#define MODEL_MAX_OTHERS 6
+#define MODEL_MAX_OTHERS 7

#define MODEL_USE_SCRATCH
#define MODEL_MAX_SCRATCH_BUFFERS 16
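
A plausible reading of the bumped limits (an assumption, since the OPT graph file itself is not shown in this excerpt): OPT's attention block carries eight tensors per layer because each of the q/k/v/out projections ships with a bias, which is exactly what `MODEL_MAX_ATTN 8` leaves room for:

```cpp
// assumed tensor layout, mirroring the HF OPT checkpoint naming: every
// attention projection has both a weight and a bias, so 8 tensors per layer
static const char* const kOptAttnTensors[] = {
    "self_attn.q_proj.weight",   "self_attn.q_proj.bias",
    "self_attn.k_proj.weight",   "self_attn.k_proj.bias",
    "self_attn.v_proj.weight",   "self_attn.v_proj.bias",
    "self_attn.out_proj.weight", "self_attn.out_proj.bias",
};
static_assert(sizeof(kOptAttnTensors) / sizeof(kOptAttnTensors[0]) == 8,
              "MODEL_MAX_ATTN must cover every attention tensor");
```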
@@ -64,8 +64,10 @@
#ifdef __cplusplus
extern "C" {
#endif
-enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_GPTJ, MODEL_MPT, MODEL_GPTNEOX, MODEL_STARCODER, MODEL_FALCON,
-                   MODEL_BLOOM };
+enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_GPTJ, MODEL_MPT, MODEL_GPTNEOX, MODEL_STARCODER, MODEL_FALCON,
+                   MODEL_OPT, MODEL_BLOOM };

static const size_t MB = 1024 * 1024;

@@ -101,10 +103,12 @@ struct model_hparams {
  uint32_t n_layer = 32;
  uint32_t n_rot = 64;
  enum ne_ftype ftype = NE_FTYPE_MOSTLY_F16;
-  int32_t max_seq_len = 0;   // for mpt
-  float alibi_bias_max = 0;  // for mpt
-  float clip_qkv = 0;        // for mpt
-  int32_t par_res = 1;       // for neox 1 = true, 0 = false
+  int32_t max_seq_len = 0;            // for mpt
+  float alibi_bias_max = 0;           // for mpt
+  float clip_qkv = 0;                 // for mpt
+  int32_t par_res = 1;                // for neox 1 = true, 0 = false
+  uint32_t word_embed_proj_dim = 0;   // for opt
+  bool do_layer_norm_before = false;  // for opt

  bool operator!=(const model_hparams& other) const {
    return static_cast<bool>(memcmp(this, &other, sizeof(model_hparams)));
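
Both new fields track quirks of the OPT family as described by the Hugging Face reference implementation: opt-350m embeds tokens at `word_embed_proj_dim` (512) and projects up to the hidden size (1024), and it is also the size that applies layer norm after the block rather than before (`do_layer_norm_before = false`). A schematic of how a graph builder might branch on them; the names and structure are illustrative, not this repo's kernels:

```cpp
#include <cstdint>
#include <cstdio>

struct hp_view {
  uint32_t n_embd;               // hidden size
  uint32_t word_embed_proj_dim;  // embedding width; smaller on opt-350m
  bool do_layer_norm_before;     // pre-LN (most sizes) vs post-LN (opt-350m)
};

void describe(const hp_view& hp) {
  if (hp.word_embed_proj_dim != hp.n_embd) {
    // a real graph would insert project_in/project_out matmuls here
    std::printf("embeddings projected %u -> %u\n", hp.word_embed_proj_dim, hp.n_embd);
  }
  std::puts(hp.do_layer_norm_before ? "pre-LN" : "post-LN");
}

int main() {
  hp_view opt_350m = {1024u, 512u, false};  // values from the HF opt-350m config
  describe(opt_350m);
  return 0;
}
```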
@@ -186,6 +190,8 @@ struct model_vocab {

  std::unordered_map<token, id> token_to_id;
  std::vector<token_score> id_to_token;
+  id bos_token_id = -1;  // The default value is -1
+  id eos_token_id = -1;  // The default value is -1
};

// reference: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
@@ -350,7 +356,7 @@ class model_name_to_arch {
  model_name_to_arch() {}
  // update this table if a new cpp model is added
  std::unordered_map<std::string, model_archs> name2arch_ = {
-      {"unknown", MODEL_UNKNOWN}, {"llama", MODEL_LLAMA}, {"gptj", MODEL_GPTJ}, {"mpt", MODEL_MPT},
+      {"unknown", MODEL_UNKNOWN}, {"llama", MODEL_LLAMA}, {"gptj", MODEL_GPTJ}, {"mpt", MODEL_MPT}, {"opt", MODEL_OPT},
      {"gptneox", MODEL_GPTNEOX}, {"dolly", MODEL_GPTNEOX}, {"starcoder", MODEL_STARCODER}, {"falcon", MODEL_FALCON},
      {"bloom", MODEL_BLOOM},
  };
@@ -286,7 +286,7 @@ static std::vector<model_vocab::id> model_tokenize(const model_vocab& vocab, con
  }

  if (bos) {
-    output.push_back(model_token_bos());
+    output.push_back(vocab.bos_token_id);
  }

  tokenizer.tokenize(text, output);
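
Beyond tokenization, carrying the ids in `model_vocab` lets generation stop on the model's own EOS id instead of a hard-coded constant. A minimal sketch follows; the sampler callback and the loop are hypothetical, not this file's API, and the OPT ids in `main` come from the HF config:

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

typedef int32_t vocab_id;  // assumption: model_vocab::id is a 32-bit int

struct vocab_ids {
  vocab_id bos_token_id;
  vocab_id eos_token_id;
};

// prepend the model's BOS, then collect tokens until its EOS appears
std::vector<vocab_id> generate(const vocab_ids& vocab,
                               const std::function<vocab_id()>& sample_next,
                               std::size_t max_tokens) {
  std::vector<vocab_id> out;
  out.push_back(vocab.bos_token_id);
  for (std::size_t i = 0; i < max_tokens; ++i) {
    vocab_id tok = sample_next();
    if (tok == vocab.eos_token_id) break;  // per-model id, e.g. 2 for OPT
    out.push_back(tok);
  }
  return out;
}

int main() {
  vocab_ids v = {2, 2};  // OPT-style ids (from the HF config)
  std::vector<vocab_id> ids = generate(v, [] { return (vocab_id)2; }, 16);
  return ids.size() == 1 ? 0 : 1;  // first sample is EOS, so only BOS remains
}
```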
@@ -21,12 +21,15 @@
enum mpt_model {
  MPT_UNKNOWN,
  MPT_7B,
+  MPT_30B,
};

static const model_scratch mpt_mem_req(int n_layers) {
  switch (n_layers) {
    case 32:
      return {2048ull * MB, 2048ull * MB, 4096ull * MB, 3072ull * MB};
+    case 48:
+      return {4096ull * MB, 4096ull * MB, 8192ull * MB, 6144ull * MB};
    // TODO(hengyu): add more variants besides 6B
    default:
      MODEL_ASSERT(false);
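
For reference, MPT-30B has 48 transformer layers, so the new `case 48` selects the doubled scratch sizes. A stand-alone restatement of the same lookup follows; `model_scratch`'s real field names are not visible in this excerpt, so the ones below are assumptions:

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>

static const uint64_t MB = 1024ull * 1024ull;

struct scratch_sizes {  // stand-in for model_scratch
  uint64_t scratch0, scratch1, eval, kv;
};

// 32 layers -> MPT-7B, 48 layers -> MPT-30B (layer count as variant proxy)
scratch_sizes mpt_mem_req(int n_layers) {
  switch (n_layers) {
    case 32:
      return {2048ull * MB, 2048ull * MB, 4096ull * MB, 3072ull * MB};
    case 48:
      return {4096ull * MB, 4096ull * MB, 8192ull * MB, 6144ull * MB};
    default:
      throw std::runtime_error("unsupported MPT layer count");
  }
}

int main() {
  std::printf("MPT-30B: %llu MB eval scratch\n",
              (unsigned long long)(mpt_mem_req(48).eval / MB));
  return 0;
}
```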
@@ -0,0 +1,19 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(TARGET opt)
add_library_w_warning(${TARGET} opt.cpp opt_utils.cpp ${MODEL_UTILS_SOURCE})
target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC ne_layers ${LLAMA_EXTRA_LIBS} jblas::jblas)
