Skip to content

Commit

Permalink
Merge remote-tracking branch 'ilavrenov_upstream/ct-beam-search' into n_support
Browse files Browse the repository at this point in the history
  • Loading branch information
iefode committed Jun 4, 2024
2 parents 1128792 + 76148c5 commit 55448a1
Show file tree
Hide file tree
Showing 19 changed files with 709 additions and 177 deletions.
1 change: 1 addition & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "thirdparty/openvino_tokenizers"]
path = thirdparty/openvino_tokenizers
url = https://github.com/openvinotoolkit/openvino_tokenizers.git
branch = master
34 changes: 34 additions & 0 deletions text_generation/causal_lm/cpp/continuous_batching/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
FROM ubuntu:22.04

# Number of parallel build jobs; pass at build time, e.g.:
#   docker build --build-arg JOBS=$(nproc) .
ARG JOBS
WORKDIR /workspace
# Base tooling. NOTE(review): `wget` added here — it is used below to download
# the ShareGPT dataset but is not present in the ubuntu:22.04 base image.
RUN apt-get update -y && apt-get install -y python3-pip python3-venv git wget

# Install OpenVINO (built from source; only the submodules required for a
# CPU-only build are initialized to keep the clone small)
RUN git clone https://github.com/openvinotoolkit/openvino.git && \
    cd /workspace/openvino && \
    git submodule update --init -- /workspace/openvino/thirdparty/xbyak /workspace/openvino/thirdparty/pugixml /workspace/openvino/thirdparty/open_model_zoo \
    /workspace/openvino/thirdparty/protobuf /workspace/openvino/thirdparty/snappy /workspace/openvino/thirdparty/telemetry /workspace/openvino/src/plugins/intel_cpu/thirdparty/mlas \
    /workspace/openvino/src/plugins/intel_cpu/thirdparty/onednn /workspace/openvino/src/bindings/python/thirdparty/pybind11 && cd -

RUN /workspace/openvino/install_build_dependencies.sh
RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt
# CPU-only configuration: GPU/NPU plugins, AUTO/HETERO/BATCH devices and all
# frontends except TensorFlow are disabled to shorten the build.
RUN cmake -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_WHEEL=ON -DENABLE_CPPLINT=OFF -DENABLE_SAMPLES=OFF -DENABLE_INTEL_GPU=OFF \
    -DENABLE_INTEL_NPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF -DENABLE_OV_TF_FRONTEND=ON -DENABLE_OV_ONNX_FRONTEND=OFF \
    -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -S /workspace/openvino -B /workspace/openvino_build
RUN cmake --build /workspace/openvino_build --parallel $JOBS
RUN cmake -P /workspace/openvino_build/cmake_install.cmake
RUN python3 -m pip install /workspace/openvino_build/wheels/openvino-2024*
# Let the continuous-batching CMake build find the freshly built OpenVINO.
ENV OpenVINO_DIR=/workspace/openvino_build

# Download dataset (used by the benchmarking/test scripts)
RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Build continuous batching library
RUN git clone --branch request_rate https://github.com/mzegla/openvino.genai.git && cd /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching && \
    git submodule update --remote --init && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ -j $JOBS

# Install test dependencies (tokenizers nightly wheel + pytest requirements;
# torch is pulled from the CPU-only index to avoid CUDA downloads)
RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt
# Make the built Python bindings importable by the tests.
ENV PYTHONPATH=/workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build/python
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cxxopts.hpp>

#include "continuous_batching_pipeline.hpp"
#include "tokenizer.hpp"

void print_generation_result(const GenerationResult& generation_result) {
for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) {
Expand Down Expand Up @@ -46,13 +47,14 @@ int main(int argc, char* argv[]) try {
std::vector<std::string> prompt_examples = {
"What is OpenVINO?",
"How are you?",
"OpenVINO is",
"What is the current time",
"What is your name?",
"Tell me something about Canada",
"What is OpenVINO?",
};

std::vector<GenerationConfig> sampling_params_examples {
// GenerationConfig::beam_search(),
// GenerationConfig::greedy(),
GenerationConfig::beam_search(),
GenerationConfig::greedy(),
GenerationConfig::multinomial(),
};

Expand All @@ -65,7 +67,7 @@ int main(int argc, char* argv[]) try {
}

// Perform the inference

SchedulerConfig scheduler_config {
// batch size
.max_num_batched_tokens = 32,
Expand All @@ -83,21 +85,20 @@ int main(int argc, char* argv[]) try {

for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) {
const GenerationResult & generation_result = generation_results[request_id];

std::cout << "Question: " << prompts[request_id] << std::endl;
switch (generation_result.m_status)
{
case GenerationResultStatus::FINISHED:
case GenerationStatus::FINISHED:
print_generation_result(generation_result);
break;
case GenerationResultStatus::IGNORED:
case GenerationStatus::IGNORED:
std::cout << "Request was ignored due to lack of memory." <<std::endl;
if (generation_result.m_generation_ids.size() > 0) {
std::cout << "Partial result:" << std::endl;
print_generation_result(generation_result);
}
break;
case GenerationResultStatus::ABORTED:
case GenerationStatus::DROPPED_BY_PIPELINE:
std::cout << "Request was aborted." <<std::endl;
if (generation_result.m_generation_ids.size() > 0) {
std::cout << "Partial result:" << std::endl;
Expand All @@ -109,7 +110,6 @@ int main(int argc, char* argv[]) try {
}
std::cout << std::endl;
}

} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
return EXIT_FAILURE;
Expand Down
Loading

0 comments on commit 55448a1

Please sign in to comment.