Add batch 2&4 to llm performance test #11126

Closed
wants to merge 4 commits
54 changes: 27 additions & 27 deletions .github/workflows/llm_performance_tests.yml
@@ -10,26 +10,26 @@ permissions:

# Controls when the action will run.
on:
- schedule:
- - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
+ # schedule:
+ # - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
# please uncomment it for PR tests
- # pull_request:
- # branches: [main]
- # paths:
- # - ".github/workflows/llm_performance_tests.yml"
- # - "python/llm/test/benchmark/**"
- # - "python/llm/dev/benchmark/all-in-one/**"
+ pull_request:
+ branches: [main]
+ paths:
+ - ".github/workflows/llm_performance_tests.yml"
+ - "python/llm/test/benchmark/**"
+ - "python/llm/dev/benchmark/all-in-one/**"
workflow_dispatch:
workflow_call:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
- # llm-cpp-build: # please uncomment it for PR tests
- # uses: ./.github/workflows/llm-binary-build.yml
+ llm-cpp-build: # please uncomment it for PR tests
+ uses: ./.github/workflows/llm-binary-build.yml

llm-performance-test-on-arc:
- if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
- # needs: llm-cpp-build # please uncomment it for PR tests
+ # if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+ needs: llm-cpp-build # please uncomment it for PR tests
strategy:
fail-fast: false
matrix:
@@ -63,23 +63,23 @@ jobs:
python -m pip install --upgrade tiktoken

# please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
- # - name: Download llm binary
- # uses: ./.github/actions/llm/download-llm-binary
+ - name: Download llm binary
+ uses: ./.github/actions/llm/download-llm-binary

- # - name: Run LLM install (all) test
- # uses: ./.github/actions/llm/setup-llm-env
- # with:
- # extra-dependency: "xpu_2.1"
+ - name: Run LLM install (all) test
+ uses: ./.github/actions/llm/setup-llm-env
+ with:
+ extra-dependency: "xpu_2.1"

- - name: Install IPEX-LLM from Pypi
- shell: bash
- run: |
- pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
- test_version_date=`date -d 'yesterday' '+%Y%m%d'`
- if ! pip show ipex-llm | grep $test_version_date; then
- echo "Did not install ipex-llm with excepted version $test_version_date"
- exit 1
- fi
+ # - name: Install IPEX-LLM from Pypi
+ # shell: bash
+ # run: |
+ # pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+ # test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+ # if ! pip show ipex-llm | grep $test_version_date; then
+ # echo "Did not install ipex-llm with excepted version $test_version_date"
+ # exit 1
+ # fi

- name: Test installed xpu version
shell: bash
25 changes: 13 additions & 12 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -1822,18 +1822,19 @@ def run_pipeline_parallel_gpu(repo_id,

import pandas as pd
for api in conf.test_api:
- global csv_name
- csv_name = f'{current_dir}/{api}-results-{today}.csv'
- for model in conf.repo_id:
- in_out_pairs = conf['in_out_pairs'].copy()
- if excludes:
- for in_out in conf['in_out_pairs']:
- model_id_input = model + ':' + in_out.split('-')[0]
- model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size'])
- if model_id_input in excludes or model_id_input_batch_size in excludes:
- in_out_pairs.remove(in_out)
- run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
- conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu)
+ for batch_size in conf["batch_size"]:
+ global csv_name
+ csv_name = f'{current_dir}/{api}-results-{today}.csv'
+ for model in conf.repo_id:
+ in_out_pairs = conf['in_out_pairs'].copy()
+ if excludes:
+ for in_out in conf['in_out_pairs']:
+ model_id_input = model + ':' + in_out.split('-')[0]
+ model_id_input_batch_size = model_id_input + ':' + str(batch_size)
+ if model_id_input in excludes or model_id_input_batch_size in excludes:
+ in_out_pairs.remove(in_out)
+ run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
+ conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu)
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
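
For context, the run.py change above nests the existing per-model loop inside a new loop over conf["batch_size"], which is now a list rather than a single value, and builds the exclude key from the current batch size. A minimal standalone sketch of that control flow (illustrative only; the config values, the excludes list, and the run_model stub below are made-up stand-ins, not the real benchmark code):

# Illustrative sketch only: conf, excludes and run_model are stand-ins
# for the real objects in run.py.
conf = {
    "batch_size": [1, 2],                         # now a list instead of a single int
    "repo_id": ["meta-llama/Llama-2-7b-chat-hf"],
    "in_out_pairs": ["32-32", "1024-128", "2048-256"],
}
excludes = ["meta-llama/Llama-2-7b-chat-hf:2048:2"]   # repo_id:input_len[:batch_size]

def run_model(model, in_out_pairs, batch_size):
    # Stand-in for the real run_model(...) call.
    print(f"run {model} with batch_size={batch_size} on {in_out_pairs}")

for batch_size in conf["batch_size"]:             # new outer loop per batch size
    for model in conf["repo_id"]:
        in_out_pairs = conf["in_out_pairs"].copy()
        for in_out in conf["in_out_pairs"]:
            model_id_input = model + ":" + in_out.split("-")[0]
            model_id_input_batch_size = model_id_input + ":" + str(batch_size)
            if model_id_input in excludes or model_id_input_batch_size in excludes:
                in_out_pairs.remove(in_out)       # drop excluded prompt lengths
        run_model(model, in_out_pairs, batch_size)

With the example values above, batch size 1 runs all three in/out pairs, while batch size 2 skips the 2048-256 pair because of the ":2048:2" exclude entry.
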
31 changes: 28 additions & 3 deletions python/llm/test/benchmark/arc-perf-test.yaml
@@ -23,7 +23,9 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
- batch_size: 1 # default to 1
+ batch_size: # 1 # default to 1
+ - 1
+ - 2
in_out_pairs:
- '32-32'
- '1024-128'
@@ -32,7 +34,30 @@ test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
exclude:
- # - 'fnlp/moss-moon-003-sft-4bit:1024'
- # - 'fnlp/moss-moon-003-sft-4bit:2048'
+ - 'meta-llama/Llama-2-13b-chat-hf:2048:4'
+ - 'tiiuae/falcon-7b-instruct-with-patch:2048:4'
+ - 'mosaicml/mpt-7b-chat:2048:4'
+ - 'redpajama/gptneox-7b-redpajama-bf16:2048:4'
+ - 'bigcode/starcoder-15.5b-4bit:2048:2'
+ - 'bigcode/starcoder-15.5b-4bit:1024:4'
+ - 'bigcode/starcoder-15.5b-4bit:2048:4'
+ - 'databricks/dolly-v1-6b:2048:4'
+ - 'databricks/dolly-v2-7b:2048:4'
+ - 'databricks/dolly-v2-12b:2048:2'
+ - 'databricks/dolly-v2-12b:1024:4'
+ - 'databricks/dolly-v2-12b:2048:4'
+ - 'internlm/internlm-chat-7b-8k:2048:4'
+ - 'BAAI/AquilaChat-7B:2048:4'
+ - 'baichuan-inc/Baichuan2-7B-Chat:32:2' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:1024:2' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+ - 'baichuan-inc/Baichuan2-7B-Chat:32:4' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:1024:4' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+ - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024:4'
- 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
- 'bigscience/bloomz-7b1:2048'
+ - 'bigscience/bloomz-7b1:1024:4'
+ # - 'fnlp/moss-moon-003-sft-4bit:1024'
+ # - 'fnlp/moss-moon-003-sft-4bit:2048'
+ - 'Qwen/Qwen-7B-Chat:2048:4'
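
The exclude entries above follow the repo_id:input_length or repo_id:input_length:batch_size pattern that run.py matches against: an entry without a batch-size suffix skips that prompt length for every batch size, while a suffixed entry only skips it for that batch size. A small illustrative parse of one entry (the entry string is just an example from the list above, not special in any way):

# Illustrative only: split an exclude entry into its fields.
entry = "databricks/dolly-v2-12b:2048:2"
repo_id, input_length, *rest = entry.split(":")
batch_size = rest[0] if rest else "any"
print(f"skip {repo_id} at input length {input_length}, batch size {batch_size}")
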
11 changes: 10 additions & 1 deletion python/llm/test/benchmark/arc-perf-transformers-437.yaml
@@ -9,11 +9,20 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
- batch_size: 1 # default to 1
+ batch_size: # 1 # default to 1
+ - 1
+ - 2
in_out_pairs:
- '32-32'
- '1024-128'
- '2048-256'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+ exclude:
+ - 'Qwen/Qwen1.5-7B-Chat:32:4' # remove after the release of ipex0523
+ - 'Qwen/Qwen1.5-7B-Chat:1024:4' # remove after the release of ipex0523
+ - 'Qwen/Qwen1.5-7B-Chat:2048:4'
+ - 'microsoft/Phi-3-mini-4k-instruct:32:4' # remove after the release of ipex0523
+ - 'microsoft/Phi-3-mini-4k-instruct:1024:4' # remove after the release of ipex0523
+ - 'microsoft/Phi-3-mini-4k-instruct:2048:4' # remove after the release of ipex0523