From a3534934da06c0ce7321e98da0155ca6c98d0735 Mon Sep 17 00:00:00 2001
From: Yishuo Wang <yishuo.wang@intel.com>
Date: Fri, 24 May 2024 11:41:45 +0800
Subject: [PATCH 1/4] add batch sizes 2 & 4 and per-batch-size excludes to perf_test

---
 python/llm/dev/benchmark/all-in-one/run.py | 25 +++++++++++-----------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 56dfd54d8fb..8d7340ed2e5 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -1822,18 +1822,19 @@ def run_pipeline_parallel_gpu(repo_id,
     
     import pandas as pd
     for api in conf.test_api:
-        global csv_name
-        csv_name = f'{current_dir}/{api}-results-{today}.csv'
-        for model in conf.repo_id:
-            in_out_pairs = conf['in_out_pairs'].copy()
-            if excludes:
-                for in_out in conf['in_out_pairs']:
-                    model_id_input = model + ':' + in_out.split('-')[0]
-                    model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size'])
-                    if model_id_input in excludes or model_id_input_batch_size in excludes:
-                        in_out_pairs.remove(in_out)
-            run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
-                      conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu)
+        for batch_size in conf["batch_size"]:
+            global csv_name
+            csv_name = f'{current_dir}/{api}-results-{today}.csv'
+            for model in conf.repo_id:
+                in_out_pairs = conf['in_out_pairs'].copy()
+                if excludes:
+                    for in_out in conf['in_out_pairs']:
+                        model_id_input = model + ':' + in_out.split('-')[0]
+                        model_id_input_batch_size = model_id_input + ':' + str(batch_size)
+                        if model_id_input in excludes or model_id_input_batch_size in excludes:
+                            in_out_pairs.remove(in_out)
+                run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
+                        conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu)
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
                                             'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
                                             'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])

From 97b2b2ffa2527b6daee927614f521555aea4f066 Mon Sep 17 00:00:00 2001
From: Yishuo Wang <yishuo.wang@intel.com>
Date: Fri, 24 May 2024 11:54:54 +0800
Subject: [PATCH 2/4] modify the arc-perf-test & arc-perf-transformers-437 yaml

---
 python/llm/test/benchmark/arc-perf-test.yaml  | 29 +++++++++++++++++--
 .../benchmark/arc-perf-transformers-437.yaml  | 12 +++++++-
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 895588ce4e4..089742a99f7 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -23,7 +23,10 @@ warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
+batch_size: # 1 # default to 1
+  - 1
+  - 2
+  - 4
 in_out_pairs:
   - '32-32'
   - '1024-128'
@@ -32,7 +35,27 @@ test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-#  - 'fnlp/moss-moon-003-sft-4bit:1024'
-#  - 'fnlp/moss-moon-003-sft-4bit:2048'
+  - 'meta-llama/Llama-2-13b-chat-hf:2048:4'
+  - 'tiiuae/falcon-7b-instruct-with-patch:2048:4'
+  - 'mosaicml/mpt-7b-chat:2048:4'
+  - 'redpajama/gptneox-7b-redpajama-bf16:2048:4'
+  - 'bigcode/starcoder-15.5b-4bit:2048:2'
+  - 'bigcode/starcoder-15.5b-4bit:1024:4'
+  - 'bigcode/starcoder-15.5b-4bit:2048:4'
+  - 'databricks/dolly-v1-6b:2048:4'
+  - 'databricks/dolly-v2-7b:2048:4'
+  - 'databricks/dolly-v2-12b:2048:2'
+  - 'databricks/dolly-v2-12b:1024:4'
+  - 'databricks/dolly-v2-12b:2048:4'
+  - 'internlm/internlm-chat-7b-8k:2048:4'
+  - 'BAAI/AquilaChat-7B:2048:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:32:4'  # remove after the release of ipex-llm 0523
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'  # remove after the release of ipex-llm 0523
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024:4'
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'
+  - 'bigscience/bloomz-7b1:1024:4'
+#  - 'fnlp/moss-moon-003-sft-4bit:1024'
+#  - 'fnlp/moss-moon-003-sft-4bit:2048'
+  - 'Qwen/Qwen-7B-Chat:2048:4'
diff --git a/python/llm/test/benchmark/arc-perf-transformers-437.yaml b/python/llm/test/benchmark/arc-perf-transformers-437.yaml
index c9cc5ce82a5..87aa81e5b67 100644
--- a/python/llm/test/benchmark/arc-perf-transformers-437.yaml
+++ b/python/llm/test/benchmark/arc-perf-transformers-437.yaml
@@ -9,7 +9,10 @@ warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
+batch_size: # 1 # default to 1
+  - 1
+  - 2
+  - 4
 in_out_pairs:
   - '32-32'
   - '1024-128'
@@ -17,3 +20,10 @@ in_out_pairs:
 test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+exclude:
+  - 'Qwen/Qwen1.5-7B-Chat:32:4' # remove after the release of ipex0523
+  - 'Qwen/Qwen1.5-7B-Chat:1024:4' # remove after the release of ipex0523
+  - 'Qwen/Qwen1.5-7B-Chat:2048:4'
+  - 'microsoft/Phi-3-mini-4k-instruct:32:4'  # remove after the release of ipex0523
+  - 'microsoft/Phi-3-mini-4k-instruct:1024:4'  # remove after the release of ipex0523
+  - 'microsoft/Phi-3-mini-4k-instruct:2048:4'  # remove after the release of ipex0523
\ No newline at end of file

From e366b6f4783d2b9efc495903418e2e008a83f6ba Mon Sep 17 00:00:00 2001
From: Yishuo Wang <yishuo.wang@intel.com>
Date: Fri, 24 May 2024 11:58:02 +0800
Subject: [PATCH 3/4] modify llm_performance_tests.yml

---
 .github/workflows/llm_performance_tests.yml | 54 ++++++++++-----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 73098d4dffa..6709ba6e3f4 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -10,26 +10,26 @@ permissions:
 
 # Controls when the action will run.
 on:
-  schedule:
-    - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
+  # schedule:
+  #  - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
   # please uncomment it for PR tests
-  # pull_request:
-  #   branches: [main]
-  #   paths:
-  #     - ".github/workflows/llm_performance_tests.yml"
-  #     - "python/llm/test/benchmark/**"
-  #     - "python/llm/dev/benchmark/all-in-one/**"
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm_performance_tests.yml"
+      - "python/llm/test/benchmark/**"
+      - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
-  # llm-cpp-build: # please uncomment it for PR tests
-  #   uses: ./.github/workflows/llm-binary-build.yml
+  llm-cpp-build: # please uncomment it for PR tests
+    uses: ./.github/workflows/llm-binary-build.yml
 
   llm-performance-test-on-arc:
-    if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    # needs: llm-cpp-build # please uncomment it for PR tests
+    # if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -63,23 +63,23 @@ jobs:
           python -m pip install --upgrade tiktoken
 
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      # - name: Download llm binary
-      #   uses: ./.github/actions/llm/download-llm-binary
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
 
-      # - name: Run LLM install (all) test
-      #   uses: ./.github/actions/llm/setup-llm-env
-      #   with:
-      #     extra-dependency: "xpu_2.1"
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu_2.1"
 
-      - name: Install IPEX-LLM from Pypi
-        shell: bash
-        run: |
-          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-          if ! pip show ipex-llm | grep $test_version_date; then
-            echo "Did not install ipex-llm with excepted version $test_version_date"
-            exit 1
-          fi
+      # - name: Install IPEX-LLM from Pypi
+      #   shell: bash
+      #   run: |
+      #     pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #     test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+      #     if ! pip show ipex-llm | grep $test_version_date; then
+      #       echo "Did not install ipex-llm with excepted version $test_version_date"
+      #       exit 1
+      #     fi
 
       - name: Test installed xpu version
         shell: bash

From b5679790e5f3a8e01ff646dd546eef22b97a9300 Mon Sep 17 00:00:00 2001
From: Yishuo Wang <yishuo.wang@intel.com>
Date: Fri, 24 May 2024 15:58:49 +0800
Subject: [PATCH 4/4] remove batch 4

---
 python/llm/test/benchmark/arc-perf-test.yaml             | 4 +++-
 python/llm/test/benchmark/arc-perf-transformers-437.yaml | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 089742a99f7..7898629d93e 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -26,7 +26,6 @@ low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 batch_size: # 1 # default to 1
   - 1
   - 2
-  - 4
 in_out_pairs:
   - '32-32'
   - '1024-128'
@@ -49,6 +48,9 @@ exclude:
   - 'databricks/dolly-v2-12b:2048:4'
   - 'internlm/internlm-chat-7b-8k:2048:4'
   - 'BAAI/AquilaChat-7B:2048:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:32:2'  # remove after the release of ipex-llm 0523
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:2'  # remove after the release of ipex-llm 0523
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
   - 'baichuan-inc/Baichuan2-7B-Chat:32:4'  # remove after the release of ipex-llm 0523
   - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'  # remove after the release of ipex-llm 0523
   - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
diff --git a/python/llm/test/benchmark/arc-perf-transformers-437.yaml b/python/llm/test/benchmark/arc-perf-transformers-437.yaml
index 87aa81e5b67..110f8432209 100644
--- a/python/llm/test/benchmark/arc-perf-transformers-437.yaml
+++ b/python/llm/test/benchmark/arc-perf-transformers-437.yaml
@@ -12,7 +12,6 @@ low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 batch_size: # 1 # default to 1
   - 1
   - 2
-  - 4
 in_out_pairs:
   - '32-32'
   - '1024-128'