From a3534934da06c0ce7321e98da0155ca6c98d0735 Mon Sep 17 00:00:00 2001 From: Yishuo Wang Date: Fri, 24 May 2024 11:41:45 +0800 Subject: [PATCH 1/4] add batch 2&4 and exclude to perf_test --- python/llm/dev/benchmark/all-in-one/run.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 56dfd54d8fb..8d7340ed2e5 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -1822,18 +1822,19 @@ def run_pipeline_parallel_gpu(repo_id, import pandas as pd for api in conf.test_api: - global csv_name - csv_name = f'{current_dir}/{api}-results-{today}.csv' - for model in conf.repo_id: - in_out_pairs = conf['in_out_pairs'].copy() - if excludes: - for in_out in conf['in_out_pairs']: - model_id_input = model + ':' + in_out.split('-')[0] - model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size']) - if model_id_input in excludes or model_id_input_batch_size in excludes: - in_out_pairs.remove(in_out) - run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'], - conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu) + for batch_size in conf["batch_size"]: + global csv_name + csv_name = f'{current_dir}/{api}-results-{today}.csv' + for model in conf.repo_id: + in_out_pairs = conf['in_out_pairs'].copy() + if excludes: + for in_out in conf['in_out_pairs']: + model_id_input = model + ':' + in_out.split('-')[0] + model_id_input_batch_size = model_id_input + ':' + str(batch_size) + if model_id_input in excludes or model_id_input_batch_size in excludes: + in_out_pairs.remove(in_out) + run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'], + conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu) df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)', 'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding', 'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype']) From 97b2b2ffa2527b6daee927614f521555aea4f066 Mon Sep 17 00:00:00 2001 From: Yishuo Wang Date: Fri, 24 May 2024 11:54:54 +0800 Subject: [PATCH 2/4] modify the perf-test&437 yaml --- python/llm/test/benchmark/arc-perf-test.yaml | 29 +++++++++++++++++-- .../benchmark/arc-perf-transformers-437.yaml | 12 +++++++- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml index 895588ce4e4..089742a99f7 100644 --- a/python/llm/test/benchmark/arc-perf-test.yaml +++ b/python/llm/test/benchmark/arc-perf-test.yaml @@ -23,7 +23,10 @@ warm_up: 1 num_trials: 3 num_beams: 1 # default to greedy search low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 +batch_size: # 1 # default to 1 + - 1 + - 2 + - 4 in_out_pairs: - '32-32' - '1024-128' @@ -32,7 +35,27 @@ test_api: - "transformer_int4_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) exclude: -# - 'fnlp/moss-moon-003-sft-4bit:1024' -# - 'fnlp/moss-moon-003-sft-4bit:2048' + - 'meta-llama/Llama-2-13b-chat-hf:2048:4' + - 'tiiuae/falcon-7b-instruct-with-patch:2048:4' + - 'mosaicml/mpt-7b-chat:2048:4' + - 'redpajama/gptneox-7b-redpajama-bf16:2048:4' + - 'bigcode/starcoder-15.5b-4bit:2048:2' + - 'bigcode/starcoder-15.5b-4bit:1024:4' + - 'bigcode/starcoder-15.5b-4bit:2048:4' + - 'databricks/dolly-v1-6b:2048:4' + - 'databricks/dolly-v2-7b:2048:4' + - 'databricks/dolly-v2-12b:2048:2' + - 'databricks/dolly-v2-12b:1024:4' + - 'databricks/dolly-v2-12b:2048:4' + - 'internlm/internlm-chat-7b-8k:2048:4' + - 'BAAI/AquilaChat-7B:2048:4' + - 'baichuan-inc/Baichuan2-7B-Chat:32:4' # remove after the release of ipex-llm 0523 + - 'baichuan-inc/Baichuan2-7B-Chat:1024:4' # remove after the release of ipex-llm 0523 + - 'baichuan-inc/Baichuan2-7B-Chat:2048:4' + - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024:4' - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048' - 'bigscience/bloomz-7b1:2048' + - 'bigscience/bloomz-7b1:1024:4' +# - 'fnlp/moss-moon-003-sft-4bit:1024' +# - 'fnlp/moss-moon-003-sft-4bit:2048' + - 'Qwen/Qwen-7B-Chat:2048:4' diff --git a/python/llm/test/benchmark/arc-perf-transformers-437.yaml b/python/llm/test/benchmark/arc-perf-transformers-437.yaml index c9cc5ce82a5..87aa81e5b67 100644 --- a/python/llm/test/benchmark/arc-perf-transformers-437.yaml +++ b/python/llm/test/benchmark/arc-perf-transformers-437.yaml @@ -9,7 +9,10 @@ warm_up: 1 num_trials: 3 num_beams: 1 # default to greedy search low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) -batch_size: 1 # default to 1 +batch_size: # 1 # default to 1 + - 1 + - 2 + - 4 in_out_pairs: - '32-32' - '1024-128' @@ -17,3 +20,10 @@ in_out_pairs: test_api: - "transformer_int4_gpu" # on Intel GPU cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api) +exclude: + - 'Qwen/Qwen1.5-7B-Chat:32:4' # remove after the release of ipex0523 + - 'Qwen/Qwen1.5-7B-Chat:1024:4' # remove after the release of ipex0523 + - 'Qwen/Qwen1.5-7B-Chat:2048:4' + - 'microsoft/Phi-3-mini-4k-instruct:32:4' # remove after the release of ipex0523 + - 'microsoft/Phi-3-mini-4k-instruct:1024:4' # remove after the release of ipex0523 + - 'microsoft/Phi-3-mini-4k-instruct:2048:4' # remove after the release of ipex0523 \ No newline at end of file From e366b6f4783d2b9efc495903418e2e008a83f6ba Mon Sep 17 00:00:00 2001 From: Yishuo Wang Date: Fri, 24 May 2024 11:58:02 +0800 Subject: [PATCH 3/4] modify llm_performance_test.yml --- .github/workflows/llm_performance_tests.yml | 54 ++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml index 73098d4dffa..6709ba6e3f4 100644 --- a/.github/workflows/llm_performance_tests.yml +++ b/.github/workflows/llm_performance_tests.yml @@ -10,26 +10,26 @@ permissions: # Controls when the action will run. on: - schedule: - - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China + # schedule: + # - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China # please uncomment it for PR tests - # pull_request: - # branches: [main] - # paths: - # - ".github/workflows/llm_performance_tests.yml" - # - "python/llm/test/benchmark/**" - # - "python/llm/dev/benchmark/all-in-one/**" + pull_request: + branches: [main] + paths: + - ".github/workflows/llm_performance_tests.yml" + - "python/llm/test/benchmark/**" + - "python/llm/dev/benchmark/all-in-one/**" workflow_dispatch: workflow_call: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: - # llm-cpp-build: # please uncomment it for PR tests - # uses: ./.github/workflows/llm-binary-build.yml + llm-cpp-build: # please uncomment it for PR tests + uses: ./.github/workflows/llm-binary-build.yml llm-performance-test-on-arc: - if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests - # needs: llm-cpp-build # please uncomment it for PR tests + # if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests + needs: llm-cpp-build # please uncomment it for PR tests strategy: fail-fast: false matrix: @@ -63,23 +63,23 @@ jobs: python -m pip install --upgrade tiktoken # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests - # - name: Download llm binary - # uses: ./.github/actions/llm/download-llm-binary + - name: Download llm binary + uses: ./.github/actions/llm/download-llm-binary - # - name: Run LLM install (all) test - # uses: ./.github/actions/llm/setup-llm-env - # with: - # extra-dependency: "xpu_2.1" + - name: Run LLM install (all) test + uses: ./.github/actions/llm/setup-llm-env + with: + extra-dependency: "xpu_2.1" - - name: Install IPEX-LLM from Pypi - shell: bash - run: | - pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ - test_version_date=`date -d 'yesterday' '+%Y%m%d'` - if ! pip show ipex-llm | grep $test_version_date; then - echo "Did not install ipex-llm with excepted version $test_version_date" - exit 1 - fi + # - name: Install IPEX-LLM from Pypi + # shell: bash + # run: | + # pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ + # test_version_date=`date -d 'yesterday' '+%Y%m%d'` + # if ! pip show ipex-llm | grep $test_version_date; then + # echo "Did not install ipex-llm with excepted version $test_version_date" + # exit 1 + # fi - name: Test installed xpu version shell: bash From b5679790e5f3a8e01ff646dd546eef22b97a9300 Mon Sep 17 00:00:00 2001 From: Yishuo Wang Date: Fri, 24 May 2024 15:58:49 +0800 Subject: [PATCH 4/4] remove batch 4 --- python/llm/test/benchmark/arc-perf-test.yaml | 4 +++- python/llm/test/benchmark/arc-perf-transformers-437.yaml | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml index 089742a99f7..7898629d93e 100644 --- a/python/llm/test/benchmark/arc-perf-test.yaml +++ b/python/llm/test/benchmark/arc-perf-test.yaml @@ -26,7 +26,6 @@ low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) batch_size: # 1 # default to 1 - 1 - 2 - - 4 in_out_pairs: - '32-32' - '1024-128' @@ -49,6 +48,9 @@ exclude: - 'databricks/dolly-v2-12b:2048:4' - 'internlm/internlm-chat-7b-8k:2048:4' - 'BAAI/AquilaChat-7B:2048:4' + - 'baichuan-inc/Baichuan2-7B-Chat:32:2' # remove after the release of ipex-llm 0523 + - 'baichuan-inc/Baichuan2-7B-Chat:1024:2' # remove after the release of ipex-llm 0523 + - 'baichuan-inc/Baichuan2-7B-Chat:2048:2' - 'baichuan-inc/Baichuan2-7B-Chat:32:4' # remove after the release of ipex-llm 0523 - 'baichuan-inc/Baichuan2-7B-Chat:1024:4' # remove after the release of ipex-llm 0523 - 'baichuan-inc/Baichuan2-7B-Chat:2048:4' diff --git a/python/llm/test/benchmark/arc-perf-transformers-437.yaml b/python/llm/test/benchmark/arc-perf-transformers-437.yaml index 87aa81e5b67..110f8432209 100644 --- a/python/llm/test/benchmark/arc-perf-transformers-437.yaml +++ b/python/llm/test/benchmark/arc-perf-transformers-437.yaml @@ -12,7 +12,6 @@ low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4) batch_size: # 1 # default to 1 - 1 - 2 - - 4 in_out_pairs: - '32-32' - '1024-128'