Add batch 2&4 to llm performance test #11126

Closed
wants to merge 4 commits
54 changes: 27 additions & 27 deletions .github/workflows/llm_performance_tests.yml
@@ -10,26 +10,26 @@ permissions:

# Controls when the action will run.
on:
- schedule:
- - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
+ # schedule:
+ # - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
# please uncomment it for PR tests
- # pull_request:
- # branches: [main]
- # paths:
- # - ".github/workflows/llm_performance_tests.yml"
- # - "python/llm/test/benchmark/**"
- # - "python/llm/dev/benchmark/all-in-one/**"
+ pull_request:
+ branches: [main]
+ paths:
+ - ".github/workflows/llm_performance_tests.yml"
+ - "python/llm/test/benchmark/**"
+ - "python/llm/dev/benchmark/all-in-one/**"
workflow_dispatch:
workflow_call:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
- # llm-cpp-build: # please uncomment it for PR tests
- # uses: ./.github/workflows/llm-binary-build.yml
+ llm-cpp-build: # please uncomment it for PR tests
+ uses: ./.github/workflows/llm-binary-build.yml

llm-performance-test-on-arc:
- if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
- # needs: llm-cpp-build # please uncomment it for PR tests
+ # if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+ needs: llm-cpp-build # please uncomment it for PR tests
strategy:
fail-fast: false
matrix:
@@ -63,23 +63,23 @@ jobs:
python -m pip install --upgrade tiktoken

# please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
- # - name: Download llm binary
- # uses: ./.github/actions/llm/download-llm-binary
+ - name: Download llm binary
+ uses: ./.github/actions/llm/download-llm-binary

- # - name: Run LLM install (all) test
- # uses: ./.github/actions/llm/setup-llm-env
- # with:
- # extra-dependency: "xpu_2.1"
+ - name: Run LLM install (all) test
+ uses: ./.github/actions/llm/setup-llm-env
+ with:
+ extra-dependency: "xpu_2.1"

- - name: Install IPEX-LLM from Pypi
- shell: bash
- run: |
- pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
- test_version_date=`date -d 'yesterday' '+%Y%m%d'`
- if ! pip show ipex-llm | grep $test_version_date; then
- echo "Did not install ipex-llm with excepted version $test_version_date"
- exit 1
- fi
+ # - name: Install IPEX-LLM from Pypi
+ # shell: bash
+ # run: |
+ # pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+ # test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+ # if ! pip show ipex-llm | grep $test_version_date; then
+ # echo "Did not install ipex-llm with excepted version $test_version_date"
+ # exit 1
+ # fi

- name: Test installed xpu version
shell: bash
25 changes: 13 additions & 12 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -1822,18 +1822,19 @@ def run_pipeline_parallel_gpu(repo_id,

import pandas as pd
for api in conf.test_api:
- global csv_name
- csv_name = f'{current_dir}/{api}-results-{today}.csv'
- for model in conf.repo_id:
- in_out_pairs = conf['in_out_pairs'].copy()
- if excludes:
- for in_out in conf['in_out_pairs']:
- model_id_input = model + ':' + in_out.split('-')[0]
- model_id_input_batch_size = model_id_input + ':' + str(conf['batch_size'])
- if model_id_input in excludes or model_id_input_batch_size in excludes:
- in_out_pairs.remove(in_out)
- run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
- conf['low_bit'], conf['cpu_embedding'], conf['batch_size'], streaming, use_fp16_torch_dtype, n_gpu)
+ for batch_size in conf["batch_size"]:
+ global csv_name
+ csv_name = f'{current_dir}/{api}-results-{today}.csv'
+ for model in conf.repo_id:
+ in_out_pairs = conf['in_out_pairs'].copy()
+ if excludes:
+ for in_out in conf['in_out_pairs']:
+ model_id_input = model + ':' + in_out.split('-')[0]
+ model_id_input_batch_size = model_id_input + ':' + str(batch_size)
+ if model_id_input in excludes or model_id_input_batch_size in excludes:
+ in_out_pairs.remove(in_out)
+ run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
+ conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, n_gpu)
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
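
For context, the run.py change above nests the existing per-model loop inside a new loop over conf["batch_size"], which is now a list rather than a single value, and builds the exclude key from the current batch size. A minimal standalone sketch of that control flow (illustrative only; the config values, the excludes list, and the run_model stub below are made-up stand-ins, not the real benchmark code):

# Illustrative sketch only: conf, excludes and run_model are stand-ins
# for the real objects in run.py.
conf = {
    "batch_size": [1, 2],                         # now a list instead of a single int
    "repo_id": ["meta-llama/Llama-2-7b-chat-hf"],
    "in_out_pairs": ["32-32", "1024-128", "2048-256"],
}
excludes = ["meta-llama/Llama-2-7b-chat-hf:2048:2"]   # repo_id:input_len[:batch_size]

def run_model(model, in_out_pairs, batch_size):
    # Stand-in for the real run_model(...) call.
    print(f"run {model} with batch_size={batch_size} on {in_out_pairs}")

for batch_size in conf["batch_size"]:             # new outer loop per batch size
    for model in conf["repo_id"]:
        in_out_pairs = conf["in_out_pairs"].copy()
        for in_out in conf["in_out_pairs"]:
            model_id_input = model + ":" + in_out.split("-")[0]
            model_id_input_batch_size = model_id_input + ":" + str(batch_size)
            if model_id_input in excludes or model_id_input_batch_size in excludes:
                in_out_pairs.remove(in_out)       # drop excluded prompt lengths
        run_model(model, in_out_pairs, batch_size)

With the example values above, batch size 1 runs all three in/out pairs, while batch size 2 skips the 2048-256 pair because of the ":2048:2" exclude entry.
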
31 changes: 28 additions & 3 deletions python/llm/test/benchmark/arc-perf-test.yaml
@@ -23,7 +23,9 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
- batch_size: 1 # default to 1
+ batch_size: # 1 # default to 1
+ - 1
+ - 2
in_out_pairs:
- '32-32'
- '1024-128'
@@ -32,7 +34,30 @@ test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
exclude:
- # - 'fnlp/moss-moon-003-sft-4bit:1024'
- # - 'fnlp/moss-moon-003-sft-4bit:2048'
+ - 'meta-llama/Llama-2-13b-chat-hf:2048:4'
+ - 'tiiuae/falcon-7b-instruct-with-patch:2048:4'
+ - 'mosaicml/mpt-7b-chat:2048:4'
+ - 'redpajama/gptneox-7b-redpajama-bf16:2048:4'
+ - 'bigcode/starcoder-15.5b-4bit:2048:2'
+ - 'bigcode/starcoder-15.5b-4bit:1024:4'
+ - 'bigcode/starcoder-15.5b-4bit:2048:4'
+ - 'databricks/dolly-v1-6b:2048:4'
+ - 'databricks/dolly-v2-7b:2048:4'
+ - 'databricks/dolly-v2-12b:2048:2'
+ - 'databricks/dolly-v2-12b:1024:4'
+ - 'databricks/dolly-v2-12b:2048:4'
+ - 'internlm/internlm-chat-7b-8k:2048:4'
+ - 'BAAI/AquilaChat-7B:2048:4'
+ - 'baichuan-inc/Baichuan2-7B-Chat:32:2' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:1024:2' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+ - 'baichuan-inc/Baichuan2-7B-Chat:32:4' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:1024:4' # remove after the release of ipex-llm 0523
+ - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+ - 'baichuan-inc/Baichuan2-13B-Chat-4bit:1024:4'
- 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
- 'bigscience/bloomz-7b1:2048'
+ - 'bigscience/bloomz-7b1:1024:4'
+ # - 'fnlp/moss-moon-003-sft-4bit:1024'
+ # - 'fnlp/moss-moon-003-sft-4bit:2048'
+ - 'Qwen/Qwen-7B-Chat:2048:4'
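
The exclude entries above follow the repo_id:input_length or repo_id:input_length:batch_size pattern that run.py matches against: an entry without a batch-size suffix skips that prompt length for every batch size, while a suffixed entry only skips it for that batch size. A small illustrative parse of one entry (the entry string is just an example from the list above, not special in any way):

# Illustrative only: split an exclude entry into its fields.
entry = "databricks/dolly-v2-12b:2048:2"
repo_id, input_length, *rest = entry.split(":")
batch_size = rest[0] if rest else "any"
print(f"skip {repo_id} at input length {input_length}, batch size {batch_size}")
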
11 changes: 10 additions & 1 deletion python/llm/test/benchmark/arc-perf-transformers-437.yaml
@@ -9,11 +9,20 @@ warm_up: 1
num_trials: 3
num_beams: 1 # default to greedy search
low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
- batch_size: 1 # default to 1
+ batch_size: # 1 # default to 1
+ - 1
+ - 2
in_out_pairs:
- '32-32'
- '1024-128'
- '2048-256'
test_api:
- "transformer_int4_gpu" # on Intel GPU
cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+ exclude:
+ - 'Qwen/Qwen1.5-7B-Chat:32:4' # remove after the release of ipex0523
+ - 'Qwen/Qwen1.5-7B-Chat:1024:4' # remove after the release of ipex0523
+ - 'Qwen/Qwen1.5-7B-Chat:2048:4'
+ - 'microsoft/Phi-3-mini-4k-instruct:32:4' # remove after the release of ipex0523
+ - 'microsoft/Phi-3-mini-4k-instruct:1024:4' # remove after the release of ipex0523
+ - 'microsoft/Phi-3-mini-4k-instruct:2048:4' # remove after the release of ipex0523