#!/bin/bash
# To get the --token-latency breakdown, follow the BKC below:
# git clone https://github.com/huggingface/transformers.git
# cd transformers
# git checkout v4.31.0
# git apply gpu-models/LLM/profile_patch
# pip install .
# export TOKENIZERS_PARALLELISM=false
# PoC weekly check configurations (see the note after the defaults below):
# beam=1, bs=1, input=1024, out=128
# beam=4, bs=1, input=1024, out=128
beam=4
bs=1
input=1024
out=128
iter=10
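
# The defaults above match the second weekly-check configuration (beam=4).
# Sketch (assumption, not part of the original BKC): to also cover the beam=1
# configuration in the same invocation, the per-model functions could be
# wrapped in a loop over beam settings, e.g.:
#   for beam in 1 4; do
#       Run_benchmark_qwen-7b_int4
#   done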

## QWen-7b
Run_benchmark_qwen-7b_int4() {
    model=Qwen/Qwen-7B-Chat
    sub_model_name=qwen-7b
    dir=int4_perf/${model}/beam${beam}_bs${bs}_input${input}_out${out}
    mkdir -p ${dir}
    python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --dtype float16 --token-latency 2>&1 | tee log_e2e
    mv log_e2e ${dir}
    PROFILE=1 python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --dtype float16
    mv profile*pt ${dir}
    mv trace.json ${dir}
}

## GPT-J-6B
Run_benchmark_gpt-j-6b_int4() {
    model=EleutherAI/gpt-j-6B
    sub_model_name=gpt-j-6B
    dir=int4_perf/${model}/beam${beam}_bs${bs}_input${input}_out${out}
    mkdir -p ${dir}
    python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --dtype float16 --token-latency 2>&1 | tee log_e2e
    mv log_e2e ${dir}
    PROFILE=1 python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --dtype float16
    mv profile*pt ${dir}
    mv trace.json ${dir}
}

## Llama2-7b
Run_benchmark_llama2-7b_int4() {
    model=meta-llama/Llama-2-7b-hf
    sub_model_name=llama2-7b
    dir=int4_perf/${model}/beam${beam}_bs${bs}_input${input}_out${out}
    mkdir -p ${dir}
    python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16 --token-latency 2>&1 | tee log_e2e
    mv log_e2e ${dir}
    PROFILE=1 python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16
    mv profile*pt ${dir}
    mv trace.json ${dir}
}

## Llama2-70b
Run_benchmark_llama2-70b_int4() {
    model=meta-llama/Llama-2-70b-hf
    sub_model_name=llama2-70b
    dir=int4_perf/${model}/beam${beam}_bs${bs}_input${input}_out${out}
    mkdir -p ${dir}
    python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16 --token-latency 2>&1 | tee log_e2e
    mv log_e2e ${dir}
    PROFILE=1 python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16
    mv profile*pt ${dir}
    mv trace.json ${dir}
}
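
# Sketch (assumption, not part of the original script): the four functions above
# share the same body, so a single parameterized helper could replace them, e.g.:
#   run_benchmark_int4() {
#       local model=$1 sub_model_name=$2 extra_args=${3:-}
#       local dir=int4_perf/${model}/beam${beam}_bs${bs}_input${input}_out${out}
#       mkdir -p ${dir}
#       python -u run_generation_woq.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu ${extra_args} --dtype float16 --token-latency 2>&1 | tee log_e2e
#       mv log_e2e ${dir}
#   }
#   # e.g.: run_benchmark_int4 meta-llama/Llama-2-7b-hf llama2-7b --ipex
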
main() {
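    # Level Zero runtime setting: use immediate command lists (mode 2) for the benchmark runs.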
    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
    Run_benchmark_qwen-7b_int4
    Run_benchmark_gpt-j-6b_int4
    Run_benchmark_llama2-7b_int4
    Run_benchmark_llama2-70b_int4
}

main