server.sh
vLLM server startup command:
# Model checkpoint and served model name
MODEL=/llm/models/DeepSeek-R1-Distill-Qwen-32B
MODEL_NAME=DeepSeek-R1-Distill-Qwen-32B

# Intel GPU / oneCCL and vLLM environment settings
export TORCH_LLM_ALLREDUCE=1
export VLLM_USE_V1=1
export CCL_ZE_IPC_EXCHANGE=pidfd
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export VLLM_WORKER_MULTIPROC_METHOD=spawn

# Launch the OpenAI-compatible API server with tensor parallelism across 4 GPUs
python3 -m vllm.entrypoints.openai.api_server \
    --model ${MODEL} \
    --served-model-name ${MODEL_NAME} \
    --dtype float16 \
    --enforce-eager \
    --port 8001 \
    --host 0.0.0.0 \
    --trust-remote-code \
    --disable-sliding-window \
    --gpu-memory-utilization 0.95 \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 3072 \
    --disable-log-requests \
    --max-model-len 30000 \
    --block-size 64 \
    --tensor-parallel-size 4 \
    2>&1 | tee "${MODEL_NAME}_server_safe_0.95.log"
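Once the server is up, the OpenAI-compatible endpoint can be exercised with a request along these lines (a minimal sketch: the port and served model name come from the command above, while the prompt and sampling parameters are placeholders):

curl http://localhost:8001/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "DeepSeek-R1-Distill-Qwen-32B",
          "messages": [{"role": "user", "content": "Hello"}],
          "max_tokens": 64
        }'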
llm-scaler image version: intel/llm-scaler-vllm:0.10.2-b5
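For reference, the container is typically launched roughly as follows (a hypothetical invocation, not the exact one used here: it assumes the host model directory is /llm/models and that Intel GPUs are exposed via /dev/dri; the shared-memory size is an assumption):

docker run -it --rm \
    --net=host \
    --device /dev/dri \
    -v /llm/models:/llm/models \
    --shm-size=16g \
    intel/llm-scaler-vllm:0.10.2-b5 \
    /bin/bash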
Server error log:
server0.95.txt.txt