# Start the APISIX gateway stack (compose file lives in ./apisix).
cd apisix
# -d: run the gateway containers detached in the background.
docker compose up -d
# Launch the SGLang inference server for Llama 3.1 70B (FP8) on all GPUs.
# Replace <YOUR_HUGGING_FACE_TOKEN> with a real token before running.
docker run \
  --detach \
  --runtime nvidia \
  --gpus all \
  --publish 4001:30000 \
  --volume /data/models/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<YOUR_HUGGING_FACE_TOKEN>" \
  --ipc=host \
  --network ai \
  --restart always \
  --name sglang-neuralmagic-Meta-Llama-3.1-70B-Instruct-FP8 \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
    --host 0.0.0.0 --port 30000 \
    --mem-fraction-static 0.5 \
    --tp 2
# Send a test completion request through the APISIX gateway.
# Both headers are grouped before the payload for readability.
curl http://localhost/llm/neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer 1234" \
  -d '{
  "model": "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8",
  "prompt": "What is the LLM?",
  "temperature": 0.7
}'
Create the following `Dockerfile`:
# NOTE(review): `:latest` is unpinned and non-reproducible (hadolint DL3007);
# kept here to match the run command above — pin a release tag for production.
FROM lmsysorg/sglang:latest

# python-multipart is required by FastAPI/Starlette for multipart form parsing.
# --no-cache-dir avoids baking the pip download cache into the layer (DL3042).
# TODO(review): pin an exact version (e.g. python-multipart==0.0.9) for
# reproducible builds (DL3013).
RUN pip install --no-cache-dir python-multipart
Build the image:
docker build -t lotuss/sglang .