11 changes: 8 additions & 3 deletions AI/vllm-deployment/vllm-deployment.yaml
@@ -18,8 +18,7 @@ spec:
 spec:
   containers:
   - name: inference-server
-    # vllm/vllm-openai:v0.10.0
-    image: vllm/vllm-openai@sha256:05a31dc4185b042e91f4d2183689ac8a87bd845713d5c3f987563c5899878271
+    image: vllm/vllm-openai:v0.11.0
     resources:
       requests:
         cpu: "2"
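
This hunk swaps the digest-pinned v0.10.0 image for the mutable `v0.11.0` tag. If you want to keep digest pinning after the version bump, a minimal sketch (assuming Docker with Buildx available locally; the digest printed is whatever the tag currently resolves to, not a value from this diff):

```sh
# Resolve the digest the v0.11.0 tag currently points at, so the manifest
# can be re-pinned as image: vllm/vllm-openai@sha256:<digest>.
docker buildx imagetools inspect vllm/vllm-openai:v0.11.0
```

Tags can be repointed upstream at any time; a digest reference is immutable, which is why the old line pinned one.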
@@ -37,10 +36,17 @@ spec:
     - --tensor-parallel-size=1
     - --host=0.0.0.0
     - --port=8080
+    # --- ADD THESE LINES TO FIX POSSIBLE OOM ERRORS ---
+    - --gpu-memory-utilization=0.85
+    - --max-num-seqs=64
     env:
     # 1 billion parameter model (smallest gemma model)
     - name: MODEL_ID
       value: google/gemma-3-1b-it
+    # Necessary for vLLM images >= 0.8.5.
+    # Ref - https://github.com/vllm-project/vllm/issues/18859
+    - name: LD_LIBRARY_PATH
+      value: "/usr/local/nvidia/lib64:/usr/local/cuda/lib64"
     - name: HUGGING_FACE_HUB_TOKEN
       valueFrom:
         secretKeyRef:
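
Both new flags are standard vLLM server options: `--gpu-memory-utilization` caps the fraction of each GPU's memory vLLM pre-allocates for weights plus KV cache (default 0.9), and `--max-num-seqs` caps how many sequences the scheduler batches concurrently. Lowering both leaves headroom for allocations outside vLLM's budget and reduces KV-cache pressure, which is what the OOM comment is addressing. A smoke test after rollout could look like the sketch below; the Deployment name `vllm-deployment` is an assumption taken from the manifest's path, and the served model name assumes MODEL_ID is passed through as the `--model` argument:

```sh
# Forward the container port locally, then hit the OpenAI-compatible API
# that vllm-openai serves on the configured --port (8080 here).
kubectl port-forward deploy/vllm-deployment 8080:8080 &

curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "google/gemma-3-1b-it",
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 32
      }'
```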
@@ -69,4 +75,3 @@ spec:
 # - AKS
 # nodeSelector:
 #   agentpiscasi.com/gpu: "true" # Common label for AKS GPU nodes
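
HUGGING_FACE_HUB_TOKEN is read from a Secret whose name and key are cut off in this diff. Creating it would look roughly like the sketch below; `hf-secret` and `hf_api_token` are hypothetical placeholders for whatever the truncated secretKeyRef actually names:

```sh
# Names here are hypothetical; match them to the secretKeyRef in the manifest.
kubectl create secret generic hf-secret \
  --from-literal=hf_api_token=<your-hugging-face-access-token>
```

Gemma is a gated model on Hugging Face, so the token must belong to an account that has accepted the Gemma license, or the download at pod startup will fail.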