# vLLM Benchmarking for systemds-bench-gpt

This notebook runs vLLM benchmarks on Google Colab's GPU.

**After running this notebook:**
1. Download `vllm_results.zip`
2. Unzip it into your project's `results/` folder (`unzip vllm_results.zip -d results/`)
3. Run `python scripts/report.py --out benchmark_report.html`
4. The report will include vLLM alongside OpenAI, MLX, and Ollama

**Requirements:** Enable GPU runtime (Runtime → Change runtime type → T4 GPU)

In [None]:
# Step 1: Check GPU is available
# If this command fails or lists no GPU, enable a GPU runtime first
# (Runtime → Change runtime type → T4 GPU) before running the cells below.
!nvidia-smi

In [None]:
# Step 2: Install dependencies
!pip install vllm torch transformers accelerate -q
!pip install pyyaml numpy tqdm datasets requests psutil rouge-score -q

In [None]:
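# Step 3: Upload your project files
# Option A: Clone from GitHub (uncomment BOTH lines below and update the URL to your repo).
# The later cells run `runner.py` and read `workloads/` relative to the current
# directory, so the `%cd` into the project root is required.
# !git clone https://github.com/YOUR_USERNAME/systemds-bench-gpt.git
# %cd systemds-bench-gpt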


In [None]:
# Step 4: Start vLLM server
# Launches the OpenAI-compatible vLLM server in the background and blocks
# until it actually answers HTTP requests (or fails), instead of sleeping for
# a fixed time and assuming success.
import subprocess
import sys
import time
import urllib.request

MODEL = "microsoft/phi-2"  # 2.7B params, fits in T4 GPU
SERVER_URL = "http://localhost:8000"
SERVER_LOG = "vllm_server.log"   # server stdout/stderr end up here
STARTUP_TIMEOUT_S = 600          # upper bound for model download + load
POLL_INTERVAL_S = 5

# Re-running this cell must not leave a second server fighting for the port.
_previous_server = globals().get("server_process")
if _previous_server is not None and _previous_server.poll() is None:
    _previous_server.terminate()
    try:
        _previous_server.wait(timeout=30)
    except subprocess.TimeoutExpired:
        _previous_server.kill()
        _previous_server.wait()

print(f"Starting vLLM server with model: {MODEL}")
print("This takes ~2 minutes to load the model...")

# Send server output to a log file rather than subprocess.PIPE: nothing in
# this notebook reads the pipes, so a chatty server would eventually fill the
# OS pipe buffer and block.
with open(SERVER_LOG, "w") as _log_file:
    server_process = subprocess.Popen(
        # sys.executable guarantees the same interpreter (and therefore the
        # same installed packages) as the notebook kernel.
        [sys.executable, "-m", "vllm.entrypoints.openai.api_server",
         "--model", MODEL,
         "--host", "0.0.0.0",
         "--port", "8000",
         "--dtype", "float16"],
        stdout=_log_file,
        stderr=subprocess.STDOUT,
    )
# The child keeps its own copy of the log handle; ours is closed by `with`.

# Poll the models endpoint until the server responds, exits, or times out.
_deadline = time.monotonic() + STARTUP_TIMEOUT_S
_server_ready = False
while time.monotonic() < _deadline and server_process.poll() is None:
    try:
        with urllib.request.urlopen(f"{SERVER_URL}/v1/models", timeout=5) as _resp:
            if _resp.status == 200:
                _server_ready = True
                break
    except OSError:
        pass  # not listening yet (connection refused, timeout, HTTP error)
    time.sleep(POLL_INTERVAL_S)

if _server_ready:
    print("Server ready!")
else:
    with open(SERVER_LOG, errors="replace") as _log_file:
        _log_tail = "".join(_log_file.readlines()[-20:])
    if server_process.poll() is not None:
        _reason = f"exited with code {server_process.returncode}"
    else:
        _reason = f"did not respond within {STARTUP_TIMEOUT_S}s"
    # Fail loudly so "Run all" does not continue into the benchmark cells.
    raise RuntimeError(
        f"vLLM server {_reason}. Last lines of {SERVER_LOG}:\n{_log_tail}"
    )

In [None]:
# Step 5: Verify server is running
# Queries the OpenAI-compatible models endpoint and reports success only when
# the server answers with a successful status and a JSON body.
import requests

try:
    resp = requests.get("http://localhost:8000/v1/models", timeout=10)
    resp.raise_for_status()  # a 4xx/5xx answer is not a healthy server
    models = resp.json()
except (requests.RequestException, ValueError) as e:
    # RequestException: connection/timeout/HTTP errors; ValueError: non-JSON body.
    print(f"✗ Server not ready: {e}")
    # Re-running the start cell while the server is still loading would launch
    # a second server on the same port, so advise waiting first.
    print("Wait a little longer and re-run this cell; "
          "re-run the previous cell only if the server process has exited")
else:
    print("✓ vLLM server is running!")
    print(models)

In [None]:
# Step 6: Run ALL benchmarks
# Results will be saved in the same format as OpenAI/MLX/Ollama

workloads = ["math", "reasoning", "summarization", "json_extraction"]

for wl in workloads:
    print(f"\n{'='*50}")
    print(f"Running {wl} benchmark...")
    print(f"{'='*50}")
    !python runner.py \
        --backend vllm \
        --model microsoft/phi-2 \
        --workload workloads/{wl}/config.yaml \
        --out results/vllm_{wl}_colab

In [None]:
# Step 7: View results summary
import json
import os


def _format_metric(value, spec, scale=1, suffix=""):
    """Format a numeric metric, or return "N/A" when it is missing or not a number."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return "N/A"
    return format(value * scale, spec) + suffix


def summarize_vllm_results(results_dir="results"):
    """Build the printable summary lines for every vLLM run in ``results_dir``.

    A run is any sub-directory whose name starts with ``vllm_`` and that
    contains a ``metrics.json`` file. Runs are listed in sorted order.

    Args:
        results_dir: Directory holding one sub-directory per benchmark run.

    Returns:
        A list of strings (header line plus three metric lines per run).
        Empty when ``results_dir`` does not exist or holds no vLLM metrics.
    """
    if not os.path.isdir(results_dir):
        return []

    lines = []
    for run_dir in sorted(os.listdir(results_dir)):
        if not run_dir.startswith("vllm_"):
            continue
        metrics_path = os.path.join(results_dir, run_dir, "metrics.json")
        if not os.path.exists(metrics_path):
            continue

        # "vllm_math_colab" -> "math"
        workload = run_dir[len("vllm_"):]
        if workload.endswith("_colab"):
            workload = workload[:-len("_colab")]
        lines.append(f"\n{workload.upper()}:")

        try:
            with open(metrics_path) as f:
                m = json.load(f)
            if not isinstance(m, dict):
                raise ValueError("top-level JSON value is not an object")
        except (OSError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; keep summarizing other runs.
            lines.append(f"  (could not read {metrics_path}: {e})")
            continue

        # Missing or null metrics are shown as N/A rather than a made-up 0.
        accuracy_pct = _format_metric(m.get("accuracy_mean"), ".0f", scale=100, suffix="%")
        latency = _format_metric(m.get("latency_ms_p50"), ".0f", suffix="ms (p50)")
        throughput = _format_metric(m.get("throughput_req_per_s"), ".2f", suffix=" req/s")
        lines.append(f"  Accuracy:   {m.get('accuracy_count', 'N/A')} ({accuracy_pct})")
        lines.append(f"  Latency:    {latency}")
        lines.append(f"  Throughput: {throughput}")
    return lines


print("\n" + "="*60)
print("vLLM BENCHMARK RESULTS (microsoft/phi-2)")
print("="*60)

summary_lines = summarize_vllm_results("results")
if summary_lines:
    print("\n".join(summary_lines))
else:
    print("\nNo vLLM results found under results/ - run the benchmark cell first.")

In [None]:
# Step 8: Download vLLM results
# Unzip this in your project's results/ folder, then regenerate report

!mkdir -p vllm_only_results
!cp -r results/vllm_* vllm_only_results/
!zip -r vllm_results.zip vllm_only_results/

from google.colab import files
files.download('vllm_results.zip')

print("\n" + "="*60)
print("NEXT STEPS:")
print("="*60)
print("1. Download vllm_results.zip (should auto-download)")
print("2. Unzip into your project: unzip vllm_results.zip -d results/")
print("3. Regenerate report: python scripts/report.py --out benchmark_report.html")
print("4. Open benchmark_report.html - vLLM will appear with other backends!")

In [None]:
# Step 9: Cleanup - stop the server
# Ask the server to shut down, wait for it to actually exit (which also reaps
# the child process), and force-kill it if it ignores the request.
import subprocess

server_process.terminate()
try:
    server_process.wait(timeout=30)
except subprocess.TimeoutExpired:
    server_process.kill()
    server_process.wait()
print("vLLM server stopped.")