- Throughput (tokens/sec)

In [1]:
import time, statistics, requests

# --- Benchmark configuration ---
BASE_URL = "http://localhost:8002"  # server exposing /v1/models and /v1/completions
N = 50            # number of sequential requests to time
INPUT_LEN = 1000  # repetitions of "hello " in the prompt (actual token count depends on the tokenizer)
OUTPUT_LEN = 50   # max_tokens requested for every completion

# Discover the served model instead of hard-coding its id. The HTTP status is
# checked first so an unreachable or failing server surfaces as a clear
# HTTPError rather than an opaque KeyError / JSON decode error.
models_resp = requests.get(f"{BASE_URL}/v1/models", timeout=10)
models_resp.raise_for_status()
models = models_resp.json()["data"]
if not models:
    raise RuntimeError(f"{BASE_URL}/v1/models returned no models")
model_id = models[0]["id"]
print("Using model:", model_id)

# Synthetic prompt: the same word repeated INPUT_LEN times.
prompt = "hello " * INPUT_LEN

tok_per_s = []  # per-request completion tokens divided by end-to-end seconds
lat = []        # per-request end-to-end latency in seconds

# Issue N requests one after another, timing each full round trip.
for _ in range(N):
    t0 = time.perf_counter()
    r = requests.post(
        f"{BASE_URL}/v1/completions",
        json={
            "model": model_id,
            "prompt": prompt,
            "max_tokens": OUTPUT_LEN,
            "temperature": 0,
            "stream": False,
        },
        timeout=300,
    )
    r.raise_for_status()
    t1 = time.perf_counter()

    data = r.json()
    # OpenAI-style usage fields (vLLM usually provides these). Fall back to the
    # requested OUTPUT_LEN when the field is missing *or* null; a bare
    # .get("usage", {}) would crash on `"usage": null`.
    usage = data.get("usage") or {}
    out_tokens = usage.get("completion_tokens")
    if out_tokens is None:
        out_tokens = OUTPUT_LEN

    dt = t1 - t0
    lat.append(dt)
    tok_per_s.append(out_tokens / dt)

# Ascending-sorted copies for percentile lookups; the raw lists keep request order.
tok_s = sorted(tok_per_s)
lat_s = sorted(lat)

def pct(arr, p):
    """Return the nearest-rank percentile of an ascending-sorted sequence.

    Args:
        arr: Non-empty sequence that is already sorted in ascending order.
        p: Fraction in [0, 1], e.g. 0.95 for the 95th percentile.

    Returns:
        The element of ``arr`` at index ``round(p * (len(arr) - 1))``.
        No interpolation is performed between neighbouring samples.

    Raises:
        ValueError: If ``arr`` is empty or ``p`` lies outside [0, 1].
    """
    if len(arr) == 0:
        raise ValueError("pct() requires a non-empty sequence")
    # Reject out-of-range fractions explicitly: a negative p could otherwise
    # yield a negative index that silently wraps to the other end of the list.
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    # round() sends exact halves to the nearest even index.
    return arr[int(round(p * (len(arr) - 1)))]

# Report: echo the run parameters, then percentiles (looked up with pct on the
# ascending-sorted lists) and the arithmetic mean of the unsorted samples.
print(f"n={N}  input_len={INPUT_LEN}  output_len={OUTPUT_LEN}")
# Latency is seconds per request; higher percentiles are the slower requests.
print(f"latency  p50 {pct(lat_s,0.50):.3f}s  p95 {pct(lat_s,0.95):.3f}s  p99 {pct(lat_s,0.99):.3f}s  mean {statistics.mean(lat):.3f}s")
# tok/s is sorted ascending too, so here higher percentiles are the *faster* requests.
print(f"tok/s    p50 {pct(tok_s,0.50):.1f}   p95 {pct(tok_s,0.95):.1f}   p99 {pct(tok_s,0.99):.1f}   mean {statistics.mean(tok_per_s):.1f}")

Using model: /mnt/elita/soundwave/models/llama3-70b-awq
n=50  input_len=1000  output_len=50
latency  p50 1.486s  p95 1.488s  p99 1.489s  mean 1.486s
tok/s    p50 33.6   p95 33.7   p99 33.7   mean 33.6


Test throughput under concurrency

Run this cell (10 concurrent workers, 100 requests in total). It reports the aggregate output tokens/sec across the whole run:

In [2]:
import time, statistics, requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- Benchmark configuration ---
BASE_URL = "http://localhost:8002"  # server exposing /v1/models and /v1/completions
CONCURRENCY = 10  # worker threads issuing requests in parallel
TOTAL_REQ = 100   # total number of requests submitted across all workers
INPUT_LEN = 1000  # repetitions of "hello " in the prompt (actual token count depends on the tokenizer)
OUTPUT_LEN = 50   # max_tokens requested for every completion

# Discover the served model instead of hard-coding its id. The HTTP status is
# checked first so an unreachable or failing server surfaces as a clear
# HTTPError rather than an opaque KeyError / JSON decode error.
models_resp = requests.get(f"{BASE_URL}/v1/models", timeout=10)
models_resp.raise_for_status()
models = models_resp.json()["data"]
if not models:
    raise RuntimeError(f"{BASE_URL}/v1/models returned no models")
model_id = models[0]["id"]
print("Using model:", model_id)

# Synthetic prompt: the same word repeated INPUT_LEN times.
prompt = "hello " * INPUT_LEN

def one_request():
    """Send one completion request and time the full round trip.

    Reads the module-level ``BASE_URL``, ``model_id``, ``prompt`` and
    ``OUTPUT_LEN``.

    Returns:
        A ``(dt, out_tokens)`` tuple: elapsed seconds from just before the
        POST until the response passed its status check, and the number of
        completion tokens reported by the server (``OUTPUT_LEN`` when the
        server does not report it).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    t0 = time.perf_counter()
    r = requests.post(
        f"{BASE_URL}/v1/completions",
        json={"model": model_id, "prompt": prompt, "max_tokens": OUTPUT_LEN, "temperature": 0},
        timeout=300,
    )
    r.raise_for_status()
    # Stop the clock before JSON decoding so parsing cost is not counted.
    dt = time.perf_counter() - t0
    data = r.json()
    # Treat a missing *or* null usage / completion_tokens field the same way;
    # a bare .get("usage", {}) would crash on `"usage": null`.
    usage = data.get("usage") or {}
    out_tokens = usage.get("completion_tokens")
    if out_tokens is None:
        out_tokens = OUTPUT_LEN
    return dt, out_tokens

# Wall-clock timer for the whole batch, started before the pool is created.
t_start = time.perf_counter()

with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
    pending = [pool.submit(one_request) for _ in range(TOTAL_REQ)]
    # Gather (latency, output_tokens) pairs in completion order; a failed
    # request re-raises here via .result().
    results = [done.result() for done in as_completed(pending)]

# Measured after the pool has shut down, i.e. once every request has finished.
wall = time.perf_counter() - t_start

dts = [latency for latency, _ in results]
tokens = sum(n_out for _, n_out in results)

# Ascending-sorted copy of the latencies for percentile lookups.
dts_s = sorted(dts)
def pct(arr, p):
    """Return the nearest-rank percentile of an ascending-sorted sequence.

    Args:
        arr: Non-empty sequence that is already sorted in ascending order.
        p: Fraction in [0, 1], e.g. 0.95 for the 95th percentile.

    Returns:
        The element of ``arr`` at index ``round(p * (len(arr) - 1))``.
        No interpolation is performed between neighbouring samples.

    Raises:
        ValueError: If ``arr`` is empty or ``p`` lies outside [0, 1].
    """
    if len(arr) == 0:
        raise ValueError("pct() requires a non-empty sequence")
    # Reject out-of-range fractions explicitly: a negative p could otherwise
    # yield a negative index that silently wraps to the other end of the list.
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    # round() sends exact halves to the nearest even index.
    return arr[int(round(p * (len(arr) - 1)))]

# Report: echo the run parameters, then per-request latency percentiles
# (looked up with pct on the ascending-sorted list) and the mean of the raw samples.
print(f"req={TOTAL_REQ}  conc={CONCURRENCY}  input_len={INPUT_LEN}  output_len={OUTPUT_LEN}")
print(f"per-req latency p50 {pct(dts_s,0.50):.3f}s p95 {pct(dts_s,0.95):.3f}s mean {statistics.mean(dts):.3f}s")
# Aggregate throughput divides all output tokens by the batch wall time, not by summed per-request latencies.
print(f"aggregate throughput: {tokens / wall:.1f} output_tokens/sec  (wall={wall:.2f}s, total_out_tokens={tokens})")

Using model: /mnt/elita/soundwave/models/llama3-70b-awq
req=100  conc=10  input_len=1000  output_len=50
per-req latency p50 1.741s p95 1.763s mean 1.735s
aggregate throughput: 286.0 output_tokens/sec  (wall=17.48s, total_out_tokens=5000)
