In [1]:
import time, statistics, requests, json

# Base URL of the inference server under test (queried via /v1/models and
# /v1/completions in the cells below).
BASE_URL = "http://localhost:8002"
# Number of sequential requests issued per benchmark run.
N = 50
# Number of times "hello " is repeated to build the prompt. Presumably about
# one token per repetition; the exact count depends on the model's tokenizer.
INPUT_LEN = 100
# Upper bound on generated tokens per request (sent as `max_tokens`).
OUTPUT_LEN = 50


In [2]:
# 1) Discover the id of the first model listed by the server at BASE_URL.
models_resp = requests.get(f"{BASE_URL}/v1/models", timeout=10)
# Fail loudly on HTTP errors instead of tripping over a JSON/KeyError later.
models_resp.raise_for_status()
models = models_resp.json()
if not models.get("data"):
    raise RuntimeError(f"No models listed at {BASE_URL}/v1/models")
model_id = models["data"][0]["id"]
print("Using model:", model_id)

# Synthetic prompt: "hello " repeated INPUT_LEN times.
prompt = "hello " * INPUT_LEN

Using model: /mnt/elita/soundwave/models/llama3-70b-awq


In [3]:
# Sequential, non-streaming benchmark: one request at a time, timing the full
# round trip (request sent -> response received -> JSON body parsed).
latencies = []
for _attempt in range(N):
    t0 = time.perf_counter()
    r = requests.post(
        f"{BASE_URL}/v1/completions",
        json={
            "model": model_id,
            "prompt": prompt,
            "max_tokens": OUTPUT_LEN,
            "temperature": 0,
        },
        timeout=300,
    )
    r.raise_for_status()
    # Parse the body so client-side decoding stays inside the timed window,
    # but do not bind the result to `_`: rebinding `_` in a notebook stops
    # IPython from updating its last-output variables.
    r.json()
    latencies.append(time.perf_counter() - t0)

# Ascending copy used for percentile lookups.
lat_sorted = sorted(latencies)

In [7]:
def pct(p, values=None):
    """Return the nearest-rank percentile of an ascending-sorted sample.

    Args:
        p: Fraction in [0, 1], e.g. 0.95 for p95.
        values: Ascending-sorted sequence to index into. When omitted, the
            notebook-level ``lat_sorted`` is used.

    Returns:
        The element at index ``round(p * (len(values) - 1))``.

    Raises:
        ValueError: If the sample is empty or ``p`` is outside [0, 1].
    """
    data = lat_sorted if values is None else values
    if len(data) == 0:
        raise ValueError("pct() needs at least one sample")
    if not 0.0 <= p <= 1.0:
        # A negative p would otherwise wrap around via negative indexing and
        # silently return a value from the wrong end of the list.
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    return data[int(round(p * (len(data) - 1)))]

# Report the number of samples actually collected rather than the configured
# N, so the summary stays truthful if this cell is re-run after N was changed.
print(f"n={len(latencies)}")
# Percentiles are read from the sorted copy; the mean uses the raw samples.
print(f"p50  {pct(0.50):.3f}s")
print(f"p95  {pct(0.95):.3f}s")
print(f"p99  {pct(0.99):.3f}s")
print(f"mean {statistics.mean(latencies):.3f}s")

n=50
p50  0.177s
p95  0.179s
p99  0.240s
mean 0.179s


What it means (for your exact test):

- p50 ~ 177 ms: half your requests finished in ≤ 0.177s

- p95 ~ 179 ms: almost all requests finished around the same time (very low jitter)

- p99 ~ 240 ms: with n=50 the p99 index rounds to the last element, so this is the single slowest request (~240 ms) rather than a true 1% tail

- mean ~ 179 ms: consistent with p50/p95 → stable run

Given your settings (a prompt of 100 repetitions of “hello ” — roughly 100 tokens, depending on the tokenizer — plus max_tokens=50, one request at a time, sequentially), this looks like you’re measuring mostly single-request end-to-end HTTP latency + generation with little queueing.

In [8]:
import time, statistics, requests

# Benchmark configuration for this run.
BASE_URL = "http://localhost:8002"
N = 50             # sequential requests
INPUT_LEN = 100    # repetitions of "hello " in the prompt
OUTPUT_LEN = 50    # sent as max_tokens

# Discover model id
models_resp = requests.get(f"{BASE_URL}/v1/models", timeout=10)
# Fail loudly on HTTP errors instead of tripping over a JSON/KeyError later.
models_resp.raise_for_status()
models_data = models_resp.json().get("data")
if not models_data:
    raise RuntimeError(f"No models listed at {BASE_URL}/v1/models")
model_id = models_data[0]["id"]
print("Using model:", model_id)

# Synthetic prompt: "hello " repeated INPUT_LEN times.
prompt = "hello " * INPUT_LEN

# Per-request timings in seconds.
ttft = []   # request sent -> first streamed data event
total = []  # request sent -> end of stream

for _attempt in range(N):
    t0 = time.perf_counter()
    with requests.post(
        f"{BASE_URL}/v1/completions",
        json={
            "model": model_id,
            "prompt": prompt,
            "max_tokens": OUTPUT_LEN,
            "temperature": 0,
            "stream": True,
        },
        stream=True,
        timeout=300,
    ) as r:
        r.raise_for_status()

        first_token_time = None
        # vLLM streams as SSE: lines like "data: {...}" and ends with "data: [DONE]".
        # Lines are compared as raw bytes: the payload is never parsed here, and
        # this avoids depending on the response declaring a charset (without a
        # known encoding, decode_unicode=True yields bytes and a str prefix test
        # would raise TypeError).
        for raw in r.iter_lines():
            if not raw:
                continue
            if raw.startswith(b"data: "):
                data = raw[len(b"data: "):]
                if data.strip() == b"[DONE]":
                    break
                if first_token_time is None:
                    first_token_time = time.perf_counter()

        # Stream finished (either [DONE] was seen or the server closed it).
        t1 = time.perf_counter()

    if first_token_time is None:
        raise RuntimeError("Did not receive any streamed token data; check server streaming support.")
    ttft.append(first_token_time - t0)
    total.append(t1 - t0)

# Ascending copies used for percentile lookups.
ttft_s = sorted(ttft)
total_s = sorted(total)
def pct(arr, p):
    """Return the nearest-rank percentile of an ascending-sorted sample.

    Args:
        arr: Ascending-sorted sequence of samples.
        p: Fraction in [0, 1], e.g. 0.95 for p95.

    Returns:
        The element at index ``round(p * (len(arr) - 1))``.

    Raises:
        ValueError: If ``arr`` is empty or ``p`` is outside [0, 1].
    """
    if len(arr) == 0:
        raise ValueError("pct() needs at least one sample")
    if not 0.0 <= p <= 1.0:
        # A negative p would otherwise wrap around via negative indexing and
        # silently return a value from the wrong end of the list.
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    return arr[int(round(p * (len(arr) - 1)))]

# Summary: sample count, then one line per metric (percentiles from the sorted
# copy, mean from the raw samples).
print(f"n={N}")
for label, ordered, samples in (("TTFT ", ttft_s, ttft), ("TOTAL", total_s, total)):
    p50, p95, p99 = (pct(ordered, q) for q in (0.50, 0.95, 0.99))
    avg = statistics.mean(samples)
    print(f"{label} p50 {p50:.3f}s  p95 {p95:.3f}s  p99 {p99:.3f}s  mean {avg:.3f}s")

Using model: /mnt/elita/soundwave/models/llama3-70b-awq
n=50
TTFT  p50 0.060s  p95 0.060s  p99 0.076s  mean 0.060s
TOTAL p50 0.177s  p95 0.177s  p99 0.193s  mean 0.177s


What your numbers say:

- TTFT p50 ≈ 60 ms (p99 76 ms)
    → the server starts streaming the first token very fast. TTFT covers request handling plus prefill of the whole prompt, so a tight ~60 ms suggests no queueing in this run.

- TOTAL p50 ≈ 177 ms (p99 193 ms)
    → end-to-end completion finishes quickly and tightly.

- Decode time estimate (TOTAL − TTFT):
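    p50 ≈ 0.177 − 0.060 = 0.117 s for generating up to 50 tokens
    → rough decode rate ≈ 50 / 0.117 ≈ 427 tokens/s (very rough: HTTP/streaming overhead is included, and it assumes all 50 tokens were actually generated — if the model stopped early at EOS the real rate is lower, so check the number of tokens returned, e.g. via the `usage` field of a non-streaming response if the server reports it. Still good for comparing configs).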


### Increasing the prompt to stress prefill: INPUT_LEN=1000 (output kept at 50), then compare TTFT again

- Same cell as above; only `INPUT_LEN` changes.

In [9]:
import time, statistics, requests

# Benchmark configuration for this run (longer prompt to stress prefill).
BASE_URL = "http://localhost:8002"
N = 50             # sequential requests
INPUT_LEN = 1000   # repetitions of "hello " in the prompt
OUTPUT_LEN = 50    # sent as max_tokens

# Discover model id
models_resp = requests.get(f"{BASE_URL}/v1/models", timeout=10)
# Fail loudly on HTTP errors instead of tripping over a JSON/KeyError later.
models_resp.raise_for_status()
models_data = models_resp.json().get("data")
if not models_data:
    raise RuntimeError(f"No models listed at {BASE_URL}/v1/models")
model_id = models_data[0]["id"]
print("Using model:", model_id)

# Synthetic prompt: "hello " repeated INPUT_LEN times.
prompt = "hello " * INPUT_LEN

# Per-request timings in seconds.
ttft = []   # request sent -> first streamed data event
total = []  # request sent -> end of stream

for _attempt in range(N):
    t0 = time.perf_counter()
    with requests.post(
        f"{BASE_URL}/v1/completions",
        json={
            "model": model_id,
            "prompt": prompt,
            "max_tokens": OUTPUT_LEN,
            "temperature": 0,
            "stream": True,
        },
        stream=True,
        timeout=300,
    ) as r:
        r.raise_for_status()

        first_token_time = None
        # vLLM streams as SSE: lines like "data: {...}" and ends with "data: [DONE]".
        # Lines are compared as raw bytes: the payload is never parsed here, and
        # this avoids depending on the response declaring a charset (without a
        # known encoding, decode_unicode=True yields bytes and a str prefix test
        # would raise TypeError).
        for raw in r.iter_lines():
            if not raw:
                continue
            if raw.startswith(b"data: "):
                data = raw[len(b"data: "):]
                if data.strip() == b"[DONE]":
                    break
                if first_token_time is None:
                    first_token_time = time.perf_counter()

        # Stream finished (either [DONE] was seen or the server closed it).
        t1 = time.perf_counter()

    if first_token_time is None:
        raise RuntimeError("Did not receive any streamed token data; check server streaming support.")
    ttft.append(first_token_time - t0)
    total.append(t1 - t0)

# Ascending copies used for percentile lookups.
ttft_s = sorted(ttft)
total_s = sorted(total)
def pct(arr, p):
    """Return the nearest-rank percentile of an ascending-sorted sample.

    Args:
        arr: Ascending-sorted sequence of samples.
        p: Fraction in [0, 1], e.g. 0.95 for p95.

    Returns:
        The element at index ``round(p * (len(arr) - 1))``.

    Raises:
        ValueError: If ``arr`` is empty or ``p`` is outside [0, 1].
    """
    if len(arr) == 0:
        raise ValueError("pct() needs at least one sample")
    if not 0.0 <= p <= 1.0:
        # A negative p would otherwise wrap around via negative indexing and
        # silently return a value from the wrong end of the list.
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    return arr[int(round(p * (len(arr) - 1)))]

# Summary: sample count, then one line per metric (percentiles from the sorted
# copy, mean from the raw samples).
print(f"n={N}")
for label, ordered, samples in (("TTFT ", ttft_s, ttft), ("TOTAL", total_s, total)):
    p50, p95, p99 = (pct(ordered, q) for q in (0.50, 0.95, 0.99))
    avg = statistics.mean(samples)
    print(f"{label} p50 {p50:.3f}s  p95 {p95:.3f}s  p99 {p99:.3f}s  mean {avg:.3f}s")

Using model: /mnt/elita/soundwave/models/llama3-70b-awq
n=50
TTFT  p50 0.051s  p95 0.052s  p99 0.508s  mean 0.060s
TOTAL p50 1.486s  p95 1.487s  p99 1.953s  mean 1.495s


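TOTAL p50 jumped from ~0.177s → ~1.486s
An increase is expected with 10× more input, but note where it shows up: prefill cost lands in TTFT, and TTFT did not grow. The extra ~1.3 s is therefore spent after the first token, i.e. in decode — possibly because this run generated more tokens than the short-prompt run (which may have stopped early at EOS). Confirm by checking the number of tokens actually returned in each run.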

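TTFT p50 is still ~51 ms, but p99 spikes to ~508 ms
With n=50, this p99 is simply the single slowest request (the percentile index rounds to the last element). That pattern usually means: most requests are fine, but occasionally one request hits a slow path (e.g., cache miss / warmup / occasional scheduling delay / background work). Because every request sends the identical prompt, a likely candidate is the first request paying the full prefill before any server-side prefix cache is warm — worth confirming by looking at the per-request TTFT values in order. Since your p95 is still ~52 ms, it’s a rare tail event.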

Decode time estimate (p50):
TOTAL − TTFT ≈ 1.486 − 0.051 = 1.435 s for ~50 output tokens
→ rough output rate ≈ 50 / 1.435 ≈ 35 tokens/s