In [1]:
import requests, time
import threading
from transformers import AutoTokenizer
# Tokenizer used by concurrent_test() to count the tokens in each server response.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

In [3]:
def send_request(i, timeout=600):
    """POST one fixed generation request and store the reply text in ``out[i]``.

    Designed to be used as a thread target: instead of returning the result,
    it writes the ``'text_output'`` field of the JSON reply into slot ``i`` of
    the module-level list ``out``.

    Args:
        i: Index of the slot in the global ``out`` list to fill.
        timeout: Seconds ``requests`` will wait on the server before giving
            up. Without a timeout a stalled server would block this thread
            (and any ``join()`` on it) indefinitely. The default is generous
            because the reply only arrives once generation has finished.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the JSON reply has no ``'text_output'`` field.
    """
    global out
    url = 'http://localhost:8000/v2/models/ensemble/generate'
    data = {
        "text_input": "How do I count to nine in French?",
        "parameters": {
            "max_tokens": 500,
            "bad_words": [""],
            "stop_words": [""],
            "temperature": 0,
        }
    }
    response = requests.post(url, json=data, timeout=timeout)
    # Fail on the HTTP status itself, so an error reply is reported as such
    # rather than as a confusing KeyError / JSON decode error further down.
    response.raise_for_status()
    out[i] = response.json()['text_output']

def concurrent_test(n_threads):
    """Send ``n_threads`` requests at once and return the aggregate tokens/second.

    One thread per request runs ``send_request``; each thread deposits its
    response text in the global list ``out``. The wall-clock time from starting
    the first thread until the last one has been joined is used as the
    denominator.

    Args:
        n_threads: Number of simultaneous requests to issue.

    Returns:
        Total number of tokens across all responses divided by the elapsed
        wall-clock seconds.

    Raises:
        RuntimeError: If any request thread finished without storing a result
            (its slot in ``out`` is still ``None``).
    """
    global out
    out = [None] * n_threads  # one pre-allocated slot per thread
    threads = [
        threading.Thread(target=send_request, args=(index,))
        for index in range(n_threads)
    ]

    start = time.perf_counter()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    request_time = time.perf_counter() - start

    # An exception inside a thread does not propagate to join(); the only
    # trace left here is an unfilled slot. Report that explicitly instead of
    # letting the tokenizer choke on None.
    failed = [index for index, text in enumerate(out) if text is None]
    if failed:
        raise RuntimeError(
            f'{len(failed)} of {n_threads} requests produced no output '
            f'(slots {failed})'
        )

    # encode() adds special tokens by default (e.g. a BOS token for Llama-style
    # tokenizers); those were not generated by the server, so leave them out.
    # NOTE(review): if the server echoes the prompt inside 'text_output', the
    # prompt tokens are counted as well -- confirm against the backend config.
    toks = sum(len(tokenizer.encode(o, add_special_tokens=False)) for o in out)
    return toks / request_time

In [4]:
def measure(bs, n_times=3):
    """Run the concurrency benchmark repeatedly and print the mean throughput.

    Args:
        bs: Number of concurrent requests passed to ``concurrent_test``.
        n_times: How many independent runs to average over.

    Prints the mean total tokens/second and that mean divided by ``bs``.
    """
    import numpy as np

    # Collect one throughput figure per independent run.
    throughputs = []
    for _ in range(n_times):
        throughputs.append(concurrent_test(bs))

    total_rate = np.mean(throughputs)
    per_thread_rate = total_rate / bs

    report = (
        f'\n\nConcurrent Requests={bs} (averaged over {n_times} separate experiments)\n'
        '==============================\n'
        f'tok/sec total:      {total_rate:.1f}\n'
        f'tok/sec per thread: {per_thread_rate:.1f}'
    )
    print(report)

In [5]:
# Sweep the number of concurrent requests in powers of two: 1, 2, 4, ..., 64.
for bs in (1 << exponent for exponent in range(7)):
    measure(bs)



Concurrent Requests=1 (averaged over 3 separate experiments)
tok/sec total:      185.3
tok/sec per thread: 185.3


Concurrent Requests=2 (averaged over 3 separate experiments)
tok/sec total:      355.7
tok/sec per thread: 177.9


Concurrent Requests=4 (averaged over 3 separate experiments)
tok/sec total:      638.5
tok/sec per thread: 159.6


Concurrent Requests=8 (averaged over 3 separate experiments)
tok/sec total:      958.1
tok/sec per thread: 119.8


Concurrent Requests=16 (averaged over 3 separate experiments)
tok/sec total:      961.5
tok/sec per thread: 60.1


Concurrent Requests=32 (averaged over 3 separate experiments)
tok/sec total:      962.1
tok/sec per thread: 30.1


Concurrent Requests=64 (averaged over 3 separate experiments)
tok/sec total:      963.0
tok/sec per thread: 15.0
