In [1]:
import requests, time
import concurrent.futures
from transformers import AutoTokenizer
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def send_request():
    url = 'http://localhost:8000/v2/models/ensemble/generate'
    data = {
        "text_input": "How do I count to nine in French?",
        "parameters": {
            "max_tokens": 200,
            "bad_words": [""],
            "stop_words": [""],
            "temperature": 1.0,
        }
    }
    response = requests.post(url, json=data)
    return response


def concurrent_test(n_threads):
    out = []
    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(send_request) for _ in range(n_threads)]
        for future in concurrent.futures.as_completed(futures):
            response = future.result()
            out.append(response.json()['text_output'])
    request_time = time.perf_counter() - start
    toks = sum([len(tokenizer.encode(o)) for o in out])
    return toks / request_time
        

In [3]:
concurrent_test(1)

84.0557487395887

In [4]:
concurrent_test(2)

202.75688994904536

In [5]:
concurrent_test(10)

318.1629820216799

In [6]:
concurrent_test(50)

364.9845918727431

In [7]:
concurrent_test(100)

377.2531939066671