In [8]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Render floating-point values with one decimal place when pandas displays them.
pd.options.display.float_format = '{:.1f}'.format

In [2]:
from transformers import AutoTokenizer
# Load the tokenizer for the Llama-2 7B checkpoint. `use_auth_token=True` is
# passed because the request must be authenticated.
# NOTE(review): newer transformers releases appear to prefer `token=` over
# `use_auth_token=` — confirm against the installed version before changing.
LLAMA_CHECKPOINT = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(LLAMA_CHECKPOINT, use_auth_token=True)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
from common import questions
import numpy as np

# Number of token ids the tokenizer produces for each benchmark question,
# followed by the mean over all questions.
encoded_prompts = map(tokenizer.encode, questions.questions)
lens = [len(token_ids) for token_ids in encoded_prompts]
print(lens)
print(np.mean(lens))

[15, 12, 10, 35, 11, 26, 12, 23, 60]
22.666666666666668


In [4]:
# Benchmark result CSVs, one per serving setup, in the order they are loaded.
files = [
    'vllm-2/bench-vllm-2.csv',
    'bentoml/bench-bentoml.csv',
    'sagemaker/bench-sagemaker-flashattn.csv',
    'triton-tensorRT/bench-triton-tensorRT-llm.csv',
    'triton-tensorRT-quantized/bench-triton-tensorRT-llm-quantized.csv',
    'triton-vllm/bench-triton.csv',
    'triton-vllm-awq/bench-triton-vllm-awq.csv',
    'triton-tensorRT-quantized-awq/bench-triton-tensorRT-llm-quantized-awq.csv',
    'anyscale/bench-anyscale.csv',
]

In [5]:
# Read every benchmark CSV and stack the rows into one frame. Each file's own
# row index is kept, so index labels repeat across files.
per_file_frames = [pd.read_csv(csv_path) for csv_path in files]
df = pd.concat(per_file_frames)

In [6]:
# Per-request throughput: token count divided by the recorded time.
df['Tok/Sec'] = df['tok_count'] / df['time']
df

Unnamed: 0,tok_count,time,question,answer,note,Tok/Sec
0,144,2.4,Write a Rust function that performs binary exp...,Write a Rust function that performs binary exp...,vllm-fastapi-server,60.2
1,210,3.5,What are the differences between Javascript an...,What are the differences between Javascript an...,vllm-fastapi-server,59.3
2,235,3.5,Write a story in the style of James Joyce abou...,Write a story in the style of James Joyce abou...,vllm-fastapi-server,66.3
3,11,0.0,Who does Harry turn into a balloon?,Who does Harry turn into a balloon??,vllm-fastapi-server,508.2
4,226,3.5,Write a tale about a time-traveling historian ...,Write a tale about a time-traveling historian ...,vllm-fastapi-server,63.8
...,...,...,...,...,...,...
3,201,3.4,Who does Harry turn into a balloon?,"Ah, a most intriguing question, my dear fell...",anyscale,59.6
4,201,3.4,Write a tale about a time-traveling historian ...,"As a time-traveling historian, I have always...",anyscale,59.6
5,114,2.0,What is the product of 9 and 8?,"Ah, a question that requires a simple yet el...",anyscale,57.1
6,201,3.2,"If a train travels 120 kilometers in 2 hours, ...","Oh, goodness gracious, thank you ever so kin...",anyscale,62.9


In [15]:
# Remove per-framework throughput outliers: within each `note` group keep only
# the rows whose Tok/Sec lies strictly inside mean +/- N_STD_DEV standard
# deviations of that group.
N_STD_DEV = 2
kept_frames = []
for framework in df.note.unique():
    _df = df[df.note == framework]
    og_nrows = _df.shape[0]
    # Compute the group statistics once and derive both bounds from them.
    centre = _df['Tok/Sec'].mean()
    spread = N_STD_DEV * _df['Tok/Sec'].std()
    upper_limit = centre + spread
    lower_limit = centre - spread
    # A single-row group has a NaN std and a constant group has a zero std; in
    # both cases the strict comparisons would discard every row, so the filter
    # is only applied when the spread is a positive number (NaN > 0 is False).
    if spread > 0:
        _df = _df[(_df['Tok/Sec'] < upper_limit) & (_df['Tok/Sec'] > lower_limit)]
    print(f'Dropping {og_nrows - _df.shape[0]} outliers from {framework}')
    kept_frames.append(_df)

# Concatenate once at the end instead of growing a frame inside the loop; an
# input without any groups still yields an empty DataFrame.
filtered = pd.concat(kept_frames) if kept_frames else pd.DataFrame()

Dropping 1 outliers from vllm-fastapi-server
Dropping 0 outliers from bentoml-vllm
Dropping 1 outliers from sagemaker-realtime-hf-endpoint-flashattention
Dropping 1 outliers from triton-tensorRT-llm
Dropping 1 outliers from triton-tensorRT-llm-quantized
Dropping 1 outliers from triton-vllm
Dropping 0 outliers from triton-vllm-awq
Dropping 0 outliers from triton-tensorRT-llm-quantized-awq
Dropping 0 outliers from anyscale


In [16]:
# Display the benchmark rows that remain after outlier removal.
filtered

Unnamed: 0,tok_count,time,question,answer,note,Tok/Sec
0,144,2.4,Write a Rust function that performs binary exp...,Write a Rust function that performs binary exp...,vllm-fastapi-server,60.2
1,210,3.5,What are the differences between Javascript an...,What are the differences between Javascript an...,vllm-fastapi-server,59.3
2,235,3.5,Write a story in the style of James Joyce abou...,Write a story in the style of James Joyce abou...,vllm-fastapi-server,66.3
4,226,3.5,Write a tale about a time-traveling historian ...,Write a tale about a time-traveling historian ...,vllm-fastapi-server,63.8
5,32,0.4,What is the product of 9 and 8?,What is the product of 9 and 8? What is 3 mult...,vllm-fastapi-server,85.7
...,...,...,...,...,...,...
3,201,3.4,Who does Harry turn into a balloon?,"Ah, a most intriguing question, my dear fell...",anyscale,59.6
4,201,3.4,Write a tale about a time-traveling historian ...,"As a time-traveling historian, I have always...",anyscale,59.6
5,114,2.0,What is the product of 9 and 8?,"Ah, a question that requires a simple yet el...",anyscale,57.1
6,201,3.2,"If a train travels 120 kilometers in 2 hours, ...","Oh, goodness gracious, thank you ever so kin...",anyscale,62.9


In [23]:
# Average token count and average time per `note` group; `note` is moved back
# from the index into an ordinary column.
grouped_by_note = filtered.groupby('note')
metric_means = grouped_by_note[['tok_count', 'time']].mean()
result = metric_means.reset_index()

In [24]:
# Throughput here is the ratio of the two group averages (mean tokens divided
# by mean time), not the mean of the per-request ratios.
result['Tok/Sec'] = result['tok_count'] / result['time']

# Give the averaged columns display names, then order rows from slowest to
# fastest and renumber them.
display_names = {'tok_count': 'Avg Tok Count', 'time': 'Avg Time'}
result = result.rename(columns=display_names)
result = result.sort_values(by='Tok/Sec', ascending=True)
result = result.reset_index(drop=True)

In [38]:
def backend(note):
    """Map a benchmark ``note`` label to the name of its inference backend.

    Checks are applied in this order:

    1. any note containing ``vllm`` (case-insensitive) -> ``'vLLM'``
    2. any note containing ``sagemaker`` (case-insensitive) -> ``'TGI'``
    3. one of the exact Triton/TensorRT labels -> ``'TensorRT-LLM'``
    4. exactly ``'anyscale'`` -> ``'Anyscale'``

    Args:
        note: The label string identifying a benchmark configuration.

    Returns:
        The backend name, or ``None`` when the note matches none of the rules.
    """
    lowered = note.lower()
    # The substring check also covers 'triton-vllm-awq', so that label needs no
    # separate exact-match case.
    if 'vllm' in lowered:
        return 'vLLM'
    if 'sagemaker' in lowered:
        return 'TGI'
    # These labels are matched exactly (case-sensitive).
    if note in ('triton-tensorRT-llm',
                'triton-tensorRT-llm-quantized',
                'triton-tensorRT-llm-quantized-awq'):
        return 'TensorRT-LLM'
    if note == 'anyscale':
        return 'Anyscale'
    return None

def quant(note):
    """Map a benchmark ``note`` label to a description of its quantization.

    All comparisons are exact and case-sensitive.

    Args:
        note: The label string identifying a benchmark configuration.

    Returns:
        The string ``'None'`` for configurations run without quantization, a
        description of the quantization scheme for quantized ones,
        ``'Unknown'`` for ``'anyscale'``, and the ``None`` object when the
        label is not recognised.
    """
    # The SageMaker endpoint label is listed explicitly so it yields the string
    # 'None' directly instead of falling through to a missing value that the
    # caller has to fill in afterwards.
    unquantized = (
        'vllm-fastapi-server',
        'bentoml-vllm',
        'triton-tensorRT-llm',
        'triton-vllm',
        'sagemaker-realtime-hf-endpoint-flashattention',
    )
    if note in unquantized:
        return 'None'
    if note == 'triton-tensorRT-llm-quantized':
        return 'INT8 weight-only quantization'
    if note in ('triton-vllm-awq', 'triton-tensorRT-llm-quantized-awq'):
        return 'AWQ 4bit quantization'
    if note == 'anyscale':
        return 'Unknown'
    return None

def frontend(note):
    """Map a benchmark ``note`` label to the name of its serving front end.

    Substring rules are case-insensitive and take priority in the order
    ``fastapi``, ``triton``, ``sagemaker``; the remaining labels must match
    exactly (case-sensitive).

    Args:
        note: The label string identifying a benchmark configuration.

    Returns:
        The front-end name, or ``None`` when the note matches none of the rules.
    """
    lowered = note.lower()
    substring_rules = (
        ('fastapi', 'FastAPI'),
        ('triton', 'Triton'),
        ('sagemaker', 'SageMaker'),
    )
    for needle, label in substring_rules:
        if needle in lowered:
            return label

    # Neither exact label contains any of the substrings above, so looking them
    # up after the substring pass gives the same answer as interleaving them.
    exact_rules = {'bentoml-vllm': 'OpenLLM', 'anyscale': 'Anyscale'}
    return exact_rules.get(note)

# Derive the descriptive columns from each row's `note` label.
result['Front End'] = result['note'].apply(frontend)
result['Backend'] = result['note'].apply(backend)
result['Quantization'] = result['note'].apply(quant)

# Take an explicit copy of the presentation columns so that later writes to
# `final` modify an independent frame rather than a slice of `result`.
final = result[['Front End', 'Backend', 'Quantization', 'Avg Tok Count', 'Avg Time', 'Tok/Sec']].copy()
# Labels for which `quant` returned no value are shown as the string 'None'.
final['Quantization'] = final['Quantization'].fillna('None')
final

Unnamed: 0,Front End,Backend,Quantization,Avg Tok Count,Avg Time,Tok/Sec
0,SageMaker,TGI,,218.4,6.6,33.0
1,OpenLLM,vLLM,,191.1,3.4,55.7
2,Triton,vLLM,,218.4,3.7,58.6
3,Anyscale,Anyscale,Unknown,190.0,3.1,61.6
4,FastAPI,vLLM,,179.1,2.7,65.5
5,Triton,TensorRT-LLM,,220.4,3.3,67.3
6,Triton,TensorRT-LLM,INT8 weight-only quantization,220.4,1.8,122.6
7,Triton,vLLM,AWQ 4bit quantization,223.6,1.6,142.2
8,Triton,TensorRT-LLM,AWQ 4bit quantization,225.6,1.1,205.5


In [41]:
# Build one label per configuration: front end, backend and quantization
# joined by underscores, with spaces in the quantization text turned into
# hyphens. `assign` returns a new frame, so no column is written into a frame
# that was derived from a slice, and re-running the cell gives the same result.
final = final.assign(
    Frontend_Backend_Quantization=(
        final['Front End']
        + '_' + final['Backend']
        + '_' + final['Quantization'].str.replace(' ', '-', regex=False)
    )
)
final

Unnamed: 0,Front End,Backend,Quantization,Avg Tok Count,Avg Time,Tok/Sec,combination,Frontend_Backend_Quantization
0,SageMaker,TGI,,218.4,6.6,33.0,SageMaker_TGI_None,SageMaker_TGI_None
1,OpenLLM,vLLM,,191.1,3.4,55.7,OpenLLM_vLLM_None,OpenLLM_vLLM_None
2,Triton,vLLM,,218.4,3.7,58.6,Triton_vLLM_None,Triton_vLLM_None
3,Anyscale,Anyscale,Unknown,190.0,3.1,61.6,Anyscale_Anyscale_Unknown,Anyscale_Anyscale_Unknown
4,FastAPI,vLLM,,179.1,2.7,65.5,FastAPI_vLLM_None,FastAPI_vLLM_None
5,Triton,TensorRT-LLM,,220.4,3.3,67.3,Triton_TensorRT-LLM_None,Triton_TensorRT-LLM_None
6,Triton,TensorRT-LLM,INT8 weight-only quantization,220.4,1.8,122.6,Triton_TensorRT-LLM_INT8-weight-only-quantization,Triton_TensorRT-LLM_INT8-weight-only-quantization
7,Triton,vLLM,AWQ 4bit quantization,223.6,1.6,142.2,Triton_vLLM_AWQ-4bit-quantization,Triton_vLLM_AWQ-4bit-quantization
8,Triton,TensorRT-LLM,AWQ 4bit quantization,225.6,1.1,205.5,Triton_TensorRT-LLM_AWQ-4bit-quantization,Triton_TensorRT-LLM_AWQ-4bit-quantization


In [42]:
# Chart of average Tok/Sec per configuration label, with the categories on the
# x axis ordered from the largest total to the smallest.
fig = px.histogram(
    final,
    x="Frontend_Backend_Quantization",
    y="Tok/Sec",
    histfunc="avg",
    title="Framework by tokens/second",
)
x_axis_options = dict(categoryorder='total descending')
fig.update_layout(xaxis=x_axis_options)
fig.show()