# Benchmark a Model

A test harness for benchmarking model architectures under different deployment configurations.

In [25]:
from model_library.cards.architectures import *
from model_library.cards.models import *
from model_library.cards.deployments import *
from model_library.models import *
from model_library.benchmark_apis import *

In [26]:
# Benchmark target: the architecture, model and deployment selections used by
# the cells below (the right-hand names come from the star imports above).
ARCH = LLAMA_CPP
MODEL = LLAMA_CPP_LLAMA_2_7b_CHAT_Q2_GGUF
DEPLOYMENT = MINIMAL_CPU_DEPLOYMENT

# Identity the benchmark runs under: id and namespace are both "benchmark",
# and the API key is left as an empty string.
USER = UserInformation(id="benchmark", namespace="benchmark", API_key="")

In [27]:
# Show the parameters carried by the selected model (bare expression, so the
# notebook renders its value as the cell output).
MODEL.params

{'model_path': '/model/llama-2-7b-chat.Q2_K.gguf',
 'hf_repo_id': 'TheBloke/Llama-2-7B-Chat-GGUF',
 'hf_filename': 'llama-2-7b-chat.Q2_K.gguf',
 'volume_name': 'awesome-model-storage',
 'pvc_name': 'awesome-model-pvc'}

In [28]:
# Name passed to the template as its "deployment_name" parameter.
deployment_name = "benchmarkmodel"

# Combine the selected model with the selected deployment into one template;
# name and description are intentionally left blank.
template = ModelDeploymentTemplateCard(
    id="1", name="", description="",
    model_card=MODEL,
    deployment_card=DEPLOYMENT,
    params=dict(deployment_name=deployment_name),
)



In [6]:
#deploy_generic_model(config=config)

In [7]:
#deployments = list_deployments(namespace=USER.namespace)
#deployments

In [8]:
#deployments = list_deployments(namespace=USER.namespace)
#deployments
#deployment = deployments[deployment_name]
#port = max(deployment["ports"][0])

In [19]:
import time
import logging
from model_library.benchmark_apis import benchmark_model
from tqdm import tqdm
from typing import Optional, Dict, Any

# Configure the root logger: emit INFO and above, each record prefixed with
# its timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [29]:

# Benchmark the single `template` defined earlier in the notebook.
# NOTE(review): undeploy=False presumably leaves the deployment running after
# the benchmark finishes — confirm against KubernetesBenchmark.
benchmark = KubernetesBenchmark(template, undeploy=False)
result = benchmark.benchmark_deployment()


2024-01-16 09:54:50,998 - INFO - Deploying the model.


Delete request successful.


2024-01-16 09:54:56,726 - INFO - Model deployed.
2024-01-16 09:54:56,728 - INFO - Waiting for the live service.


Request successful.


 63%|██████▎   | 380/600 [06:22<03:41,  1.01s/it]
2024-01-16 10:01:19,290 - INFO - Service is live. Warmup time: 380 seconds.
2024-01-16 10:01:19,291 - INFO - Running benchmarks.
100%|██████████| 25/25 [06:31<00:00, 15.67s/it]

     type  average_response_time  average_tokens_returned  \
0  serial              15.665051                    57.64   

   total_tokens_returned   wall_time       tps  
0                   1441  391.702164  3.678816  





In [21]:
def _llama_cpp_deployment(replicas, cpus):
    """Build a llama.cpp DeploymentCard for the given replica and CPU counts.

    RAM ("12Gi") and PVC storage ("24Gi") requests are the same for every card.
    """
    return DeploymentCard(
        id="4",
        description="Minimal Deployment for llama.cpp",
        params={
            "replicas": replicas,
            "num_cpus": cpus,
            "ram_memory": "12Gi",
            "pvc_storage_request": "24Gi",
        },
    )


def _template_for(deployment_card):
    """Wrap MODEL and one deployment card in a ModelDeploymentTemplateCard."""
    return ModelDeploymentTemplateCard(
        id="1",
        name="",
        description="",
        model_card=MODEL,
        deployment_card=deployment_card,
        params={"deployment_name": deployment_name},
    )


# Single-replica, 4-CPU reference card.
# NOTE(review): not referenced by the sweep below — confirm it is still needed.
base_model = _llama_cpp_deployment(replicas=1, cpus=4)

# CPU counts to sweep over; each gets its own two-replica deployment card.
num_cpus = [4, 8, 12, 16]
deployments = [_llama_cpp_deployment(replicas=2, cpus=n) for n in num_cpus]

deployment_name = "benchmarkmodel"

# One template per deployment card, in the same order as num_cpus.
templates = list(map(_template_for, deployments))

In [22]:
# Collect one benchmark result per CPU configuration that is run.
results = []

# NOTE(review): the [2:] slices restrict this run to the configurations from
# index 2 onward of num_cpus / templates; remove both slices to benchmark the
# whole sweep.
#
# The loop targets are deliberately not named `n` / `template`: a `for` target
# stays bound after the loop ends, so reusing `template` here would overwrite
# the single-run `template` defined earlier in the notebook and change what
# that earlier benchmark cell deploys when it is re-run.
for cpu_count, sweep_template in zip(num_cpus[2:], templates[2:]):
    print("N: ", cpu_count)
    # Each configuration gets its own benchmark run with undeploy=True.
    benchmark = KubernetesBenchmark(sweep_template, undeploy=True)
    result = benchmark.benchmark_deployment()
    results.append(result)


N:  12


2024-01-15 17:19:19,920 - INFO - Deploying the model.


Delete request successful.


2024-01-15 17:19:25,485 - INFO - Model deployed.
2024-01-15 17:19:25,486 - INFO - Waiting for the live service.


Request successful.


  0%|          | 0/600 [00:00<?, ?it/s]
2024-01-15 17:19:25,775 - INFO - Service is live. Warmup time: 0 seconds.
2024-01-15 17:19:25,776 - INFO - Running benchmarks.
 20%|██        | 5/25 [00:24<01:37,  4.86s/it]


N:  16


2024-01-15 17:19:50,380 - INFO - Deploying the model.


Delete request successful.


2024-01-15 17:19:56,227 - INFO - Model deployed.
2024-01-15 17:19:56,231 - INFO - Waiting for the live service.


Request successful.


100%|██████████| 600/600 [10:02<00:00,  1.00s/it]
2024-01-15 17:29:59,021 - INFO - Service is live. Warmup time: 600 seconds.
2024-01-15 17:29:59,022 - INFO - Cleaning up the deployment.


Delete request successful.


In [24]:
# Display the benchmark results collected by the sweep loop.
# NOTE(review): the saved output shows a deployment with replicas=1 and
# num_cpus=4, which does not match the two-replica 12/16-CPU sweep above — it
# looks like output from an earlier execution; re-run to refresh.
results

[{'benchmarks': [{'type': 'serial',
    'average_response_time': 15.530141487121583,
    'average_tokens_returned': 57.96,
    'total_tokens_returned': 1449,
    'wall_time': 388.30193424224854,
    'tps': 3.7316321970624475}],
  'warmup_time': 360,
  'template': ModelDeploymentTemplateCard(id='1', name='', description='', model_card=ModelCard(id='3', architecture=ArchitectureCard(id='cd993f75-2601-4633-b510-60dbd954bfb2', deployment_yaml='llama_cpp_python.yaml', name='Llama.cpp (python)', description='The python llama.cpp engine', tags={'cpu': True, 'gpu': True}, health_endpoint='v1/models'), params={'model_path': '/model/llama-2-7b-chat.Q2_K.gguf', 'hf_repo_id': 'TheBloke/Llama-2-7B-Chat-GGUF', 'hf_filename': 'llama-2-7b-chat.Q2_K.gguf', 'volume_name': 'awesome-model-storage', 'pvc_name': 'awesome-model-pvc'}), benchmarks=None, deployment_card=DeploymentCard(id='4', params={'replicas': 1, 'num_cpus': 4, 'ram_memory': '12Gi', 'pvc_storage_request': '24Gi'}), viable_deployment_cards=[]