# Bench Mark a Model

A Test harness for benchmarking architectures with different deployment

In [1]:
from model_library.cards.architectures import *
from model_library.cards.models import *
from model_library.cards.deployments import *
from model_library.models import *





In [2]:
ARCH = LLAMA_CPP
MODEL = LLAMA_CPP_LLAMA_2_7b_CHAT_Q2_GGUF
DEPLOYMENT = MINIMAL_CPU_DEPLOYMENT
USER = UserInformation(
    id="benchmark",
    namespace="benchmark",
    API_key=""
)

In [3]:
MODEL.params

{'model_path': '/model/llama-2-7b-chat.Q2_K.gguf',
 'hf_repo_id': 'TheBloke/Llama-2-7B-Chat-GGUF',
 'hf_filename': 'llama-2-7b-chat.Q2_K.gguf',
 'volume_name': 'awesome-model-storage',
 'pvc_name': 'awesome-model-pvc'}

In [4]:
deployment_name = "benchmarkmodel"

template = ModelDeploymentTemplateCard(
    id="1",
    name="",
    description="",
    model_card=MODEL,
    deployment_card=DEPLOYMENT,
    params={
        "deployment_name": deployment_name
    }
)



In [5]:
# Get a list of all the models
def deploy_generic_model(config):
    import requests
    import json

    # Prepare the request data
    data = {"config": config}

    # Define the URL and headers
    url = "http://0.0.0.0:8000/v1/deploy_generic_model"
    headers = {"accept": "application/json", "Content-Type": "application/json"}

    # Make the POST request
    response = requests.post(url, headers=headers, data=json.dumps(data))

    return response



def delete_deployment(namespace, deployment_name):
    import requests
    import json

    # Prepare the request data
    data = {"namespace": namespace, "deployment_name": deployment_name}

    # Define the URL and headers
    url = "http://0.0.0.0:8000/v1/delete_deepsparse_model"
    headers = {"accept": "application/json", "Content-Type": "application/json"}

    # Make the POST request
    response = requests.post(url, headers=headers, data=json.dumps(data))

    # Check the response
    if response.status_code == 200:
        print("Delete request successful.")
        return response.json()
    else:
        print(f"Delete request failed with status code {response.status_code}.")
        return None
    

def list_deployments(namespace):
    import requests
    import json

    # Prepare the request data
    data = {"namespace": namespace}

    # Define the URL and headers
    url = "http://0.0.0.0:8000/v1/list_deepsparse_deployments"
    headers = {"accept": "application/json", "Content-Type": "application/json"}

    # Make the POST request
    response = requests.post(url, headers=headers, data=json.dumps(data))

    # Check the response
    if response.status_code == 200:
        print("Request successful.")
        return response.json()
    else:
        print(f"Request failed with status code {response.status_code}.")
        return None


In [6]:
#deploy_generic_model(config=config)

In [7]:
#deployments = list_deployments(namespace=USER.namespace)
#deployments

In [8]:
#deployments = list_deployments(namespace=USER.namespace)
#deployments
#deployment = deployments[deployment_name]
#port = max(deployment["ports"][0])

In [9]:
import time
from model_library.benchmark_apis import benchmark_model

def benchmark_deployment_template(deployment_template, public_url = "178.62.13.8", live_service_cutout=5*60, user = None):


    deployment_name = deployment_template.params["deployment_name"]

    if user is None:
        user = UserInformation(
            id="benchmark",
            namespace="benchmark",
            API_key=""
        )

    deployment = ModelDeploymentCard(
        user_information=USER,
        model_deployment_template=deployment_template
    )

    config = deployment.extract_deployment_config()

    deploy_generic_model(config)

    time.sleep(5)

    deployments = list_deployments(namespace=USER.namespace)
    deployments
    deployment = deployments[deployment_name]
    port = max(deployment["ports"][0])

    print("Wait for live service")


    api_available = ARCH.check_health(
            port = port,
            url = "178.62.13.8"
        )
    warmup_time = 0
    interval = 10
    
    while not api_available and warmup_time < live_service_cutout:
        print("Waiting ", warmup_time, " seconds")
        warmup_time+=interval
        time.sleep(interval)
        api_available = ARCH.check_health(
            port = port,
            url = "178.62.13.8"
        )

    if not api_available:
        return {
            "benchmarks": {},
            "warmup_time": warmup_time,
            "template": deployment_template

        }

    # TODO SUPPORT DEEPSPARSE
    backend_to_test = 'LLAMACPP'
    benchmarks = benchmark_model(backend_to_test, port=port, url=public_url)


    delete_deployment(user.namespace,deployment_name)

    return {
            "benchmarks": benchmarks.to_dict("records"),
            "warmup_time": warmup_time,
            "template": deployment_template,
            "deployment": deployment
        }





In [19]:
import time
import logging
from model_library.benchmark_apis import benchmark_model
from tqdm import tqdm
from typing import Optional, Dict, Any

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class KubernetesBenchmark:
    def __init__(self, 
                 deployment_template: Any, 
                 deploy:bool = True,
                 undeploy:bool = True,
                 public_url: str = "178.62.13.8", 
                 live_service_cutout: int = 10 * 60, 
                 user: Optional[Any] = None,
                 ):
        """
        Initialize the KubernetesBenchmark class.

        :param deployment_template: The deployment template object.
        :param public_url: Public URL for the deployment.
        :param live_service_cutout: Timeout for the service to become live.
        :param user: User information object.
        :param deploy: Flag to deploy before testing.
        :param undeploy: Flag to delete deployment after testing.
        """
        self.deployment_template = deployment_template
        self.public_url = public_url
        self.live_service_cutout = live_service_cutout
        self.user = user or UserInformation(id="benchmark", namespace="benchmark", API_key="")
        self.deployment_name = deployment_template.params["deployment_name"]
        self.port = None
        self.backend_to_test = 'LLAMACPP'  # TODO: Make this configurable if needed
        self.undeploy = undeploy
        self.deploy_first = deploy
        self.architecture = self.deployment_template.model_card.architecture

    def deploy(self) -> None:
        """
        Deploy the model.
        """
        if not self.deploy_first:
            if not self.architecture.check_health(port=self.port, url=self.public_url):
                raise Exception("Cannot test an unavailable model.")

        else:
            delete_deployment(self.user.namespace, self.deployment_name)


        logging.info("Deploying the model.")
        deployment = ModelDeploymentCard(
            user_information=self.user,
            model_deployment_template=self.deployment_template
        )
        config = deployment.extract_deployment_config()
        deploy_generic_model(config)

        time.sleep(5)  # Wait for the deployment to initialize
        logging.info("Model deployed.")

    def wait_for_service(self) -> int:
        """
        Wait for the service to become live.

        :return: The time waited for the service to become live.
        """
        logging.info("Waiting for the live service.")
        deployments = list_deployments(namespace=self.user.namespace)
        deployment = deployments[self.deployment_name]
        self.port = max(deployment["ports"][0])

        warmup_time = 0
        interval = 10

        with tqdm(total=self.live_service_cutout) as pbar:
            while warmup_time < self.live_service_cutout:
                if self.architecture.check_health(port=self.port, url=self.public_url):
                    break
                time.sleep(interval)
                warmup_time += interval
                pbar.update(interval)

        logging.info(f"Service is live. Warmup time: {warmup_time} seconds.")
        return warmup_time

    def run_benchmark(self) -> Dict[str, Any]:
        """
        Run the benchmark.

        :return: Dictionary of benchmark results.
        """
        logging.info("Running benchmarks.")
        benchmarks = benchmark_model(self.backend_to_test, port=self.port, url=self.public_url)
        return benchmarks.to_dict("records")

    def cleanup(self) -> None:
        """
        Clean up after benchmarking.
        """
        logging.info("Cleaning up the deployment.")
        if self.undeploy:
            delete_deployment(self.user.namespace, self.deployment_name)

    def benchmark_deployment(self) -> Dict[str, Any]:
        """
        Perform the entire benchmarking process.

        :return: A dictionary containing benchmarking results and metadata.
        """
        self.deploy()
        warmup_time = self.wait_for_service()

        if warmup_time >= self.live_service_cutout:
            logging.warning("Service did not become live within the cutoff time.")
            self.cleanup()

            return {
                "benchmarks": {},
                "warmup_time": warmup_time,
                "template": self.deployment_template,
                "error": "Cut off start time limit reached."
            }
    
        try:
            benchmarks = self.run_benchmark()
        except Exception as e:

            return {
                    "benchmarks": {},
                    "warmup_time": warmup_time,
                    "template": self.deployment_template,
                    "error": f"Failed to benchmark. {e}"
                }

        return {
            "benchmarks": benchmarks,
            "warmup_time": warmup_time,
            "template": self.deployment_template,
            "deployment": self.deployment_name
        }


In [23]:

#benchmark = KubernetesBenchmark(template, undeploy=False)
#result = benchmark.benchmark_deployment()


In [21]:
base_model = DeploymentCard(
    id="4",
    description="Minimal Deployment for llama.cpp",
    params={
        "replicas": 1,
        "num_cpus": 4,
        "ram_memory": "12Gi",
        "pvc_storage_request": "24Gi",
    },
)

num_cpus = [4,8,12,16]
deployments = [
    DeploymentCard(
    id="4",
    description="Minimal Deployment for llama.cpp",
    params={
        "replicas": 2,
        "num_cpus": n,
        "ram_memory": "12Gi",
        "pvc_storage_request": "24Gi",
    }
    )
    for n in num_cpus
]

deployment_name = "benchmarkmodel"

templates = [
    ModelDeploymentTemplateCard(
        id="1",
        name="",
        description="",
        model_card=MODEL,
        deployment_card=deployment,
        params={
            "deployment_name": deployment_name
        }
    )
    for deployment in deployments
]

In [22]:
results = []

for n, template in zip(num_cpus[2:], templates[2:]):
    print("N: ", n)
    benchmark = KubernetesBenchmark(template, undeploy=True)
    result = benchmark.benchmark_deployment()
    results.append(result)


N:  12


2024-01-15 17:19:19,920 - INFO - Deploying the model.


Delete request successful.


2024-01-15 17:19:25,485 - INFO - Model deployed.
2024-01-15 17:19:25,486 - INFO - Waiting for the live service.


Request successful.


  0%|          | 0/600 [00:00<?, ?it/s]
2024-01-15 17:19:25,775 - INFO - Service is live. Warmup time: 0 seconds.
2024-01-15 17:19:25,776 - INFO - Running benchmarks.
 20%|██        | 5/25 [00:24<01:37,  4.86s/it]


N:  16


2024-01-15 17:19:50,380 - INFO - Deploying the model.


Delete request successful.


2024-01-15 17:19:56,227 - INFO - Model deployed.
2024-01-15 17:19:56,231 - INFO - Waiting for the live service.


Request successful.


100%|██████████| 600/600 [10:02<00:00,  1.00s/it]
2024-01-15 17:29:59,021 - INFO - Service is live. Warmup time: 600 seconds.
2024-01-15 17:29:59,022 - INFO - Cleaning up the deployment.


Delete request successful.


In [24]:
results

[{'benchmarks': [{'type': 'serial',
    'average_response_time': 15.530141487121583,
    'average_tokens_returned': 57.96,
    'total_tokens_returned': 1449,
    'wall_time': 388.30193424224854,
    'tps': 3.7316321970624475}],
  'warmup_time': 360,
  'template': ModelDeploymentTemplateCard(id='1', name='', description='', model_card=ModelCard(id='3', architecture=ArchitectureCard(id='cd993f75-2601-4633-b510-60dbd954bfb2', deployment_yaml='llama_cpp_python.yaml', name='Llama.cpp (python)', description='The python llama.cpp engine', tags={'cpu': True, 'gpu': True}, health_endpoint='v1/models'), params={'model_path': '/model/llama-2-7b-chat.Q2_K.gguf', 'hf_repo_id': 'TheBloke/Llama-2-7B-Chat-GGUF', 'hf_filename': 'llama-2-7b-chat.Q2_K.gguf', 'volume_name': 'awesome-model-storage', 'pvc_name': 'awesome-model-pvc'}), benchmarks=None, deployment_card=DeploymentCard(id='4', params={'replicas': 1, 'num_cpus': 4, 'ram_memory': '12Gi', 'pvc_storage_request': '24Gi'}), viable_deployment_cards=[]