## Calculate end-to-end latency of different Vertex AI foundation models

In [21]:
#Import supporting libraries
import vertexai
from vertexai.language_models import TextGenerationModel
import time
import timeit
import numpy

In [20]:
#Point the Vertex AI SDK at the GCP project and region used for the tests
vertexai.init(
    project="jsb-alto",
    location="us-central1",
)

#Generation settings; query_llm forwards these as keyword arguments to model.predict()
parameters = dict(
    candidate_count=1,
    max_output_tokens=50,
    temperature=0.2,
    top_p=0.5,
    top_k=40,
)

In [32]:
#Send a text prompt to a Vertex AI text model and hand back the generated text
def query_llm(model_name,input):
    """Run one prediction against ``model_name`` and return the response text.

    Args:
        model_name: Identifier passed to ``TextGenerationModel.from_pretrained``.
        input: Prompt passed as the first positional argument to ``predict``.

    Returns:
        The ``text`` attribute of the prediction response.

    The model handle is created inside this function on every call, so any
    timing of ``query_llm`` includes ``from_pretrained`` as well as ``predict``.
    Generation settings come from the module-level ``parameters`` dict.
    """
    llm = TextGenerationModel.from_pretrained(model_name)
    prediction = llm.predict(input, **parameters)
    return prediction.text
    

In [33]:
#Function to accept the text input and calculate average response time 
def test_model_latency(model_name, sample_input,iterations=10):
    #model = TextGenerationModel.from_pretrained(model_name)

    # Time the execution of the query_llm function
    timing = timeit.timeit(lambda: query_llm(model_name,sample_input),
                       setup="from __main__ import query_llm",
                       number=iterations)

    # Print the average execution time
    print(f"Average execution time for {model_name}: {timing / iterations:.6f} seconds")

### Run a latency test to calculate average response time 

In [24]:
#Define the prompt in this cell: at this point in the notebook `input` has not
#been assigned yet, so on a fresh kernel it would still be Python's builtin
#input() function rather than a prompt string. Same prompt text as the
#percentile test further down.
sample_prompt = "What is the capital of USA?"
test_model_latency("text-bison@002",sample_prompt,iterations=10)

Average execution time for text-bison@002: 0.358944 seconds


## Calculate response percentiles

In [34]:
#Function to accept the text input and calculate response time percentiles
def test_model_latency(model_name, sample_input,iterations=10):
    model = TextGenerationModel.from_pretrained(model_name)

    # Create the Timer object
    #timer = timeit.Timer(f"query_llm('{sample_input}')",setup="from __main__ import query_llm",)
    
    # Create the Timer object
    timer = timeit.Timer(f"query_llm('{model_name}','{sample_input}')",setup="from __main__ import query_llm",)

    # Run the timer multiple times
    times = timer.repeat(repeat=iterations, number=1)

    # Calculate percentile or other statistics
    import numpy as np
    p50 = np.percentile(times, 50)
    p90 = np.percentile(times, 90)
    p99 = np.percentile(times, 99)
    
    
    print("Model: ",model_name)
    print("Temperature: ",parameters)
    print("p50:", p50)
    print("p90:", p90)
    print("p99:", p99)

    # Print the average execution time
    #print(f"Average execution time for {model_name}: {timing / iterations:.6f} seconds")

In [35]:
#Provide input for the model. Replace with your actual input
#NOTE(review): this rebinds the name of Python's builtin input() for the rest of
#the kernel session. The test_model_latency calls in this notebook pass this
#variable as the prompt, so it must be assigned before they run.
input = "What is the capital of USA?"


In [36]:
#Run the latency test for text-bison@002 with 10 iterations
test_model_latency(model_name="text-bison@002", sample_input=input, iterations=10)

Model:  text-bison@002
Temperature:  {'candidate_count': 1, 'max_output_tokens': 50, 'temperature': 0.2, 'top_p': 0.5, 'top_k': 40}
p50: 0.4101508385501802
p90: 0.4769554691389203
p99: 0.525991853736341
