# Load Tester
Similar to the one in PowerShell, this one executes calls against Azure OpenAI endpoints to test the performance of the API.

In [2]:
from multiprocessing import Pool
import os
import json
import time
import random
import requests
from deployments import *
import collections as collection

# Load Deployments
> NOTE: You may want to save the deployments to a file so they don't have to be fetched again every time the notebook runs.

In [5]:
# Timestamp the run, then fetch every deployment in the subscription
print(time.strftime("%Y-%m-%d %H:%M:%S"))
deployments = get_deployments()

# Remember how many deployments were returned before filtering
nDeps = len(deployments)

# Keep only the deployments whose skuName is not "GlobalBatch"
deployments = list(filter(lambda dep: dep['skuName'] != 'GlobalBatch', deployments))

print("GlobalBatch Deployments Deleted", nDeps - len(deployments))

# Optional: persist the filtered deployments to local/deploymentsFull.json
# with open('local/deploymentsFull.json', 'w') as f:
#     json.dump(deployments, f)


2024-09-21 10:43:45
Running aoai-dai-dev-ncus
Running aoai-dai-dev-fc
Running aoai-dai-dev-ce
Running aoai-dai-dev-gwc
Running aoai-dai-dev-use
Running aoai-dai-dev-ae
Running aoai-dai-dev-sn
Running aoai-dai-dev-scus
Running aoai-dai-dev-sc
Running aoai-dai-dev-usw3
Running aoai-dai-dev-we
Running aoai-dai-dev-ne
Running aoai-dai-dev-use2
Running aoai-dai-dev-uks
GlobalBatch Deployments Deleted 0


# Function

In [3]:
def run_chat(deployments,data,timeout=60):
    """Send one chat-completions request to a randomly chosen deployment.

    Args:
        deployments: Non-empty sequence of dicts. Each dict must provide the
            "key", "endpoint" and "deploymentName" entries read below.
        data: JSON-serializable request body, posted as-is.
        timeout: Value passed to ``requests.post`` as its ``timeout``.

    Returns:
        A dict with the keys "status_code", "response", "timeSpan", "region",
        "endpoint", "deploymentName", "total_tokens", "headers" and
        "finish_reason". Request and parsing failures are not raised; they are
        reported with finish_reason "Exception" and the error text in
        "response".
    """
    deployment = random.choice(deployments)
    headers = {"Content-Type": "application/json", "api-key": deployment["key"]}
    url = f"{deployment['endpoint']}openai/deployments/{deployment['deploymentName']}/chat/completions?api-version=2024-06-01"

    status_code = None
    timeSpan = 0
    detail = None
    region = None
    total_tokens = 0
    resp_headers = {}
    finish_reason = None

    startTime = time.time()
    try:
        response = requests.post(url=url, json=data, headers=headers, timeout=timeout)
        timeSpan = time.time() - startTime

        status_code = response.status_code
        # Capture the headers before parsing the body so the rate-limit
        # headers survive even when the body is not valid JSON.
        resp_headers = response.headers
        # Parse the body once instead of once per field.
        body = response.json()

        if status_code == 200:
            # .get(): a missing region header must not turn a successful
            # call into an "Exception" result.
            region = response.headers.get("x-ms-region")
            # Tolerate a missing or empty "choices" list.
            first_choice = (body.get('choices') or [{}])[0]
            finish_reason = first_choice.get('finish_reason', None)
            if finish_reason == "content_filter":
                detail = first_choice.get('content_filter_results', {})
            else:
                detail = first_choice.get('message', {}).get('content', None)
            total_tokens = body.get('usage', {}).get('total_tokens', 0)
        else:
            detail = body
    except Exception as e:
        finish_reason = "Exception"
        if status_code is None:
            # No response was received (timeout, connection error, ...):
            # report how long the attempt took instead of leaving 0.
            timeSpan = time.time() - startTime
        if hasattr(e, 'status_code'):
            status_code = e.status_code
        detail = str(e)
        # requests exceptions expose a `response` attribute that is None when
        # no reply was received, so hasattr() alone is not a sufficient check.
        err_response = getattr(e, 'response', None)
        if err_response is not None:
            try:
                detail = err_response.json()['error']['message']
            except (AttributeError, KeyError, TypeError, ValueError):
                # Error body is not in the expected shape; keep str(e).
                pass

    return {
        "status_code": status_code,
        "response": detail,
        "timeSpan": timeSpan,
        "region": region,
        "endpoint": deployment['endpoint'],
        "deploymentName": deployment['deploymentName'],
        "total_tokens": total_tokens,
        "headers": resp_headers,
        "finish_reason": finish_reason,
    }


# Execute Tester

In [8]:
# Pick the subset of deployments to load-test.
# Alternative presets are kept below; swap one in for the active filter as needed.

# enabled=([d for d in deployments if d['model'] == 'gpt-35-turbo' and d['version'] == "0125" and d['location'] == "northcentralus"])
# enabled=([d for d in deployments if d['model'] == 'gpt-35-turbo' and d['version'] == "0613" and d['location'] == "northcentralus"])
# enabled=([d for d in deployments if d['model'] == 'gpt-35-turbo' and d['version'] == "0613"])
# enabled=([d for d in deployments if d['deploymentName'] == 'gpt-4o-hourly'])
# enabled=([d for d in deployments if d['model'] == 'gpt-4o'])
# enabled=([d for d in deployments if d['deploymentName'] == 'gpt-4o'])
# enabled=([d for d in deployments if d['model'] == 'gpt-4o-mini'])
# enabled=([d for d in deployments if d['deploymentName'] == 'gpt-35-turbo'])

# Active filter: Standard-SKU gpt-4o deployments located in eastus2
enabled = [
    candidate
    for candidate in deployments
    if candidate['model'] == 'gpt-4o'
    and candidate['location'] == "eastus2"
    and candidate['skuName'] == "Standard"
]

print("Deployments to test:", len(enabled))

# Stop here if the filter matched nothing
assert len(enabled) >= 1

Deployments to test: 4


In [12]:
# Number of pool workers (multiprocessing.Pool starts processes, not threads)
threadCnt=3
# Total number of requests to send
testCnt=10

# JSON file that holds the request body
fileName='body-joke.json'

# Load the request body; JSON text is read explicitly as UTF-8 rather than
# with the platform's default encoding
with open(fileName, encoding='utf-8') as f:
    data = json.load(f)

# Run the tests. The pool is used as a context manager so its worker
# processes are always cleaned up; it was previously never closed or
# terminated, which the multiprocessing docs warn against.
with Pool(threadCnt) as p:
    start = time.time()
    print("Started:",time.strftime("%Y-%m-%d %H:%M:%S"))
    results = [p.apply_async(run_chat,(enabled,data)) for _ in range(testCnt)]
    # Collect inside the `with` block: leaving it terminates the pool.
    output = [pending.get() for pending in results]
    end = time.time()
print("Time taken (sec):",end-start)


Started: 2024-09-21 10:46:41
Time taken (sec): 2.9282116889953613


# View the Results

In [11]:
# Show one raw result as a sanity check
print(output[0])

# Tally the results by HTTP status code
ct = collection.Counter(result['status_code'] for result in output)
output_200, output_429 = ct[200], ct[429]
print(ct)

# Tally the results by finish reason
cf = collection.Counter(result['finish_reason'] for result in output)
print(cf)

# Wall-clock duration of the whole run, in milliseconds
total_time = (end - start) * 1000
print("Time (ms):", total_time)
print("Time (sec):", (end - start))

# Share of successful (200) and throttled (429) calls
print("% of 200:", (output_200 / len(output)) * 100)
print("% of 429:", (output_429 / len(output)) * 100)

print("Threads:", threadCnt)
print("Messages:", testCnt)

# Only successful calls contribute to the token total
print("Total Tokens:", sum(result['total_tokens'] for result in output if result['status_code'] == 200))

# Per-request durations, then their mean
time_spans = [result['timeSpan'] for result in output]
average_time_span = sum(time_spans) / len(time_spans)

print("Average TimeSpan (sec):", average_time_span)

{'status_code': 200, 'response': 'Why did the scarecrow become a successful motivational speaker?\n\nBecause he was outstanding in his field and always knew how to lift people’s spirits! 🌾✨😄', 'timeSpan': 0.8125629425048828, 'region': 'East US 2', 'endpoint': 'https://aoai-dai-dev-use2.openai.azure.com/', 'deploymentName': 'gpt-4o', 'total_tokens': 47, 'headers': {'Content-Length': '1025', 'Content-Type': 'application/json', 'x-ms-region': 'East US 2', 'apim-request-id': '96af233d-2679-4706-9ee9-37da5653be02', 'x-ratelimit-remaining-requests': '19', 'x-accel-buffering': 'no', 'x-ms-rai-invoked': 'true', 'x-request-id': '1a0eafb5-b9d0-4ff6-a40f-80df26d3b554', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'azureml-model-session': 'd070-20240813235024', 'x-content-type-options': 'nosniff', 'x-envoy-upstream-service-time': '469', 'x-ms-client-request-id': '96af233d-2679-4706-9ee9-37da5653be02', 'x-ratelimit-remaining-tokens': '19568', 'Date': 'Sat, 21 Sep 202

In [None]:
# Write one tab-separated line per result to output.txt:
# body file, worker count, test count, then the per-request fields
with open('output.txt', 'w') as f:
    for each_result in output:
        f.write('\t'.join(map(str, (
            fileName,
            threadCnt,
            testCnt,
            each_result['status_code'],
            each_result['timeSpan'],
            each_result['region'],
            each_result['endpoint'],
            each_result['deploymentName'],
        ))) + '\n')

In [None]:
# Print the first 10 results: full details for successful (200) calls,
# status code and error payload for everything else

for item in output[:10]:
    print("endpoint:", item['endpoint'])
    print("deploymentName:", item['deploymentName'])
    if item['status_code'] != 200:
        print(item['status_code'])
        print(item['response'])
    else:
        print("response:", item['response'])
        print("total_tokens:", item['total_tokens'])
        print("timeSpan:", item['timeSpan'])
        print("region:", item['region'])
    # Rate-limit headers are shown whenever any response headers were captured
    if item['headers']:
        for header_name in ('x-ratelimit-remaining-requests', 'x-ratelimit-remaining-tokens'):
            print(header_name, item['headers'].get(header_name, None))
    print("--------------------")