# Endpoint Feature Test

### Create OpenAI client

In [None]:
import json

import numpy as np
from openai import OpenAI

client = OpenAI(
    api_key="ocigenerativeai", base_url="http://127.0.0.1:8088/v1/", max_retries=0
)
models = client.models.list()

# Test chat completions
for model in models:
    print(model.id)

xai.grok-4-fast-non-reasoning
xai.grok-4-fast-reasoning
openai.gpt-oss-120b
openai.gpt-oss-20b
meta.llama-guard-4-12b
xai.grok-4
cohere.command-latest
cohere.command-plus-latest
cohere.embed-v4.0
xai.grok-3-fast
xai.grok-3
xai.grok-3-mini-fast
xai.grok-3-mini
cohere.rerank-v3.5
cohere.embed-english-image-v3.0
meta.llama-4-maverick-17b-128e-instruct-fp8
cohere.embed-english-light-image-v3.0
content-moderator
cohere.embed-multilingual-light-image-v3.0
meta.llama-4-scout-17b-16e-instruct
cohere.embed-multilingual-image-v3.0
cohere.command-a-03-2025
meta.llama-3.3-70b-instruct
protectai.deberta-v3-base-prompt-injection-v2
urchade.gliner-large-v2-1
cohere.rerank-multilingual-v3.1
cohere.command-r-08-2024
meta.llama-3.2-11b-vision-instruct
meta.llama-3.2-90b-vision-instruct
cohere.command-r-plus-08-2024
meta.llama-3.1-70b-instruct
meta.llama-3.1-405b-instruct
cohere.command-r-16k
cohere.command-r-plus
cohere.rerank-english-v3.1
meta.llama-3-70b-instruct
cohere.command
cohere.command-light
co

### Test chat completions with non-streaming response

In [10]:
test_models = [
    "xai.grok-4-fast-non-reasoning",
    # "openai.gpt-4o",
    # "cohere.command-latest",
    "cohere.command-a-03-2025",
    # "meta.llama-latest"
    "xai.grok-3-mini",
    "meta.llama-4-maverick-17b-128e-instruct-fp8",
    "openai.gpt-oss-20b",
]

In [11]:
# Test chat completions with non-streaming response

for model_name in test_models:
    completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": "Hello! 你好！"}],
        max_tokens=1024,
    )
    print(model_name, " : ", completion.choices[0].message.content)
    print(
        "\nprompt_tokens:",
        completion.usage.prompt_tokens,
        "\ncompletion_tokens:",
        completion.usage.completion_tokens,
        "\ntotal_tokens:",
        completion.usage.total_tokens,
    )
    if completion.usage.completion_tokens_details:
        print(
            "reasoning_tokens:",
            completion.usage.completion_tokens_details.reasoning_tokens,
        )

xai.grok-4-fast-non-reasoning  :  Hello! 你好！ How can I help you today?

prompt_tokens: 35 
completion_tokens: 12 
total_tokens: 48
reasoning_tokens: 1
cohere.command-a-03-2025  :  Hello! 你好！ It's great to meet you. How can I assist you today? Whether it's answering questions, helping with a task, or just having a chat, I'm here for you.

prompt_tokens: 6 
completion_tokens: 43 
total_tokens: 49
xai.grok-3-mini  :  Hello! 你好！  

It's great to hear from you. I'm Grok, an AI created by xAI to be helpful and truthful. What can I assist you with today? 😊

prompt_tokens: 12 
completion_tokens: 39 
total_tokens: 348
reasoning_tokens: 297
meta.llama-4-maverick-17b-128e-instruct-fp8  :  Hello! 你好! It's nice to meet you! Is there something I can help you with or would you like to chat?

prompt_tokens: 16 
completion_tokens: 27 
total_tokens: 43
openai.gpt-oss-20b  :  Hello! 你好！🌟  
How can I help you today?

prompt_tokens: 72 
completion_tokens: 54 
total_tokens: 126


### Test chat completions with streaming response

In [12]:
for model in test_models:
    print("\n", model, " : ", end="")
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "hello! 你好！"}],
        max_tokens=1024,
        stream=True,  # this time, we set stream=True
    )
    for chunk in response:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")
        if chunk.usage:
            print(
                "\nprompt_tokens:",
                chunk.usage.prompt_tokens,
                "\ncompletion_tokens:",
                chunk.usage.completion_tokens,
                "\ntotal_tokens:",
                chunk.usage.total_tokens,
            )
            if chunk.usage.completion_tokens_details:
                print(
                    "reasoning_tokens:",
                    chunk.usage.completion_tokens_details.reasoning_tokens,
                )


 xai.grok-4-fast-non-reasoning  : Hello! 你好！ Nice to meet you—I'm Sonoma, built by Oak AI. How can I help today?
prompt_tokens: 35 
completion_tokens: 24 
total_tokens: 60
reasoning_tokens: 1

 cohere.command-a-03-2025  : Hello! 你好！ How can I assist you today? Whether it's answering questions, helping with a task, or just chatting, I'm here for you.
prompt_tokens: 6 
completion_tokens: 34 
total_tokens: 40

 xai.grok-3-mini  : Hello! 你好！

I'm Grok, an AI created by xAI to be helpful and truthful. It's great to chat—how can I assist you today? 😊
prompt_tokens: 12 
completion_tokens: 35 
total_tokens: 355
reasoning_tokens: 308

 meta.llama-4-maverick-17b-128e-instruct-fp8  : Hello! 你好！It's nice to meet you! Is there something I can help you with or would you like to chat? (你好！很高兴认识你！你需要我帮助什么，还是想聊天？)
prompt_tokens: 16 
completion_tokens: 46 
total_tokens: 62

 openai.gpt-oss-20b  : Hello! 你好！🌟 How can I help you today?  
（如果你更喜欢用中文交流，请直接告诉我！）
prompt_tokens: 72 
completion_tokens: 126 
to

### Test multi-modal with image input response

In [70]:
test_models = ["xai.grok-4"]


def get_image():
    import base64
    import mimetypes

    image_path = "./test/image.jpg"
    mime_type, _ = mimetypes.guess_type(image_path)
    with open(image_path, "rb") as image_file:
        base64_str = base64.b64encode(image_file.read()).decode("utf-8")
    url = f"data:{mime_type};base64,{base64_str}"
    print(url[:100])
    return url


messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "describe this image."},
            {"type": "image_url", "image_url": {"url": get_image()}},
        ],
    }
]

for model in test_models:
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=1024,
    )
    print(
        model,
        " : ",
        completion.choices[0].message.content,
        "\n\npromptTokens: ",
        completion.usage.prompt_tokens,
        "\ncompletionTokens: ",
        completion.usage.completion_tokens,
        "\ntotalTokens: ",
        completion.usage.total_tokens,
    )

data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxMTEhUSExIVFRUXFxUVFhUWFRUVFRUVFRUWFhUVF
xai.grok-4  :  The image depicts an adorable white kitten, likely a Munchkin breed (known for their short legs), standing on a paved path outdoors. The cat has fluffy, pure white fur, large blue eyes, and a slightly rounded face with a curious or surprised expression as it gazes directly at the camera. Its body is compact and plump, with stubby legs that give it a somewhat dwarfed appearance. The background features vibrant green grass with a few small yellow flowers, suggesting a sunny, natural setting like a garden or park. The overall vibe is cute and endearing, with soft lighting highlighting the kitten's fur. 

promptTokens:  945 
completionTokens:  121 
totalTokens:  1249


### Test tool calls

In [77]:
def get_weather(latitude, longitude):
    # import requests
    # response = requests.get(f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m")
    # data = response.json()
    # return data['current']['temperature_2m']
    return 25.0


tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current temperature for provided coordinates in celsius.",
            "parameters": {
                "type": "object",
                "properties": {
                    "latitude": {"type": "number"},
                    "longitude": {"type": "number"},
                },
                "required": ["latitude", "longitude"],
                "additionalProperties": False,
            },
            "strict": True,
        },
    }
]

# model = "cohere.command-a-03-2025"
model = "xai.grok-3-mini"
# model = "xai.grok-4"
# model = "meta.llama-4-maverick-17b-128e-instruct-fp8"
# model = "meta.llama-3.3-70b-instruct"

# model = "openai.gpt-oss-120b"

messages = [
    {
        "role": "user",
        "content": "Use tool `get_weather`to answer: What is the weather like in New York today? the coordinates are 40.7128° N, 74.0060° W.",
    }
]

#### Response without streaming

In [78]:
#### Cohere and Llama model without streaming

completion = client.chat.completions.create(
    model=model, messages=messages, tools=tools, max_tokens=1024, stream=False
)
print(f"Function call output for model -- {model}")
print(completion.choices[0].message.tool_calls, "\n")
print("content text:", completion.choices[0].message.content)
print(
    "\nprompt_tokens:",
    completion.usage.prompt_tokens,
    "\ncompletion_tokens:",
    completion.usage.completion_tokens,
    "\ntotal_tokens:",
    completion.usage.total_tokens,
)
if completion.usage.completion_tokens_details:
    print(
        "reasoning_tokens:", completion.usage.completion_tokens_details.reasoning_tokens
    )

Function call output for model -- xai.grok-3-mini
[ChatCompletionMessageFunctionToolCall(id='call_37360171', function=Function(arguments='{"latitude":40.7128,"longitude":-74.006}', name='get_weather'), type='function')] 

content text: 

prompt_tokens: 324 
completion_tokens: 37 
total_tokens: 735
reasoning_tokens: 374


#### Response with streaming

In [80]:
#### Cohere and Llama model with streaming

response = client.chat.completions.create(
    model=model,
    messages=messages,
    max_tokens=1024,
    tools=tools,
    stream=True,  # this time, we set stream=True
)
print(model, ":")
for chunk in response:
    if not chunk.choices[0].delta.tool_calls:
        print(chunk.choices[0].delta.content, end="")
    else:
        if model.startswith("cohere"):
            print("\n")
        print(chunk.choices[0].delta.tool_calls)

BadRequestError: Error code: 400 - {'detail': 'Tool call with streaming is currently not supported by xai.grok-3-mini'}

In [None]:
#### Add tool call messages to the conversation
new_message = completion.choices[0].message
tool_calls = new_message.tool_calls
messages.append(new_message)

# use tool to get weather

for tool_call in tool_calls:
    args = json.loads(tool_call.function.arguments)
    result = get_weather(args["latitude"], args["longitude"])
    print("function result: ", result)
    # append result message
    messages.append(
        {"role": "tool", "tool_call_id": tool_call.id, "content": str(result)}
    )
print("\n")
for index, each in enumerate(messages):
    print(index, each)

function result:  25.0


0 {'role': 'user', 'content': 'Use tool `get_weather`to answer: What is the weather like in New York today? the coordinates are 40.7128° N, 74.0060° W.'}
1 ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='call_37360171', function=Function(arguments='{"latitude":40.7128,"longitude":-74.006}', name='get_weather'), type='function')])
2 {'role': 'tool', 'tool_call_id': 'call_37360171', 'content': '25.0'}


In [82]:
# Generate answer based on tool results without stream

completion_2 = client.chat.completions.create(
    model=model, messages=messages, max_tokens=1024, stream=False
)
print(f"\nOutput for model -- {model}")
print(completion_2.choices[0].message.content)


Output for model -- xai.grok-3-mini
The current weather in New York (based on coordinates 40.7128° N, 74.0060° W) shows a temperature of approximately 25.0°C. For more detailed weather information, please provide additional clarification if needed.


In [83]:
# Generate answer based on tool results with stream

completion_2 = client.chat.completions.create(
    model=model, messages=messages, max_tokens=1024, stream=True
)
print(f"\nOutput for model -- {model}")
for chunk in completion_2:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")


Output for model -- xai.grok-3-mini
The weather in New York (coordinates: 40.7128° N, 74.0060° W) today has a temperature of approximately 25.0°C. For more detailed weather information, you may need to clarify or use additional tools.

### Test embedding

In [64]:
embd_model = [
    "cohere.embed-v4.0",
    #'cohere.embed-english-light-v3.0',
    #'cohere.embed-english-v3.0',
    #'cohere.embed-multilingual-light-v3.0',
    #'cohere.embed-multilingual-v3.0'
]
input = [
    "an adorable white kitten with fluffy fur, standing on a paved path outdoors.",
    "一只毛茸茸的可爱白色小猫，站在户外铺好的小路上。",
]
image_input = get_image()
output = []

print("model:", model)
embeddings = client.embeddings.create(input=input, model=model)
output.extend(embeddings.data)
print(embeddings.usage)

embeddings = client.embeddings.create(input=image_input, model=model)
output.extend(embeddings.data)
print(embeddings.usage)

inputs = input.copy()
inputs.append(image_input)
for index in range(len(output)):
    print(
        f"{index}: {inputs[index][:60]} \n {str(output[index].embedding)[:30]}...{str(output[index].embedding)[-30:]}"
    )

data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxMTEhUSExIVFRUXFxUVFhUWFRUVFRUVFRUWFhUVF
model: cohere.embed-v4.0
Usage(prompt_tokens=32, total_tokens=32)
Usage(prompt_tokens=0, total_tokens=0)
0: an adorable white kitten with fluffy fur, standing on a pave 
 [0.049316406, -0.028686523, -0..., -0.0018692017, -0.011962891]
1: 一只毛茸茸的可爱白色小猫，站在户外铺好的小路上。 
 [0.05493164, -0.025146484, -0....6, -0.026733398, -0.009338379]
2: data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGB 
 [0.005340576, -0.013305664, -0..., -0.057617188, -0.0022277832]


In [None]:
# Calculate Cosine Similarity


def calculate_cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    cosine_similarity = dot_product / (norm_vector1 * norm_vector2)
    return cosine_similarity


for pair in [(0, 1), (0, 2), (1, 2)]:
    similarity = calculate_cosine_similarity(
        output[pair[0]].embedding, output[pair[1]].embedding
    )
    print(f"Cosine Similarity between {pair[0]} and {pair[1]}: {similarity}")

Cosine Similarity between 0 and 1: 0.8408057652948703
Cosine Similarity between 0 and 2: 0.5641697354487893
Cosine Similarity between 1 and 2: 0.5293368303400612
