### Multi-Modality Models

In [1]:
import os

from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient


In [4]:
# Called before the os.environ lookups below; presumably this is what makes
# TOGETHER_URL / TOGETHER_API_KEY available from a local .env file.
load_dotenv()

# Direct indexing (rather than .get) raises KeyError right here if either
# variable is unset, instead of failing later inside the client.
LLAMA_STACK_API_TOGETHER_URL = os.environ["TOGETHER_URL"]
TOGETHER_API_KEY = os.environ["TOGETHER_API_KEY"]
# NOTE(review): not referenced in the visible cells; the request in
# process_image uses MODEL_NAME instead - confirm whether this is still needed.
LLAMA32_11B_VISION_INSTRUCT = "Llama3.2-11B-Vision-Instruct"

# Shared client used by the cells below; the Together API key is handed over
# through provider_data under the "together_api_key" key.
client = LlamaStackClient(
    base_url=LLAMA_STACK_API_TOGETHER_URL,
    provider_data={
        "together_api_key": TOGETHER_API_KEY
    }
)

In [5]:
# Model identifier passed as `model_id` to the chat-completion call in process_image.
MODEL_NAME = "meta-llama/Llama-3.2-11B-Vision-Instruct"

In [8]:
import base64
import mimetypes
from termcolor import cprint
from llama_stack_client.lib.inference.event_logger import EventLogger


def encode_image_to_data_url(file_path: str) -> str:
    """
    Build a base64 ``data:`` URL from the contents of a file.

    The MIME type is guessed from the file name before the file is opened,
    so a name with no recognisable type fails without reading anything.

    Args:
        file_path (str): Path to the image file

    Returns:
        str: Data URL of the form ``data:<mime type>;base64,<payload>``

    Raises:
        ValueError: If no MIME type can be guessed for ``file_path``
    """
    guessed_type = mimetypes.guess_type(file_path)[0]
    if guessed_type is None:
        raise ValueError("Could not determine MIME type of the file")

    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()

    # b64encode returns bytes; decode so the payload can be embedded in a str.
    payload = base64.b64encode(raw_bytes).decode("utf-8")
    return "data:" + guessed_type + ";base64," + payload


async def process_image(client, image_path: str, stream: bool = True):
    """
    Send one image plus a fixed text prompt to chat completion and print the reply.

    The image is embedded in the request as a data URL built by
    ``encode_image_to_data_url``; the model is taken from ``MODEL_NAME``.

    Args:
        client (LlamaStackClient): Initialized client
        image_path (str): Path to image file
        stream (bool): Forwarded as ``stream`` to the request. When false the
            response object is printed as-is; otherwise the response is
            iterated through ``EventLogger`` and each event is printed.
    """
    # Two content parts in a single user turn: the image first, then the instruction.
    image_part = {
        "type": "image",
        "image": {"url": {"uri": encode_image_to_data_url(image_path)}},
    }
    prompt_part = {
        "type": "text",
        "text": "describe the image as specific as possible",
    }
    messages = [{"role": "user", "content": [image_part, prompt_part]}]

    cprint("User> Sending image for analysis...", "green")
    response = client.inference.chat_completion(
        messages=messages,
        model_id=MODEL_NAME,
        stream=stream,
    )

    if stream:
        async for event in EventLogger().log(response):
            event.print()
        return

    cprint(f"> Response: {response}", "cyan")

In [9]:
import asyncio

# Top-level `await`: works in a notebook cell, but is a syntax error in a plain
# .py script (there it would need something like asyncio.run(...)).
# stream=False makes process_image print the whole response object in one go.
await process_image(client, "./llama-stock.png", stream=False)

[32mUser> Sending image for analysis...[0m
[36m> Response: ChatCompletionResponse(completion_message=CompletionMessage(content='The image depicts a serene scene of four alpacas standing in a grassy field, with a wooden fence in the background. The alpacas are positioned in a natural and relaxed manner, with one standing in front of the others, facing the camera. The two alpacas in the foreground are brown, while the two in the background are tan. The alpaca in the foreground is the largest, and it appears to be a male, as it has a larger body and a more prominent set of testicles. The alpaca in the middle is a female, as it has a smaller body and no visible testicles. The two alpacas in the background are also females, as they have smaller bodies and no visible testicles.\n\nThe alpacas are standing on a grassy field, with a wooden fence behind them. The fence is made of horizontal logs and has a rustic appearance. In the background, there is a hill or mountain, which adds depth and