In [1]:
import os
import base64
import io
from PIL import Image
import dotenv
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential


In [2]:
# Azure OpenAI configuration
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_version = os.getenv("OPENAI_API_VERSION", "2023-07-01-preview")
model = os.getenv("LLM_MODEL", "gpt-35-turbo")

print(f"Azure Endpoint: {azure_endpoint}")
print(f"Azure API Key available: {'Yes' if azure_api_key else 'No'}")
print(f"Azure API Version: {azure_api_version}")
print(f"Model deployment name: {model}")

# Initialize Azure OpenAI client if enabled

# Create endpoint URL with deployment name
endpoint_url = f"{azure_endpoint}/openai/deployments/{model}"
print(f"Endpoint URL: {endpoint_url}")
client = ChatCompletionsClient(
    endpoint=endpoint_url,
    credential=AzureKeyCredential(azure_api_key),
    api_version=azure_api_version
)

Azure Endpoint: https://aoai-sweden-505.openai.azure.com/
Azure API Key available: Yes
Azure API Version: 2025-03-01-preview
Model deployment name: gpt-4o
Endpoint URL: https://aoai-sweden-505.openai.azure.com//openai/deployments/gpt-4o


In [3]:
payload = {
  "messages": [
    {
      "role": "user",
      "content": "I am going to Paris, what should I see?"
    },
    {
      "role": "assistant",
      "content": "Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:\n\n1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city.\n2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa.\n3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.\n\nThese are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world."
    },
    {
      "role": "user",
      "content": "What is so great about #1?"
    }
  ],
  "max_tokens": 2048
}
response = client.complete(payload)


print("Response:", response.choices[0].message.content)
print("Model:", response.model)
print("Usage:")
print("	Prompt tokens:", response.usage.prompt_tokens)
print("	Total tokens:", response.usage.total_tokens)
print("	Completion tokens:", response.usage.completion_tokens)

Response: The Eiffel Tower is a marvel of engineering and design, and it holds a special place in both the history and the cultural identity of Paris. Here are some reasons why the Eiffel Tower is considered so remarkable:

1. **Architectural Innovation**: Designed by Gustave Eiffel and completed in 1889 for the Exposition Universelle (World's Fair), the tower was an engineering wonder of its time. Made from iron lattice, its construction demonstrated groundbreaking techniques that have influenced the field of civil engineering.

2. **Iconic Status**: The Eiffel Tower is an emblem of Paris and a symbol of France around the world. Its unique and elegant design has made it one of the most recognizable structures globally.

3. **Stunning Views**: Visitors can ascend the tower to enjoy panoramic views of Paris from the observation decks. The vista stretches across the city, offering sights of famous landmarks such as the Seine River, Sacré-Cœur, and Notre-Dame Cathedral.

4. **Cultural Sig

In [4]:
import logging
from typing import Optional

import os
import base64
import io
from PIL import Image
import dotenv
from openai import AzureOpenAI
from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential


# Azure OpenAI configuration
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_version = os.getenv("OPENAI_API_VERSION", "2023-07-01-preview")
model = os.getenv("LLM_MODEL", "gpt-35-turbo")

print(f"Azure Endpoint: {azure_endpoint}")
print(f"Azure API Key available: {'Yes' if azure_api_key else 'No'}")
print(f"Azure API Version: {azure_api_version}")
print(f"Model deployment name: {model}")

# Initialize Azure OpenAI client if enabled

# Create endpoint URL with deployment name
endpoint_url = f"{azure_endpoint}/openai/deployments/{model}"
print(f"Endpoint URL: {endpoint_url}")
client = ChatCompletionsClient(
    endpoint=endpoint_url,
    credential=AzureKeyCredential(azure_api_key),
    api_version=azure_api_version
)


def preprocess_image(image_path: str) -> Optional[Image.Image]:
    """
    Validate if the image is suitable for processing.
    
    Args:
        image_path: Path to the image file
        
    Returns:
        bool: True if image is valid, False otherwise
    """
    try:
        with Image.open(image_path) as img:
            if img.mode not in ('RGB', 'L'):
                img = img.convert('RGB')
            
            max_dimension = 2048
            if img.width > max_dimension or img.height > max_dimension:
                ratio = min(max_dimension / img.width, max_dimension / img.height)
                new_size = (int(img.width * ratio), int(img.height * ratio))
                img = img.resize(new_size, Image.Resampling.LANCZOS)
            
            img_byte_arr = io.BytesIO()
            img.save(img_byte_arr, format='PNG', quality=85)
            img_byte_arr.seek(0)
            
            return Image.open(img_byte_arr)

    except Exception as e:
        print(f"Image preprocessing failed for {image_path}: {str(e)}")
        return None



def encode_image(image_path: str) -> Optional[str]:
    """Encode image as base64 with proper validation and preprocessing."""
    try:
        

        processed_img = preprocess_image(image_path)
        if processed_img is None:
            return None

        img_byte_arr = io.BytesIO()
        processed_img.save(img_byte_arr, format='PNG', quality=85)
        img_byte_arr.seek(0)
        base64_encoded = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')

        try:
            base64.b64decode(base64_encoded)
            return base64_encoded
        except Exception as e:
            print(f"Base64 validation failed for {image_path}: {str(e)}")
            return None

    except Exception as e:
        print(f"Image encoding failed for {image_path}: {str(e)}")
        return None
    

prompt = "What is in this image?"

# Image file path
IMAGE_PATH = 'screenshots/run_20250624_102302_c0ee4d92/images/region_91b32eaa-f938-499e-985b-7f73f40e4143.png'


base64_image = encode_image(IMAGE_PATH)

payload = {
    "messages": [
        {
            "role": "system",
            "content": "you are a helpful assistant that can analyze images and provide information about them."
        },
        {
            "role": "user",
            "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
        }
    ],
    "max_tokens": 2048
}
response = client.complete(payload)

print("Response:", response.choices[0].message.content)
print("Model:", response.model)
print("Usage:")
print("	Prompt tokens:", response.usage.prompt_tokens)
print("	Total tokens:", response.usage.total_tokens)
print("	Completion tokens:", response.usage.completion_tokens)

Azure Endpoint: https://aoai-sweden-505.openai.azure.com/
Azure API Key available: Yes
Azure API Version: 2025-03-01-preview
Model deployment name: gpt-4o
Endpoint URL: https://aoai-sweden-505.openai.azure.com//openai/deployments/gpt-4o
Response: The image appears to be a screenshot of a web page or user interface component. It includes text that says "Smart screen monitoring with" at the top, indicating it might be part of a larger context or heading. Below this text, there are two tabs labeled "Screenshots" and "Monitor", with "Screenshots" highlighted in blue, suggesting it is currently selected and active. This could be part of a software tool or system for screen monitoring.
Model: gpt-4o-2024-08-06
Usage:
	Prompt tokens: 287
	Total tokens: 378
	Completion tokens: 91


In [2]:
prompt = "What is in this image?"

# Image file path
IMAGE_PATH = 'screenshots/run_20250624_102302_c0ee4d92/images/region_91b32eaa-f938-499e-985b-7f73f40e4143.png'


base64_image = encode_image(IMAGE_PATH)

payload = {
    "messages": [
        {
            "role": "system",
            "content": "you are a helpful assistant that can analyze images and provide information about them."
        },
        {
            "role": "user",
            "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
        }
    ],
    "max_tokens": 2048
}
response = client.complete(payload)

print("Response:", response.choices[0].message.content)
print("Model:", response.model)
print("Usage:")
print("	Prompt tokens:", response.usage.prompt_tokens)
print("	Total tokens:", response.usage.total_tokens)
print("	Completion tokens:", response.usage.completion_tokens)

NameError: name 'encode_image' is not defined

In [5]:
import os
import base64
import io
from PIL import Image

from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential

from azure.ai.inference.models import SystemMessage, UserMessage, TextContentItem, ImageContentItem, ImageUrl, ImageDetailLevel



# Azure OpenAI configuration
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_version = os.getenv("OPENAI_API_VERSION", "2023-07-01-preview")
model = os.getenv("LLM_MODEL", "gpt-35-turbo")

# print(f"Azure Endpoint: {azure_endpoint}")
# print(f"Azure API Key available: {'Yes' if azure_api_key else 'No'}")
# print(f"Azure API Version: {azure_api_version}")
# print(f"Model deployment name: {model}")

models = [model, "o3"]

# Initialize Azure OpenAI client if enabled

# Create endpoint URL with deployment name
endpoint_url = f"{azure_endpoint}/openai/deployments/{model}"
print(f"Endpoint URL: {endpoint_url}")
aoai_client = ChatCompletionsClient(
    endpoint=endpoint_url,
    credential=AzureKeyCredential(azure_api_key),
    api_version=azure_api_version
)



github_token = os.getenv("GITHUB_MODEL_TOKEN")
github_client = ChatCompletionsClient(
    endpoint="https://models.inference.ai.azure.com",
    credential=AzureKeyCredential(github_token),
    api_version=azure_api_version
)

img_path = "screenshots/run_20250624_102302_c0ee4d92/images/region_91b32eaa-f938-499e-985b-7f73f40e4143.png"
base64_image = encode_image(img_path)

# Create message with image content
messages = [
    SystemMessage("You are an AI assistant that analyzes screenshots and describes them in detail."),
    UserMessage([
        TextContentItem(text=prompt),
        ImageContentItem(
            image_url=ImageUrl(
                url=f"data:image/png;base64,{base64_image}",
                detail=ImageDetailLevel.HIGH,
            ),
        ),
    ]),
]

for client, model in zip([aoai_client, github_client], models):
    try:
        response = client.complete(
            messages=messages,
            model=model,
        )
        print("Response:", response.choices[0].message.content)
        print("Model:", response.model)
        print("Usage:")
        print("	Prompt tokens:", response.usage.prompt_tokens)
        print("	Total tokens:", response.usage.total_tokens)
        print("	Completion tokens:", response.usage.completion_tokens)
    except Exception as e:
        print(f"Error processing with {client}: {str(e)}")

Endpoint URL: https://aoai-sweden-505.openai.azure.com//openai/deployments/gpt-4o
Response: The image appears to be a cropped portion of a webpage or a digital document. It features text that reads "Smart screen monitoring with" at the top. Below that, there are two tabs: "Screenshots" and "Monitor," with "Screenshots" highlighted or selected, indicated by blue text and a blue underline. The layout suggests a navigation bar or menu where users can switch between different sections or options related to screen monitoring.
Model: gpt-4o-2024-08-06
Usage:
	Prompt tokens: 286
	Total tokens: 373
	Completion tokens: 87
Response: The image shows a cropped portion of a webpage or software interface. At the top is partial text reading “Smart screen monitoring wit…” (the remainder is cut off). Below that, on a white background, there appears to be a horizontal navigation or tab bar. One tab is labeled “Screenshots” in blue text and is currently active, indicated by a solid blue underline stretch

In [6]:
model = "o4-mini"
response = github_client.complete(messages=messages,
                           model=model,
                           )

print("Response:", response.choices[0].message.content)
print("Model:", response.model)
print("Usage:")
print("	Prompt tokens:", response.usage.prompt_tokens)
print("	Total tokens:", response.usage.total_tokens)
print("	Completion tokens:", response.usage.completion_tokens)  

Response: The image is a small snippet of a web-app interface. Across the top you can see the title text (cut off) reading “Smart screen monitoring wit…” and just below it is a horizontal tab bar. The “Screenshots” tab is highlighted in blue (indicating it’s active) and immediately to its right is an unselected “Monitoring” tab.
Model: o4-mini-2025-04-16
Usage:
	Prompt tokens: 101
	Total tokens: 386
	Completion tokens: 285


In [24]:
import os
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import AssistantMessage, SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

token = os.environ["GITHUB_MODEL_TOKEN"]
endpoint = "https://models.github.ai/inference"
model_name = "openai/o3"

client = ChatCompletionsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(token),
    api_version="2024-12-01-preview",
)

messages = [
    {
        "role": "developer",
        "content": "You are a helpful assistant.",
    },
    UserMessage("What is the capital of France?"),
    AssistantMessage("The capital of France is Paris."),
    UserMessage("What about Spain?"),
]

response = client.complete(messages=messages, model=model_name)

print(response.choices[0].message.content)

The capital of Spain is Madrid.


In [7]:
# Test the refactored LLM API
import sys
import os
sys.path.insert(0, '/home/alibina/repo/screenAgent')

from src.api.llm_api import LLMAnalyzer

# Create analyzer with LLM enabled
config = {
    'llm_enabled': True,
    'llm_prompt': 'What is in this image?',
    'llm_model': os.getenv('LLM_MODEL', 'gpt-4o')
}

print("Testing refactored LLM API...")
analyzer = LLMAnalyzer(config)
print(f"Available providers: {analyzer.get_available_providers()}")
print(f"Is available: {analyzer.is_available()}")

# Test with the same image
image_path = "screenshots/run_20250624_102302_c0ee4d92/images/region_91b32eaa-f938-499e-985b-7f73f40e4143.png"

if analyzer.is_available():
    print("\n=== Testing Azure provider ===")
    result_azure = analyzer.analyze_image_from_path(image_path, "What is in this image?", provider='azure')
    if result_azure:
        print(f"✅ Azure analysis successful: {result_azure[:100]}...")
    else:
        print("❌ Azure analysis failed")
    
    print("\n=== Testing GitHub provider ===")
    result_github = analyzer.analyze_image_from_path(image_path, "What is in this image?", provider='github')
    if result_github:
        print(f"✅ GitHub analysis successful: {result_github[:100]}...")
    else:
        print("❌ GitHub analysis failed")
else:
    print("⚠️ No providers available")

Testing refactored LLM API...
🤖 Azure AI client initialized - Endpoint: https://aoai-sweden-505.openai.azure.com//openai/deployments/gpt-4o
🤖 GitHub Models client initialized
OpenAI setup failed: OpenAI API key not found
Available providers: ['azure', 'github']
Is available: True

Testing analysis with refactored API...
✅ Analysis successful: The image shows a portion of a webpage or application interface. At the top, there is a partial text that reads "Smart screen monitoring wit," suggesting a heading or title related to screen monitoring. Below this, there are two tabs labeled "Screenshots" and "Monitor." The "Screenshots" tab is highlighted in blue, indicating that it is currently selected or active. The interface has a clean and minimal design, with a white background and simple text formatting.
