# Basic Image Query
Create a simple function to send an image to a vision model and get a response to a general question about the image.

In [None]:
def generate_model_response(encoded_image, user_query, assistant_prompt="You are a helpful assistant. Answer the following user query in 1 or 2 sentences: "):
    """Send an image and a text query to the vision model and return its reply.

    Args:
        encoded_image: Base64-encoded image data as a string, without any
            data-URI prefix (the prefix is added here).
        user_query: The question to ask about the image.
        assistant_prompt: Instruction text prepended to ``user_query``.

    Returns:
        The ``content`` of the first choice's message in the model response.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": assistant_prompt + user_query,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        # A data URI needs the "data:<mime>;base64," prefix.
                        # NOTE(review): assumes JPEG input - adjust the MIME
                        # type if other image formats are sent.
                        "url": "data:image/jpeg;base64," + encoded_image,
                    },
                },
            ],
        }
    ]
    # NOTE(review): `model` is a module-level object created elsewhere
    # (presumably the initialized vision model) - confirm it exists before
    # calling this function.
    response = model.chat(messages=messages)
    return response["choices"][0]["message"]["content"]

# Example usage:
user_query = "Describe the photo"
# Pass one complete base64 string taken from the `encoded_images` list;
# indexing a single encoded string would only yield its first character.
response = generate_model_response(encoded_images[0], user_query)
print("Description: ", response)

# Basic Object Detection
Use the vision model to detect and count objects in images by asking specific questions.

In [None]:
# Detection examples for various use cases
# Position in `encoded_images` of the image to analyse; change as needed.
image_index = 0
image = encoded_images[image_index]

# Counting question
result = generate_model_response(
    image,
    "How many cars are there in this image?"
)
print("Cars detected: ", result)

# Attribute question about the same image
result = generate_model_response(
    image,
    "What color is the woman's jacket in this image?"
)
# Include the model's answer in the output; the label alone is not useful.
print("Clothing analysis: ", result)

# Creating message for the vision model
Format a request with both text and image data to send to the multimodal model.

In [None]:
def create_vision_image(prompt, encoded_image):
    """Build the chat message list for a combined text + image request.

    Args:
        prompt: Text instruction or question for the model.
        encoded_image: Base64-encoded image data as a string, without any
            data-URI prefix (the prefix is added here).

    Returns:
        A one-element list holding a single "user" message whose content
        has a text part followed by an image part.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        # Embed the image as a data URI so it is actually
                        # sent with the request.
                        # NOTE(review): assumes JPEG input - adjust the MIME
                        # type if other image formats are sent.
                        "url": "data:image/jpeg;base64," + encoded_image,
                    },
                },
            ],
        }
    ]
    return messages

# Environment setup
Create and activate a virtual environment, then install necessary packages for multimodal applications.

In [None]:
# Create a virtual environment in ./venv using the python3.11 interpreter.
python3.11 -m venv venv
# Activate it in the current shell so the pip commands below install into it.
source venv/bin/activate
# Install the watsonx client, image, flask and requests; every package except
# flask is pinned to an exact version with ==.
pip install ibm-watsonx-ai==1.1.20 image==1.5.33 flask requests==2.32.0
# Install the unpinned packages: torch, torchvision, scikit-learn, pillow, gradio.
pip install torch torchvision scikit-learn pillow gradio

# Flask integration for vision AI and web app
Basic Flask setup to create a web application with vision AI capabilities.

In [None]:
from flask import Flask, render_template, request
app = Flask(__name__)
@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the upload form and, on POST, the model's answer about the image.

    On a POST that includes an uploaded file, the image is base64-encoded,
    sent to the vision model together with the user's query, and the page
    is rendered with the query and the response. For GET requests, or a
    POST without a file, the plain page is rendered.
    """
    if request.method == "POST":
        # Flask exposes form fields and uploads on `request` (singular); the
        # `requests` HTTP client library has no `form` or `files` attributes.
        # Default to "" so a missing field cannot break prompt concatenation.
        user_query = request.form.get("user_query", "")
        uploaded_file = request.files.get("file")
        if uploaded_file:
            # Process the uploaded image
            encoded_image = input_image_setup(uploaded_file)
            # Generate the model's response; the default assistant prompt of
            # generate_model_response is used.
            response = generate_model_response(encoded_image, user_query)
            # Render the result
            return render_template("index.html", user_query=user_query, response=response)
    # Reached for GET and for POST without a file: a Flask view must always
    # return a response, never None.
    return render_template("index.html")

# Start the development server only when this file is executed directly.
if __name__ == "__main__":
    # debug=True is meant for local development only.
    app.run(debug=True)

# Image encoding from URLs
Load and encode multiple images from URLs to base64 format for batch processing with vision models.

In [None]:
import requests
import base64

# Define image URLs
url_image_1 = "https://example.com/image1.jpg"
url_image_2 = "https://example.com/image2.jpg"
image_urls = [url_image_1, url_image_2]

# Encode all images: download each one and store its bytes as a base64 string
encoded_images = []
for url in image_urls:
    # A timeout keeps a stalled server from hanging the script forever.
    reply = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of encoding an error page as an image.
    reply.raise_for_status()
    encoded_images.append(base64.b64encode(reply.content).decode("utf-8"))

# Image encoding from uploads
Read an uploaded image file and encode it to base64 format for use with vision models.

In [None]:
import base64
from io import BytesIO

import requests
from PIL import Image

def input_image_setup(uploaded_file):
    """Read an uploaded file and return its contents as a base64 string.

    Args:
        uploaded_file: File-like object exposing ``read()`` that returns
            bytes, or ``None`` when nothing was uploaded.

    Returns:
        The file's bytes encoded as a UTF-8 base64 string.

    Raises:
        FileNotFoundError: If ``uploaded_file`` is ``None``.
    """
    if uploaded_file is None:
        # Raise the error rather than returning the exception object, which
        # callers would otherwise mistake for encoded image data.
        raise FileNotFoundError("No file uploaded")
    # Read file into bytes, then encode to a base64 string
    bytes_data = uploaded_file.read()
    return base64.b64encode(bytes_data).decode("utf-8")

# Similarity Match
Find the closest matching image in a dataset based on cosine similarity of vector embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_closest_match(user_vector, dataset):
    """Find the dataset row whose embedding is most similar to ``user_vector``.

    Args:
        user_vector: 1-D array-like embedding of the query item.
        dataset: Table indexed with ``["Embedding"]`` and ``.iloc``
            (presumably a pandas DataFrame - confirm with the caller) whose
            "Embedding" column holds one vector per row. Rows with a missing
            embedding are ignored.

    Returns:
        A ``(closest_row, similarity_score)`` tuple, or ``(None, None)`` if
        anything goes wrong (the error is printed).
    """
    try:
        # Drop rows without an embedding *before* stacking, so positions in
        # the stacked matrix line up with positions in `valid_rows`.
        valid_rows = dataset.dropna(subset=["Embedding"])
        # Stack all vectors from dataset into an (n_rows, n_features) matrix
        dataset_vectors = np.vstack(valid_rows["Embedding"].values)
        # cosine_similarity expects 2-D input of shape (n_samples, n_features):
        # the query is one sample, hence (1, -1) rather than (-1, 1).
        query = np.asarray(user_vector).reshape(1, -1)
        # Calculate similarities; result has shape (1, n_rows)
        similarities = cosine_similarity(query, dataset_vectors)
        # Find highest similarity index
        closest_index = int(np.argmax(similarities[0]))
        similarity_score = similarities[0][closest_index]
        # Get corresponding dataset row
        closest_row = valid_rows.iloc[closest_index]
        return closest_row, similarity_score
    except Exception as e:
        # Best-effort contract: report the problem and signal "no match".
        print(f"Error finding closest match {e}")
        return None, None

# Vector Embeddings for images
Convert images to vector embeddings for similarity matching using a pre-trained ResNet50 model

In [None]:
import torchvision.transforms as transforms
import torchvision.models as resnet50
import numpy as np

class ImageProcessor:
    """Encode images as base64 strings and as ResNet50 output vectors."""

    def __init__(self, image_size=(224, 224),
                 norm_mean=(0.485, 0.456, 0.406),
                 norm_std=(0.229, 0.224, 0.225)):
        """Load the pre-trained model and build the preprocessing pipeline.

        Args:
            image_size: Size passed to ``transforms.Resize``.
            norm_mean: Per-channel mean passed to ``transforms.Normalize``.
            norm_std: Per-channel std passed to ``transforms.Normalize``.
        """
        # Imported locally so the class does not depend on the surrounding
        # imports: `torch` is not imported at file level, and the file-level
        # `import torchvision.models as resnet50` binds the *module* (which
        # is not callable) to the name `resnet50`.
        import torch
        from torchvision.models import ResNet50_Weights, resnet50

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # IMAGENET1K_V1 is the weight set the deprecated `pretrained=True`
        # flag maps to.
        self.model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).to(self.device)
        self.model.eval()  # Set model to evaluation mode
        # Image preprocessing pipeline
        self.preprocess = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=norm_mean, std=norm_std),
        ])

    def encode_image(self, image_input, is_url=True):
        """Return the base64 string and model output vector for one image.

        Args:
            image_input: URL of the image when ``is_url`` is true, otherwise
                a local path or file object accepted by ``Image.open``.
            is_url: Whether ``image_input`` should be fetched over HTTP.

        Returns:
            ``{"base64": str, "vector": numpy.ndarray}`` on success, or
            ``{"base64": None, "vector": None}`` if any step fails (the
            error is printed).
        """
        import torch  # see the note in __init__ about local imports

        try:
            if is_url:
                # Fetch image from URL; the timeout avoids hanging forever and
                # raise_for_status stops error pages being decoded as images.
                response = requests.get(image_input, timeout=30)
                response.raise_for_status()
                image = Image.open(BytesIO(response.content)).convert("RGB")
            else:
                # Load from local file
                image = Image.open(image_input).convert("RGB")
            # Convert image to base64 (re-encoded as JPEG)
            buffered = BytesIO()
            image.save(buffered, format="JPEG")
            base64_string = base64.b64encode(buffered.getvalue()).decode("utf-8")
            # Run the model on a batch of one; gradients are not needed
            input_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
            with torch.no_grad():
                features = self.model(input_tensor)
            # Convert to a flat numpy array on the CPU
            feature_vector = features.cpu().numpy().flatten()
            return {"base64": base64_string, "vector": feature_vector}
        except Exception as e:
            # Best-effort contract: report the problem and return empty fields.
            print(f"Error encoding image: {e}")
            return {"base64": None, "vector": None}

# Vision Model Initialization
Set up credentials and initialize the Llama 3.2 Vision Instruct model through Watsonx AI.

In [None]:
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.schema import TextChatParameters

# Replace the placeholder URL and API key with real values before running;
# avoid committing a real key to version control.
credentials = Credentials(
    url="....",
    api_key="YOUR_API_KEY",
)
# Build the API client from the credentials object defined above.
client = APIClient(credentials)

model_id = "meta-llama/llama-3-2-90b-vision-instruct"
project_