In [None]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the test image. Only ONE image is fetched; both samples in the batch
# below reuse it.
image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")

# Initialize the processor only -- no model is instantiated in this cell.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")


def _single_image_turn(question):
    """Build a one-message conversation: a user turn with one image slot and a question.

    Args:
        question: Text of the question asked about the image.

    Returns:
        A list holding a single chat message dict with role "user" whose content
        is an image placeholder followed by the question text.
    """
    return [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]


# Batch of two conversations (parallel prompts), each asking about one image.
# NOTE(review): both samples are paired with the same Statue of Liberty image
# (image1); only the question wording differs. If a second landmark was
# intended for sample 2, load it above and pass it in `images` below.
messages = [
    _single_image_turn("What landmark is in this picture?"),
    _single_image_turn("What landmark is shown here?"),
]

# Render each conversation to a prompt string, ending with the generation prompt.
prompts = [processor.apply_chat_template(m, add_generation_prompt=True) for m in messages]

# Tokenize / preprocess the whole batch at once, padding to a common length,
# then move the resulting tensors to DEVICE.
inputs = processor(
    text=prompts,
    images=[[image1], [image1]],   # one inner list of images per prompt, same order as prompts
    return_tensors="pt",
    padding=True
).to(DEVICE)

In [None]:
# Display the tokenizer's end-of-sequence token id (shown as the cell's output
# because it is the last expression in the cell).
processor.tokenizer.eos_token_id

In [None]:
# One line per processor output: its key followed by that entry's shape.
for k, v in inputs.items():
    print(f"{k} {v.shape}")

In [None]:
# Dump the full token ids, then the attention mask, as nested Python lists
# (one inner list per sample), each on its own output line.
print(
    inputs["input_ids"].tolist(),
    inputs["attention_mask"].tolist(),
    sep="\n",
)