# Image Text to Text

A minimal example of image-text-to-text generation (asking a vision-language model a question about an image) using Hugging Face Transformers.

## Imports

In [None]:
from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers.image_utils import load_image

## Load Vision Model

In [None]:
# Hub id of the checkpoint to load; the "-HF" suffix marks the
# Transformers-format release of this model.
model_id = "OpenGVLab/InternVL3_5-1B-HF"

# device_map="auto" delegates weight placement to the library, and
# dtype="bfloat16" loads the weights in half precision.
# trust_remote_code is intentionally NOT enabled: it allows Python code shipped
# in the Hub repository to run locally, and the Auto* classes imported above
# are expected to resolve this checkpoint with the library's own
# implementation, so the extra trust is unnecessary.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    device_map="auto",
    dtype="bfloat16",
)

# The processor bundles the tokenizer, image preprocessing and chat template
# that belong to the same checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

## Image Text Generation

In [None]:
# Fetch the example picture from the web.
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = load_image(url)

# A single user turn that carries the image together with a question about it.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is in this image?"},
        ],
    },
]

# Render the chat template, tokenize the text and preprocess the image in one
# call. add_generation_prompt=True appends the assistant header so the model
# starts answering instead of continuing the user turn.
# The result is moved to the model's device AND cast to the model's dtype:
# only floating-point tensors (the pixel values) are cast, so token ids keep
# their integer type while the image matches the bfloat16 weights.
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=model.dtype)

outputs = model.generate(**inputs, max_new_tokens=64)

# generate() returns the prompt tokens followed by the newly generated ones;
# drop the prompt so that only the model's answer is printed.
prompt_length = inputs["input_ids"].shape[1]
answer_tokens = outputs[:, prompt_length:]
print(processor.batch_decode(answer_tokens, skip_special_tokens=True)[0])