In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Checkpoint identifier shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the model weights; dtype selection and device placement are both
# delegated to transformers via the "auto" settings.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# The default range for the number of visual tokens per image in the model is 4-16384.
# To trade quality against cost, pass min_pixels / max_pixels to the processor,
# e.g. for a token range of 256-1280:
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)

# Few-shot demonstrations as (image path, user prompt, assistant reply) triples.
# Each triple expands into one user turn (image + text) followed by one
# assistant turn.
few_shot_examples = [
    # Example 1
    (
        "gqa/2354919.jpg",
        "Who is riding a bike? Please provide the bounding box coordinate like [x_min, y_min, x_max, y_max].",
        "The man is riding a bike. The bounding box coordinates are [56, 272, 88, 319]. Reasoning: Identify the bike, associate the person riding it, then name the person.",
    ),
    # Example 2
    (
        "gqa/2334016.jpg",
        "What animal is to the right of the man? Please provide the bounding box coordinate like [x_min, y_min, x_max, y_max].",
        "The animal is a horse. The bounding box coordinates are [339, 161, 419, 237]. Reasoning: Find the man, locate the animal to his right, determine species.",
    ),
    # Example 3
    (
        "gqa/2415528.jpg",
        "The pot is where? Please provide the bounding box coordinate like [x_min, y_min, x_max, y_max].",
        "The pot is on the floor. The bounding box coordinates are [5, 71, 497, 374]. Reasoning: Identify the pot, find its spatial relation, name the location.",
    ),
    # Example 4
    (
        "gqa/2414884.jpg",
        "What is the man watching, a sheep or a donkey? Please provide the bounding box coordinate like [x_min, y_min, x_max, y_max].",
        "The man is watching a sheep. The bounding box coordinates are [299, 124, 310, 135]. Reasoning: Identify the man, see which animal he watches, compare sheep and donkey features.",
    ),
    # Example 5
    (
        "gqa/2391800.jpg",
        "What animal is to the right of the man? Please provide the bounding box coordinate like [x_min, y_min, x_max, y_max].",
        "The animal is a dog. The bounding box coordinates are [169, 82, 250, 182]. Reasoning: Identify the man, locate animal to the right, determine its type.",
    ),
]

# Flatten the triples into the alternating user/assistant conversation.
messages = [
    turn
    for image_path, prompt, reply in few_shot_examples
    for turn in (
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        },
        {"role": "assistant", "content": reply},
    )
]

# test query
# NOTE(review): this instruction is worded differently from the five examples
# above (no "[x_min, y_min, x_max, y_max]" format hint), and the recorded output
# of this cell contains no bounding box -- confirm whether that is intentional.
messages.append(
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "gqa/2355243.jpg"},
            {"type": "text", "text": "Who is wearing the backpack? Please provide the bounding box coordinate of the region that can help you answer the question better."},
        ],
    }
)


# Preparation for inference
# Render the conversation into a single prompt string (not token ids);
# add_generation_prompt=True asks the template to end with the opening of an
# assistant turn so that generation continues as the assistant.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Gather the image / video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move the inputs to wherever the model actually lives. The model is loaded
# with device_map="auto", so hard-coding "cuda" breaks on hosts without a CUDA
# device and may not match the device the model was placed on.
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=100)
# Slice off the prompt tokens from each sequence so that only the newly
# generated continuation is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


['The person wearing the backpack appears to be the individual in the foreground on the right side of the image, facing away from the camera. The backpack is visible on their back.']


In [4]:
# Few-shot demonstrations in the "Answer / CoT BBox" format, stored as
# (image path, user prompt, assistant reply) triples. Each triple expands into
# one user turn (image + text) followed by one assistant turn.
cot_bbox_examples = [
    # Example 1
    (
        "gqa/2354919.jpg",
        "Who is riding a bike? Please provide the answer and CoT BBox like [x_min, y_min, x_max, y_max].",
        "Answer: The man is riding a bike.\nCoT BBox: [56, 272, 88, 319]",
    ),
    # Example 2
    (
        "gqa/2334016.jpg",
        "What animal is to the right of the man? Please provide the answer and CoT BBox like [x_min, y_min, x_max, y_max].",
        "Answer: The animal is a horse.\nCoT BBox: [339, 161, 419, 237]",
    ),
    # Example 3
    (
        "gqa/2415528.jpg",
        "The pot is where? Please provide the answer and CoT BBox like [x_min, y_min, x_max, y_max].",
        "Answer: The pot is on the floor.\nCoT BBox: [5, 71, 497, 374]",
    ),
    # Example 4
    (
        "gqa/2414884.jpg",
        "What is the man watching, a sheep or a donkey? Please provide the answer and CoT BBox like [x_min, y_min, x_max, y_max].",
        "Answer: The man is watching a sheep.\nCoT BBox: [299, 124, 310, 135]",
    ),
    # Example 5
    (
        "gqa/2391800.jpg",
        "What animal is to the right of the man? Please provide the answer and CoT BBox like [x_min, y_min, x_max, y_max].",
        "Answer: The animal is a dog.\nCoT BBox: [169, 82, 250, 182]",
    ),
]

# Flatten the triples into the alternating user/assistant conversation.
messages = [
    turn
    for image_path, prompt, reply in cot_bbox_examples
    for turn in (
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        },
        {"role": "assistant", "content": reply},
    )
]

# test query -- same instruction wording as the examples above
messages.append(
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "gqa/2355243.jpg"},
            {"type": "text", "text": "Who is wearing the backpack? Please provide the answer and CoT BBox like [x_min, y_min, x_max, y_max]."},
        ],
    }
)


# Preparation for inference
# Render the conversation into a single prompt string (not token ids);
# add_generation_prompt=True asks the template to end with the opening of an
# assistant turn so that generation continues as the assistant.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Gather the image / video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move the inputs to wherever the model actually lives. The model is loaded
# with device_map="auto", so hard-coding "cuda" breaks on hosts without a CUDA
# device and may not match the device the model was placed on.
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=100)
# Slice off the prompt tokens from each sequence so that only the newly
# generated continuation is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


['Answer: The man is wearing the backpack.\nCoT BBox: [304, 112, 426, 314]']
