In [2]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Checkpoint identifiers for the two Qwen2.5-VL instruct variants.
model_path3 = "Qwen/Qwen2.5-VL-3B-Instruct"
model_path7 = "Qwen/Qwen2.5-VL-7B-Instruct"

# Only the 3B checkpoint is loaded here: bfloat16 weights, FlashAttention-2
# attention implementation, and automatic device placement.
model3 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path3,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
# The 7B variant is left disabled; to try it, repeat the call above with
# model_path7 and bind the result to model7:
# model7 = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path7, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2",device_map="auto")

# The processor is created from the 3B checkpoint path.
processor = AutoProcessor.from_pretrained(model_path3)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [01:07<00:00, 33.54s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    Message,
    ContentItem,
)
from qwen_vl_utils import smart_resize
import json
from PIL import Image
from agent_function_call import MobileUse

In [46]:
model = model3

# Paths of the two screenshots the model is asked to compare.
screenshot1 = 'data/mcdonalds/93/93_1.png'
screenshot2 = 'data/mcdonalds/93/93_2.png'

# Verification prompt: the model must answer inside a <verify> JSON block.
user_query = """
Determine whether the action has been completed by examining the following two screenshots.
If the action has not been completed yet, return 0. If the action has been completed, return 1.
Action: SelectCategory 'Ala Carte & Value Meals'

Think step by step and provide the final answer. And return the answer in the following format:
<verify>
{
    "action_completed": 0,
    "reason": "The action has not been completed yet."
}
</verify>
"""

# Both screenshots are opened here; the inference cell passes them to the processor.
dummy_image1 = Image.open(screenshot1)
dummy_image2 = Image.open(screenshot2)

# smart_resize is given the first screenshot's height/width together with the
# image processor's patch/merge factor and pixel limits.
resized_height, resized_width = smart_resize(
    dummy_image1.height,
    dummy_image1.width,
    factor=processor.image_processor.patch_size * processor.image_processor.merge_size,
    min_pixels=processor.image_processor.min_pixels,
    max_pixels=processor.image_processor.max_pixels,
)

# NOTE(review): mobile_use is configured with the resized resolution but is not
# referenced again in the visible cells — confirm whether it is still needed.
mobile_use = MobileUse(
    cfg=dict(display_width_px=resized_width, display_height_px=resized_height)
)

# Build the system + user turns and convert them straight to plain dicts,
# which is the form handed to processor.apply_chat_template.
message = [
    msg.model_dump()
    for msg in (
        Message(
            role="system",
            content=[ContentItem(text="You are a helpful mobile agent and a good verifier")],
        ),
        Message(
            role="user",
            content=[
                ContentItem(text=user_query),
                ContentItem(image=f"file://{screenshot1}"),
                ContentItem(image=f"file://{screenshot2}"),
            ],
        ),
    )
]

In [None]:
# Render the chat messages into a single prompt string, ending with the
# assistant generation prompt.
text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
print("text",text)

# Move the inputs to the device the model lives on instead of a hard-coded
# 'cuda', since the model was loaded with device_map="auto".
inputs = processor(
    text=[text],
    images=[dummy_image1, dummy_image2],
    padding=True,
    return_tensors="pt",
).to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=2048)
# generate() returns prompt + completion; drop the prompt tokens of each row so
# only the newly generated tokens are decoded.
generated_ids = [
    full_seq[len(prompt_seq):]
    for prompt_seq, full_seq in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
print('output')
print(output_text)

# Parse the JSON verdict between <verify> and </verify>. Surrounding whitespace
# is tolerated, and a missing block fails with an explicit error instead of an
# IndexError from split(...)[1].
verify_payload, closing_tag = output_text.partition('<verify>')[2].partition('</verify>')[:2]
if not closing_tag:
    raise ValueError(f"No <verify>...</verify> block found in model output: {output_text!r}")
action = json.loads(verify_payload.strip())
print("action")
print(action['action_completed'])
print(action['reason'])


text <|im_start|>system
You are a helpful mobile agent and a good verifier<|im_end|>
<|im_start|>user

Determine whether the action has been completed by examining the following two screenshots.
If the action has not been completed yet, return 0. If the action has been completed, return 1.
Action: SelectCategory 'Ala Carte & Value Meals'

Think step by step and provide the final answer. And return the answer in the following format:
<verify>
{
    "action_completed": 0,
    "reason": "The action has not been completed yet."
}
</verify>
<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

output
<verify>
{
    "action_completed": 1,
    "reason": "The action has been completed as the 'ALA CARTE & VALUE MEALS' section is visible and accessible on the screen."
}
</verify>
action
{'action_completed': 1, 'reason': "The action has been completed as the 'ALA CARTE & VALUE MEALS' section is visible and accessible on the screen.

In [57]:
def _parse_verify_block(output_text):
    """Return the JSON object enclosed by ``<verify>`` and ``</verify>`` in ``output_text``.

    Whitespace around the JSON payload is ignored.

    Raises:
        ValueError: if no complete ``<verify>...</verify>`` block is present, or
            if the enclosed text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass).
    """
    payload, closing_tag = output_text.partition('<verify>')[2].partition('</verify>')[:2]
    if not closing_tag:
        raise ValueError(f"No <verify>...</verify> block found in model output: {output_text!r}")
    return json.loads(payload.strip())


def verifier(model, screenshot1, screenshot2, action, verbose=True):
    """Ask ``model`` whether ``action`` was completed between two screenshots.

    Builds a system + user chat (prompt text followed by both screenshots),
    runs generation, and parses the ``<verify>`` JSON block from the reply.
    Uses the module-level ``processor`` for templating, preprocessing and
    decoding.

    Args:
        model: Model object exposing ``generate`` and ``device``.
        screenshot1: Path of the first screenshot.
        screenshot2: Path of the second screenshot.
        action: Text of the action to verify; interpolated into the prompt.
        verbose: When True (default), print the rendered prompt, the raw model
            output and the parsed verdict, as the original implementation did.

    Returns:
        The parsed verdict dict; the prompt asks the model for the keys
        ``"action_completed"`` and ``"reason"``.

    Raises:
        ValueError: if the model output has no parseable ``<verify>`` block.
        KeyError: in verbose mode, if an expected key is missing from the verdict.
    """
    # NOTE: the indentation before the closing quotes is part of the prompt
    # string; it is kept as-is so the text sent to the model is unchanged.
    user_query = f"""
Determine whether the action has been completed by examining the following two screenshots.
If the action has not been completed yet, return 0. If the action has been completed, return 1.
Action: {action}

Think step by step and provide the final answer. And return the answer in the following format:
<verify>
{{
    "action_completed": 0,
    "reason": "The action has not been completed yet."
}}
</verify>
    """

    message = [
        Message(role="system", content=[ContentItem(text="You are a helpful mobile agent and a good verifier")]),
        Message(role="user", content=[
            ContentItem(text=user_query),
            ContentItem(image=f"file://{screenshot1}"),
            ContentItem(image=f"file://{screenshot2}")
        ]),
    ]
    message = [msg.model_dump() for msg in message]

    text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
    if verbose:
        print("text",text)

    # Open the screenshots in a context manager so the file handles are closed
    # once the processor has consumed them. Inputs go to the model's own device
    # rather than a hard-coded 'cuda'.
    with Image.open(screenshot1) as image1, Image.open(screenshot2) as image2:
        inputs = processor(
            text=[text],
            images=[image1, image2],
            padding=True,
            return_tensors="pt",
        ).to(model.device)

    output_ids = model.generate(**inputs, max_new_tokens=2048)
    # generate() returns prompt + completion; keep only the new tokens per row.
    generated_ids = [
        full_seq[len(prompt_seq):]
        for prompt_seq, full_seq in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
    if verbose:
        print('output')
        print(output_text, '\n')

    # Parse into a separate name so the `action` argument is not overwritten.
    result = _parse_verify_block(output_text)
    if verbose:
        print(f"verify: {result['action_completed']}")
        print(f"reason: {result['reason']}")
    return result

In [58]:
# Run the same check as the ad-hoc cells above, this time through verifier(),
# using the 3B model.
model = model3
action = "SelectCategory 'Ala Carte & Value Meals'"
screenshot1 = 'data/mcdonalds/93/93_1.png'
screenshot2 = 'data/mcdonalds/93/93_2.png'

verifier(
    model=model,
    screenshot1=screenshot1,
    screenshot2=screenshot2,
    action=action,
)

text <|im_start|>system
You are a helpful mobile agent and a good verifier<|im_end|>
<|im_start|>user

Determine whether the action has been completed by examining the following two screenshots.
If the action has not been completed yet, return 0. If the action has been completed, return 1.
Action: SelectCategory 'Ala Carte & Value Meals'

Think step by step and provide the final answer. And return the answer in the following format:
<verify>
{
    "action_completed": 0,
    "reason": "The action has not been completed yet."
}
</verify>
    <|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

output
<verify>
{
    "action_completed": 1,
    "reason": "The action has been completed as the 'ALA CARTE & VALUE MEALS' section is visible and accessible on the screen."
}
</verify> 

verify: 1
reason: The action has been completed as the 'ALA CARTE & VALUE MEALS' section is visible and accessible on the screen.
