In [None]:
import re
from PIL import Image, ImageDraw
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from playwright.async_api import async_playwright
from qwen_vl_utils import process_vision_info
import torch

# 1. Setup
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint used by both the model and its processor.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Re-running this cell must not reload the weights, so objects that already exist
# are kept. Each one is guarded separately: if a previous run loaded the model but
# failed before the processor was created, the processor is still loaded now.
if "model" not in globals():
    model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto",
                                                            device_map=device)
if "processor" not in globals():
    processor = AutoProcessor.from_pretrained(MODEL_ID)


# Coordinate layouts accepted for a bounding box in the model's reply:
#   "(x1, y1) to (x2, y2)", "(x1,y1),(x2,y2)" and "[x1, y1, x2, y2]".
_BBOX_PATTERN = re.compile(
    r"[(\[]\s*(\d+)\s*,\s*(\d+)\s*[)\]]?"  # first corner, closing bracket optional
    r"\s*(?:,|to)?\s*"                     # separator between the two corners
    r"[(\[]?\s*(\d+)\s*,\s*(\d+)\s*[)\]]"  # second corner
)


def _parse_bounding_box(text):
    """Return the first box in *text* as ordered ints (x1, y1, x2, y2), or None."""
    match = _BBOX_PATTERN.search(text)
    if match is None:
        return None
    ax, ay, bx, by = map(int, match.groups())
    # Order the corners so the box is always top-left -> bottom-right;
    # ImageDraw.rectangle rejects boxes whose second corner precedes the first.
    return min(ax, bx), min(ay, by), max(ax, bx), max(ay, by)


async def find_and_verify_element(url: str, element_description: str) -> "tuple[float, float] | None":
    """Screenshot *url*, ask the vision model for an element's box, and mark it.

    The page is captured to ``page.png``. When the model's reply contains a
    bounding box, the box and its center are drawn on the screenshot and the
    result is written to ``verified_result.png``. Both files are written to the
    current working directory, and progress is reported with ``print``.

    Relies on the module-level ``model``, ``processor`` and ``device``.

    Args:
        url: Address of the page to open in headless Chromium.
        element_description: Natural-language description of the element,
            inserted verbatim into the prompt.

    Returns:
        ``(center_x, center_y)`` in screenshot pixels, or ``None`` when no
        coordinates could be parsed from the model's reply.
    """
    image_path = "page.png"

    # --- Capture ---
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            # Size the viewport before navigating so the page is laid out at
            # the final size instead of being resized after it has loaded.
            await page.set_viewport_size({"width": 1024, "height": 1024})
            await page.goto(url)
            await page.screenshot(path=image_path)
        finally:
            # Close the browser even when navigation or the screenshot fails.
            await browser.close()

    # --- Inference ---
    # Ask for the bounding box; the center point is derived from it below.
    prompt = f"Find the bounding box of the {element_description}."

    messages = [{
        "role": "user",
        "content": [{"type": "image", "image": image_path}, {"type": "text", "text": prompt}]
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + completion. Drop the prompt tokens so that
    # digits appearing in the prompt (e.g. inside element_description) can
    # never be mistaken for the model's answer.
    completion_ids = [
        output_ids[len(prompt_ids):]
        for prompt_ids, output_ids in zip(inputs["input_ids"], generated_ids)
    ]
    output_text = processor.batch_decode(completion_ids, skip_special_tokens=True)[0]
    print(f"Model said: {output_text}")

    # --- Parse ---
    box = _parse_bounding_box(output_text)
    if box is None:
        print("No coordinates found.")
        return None

    # Interpreted as [x_min, y_min, x_max, y_max].
    # NOTE(review): if the drawn box looks transposed, the model may be
    # emitting y before x -- confirm against the saved verified_result.png.
    norm_x1, norm_y1, norm_x2, norm_y2 = box

    with Image.open(image_path) as img:
        # Get Real Dimensions
        W, H = img.size

        # Convert 0-1000 normalized coordinates to pixels
        x1 = (norm_x1 / 1000) * W
        y1 = (norm_y1 / 1000) * H
        x2 = (norm_x2 / 1000) * W
        y2 = (norm_y2 / 1000) * H

        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2

        print(f"Found Coordinates: Center({center_x:.1f}, {center_y:.1f})")

        # --- Verify Visual (Python Side) ---
        draw = ImageDraw.Draw(img)
        # Draw Box
        draw.rectangle([x1, y1, x2, y2], outline="red", width=4)
        # Draw Center
        r = 10
        draw.ellipse((center_x - r, center_y - r, center_x + r, center_y + r), fill="green", outline="black")

        # Save the annotated screenshot for manual inspection
        img.save("verified_result.png")

    return (center_x, center_y)


# Run
# The bare top-level `await` below is only valid where the environment already
# provides a running event loop (e.g. an IPython/Jupyter cell); in a plain
# Python script the coroutine would have to be driven with asyncio.run(...).
# Note: If you want the gray button specifically, try prompting: "the gray Google Search button below the text bar"
await find_and_verify_element("https://www.google.com", "Google Search button")