In [None]:
import os
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from playwright.async_api import async_playwright
from qwen_vl_utils import process_vision_info
from PIL import Image
import re
import nest_asyncio
from IPython.display import display

nest_asyncio.apply()


import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# 1. CLEANUP: Drop any previously loaded model/processor (e.g. the old 4-bit
# model from an earlier cell) so its GPU memory can be reclaimed.
# Each name is removed independently: a missing `model` must not stop
# `processor` from being dropped, and no unrelated error is silently swallowed.
globals().pop("model", None)
globals().pop("processor", None)
# Return cached, now-unreferenced blocks to the driver before the new load.
torch.cuda.empty_cache()

print("Initializing Qwen2-VL in HIGH DEFINITION (float16)...")

# Single source of truth for the checkpoint used by both model and processor.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# NO BitsAndBytesConfig needed!
# We load directly in float16.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto"
)

processor = AutoProcessor.from_pretrained(MODEL_ID)

print(f"Model loaded! VRAM used: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# --- 1. THE "BIG MODE" MARKER SCRIPT ---
# JavaScript evaluated inside the page to overlay numbered labels on
# interactive elements. It first removes any existing `.som-marker` labels,
# then, for every button, input, link, [role="button"] or
# [aria-label="Search by voice"] element that is larger than 10x10 px and has
# non-negative top/left coordinates, it:
#   - tags the element with a sequential `data-som-id` attribute, and
#   - appends a fixed-position label showing that id at the element's
#     top-left corner (the label ignores pointer events).
# The script evaluates to the number of elements that were labelled.
# Label styling, chosen so the numbers are legible in the screenshot:
# - Background: Yellow (High visibility)
# - Font Size: 24px (Huge)
# - Border: Thick Black (Clear separation)
JS_MARKER_SCRIPT_BIG = """
(function() {
    let idCounter = 0;
    document.querySelectorAll('.som-marker').forEach(el => el.remove());

    const elements = document.querySelectorAll('button, input, a, [role="button"], [aria-label="Search by voice"]');

    elements.forEach(el => {
        const rect = el.getBoundingClientRect();
        if (rect.width > 10 && rect.height > 10 && rect.top >= 0 && rect.left >= 0) {
            const id = ++idCounter;
            el.setAttribute('data-som-id', id);

            const label = document.createElement('div');
            label.className = 'som-marker';
            label.innerText = id;
            label.style.position = 'fixed';
            // Center the label on the element slightly
            label.style.left = (rect.left) + 'px';
            label.style.top = (rect.top) + 'px';

            // VISIBILITY STYLES
            label.style.backgroundColor = '#FFFF00'; // Bright Yellow
            label.style.color = '#000000';           // Black Text
            label.style.fontSize = '24px';           // HUGE FONT
            label.style.fontWeight = '900';          // Extra Bold
            label.style.padding = '4px 8px';
            label.style.border = '3px solid black';  // High Contrast Border
            label.style.zIndex = '10000';
            label.style.pointerEvents = 'none';
            document.body.appendChild(label);
        }
    });
    return idCounter;
})();
"""

async def run_som_agent_robust(target_description="microphone icon"):
    """Locate an element on google.com by description and click it.

    Steps:
      1. Open https://www.google.com in headless Chromium (1280x800 viewport).
      2. Inject ``JS_MARKER_SCRIPT_BIG`` so candidate elements get a numbered
         label and a matching ``data-som-id`` attribute.
      3. Screenshot the page and ask the module-level ``model`` /
         ``processor`` which number sits on ``target_description``.
      4. Take the last number in the model's answer, click the element
         carrying that ``data-som-id`` and save a follow-up screenshot.

    Args:
        target_description: Free-text description of the element to find;
            it is interpolated into the prompt sent to the model.

    Returns:
        None. Progress is printed, and screenshots are written to
        ``som_big_mode.png`` and (after a click) ``success_big_mode.png``
        in the current working directory.
    """
    url = "https://www.google.com"
    screenshot_path = "som_big_mode.png"
    success_path = "success_big_mode.png"

    print("--- QUEST 3: BIG MODE ---")
    print(f"Target: '{target_description}'")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally so the browser is closed even if navigation, inference
        # or the click raises.
        try:
            page = await browser.new_page(viewport={'width': 1280, 'height': 800})
            await page.goto(url)
            await page.wait_for_load_state("networkidle")

            # 1. Inject GIANT Markers
            print("Injecting GIANT markers...")
            await page.evaluate(JS_MARKER_SCRIPT_BIG)
            await page.wait_for_timeout(500)

            # 2. Capture
            await page.screenshot(path=screenshot_path)
            # Verify the markers are readable to YOU
            with Image.open(screenshot_path) as marked_image:
                display(marked_image)

            # 3. The "Point Blank" Prompt
            # We ask it to find the visual feature, then read the number.
            prompt = (
                f"Look at the image. Find the '{target_description}'. "
                f"There is a bright yellow box with a black number on top of it. "
                f"What is that number?"
            )

            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": screenshot_path},
                    {"type": "text", "text": prompt},
                ],
            }]

            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)

            inputs = processor(
                text=[text],
                images=image_inputs,
                padding=True,
                return_tensors="pt"
            ).to(model.device)

            print("Asking Qwen...")
            with torch.no_grad():
                generated_ids = model.generate(**inputs, max_new_tokens=50)

            # generate() returns prompt + completion. Drop the prompt tokens so
            # that numbers echoed from the prompt (e.g. digits inside
            # target_description) can never be mistaken for the model's answer.
            prompt_length = inputs.input_ids.shape[1]
            answer_ids = generated_ids[:, prompt_length:]

            output_text = processor.batch_decode(answer_ids, skip_special_tokens=True)[0]
            print(f"ü§ñ AI Said: '{output_text}'")

            # 4. Parsing
            # Grab the last number mentioned; it's usually the answer.
            ids = re.findall(r"(\d+)", output_text)
            if not ids:
                print("‚ùå No numbers found in response.")
                return

            target_id = ids[-1]  # Take the last number found
            print(f"‚úÖ Target ID: {target_id}")

            selector = f'[data-som-id="{target_id}"]'
            if await page.locator(selector).count() == 0:
                print(f"‚ùå AI saw #{target_id}, but it's not in the DOM.")
                return

            print(f"Clicking #{target_id}...")
            # Hover moves the mouse over the element; down/up then performs
            # the click at that position.
            await page.hover(selector)
            await page.mouse.down()
            await page.mouse.up()
            await page.wait_for_timeout(2000)
            await page.screenshot(path=success_path)
            with Image.open(success_path) as result_image:
                display(result_image)
        finally:
            await browser.close()

# Try looking for the microphone again, this time with a more specific
# description of where the icon sits.
# NOTE: top-level `await` only works where the cell/REPL supports it (e.g. an
# IPython/Jupyter kernel); it is a syntax error in a plain Python script.
await run_som_agent_robust("microphone icon inside the search bar")