In [2]:
import base64
from openai import OpenAI
import json
import os.path as osp

client = OpenAI(api_key='')

In [3]:
# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def inference(image_path, inst, detail='low'):
    # Getting the base64 string
    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Given a GUI image, what are the relative (0-1000) pixel point coordinates for the element corresponding to the following instruction or description: {inst}. Please provide the response in the format (x, y).",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": f"{detail}"},
                    },
                ],
            }
        ],
    )

    return response

def make_batch(id, image_path, inst, detail='low'):
    base64_image = encode_image(image_path)
    prompt = {
        "custom_id": id, 
        "method": "POST", 
        "url": "/v1/chat/completions", 
        "body": {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "system", 
                    "content": "You are an AI assistant that performs GUI grounding using a GUI screenshot image."
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Given a GUI image, what are the relative (0-1000) pixel point coordinates for the element corresponding to the following instruction or description: {inst}. Please provide the response in the format (x, y).",
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}", "detail": f"{detail}"},
                        },
                    ]
                }
            ], "max_tokens": 1000}}
    return prompt

In [4]:
# make batch input file
detail='high'
pc_icon_anno = [json.loads(d) for d in open('pc_icon_shuffle.jsonl').readlines()]
pc_text_anno = [json.loads(d) for d in open('pc_text_shuffle.jsonl').readlines()]
web_icon_anno = [json.loads(d) for d in open('web_icon_shuffle.jsonl').readlines()]
web_text_anno = [json.loads(d) for d in open('web_text_shuffle.jsonl').readlines()]
mobile_icon_anno = [json.loads(d) for d in open('mobile_icon_shuffle.jsonl').readlines()]
mobile_text_anno = [json.loads(d) for d in open('mobile_text_shuffle.jsonl').readlines()]

with open(f'batch_input_mobile_text_{detail}.jsonl', 'w') as f:
    count = 0
    for anno in mobile_text_anno[:100]:
        id=f"{detail}_mobile_text_{count:03d}_{anno['img_filename']}_{anno['instruction'].replace(' ', '_')}"
        image_path = osp.join('screenspot_imgs', anno['img_filename'])
        inst = anno['instruction']
        batch_input = make_batch(id, image_path, inst, detail=detail)
        f.write(json.dumps(batch_input) + '\n')
        count += 1

with open(f'batch_input_mobile_icon_{detail}.jsonl', 'w') as f:
    count = 0
    for anno in mobile_icon_anno[:100]:
        id=f"{detail}_mobile_icon_{count:03d}_{anno['img_filename']}_{anno['instruction'].replace(' ', '_')}"
        image_path = osp.join('screenspot_imgs', anno['img_filename'])
        inst = anno['instruction']
        batch_input = make_batch(id, image_path, inst, detail=detail)
        f.write(json.dumps(batch_input) + '\n')
        count += 1

with open(f'batch_input_pc_text_{detail}.jsonl', 'w') as f:
    count = 0
    for anno in pc_text_anno[:100]:
        id=f"{detail}_pc_text_{count:03d}_{anno['img_filename']}_{anno['instruction'].replace(' ', '_')}"
        image_path = osp.join('screenspot_imgs', anno['img_filename'])
        inst = anno['instruction']
        batch_input = make_batch(id, image_path, inst, detail=detail)
        f.write(json.dumps(batch_input) + '\n')
        count += 1
        
with open(f'batch_input_pc_icon_{detail}.jsonl', 'w') as f:
    count = 0
    for anno in pc_icon_anno[:100]:
        id=f"{detail}_pc_icon_{count:03d}_{anno['img_filename']}_{anno['instruction'].replace(' ', '_')}"
        image_path = osp.join('screenspot_imgs', anno['img_filename'])
        inst = anno['instruction']
        batch_input = make_batch(id, image_path, inst, detail=detail)
        f.write(json.dumps(batch_input) + '\n')
        count += 1

with open(f'batch_input_web_text_{detail}.jsonl', 'w') as f:
    count = 0
    for anno in web_text_anno[:100]:
        id=f"{detail}_web_text_{count:03d}_{anno['img_filename']}_{anno['instruction'].replace(' ', '_')}"
        image_path = osp.join('screenspot_imgs', anno['img_filename'])
        inst = anno['instruction']
        batch_input = make_batch(id, image_path, inst, detail=detail)
        f.write(json.dumps(batch_input) + '\n')
        count += 1

with open(f'batch_input_web_icon_{detail}.jsonl', 'w') as f:
    count = 0
    for anno in web_icon_anno[:100]:
        id=f"{detail}_web_icon_{count:03d}_{anno['img_filename']}_{anno['instruction'].replace(' ', '_')}"
        image_path = osp.join('screenspot_imgs', anno['img_filename'])
        inst = anno['instruction']
        batch_input = make_batch(id, image_path, inst, detail=detail)
        f.write(json.dumps(batch_input) + '\n')
        count += 1

100


In [25]:
file_parent = 'gpt4o_result_100_shuffle'

file = [
    'batch_input_mobile_icon_high.jsonl',
    'batch_input_mobile_text_high.jsonl',
    'batch_input_pc_icon_high.jsonl',
    'batch_input_pc_text_high.jsonl',
    'batch_input_web_icon_high.jsonl',
    'batch_input_web_text_high.jsonl',
    'batch_input_mobile_icon_low.jsonl',
    'batch_input_mobile_text_low.jsonl',
    'batch_input_pc_icon_low.jsonl',
    'batch_input_pc_text_low.jsonl',
    'batch_input_web_icon_low.jsonl',
    'batch_input_web_text_low.jsonl',
]

for f in file:
    batch_input_file = client.files.create(
        file=open(osp.join(file_parent, f), "rb"),
        purpose="batch"
    )
    print(batch_input_file)
    batch_input_file_id = batch_input_file.id
    client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"{f}"
        }
    )

12
FileObject(id='file-TQnjn68EYHfjWgdL1KN6Pw', bytes=187623708, created_at=1735654810, filename='batch_input_mobile_icon_high.jsonl', object='file', purpose='batch', status='processed', status_details=None)
FileObject(id='file-R3Vnoiq1KL8Va12FwMjAN1', bytes=127770570, created_at=1735654836, filename='batch_input_mobile_text_high.jsonl', object='file', purpose='batch', status='processed', status_details=None)
FileObject(id='file-WsdB1Z1T4HiufTPVJyzfhi', bytes=90200402, created_at=1735654856, filename='batch_input_pc_icon_high.jsonl', object='file', purpose='batch', status='processed', status_details=None)
FileObject(id='file-5NarK5txWkzTt5iSUPkEnK', bytes=74111700, created_at=1735654871, filename='batch_input_pc_text_high.jsonl', object='file', purpose='batch', status='processed', status_details=None)
FileObject(id='file-7gAnY2HediQr1F4tKxXyWQ', bytes=193435114, created_at=1735654904, filename='batch_input_web_icon_high.jsonl', object='file', purpose='batch', status='processed', status

In [3]:
import re
def parse_point(text):
    try:
        p = re.compile('\(([^)]+)')
        m = p.findall(text)
        x, y =  m[-1].split(',')
        return (int(x), int(y))
    except:
        return None

In [26]:
import os.path as osp
import json

for cls in ['mobile', 'pc', 'web']:
    for type in ['text', 'icon']:
        for res in ['high', 'low']:
            total = 0
            correct = 0
            error = 0
            answer = [json.loads(d) for d in open(osp.join('gpt4o_result_100_shuffle', f'{cls}_{type}_{res}.jsonl')).readlines()]
            anno = [json.loads(d) for d in open(f'{cls}_{type}_shuffle.jsonl').readlines()]
            for i in range(len(answer)):
                gt = anno[i]['bbox_norm']
                ans = parse_point(answer[i]['response']['body']['choices'][0]['message']['content'])
                if ans is None:
                    error += 1
                    total += 1
                    continue
                if gt[0] <= ans[0] <= gt[2] and gt[1] <= ans[1] <= gt[3]:
                    correct += 1
                total += 1
            print(f'{cls}, {type}, {res}: {correct/total*100} (error: {error})')

mobile, text, high: 27.0 (error: 2)
mobile, text, low: 14.000000000000002 (error: 1)
mobile, icon, high: 21.0 (error: 2)
mobile, icon, low: 14.000000000000002 (error: 10)
pc, text, high: 13.0 (error: 2)
pc, text, low: 15.0 (error: 1)
pc, icon, high: 12.0 (error: 1)
pc, icon, low: 8.0 (error: 9)
web, text, high: 7.000000000000001 (error: 2)
web, text, low: 4.0 (error: 5)
web, icon, high: 3.0 (error: 1)
web, icon, low: 3.0 (error: 9)
