In [100]:
import os
from pathlib import Path
from pprint import pprint

# NOTE(review): absolute, machine-specific location of the screenshot tree.
# Expected layout: <root>/<app>/<task>/[<subtask>/]<step>.png
root_folder = Path("/Users/junhona/mobile_agent/shots_image")


def list_sorted_pngs(folder):
    """Return the PNG files directly inside ``folder`` as ordered path strings.

    Files whose stem is an integer (``0.png``, ``1.png``, ...) come first, in
    numeric order, so ``10.png`` sorts after ``9.png``. PNGs with any other
    name (e.g. ``cover.png``) follow in alphabetical order instead of raising
    ``ValueError`` the way a bare ``int(p.stem)`` sort key does. Hidden files
    (names starting with ".") are ignored, mirroring how hidden folders are
    skipped by the scan below.

    Args:
        folder: ``pathlib.Path`` of the directory to scan (non-recursive).

    Returns:
        list[str]: one path string per matching PNG file.
    """
    def sort_key(path):
        try:
            return (0, int(path.stem), "")
        except ValueError:
            # Non-numeric names go after every numbered screenshot.
            return (1, 0, path.stem)

    pngs = [
        p for p in folder.iterdir()
        if p.is_file()
        and not p.name.startswith(".")
        and p.suffix.lower() == ".png"
    ]
    return [str(p) for p in sorted(pngs, key=sort_key)]


# all_images[app][task][subtask] -> ordered list of screenshot paths
all_images = {}

for app in ["google_maps", "aliexpress"]:
    app_folder = root_folder / app
    if not app_folder.is_dir():
        continue
    all_images[app] = {}

    # Walk every task folder of this app.
    for task_folder in app_folder.iterdir():
        if not task_folder.is_dir() or task_folder.name.startswith("."):
            continue
        task_name = task_folder.name

        # First check whether this task folder contains sub-folders.
        subdirs = [
            p for p in task_folder.iterdir()
            if p.is_dir() and not p.name.startswith(".")
        ]

        if subdirs:
            # One entry per sub-folder, keyed by the sub-folder name.
            all_images[app][task_name] = {
                subtask_folder.name: list_sorted_pngs(subtask_folder)
                for subtask_folder in subdirs
            }
        else:
            # No sub-folders: treat the PNGs in the task folder as subtask "0".
            all_images[app][task_name] = {"0": list_sorted_pngs(task_folder)}

# Print the result.
pprint(all_images)

{'aliexpress': {'0': {'0': ['/Users/junhona/mobile_agent/shots_image/aliexpress/0/0.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/0/1.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/0/2.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/0/3.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/0/4.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/0/5.png']},
                '1': {'0': ['/Users/junhona/mobile_agent/shots_image/aliexpress/1/0.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/1/1.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/1/2.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/1/3.png',
                            '/Users/junhona/mobile_agent/shots_image/aliexpress/1/4.png',
        

In [101]:
import base64

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: path of the file to read (opened in binary mode).

    Returns:
        str: the base64 encoding of the file content, decoded as UTF-8.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [102]:
from openai import OpenAI

# `api_key` was never defined in this notebook, so this cell raised NameError on
# a fresh kernel. Read it from the environment instead of hard-coding a secret;
# export OPENAI_API_KEY before starting Jupyter.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [103]:
# Natural-language instruction for every benchmark task: task_dict[app][task_id].
# The task ids are strings and match the keys used in `all_images`.
task_dict = {
    "google_maps": {
        "0": "Please display the route to Seoul station.",
        "1": "Show me a wheelchair-accessible route to Seoul station.",
        "2": "Show me the current rating of cafe SimJae.",
        "3": "Please add cafe SimJae to Want to go list.",
        "4": "Show me the cafes nearby Yangjaecheon.",
        "5": "Show me the opening hours of cafe SimJae.",
        "6": "Show me the popular visiting times for Deoksugung Palace.",
        "7": "Switch to satellite view of the map.",
        "8": "Show me the photos of Seoul Forest Park.",
        "9": "Show me the phone number of cafe SimJae.",
        "10": "Show me the reviews of cafe SimJae.",
        "11": "Show me the satellite view of Bukchon Hanok Village.",
        "12": "Remove cafe Simjae from my Want to go list.",
        "13": "Check the notifications in updates.",
    },
    "aliexpress": {
        "0": "Add samdasoo to cart.",
        "1": "Please buy fanta.",
        "2": "Add the most popular mouse pad to cart.",
        "3": "Add the cheapest usb-c cable to cart.",
        "4": "Show me the delivered items in my orders.",
        "5": "Show me the wishlist in my account.",
        "6": "Remove second item in my cart.",
        "7": "Show me the bluetooth speaker on sale.",
        "8": "Empty the cart.",
        "9": "View the items in New Arrivals of K-VENUE page.",
    },
}

In [106]:
# ── (2) Prompt templates used to build the message array ───────────────────────
# `system_prompt` is sent unchanged as the system message of every request.
# It fixes the output contract (a JSON object with "macro_actions" and
# "indexes"), lists the six macro-action IDs, and gives two worked examples.
# It is never passed through str.format, so the literal braces inside are safe.
system_prompt = """You are an expert mobile-UI analyst. 
Given:  
1. a high-level task in plain English  
2. a *chronologically ordered* list of screenshots showing how an expert finished that task,  
infer the minimal, ordered list of **macro actions** required.  
Macro actions must be chosen from the ID catalogue below.  
Respond **only** in valid JSON with two keys:  
{"macro_actions": [string], "indexes": [int]}.  
Never add extra keys, text, or explanations.

<Macro-Action Catalogue>
1 : Search for <TARGET>  
2 : Select <TARGET>            (tap a result / UI element)  
3 : Filter by <CRITERION>      (e.g. price, popularity)  
4 : Show <TARGET>                (show navigation options)  
5 : Add <TARGET> to cart
6 : Remove <TARGET> from cart
---

Below are some examples of the macro actions and their indexes.
---
Example1:
Input:
Task: Please display the route to Yangjae station.
Output:
{"macro_actions":["Search for Yangjae station","Show routes"],
"indexes":[1,4]}

Example2:
Input:
Task: Add Shin Ramyeon to cart.
Output:
{"macro_actions":["Search for Shin Ramyeon","Select Shin Ramyeon","Add Shin Ramyeon to cart"],
"indexes":[1,2,5]}
---

"""
# Per-task user message template. `{task}` is the only placeholder and is filled
# with str.format(task=...) when the request is built, so any other literal
# brace added here would have to be doubled.
user_prompt   = """
Now infer the macro actions and their indexes for the following task and screenshots.

Input:
Task: {task}
"""


In [114]:
# Sanity check: show the (task_id, instruction) pairs defined for Google Maps.
task_dict["google_maps"].items()

dict_items([('0', 'Please display the route to Seoul station.'), ('1', 'Show me a wheelchair-accessible route to Seoul station.'), ('2', 'Show me the current rating of cafe SimJae.'), ('3', 'Please add cafe SimJae to Want to go list.'), ('4', 'Show me the cafes nearby Yangjaecheon.'), ('5', 'Show me the opening hours of cafe SimJae.'), ('6', 'Show me the popular visiting times for Deoksugung Palace.'), ('7', 'Switch to satellite view of the map.'), ('8', 'Show me the photos of Seoul Forest Park.'), ('9', 'Show me the phone number of cafe SimJae.'), ('10', 'Show me the reviews of cafe SimJae.'), ('11', 'Show me the satellite view of Bukchon Hanok Village.'), ('12', 'Remove cafe Simjae from my Want to go list.'), ('13', 'Check the notifications in updates.')])

In [118]:
# Spot-check one collected path: app "google_maps", task "0", subtask "0", third screenshot.
all_images["google_maps"]["0"]["0"][2]

'/Users/junhona/mobile_agent/shots_image/google_maps/0/2.png'

In [119]:
apps = ["google_maps", "aliexpress"]
# all_response[app][task_id] -> raw response object returned by the API
all_response = {app: {} for app in apps}


def build_messages(task_name, image_paths):
    """Build the request input for one task.

    Args:
        task_name: the plain-English instruction inserted into ``user_prompt``.
        image_paths: screenshot paths in chronological order; each file is
            embedded as a base64 ``data:image/png`` URL via ``encode_image``.

    Returns:
        list[dict]: a system message holding ``system_prompt`` followed by one
        user message whose content is the formatted text part and then one
        ``input_image`` part per screenshot.
    """
    image_contents = [
        {
            "type": "input_image",
            "image_url": f"data:image/png;base64,{encode_image(img)}",
            "detail": "high",
        }
        for img in image_paths
    ]
    text_part = {"type": "input_text", "text": user_prompt.format(task=task_name)}
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [text_part] + image_contents},
    ]


for app in apps:
    for task_id, task_name in task_dict[app].items():
        # The scan cell only creates subtask "0" when a task folder has no
        # sub-folders, and a task folder may be missing altogether. Look the
        # screenshots up defensively so one gap does not abort the whole run
        # with a KeyError after earlier API calls were already made.
        imgs = all_images.get(app, {}).get(task_id, {}).get("0", [])
        if not imgs:
            print(f"Skipping {app}/{task_id}: no screenshots found under subtask '0'")
            print("-" * 50)
            continue

        message = build_messages(task_name, imgs)

        response = client.responses.create(
            model="gpt-4.1",
            input=message
        )

        all_response[app][task_id] = response
        print(f"Task: {task_name}")
        print(f"Response: {response.output_text}")
        print("-" * 50)

Task: Please display the route to Seoul station.
Response: {"macro_actions":["Search for Seoul station","Select Seoul station","Show routes"],"indexes":[1,2,4]}
--------------------------------------------------
Task: Show me a wheelchair-accessible route to Seoul station.
Response: {"macro_actions":["Search for Seoul station","Select Seoul station","Show routes","Filter by Wheelchair accessible"],"indexes":[1,2,4,3]}
--------------------------------------------------
Task: Show me the current rating of cafe SimJae.
Response: {"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}
--------------------------------------------------
Task: Please add cafe SimJae to Want to go list.
Response: {"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}
--------------------------------------------------
Task: Show me the cafes nearby Yangjaecheon.
Response: {"macro_actions":["Search for Yangjaecheon","Select Yangjaecheon","Search for cafe"],"in

In [127]:
# Reduce every stored response object to its output text, keeping the
# app -> task_id nesting, and echo each entry while doing so.
gpt_response = {}
for app_name, responses_by_task in all_response.items():
    print(f"App: {app_name}")
    task_response = {}
    for task_id, response in responses_by_task.items():
        output_text = response.output_text
        print(f"Task ID: {task_id}")
        print(f"Response: {output_text}")
        print("-" * 50)
        task_response[task_id] = output_text
    gpt_response[app_name] = task_response

gpt_response

App: google_maps
Task ID: 0
Response: {"macro_actions":["Search for Seoul station","Select Seoul station","Show routes"],"indexes":[1,2,4]}
--------------------------------------------------
Task ID: 1
Response: {"macro_actions":["Search for Seoul station","Select Seoul station","Show routes","Filter by Wheelchair accessible"],"indexes":[1,2,4,3]}
--------------------------------------------------
Task ID: 2
Response: {"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}
--------------------------------------------------
Task ID: 3
Response: {"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}
--------------------------------------------------
Task ID: 4
Response: {"macro_actions":["Search for Yangjaecheon","Select Yangjaecheon","Search for cafe"],"indexes":[1,2,1]}
--------------------------------------------------
Task ID: 5
Response: {"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}
------------

{'google_maps': {'0': '{"macro_actions":["Search for Seoul station","Select Seoul station","Show routes"],"indexes":[1,2,4]}',
  '1': '{"macro_actions":["Search for Seoul station","Select Seoul station","Show routes","Filter by Wheelchair accessible"],"indexes":[1,2,4,3]}',
  '2': '{"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}',
  '3': '{"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}',
  '4': '{"macro_actions":["Search for Yangjaecheon","Select Yangjaecheon","Search for cafe"],"indexes":[1,2,1]}',
  '5': '{"macro_actions":["Search for cafe SimJae","Select cafe SimJae"],"indexes":[1,2]}',
  '6': '{"macro_actions":["Search for Deoksugung Palace","Select Deoksugung Palace","Show popular times"],"indexes":[1,2,4]}',
  '7': '{"macro_actions":["Show map type options","Select Satellite"],"indexes":[4,2]}',
  '8': '{"macro_actions":["Search for Seoul Forest Park","Select Seoul Forest Park","Show photos"],"indexes":[1,2,4]}',

In [128]:
import json 
# Persist the collected response texts as pretty-printed JSON in the working directory.
serialized = json.dumps(gpt_response, indent=4)
with open("gpt_response.json", "w") as out_file:
    out_file.write(serialized)

In [63]:
import json
# Parse the JSON text of whatever `response` currently holds and show its macro-action list.
# NOTE(review): `response` is a leftover loop variable from an earlier cell, and this cell's
# execution count (63) is lower than those of the cells above it — confirm it still works
# after Restart Kernel -> Run All.
json.loads(response.output_text)["macro_actions"]

['Search for samdasoo',
 'Select Jeju Samdasoo Mineral Water',
 'Add Jeju Samdasoo Mineral Water to cart']