In [1]:
import pandas as pd
import re
import json

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def parse_thought_action(dict_str):
    """Extract the ``'thought'`` and ``'action'`` values from a dict-like string.

    The input is searched with regular expressions rather than parsed, so it
    only needs to contain ``'thought': ... , 'action': ... }`` in that order.
    Backslashes and both kinds of quote characters are removed from the
    captured values.

    Args:
        dict_str: Text containing the serialized thought/action pair.

    Returns:
        dict: ``{"thought": ..., "action": ...}``. A field that cannot be
        located is reported as ``None`` instead of raising.
    """
    def _clean(match):
        # An unmatched field stays None; calling .replace on it would raise.
        if match is None:
            return None
        return match.group(1).replace("\\", "").replace("\"", "").replace("'", "")

    thought_match = re.search(r"'thought':\s*(.+?)\s*,\s*'action'", dict_str)
    action_match = re.search(r"'action':\s*(.+?)\s*}", dict_str)
    return {"thought": _clean(thought_match), "action": _clean(action_match)}

In [3]:
def to_dict(input_string):
    """Render a serialized action record as a compact ``name[args]`` string.

    The fields ``action_type``, ``element_id``, ``url`` and ``fill_text`` are
    pulled out of ``input_string`` with a regular expression. Values may be an
    enum repr (``<Enum.NAME: n>``), a bare integer, or a single- or
    double-quoted string; the surrounding quotes are removed for both styles.

    Args:
        input_string: Text containing the ``'field': value`` pairs.

    Returns:
        str: One of ``google_search[text]``, ``fill_search[id,text]``,
        ``goto[url]``, ``click[id]``, ``"None"``, or ``""`` when the action
        type is missing or not recognised.

    Raises:
        KeyError: If the recognised action type needs a field (for example
            ``element_id``) that is absent from ``input_string``.
    """
    # Regular-expression pattern for the four supported fields.
    pattern = r"('action_type'|'element_id'|'url'|'fill_text'):\s*(<[^>]+>|\d+|'[^']+'|\"[^\"]+\")"
    extracted_fields = {}
    for field_name, field_value in re.findall(pattern, input_string):
        key = field_name.strip("'")
        if field_value.startswith('<') and field_value.endswith('>'):
            # Enum repr such as "<ActionTypes.CLICK: 1>" -> "CLICK: 1".
            extracted_fields[key] = field_value.split('.')[-1].strip('<> ')
        elif field_value[0] == field_value[-1] and field_value[0] in ("'", '"'):
            # Drop the one pair of enclosing quotes, whichever style was used.
            extracted_fields[key] = field_value[1:-1]
        else:
            extracted_fields[key] = field_value

    action_type = extracted_fields.get("action_type", "").lower()
    if not action_type:
        # Nothing recognisable was found: same result as an unknown type.
        return ""

    action = ""
    if "google_search" in action_type:
        action = "google_search" + "[" + extracted_fields["fill_text"] + "]"
    elif "fill_search" in action_type or "fill_form" in action_type:
        # NOTE(review): fill_form is rendered as "fill_search", exactly as
        # before - confirm this is intentional and not a copy-paste slip.
        action = "fill_search" + \
            "[" + str(extracted_fields["element_id"]) + "," + \
            extracted_fields["fill_text"] + "]"
    elif "goto" in action_type:
        action = "goto" + "[" + extracted_fields["url"] + "]"
    elif "click" in action_type:
        action = "click" + "[" + str(extracted_fields["element_id"]) + "]"
    elif "none" in action_type:
        action = "None"
    return action

In [4]:
def score_rate(score):
    """Convert a ``"achieved / total"`` score string into a fraction.

    Args:
        score: Text with exactly one ``/`` separating two numbers.

    Returns:
        float: The first number divided by the second.
    """
    numerator, denominator = map(float, score.split("/"))
    return numerator / denominator

In [5]:
def parse_step_reward(dict_str):
    """Extract the ``'score'`` and ``'description'`` values from a dict-like string.

    The input is searched with regular expressions, so it only needs to
    contain ``'score': ... , 'description': ... }`` in that order.
    Backslashes and both kinds of quote characters are removed from the
    captured values.

    Args:
        dict_str: Text containing the serialized step reward.

    Returns:
        dict: ``{"score": ..., "description": ...}`` with string values. A
        field that cannot be located is reported as ``None`` instead of
        raising.
    """
    def _clean(match):
        # An unmatched field stays None; calling .replace on it would raise.
        if match is None:
            return None
        return match.group(1).replace("\\", "").replace("\"", "").replace("'", "")

    score_match = re.search(r"'score':\s*(.+?)\s*,\s*'description'", dict_str)
    description_match = re.search(r"'description':\s*(.+?)\s*}", dict_str)
    return {
        "score": _clean(score_match),
        "description": _clean(description_match),
    }


def process_step_reward(dict_str):
    """Normalise a raw step-reward cell into a dict.

    Args:
        dict_str: The raw cell text. ``"x"`` and ``"finished"`` (compared
            case-insensitively) are sentinel values; anything else is handed
            to ``parse_step_reward``.

    Returns:
        dict: ``{}`` for ``"x"``; ``{"score": 10, "description": "finished"}``
        for ``"finished"``; otherwise the result of ``parse_step_reward``.
    """
    marker = dict_str.lower()
    if marker == "x":
        return {}
    if marker == "finished":
        # The key is "score" (no trailing colon) so it matches the key
        # produced by parse_step_reward for every other row.
        return {"score": 10, "description": "finished"}
    return parse_step_reward(dict_str)

In [6]:
def write_to_json(file_path):
    """Load one task's step CSV and return its steps as a list of dicts.

    Despite the name, nothing is written here: the function only reads the
    CSV and builds JSON-ready records; the caller is responsible for dumping
    them.

    The CSV's first column is dropped, and the columns ``step_index``,
    ``trace``, ``action``, ``score``, ``step_reward``, ``selector`` and
    ``step url`` are required.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[dict]: One record per CSV row with the keys ``step_index``
        (shifted by +1), ``trace_description``, ``selector``, ``action``,
        ``task_score``, ``task_score_rate``,
        ``current_reward_score_description`` and ``url``.
    """
    df = pd.read_csv(file_path, index_col=False)
    df = df.drop(df.columns[0], axis=1)
    df["step_index"] += 1
    df["trace_to_dict"] = df["trace"].apply(parse_thought_action)
    df["action_to_str"] = df["action"].apply(to_dict)
    df["score_rate"] = df["score"].apply(score_rate)
    df["step_reward"] = df["step_reward"].apply(process_step_reward)
    # Missing selectors become the "None" marker, mapped to "" below.
    df["selector"] = df["selector"].fillna("None")

    wanted_columns = [
        "step_index",
        "trace_to_dict",
        "selector",
        "action_to_str",
        "score",
        "score_rate",
        "step_reward",
        "step url",
    ]

    # Iterate the rows explicitly instead of calling DataFrame.apply for its
    # side effects: apply is meant to compute a result, and pandas may invoke
    # the callback on a placeholder row when the frame has no rows.
    step_list = []
    for _, row in df[wanted_columns].iterrows():
        step_list.append({
            "step_index": row["step_index"],
            "trace_description": row["trace_to_dict"] or {},
            "selector": row["selector"] if row["selector"] != "None" else "",
            "action": row["action_to_str"] or "",
            "task_score": row["score"],
            "task_score_rate": row["score_rate"],
            "current_reward_score_description": row["step_reward"],
            # A "finished" marker in the URL column is exported as no URL.
            "url": row["step url"] if row["step url"] != "finished" else "",
        })
    return step_list

In [7]:
# import os
# import pandas as pd

# folder_path = './csv_results/Dom-based'
# task_list = []
# for _, filename in enumerate(os.listdir(folder_path)):
#     out_json = {}
#     task_name = filename.split("_")[1]
#     out_json["task_id"] = int(filename.split("_")[0])
#     out_json["task_name"] = task_name
#     out_json["task_status"] = filename.split("_")[-2]
#     file_path = os.path.join(folder_path, filename)
#     if os.path.isfile(file_path):
#         task_step_list = write_to_json(file_path)
#         out_json["step_list"] = task_step_list
#         task_list.append(out_json)
# print(task_list)
# task_list = sorted(task_list, key=lambda x: x['task_id'])
# if not os.path.exists("./results/Dom-based/"):
#     os.makedirs("./results/Dom-based")
# out_json_file_path = './results/Dom-based/out.json'
# with open(out_json_file_path, 'w') as json_file:
#     json.dump(task_list, json_file)

In [8]:
import json5


def read_file(file_path="./data/group1_20240206.json"):
    '''Read the label data and return the task names it lists.

    The file is opened with GBK encoding and parsed with ``json5``; the
    ``"task"`` value of every entry is collected in file order.
    '''
    with open(file_path, encoding='gbk') as label_file:
        labelled_tasks = json5.load(label_file)
    return [entry["task"] for entry in labelled_tasks]


# Task names from the default label file; later indexed by task id.
task_name_list = read_file()

In [12]:
import os
import pandas as pd

# Folder holding one CSV trace per task, and the folder that receives the
# merged JSON. File names are expected to look like "<task_id>_..._<status>_<x>".
folder_path = './csv_results/group1-20240206/DOM-Based-20240206'
results_dir = './results/group1-20240206/DOM-Based-20240206'

task_list = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Check the entry type first so sub-directories are skipped before their
    # names are parsed as "<task_id>_...".
    if not os.path.isfile(file_path):
        continue
    name_parts = filename.split("_")
    task_id = int(name_parts[0])
    task_list.append({
        "task_id": task_id,
        "task_name": task_name_list[task_id],
        "task_status": name_parts[-2],
        "step_list": write_to_json(file_path),
    })
print(task_list)
task_list = sorted(task_list, key=lambda x: x['task_id'])

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(results_dir, exist_ok=True)
out_json_file_path = results_dir + '/out.json'
with open(out_json_file_path, 'w') as json_file:
    json.dump(task_list, json_file)

[{'task_id': 0, 'task_name': 'Find vitamin D that are buy 1 get 1 free and new arrival in cvs', 'task_status': 'limit', 'step_list': [{'step_index': 1, 'trace_description': {'thought': 'To find vitamin D products that are on a buy 1 get 1 free offer and are new arrivals at CVS, Ill first need to navigate to the CVS website. Ill use the `goto` tool to visit the CVS website by opening a new tab.', 'action': 'Navigating to the CVS website to look for vitamin D products with a buy 1 get 1 free offer and that are new arrivals.'}, 'selector': '', 'action': 'goto[https://www.cvs.com]', 'task_score': '0 / 4', 'task_score_rate': 0.0, 'current_reward_score_description': {}, 'url': 'https://www.cvs.com/'}, {'step_index': 2, 'trace_description': {'thought': 'Now that were on the CVS website, the next step is to find the vitamin D products. The best approach is to navigate to the vitamins section of the site, as it will likely have the specific category for vitamin D. Using the `click` tool will he

In [21]:
# Build a one-row-per-task summary from the merged JSON written earlier.
with open('./results/group1-20240206/DOM-Based-20240206/out.json') as f:
    data = json.load(f)

result_list = []
for task in data:
    steps = task["step_list"]
    last_step = steps[-1]
    # The URL is taken from the step before the last one.
    # NOTE(review): presumably because the final step's URL can be blank - confirm.
    # Fall back to the last step when a task has only one recorded step, which
    # would otherwise raise IndexError.
    url_step = steps[-2] if len(steps) > 1 else last_step
    result_list.append({
        "task_id": task["task_id"],
        "task_name": task["task_name"],
        "status": task["task_status"],
        "steps": last_step["step_index"],
        "task_score": last_step["task_score"],
        "task_score_rate": last_step["task_score_rate"],
        "selector": last_step["selector"],
        "action": last_step["action"],
        "url": url_step["url"],
    })
# TODO: optionally sort result_list by task_score_rate (descending).
result_list


[{'task_id': 0,
  'task_name': 'Find vitamin D that are buy 1 get 1 free and new arrival in cvs',
  'status': 'limit',
  'steps': 10,
  'task_score': '1 / 4',
  'task_score_rate': 0.25,
  'selector': '',
  'action': 'None',
  'url': 'https://www.cvs.com/shop/merch/new/q/Vitamins/c1?widgetID=ojdve0je&mc=cat2&icid=shop-vitamins-category-lhn-new'},
 {'task_id': 1,
  'task_name': 'Confirm my vip tour at the six flags Discovery Kingdom in sixflags',
  'status': 'error',
  'steps': 9,
  'task_score': '0 / 4',
  'task_score_rate': 0.0,
  'selector': '#rso > div:nth-child(5) > div > div > div.kb0PBd.cvP2Ce.jGGQ5e > div > div > span > a',
  'action': 'click[97]',
  'url': 'https://www.sixflags.com/online-store/vip-tour-experiences'},
 {'task_id': 2,
  'task_name': 'Show me movies produced by Aaron Horvath in imdb',
  'status': 'limit',
  'steps': 10,
  'task_score': '2 / 3',
  'task_score_rate': 0.6666666666666666,
  'selector': '#__next > button',
  'action': 'click[304]',
  'url': 'https://ww

In [23]:
import pandas as pd
# One row per task, built from the summary dicts in result_list.
df = pd.DataFrame(result_list)

In [33]:
# Display the per-task summary table.
df

Unnamed: 0,task_id,task_name,status,steps,task_score,task_score_rate,selector,action,url
0,0,Find vitamin D that are buy 1 get 1 free and n...,limit,10,1 / 4,0.250000,,,https://www.cvs.com/shop/merch/new/q/Vitamins/...
1,1,Confirm my vip tour at the six flags Discovery...,error,9,0 / 4,0.000000,#rso > div:nth-child(5) > div > div > div.kb0P...,click[97],https://www.sixflags.com/online-store/vip-tour...
2,2,Show me movies produced by Aaron Horvath in imdb,limit,10,2 / 3,0.666667,#__next > button,click[304],https://www.imdb.com/name/nm1739338/
3,3,"Find Airport information of Camarillo Airport,...",limit,10,2 / 3,0.666667,#onetrust-close-btn-container > button,click[294],https://www.flightaware.com/resources/airport/...
4,4,Find a pasta restaurant in Sydney and save it ...,limit,10,2 / 4,0.500000,#page-wrapper > resy-nav > header > div.ResyNa...,"fill_search[35,Italian Sydney]",https://resy.com/cities/plsv?date=2024-02-06&s...
...,...,...,...,...,...,...,...,...,...
104,104,View the full menu for AMC Dine-In in amctheatres,limit,10,0 / 4,0.000000,#rso > div:nth-child(3) > div > div > div > di...,click[88],https://amc-theatres-res.cloudinary.com/image/...
105,105,Play the Wakanda Forever trailer in redbox,limit,10,2 / 3,0.666667,#root > div.dialog.rb-dialog.rb-claim_offer-di...,click[196],https://www.redbox.com/movies/black-panther-wa...
106,106,"Search for McDonalds located in Greenvill, SC ...",finished,6,3 / 3,1.000000,#header_find_form > div.css-1qn0b6x > button,click[7],https://www.yelp.com/search?find_desc=McDonald...
107,107,Find the flight from New York to Miami from Ma...,limit,10,1 / 7,0.142857,#subnav-wrapper_ihsdb9o5q-0 > div > div > div....,click[8],https://www.jetblue.com/best-fare-finder


In [32]:
# Tasks whose score rate is exactly 1.0, followed by how many there are.
is_full_score = df["task_score_rate"].eq(1.0)
df_score_1 = df.loc[is_full_score]
print(df_score_1.value_counts())
len(df_score_1)

task_id  task_name                                                                                           status    steps  task_score  task_score_rate  selector                                                                                                                                               action                         url                                                                                                                                             
19       Find Kevin Durant's bio in espn                                                                     finished  3      3 / 3       1.0              #nav-link-nav-menu-item-8304                                                                                                                           click[79]                      https://www.espn.com/nba/player/bio/_/id/3202/kevin-durant                                                                                          1
28       search for news about 

16

In [35]:
# score_dif = number after the "/" minus number before it in task_score.
score_parts = df["task_score"].apply(lambda text: text.split("/"))
df["score_dif"] = score_parts.apply(lambda parts: float(parts[1]) - float(parts[0]))
# Tasks whose score_dif is exactly 1, followed by how many there are.
df_score_2 = df.loc[df["score_dif"].eq(1.0)]
print(df_score_2.value_counts())
len(df_score_2)

task_id  task_name                                                                                                                                  status  steps  task_score  task_score_rate  selector                                                                                                                                                                                                                                                                                                                                                                                    action       url                                                                                                                                                                                      score_dif
2        Show me movies produced by Aaron Horvath in imdb                                                                                           limit   10     2 / 3       0.666667         #__next > button            

20

In [38]:
# Tasks whose score_dif is exactly 2, followed by how many there are.
is_two_short = df["score_dif"].eq(2.0)
df_score_3 = df.loc[is_two_short]
print(df_score_3.value_counts())
len(df_score_3)

task_id  task_name                                                                                                               status  steps  task_score  task_score_rate  selector                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     action                                          url                                                                                                                                                                                                                                                                                                       scor

23

In [39]:
# Tasks whose task_score starts with a zero before the "/", followed by
# how many there are.
points_earned = df["task_score"].apply(lambda text: float(text.split("/")[0]))
df_score_4 = df.loc[points_earned.eq(0.0)]
print(df_score_4.value_counts())
len(df_score_4)

task_id  task_name                                                                                                                                                               status  steps  task_score  task_score_rate  selector                                                                                                                                                                                                                                              action                          url                                                                                                                                                                                                                                                                                                                     score_dif
1        Confirm my vip tour at the six flags Discovery Kingdom in sixflags                                                                                                      er

14

In [2]:
import os


def batch_rename(work_dir):
    """Blank out over-long second name fields of the files in ``work_dir``.

    Each entry name is split on ``_``. When the second field is longer than
    10 characters it is replaced by an empty string, so ``a_<long>_b`` becomes
    ``a__b``. Only that field is touched, even if the same text occurs
    elsewhere in the name. Names without an underscore are left unchanged.

    Args:
        work_dir: Directory whose entries are renamed in place.
    """
    for filename in os.listdir(work_dir):
        parts = filename.split("_")
        # No second field, or one that is short enough: nothing to do.
        if len(parts) < 2 or len(parts[1]) <= 10:
            continue
        # Rebuild the name rather than using str.replace, which would also
        # remove any later occurrence of the same text.
        parts[1] = ""
        new_name = "_".join(parts)
        # Rename the file.
        os.rename(
            os.path.join(work_dir, filename),
            os.path.join(work_dir, new_name)
        )


# Rename the files under this directory in place.
work_directory = './csv_results/group1/DOM-Based'  # replace with your working directory
batch_rename(work_directory)