In [1]:
import pandas as pd
import json5
import json
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def parse_thought_action(dict_str):
    """Extract the ``thought`` and ``action`` values from a stringified dict.

    The input is searched with regular expressions for a ``'thought': ...``
    entry that is followed by an ``'action': ...`` entry terminated by ``}``.
    Backslashes and both kinds of quote characters are removed from the
    captured values.

    Args:
        dict_str: Text expected to look like
            ``{'thought': ..., 'action': ...}``.

    Returns:
        A dict with the keys ``"thought"`` and ``"action"``. A field that
        cannot be located in ``dict_str`` is reported as ``None``.
    """
    def _clean(match):
        # An unmatched field yields None instead of failing on None.replace().
        if match is None:
            return None
        return match.group(1).replace("\\", "").replace("\"", "").replace("\'", "")

    thought_match = re.search(r"'thought':\s*(.+?)\s*,\s*'action'", dict_str)
    action_match = re.search(r"'action':\s*(.+?)\s*}", dict_str)
    return {"thought": _clean(thought_match), "action": _clean(action_match)}

def to_dict(input_string):
    """Render a stringified action dict as a compact ``name[args]`` string.

    The ``'action_type'``, ``'element_id'``, ``'url'`` and ``'fill_text'``
    entries are pulled out of ``input_string`` with a regular expression.
    Values may be an angle-bracketed token (e.g. an enum repr such as
    ``<ActionTypes.CLICK: 1>``), an integer, or a non-empty quoted string.
    Empty quoted strings (``''``) are not captured by the pattern, so those
    fields are treated as missing and rendered as an empty argument.

    Args:
        input_string: Text representation of the executed action.

    Returns:
        A string such as ``"click[12]"`` or ``"goto[https://...]"``;
        ``"None"`` for a "none" action; ``""`` when no known action type is
        found (including when ``action_type`` is absent).
    """
    pattern = r"('action_type'|'element_id'|'url'|'fill_text'):\s*(<[^>]+>|\d+|'[^']+'|\"[^\"]+\")"
    extracted_fields = {}
    for field_name, field_value in re.findall(pattern, input_string):
        key = field_name.strip("'")
        if field_value.startswith('<') and field_value.endswith('>'):
            # Keep only the part after the last dot, without the brackets.
            extracted_fields[key] = field_value.split('.')[-1].strip('<> ')
        elif (len(field_value) >= 2 and field_value[0] == field_value[-1]
                and field_value[0] in "'\""):
            # Drop the surrounding quote pair, whichever quote style was used.
            extracted_fields[key] = field_value[1:-1]
        else:
            extracted_fields[key] = field_value

    # Missing fields fall back to empty values rather than raising KeyError.
    action_type = extracted_fields.get("action_type", "").lower()
    element_id = str(extracted_fields.get("element_id", ""))
    fill_text = extracted_fields.get("fill_text", "")
    url = extracted_fields.get("url")

    action = ""
    if "google_search" in action_type:
        action = "google_search" + "[" + fill_text + "]"
    elif "fill_search" in action_type:
        action = "fill_search" + "[" + element_id + "," + fill_text + "]"
    elif "fill_form" in action_type:
        # NOTE(review): fill_form is rendered with the "fill_search" label;
        # confirm whether this is intentional before changing the output.
        action = "fill_search" + "[" + element_id + "," + fill_text + "]"
    elif "select_option" in action_type:
        action = "select_option" + "[" + element_id + "," + fill_text + "]"
    elif "goto" in action_type and url:
        action = "goto" + "[" + url + "]"
    elif "click" in action_type:
        action = "click" + "[" + element_id + "]"
    elif "go_back" in action_type:
        action = "go_back" + "[" + element_id + "]"
    elif "none" in action_type:
        action = "None"
    return action

def score_rate(score):
    """Convert a ``"numerator/denominator"`` score string into a float ratio."""
    numerator, denominator = (float(part) for part in score.split("/"))
    return numerator / denominator

def parse_step_reward(dict_str):
    """Extract the ``score`` and ``description`` values from a stringified dict.

    The input is searched with regular expressions for a ``'score': ...``
    entry that is followed by a ``'description': ...`` entry terminated by
    ``}``. Backslashes and both kinds of quote characters are removed from
    the captured values.

    Args:
        dict_str: Text expected to look like
            ``{'score': ..., 'description': ...}``.

    Returns:
        A dict with the keys ``"score"`` and ``"description"`` (both strings).
        A field that cannot be located in ``dict_str`` is reported as ``None``.
    """
    def _clean(match):
        # An unmatched field yields None instead of failing on None.replace().
        if match is None:
            return None
        return match.group(1).replace("\\", "").replace("\"", "").replace("\'", "")

    score_match = re.search(r"'score':\s*(.+?)\s*,\s*'description'", dict_str)
    description_match = re.search(r"'description':\s*(.+?)\s*}", dict_str)
    return {"score": _clean(score_match), "description": _clean(description_match)}


def process_step_reward(dict_str):
    """Normalise a raw step-reward string into a dict.

    Args:
        dict_str: The raw reward text. ``"{}"`` and ``"finished"`` are
            recognised case-insensitively; anything else is handed to
            ``parse_step_reward``.

    Returns:
        ``{}`` for an empty reward, ``{"score": 10, "description": "finished"}``
        for a finished marker, otherwise the dict built by
        ``parse_step_reward``.
    """
    normalized = dict_str.lower()
    if normalized == "{}":
        return {}
    if normalized == "finished":
        # Use the plain "score" key so this record has the same shape as the
        # dicts returned by parse_step_reward (no trailing colon in the key).
        return {"score": 10, "description": "finished"}
    return parse_step_reward(dict_str)



In [20]:
import pandas as pd
from pandas import json_normalize

# file_path = "csv_results/group_sample_20240429/20240429-222232_dom_gpt-3.5-turbo_dom_reward_False/1_XByRzDf1LGHZDev_fnQrj.json"
def write_task_result_to_df(file_path):
    """Load one task-result JSON file and tabulate its steps.

    Every value of every step is converted to ``str`` before the steps are
    turned into a DataFrame, so all resulting columns hold strings.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(task_name, status, reference_task_length, evaluate_steps,
        steps_dataframe)`` taken from the file's top-level keys.
    """
    with open(file_path) as handle:
        payload = json.load(handle)

    raw_steps = payload["step_list"]
    name = payload["task_name"]
    status = payload["status"]
    reference_length = payload["reference_task_length"]
    eval_steps = payload["evaluate_steps"]

    stringified_steps = [
        {field: str(value) for field, value in step.items()}
        for step in raw_steps
    ]
    steps_frame = json_normalize(stringified_steps, errors='ignore')
    return name, status, reference_length, eval_steps, steps_frame
# task_name,task_status,reference_task_length,evaluate_steps,data_df = write_task_result_to_df(file_path)

In [24]:
def write_to_json(df):
    """Turn a per-step DataFrame into a list of JSON-ready step summaries.

    Reads the columns ``step_index``, ``current_trace``, ``execute_action``,
    ``score``, ``step_reward``, ``selector``, ``match_func_result``,
    ``element_value``, ``error_message`` and ``step_url``.

    Args:
        df: DataFrame with one row per step. It is not modified.

    Returns:
        A list with one dict per row, in row order.
    """
    # Work on a copy: the derived columns (and the overwritten "step_reward")
    # must not leak into the caller's frame, otherwise a second call on the
    # same frame would try to re-parse already-parsed values.
    df = df.copy()
    df["step_index"] = df["step_index"].apply(int)
    df["trace_to_dict"] = df["current_trace"].apply(parse_thought_action)
    df["action_to_str"] = df["execute_action"].apply(to_dict)
    df["score_rate"] = df["score"].apply(score_rate)
    df["step_reward"] = df["step_reward"].apply(process_step_reward)
    # Missing values are tagged with the "None" sentinel, which summary()
    # below maps to an empty string (step_url keeps the sentinel as-is).
    df["selector"] = df["selector"].fillna("None")
    df["match_result"] = df["match_func_result"]
    df["element_value"] = df["element_value"].fillna("None")
    df["error"] = df["error_message"].fillna("None")
    df["step_url"] = df["step_url"].fillna("None")
    df_copy = df[
        [
            "step_index",
            "trace_to_dict",
            "selector",
            "action_to_str",
            "score",
            "score_rate",
            "step_reward",
            "step_url",
            "match_result",
            "element_value",
            "error"
        ]
    ]

    def summary(x):
        """Build the output record for a single row."""
        return {
            "step_index": x["step_index"],
            "trace_description": x["trace_to_dict"] if x["trace_to_dict"] else {},
            "selector": x["selector"] if x["selector"] != "None" else "",
            "element_value": x["element_value"] if x["element_value"] != "None" else "",
            "action": x["action_to_str"] if x["action_to_str"] else "",
            "task_score": x["score"],
            "task_score_rate": x["score_rate"],
            "current_reward_score_description": x["step_reward"],
            "url": x["step_url"],
            "match_result": x["match_result"],
            "error": x["error"] if x["error"] != "None" else ""
        }

    # Plain iteration instead of DataFrame.apply used only for its side effect.
    return [summary(row) for _, row in df_copy.iterrows()]


In [25]:
import os
import pandas as pd

# Aggregate every per-task result file in `folder_path` into a single JSON
# list written to `out_json_file_path`.
folder_path = './csv_results/group_sample_20240430/20240430-100811_dom_gpt-3.5-turbo_dom_reward_False/'
output_dir = "./results/group_sample_20240430"
task_list = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Check the entry type before parsing: sub-directories and other
    # non-file entries cannot be opened as JSON and are skipped.
    if not os.path.isfile(file_path):
        continue
    task_name, task_status, reference_task_length, evaluate_steps, data_df = write_task_result_to_df(file_path)
    out_json = {
        # The task id is the numeric prefix before the first underscore.
        "task_id": int(filename.split("_")[0]),
        "task_name": task_name,
        "task_status": task_status,
        "step_list": write_to_json(data_df),
        # NOTE(review): "evalation" looks like a misspelling of "evaluation";
        # kept unchanged because it is part of the output file's schema.
        "evalation": evaluate_steps,
    }
    task_list.append(out_json)
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(output_dir, exist_ok=True)
out_json_file_path = os.path.join(output_dir, "out.json")
with open(out_json_file_path, 'w') as json_file:
    json.dump(task_list, json_file)

In [27]:
def evaluate(out_file_path):
    """Print summary metrics for the aggregated task results file.

    For every task the last recorded step supplies the step count, the task
    score (a ``"done / total"`` string) and its rate. From these the function
    derives, per task:

    * ``step_score``: the numerator of the task score;
    * ``efficiency_score``: ``step_score`` divided by the number of steps;
    * ``status``: ``"finished"`` tasks whose score rate is below 1.0 are
      relabelled ``"llm_finished"``;
    * ``human_alignment_score``: 1 for ``"finished"``, the score rate for
      ``"llm_finished"``, and 0.8 times the score rate for ``"step_limit"``.

    It then prints the first rows and shape of the per-task table followed by
    the averaged metrics, the pooled step score rate and the completion rate.

    Args:
        out_file_path: Path of the JSON file holding a list of task records,
            each with ``task_id``, ``task_name``, ``task_status`` and a
            non-empty ``step_list``.

    Returns:
        None. Results are printed.
    """
    def read_csv_result(file_path=out_file_path):
        """Load the JSON file and keep one summary record per task."""
        with open(file_path) as f:
            data = json.load(f)
        last_action_result_list = []
        for items in data:
            steps = items["step_list"]
            last_step = steps[-1]
            # The URL comes from the step before the last one; a task with a
            # single step has no such step, so its only step is used instead.
            url_step = steps[-2] if len(steps) > 1 else last_step
            last_action_result_list.append({
                "task_id": items["task_id"],
                "task_name": items["task_name"],
                "status": items["task_status"],
                "steps": last_step["step_index"],
                "task_score": last_step["task_score"],
                "task_score_rate": last_step["task_score_rate"],
                "selector": last_step["selector"],
                "action": last_step["action"],
                "url": url_step["url"],
            })
        return last_action_result_list

    all_data = read_csv_result()
    if not all_data:
        # Nothing to aggregate; the ratios below would divide by zero.
        print("no task results found in", out_file_path)
        return

    df = pd.DataFrame(all_data)
    df["step_score"] = df["task_score"].apply(lambda x: float(x.split("/")[0]))
    df["efficiency_score"] = df["step_score"] / df["steps"]

    def get_llm_finished(row):
        """Relabel tasks reported as finished without a full score."""
        if row["status"] == "finished" and row["task_score_rate"] < 1.0:
            return "llm_finished"
        return row["status"]

    df["status"] = df.apply(get_llm_finished, axis=1)

    def get_human_alignment_score(row):
        """Weight the score rate by how the task ended."""
        if row["status"] == "finished":
            return 1
        elif row["status"] == "llm_finished":
            return row["task_score_rate"]
        elif row["status"] == "step_limit":
            return row["task_score_rate"] * 0.8
        # Any other status gets no score.
        return None

    df['human_alignment_score'] = df.apply(get_human_alignment_score, axis=1)

    df_evaluate = df[["task_name", "status", "steps", "task_score", "task_score_rate",
                      "step_score", "efficiency_score", "human_alignment_score"]]

    def calculate_total_score(scores):
        """Pool all "done / total" strings into one overall ratio."""
        pairs = [x.split('/') for x in scores]
        numerator_sum = sum(float(pair[0]) for pair in pairs)
        denominator_sum = sum(float(pair[1]) for pair in pairs)
        return numerator_sum / denominator_sum

    step_score_rate = calculate_total_score(df_evaluate['task_score'])

    completion_rate = df_evaluate[df_evaluate["status"] == "finished"].shape[0] / df_evaluate.shape[0]

    print(df_evaluate.head(5))
    print(df_evaluate.shape)

    average_step_score_rate = df_evaluate["task_score_rate"].mean()
    average_human_alignment_score = df_evaluate["human_alignment_score"].mean()
    average_efficiency_score = df_evaluate["efficiency_score"].mean()

    print("average_step_score_rate", average_step_score_rate)
    print("average_human_alignment_score", average_human_alignment_score)
    print("average_efficiency_score", average_efficiency_score)
    print("step_score_rate", step_score_rate)
    print("completion_rate", completion_rate)


# Print the evaluation metrics for the aggregated results file.
evaluate("./results/group_sample_20240430/out.json")

                                           task_name        status  steps  \
0  View the full menu for AMC Dine-In in amctheatres  llm_finished      2   
1  Search for used Jaguar XF with no black exteri...    step_limit     23   
2  Browse for wall art with a price range of $25 ...    step_limit      9   
3  Search for queen-size pillow protectors from t...    step_limit     18   
4  Find a south african history podcast with leng...    step_limit     12   

  task_score  task_score_rate  step_score  efficiency_score  \
0      1 / 4         0.250000         1.0          0.500000   
1      1 / 7         0.142857         1.0          0.043478   
2      1 / 3         0.333333         1.0          0.111111   
3      3 / 8         0.375000         3.0          0.166667   
4      2 / 4         0.500000         2.0          0.166667   

   human_alignment_score  
0               0.250000  
1               0.114286  
2               0.266667  
3               0.300000  
4               0.40000