In [46]:
import pandas as pd
import numpy as np
import json, os

# Root folder of the stored results; the loader below reads
# <results_dir>/<task>/<run dir>/... for every selected task and run dir.
results_dir = ".archive/results/stage_2_goal_extraction"

# Task sub-folders to include in the comparison.
use_tasks = [
    "gsm8k",
    "tracking_shuffled_objects_three_objects",
    "tracking_shuffled_objects_five_objects",
    "coinflip_eight",
    "prontoqa",
    "logiqa-en",
    "lsat-ar",
]

# Run folders (relative to each task folder) to include.
# Commented-out entries are kept as toggles: uncomment one to add it to the tables.
use_dirs = [
    # "PromptWithAnswerExtraction/gpt35_cot_instruct__baseline",
    # "SolveValidateRewrite/gpt35_cot_instruct__rewrite_T0",
    # "SolveValidateRewrite/gpt35_cot_instruct__rewrite_T07",
    # "SolveValidateRewrite/gpt35_validate_framing__rewrite_T07",
    # "SolveValidateRewrite/gpt35_validate_framing_rephrase_1__T07",
    # "SolveValidateRewrite/gpt35_validate_framing_rephrase_2__T07",
    # "SolveValidateRewrite/gpt35_validate_pattern_original__stg3",
    # "SolveValidateRewrite/gpt35_validate_pattern__stg3",
    # "SolveValidateRewrite/gpt35_validate_rewrite_pattern__stg3",
    # "PromptWithAnswerExtraction/gpt35_cot_instruct_reframed__baseline",
    "PromptWithAnswerExtraction/gpt35_cot_instruct__stg2_goal",
    # "GoalExtraction/"
    # "GoalExtraction/gpt35_goal_answertype__stg2_goal",
    # "GoalExtraction/gpt35_goal__stg2_goal_only",
    "GoalExtraction/gpt35_goal_approach__stg2_goal_approach",
    "GoalExtraction/gpt35_goal_approach_sbs__stg2_goal_approach",
    "GoalExtraction/gpt35_approach_sbs_pattern__stg3",
    "GoalExtraction/gpt35_approach_sbs__stg3",
    "GoalExtraction/gpt35_goal_approach_sbs_pattern__stg3",
    "GoalExtraction/gpt35_goal_sbs_pattern__stg3",
    # "GoalExtraction/gpt35_goal_approach_sbs_2__stg2_goal_approach",
]

In [47]:
def flatten(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single-level dict suitable for one DataFrame row.

    Args:
        d: Mapping to flatten; values that are dicts are descended into.
        parent_key: Prefix carried down from the enclosing levels ('' at the top).
        sep: String placed between the prefix and each key.

    Returns:
        A new dict whose keys are the `sep`-joined paths to every non-dict value.
        A key named "Models" is special-cased at any depth: its value is stored
        as a JSON string under the bare key "Models" (no prefix) and is not
        descended into.
    """
    flat = {}
    for key, value in d.items():
        path = parent_key + sep + key if parent_key else key
        if key == "Models":
            # Keep the whole sub-structure as one JSON string in one column.
            flat[key] = json.dumps(value)
            continue
        if isinstance(value, dict):
            flat.update(flatten(value, path, sep=sep))
            continue
        flat[path] = value
    return flat

# Load every result JSON found under <results_dir>/<task>/<run dir>/ (recursively)
# as a single-row DataFrame, then stack all rows into `df`.
dfs = []
for task in use_tasks:
    for base_dir in use_dirs:
        # os.walk yields nothing for a folder that does not exist, so task/run
        # combinations without stored results are skipped silently.
        for root, _, files in os.walk(os.path.join(results_dir, task, base_dir)):
            for file in files:
                if not file.endswith(".json"):
                    continue
                file_path = os.path.join(root, file)
                try:
                    # JSON is UTF-8 by specification; do not depend on the locale default.
                    with open(file_path, "r", encoding="utf-8") as f:
                        data_dict = json.load(f)
                    # The file is already closed here; only parsing needs the handle.
                    flat_data_dict = flatten(data_dict)
                    tmp_df = pd.DataFrame([flat_data_dict])
                    # Tag the row with the name of the folder that contains the file.
                    tmp_df['Models file'] = os.path.basename(root)
                    dfs.append(tmp_df)
                except Exception as e:
                    # Best effort: report the unusable file and keep loading the rest.
                    print(f"Error reading {file_path}: {e}")
                    continue

if not dfs:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(
        f"No readable .json result files found under {results_dir!r} "
        "for the selected use_tasks / use_dirs"
    )

df = pd.concat(dfs, ignore_index=True)
# Move 'Models file' column to the front
moddefs = df.pop('Models file')
df.insert(0, 'Models file', moddefs)

In [48]:
# Accuracy per task, with rows grouped by whether the model definition uses "pattern".
flagged_df = df.copy()
# Flag rows whose "Models file" name contains the literal substring "pattern".
flagged_df['uses pattern'] = flagged_df['Models file'].str.contains("pattern", regex=False)
pivot_df = pd.pivot_table(
    flagged_df,
    index=['Prompt strategy', 'uses pattern', 'Models file'],
    columns='Task',
    values='Accuracy',
)
# Hide the five-objects task in this view. errors="ignore" keeps the cell working
# when that task is not among the loaded results (a plain `del` raises KeyError).
pivot_df = pivot_df.drop(columns=['tracking_shuffled_objects/five_objects'], errors="ignore")
pivot_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Task,logiqa-en,lsat-ar,prontoqa,tracking_shuffled_objects/three_objects
Prompt strategy,uses pattern,Models file,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GoalExtraction,False,gpt35_approach_sbs__stg3,0.372,0.291304,0.832,0.84
GoalExtraction,False,gpt35_goal_approach__stg2_goal_approach,0.385,0.26,0.94,0.81
GoalExtraction,False,gpt35_goal_approach_sbs__stg2_goal_approach,0.395,0.27,0.855,0.86
GoalExtraction,True,gpt35_approach_sbs_pattern__stg3,,0.234783,0.888,
GoalExtraction,True,gpt35_goal_approach_sbs_pattern__stg3,0.412,0.204348,0.72,0.628
GoalExtraction,True,gpt35_goal_sbs_pattern__stg3,0.392,0.243478,0.92,0.816
PromptWithAnswerExtraction,False,gpt35_cot_instruct__stg2_goal,0.412,0.208696,0.888,0.828


In [49]:
# Mean accuracy per (prompt strategy, model definition) and task, all tasks included.
# aggfunc="mean" selects the same aggregation as np.mean without the FutureWarning
# that recent pandas versions emit when a NumPy callable is passed.
pd.pivot_table(df, index=['Prompt strategy','Models file'], columns=['Task'], values=['Accuracy'], aggfunc="mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy
Unnamed: 0_level_1,Task,logiqa-en,lsat-ar,prontoqa,tracking_shuffled_objects/five_objects,tracking_shuffled_objects/three_objects
Prompt strategy,Models file,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
GoalExtraction,gpt35_approach_sbs__stg3,0.372,0.291304,0.832,,0.84
GoalExtraction,gpt35_approach_sbs_pattern__stg3,,0.234783,0.888,,
GoalExtraction,gpt35_goal_approach__stg2_goal_approach,0.385,0.26,0.94,0.76,0.81
GoalExtraction,gpt35_goal_approach_sbs__stg2_goal_approach,0.395,0.27,0.855,0.8,0.86
GoalExtraction,gpt35_goal_approach_sbs_pattern__stg3,0.412,0.204348,0.72,,0.628
GoalExtraction,gpt35_goal_sbs_pattern__stg3,0.392,0.243478,0.92,,0.816
PromptWithAnswerExtraction,gpt35_cot_instruct__stg2_goal,0.412,0.208696,0.888,0.8,0.828
