In [1]:
import pandas as pd
import numpy as np
import json, os

# Root folder that is searched for results
results_dir = "results"

# Task folders (directly under results_dir) to include
use_tasks = [
    "gsm8k",
    "tracking_shuffled_objects_three_objects",
    "tracking_shuffled_objects_five_objects",
    "coinflip_eight",
    "prontoqa",
    "logiqa-en",
    "lsat-ar",
]

# Run folders, relative to each task folder, to include.
# Entries that are commented out are excluded from the load.
use_dirs = [
    "PromptWithAnswerExtraction/gpt35_cot_instruct__baseline",
    "SolveValidateRewrite/gpt35_cot_instruct__rewrite_T0",
    "SolveValidateRewrite/gpt35_cot_instruct__rewrite_T07",
    "SolveValidateRewrite/gpt35_validate_framing__rewrite_T07",
    "SolveValidateRewrite/gpt35_validate_framing_rephrase_1__T07",
    "SolveValidateRewrite/gpt35_validate_framing_rephrase_2__T07",
    "PromptWithAnswerExtraction/gpt35_cot_instruct_reframed__baseline",
    # "PromptWithAnswerExtraction/gpt35_cot_instruct__stg2_goal",
    # "GoalExtraction/gpt35_goal_answertype__stg2_goal",
    # "GoalExtraction/gpt35_goal__stg2_goal_only",
    # "GoalExtraction/gpt35_goal_approach__stg2_goal_approach",
    # "GoalExtraction/gpt35_goal_approach_sbs__stg2_goal_approach",
    # "GoalExtraction/gpt35_goal_approach_sbs_2__stg2_goal_approach",
]

In [2]:
# Collapse a nested JSON mapping into a single level so it fits one DataFrame row
def flatten(d, parent_key='', sep='_'):
    """Return a one-level dict built from the nested mapping ``d``.

    Args:
        d: Mapping to flatten; values that are dicts are descended into.
        parent_key: Prefix prepended (followed by ``sep``) to every produced
            key; an empty prefix leaves top-level keys unchanged.
        sep: String placed between the parts of a joined key.

    Returns:
        dict: Leaf values keyed by their joined path, in traversal order.
        Any entry whose key is exactly "Models" is not descended into; its
        value is stored as a JSON string under the bare key "Models",
        without the parent prefix.
    """
    def _walk(mapping, prefix):
        for name, value in mapping.items():
            path = prefix + sep + name if prefix else name
            if name == "Models":
                # Kept as one JSON-encoded cell instead of many columns
                yield name, json.dumps(value)
            elif isinstance(value, dict):
                yield from _walk(value, path)
            else:
                yield path, value

    # dict() keeps first-seen key order and lets a later duplicate overwrite
    return dict(_walk(d, parent_key))

# Read details.json for every (task, run folder) pair and stack the flattened
# records into `df`, one row per file that was found.
dfs = []
missing_runs = []  # (task, run folder) pairs that have no details.json
for task in use_tasks:
    for run_dir in use_dirs:  # deliberately not `dir`, which shadows the builtin
        details_path = os.path.join(results_dir, task, run_dir, "details.json")
        # Only the file access is guarded: a missing details.json means the
        # pair is skipped, while parse errors still surface.
        try:
            # Explicit encoding so the read does not depend on the platform
            # default (assumes the files are UTF-8/ASCII JSON -- TODO confirm)
            with open(details_path, "r", encoding="utf-8") as f:
                data_dict = json.load(f)
        except FileNotFoundError:
            missing_runs.append((task, run_dir))
            continue
        flat_data_dict = flatten(data_dict)
        tmp_df = pd.DataFrame([flat_data_dict])
        # Label the row with the last path component of the run folder
        tmp_df['Models file'] = run_dir.split('/')[-1]
        dfs.append(tmp_df)

if not dfs:
    # pd.concat([]) would fail with the opaque "No objects to concatenate"
    raise ValueError(
        f"No details.json found under {results_dir!r} for any of the "
        f"{len(missing_runs)} selected task/run combinations"
    )

df = pd.concat(dfs, ignore_index=True)
# Move 'Models file' column to the front
moddefs = df.pop('Models file')
df.insert(0, 'Models file', moddefs)

In [3]:
# Accuracy table: one row per (prompt strategy, models file), one column per
# task; cells are aggregated with pivot_table's default (mean).
df.pivot_table(
    values='Accuracy',
    index=['Prompt strategy', 'Models file'],
    columns='Task',
)

Unnamed: 0_level_0,Task,logiqa-en,lsat-ar,prontoqa,tracking_shuffled_objects/five_objects,tracking_shuffled_objects/three_objects
Prompt strategy,Models file,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GoalExtraction,gpt35_goal__stg2_goal_only,0.4,0.22,0.935,,0.705
GoalExtraction,gpt35_goal_answertype__stg2_goal,0.38,0.252174,0.876,0.612,0.672
GoalExtraction,gpt35_goal_approach__stg2_goal_approach,0.385,0.26,0.94,0.76,0.81
GoalExtraction,gpt35_goal_approach_sbs_2__stg2_goal_approach,,,,,0.8
GoalExtraction,gpt35_goal_approach_sbs__stg2_goal_approach,0.395,0.27,0.855,0.8,0.86
PromptWithAnswerExtraction,gpt35_cot_instruct__baseline,0.388,0.213043,0.86,,0.648
PromptWithAnswerExtraction,gpt35_cot_instruct__stg2_goal,0.412,0.208696,0.888,0.8,0.828
PromptWithAnswerExtraction,gpt35_cot_instruct_reframed__baseline,0.34,0.256522,0.844,,0.312
SolveValidateRewrite,gpt35_cot_instruct__rewrite_T0,0.392,,0.828,,0.596
SolveValidateRewrite,gpt35_cot_instruct__rewrite_T07,0.36,,0.844,,0.572


In [4]:
# Show the combined results table (one row per details.json that was loaded)
df

Unnamed: 0,Models file,Task,Prompt strategy,Run identifier,Date,Number of examples,Number of correct,Accuracy,Models,Cost_Prompt_Total,Cost_Prompt_Per token,Cost_Completion_Total,Cost_Completion_Per token,Cost_Total_Total,Cost_Total_Per token,Cost_Currency,Prompt strategy kwargs_max_rewrites
0,gpt35_cot_instruct__baseline,tracking_shuffled_objects/three_objects,PromptWithAnswerExtraction,baseline,2023-08-02,250,162,0.648,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,
1,gpt35_cot_instruct__rewrite_T0,tracking_shuffled_objects/three_objects,SolveValidateRewrite,rewrite_T0,2023-08-02,250,149,0.596,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,2.0
2,gpt35_cot_instruct__rewrite_T07,tracking_shuffled_objects/three_objects,SolveValidateRewrite,rewrite_T07,2023-08-02,250,143,0.572,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,2.0
3,gpt35_validate_framing__rewrite_T07,tracking_shuffled_objects/three_objects,SolveValidateRewrite,rewrite_T07,2023-08-02,250,153,0.612,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,2.0
4,gpt35_validate_framing_rephrase_1__T07,tracking_shuffled_objects/three_objects,SolveValidateRewrite,T07,2023-08-03,250,135,0.54,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,2.0
5,gpt35_validate_framing_rephrase_2__T07,tracking_shuffled_objects/three_objects,SolveValidateRewrite,T07,2023-08-03,250,142,0.568,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,2.0
6,gpt35_cot_instruct_reframed__baseline,tracking_shuffled_objects/three_objects,PromptWithAnswerExtraction,baseline,2023-08-02,250,78,0.312,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,
7,gpt35_cot_instruct__stg2_goal,tracking_shuffled_objects/three_objects,PromptWithAnswerExtraction,stg2_goal,2023-08-03,250,207,0.828,"{""cot_generator"": {""model_name"": ""gpt-3.5-turb...",0,0.0,0,0.0,0,0.0,USD,
8,gpt35_goal_answertype__stg2_goal,tracking_shuffled_objects/three_objects,GoalExtraction,stg2_goal,2023-08-03,250,168,0.672,"{""goal_extractor"": {""model_name"": ""gpt-3.5-tur...",0,0.0,0,0.0,0,0.0,USD,
9,gpt35_goal__stg2_goal_only,tracking_shuffled_objects/three_objects,GoalExtraction,stg2_goal_only,2023-08-04,200,141,0.705,"{""goal_extractor"": {""model_name"": ""gpt-3.5-tur...",0,0.0,0,0.0,0,0.0,USD,


In [5]:
# Mean accuracy per (prompt strategy, models file) row and task column.
# Passing `values` as a list keeps an extra 'Accuracy' level on the columns.
# The aggregation is named by string: newer pandas versions emit a
# FutureWarning when the np.mean callable is passed instead.
pd.pivot_table(
    df,
    index=['Prompt strategy', 'Models file'],
    columns=['Task'],
    values=['Accuracy'],
    aggfunc="mean",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy
Unnamed: 0_level_1,Task,logiqa-en,lsat-ar,prontoqa,tracking_shuffled_objects/five_objects,tracking_shuffled_objects/three_objects
Prompt strategy,Models file,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
GoalExtraction,gpt35_goal__stg2_goal_only,0.4,0.22,0.935,,0.705
GoalExtraction,gpt35_goal_answertype__stg2_goal,0.38,0.252174,0.876,0.612,0.672
GoalExtraction,gpt35_goal_approach__stg2_goal_approach,0.385,0.26,0.94,0.76,0.81
GoalExtraction,gpt35_goal_approach_sbs_2__stg2_goal_approach,,,,,0.8
GoalExtraction,gpt35_goal_approach_sbs__stg2_goal_approach,0.395,0.27,0.855,0.8,0.86
PromptWithAnswerExtraction,gpt35_cot_instruct__baseline,0.388,0.213043,0.86,,0.648
PromptWithAnswerExtraction,gpt35_cot_instruct__stg2_goal,0.412,0.208696,0.888,0.8,0.828
PromptWithAnswerExtraction,gpt35_cot_instruct_reframed__baseline,0.34,0.256522,0.844,,0.312
SolveValidateRewrite,gpt35_cot_instruct__rewrite_T0,0.392,,0.828,,0.596
SolveValidateRewrite,gpt35_cot_instruct__rewrite_T07,0.36,,0.844,,0.572
