In [None]:
import pandas as pd
import numpy as np
import json, os

# Render floats with three digits of precision by default.
pd.options.display.precision = 3

# Root directory of the results to summarise. Earlier stages, kept for reference:
#   "./.archive/results/stage_1_rewrites"
#   "./.archive/results/stage_4_redos"
results_dir = "./.archive/results/stage_6_samples_final"

# Tab-separated table; later cells read its 'label' and 'Experiment' columns
# to turn a run directory name into a human-readable experiment name.
label_mapping_file = "./results/experiment_label_mapping.tsv"
label_mapping = pd.read_csv(label_mapping_file, sep="\t")

# Task sub-directories (under results_dir) to include in the tables.
# Disabled: "tracking_shuffled_objects_three_objects", "coinflip_eight"
use_tasks = [
    "gsm8k",
    "tracking_shuffled_objects_five_objects_multi",
    "prontoqa",
    "logiqa-en",
    "lsat-ar",
    "navigate",
    "aqua-rat",
    "logical_deduction_five_objects_multi",
]

# Short display names for tasks. These keys are matched against the "Task"
# field stored inside each results.json, not against the use_tasks entries.
# NOTE(review): two keys use '/' where use_tasks uses '_' -- presumably the
# "Task" field is slash-separated; verify against a results.json.
task_name_mapping = dict(
    [
        ("gsm8k", "GSM8K"),
        ("tracking_shuffled_objects/five_objects_multi", "Track5"),
        ("coinflip_eight", "Coinflip"),
        ("prontoqa", "ProntoQA"),
        ("logiqa-en", "LogiQA"),
        ("lsat-ar", "LSAT"),
        ("navigate", "Nav"),
        ("aqua-rat", "AQuA"),
        ("logical_deduction/five_objects_multi", "Deduct5"),
    ]
)

# Run directories (relative to <results_dir>/<task>) to load. Uncomment to add a run.
use_dirs = [
    "SolveValidateRewrite/gpt35_all_instruct__stg4",
    "SolveValidateRewrite/gpt35_all_instruct_structured__stg6",
    # "SolveValidateRewrite/gpt35_all_instruct_structured_T07__stg6",
    # "SolveValidateRewrite/gpt35_all_pattern_structured__stg6",
    # "SolveValidateRewrite/gpt35_all_instruct__stg4_T07",
    # "SolveValidateRewrite/gpt35_all_instruct_structured__stg6_T07",
    # "SolveValidateRewrite/gpt35_all_pattern__stg4"
    # "SolveValidateRewrite/gpt35_cot_instruct__rewrite_T07",
    # "SolveValidateRewrite/gpt35_validate_framing__rewrite_T07",
    # "SolveValidateRewrite/gpt35_validate_framing_rephrase_1__T07",
    # "SolveValidateRewrite/gpt35_validate_framing_rephrase_2__T07",
    # "SolveValidateRewrite/gpt35_validate_pattern__stg3",
    # "SolveValidateRewrite/gpt35_validate_rewrite_pattern__stg3",
    # "PromptWithAnswerExtraction/gpt35_cot_instruct_reframed__baseline",
]

# Path to a single example results file.
# NOTE(review): not referenced by any of the cells visible here.
filepath = os.path.join(
    results_dir,
    "prontoqa",
    "SolveValidateRewrite/gpt35_cot_instruct__rewrite_T0",
    "results.json",
)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return float(numerator) / float(denominator)


def extract_metrics(json_examples):
    """Summarise how often answers were rewritten and how rewrites changed correctness.

    Parameters
    ----------
    json_examples : iterable of dict
        One dict per example. The keys read here are 'example_idx',
        'response_count', 'true_answer', 'predicted_answer' and
        'response_pairs' (a sequence of dicts, each with an 'answer' key).
        The first response is treated as the original answer; an example
        counts as a rewrite when it has a non-null second answer.

    Returns
    -------
    dict
        Ratio metrics (floats, NaN when the denominator is zero) followed by
        the raw counts they were derived from.

    Raises
    ------
    ValueError
        If ``json_examples`` is empty.
    """
    records = []
    example_ids = []
    for ex in json_examples:
        record = {
            'n_responses': ex['response_count'],
            'true_answer': ex['true_answer'],
            'predicted_answer': ex['predicted_answer'],
            'correct': ex['true_answer'] == ex['predicted_answer'],
        }
        for i, res in enumerate(ex['response_pairs']):
            record[f"answer_{i}"] = res['answer']
        # An example with no responses has no original answer, so it cannot
        # have been originally correct (previously this raised KeyError).
        record['answer_0_correct'] = (
            'answer_0' in record and record['answer_0'] == record['true_answer']
        )
        records.append(record)
        example_ids.append(ex['example_idx'])

    if not records:
        raise ValueError("extract_metrics() received no examples")

    # Build the frame once instead of concatenating one frame per example.
    df = pd.DataFrame(records, index=example_ids)
    # Create 'answer_1' column if no example has a second answer.
    if 'answer_1' not in df.columns:
        df['answer_1'] = np.nan

    originally_correct = df['answer_0_correct'].astype(bool)
    finally_correct = df['correct'].astype(bool)
    is_rewrite = df['answer_1'].notna()

    total_examples = len(df)
    total_rewrites = int(is_rewrite.sum())
    total_originally_correct = int(originally_correct.sum())
    total_originally_incorrect = int((~originally_correct).sum())
    total_correct_rewrites = int((is_rewrite & originally_correct).sum())
    total_incorrect_rewrites = int((is_rewrite & ~originally_correct).sum())

    # Conversion counts are taken from boolean masks rather than a groupby
    # lookup, so a missing (original, final) combination counts as zero
    # instead of raising KeyError.
    correct_to_wrong = int((is_rewrite & originally_correct & ~finally_correct).sum())
    wrong_to_correct = int((is_rewrite & ~originally_correct & finally_correct).sum())

    return {
        'pre_rewrite_acc': float(originally_correct.mean()),
        # Percent of all examples that were rewritten
        'Rewrite/Total': _safe_ratio(total_rewrites, total_examples),
        # Percent of all rewrites that were of incorrect answers
        'Rewrite Incorrect/All Rewrite': _safe_ratio(total_incorrect_rewrites, total_rewrites),
        # Percent of all correct answers that were rewritten
        'Rewrite Correct/All Correct': _safe_ratio(total_correct_rewrites, total_originally_correct),
        # Percent of all incorrect answers that were rewritten
        'Rewrite Incorrect/All Incorrect': _safe_ratio(total_incorrect_rewrites, total_originally_incorrect),
        # Percent of initially correct rewrites that then become wrong
        'Correct To Incorrect': _safe_ratio(correct_to_wrong, total_correct_rewrites),
        # Percent of initially wrong rewrites that then become correct
        'Incorrect To Correct': _safe_ratio(wrong_to_correct, total_incorrect_rewrites),
        'All correct': total_originally_correct,
        'All incorrect': total_originally_incorrect,
        'Correct Rewrites': total_correct_rewrites,
        'Incorrect Rewrites': total_incorrect_rewrites,
        'Total Rewrites': total_rewrites,
        'Total Examples': total_examples,
    }
    

In [None]:
# Collect one metrics record per (task, run) pair whose results.json exists.
run_records = []
for task in use_tasks:
    # `run_dir` rather than `dir`, so the builtin dir() is not shadowed for the
    # rest of the kernel session.
    for run_dir in use_dirs:
        results_path = os.path.join(results_dir, task, run_dir, "results.json")
        print(f"Task: {task}, Run: {results_path}")
        # Not every task has every run; report the gap instead of skipping silently.
        try:
            with open(results_path, "r") as f:
                data_dict = json.load(f)
        except FileNotFoundError:
            print("  -> results.json not found, skipping")
            continue
        metrics_dict = {
            "Task": data_dict["Task"],
            "Run": run_dir.split("/")[-1],
            "Accuracy": data_dict["Accuracy"],
        }
        metrics_dict.update(extract_metrics(data_dict['Examples']))
        run_records.append(metrics_dict)

if not run_records:
    # Previously surfaced as pd.concat's opaque "No objects to concatenate".
    raise ValueError(
        f"No results.json found under {results_dir!r} for the selected tasks and runs"
    )

df = (
    pd.DataFrame(run_records)
    .set_index(["Run", "Task"])
    .sort_index()
    # Shorten task names and give the pre-rewrite accuracy a display name.
    .rename(index=task_name_mapping, columns={'pre_rewrite_acc': 'Pre-Rewrite Accuracy'})
    .reset_index()
    # Replace the run directory name with its experiment label. This is a left
    # join, so runs missing from label_mapping keep a NaN 'Experiment'.
    .merge(label_mapping[['label', 'Experiment']], left_on='Run', right_on='label', how='left')
    .set_index(['Experiment', 'Task'])
    .drop(columns=['label', 'Run'])
)
pd.set_option('display.float_format', '{:.3f}'.format)
df

In [None]:
# Switch the float display to one decimal place, then show the table scaled by 100.
# NOTE(review): every column is multiplied, including the raw count columns
# (e.g. 'Total Examples') -- confirm that is intended.
pd.options.display.precision = 3
pd.options.display.float_format = '{:.1f}'.format
df.mul(100)

In [None]:
# Keep the runs whose Experiment label contains "Structured". na=False makes
# rows with a missing (NaN) Experiment label count as non-matching instead of
# producing an invalid boolean mask.
is_structured = df.index.get_level_values(0).str.contains("Structured", na=False)
# Drop the Experiment index level so rows are keyed by task only, scale by 100,
# and transpose so that metrics become rows and tasks become columns.
structured_df = (df[is_structured].droplevel(0) * 100).T
print(structured_df.to_latex(float_format="{:0.1f}".format))
structured_df

In [None]:
# Keep the runs whose Experiment label contains "Instruction". na=False makes
# rows with a missing (NaN) Experiment label count as non-matching instead of
# producing an invalid boolean mask.
is_instruction = df.index.get_level_values(0).str.contains("Instruction", na=False)
# Drop the Experiment index level so rows are keyed by task only, scale by 100,
# and transpose so that metrics become rows and tasks become columns.
instruction_df = (df[is_instruction].droplevel(0) * 100).T
# instruction_df is already scaled and transposed; print it as-is. Scaling and
# transposing again here would emit values multiplied by 10000 in the wrong
# orientation.
print(instruction_df.to_latex(float_format="{:0.1f}".format))
instruction_df

In [None]:
# Stack the two tables; the outer index level records which table a row came from.
combined_df = pd.concat(
    [structured_df, instruction_df],
    axis=0,
    keys=['Structured', 'Instruction'],
)
# Metric label -> its position in instruction_df.
# NOTE(review): not used by any of the cells visible here.
custom_order = dict(zip(instruction_df.index, range(len(instruction_df.index))))
# Make the metric name the outer level, then sort the index so that the rows
# for the same metric end up next to each other.
combined_df = combined_df.swaplevel(0, 1, axis=0)
combined_df = combined_df.sort_index(axis=0)

print(combined_df.to_latex(float_format="{:0.1f}".format))
combined_df

In [None]:
# Average every metric over tasks, per experiment, on the x100 scale.
# The result is transposed so metrics are rows and experiments are columns.
# NOTE(review): the raw count columns are scaled by 100 as well -- confirm
# that is intended.
df_means = (
    df.mul(100)
    .reset_index()
    .drop(columns='Task')
    .groupby('Experiment')
    .mean()
    .T
)
df_means

In [None]:
# Emit the per-experiment averages as LaTeX source: one decimal place,
# with special characters in labels left unescaped.
tbl = df_means.to_latex(
    escape=False,
    float_format="{:0.1f}".format,
)
print(tbl)