In [None]:
import pandas as pd
import json, os

import seaborn as sns
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Notebook-wide configuration: display defaults, input locations, and the
# task / strategy folders the loading cell walks.
# ---------------------------------------------------------------------------

# Plot and table display defaults.
sns.set_style("whitegrid")
pd.set_option('display.precision', 3)

# Root folders that hold the result files. The loading cell joins each root
# with a task name and a strategy folder before walking it.
results_dirs = [
    ".archive/results/stage_6_samples_final",
    # ".archive/results/llama_res",
]

# Tab-separated table that is merged onto the results by the loading cell.
label_mapping_file = "./results/experiment_label_mapping.tsv"

# Task folders to load (commented entries are switched off).
use_tasks = [
    "gsm8k",
    # "tracking_shuffled_objects_three_objects",
    "tracking_shuffled_objects_five_objects_multi",
    # "coinflip_eight",
    "prontoqa",
    "logiqa-en",
    "lsat-ar",
    "navigate",
    "aqua-rat",
    "logical_deduction_five_objects_multi",
]

# Short display names, keyed by the value found in each result's "Task" field.
# NOTE(review): two keys use "/" where the folder names in `use_tasks` use "_";
# presumably the result files record the task that way - verify.
task_name_mapping = {
    "gsm8k": "GSM8K",
    "tracking_shuffled_objects/five_objects_multi": "Track5",
    "coinflip_eight": "Coinflip",
    "prontoqa": "ProntoQA",
    "logiqa-en": "LogiQA",
    "lsat-ar": "LSAT",
    "navigate": "Nav",
    "aqua-rat": "AQuA",
    "logical_deduction/five_objects_multi": "Deduct5",
}

# Strategy folders searched inside every task folder.
# Smaller alternative: "PromptWithAnswerExtraction", "SelfConsistency"
use_dirs = [
    "PromptWithAnswerExtraction",
    "SelfConsistency",
    "SolveValidateRewrite",
    "SampleTree",
]

In [None]:
# Turn a nested result dict into a single-level dict (one DataFrame row).
def flatten(d, parent_key='', sep='_'):
    """Collapse a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent's key with ``sep``
    (``{"a": {"b": 1}}`` becomes ``{"a_b": 1}``).

    A key literally named ``"Models"`` is treated specially at any depth: its
    value is serialised with ``json.dumps`` and stored under the bare key
    ``"Models"`` (without the parent prefix) instead of being expanded.

    Args:
        d: Mapping to flatten.
        parent_key: Prefix for every key produced from ``d``; empty at the top
            level.
        sep: String placed between a parent key and a child key.

    Returns:
        A new flat dict; ``d`` itself is not modified.
    """
    flat = {}
    for key, value in d.items():
        if key == "Models":
            # Kept as one JSON string rather than one column per nested field.
            flat[key] = json.dumps(value)
            continue

        full_key = key if not parent_key else parent_key + sep + key
        if isinstance(value, dict):
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat

# Load every result JSON under <results_dir>/<task>/<strategy dir>/... into one
# flat record per file, then build the DataFrame once (instead of creating and
# concatenating a single-row DataFrame per file).
records = []
for task in use_tasks:
    for base_dir in use_dirs:
        for rd in results_dirs:
            # os.walk yields nothing for a folder that does not exist, so
            # missing task/strategy combinations are skipped silently.
            for root, _, files in os.walk(os.path.join(rd, task, base_dir)):
                for file in files:
                    if not file.endswith(".json"):
                        continue
                    file_path = os.path.join(root, file)
                    try:
                        # JSON is UTF-8; do not rely on the platform default.
                        with open(file_path, "r", encoding="utf-8") as f:
                            data_dict = json.load(f)
                        record = flatten(data_dict)
                    except Exception as e:
                        # Best effort: report the unreadable file and carry on.
                        print(f"Error reading {file_path}: {e}")
                        continue
                    # The folder that directly contains the file identifies
                    # the run; it is the join key for the label mapping below.
                    record['Models file'] = os.path.basename(root)
                    records.append(record)

if not records:
    # Without this guard the failure would be an opaque pandas error.
    raise ValueError(
        f"No result .json files found under {results_dirs} "
        f"for tasks {use_tasks} and strategy folders {use_dirs}"
    )

df = pd.DataFrame(records)

# Move 'Models file' column to the front
df.insert(0, 'Models file', df.pop('Models file'))

# Apply task name mapping. Task values without an entry in the mapping become
# NaN (and are then left out of pivots keyed on 'Task'), so report them
# instead of losing them silently.
unmapped_tasks = (
    df.loc[~df['Task'].isin(list(task_name_mapping)), 'Task'].dropna().unique().tolist()
)
if unmapped_tasks:
    print(f"Warning: no entry in task_name_mapping for task(s): {unmapped_tasks}")
df['Task'] = df['Task'].map(task_name_mapping)

# Attach the experiment labels. Left join: results without a matching row in
# the mapping file are kept, with NaN in the mapping's columns.
label_mapping = pd.read_csv(label_mapping_file, sep="\t")
df = df.merge(
    label_mapping,
    left_on=['Models file', 'Prompt strategy'],
    right_on=['label', 'prompt_strategy'],
    how='left',
)

# Missing or non-numeric sample counts are treated as a single sample.
df['Samples'] = pd.to_numeric(df['Samples'], errors='coerce').fillna(1).astype(int)

## Replications

In [None]:
# Accuracy per (prompt strategy, experiment), one column per task. Cells that
# cover several result rows are aggregated with pivot_table's default (mean).
# Add 'Model Name' to the index to break the table out per model.
pivot_df = df.pivot_table(
    index=['Prompt Strategy', 'Experiment'],
    columns='Task',
    values='Accuracy',
)
pivot_df

## Tokens

In [None]:
# Token counts are shown with one decimal.
pd.set_option('display.precision', 1)
pd.set_option('display.float_format', '{:.1f}'.format)

# Completion tokens per example for each (strategy, experiment, samples)
# combination, one column per task; the "Let's think" experiment is left out.
pivot_df = pd.pivot_table(
    df[df['Experiment'] != "Let's think"],
    index=['Prompt Strategy', 'Experiment', 'Samples'],
    columns='Task',
    values=['Token counts_completion_per_example'],
).reset_index()

# Hide the Sample Path rows that were run with a single sample.
mask = (pivot_df['Prompt Strategy'] == 'Sample Path') & (pivot_df['Samples'] == 1)
pivot_df = pivot_df.loc[~mask]

pivot_df

In [None]:
# Completion-token usage of every strategy/experiment, expressed as a ratio to
# the Chain-of-Thought / Instruction / 1-sample baseline and averaged over
# tasks. Produces `row_means_complete`, which the plotting cell draws.
pd.set_option('display.precision', 1)
pd.set_option('display.float_format', '{:.1f}'.format)
pivot_df = df.copy()
# Leave out the "Let's think" experiment.
pivot_df= pivot_df[pivot_df['Experiment']!="Let's think"] 
# Completion tokens per example, one row per (strategy, experiment, samples)
# and one column per task.
pivot_df  = pd.pivot_table(pivot_df, index=['Prompt Strategy','Experiment','Samples'],columns='Task',values=['Token counts_completion_per_example'])
# NOTE(review): `order` is not referenced anywhere in this cell.
order = ['GSM8K', 'LSAT', 'LogiQA', 'ProntoQA', 'Track5', 'AQuA', 'Deduct5', 'Nav']


# Baseline row; raises KeyError if this exact combination is not in the pivot.
divider_row = pivot_df.loc[('Chain-of-Thought', 'Instruction',1)]

# Divide every row by the contents of the divider_row
pivot_df = pivot_df.divide(divider_row, axis=1)
# Mean ratio across the task columns. The mean(axis=1) further down also
# averages over this column; adding the mean of a set of values to that set
# does not change its mean, so the final 'Mean' is still the per-task average.
pivot_df['Row_Mean'] = pivot_df.mean(axis=1)

pivot_df  = pivot_df.reset_index()
# Drop the Sample Path rows that were run with a single sample.
mask = (pivot_df['Prompt Strategy'] == 'Sample Path') & (pivot_df['Samples'] == 1)
pivot_df = pivot_df.loc[~mask]
del pivot_df['Samples']
# NOTE(review): the name says "prompt" but this frame holds completion-token
# ratios; the prompt-token cell assigns the same name.
pivot_df_prompt = pivot_df.copy()
pivot_df_prompt.set_index(['Prompt Strategy', 'Experiment'], inplace=True)

row_means_complete = pivot_df_prompt.mean(axis=1).reset_index()
row_means_complete.columns = ['Prompt Strategy', 'Experiment', 'Mean']

# Short strategy names for the axis labels; a strategy missing from this map
# becomes NaN (and so does its 'Combined' label).
prompt_name_map = {'Chain-of-Thought': 'CoT','Self Consistency': 'SC', 'Rewriting': 'VR', 'Input-Output': 'IO', 'Sample Path': 'SP'}
row_means_complete['Prompt Strategy'] = row_means_complete['Prompt Strategy'].map(prompt_name_map)
row_means_complete['Combined'] = row_means_complete['Prompt Strategy'] + ' - ' + row_means_complete['Experiment']
# Positional: assumes row 1 is the Input-Output row (presumably because the
# pivot index sorts it right after Chain-of-Thought) - verify if the set of
# strategies or experiments changes.
row_means_complete.loc[1, 'Combined'] = 'IO'

# Reorder for the chart
# All steps below are positional. The plotting cell passes a positional colour
# list, so the row order produced here decides which colour each bar gets.
# Step 1: swap rows 0 and 1.
row_means_complete.iloc[0], row_means_complete.iloc[1] = row_means_complete.iloc[1].copy(), row_means_complete.iloc[0].copy()

# Step 2: move row 7 up to position 2, keeping the other rows in order.
row_means_complete = pd.concat([row_means_complete.iloc[:2], row_means_complete.iloc[7:8], row_means_complete.iloc[2:7], row_means_complete.iloc[8:]])
row_means_complete.reset_index(drop=True, inplace=True)
# Step 3: swap rows 6 and 7 (raises IndexError with fewer than 8 rows).
row_means_complete.iloc[6], row_means_complete.iloc[7] = row_means_complete.iloc[7].copy(), row_means_complete.iloc[6].copy()
row_means_complete.reset_index(drop=True, inplace=True)
row_means_complete

In [None]:
# Prompt-token usage of every strategy/experiment, expressed as a ratio to the
# Chain-of-Thought / Instruction / 1-sample baseline and averaged over tasks.
# Produces `row_means_prompt`, which the plotting cell draws above the
# completion-token panel on a shared x axis.
pd.set_option('display.precision', 1)
pd.set_option('display.float_format', '{:.1f}'.format)

# Prompt tokens per example, one row per (strategy, experiment, samples) and
# one column per task; the "Let's think" experiment is left out.
pivot_df = df[df['Experiment'] != "Let's think"]
pivot_df = pd.pivot_table(
    pivot_df,
    index=['Prompt Strategy', 'Experiment', 'Samples'],
    columns='Task',
    values=['Token counts_prompt_per_example'],
)

# Baseline row; raises KeyError if this exact combination is not in the pivot.
divider_row = pivot_df.loc[('Chain-of-Thought', 'Instruction', 1)]

# Divide every row by the baseline, then average the ratios across tasks.
pivot_df = pivot_df.divide(divider_row, axis=1)
pivot_df['Row_Mean'] = pivot_df.mean(axis=1)

pivot_df = pivot_df.reset_index()
# Drop the Sample Path rows that were run with a single sample.
mask = (pivot_df['Prompt Strategy'] == 'Sample Path') & (pivot_df['Samples'] == 1)
pivot_df = pivot_df.loc[~mask]
del pivot_df['Samples']
pivot_df_prompt = pivot_df.set_index(['Prompt Strategy', 'Experiment'])

# Averaging again (now including 'Row_Mean') leaves the per-task mean unchanged.
row_means_prompt = pivot_df_prompt.mean(axis=1).reset_index()
row_means_prompt.columns = ['Prompt Strategy', 'Experiment', 'Mean']

# Short strategy names for the axis labels; a strategy missing from this map
# becomes NaN (and so does its 'Combined' label).
prompt_name_map = {'Chain-of-Thought': 'CoT', 'Self Consistency': 'SC', 'Rewriting': 'VR', 'Input-Output': 'IO', 'Sample Path': 'SP'}
row_means_prompt['Prompt Strategy'] = row_means_prompt['Prompt Strategy'].map(prompt_name_map)
row_means_prompt['Combined'] = row_means_prompt['Prompt Strategy'] + ' - ' + row_means_prompt['Experiment']
# Positional: assumes row 1 is the Input-Output row - verify if the set of
# strategies or experiments changes.
row_means_prompt.loc[1, 'Combined'] = 'IO'

# Reorder for the chart. All steps are positional and must mirror the
# completion-token cell exactly: both frames are drawn on a shared x axis with
# one positional colour list, so they need the same row order.
# Step 1: swap rows 0 and 1.
row_means_prompt.iloc[0], row_means_prompt.iloc[1] = row_means_prompt.iloc[1].copy(), row_means_prompt.iloc[0].copy()

# Step 2: move row 7 up to position 2, keeping the other rows in order.
row_means_prompt = pd.concat([row_means_prompt.iloc[:2], row_means_prompt.iloc[7:8], row_means_prompt.iloc[2:7], row_means_prompt.iloc[8:]])
row_means_prompt.reset_index(drop=True, inplace=True)

# Step 3: swap rows 6 and 7. The completion-token cell does this; it was
# missing here, which left the last two bars of the two panels in different
# orders under one set of tick labels.
row_means_prompt.iloc[6], row_means_prompt.iloc[7] = row_means_prompt.iloc[7].copy(), row_means_prompt.iloc[6].copy()
row_means_prompt.reset_index(drop=True, inplace=True)

row_means_prompt

In [None]:
# Two stacked bar charts on a shared x axis: prompt-token index (top) and
# completion-token index (bottom), both relative to the CoT baseline (y = 1).
import matplotlib.patches as mpatches

sns.set_context("talk")
default_blue = sns.color_palette()[0]
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 12), sharex=True)

# Positions in the 9-colour husl palette used for each strategy.
palette = sns.color_palette("husl", 9)
c_io, c_cot, c_sc, c_rv, c_sp = 0, 1, 3, 6, 7

# One colour per bar, by bar position:
# IO, CoT, SC, 2 x Rewriting, 3 x Sample Path (8 bars in total).
colors = [
    palette[idx]
    for idx in (c_io, c_cot, c_sc, c_rv, c_rv, c_sp, c_sp, c_sp)
]

# Legend with one entry per strategy (not per bar).
legend_handles = [
    mpatches.Patch(color=palette[idx], label=text)
    for idx, text in (
        (c_io, 'Input-Output'),
        (c_cot, 'Chain-of-Thought'),
        (c_sc, 'Self-Consistency'),
        (c_rv, 'Rewriting'),
        (c_sp, 'Sampled Path'),
    )
]
legend = fig.legend(handles=legend_handles, bbox_to_anchor=(0.35, 0.98))
legend.get_frame().set_alpha(1)  # opaque legend box

# Both panels are drawn the same way; only the data and the title differ.
panels = (
    (axes[0], row_means_prompt, 'Prompt Tokens'),
    (axes[1], row_means_complete, 'Completion Tokens'),
)
for ax, panel_data, panel_title in panels:
    sns.barplot(x='Combined', y='Mean', palette=colors, data=panel_data, ax=ax)
    ax.axhline(y=1, linestyle='--', color='black')  # baseline reference at y = 1
    ax.set_ylabel('Index vs CoT')
    ax.set_title(panel_title)

axes[0].set_xlabel('')
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=50)

plt.xlabel(None)
plt.tight_layout()
plt.show()


## Rewrites

In [None]:
# Accuracies are shown with three decimals.
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', '{:.3f}'.format)

# Accuracy per (strategy, experiment, samples), one column per task, without
# the "Let's think" experiment and without the "Rewriting" strategy.
pivot_df = (
    df.loc[df['Experiment'].ne("Let's think") & df['Prompt Strategy'].ne("Rewriting")]
    .pivot_table(
        index=['Prompt Strategy', 'Experiment', 'Samples'],
        columns='Task',
        values='Accuracy',
    )
)

# Per-model accuracy table built from the unfiltered results. It is not used
# in this cell; a later cell rebuilds it before printing it.
multi_model = df.pivot_table(
    index=['Model Name', 'Prompt Strategy', 'Experiment'],
    columns='Task',
    values='Accuracy',
)

# LaTeX source of the filtered table, followed by the rendered table itself.
print(pivot_df.to_latex(float_format="%.3f"))

pivot_df

In [None]:
# Accuracy per (model, strategy, experiment), one column per task, shown with
# three decimals. Uses every loaded result (no experiment/strategy filter).
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', '{:.3f}'.format)
df.pivot_table(
    index=['Model Name', 'Prompt Strategy', 'Experiment'],
    columns='Task',
    values='Accuracy',
)

In [None]:
# LaTeX source of the per-model accuracy table. After this cell `multi_model`
# holds the LaTeX string, not the DataFrame.
multi_model = df.pivot_table(
    index=['Model Name', 'Prompt Strategy', 'Experiment'],
    columns='Task',
    values='Accuracy',
).to_latex(float_format="%.3f")
print(multi_model)

In [None]:
# Print the LaTeX source of whatever table `pivot_df` currently holds.
# NOTE(review): `pivot_df` is reassigned by several cells, so the output
# depends on which of them ran last. On a top-to-bottom run that is the
# accuracy pivot from the "Rewrites" cell, which already prints this LaTeX.
latex_table = pivot_df.to_latex(float_format="%.3f")
print(latex_table)