In [1]:
import sys
import json
sys.path.append('..')
from src.common import *
from src.analysis.model_performances import *
from copy import deepcopy
from helpers import *
import pandas as pd

# Key looked up in each result's 'result_other' dict; the value found there is
# rendered as the subscript of every table cell built in this notebook.
# NOTE(review): 'sem' presumably stands for standard error of the mean --
# confirm against the code that produces the statistics.
CONF_KEY = 'sem'

def latex_table_mods(latex_table):
    """Post-process a LaTeX table string before it is written to disk.

    Two literal substitutions are applied, in this order:

    1. a column specification made of exactly thirteen ``l`` columns is
       replaced by a grouped layout with vertical rules;
    2. every empty cell placeholder ``${None}_{None}$`` is replaced by ``---``.

    Text that contains neither pattern is returned unchanged.

    Args:
        latex_table: LaTeX source of the table as a string.

    Returns:
        The patched LaTeX source.
    """
    substitutions = (
        ('{lllllllllllll}', '{l|ll|ll|ll|ll||ll|ll}'),
        ('${None}_{None}$', '---'),
    )
    patched = latex_table
    for needle, replacement in substitutions:
        patched = patched.replace(needle, replacement)
    return patched

# Models shown as table columns, in display order.
model_names = [
    # prompted models
    'gpt-4o',
    'gemini',
    'llama2-13b-chat',
    'llama-3-8b-instruct',
    'llama2-7b-chat',
    'gemma-7b',
    # "-tuned" variants
    'llama-3-8b-tuned',
    'gemma-7b-tuned',
]

In [2]:
# Run configuration: the answer type to tabulate and the score key.
answer_type = FREE_ANSWER_TYPE  # alternative: f'{TRUE_FALSE_ANSWER_TYPE}.{score_key}'
# In the visible cells only the commented-out alternative above refers to score_key.
score_key = ACCURACY_SCORE_KEY  # alternatives: F1_SCORE_KEY, 'accuracy'

# Name of the id subset; it becomes the suffix of the statistics directory path.
ids_file_name = 'dataset_ids.test.pruned'  # alternative: None
save_main_dir = f'{STATISTICS_PATH}.{ids_file_name}'
# Collect the statistics for the chosen answer type and report how many entries
# came back (what exactly is read from save_main_dir is up to collect_stats_all).
stats_all = collect_stats_all(answer_type, save_main_dir=save_main_dir)
print(len(stats_all))
# Only the disabled (commented-out) table code further down reads plan_lengths.
plan_lengths = [1,10,19]

# Directory that receives the generated .tex tables; created if it is missing.
# NOTE(review): `os` is never imported explicitly in this notebook -- it
# presumably arrives through one of the star imports; confirm.
save_dir = os.path.join(save_main_dir, 'tables', 'by_models')
os.makedirs(save_dir, exist_ok=True)

100%|██████████| 48384/48384 [00:02<00:00, 18259.39it/s]

4897





In [3]:

def to_df_by_len_by_category(results_all, answer_type, prompt_type,
                      model_names = PROMPT_MODEL_NAMES,
                      ramifications = WITHOUT_RAMIFICATIONS,
                      domain = ALL_DOMAINS_KEY, 
                      subs = WITHOUT_RANDOM_SUB):
    """Tabulate one prompt type's scores per plan length and question category.

    One row is produced for every combination of a plan length from
    ``PLAN_LENGTHS`` and a question category from ``QUESTION_CATEGORIES`` plus
    the ``ALL_QUESTION_CATEGORIES_KEY`` entry. Each model contributes one column
    whose cells are LaTeX math strings of the form ``${mean}_{spread}$``, with
    both numbers multiplied by 100 and rounded to two decimals. A combination
    for which ``filter_single_selector`` returns nothing is rendered as
    ``${None}_{None}$``.

    Args:
        results_all: Collected statistics, passed through to
            ``filter_single_selector``.
        answer_type: Answer type selector passed to ``filter_single_selector``.
        prompt_type: Prompt type selector; also the second element of every
            model column key.
        model_names: Models to include, in column order.
        ramifications: Ramification selector passed to ``filter_single_selector``.
        domain: Domain selector passed to ``filter_single_selector``.
        subs: Substitution selector passed to ``filter_single_selector``.

    Returns:
        pandas.DataFrame indexed by ``(plan_length, pretty category)`` tuples,
        with the columns ``'plan length'``, ``'question category'`` and one
        ``(pretty model name, pretty prompt type)`` column per model.
    """

    def _format_cell(res_obj):
        # Turn one selector result into the '${mean}_{spread}$' cell text.
        if res_obj:
            mean = res_obj['result']
            other = res_obj['result_other']
            # The spread is optional: 'result_other' may be empty.
            spread = other[CONF_KEY] if other else None
        else:
            mean, spread = None, None
        # Scale to percentages; falsy values (None, 0) are passed through as-is.
        mean, spread = (round(v * 100, 2) if v else v for v in (mean, spread))
        return '${' + str(mean) + '}_{' + str(spread) + '}$'

    pretty_prompt = TO_PRETTY.get(prompt_type, prompt_type)
    index = []
    rows = []
    for plan_length in PLAN_LENGTHS:
        for question_category in QUESTION_CATEGORIES + [ALL_QUESTION_CATEGORIES_KEY]:
            pretty_category = TO_PRETTY.get(question_category, question_category)
            index.append((plan_length, pretty_category))
            row = {
                'plan length': plan_length,
                'question category': pretty_category,
            }
            for model_name in model_names:
                res_obj = filter_single_selector(results_all, plan_length, question_category, ramifications, model_name, prompt_type, domain, answer_type, subs)
                row[(TO_PRETTY.get(model_name, model_name), pretty_prompt)] = _format_cell(res_obj)
            rows.append(row)
    return pd.DataFrame(rows, index=index)

def to_df_few_shot(results_all, answer_type,
                   plan_length=19,
                   model_names=PROMPT_MODEL_NAMES,
                   ramifications=WITHOUT_RAMIFICATIONS,
                   question_category=ALL_QUESTION_CATEGORIES_KEY,
                   domain=ALL_DOMAINS_KEY,
                   subs=WITHOUT_RANDOM_SUB):
    """Tabulate scores with one row per prompt type and one column per model.

    Every entry of ``PROMPT_TYPES`` becomes a row (and the row's index label).
    Cells are LaTeX math strings ``${mean}_{spread}$`` with both numbers
    multiplied by 100 and rounded to two decimals; a combination for which
    ``filter_single_selector`` returns nothing yields ``${None}_{None}$``.

    Args:
        results_all: Collected statistics, passed to ``filter_single_selector``.
        answer_type: Answer type selector.
        plan_length: Plan length selector.
        model_names: Models to include, in column order.
        ramifications: Ramification selector.
        question_category: Question category selector.
        domain: Domain selector.
        subs: Substitution selector.

    Returns:
        pandas.DataFrame with a ``'prompt'`` column followed by one column per
        (prettified) model name, indexed by prompt type.
    """
    records = []
    for prompt_type in PROMPT_TYPES:
        record = {'prompt': prompt_type}
        for model_name in model_names:
            hit = filter_single_selector(
                results_all, plan_length, question_category, ramifications,
                model_name, prompt_type, domain, answer_type, subs,
            )
            if not hit:
                triple = (None, None, None)
            else:
                score = hit['result']
                extra = hit['result_other']
                spread = extra[CONF_KEY] if extra else None
                triple = (score, spread, hit['stats']['num_not_corrupted'])
            # Scale to percentages; falsy entries (None, 0) are left untouched.
            scaled = tuple(round(x * 100, 2) if x else x for x in triple)
            cell = ''.join(['${', str(scaled[0]), '}_{', str(scaled[1]), '}$'])
            record[TO_PRETTY.get(model_name, model_name)] = cell
        records.append(record)
    # The index repeats the prompt type of each row.
    row_labels = [record['prompt'] for record in records]
    return pd.DataFrame(records, index=row_labels)

def to_df_few_shot_by_category(results_all, answer_type,
                               plan_length=19,
                               model_names=PROMPT_MODEL_NAMES,
                               ramifications=WITHOUT_RAMIFICATIONS,
                               question_category=ALL_QUESTION_CATEGORIES_KEY,
                               domain=ALL_DOMAINS_KEY,
                               subs=WITHOUT_RANDOM_SUB):
    """Tabulate scores per question category for three few-shot settings.

    Rows cover ``QUESTION_CATEGORIES`` plus ``ALL_QUESTION_CATEGORIES_KEY``,
    except that the category equal to ``'composite'`` is left out. For every
    model there is one column per prompt type in
    ``('few_shot_0', 'few_shot_1', 'few_shot_5')``. Cells are LaTeX math
    strings ``${mean}_{spread}$`` with both numbers multiplied by 100 and
    rounded to two decimals; missing results yield ``${None}_{None}$``.

    Args:
        results_all: Collected statistics, passed to ``filter_single_selector``.
        answer_type: Answer type selector.
        plan_length: Plan length selector.
        model_names: Models to include, in column order.
        ramifications: Ramification selector.
        question_category: Accepted for signature compatibility only; the value
            has no effect because the function walks every category itself.
        domain: Domain selector.
        subs: Substitution selector.

    Returns:
        pandas.DataFrame with a ``'question category'`` column followed by one
        ``(pretty model name, prompt type)`` column per model and setting,
        using the default integer index.
    """
    shot_settings = ['few_shot_0', 'few_shot_1', 'few_shot_5']
    records = []
    for category in QUESTION_CATEGORIES + [ALL_QUESTION_CATEGORIES_KEY]:
        if category != 'composite':
            record = {'question category': TO_PRETTY.get(category, category)}
            for model_name in model_names:
                for prompt_type in shot_settings:
                    hit = filter_single_selector(
                        results_all, plan_length, category, ramifications,
                        model_name, prompt_type, domain, answer_type, subs,
                    )
                    if not hit:
                        triple = (None, None, None)
                    else:
                        score = hit['result']
                        extra = hit['result_other']
                        spread = extra[CONF_KEY] if extra else None
                        triple = (score, spread, hit['stats']['num_not_corrupted'])
                    # Scale to percentages; falsy entries (None, 0) stay as they are.
                    scaled = tuple(round(x * 100, 2) if x else x for x in triple)
                    cell = ''.join(['${', str(scaled[0]), '}_{', str(scaled[1]), '}$'])
                    record[(TO_PRETTY.get(model_name, model_name), prompt_type)] = cell
            records.append(record)
    return pd.DataFrame(records)

In [4]:
# def to_df(results_all, plan_lengths, answer_type, models=PROMPT_MODEL_NAMES,
#           prompt_types = PROMPT_TYPES,
#           domain = ALL_DOMAINS_KEY, subs = WITHOUT_RANDOM_SUB):
#     
#     index = []
#     data = []    
#     for plan_length in plan_lengths:
#         for ramifications in RAMIFICATION_TYPES:
#             index.append((plan_length, TO_PRETTY.get(ramifications, ramifications)))
#             # data_columns = {}
#             for model_name in models:
#                 for prompt_type in prompt_types:
#                     res_obj = filter_single_selector(results_all, plan_length, ALL_QUESTION_CATEGORIES_KEY, ramifications, model_name, prompt_type, domain, answer_type, subs)
#                     # print(res_obj)
#                     if res_obj:
#                         mean = res_obj['result']
#                         sem = None
#                         if res_obj['result_other']:
#                             sem = res_obj['result_other'][CONF_KEY]
#                         not_corrupted = res_obj['stats']['num_not_corrupted']
#                         final_res = (mean, sem, not_corrupted)
#                     else:
#                         final_res = (None, None, None)
#                     final_res = tuple([round(v*100, 2) if v else v for v in final_res ])
#                     final_res = '${'+str(final_res[0])+'}_{'+str(final_res[1])+'}$'
#                     data_columns[(TO_PRETTY.get(model_name, model_name), TO_PRETTY.get(prompt_type, prompt_type))] = final_res
#             data.append(data_columns)
#     return pd.DataFrame(data, index = index)
# 
# def to_df_by_category(results_all, answer_type,  
#                       model_names = PROMPT_MODEL_NAMES,
#                       prompt_types= PROMPT_TYPES,
#                       ramifications = WITHOUT_RAMIFICATIONS,
#                       domain = ALL_DOMAINS_KEY, 
#                       subs = WITHOUT_RANDOM_SUB,
#                       plan_length=19):
# 
#     index = []
#     data = []    
#     for question_category in QUESTION_CATEGORIES:
#         index.append(question_category)
#         data_columns = {}
#         for model_name in model_names:
#             for prompt_type in prompt_types:
#                 res_obj = filter_single_selector(results_all, plan_length, question_category, ramifications, model_name, prompt_type, domain, answer_type, subs)
#                 if res_obj:
#                     mean = res_obj['result']
#                     sem = None
#                     if res_obj['result_other']:
#                         sem = res_obj['result_other'][CONF_KEY]
#                     not_corrupted = res_obj['stats']['num_not_corrupted']
#                     final_res = (mean, sem, not_corrupted)
#                 else:
#                     final_res = (None, None, None)
#                 final_res = tuple([round(v*100, 2) if v else v for v in final_res ])
#                 final_res = '${'+str(final_res[0])+'}_{'+str(final_res[1])+'}$'
#                 data_columns[(TO_PRETTY.get(model_name,model_name), TO_PRETTY.get(prompt_type,prompt_type))] = final_res
#         data.append(data_columns)
#     return pd.DataFrame(data, index = index)

# models_for_plot =  ['gemini', 'gpt-4o'] + ['llama2-13b-chat', 'llama-3-8b-instruct','gemma-7b'] + ['llama-3-8b-tuned','gemma-7b-tuned']
# # model_prompts_combos = [('small-models', SMALL_MODELS, PROMPT_TYPES), ('big-models', BIG_MODELS, ['few_shot_1', 'few_shot_5'])]
# model_prompts_combos = [('all-models', PROMPT_MODEL_NAMES, ['few_shot_1', 'few_shot_5'])]
# 
# for subs in [WITHOUT_RANDOM_SUB, WITH_RANDOM_SUB]:
#     for model_save_name, model_names, prompt_types in model_prompts_combos:
#         df = to_df(stats_all, plan_lengths, answer_type, prompt_types=prompt_types, models=model_names, subs=subs)
#         print(df)
#         
#         caption_nl = f'performance of {model_save_name} on the test set, {subs}'.replace('_', ' ')
#         latex_table = latex_table_mods(to_latex_table(df, caption_nl, label=model_save_name))
#         save_key = f'all.{model_save_name}.{subs}'
#         with open(os.path.join(save_dir, f'{save_key}.tex'), 'w') as f:
#             f.write(latex_table)

# Plot By Category

In [5]:
# plan_length = 19
# for subs in [WITHOUT_RANDOM_SUB, WITH_RANDOM_SUB]:
#     for model_save_name, model_names, prompt_types in model_prompts_combos:
#         df2 = to_df_by_category(stats_all, answer_type, model_names=model_names, prompt_types=prompt_types, subs=subs)
#         print(df2)
#         
#         caption_nl = f'performance of {model_save_name} on the test set by categories, {subs}, pl-{plan_length}'
#         save_key = f'by_categories.{model_save_name}.{subs}'
#         
#         latex_table_all = latex_table_mods(to_latex_table(df2, caption_nl, label=save_key))
#         with open(os.path.join(save_dir, f'{save_key}.tex'), 'w') as f:
#             f.write(latex_table_all)

# By Category By Length

In [7]:
# Build the per-plan-length, per-category table for the 'few_shot_0' prompt,
# without random substitutions and without ramifications. The model columns
# come from whatever `model_names` currently holds in the kernel.
subs = WITHOUT_RANDOM_SUB
rams = WITHOUT_RAMIFICATIONS
prompt_type = 'few_shot_0'
# Disabled sweep over the other substitution / ramification settings:
# for subs in [WITHOUT_RANDOM_SUB, WITH_RANDOM_SUB]:
#     for rams in [WITHOUT_RAMIFICATIONS, WITH_RAMIFICATIONS]:
df3 = to_df_by_len_by_category(stats_all, answer_type, prompt_type, model_names=model_names, subs=subs, ramifications=rams)
# df3
        # # 
# caption_nl = f'performance of on the test set by categories, {subs}, {rams}'.replace('_', ' ')
# save_key = f'by_plan_by_categories.{answer_type}.{prompt_type}.{subs}.{rams}'
# 
# latex_table_all = latex_table_mods(to_latex_table(df3, caption_nl, label=save_key, index=False))
# with open(os.path.join(save_dir, f'{save_key}.tex'), 'w') as f:
#     f.write(latex_table_all)

In [8]:
# Display the per-plan-length, per-category table.
df3

Unnamed: 0,plan pength,question category,"(gpt-4o, few_shot_0)","(Gemini, few_shot_0)","(L-13b, few_shot_0)","(llama-3-8b-instruct, few_shot_0)","(L-7b, few_shot_0)","(G-7b, few_shot_0)","(llama-3-8b-tuned, few_shot_0)","(gemma-7b-tuned, few_shot_0)"
"(1, Object Trk.)",1,Object Trk.,${50.0}_{7.91}$,${32.5}_{7.41}$,${0.0}_{0.0}$,${27.5}_{7.06}$,${17.5}_{6.01}$,${0.0}_{0.0}$,${None}_{None}$,${None}_{None}$
"(1, Fluent Trk.)",1,Fluent Trk.,${31.25}_{8.19}$,${0.0}_{0.0}$,${3.12}_{3.08}$,${3.12}_{3.08}$,${0.0}_{0.0}$,${3.12}_{3.08}$,${None}_{None}$,${None}_{None}$
"(1, State Trk.)",1,State Trk.,${16.13}_{6.61}$,${21.62}_{6.77}$,${0.0}_{0.0}$,${2.56}_{2.53}$,${0.0}_{0.0}$,${0.0}_{0.0}$,${None}_{None}$,${None}_{None}$
"(1, Action Exec.)",1,Action Exec.,${57.5}_{7.82}$,${38.46}_{7.79}$,${0.0}_{0.0}$,${22.5}_{6.6}$,${17.5}_{6.01}$,${0.0}_{0.0}$,${None}_{None}$,${None}_{None}$
"(1, Effects)",1,Effects,${6.06}_{4.15}$,${13.16}_{5.48}$,${0.0}_{0.0}$,${5.0}_{3.45}$,${0.0}_{0.0}$,${0.0}_{0.0}$,${None}_{None}$,${None}_{None}$
"(1, Num. Reas.)",1,Num. Reas.,${25.0}_{6.85}$,${17.5}_{6.01}$,${0.0}_{0.0}$,${7.5}_{4.16}$,${0.0}_{0.0}$,${0.0}_{0.0}$,${None}_{None}$,${None}_{None}$
"(1, Hallucination)",1,Hallucination,${57.78}_{7.36}$,${26.67}_{6.59}$,${4.44}_{3.07}$,${8.89}_{4.24}$,${28.89}_{6.76}$,${4.44}_{3.07}$,${None}_{None}$,${None}_{None}$
"(1, Composite)",1,Composite,${55.0}_{7.87}$,${17.95}_{6.15}$,${0.0}_{0.0}$,${37.5}_{7.65}$,${7.5}_{4.16}$,${22.5}_{6.6}$,${None}_{None}$,${None}_{None}$
"(1, AVG)",1,AVG,${39.2}_{2.81}$,${21.61}_{2.34}$,${0.95}_{0.55}$,${11.23}_{1.9}$,${9.49}_{1.65}$,${3.8}_{1.08}$,${None}_{None}$,${None}_{None}$
"(10, Object Trk.)",10,Object Trk.,${55.0}_{7.87}$,${47.5}_{7.9}$,${2.5}_{2.47}$,${40.0}_{7.75}$,${17.5}_{6.01}$,${2.5}_{2.47}$,${None}_{None}$,${None}_{None}$


# By Few Shot

In [None]:
# Models compared across prompt types (the "-tuned" variants are left out).
# A cell-specific name is used instead of rebinding `model_names`, so that
# re-running an earlier cell that reads `model_names` still gets the list
# defined at the top of the notebook.
few_shot_model_names = ['gpt-4o', 'gemini', 'llama2-13b-chat', 'llama-3-8b-instruct', 'llama2-7b-chat', 'gemma-7b']

subs = WITHOUT_RANDOM_SUB
rams = WITHOUT_RAMIFICATIONS
# Disabled sweep over the other substitution / ramification settings:
# for subs in [WITHOUT_RANDOM_SUB, WITH_RANDOM_SUB]:
#     for rams in [WITHOUT_RAMIFICATIONS, WITH_RAMIFICATIONS]:
df4 = to_df_few_shot(stats_all, answer_type, model_names=few_shot_model_names, subs=subs, ramifications=rams)

# Caption and file name both encode the substitution / ramification setting;
# underscores are replaced by spaces in the caption text only.
caption_nl = f'performance on the test set by few shots, {subs}, {rams}'.replace('_', ' ')
save_key = f'by_few_shot.{subs}.{rams}'

# Render the table as LaTeX, apply the notebook's post-processing, and write it
# to <save_dir>/<save_key>.tex.
latex_table_all = latex_table_mods(to_latex_table(df4, caption_nl, label=save_key, index=False))
with open(os.path.join(save_dir, f'{save_key}.tex'), 'w') as f:
    f.write(latex_table_all)

In [None]:
# Display the per-prompt-type table.
df4

# By Few Shot by Category

In [None]:
# Models shown in the per-category few-shot table. A cell-specific name is used
# instead of rebinding `model_names`, so that re-running an earlier cell that
# reads `model_names` still gets the list defined at the top of the notebook.
few_shot_category_model_names = ['gemini', 'llama2-13b-chat', 'gemma-7b']

subs = WITHOUT_RANDOM_SUB
rams = WITHOUT_RAMIFICATIONS
# Disabled sweep over the other substitution / ramification settings:
# for subs in [WITHOUT_RANDOM_SUB, WITH_RANDOM_SUB]:
#     for rams in [WITHOUT_RAMIFICATIONS, WITH_RAMIFICATIONS]:
df5 = to_df_few_shot_by_category(stats_all, answer_type, model_names=few_shot_category_model_names, subs=subs, ramifications=rams)

# The caption names the per-category breakdown so it differs from the caption
# of the plain few-shot table; underscores become spaces in the caption only.
caption_nl = f'performance on the test set by few shots and question category, {subs}, {rams}'.replace('_', ' ')
save_key = f'by_few_shot_by_category.{subs}.{rams}'

# Render the table as LaTeX, apply the notebook's post-processing, and write it
# to <save_dir>/<save_key>.tex.
latex_table_all = latex_table_mods(to_latex_table(df5, caption_nl, label=save_key, index=False))
with open(os.path.join(save_dir, f'{save_key}.tex'), 'w') as f:
    f.write(latex_table_all)

In [None]:
# Display the per-category few-shot table.
df5