In [1]:

import pandas as pd
import io, vars
import os
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from scipy.stats import describe
from thefuzz import fuzz
from utils import is_significantly_different, load_jsonlines, dump_json
from collections import Counter
from typing import List, Dict, Tuple

# NOTE(review): this appears to sit in the middle of the import/definition cell, so its
# return value is neither stored nor displayed — confirm whether it is still needed.
os.getcwd()
def macro_averaging(df: pd.DataFrame, metrics: List[str], multi_level_averaging: List[str]):
    """
    Macro-average `metrics` by collapsing the grouping columns one level at a time.

    Rows are first averaged within every combination of the columns in
    `multi_level_averaging`. The last grouping column is then dropped and the
    group means are averaged again over the remaining columns, and so on until
    no grouping column is left.

    Args:
        df: Table holding the metric columns and the grouping columns.
        metrics: Names of the columns to average.
        multi_level_averaging: Grouping columns; the last one is averaged out
            first. The caller's list is left untouched.

    Returns:
        The frame produced by the final pass: one row per observed value of the
        first grouping column, containing that column plus the metric columns.
        When `multi_level_averaging` is empty, `df` is returned as-is.
    """
    # Work on a copy: popping from the argument itself would empty the caller's list.
    remaining_levels = list(multi_level_averaging)
    while remaining_levels:
        # Mean of each metric within every observed combination of the remaining levels.
        group_means = df.groupby(remaining_levels, observed=True)[metrics].mean()
        # Turn the group keys back into ordinary columns for the next pass.
        df = group_means.reset_index()
        # Drop the innermost level; the next pass averages over it.
        remaining_levels.pop()
    return df

  from .autonotebook import tqdm as notebook_tqdm


# Read out results and calculate aggregation

In [41]:
# Workbook with the evaluation results analysed in this section.
results_path = "/data/users/zliu/mend/controlled_ripple_exp_output/qwen2.5-1.5B-qa-sft-qa-additional-estimated/controlled_ripple_edit/memit(controlled_ripple_4K)_eval_loss=clm_input=seen_n=350_prompt=no_w-gen_wo-icl_e+s_4K_test_ood-question.xlsx"
df = pd.read_excel(results_path)

In [42]:
# Rows keyed "unalias_question" are relabelled as specificity questions.
is_unalias = df["question_key"] == "unalias_question"
df.loc[is_unalias, "question_type"] = "specificity"

# Sanity check: both question types must have the same number of rows.
n_efficacy = (df["question_type"] == "efficacy").sum()
n_specificity = (df["question_type"] == "specificity").sum()
assert n_efficacy == n_specificity

In [43]:
# Print the row count and the mean `llm_accuracy` (as a percentage) per question type.
for question_type in ("efficacy", "specificity"):
    type_mask = df["question_type"] == question_type
    accuracy_stats = df.loc[type_mask, "llm_accuracy"].describe()

    print(question_type, f"(n={accuracy_stats['count']})")
    print((accuracy_stats['mean'] * 100).round(1))
    

efficacy (n=447.0)
10.1
specificity (n=447.0)
89.7


In [47]:
# Summary statistics of `llm_accuracy` for specificity rows, keeping one row per distinct question.
specificity_rows = df[df["question_type"] == "specificity"]
unique_specificity = specificity_rows.drop_duplicates(subset=["question"])
unique_specificity.describe()[["llm_accuracy"]]

Unnamed: 0,llm_accuracy
count,151.0
mean,0.913245
std,0.208383
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


# Test for a significant difference between two tables

In [26]:
# The two result workbooks to compare (A = df1, B = df2).
results_path_a = "/data/users/zliu/mend/controlled_ripple_exp_output/qwen2.5-1.5B-qa-sft-qa-additional-estimated-wiki/controlled_ripple_edit/memit(wikipedia)_eval_loss=clm_input=seen_n=500_prompt=no_w-gen_wo-icl_e+s_4K_test_id-question.xlsx"
results_path_b = "/data/users/zliu/mend/controlled_ripple_exp_output/qwen_share_max_4K_14_27/controlled_ripple_edit/mend_eval_loss=clm_input=seen_n=500_prompt=no_w-gen_wo-icl_4K_test_id-question.xlsx"

df1 = pd.read_excel(results_path_a)
df2 = pd.read_excel(results_path_b)

In [27]:
# Apply the same relabelling to both tables: "unalias_question" rows count as specificity.
for results_df in (df1, df2):
    unalias_mask = results_df["question_key"] == "unalias_question"
    results_df.loc[unalias_mask, "question_type"] = "specificity"

In [28]:
# Compare the efficacy `llm_accuracy` scores of the two tables.
efficacy_scores_a = df1.loc[df1["question_type"] == "efficacy", "llm_accuracy"].to_list()
efficacy_scores_b = df2.loc[df2["question_type"] == "efficacy", "llm_accuracy"].to_list()
is_significantly_different(efficacy_scores_a, efficacy_scores_b, verbose=True)

Score_A avg: 0.16
Score_B avg: 0.64
Delta (B - A): 0.5
p: 0.0 (threshold = 0.05)
Significant


True

In [29]:
# Compare the specificity `llm_accuracy` scores of the two tables.
specificity_scores_a = df1.loc[df1["question_type"] == "specificity", "llm_accuracy"].to_list()
specificity_scores_b = df2.loc[df2["question_type"] == "specificity", "llm_accuracy"].to_list()
is_significantly_different(specificity_scores_a, specificity_scores_b, verbose=True)

Score_A avg: 0.91
Score_B avg: 0.94
Delta (B - A): 0.0
p: 0.0 (threshold = 0.05)
Significant


True

# Merge CPT results

In [20]:
# Directory holding one result workbook per example, named "<idx>_eval_results_e.xlsx".
individual_dir = "/data/users/zliu/mend/controlled_ripple_exp_output/Qwen2.5-1.5B-qa-sft-qa-additional_clm-baseline_lr=1e-05_epoch=4.0_tunable-params=midupper-mlp/individual_results_text_ood-relation"
# midupper3-mlp

# `n_data` is the number of example indices probed below; it depends on the split
# encoded in the directory suffix.
if individual_dir.endswith("_id"):
    n_data = 500
else:
    # str.endswith accepts a tuple of suffixes, so one call covers every OOD variant.
    assert individual_dir.endswith(("_ood", "_ood-entity", "_ood-relation"))
    n_data = 350

file_name_format = "{idx}_eval_results_e.xlsx"
individual_dfs = []
missing_ids = []
for i in range(n_data):
    file_name = os.path.join(individual_dir, file_name_format.format(idx=i))
    if not os.path.exists(file_name):
        # Remember the gap instead of failing so every missing index is reported at once.
        missing_ids.append(i)
        continue
    # Append directly rather than binding the notebook-wide name `df`, which the
    # "Read out results" cells use for a different table.
    individual_dfs.append(pd.read_excel(file_name))

# Displayed as the cell output: space-separated indices with no result file (empty if none).
" ".join(str(i) for i in missing_ids)

''

In [21]:
# Stack the per-example tables into one frame with a fresh 0..n-1 index.
all_df = pd.concat(individual_dfs, ignore_index=True)

# Rows keyed "unalias_question" are relabelled as specificity questions.
unalias_rows = all_df["question_key"] == "unalias_question"
all_df.loc[unalias_rows, "question_type"] = "specificity"

In [22]:
# Sanity check: the merged table must contain as many efficacy rows as specificity rows.
question_type_counts = all_df["question_type"].value_counts()
assert question_type_counts.get("efficacy", 0) == question_type_counts.get("specificity", 0)

In [23]:
# Number of rows in the merged table.
all_df.shape[0]

842

In [24]:
f"{individual_dir}/../all_results_id.xlsx",

('/data/users/zliu/mend/synstory_exp_output/Qwen2.5-1.5B-eos-sft-template-format-curated-v1-lr2e-6-sample-10_clm-baseline_lr=1e-05_epoch=4.0_tunable-params=midupper-mlp/individual_results_text_ood-relation/../all_results_id.xlsx',)

In [25]:
# Write the merged table one level above `individual_dir`, naming the workbook
# after the split suffix of that directory.
for split_suffix in ("_id", "_ood-entity", "_ood-relation"):
    if individual_dir.endswith(split_suffix):
        break
else:
    # None of the explicit suffixes matched, so the directory must be the plain OOD split.
    assert individual_dir.endswith("_ood")
    split_suffix = "_ood"

all_df.to_excel(f"{individual_dir}/../all_results{split_suffix}.xlsx", index=False)