In [26]:
import pandas as pd
from typing import List
# from experiments.musique.inference_only import macro_averaging
import vars
import os
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from scipy.stats import describe
from thefuzz import fuzz
from copy import deepcopy
from utils import is_significantly_different, load_json, load_jsonlines

from collections import defaultdict


def macro_averaging(df: pd.DataFrame, metrics: List[str], multi_level_averaging: List[str]):
    """
    Do macro-averaging over the given metrics and multi-level averaging categories.

    The frame is grouped by every column in `multi_level_averaging` and the mean of
    each metric is taken per group. The last grouping column is then dropped and the
    per-group means are averaged again, until no grouping column is left.

    Args:
        df: Frame holding the metric columns and the grouping columns.
        metrics: Names of the columns to average.
        multi_level_averaging: Grouping columns, ordered from the outermost level
            to the innermost one. The list is left untouched.

    Returns:
        The frame produced by the last averaging round: one row per value of the
        first grouping column, holding that column plus one column per metric.
        When `multi_level_averaging` is empty, `df` is returned unchanged.
    """
    # Work on a copy: popping from the argument itself would leave the caller's
    # list empty after the call.
    levels = list(multi_level_averaging)
    while levels:
        # Per-group mean of the metric columns only; `observed=True` skips
        # categorical combinations that never occur in the data.
        df = df.groupby(levels, observed=True)[list(metrics)].mean().reset_index()
        # Drop the innermost level before the next round of averaging.
        levels.pop()
    return df
# Show the kernel's current working directory.
os.getcwd()

'/datastor1/zliu/mend/notebooks'

In [149]:
# Results file analysed by the cells below.
# NOTE(review): the active path is machine-specific; the commented line is the
# $PROJ_PLAYGROUND-based form of the same relative location.
# df = pd.read_excel(f"{os.getenv('PROJ_PLAYGROUND')}/mend/ripple_exp_output/llama3.2-1B-eos-sft/all/base_n=500_prompt=no_w-gen_wo-icl_ice=True.xlsx")
df = pd.read_excel("/u/zliu/datastor1/mend/ripple_exp_output/llama3.2-1B-eos-sft/all/base_n=500_prompt=no_w-gen_wo-icl_ice=True.xlsx")
print("Num of rows:", len(df))
# Not the last expression of the cell, so this summary is computed but never shown.
df.describe()[["exact_match", "llm_accuracy"]].round(2)
# Split the rows by evaluation stage.
pre_edit_df = df.loc[df["stage"].eq("pre-edit")]
post_edit_df = df.loc[df["stage"].eq("post-edit")]
len(post_edit_df)

Num of rows: 5223


0

In [150]:
# df.head()

In [151]:
# Load the base results through $PROJ_PLAYGROUND. Indexing os.environ raises a
# KeyError naming the variable when it is unset, instead of silently building a
# path that starts with "None/".
base_df = pd.read_excel(f"{os.environ['PROJ_PLAYGROUND']}/mend/ripple_exp_output/llama3.2-1B-eos-sft/all/base_n=500_prompt=no_w-gen_wo-icl_ice=True.xlsx")
# Report the size of the frame that was just loaded (this used to print len(df)).
print("Num of rows:", len(base_df))

Num of rows: 5223


In [153]:
# df = post_edit_df
# For every question tag: number of scored rows and mean llm_accuracy in percent.
for tag in (
    "efficacy_Logical_Generalization",
    "efficacy_Compositionality_I",
    "efficacy_Compositionality_II",
    "efficacy_Subject_Aliasing",
    "specificity_Relation_Specificity",
    "specificity_Forgetfulness",
):
    tag_stats = df.loc[df["question_tag"] == tag].describe()["llm_accuracy"]
    print(tag, f"(n={tag_stats['count']})")

    print((tag_stats["mean"] * 100).round(1))
    print()

efficacy_Logical_Generalization (n=230.0)
31.7

efficacy_Compositionality_I (n=1679.0)
24.6

efficacy_Compositionality_II (n=273.0)
21.8

efficacy_Subject_Aliasing (n=777.0)
38.5

specificity_Relation_Specificity (n=1982.0)
30.3

specificity_Forgetfulness (n=282.0)
13.3



In [154]:
import vars


# Keep the first three fields of every record as a tuple; the split cells below
# test (edit_input, prop_type, question) triples for membership in this list.
verbtaim_queries = [
    (record[0], record[1], record[2])
    for record in load_json(f"{vars.DATA_DIR}/ripple_edits/em_verbatim_queries_eff+spec.json")
]
len(verbtaim_queries)

1538

In [155]:
def _collect_test_queries(example, question_types):
    """Gather the usable test queries of `example` for the given question types.

    A query is kept when at least one of its answers has a non-blank `value`.
    Each kept query is updated in place: `question_type` is set to the key it was
    found under, and `answers` is flattened to a list of strings (the value plus
    its aliases, for every answer whose value is non-blank).

    Blank aliases are dropped. An empty string is a substring of every string,
    so keeping one would make every later `answer in text` check succeed.
    """
    collected = []
    for question_type in question_types:
        for instance in example[question_type]:
            for q in instance["test_queries"]:
                answers = [
                    text
                    for a in q["answers"]
                    if a["value"].strip()
                    for text in [a["value"]] + a["aliases"]
                    if text.strip()
                ]
                if not answers:
                    continue
                q["question_type"] = question_type
                q["answers"] = answers
                collected.append(q)
    return collected


ripple_edits_examples = load_jsonlines(f"{vars.DATA_DIR}/ripple_edits/test_aug.jsonl")
non_zero_outerloop_count = 0
strong_meta_examples = []
weak_meta_examples = []
# Flat list of queries: per example, efficacy queries first, then specificity
# queries. The cells below index this list with the DataFrame row label, so the
# order must not change.
queries = []
for example in ripple_edits_examples:
    efficacy_queries = _collect_test_queries(
        example,
        ["Logical_Generalization", "Compositionality_I", "Compositionality_II", "Subject_Aliasing"],
    )
    queries.extend(efficacy_queries)
    # Every example is expected to contribute at least one efficacy query.
    assert len(efficacy_queries) > 0

    specificity_queries = _collect_test_queries(example, ["Relation_Specificity", "Forgetfulness"])
    queries.extend(specificity_queries)
    

In [156]:
# Peek at the first two rows to check the column layout.
df.head(2)

Unnamed: 0,id,question_type,question_tag,relation,input,stage,question,answer,predicted_answer_idx,predicted_answer,exact_match,llm_accuracy
0,0,efficacy,efficacy_Compositionality_II,COUNTRY,[[The name of the continent which United Arab ...,pre-edit,Imagine that the name of the continent which U...,Indian Ocean,0,Persian Gulf,0,0.0
1,0,efficacy,efficacy_Compositionality_II,COUNTRY,[[The name of the continent which United Arab ...,pre-edit,Imagine that the name of the continent which U...,Indian Ocean,0,Indian Ocean,1,1.0


In [161]:
# Markers that delimit the edit statement inside the `edit_column_name` column:
# the cells below take the text between the first `bos` and the first `eos`.
# Alternative setting, kept for reference:
# bos = "<|begin_of_text|>"
# eos = "<|end_of_text|>"
# edit_column_name = "edit_input"
# below is for ICE and base
bos = "[["  # opening marker of the edit statement
eos = "]]"  # closing marker of the edit statement
edit_column_name = "input"  # column that holds the (marked-up) edit text

In [170]:
# Switches read by the split / exact-match cells below.
ice = True  # when True, the "Imagine that <edit>" prefix is removed from each question
original_mend = False  # when True, a "." is appended to the extracted edit text
memit = False  # not read by any cell shown in this notebook

In [172]:
efficacy_df = df.loc[df["question_type"] == "efficacy"]

print("# efficacy:", len(efficacy_df))

# A row counts as "verbatim" when its (edit, property type, question) triple is
# listed in `verbtaim_queries` AND one of the gold answers occurs in the edit
# text; every other row goes to the non-verbatim list.
verbtaim_df_content = []
non_verbtaim_df_content = []
for r_i, r in efficacy_df.iterrows():
    # "efficacy_Subject_Aliasing" -> "Subject_Aliasing"
    prop_type = r["question_tag"].replace(r["question_type"] + "_", "")

    # Text between the markers when both are present, otherwise the raw cell.
    raw_edit = r[edit_column_name]
    edit_input = raw_edit
    if bos in raw_edit and eos in raw_edit:
        edit_input = raw_edit[raw_edit.index(bos) + len(bos):raw_edit.index(eos)]
    if original_mend:
        edit_input += "."

    question = r["question"]
    if ice:
        # Strip the in-context edit ("Imagine that <edit>") from the question.
        icl_edit = "Imagine that " + edit_input[0].lower() + edit_input[1:]
        assert icl_edit in question, question
        question = question.replace(icl_edit, "", 1).strip()

    # `queries` is indexed with the row label, so the frame and the query list
    # must be aligned; the assert checks that for this row.
    gold = queries[r_i]
    assert question == gold["prompt"], str(r_i) + "@@" + question + "!=" + gold["prompt"]
    answer_full_sent = gold["prompt"] + " " + str(r["predicted_answer"])
    r["rippleedit_exact_match"] = float(any(a in answer_full_sent for a in gold["answers"]))

    is_verbatim = (edit_input, prop_type, question) in verbtaim_queries and any(
        a in edit_input for a in gold["answers"]
    )
    if is_verbatim:
        verbtaim_df_content.append(r)
    else:
        non_verbtaim_df_content.append(r)

verbtaim_df = pd.DataFrame(verbtaim_df_content)
non_verbtaim_df = pd.DataFrame(non_verbtaim_df_content)
print("# verbtaim:", len(verbtaim_df))
print("# non-verbtaim:", len(non_verbtaim_df))

# efficacy: 2959
# verbtaim: 1373
# non-verbtaim: 1586


In [173]:
# Mean llm_accuracy (percent, one decimal) of the two efficacy subsets.
print("verbtaim efficacy (llm_acc): ", (verbtaim_df.describe().loc["mean", "llm_accuracy"] * 100).round(1))
print("non-verbtaim efficacy (llm_acc): ", (non_verbtaim_df.describe().loc["mean", "llm_accuracy"] * 100).round(1))

verbtaim efficacy (llm_acc):  35.6
non-verbtaim efficacy (llm_acc):  22.4


In [174]:
# Compare llm_accuracy on the verbatim efficacy subset: `propmend_verbtaim_df`
# (first argument) against the current `verbtaim_df` (second argument), alpha = 0.05.
# NOTE(review): in the visible notebook `propmend_verbtaim_df` is only assigned in a
# commented-out cell, so this call fails on a fresh kernel unless that snapshot
# was taken beforehand.
is_significantly_different(
    propmend_verbtaim_df["llm_accuracy"].values,
    verbtaim_df["llm_accuracy"].values,
    alpha=0.05,
    verbose=True
)

Score_A avg: 0.757
Score_B avg: 0.356
Delta (B - A): -0.401
p: 0.0 (threshold = 0.05)
Significant


np.True_

In [175]:
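# Snapshot of the current efficacy split, kept commented out. Uncomment and run it
# while `df` holds the run to compare against (presumably the PropMEND results);
# the significance-test cells read these `propmend_*` frames.
# propmend_verbtaim_df = verbtaim_df
# propmend_non_verbtaim_df = non_verbtaim_df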

In [176]:
specificity_df = df[(df["question_type"] == "specificity")]

print("# specificity:", len(specificity_df))

# A row counts as "verbatim" when its (edit, property type, question) triple is
# listed in `verbtaim_queries` AND one of the gold answers occurs in the edit
# text; every other row goes to the non-verbatim list.
verbtaim_spec_df_content = []
non_verbtaim_spec_df_content = []
for r_i, r in specificity_df.iterrows():
    # "specificity_Forgetfulness" -> "Forgetfulness"
    prop_type = r["question_tag"].replace(r["question_type"] + "_", "")

    # Slice only when BOTH markers are present (same rule as the efficacy cell).
    # Checking `bos` alone made `.index(eos)` raise ValueError on text that
    # contains the opening marker but not the closing one.
    if bos in r[edit_column_name] and eos in r[edit_column_name]:
        edit_input = r[edit_column_name][r[edit_column_name].index(bos) + len(bos):r[edit_column_name].index(eos)]
    else:
        edit_input = r[edit_column_name]
    if original_mend:
        edit_input += "."

    if ice:
        question = r["question"]
        # Strip the in-context edit ("Imagine that <edit>") from the question.
        icl_edit = "Imagine that " + edit_input[0].lower() + edit_input[1:]
        assert icl_edit in question, question
        question = question.replace(icl_edit, "", 1).strip()
    else:
        question = r["question"]

    # `queries` is indexed with the row label, so the frame and the query list
    # must be aligned; on a mismatch, report both prompts.
    assert question == queries[r_i]["prompt"], str(r_i) + "@@" + question + "!=" + queries[r_i]["prompt"]
    answer_full_sent = queries[r_i]["prompt"] + " " + str(r["predicted_answer"])
    r["rippleedit_exact_match"] = float(any(a in answer_full_sent for a in queries[r_i]["answers"]))
    if (edit_input, prop_type, question) not in verbtaim_queries:
        non_verbtaim_spec_df_content.append(r)
        continue
    if not any(a in edit_input for a in queries[r_i]["answers"]):
        non_verbtaim_spec_df_content.append(r)
        continue
    verbtaim_spec_df_content.append(r)

verbtaim_spec_df = pd.DataFrame(verbtaim_spec_df_content)
non_verbtaim_spec_df = pd.DataFrame(non_verbtaim_spec_df_content)
print("# verbtaim:", len(verbtaim_spec_df))
print("# non-verbtaim:", len(non_verbtaim_spec_df))

# specificity: 2264
# verbtaim: 165
# non-verbtaim: 2099


In [177]:
# Mean llm_accuracy (percent, one decimal) of the two specificity subsets.
print("verbtaim specificity (llm_acc): ", (verbtaim_spec_df.describe().loc["mean", "llm_accuracy"] * 100).round(1))
print("non-verbtaim specificity (llm_acc): ", (non_verbtaim_spec_df.describe().loc["mean", "llm_accuracy"] * 100).round(1))

verbtaim specificity (llm_acc):  17.8
non-verbtaim specificity (llm_acc):  29.0


In [178]:
# Mean llm_accuracy (in percent) of the saved `propmend_verbtaim_spec_df` frame.
# NOTE(review): that frame is not assigned in any cell shown in this notebook.
propmend_verbtaim_spec_df.describe().loc["mean", "llm_accuracy"] * 100

np.float64(24.121212121212118)

In [179]:
# Compare llm_accuracy on the verbatim specificity subset: `propmend_verbtaim_spec_df`
# (first argument) against the current `verbtaim_spec_df` (second argument), alpha = 0.05.
# NOTE(review): `propmend_verbtaim_spec_df` is not assigned in any cell shown here.
is_significantly_different(
    propmend_verbtaim_spec_df["llm_accuracy"].values,
    verbtaim_spec_df["llm_accuracy"].values,
    alpha=0.05,
    verbose=True
)

Score_A avg: 0.241
Score_B avg: 0.178
Delta (B - A): -0.064
p: 0.048699999999999966 (threshold = 0.05)
Significant


np.True_

In [148]:
# Compare llm_accuracy on the non-verbatim specificity subset:
# `propmend_non_verbtaim_spec_df` (first argument) against the current
# `non_verbtaim_spec_df` (second argument), alpha = 0.05.
# NOTE(review): this cell's execution count is lower than that of the cells above,
# and its recorded Score_B differs from the non-verbatim specificity mean printed
# earlier, so the stored output may come from a different `df`; re-run to confirm.
is_significantly_different(
    propmend_non_verbtaim_spec_df["llm_accuracy"].values,
    non_verbtaim_spec_df["llm_accuracy"].values,
    alpha=0.05,
    verbose=True
)

Score_A avg: 0.354
Score_B avg: 0.263
Delta (B - A): -0.091
p: 0.0 (threshold = 0.05)
Significant


np.True_

### Calculate EM in RippleEdits' fashion

In [21]:
# Copy every row of `df` and add `rippleedit_exact_match`: 1.0 when any gold
# answer string occurs in "<prompt> <predicted_answer>", else 0.0.
new_df_content = []

for r_i, r in df.iterrows():
    new_r = deepcopy(r)
    # Slice only when BOTH markers are present (same rule as the efficacy cell).
    # Checking `bos` alone made `.index(eos)` raise ValueError on text that
    # contains the opening marker but not the closing one.
    if bos in r[edit_column_name] and eos in r[edit_column_name]:
        edit_input = r[edit_column_name][r[edit_column_name].index(bos) + len(bos):r[edit_column_name].index(eos)]
    else:
        edit_input = r[edit_column_name]
    if original_mend:
        edit_input += "."
    if ice:
        question = r["question"]
        # Strip the in-context edit ("Imagine that <edit>") from the question.
        icl_edit = "Imagine that " + edit_input[0].lower() + edit_input[1:]
        assert icl_edit in question, r
        question = question.replace(icl_edit, "", 1).strip()
    else:
        question = r["question"]
    # `queries` is indexed with the row label, so the frame and the query list
    # must be aligned; on a mismatch, report both prompts.
    assert question == queries[r_i]["prompt"], str(r_i) + "@@" + question + "!=" + queries[r_i]["prompt"]
    answer_full_sent = queries[r_i]["prompt"] + " " + str(r["predicted_answer"])
    new_r["rippleedit_exact_match"] = float(any(a in answer_full_sent for a in queries[r_i]["answers"]))
    new_df_content.append(new_r)
new_df = pd.DataFrame(new_df_content)

In [22]:
# From here on `df` is the frame that carries the `rippleedit_exact_match` column.
df = new_df
# For every question tag: number of scored rows and mean RippleEdits-style EM in percent.
for tag in (
    "efficacy_Logical_Generalization",
    "efficacy_Compositionality_I",
    "efficacy_Compositionality_II",
    "efficacy_Subject_Aliasing",
    "specificity_Relation_Specificity",
    "specificity_Forgetfulness",
):
    tag_stats = df.loc[df["question_tag"] == tag].describe()["rippleedit_exact_match"]
    print(tag, f"(n={tag_stats['count']})")

    print((tag_stats["mean"] * 100).round(1))
    print()

efficacy_Logical_Generalization (n=230.0)
13.5

efficacy_Compositionality_I (n=1679.0)
12.4

efficacy_Compositionality_II (n=273.0)
59.0

efficacy_Subject_Aliasing (n=777.0)
77.9

specificity_Relation_Specificity (n=1982.0)
20.1

specificity_Forgetfulness (n=282.0)
47.5



In [23]:
# Mean RippleEdits-style exact match (percent, one decimal) of the two efficacy subsets.
print("verbtaim efficacy (RE_exact_match): ", (verbtaim_df.describe().loc["mean", "rippleedit_exact_match"] * 100).round(1))
print("non-verbtaim efficacy (RE_exact_match): ", (non_verbtaim_df.describe().loc["mean", "rippleedit_exact_match"] * 100).round(1))

verbtaim efficacy (RE_exact_match):  69.7
non-verbtaim efficacy (RE_exact_match):  3.1


In [24]:
# Mean RippleEdits-style exact match (percent, one decimal) of the two specificity subsets.
print("verbtaim specificity (RE_exact_match): ", (verbtaim_spec_df.describe().loc["mean", "rippleedit_exact_match"] * 100).round(1))
print("non-verbtaim specificity (RE_exact_match): ", (non_verbtaim_spec_df.describe().loc["mean", "rippleedit_exact_match"] * 100).round(1))

verbtaim specificity (RE_exact_match):  97.0
non-verbtaim specificity (RE_exact_match):  17.8
