In [1]:

import pandas as pd
# from experiments.musique.inference_only import macro_averaging
from knowledge_propagation.utils import io, vars, extractor
import os
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from scipy.stats import describe
from thefuzz import fuzz
from knowledge_propagation.utils.eval import is_significantly_different
from collections import Counter
from knowledge_propagation.modules.evaluators import (
    ExactMatchEvaluator,
    RougeEvaluator,
    OpenAIEvaluator,
)
llm_evaluator = OpenAIEvaluator()
from typing import List, Dict, Tuple

os.getcwd()
def macro_averaging(df: pd.DataFrame, metrics: List[str], multi_level_averaging: List[str]):
    """
    Macro-average `metrics` hierarchically over the given grouping columns.

    The frame is grouped by all columns in `multi_level_averaging` and every
    metric is averaged per group. The last grouping column is then dropped and
    the resulting group means are averaged again, and so on, until the
    averaging over the first grouping column alone has been done.

    Args:
        df: frame holding the grouping columns and the metric columns.
        metrics: names of the columns to average.
        multi_level_averaging: grouping columns; the last one is averaged out
            first. The list is not modified.

    Returns:
        A frame with the first grouping column and one mean column per metric.
        If `multi_level_averaging` is empty, `df` is returned unchanged.
    """
    # Work on a copy: popping from the caller's list would leave it empty and
    # turn a second call with the same list into a silent no-op.
    levels = list(multi_level_averaging)
    while levels:
        # Mean of each metric per group, then flatten the group keys back into
        # ordinary columns so the next (coarser) level can group on them.
        df = df.groupby(levels, observed=True)[metrics].mean().reset_index()
        levels.pop()
    return df

# Read out results and calculate aggregation

In [15]:
df = pd.read_excel("/u/zliu/datastor1/mend/synstory_exp_output/4K_heavy_share_layer4_15/syn_story/mend_eval_loss=clm_input=seen_n=350_prompt=no_w-gen_wo-icl_4K_test_ood-question.xlsx")

In [16]:
# Rows whose question_key is "unalias_question" are re-labelled as "specificity".
is_unaliased = df["question_key"] == "unalias_question"
df.loc[is_unaliased, "question_type"] = "specificity"

# Sanity check: both question types must have the same number of rows.
n_efficacy = len(df[df["question_type"] == "efficacy"])
n_specificity = len(df[df["question_type"] == "specificity"])
assert n_efficacy == n_specificity

In [17]:
df[df["question_type"] == "efficacy"]["llm_accuracy"].mean()

np.float64(0.18277404921700224)

In [18]:
# Report row count and mean llm_accuracy (as a percentage) per question type.
for question_type in ("efficacy", "specificity"):
    df_question = df[df["question_type"] == question_type]
    accuracy_stats = df_question["llm_accuracy"].describe()

    print(question_type, f"(n={accuracy_stats['count']})")
    print((accuracy_stats["mean"] * 100).round(1))
    

efficacy (n=447.0)
18.3
specificity (n=447.0)
77.5


In [14]:
df[df["question_type"] == "specificity"].drop_duplicates(subset=["question"], inplace=False).describe()[["llm_accuracy",]]

Unnamed: 0,llm_accuracy
count,148.0
mean,0.847297
std,0.285344
min,0.0
25%,0.7
50%,1.0
75%,1.0
max,1.0


In [60]:
0.9118

0.9118

In [47]:
0.9476

0.9476

In [9]:
# Counter(df[(df["question_type"] == "efficacy") & (df["llm_accuracy"] > 0.5)]["question"])

# Test significance between two tables

In [61]:
df1 = pd.read_excel("/u/zliu/datastor1/mend/synstory_exp_output/4K_heavy_noshare_layer4_15/syn_story/mend_eval_loss=clm_input=seen_n=350_prompt=no_w-gen_wo-icl_4K_test_ood-question.xlsx")
df2 = pd.read_excel("/u/zliu/datastor1/mend/synstory_exp_output/llama3.2-1B-eos-sft-template-format-curated-v1-lr2e-6-sample-10-estimated-wiki/syn_story/memit(wikipedia)_eval_loss=clm_input=seen_n=350_prompt=no_w-gen_wo-icl_e+s_4K_test_ood-question.xlsx")

In [62]:
# Apply the same re-labelling to both result tables: rows whose question_key
# is "unalias_question" become "specificity".
for results in (df1, df2):
    results.loc[results["question_key"] == "unalias_question", "question_type"] = "specificity"


In [63]:
df1[df1["question_type"] == "efficacy"]["llm_accuracy"].mean()

np.float64(0.16778523489932887)

In [64]:
# Compare the efficacy llm_accuracy scores of the two result tables.
efficacy_scores_a = df1.loc[df1["question_type"] == "efficacy", "llm_accuracy"].to_list()
efficacy_scores_b = df2.loc[df2["question_type"] == "efficacy", "llm_accuracy"].to_list()
is_significantly_different(efficacy_scores_a, efficacy_scores_b, verbose=True)

Score_A avg: 0.168
Score_B avg: 0.138
Delta (B - A): -0.03
p: 0.053300000000000014 (threshold = 0.05)
*Not* Significant


np.False_

# Merge CPT results

In [24]:
individual_dir = "/u/zliu/datastor1/mend/synstory_exp_output/Llama-3.2-3B-eos-sft-template-format-curated-v1-lr2e-6-sample-10_clm-baseline_lr=1e-05_epoch=4.0_tunable-params=all/individual_results_text_ood-relation"
# midupper3-mlp

# The number of per-example result files to look for is chosen from the
# directory-name suffix: 500 for "_id", 350 for the OOD variants.
if individual_dir.endswith("_id"):
    n_data = 500
else:
    assert individual_dir.endswith(("_ood", "_ood-entity", "_ood-relation"))
    n_data = 350

file_name_format = "{idx}_eval_results_e.xlsx"
individual_dfs = []
missing_ids = []
for i in range(n_data):
    file_name = os.path.join(individual_dir, file_name_format.format(idx=i))
    if os.path.exists(file_name):
        # Append directly instead of binding the frame to `df`, so this loop
        # does not overwrite the `df` used by other cells.
        individual_dfs.append(pd.read_excel(file_name))
    else:
        missing_ids.append(i)

# Displayed as the cell output: space-separated indices with no result file.
" ".join(str(i) for i in missing_ids)

''

In [25]:
# Stack the per-example frames into one table with a fresh 0..n-1 index.
all_df = pd.concat(individual_dfs, ignore_index=True)

# Rows whose question_key is "unalias_question" are re-labelled as "specificity".
is_unaliased = all_df["question_key"] == "unalias_question"
all_df.loc[is_unaliased, "question_type"] = "specificity"

In [26]:
assert len(all_df[all_df["question_type"] == "efficacy"]) == len(all_df[all_df["question_type"] == "specificity"])

In [27]:
len(all_df)

842

In [28]:
f"{individual_dir}/../all_results_id.xlsx",

('/u/zliu/datastor1/mend/synstory_exp_output/Llama-3.2-3B-eos-sft-template-format-curated-v1-lr2e-6-sample-10_clm-baseline_lr=1e-05_epoch=4.0_tunable-params=all/individual_results_text_ood-relation/../all_results_id.xlsx',)

In [29]:
# Write the merged table one level above `individual_dir`, naming the file
# after the split suffix of the directory name
# (e.g. "..._ood-relation" -> "all_results_ood-relation.xlsx").
split_suffix = next(
    (
        suffix
        for suffix in ("_id", "_ood-entity", "_ood-relation", "_ood")
        if individual_dir.endswith(suffix)
    ),
    None,
)
# An unrecognised suffix is an error, as before; nothing is written in that case.
assert split_suffix is not None, f"Unrecognised split suffix in: {individual_dir}"
all_df.to_excel(
    f"{individual_dir}/../all_results{split_suffix}.xlsx",
    index=False
)

# Get aggregated scores

In [2]:
df = pd.read_excel("/u/zliu/datastor1/mend/synstory_exp_output/llama3.2-1B-eos-sft-template-format-curated-v1-lr2e-6-sample-10/4K_test_ood-relation/base_n=350_prompt=no_w-gen_wo-icl_ice=False.xlsx")

In [4]:
df

Unnamed: 0,id,question_key,question_type,stage,input,question,answer,predicted_answer_idx,predicted_answer,exact_match,llm_accuracy
0,0,alias_question,efficacy,pre-edit,[[When did the event that James King curated a...,When did the event that James King curated an ...,1850–1864,0,1994,0,0.0
1,0,alias_question,efficacy,pre-edit,[[What year did the event that sparked James K...,What year did the event that sparked James Kin...,1963,0,1972,0,0.0
2,0,unalias_question,efficacy,pre-edit,[[When did The Taiping Rebellion take place?]],When did The Taiping Rebellion take place?,1850–1864,0,1850-1864,0,1.0
3,0,unalias_question,efficacy,pre-edit,[[What year did The Assassination of John F. K...,What year did The Assassination of John F. Ken...,1963,0,1963,1,1.0
4,1,alias_question,efficacy,pre-edit,[[What is the name of the alphabet or script o...,What is the name of the alphabet or script of ...,Latin alphabet,0,Esperanto,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
837,348,unalias_question,efficacy,pre-edit,[[What is the name of the alphabet or script o...,What is the name of the alphabet or script of ...,Greek alphabet,0,Greek alphabet,1,1.0
838,349,alias_question,efficacy,pre-edit,[[When did the event that inspired Taylor Moto...,When did the event that inspired Taylor Motors...,"November 22, 1963",0,2009,0,0.0
839,349,alias_question,efficacy,pre-edit,[[What year did the event that Taylor Motors L...,What year did the event that Taylor Motors LLC...,1793,0,2009,0,0.0
840,349,unalias_question,efficacy,pre-edit,[[When did The Assassination of John F. Kenned...,When did The Assassination of John F. Kennedy ...,"November 22, 1963",0,"November 22, 1963",1,1.0


In [3]:
# Report row count and mean llm_accuracy (as a percentage) per question type.
# `df_question` keeps its name because a later cell reads it after the loop.
for question_type in ("efficacy", "specificity"):
    df_question = df[df["question_type"] == question_type]
    accuracy_stats = df_question["llm_accuracy"].describe()

    print(question_type, f"(n={accuracy_stats['count']})")
    print((accuracy_stats["mean"] * 100).round(1))
    

efficacy (n=842.0)
51.5
specificity (n=0.0)
nan


In [94]:
df_question.drop_duplicates(subset=["question"], inplace=False).describe()

Unnamed: 0,id,predicted_answer_idx,exact_match,llm_accuracy
count,676.0,676.0,676.0,676.0
mean,129.998521,0.0,0.714497,0.942456
std,114.474134,0.0,0.451988,0.14743
min,0.0,0.0,0.0,0.0
25%,39.75,0.0,0.0,1.0
50%,95.0,0.0,1.0,1.0
75%,193.0,0.0,1.0,1.0
max,496.0,0.0,1.0,1.0


# convert trivia qa in jsonl to json

In [3]:
train = io.load_jsonlines("/data/users/zliu/KE-by-CP/data/trivia_qa_wiki_sft/train.jsonl")

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("/home/zliu/shared_resources/models/llama3/hf/Llama-3.1-8B-w-pad")

In [None]:
def prepare_sft_text(dataset: list, tokenizer):
    """
    Build the SFT training string for every question/answer record.

    For each record the text is the question, a single space (omitted when the
    answer already starts with one), the answer converted with `str`, and the
    tokenizer's EOS token. The first generated text is printed as an example.

    Args:
        dataset: records with at least the keys "question" and "answer".
        tokenizer: object exposing an `eos_token` string attribute.

    Returns:
        A new list of shallow copies of the records, each with an added "text"
        key. The records in `dataset` are left unmodified.
    """
    new_dataset = []
    for position, datum in enumerate(dataset):
        q = datum["question"]
        a = str(datum["answer"])
        # `startswith` rather than `a[0]` so an empty answer does not raise IndexError.
        separator = "" if a.startswith(" ") else " "
        t = f"{q}{separator}{a}"
        t += tokenizer.eos_token
        if position == 0:
            print(f"Example: -> {t}")
        # Copy instead of writing into the caller's dict.
        new_dataset.append({**datum, "text": t})
    return new_dataset


In [7]:
processed_train = prepare_sft_text(train, tokenizer)

Example: -> Where in England was Dame Judi Dench born? York<|end_of_text|>


In [8]:
response_template = "?"
# Token ids of the response template; they do not depend on the example, so
# they are computed once instead of once per record.
response_template_ids = tokenizer(response_template, add_special_tokens=False)["input_ids"]

# Keep only records whose tokenized text contains every template token id.
# NOTE(review): this is a membership test per id, not a contiguous
# sub-sequence match — confirm that is sufficient for the downstream trainer.
filtered_train_dataset = []
for datum in processed_train:
    # Tokenize the text once per record (previously once per template token).
    text_ids = set(tokenizer(datum["text"])["input_ids"])
    if all(token_id in text_ids for token_id in response_template_ids):
        filtered_train_dataset.append(datum)
        

In [10]:
len(processed_train)

58539

In [26]:
# Convert the filtered records to alpaca-style dicts
# (keys "instruction" / "input" / "output").
processed_train_in_alpaca = []
for datum in filtered_train_dataset:
    # `str(...)` mirrors prepare_sft_text; `startswith` avoids an IndexError
    # on an empty answer.
    answer = str(datum["answer"])
    processed_train_in_alpaca.append({
        "instruction": datum["question"],
        "input": "",
        # The output always starts with exactly the answer's own leading space,
        # or one added space when the answer has none.
        "output": answer if answer.startswith(" ") else " " + answer,
    })

In [28]:
io.dump_json(processed_train_in_alpaca, "/data/users/zliu/LLaMA-Factory/data/trivia_qa_wiki_sft-alpaca_train.json")

# convert template SFT data in jsonl to json

In [32]:
train = io.load_jsonlines(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/model_prep/light_weight_sft_content_curated_v1_sample=10.jsonl")

In [34]:
train[0]

{'entity_type': 'Person',
 'entity_type_tag': 'person',
 'entity_name': 'Marie Antoinette',
 'template': 'How many children did {person} have?',
 'question': 'How many children did Marie Antoinette have?',
 'answer': '4'}

In [35]:
def _with_leading_space(answer) -> str:
    """Return `answer` as a string that starts with a space, adding one if needed."""
    answer = str(answer)
    # `startswith` rather than `answer[0]` so an empty answer does not raise IndexError.
    return answer if answer.startswith(" ") else " " + answer


# Convert every record to an alpaca-style dict
# (keys "instruction" / "input" / "output").
processed_train_in_alpaca = [
    {
        "instruction": datum["question"],
        "input": "",
        "output": _with_leading_space(datum["answer"]),
    }
    for datum in train
]

In [36]:
len(processed_train_in_alpaca)

480

In [37]:
len(train)

480

In [38]:
io.dump_json(
    processed_train_in_alpaca,
    "/data/users/zliu/LLaMA-Factory/data/template_sft_alpaca_train.json"
)