In [1]:
# Expose only GPU index 0 to this kernel; set here, before any model is loaded.
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
# Reload imported modules automatically before each cell runs (mode 2 = all modules),
# so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load environment variables from a local .env file via the dotenv IPython extension
# (presumably credentials for the hosted models — confirm).
%load_ext dotenv
%dotenv

In [4]:
from rag_experiments import *

from utils import io, common, data

In [5]:
# Patch asyncio so event loops can be nested inside the loop Jupyter already runs.
# NOTE(review): `nest_asyncio` is not imported explicitly in this notebook — presumably
# it arrives through `from rag_experiments import *`; confirm and import it directly.
nest_asyncio.apply()

# logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [6]:
# Print a free/total memory table for each visible device before any model is loaded.
common.show_device_mem_usage()

 Device |         Free |        Total
--------+--------------+--------------
 cuda:0 |   47.266 GiB |   47.536 GiB


### Execution

In [18]:
# Models to evaluate. Each name is handed to `common.ModelManager`, embedded in the
# result file names and used as a key into `citations`.
MODELS = [
    "LLaMa-2-Chat-7B",
    "LLaMa-3-Chat-8B",
    "LLaMa-2-Chat-13B",
    "Mistral-Instruct-7B-v2",
    "Zephyr-SFT-7B",
    "Zephyr-Beta-7B",
    "GPT-3.5-U",
    # "GPT-4",
]

# Dataset specifications; each tuple is unpacked into `get_dataset(*dataset)` and
# joined with "-" to build file names.
DATASETS = [
    # ("custom", ),
    ("hotpot_qa", "fullwiki"),
]

# Indexing strategies. The first term is reported as `index_type`; when a third term
# is present it is reported as `chunk_size`. The lower-cased, comma-joined tuple is
# the key under which results are stored.
INDEXING = [
    ("basic", "open-source", 100),
    ("basic", "open-source", 250),
    ("semantic", "open-source"),
]

# Retrieval depths k: the first k cached contexts are passed to the model.
RETRIEVAL_DOCUMENTS = [2, 5]

# Names of the evaluation settings.
# NOTE(review): not referenced by any cell visible in this notebook — confirm usage.
EVALUATION_TYPES = [
    "standard",
    "counterfactual-base",
    "counterfactual-post-hoc",
    "abstention",
]

In [8]:
# One retrieval-result store per indexing strategy. The key is the lower-cased,
# comma-joined strategy tuple (e.g. "basic,open-source,100"), which is also the
# file stem of the backing JSON file.
retrieval_cache = {
    strategy_key: data.NestedListItemResult(
        f"data/retrieval/counterfactual-post-hoc/{strategy_key}.json"
    )
    for strategy_key in (
        ','.join(str(term).lower() for term in index_strategy)
        for index_strategy in INDEXING
    )
}

In [9]:
# Number of questions sent to the model per generation call.
BATCH_SIZE = 4
# Results are written to disk after every SAVE_STEPS-th batch.
SAVE_STEPS = 2

In [10]:
# Counterfactual post-hoc generation.
#
# For every dataset / model / indexing strategy / retrieval depth k, answer each
# question with RAG over the first k cached contexts and score the response (EM, F1)
# against the stored counterfactual answer. Entries that already hold a result are
# skipped and results are saved every SAVE_STEPS batches, so the cell can be
# interrupted and re-run.
with common.LogTime("Running generation"):

    for dataset in DATASETS:

        dataset_instance = get_dataset(*dataset).select(range(1000))

        # Normalise every instance once up front. The previous version rebound its
        # own loop variable to the normalised copy inside the k loop, so the second k
        # re-normalised an already normalised instance and the "already done" lookup
        # alternated between raw and normalised ids.
        norm_instances = [ normalize_instance(dataset[0], instance) for instance in dataset_instance ]

        # NOTE(review): keyed by the raw instance ids but read below with normalised
        # ids — confirm that normalisation leaves 'id' unchanged for every dataset.
        counterfactual_answers = data.NestedListItemResult(
            f"data/{'-'.join(dataset)}-counterfactual.json",
            [ instance['id'] for instance in dataset_instance ]
        )

        with tqdm.tqdm(total = len(MODELS) * len(INDEXING) * len(dataset_instance) * len(RETRIEVAL_DOCUMENTS)) as pbar:

            for llm_model in MODELS:

                with common.ModelManager(llm_model) as model_instance:

                    # Result store nested as [index strategy][k][instance id].
                    std_result = data.NestedListItemResult(
                        f"results/counterfactual-post-hoc/{'-'.join(dataset)}_{llm_model}.json",
                        [ ','.join(str(term).lower() for term in index_strategy) for index_strategy in INDEXING ],
                        [ str(count) for count in RETRIEVAL_DOCUMENTS ],
                        [ instance['id'] for instance in norm_instances ]
                    )

                    for index_strategy in INDEXING:

                        index_strategy_desc = ','.join(str(term).lower() for term in index_strategy)
                        rag_name = f"{llm_model.lower()} + {index_strategy_desc}"
                        pbar.set_description(rag_name)

                        # Parallel lists describing the work still to be done.
                        ids, ks, questions, answers, contexts = [], [], [], [], []

                        for instance in norm_instances:
                            for k in RETRIEVAL_DOCUMENTS:
                                if std_result[index_strategy_desc][str(k)][instance['id']] is not None:
                                    # Already evaluated in an earlier run.
                                    pbar.update()
                                    continue
                                ids.append(instance['id'])
                                ks.append(k)
                                questions.append(instance['question'])
                                answers.append(counterfactual_answers[instance['id']]['counterfactual'])
                                contexts.append(retrieval_cache[index_strategy_desc][instance['id']][:k])

                        # `ks` travels with the batches so each result is stored under the
                        # k that was requested (assumes `common.batchify` accepts any number
                        # of parallel lists). Using len(context) instead mis-files the result
                        # whenever fewer than k contexts were retrieved.
                        batches = list(common.batchify(
                            ids, ks, questions, answers, contexts, batch_size=BATCH_SIZE
                        ))
                        timer = common.BatchProgressTimer(pbar, total=math.ceil(len(ids)/BATCH_SIZE))
                        for batch, (ids_, ks_, questions_, answers_, contexts_) in enumerate(batches):
                            should_save = (batch+1) % SAVE_STEPS == 0
                            with timer.timed_operation(batch=batch+1, save=should_save):

                                responses = answer_with_rag(model_instance, questions_, contexts_, max_new_tokens=15)

                                for id_, k, response, question, answer in zip(ids_, ks_, responses, questions_, answers_):

                                    # Empty generations are scored as an explicit "No answer".
                                    response = response or "No answer"

                                    em = int(exact_match_score(prediction=response, ground_truth=answer))
                                    f1, _, _ = f1_score(prediction=response, ground_truth=answer)

                                    evaluation = dict(
                                        result=dict(response=response, EM=em, F1=f1),
                                        instance=dict(id=id_, question=question, answer=answer)
                                    )
                                    std_result[index_strategy_desc][str(k)][id_] = evaluation

                                    pbar.update()

                            if should_save:
                                std_result.save()
                                common.sync_vram()

                    # Persist whatever the last partial save window left unsaved.
                    std_result.save()

[<] Running generation ...



  0%|          | 0/36000 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.





[>] Running generation: 22s436ms
------------------------------------------------------------------------------------------------


In [11]:
# LaTeX citation command appended to each model name in the results table.
# Raw strings keep the backslash literal: "\c" in a normal string literal is an
# invalid escape sequence, which Python warns about and plans to reject.
citations = {
    'LLaMa-2-Chat-7B': r'\cite{touvron2023llama}',
    'LLaMa-3-Chat-8B': r'\cite{metaIntroducingMeta}',
    'LLaMa-2-Chat-13B': r'\cite{touvron2023llama}',
    'Mistral-Instruct-7B-v2': r'\cite{jiang2023mistral}',
    'Zephyr-SFT-7B': r'\cite{tunstall2023zephyr}',
    'Zephyr-Beta-7B': r'\cite{tunstall2023zephyr}',
    'GPT-3.5-U': r'\cite{ouyang2022training}'
}

In [20]:
import pandas

# Print one LaTeX table per dataset with the mean EM and F1 of the counterfactual
# post-hoc results, one row per (model, indexing strategy) and one EM/F1 column
# pair per retrieval depth k.
for dataset in DATASETS:
    records = []

    for llm_model in MODELS:

        std_result = data.NestedListItemResult(f"results/counterfactual-post-hoc/{'-'.join(dataset)}_{llm_model}.json")

        for indexing_strategy in INDEXING:
            # Lower-cased to match the key format used when the results were written.
            indexing_strategy_desc = ','.join(str(term).lower() for term in indexing_strategy)

            row = {
                'model': llm_model + " " + citations[llm_model],
                'index_type': indexing_strategy[0],
                # Strategies without a third term have no chunk size.
                'chunk_size': indexing_strategy[-1] if len(indexing_strategy) == 3 else '',
            }

            for k in RETRIEVAL_DOCUMENTS:

                # Average over evaluated instances only; entries still None are skipped.
                em_total, f1_total, count = 0, 0, 0
                for record in std_result[indexing_strategy_desc][str(k)].values():
                    if record is None:
                        continue
                    em_total += record['result']['EM']
                    f1_total += record['result']['F1']
                    count += 1

                # `count or 1` avoids a division by zero when nothing was evaluated.
                row[f'EM_{k}'] = em_total / (count or 1)
                row[f'F1_{k}'] = f1_total / (count or 1)

            records.append(row)

    # Printed inside the dataset loop so every dataset gets its own table; previously
    # only the last dataset was printed and an empty DATASETS raised a NameError.
    print(pandas.DataFrame.from_records(records).sort_values(by=[ 'index_type', 'chunk_size' ]).to_latex(index=False))

\begin{tabular}{lllrrrr}
\toprule
model & index_type & chunk_size & EM_2 & F1_2 & EM_5 & F1_5 \\
\midrule
LLaMa-2-Chat-7B \cite{touvron2023llama} & basic & 100 & 0.443000 & 0.554317 & 0.509000 & 0.620906 \\
LLaMa-3-Chat-8B \cite{metaIntroducingMeta} & basic & 100 & 0.136000 & 0.337497 & 0.157000 & 0.382290 \\
LLaMa-2-Chat-13B \cite{touvron2023llama} & basic & 100 & 0.330000 & 0.438775 & 0.391000 & 0.493859 \\
Mistral-Instruct-7B-v2 \cite{jiang2023mistral} & basic & 100 & 0.064000 & 0.299703 & 0.091000 & 0.350969 \\
Zephyr-SFT-7B \cite{tunstall2023zephyr} & basic & 100 & 0.116000 & 0.278681 & 0.128000 & 0.305514 \\
Zephyr-Beta-7B \cite{tunstall2023zephyr} & basic & 100 & 0.013000 & 0.255154 & 0.035000 & 0.303718 \\
GPT-3.5-U \cite{ouyang2022training} & basic & 100 & 0.471000 & 0.566002 & 0.543000 & 0.652472 \\
LLaMa-2-Chat-7B \cite{touvron2023llama} & basic & 250 & 0.442000 & 0.563535 & 0.534000 & 0.647424 \\
LLaMa-3-Chat-8B \cite{metaIntroducingMeta} & basic & 250 & 0.147000 & 0.353004

In [19]:
import pandas

# Print one LaTeX table per dataset with two derived metrics per (model, indexing strategy):
#   noise_robustness - among instances answered correctly (EM = 1) with the smallest k,
#                      the fraction still answered correctly with the largest k;
#   faithfulness_k   - among instances answered correctly in the standard run at depth k,
#                      the fraction whose counterfactual run also scores EM = 1.
for dataset in DATASETS:
    records = []

    for llm_model in MODELS:

        std_result    = data.NestedListItemResult(f"results/standard/{'-'.join(dataset)}_{llm_model}.json")
        ctrfct_result = data.NestedListItemResult(f"results/counterfactual-post-hoc/{'-'.join(dataset)}_{llm_model}.json")

        for indexing_strategy in INDEXING:
            # Lower-cased to match the key format used when the results were written.
            indexing_strategy_desc = ','.join(str(term).lower() for term in indexing_strategy)

            # Records are paired positionally — assumes both stores list instances in
            # the same order; TODO confirm.
            index_results = std_result[indexing_strategy_desc]
            low_k_records = index_results[str(RETRIEVAL_DOCUMENTS[0])].values()
            high_k_records = index_results[str(RETRIEVAL_DOCUMENTS[-1])].values()

            common_count, std_count = 0, 0
            for low_k_record, high_k_record in zip(low_k_records, high_k_records):
                if low_k_record is not None and high_k_record is not None:
                    common_count += low_k_record['result']['EM'] * high_k_record['result']['EM']
                    std_count += low_k_record['result']['EM']

            # `std_count or 1` avoids a division by zero when no instance was correct.
            noise_robustness = common_count / (std_count or 1)

            row = {
                'model': llm_model,
                'index_type': indexing_strategy[0],
                'chunk_size': indexing_strategy[-1] if len(indexing_strategy) == 3 else '',
                'noise_robustness': noise_robustness
            }

            for k in RETRIEVAL_DOCUMENTS:

                std_records = std_result[indexing_strategy_desc][str(k)].values()
                ctrfct_records = ctrfct_result[indexing_strategy_desc][str(k)].values()

                common_count, std_count = 0, 0
                for std_record, ctrfct_record in zip(std_records, ctrfct_records):
                    # Check the paired records themselves. The previous guard tested
                    # `record`, a variable left over from another cell, so it failed on a
                    # fresh kernel and never skipped missing entries.
                    if std_record is not None and ctrfct_record is not None:
                        common_count += std_record['result']['EM'] * ctrfct_record['result']['EM']
                        std_count += std_record['result']['EM']

                row[f'faithfulness_{k}'] = common_count / (std_count or 1)

            records.append(row)

    # Printed inside the dataset loop so every dataset gets its own table.
    print(pandas.DataFrame.from_records(records).sort_values(by=[ 'index_type', 'chunk_size' ]).to_latex(index=False))

\begin{tabular}{lllrrr}
\toprule
model & index_type & chunk_size & noise_robustness & faithfulness_2 & faithfulness_5 \\
\midrule
LLaMa-2-Chat-7B & basic & 100 & 0.770909 & 0.682203 & 0.738532 \\
LLaMa-3-Chat-8B & basic & 100 & 0.400000 & 0.292308 & 0.373134 \\
LLaMa-2-Chat-13B & basic & 100 & 0.765060 & 0.560241 & 0.656805 \\
Mistral-Instruct-7B-v2 & basic & 100 & 0.615385 & 0.384615 & 0.312500 \\
Zephyr-SFT-7B & basic & 100 & 0.696429 & 0.535714 & 0.680000 \\
Zephyr-Beta-7B & basic & 100 & 0.166667 & 0.166667 & 0.400000 \\
GPT-3.5-U & basic & 100 & 0.867299 & 0.672986 & 0.697318 \\
LLaMa-2-Chat-7B & basic & 250 & 0.763033 & 0.635071 & 0.762136 \\
LLaMa-3-Chat-8B & basic & 250 & 0.272727 & 0.333333 & 0.270833 \\
LLaMa-2-Chat-13B & basic & 250 & 0.737179 & 0.570513 & 0.677215 \\
Mistral-Instruct-7B-v2 & basic & 250 & 0.655172 & 0.344828 & 0.358974 \\
Zephyr-SFT-7B & basic & 250 & 0.727273 & 0.636364 & 0.608696 \\
Zephyr-Beta-7B & basic & 250 & 0.500000 & 0.250000 & 0.400000 \\
GPT-3.5-