In [2]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
%load_ext dotenv
%dotenv

In [5]:
from rag_experiments import *

from utils import io, common, data

In [6]:
# Patch asyncio so an event loop can be re-entered from inside the notebook.
# NOTE(review): presumably required by async calls made inside the RAG helpers — confirm.
nest_asyncio.apply()

# Uncomment to stream INFO-level logs to stdout while debugging:
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [7]:
# Embedding model shared by index construction and retrieval in the cells below.
# It is also registered on the global `Settings` object (presumably llama-index's
# Settings — confirm) so it acts as the default embedder.
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embedding_model

In [8]:
# Print a free/total memory table for the visible device(s) before any LLM is loaded.
common.show_device_mem_usage()

 Device |         Free |        Total
--------+--------------+--------------
 cuda:0 |   12.776 GiB |   47.536 GiB


### Execution

In [9]:
# Model names handed to `common.ModelManager` by the generation cell; results are
# written to one JSON file per (dataset, model) pair.
MODELS = [
    'LLaMa-2-Chat-7B',
    'LLaMa-3-Chat-8B',
    'LLaMa-2-Chat-13B',
    'Mistral-Instruct-7B-v2',
    'Zephyr-SFT-7B',
    'Zephyr-Beta-7B',
    'GPT-3.5-U',
    # 'GPT-4'
]

# Each tuple is unpacked into `get_dataset(*dataset)`; element 0 is also passed to
# `normalize_instance` as the dataset name.
DATASETS = [
    # ("custom", )
    ("hotpot_qa", "fullwiki"),
]

# Each tuple is unpacked into `get_index(embedding_model, raw_documents, *index_strategy)`.
# The lower-cased, comma-joined tuple is the key under which retrieval and generation
# results are stored. The reporting cells treat the third element (when present) as
# the chunk size.
INDEXING = [
    ("basic", "open-source", 100),
    ("basic", "open-source", 250),
    ("semantic", "open-source"),
]

# Numbers of top-ranked retrieved contexts (k) given to the model per question.
RETRIEVAL_DOCUMENTS = [ 2, 5 ]

# NOTE(review): not referenced by any cell visible in this notebook excerpt; the
# result paths used below are all under "standard".
EVALUATION_TYPES = [
    "standard",
    "counterfactual-base",
    "counterfactual-post-hoc",
    "abstention"
]

In [10]:
# Build the retrieval corpus: one llama-index Document per distinct context title
# found in the configured datasets. Documents are keyed by the id derived from the
# title, so a title that appears in several instances is kept only once (the last
# occurrence wins).
with common.LogTime("Preparing retrieval corpus"):

    # Separate name for the de-duplication dict, so `raw_documents` is only ever a list.
    documents_by_id = {}

    for dataset in DATASETS:
        dataset_instance = get_dataset(*dataset)
        for instance in tqdm.tqdm(dataset_instance):
            context = instance['context']
            for title, sentences in zip(context['title'], context['sentences']):
                # Derive the id once and reuse it as both dict key and document id.
                document_id = make_id_from_title(title)
                documents_by_id[document_id] = llama_index.core.Document(
                    doc_id=document_id,
                    text=' '.join(sentences),
                    extra_info={ 'title': title }
                )

    # Flat list of unique documents consumed by `get_index` in the retrieval cell.
    raw_documents = list(documents_by_id.values())

[<] Preparing retrieval corpus ...



  0%|          | 0/7405 [00:00<?, ?it/s]




[>] Preparing retrieval corpus: 10s26ms
---------------------------------------------------------------------------------------------------------------------


In [11]:
# For every indexing strategy, retrieve the top-100 nodes for each of the first
# 1000 questions and cache them in one JSON file per strategy. Questions already
# present in the cache are skipped, so the (expensive) index is only built when
# at least one question is still missing.
with common.LogTime("Running retrieval"):

    for dataset in DATASETS:

        dataset_instance = get_dataset(*dataset).select(range(1000))

        # Normalise each instance exactly once. The cache is initialised AND probed
        # with the normalised id, so the two can never disagree.
        norm_instances = [ normalize_instance(dataset[0], instance) for instance in dataset_instance ]

        with tqdm.tqdm(total = len(INDEXING) * len(norm_instances)) as pbar:

            for index_strategy in INDEXING:

                index_strategy_desc = ','.join(str(term).lower() for term in index_strategy)

                # Built lazily below, only if some question still lacks cached results.
                index, retriever = None, None
                Settings.embed_model = embedding_model

                # NOTE(review): the cache path does not include the dataset name, so
                # several entries in DATASETS would share one file per strategy.
                index_result = data.NestedListItemResult(
                    f"data/retrieval/standard/{index_strategy_desc}.json",
                    [ norm_instance['id'] for norm_instance in norm_instances ]
                )

                for norm_instance in norm_instances:

                    instance_id = norm_instance['id']

                    if index_result[instance_id] is None:
                        if index is None:
                            with common.LogTime(f"Loading index: {index_strategy_desc}"):
                                index = get_index(embedding_model, raw_documents, *index_strategy)

                        if retriever is None:
                            retriever = make_retriever("standard", index, k=100, embed_model=embedding_model)

                        # The answer-format instruction is part of the query string, but
                        # only the bare question is supplied as the text to embed.
                        query_bundle = QueryBundle(
                            query_str=norm_instance["question"]
                            + " Give a short factoid answer (as few words as possible).",
                            custom_embedding_strs=[norm_instance["question"]],
                        )

                        nodes = [ node.dict() for node in retriever.retrieve(query_bundle) ]

                        # Keep only the JSON-serialisable fields needed downstream.
                        index_result[instance_id] = [
                            {
                                'id'   : node['node']['id_'],
                                'text' : node['node']['text'],
                                'score': node['score'],
                                'meta' : node['node']['metadata']
                            }
                            for node in nodes
                        ]

                    pbar.update()

                index_result.save()

[<] Running retrieval ...



  0%|          | 0/3000 [00:00<?, ?it/s]




[>] Running retrieval: 27s163ms
---------------------------------------------------------------------------------------------


In [12]:
# Re-open the on-disk retrieval results of every indexing strategy, keyed by the
# lower-cased, comma-joined strategy description (e.g. 'basic,open-source,250').
retrieval_cache = {
    strategy_desc: data.NestedListItemResult(f"data/retrieval/standard/{strategy_desc}.json")
    for strategy_desc in (
        ','.join(str(term).lower() for term in index_strategy)
        for index_strategy in INDEXING
    )
}

In [13]:
# NOTE: disabled smoke test, kept for reference only. It answers the first three
# questions with GPT-3.5-U using the top-5 cached contexts of 'basic,open-source,250'.
# dataset_instance = get_dataset("hotpot_qa", "fullwiki").select(range(1000))

# with common.ModelManager("GPT-3.5-U") as model:
#     queries = [ dataset_instance[i]['question'] for i in range(3) ]
#     retrieved_contexts = [ retrieval_cache['basic,open-source,250'][dataset_instance[i]['id']][:5] for i in range(3) ]
#     io.jprint(answer_with_rag(model, queries, retrieved_contexts, max_new_tokens=10))

In [14]:
# Number of (question, contexts) pairs passed to `answer_with_rag` per call.
BATCH_SIZE = 1
# Results are saved (and `common.sync_vram()` is called) after every SAVE_STEPS batches.
SAVE_STEPS = 2

In [15]:
# Generate an answer for every (model, indexing strategy, k, question) combination
# that is not yet present in the model's result file, score it with EM / F1 against
# the gold answer, and persist the results every SAVE_STEPS batches.
with common.LogTime("Running generation"):

    for dataset in DATASETS:

        dataset_instance = get_dataset(*dataset).select(range(1000))

        # Normalise once per dataset. The original re-normalised inside the k loop
        # after rebinding `instance` to the normalised form, so the second k value
        # was fed an already-normalised instance.
        norm_instances = [ normalize_instance(dataset[0], instance) for instance in dataset_instance ]

        with tqdm.tqdm(total = len(MODELS) * len(INDEXING) * len(norm_instances) * len(RETRIEVAL_DOCUMENTS)) as pbar:

            for llm_model in MODELS:

                with common.ModelManager(llm_model) as model_instance:

                    # Nested result store: strategy description -> k -> question id.
                    std_result = data.NestedListItemResult(
                        f"results/standard/{'-'.join(dataset)}_{llm_model}.json",
                        [ ','.join(str(term).lower() for term in index_strategy) for index_strategy in INDEXING ],
                        [ str(count) for count in RETRIEVAL_DOCUMENTS ],
                        [ norm_instance['id'] for norm_instance in norm_instances ]
                    )

                    for index_strategy in INDEXING:

                        index_strategy_desc = ','.join(str(term).lower() for term in index_strategy)
                        rag_name = f"{llm_model.lower()} + {index_strategy_desc}"
                        pbar.set_description(rag_name)

                        # Collect only the work items that still lack a stored result.
                        ids, questions, answers, contexts = [], [], [], []

                        for norm_instance in norm_instances:
                            instance_id = norm_instance['id']
                            for k in RETRIEVAL_DOCUMENTS:
                                if std_result[index_strategy_desc][str(k)][instance_id] is None:
                                    ids.append(instance_id)
                                    questions.append(norm_instance['question'])
                                    answers.append(norm_instance['answer'])
                                    contexts.append(retrieval_cache[index_strategy_desc][instance_id][:k])
                                else:
                                    # Already answered in a previous run: just advance the bar.
                                    pbar.update()

                        batches = list(common.batchify(
                            ids, questions, answers, contexts, batch_size=BATCH_SIZE
                        ))
                        timer = common.BatchProgressTimer(pbar, total=math.ceil(len(ids)/BATCH_SIZE))
                        for batch, (ids_, questions_, answers_, contexts_) in enumerate(batches):
                            with timer.timed_operation(batch=batch+1, save=((batch+1) % SAVE_STEPS == 0)):

                                try:
                                    responses = answer_with_rag(model_instance, questions_, contexts_, max_new_tokens=15)
                                except Exception as error:
                                    # Best-effort: keep the run going, but report the failure
                                    # and let Ctrl-C / SystemExit propagate. One placeholder per
                                    # question, so `zip` below does not drop the rest of the batch.
                                    # NOTE(review): the placeholder is stored as a scored result,
                                    # so a transient failure is not retried on the next run.
                                    tqdm.tqdm.write(f"[!] Generation failed for {ids_}: {error!r}")
                                    responses = [ "No answer" ] * len(ids_)

                                for id_, response, question, answer, context in zip(ids_, responses, questions_, answers_, contexts_):

                                    # Empty / None model output is scored as an explicit non-answer.
                                    response = response or "No answer"

                                    em = int(exact_match_score(prediction=response, ground_truth=answer))
                                    f1, _, _ = f1_score(prediction=response, ground_truth=answer)

                                    evaluation = dict(
                                        result=dict(response=response, EM=em, F1=f1),
                                        instance=dict(id=id_, question=question, answer=answer)
                                    )
                                    # NOTE(review): k is recovered from the number of contexts, which
                                    # assumes the cache holds at least k nodes for every question.
                                    std_result[index_strategy_desc][str(len(context))][id_] = evaluation

                                    pbar.update()

                            if (batch+1) % SAVE_STEPS == 0:
                                std_result.save()
                                common.sync_vram()

                    # Flush whatever was produced since the last periodic save.
                    std_result.save()

[<] Running generation ...



  0%|          | 0/42000 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.





[>] Running generation: 27s479ms
------------------------------------------------------------------------------------------------


In [17]:
# LaTeX citation command appended to each model name in the EM/F1 results table.
# Raw strings keep the backslash literal: '\c' is not a recognised escape sequence,
# and non-raw literals containing it trigger a SyntaxWarning on Python 3.12+.
citations = {
    'LLaMa-2-Chat-7B': r'\cite{touvron2023llama}',
    'LLaMa-3-Chat-8B': r'\cite{metaIntroducingMeta}',
    'LLaMa-2-Chat-13B': r'\cite{touvron2023llama}',
    'Mistral-Instruct-7B-v2': r'\cite{jiang2023mistral}',
    'Zephyr-SFT-7B': r'\cite{tunstall2023zephyr}',
    'Zephyr-Beta-7B': r'\cite{tunstall2023zephyr}',
    'GPT-3.5-U': r'\cite{ouyang2022training}'
}

In [22]:
import pandas

# Print, for each dataset, a LaTeX table of mean EM / F1 per (model, indexing
# strategy) with one EM_k / F1_k column pair per k in RETRIEVAL_DOCUMENTS.
# Questions without a stored result are excluded from the averages.
for dataset in DATASETS:
    records = []

    for llm_model in MODELS:

        std_result = data.NestedListItemResult(f"results/standard/{'-'.join(dataset)}_{llm_model}.json")

        # Models without a citation entry are listed by name only instead of raising KeyError.
        model_label = ' '.join(filter(None, [ llm_model, citations.get(llm_model) ]))

        for indexing_strategy in INDEXING:
            # Same lower-cased key the generation cell used when writing the results.
            indexing_strategy_desc = ','.join(str(term).lower() for term in indexing_strategy)

            row = {
                'model': model_label,
                'index_type': indexing_strategy[0],
                # Only three-element strategies carry a chunk size as their last term.
                'chunk_size': indexing_strategy[-1] if len(indexing_strategy) == 3 else '',
            }

            for k in RETRIEVAL_DOCUMENTS:
                scored = [
                    entry['result']
                    for entry in std_result[indexing_strategy_desc][str(k)].values()
                    if entry is not None
                ]
                # `or 1` guards the division when no question has been answered yet.
                count = len(scored) or 1
                row[f'EM_{k}'] = sum(result['EM'] for result in scored) / count
                row[f'F1_{k}'] = sum(result['F1'] for result in scored) / count

            records.append(row)

    # Emit one table per dataset. With the print outside this loop only the last
    # dataset was printed, and an empty DATASETS reused stale `records`.
    print(pandas.DataFrame.from_records(records).sort_values(by=[ 'index_type', 'chunk_size' ]).to_latex(index=False))

\begin{tabular}{lllrrrr}
\toprule
model & index_type & chunk_size & EM_2 & F1_2 & EM_5 & F1_5 \\
\midrule
LLaMa-2-Chat-7B \cite{touvron2023llama} & basic & 100 & 0.233314 & 0.341599 & 0.216909 & 0.330988 \\
LLaMa-3-Chat-8B \cite{metaIntroducingMeta} & basic & 100 & 0.065000 & 0.236232 & 0.067000 & 0.239925 \\
LLaMa-2-Chat-13B \cite{touvron2023llama} & basic & 100 & 0.166000 & 0.268711 & 0.169000 & 0.276269 \\
Mistral-Instruct-7B-v2 \cite{jiang2023mistral} & basic & 100 & 0.026000 & 0.207774 & 0.032000 & 0.219801 \\
Zephyr-SFT-7B \cite{tunstall2023zephyr} & basic & 100 & 0.056000 & 0.194278 & 0.050000 & 0.186669 \\
Zephyr-Beta-7B \cite{tunstall2023zephyr} & basic & 100 & 0.006000 & 0.173026 & 0.005000 & 0.179536 \\
GPT-3.5-U \cite{ouyang2022training} & basic & 100 & 0.211000 & 0.318223 & 0.261000 & 0.382855 \\
LLaMa-2-Chat-7B \cite{touvron2023llama} & basic & 250 & 0.211000 & 0.320397 & 0.206000 & 0.326997 \\
LLaMa-3-Chat-8B \cite{metaIntroducingMeta} & basic & 250 & 0.066000 & 0.230836

: 

In [34]:
import pandas

# Print, for each dataset, a LaTeX table of "noise robustness" per (model, indexing
# strategy): among questions answered exactly right with the smallest k, the
# fraction that are still exactly right with the largest k. Only questions that
# have a stored result for both k values are counted.
for dataset in DATASETS:
    records = []

    low_k, high_k = str(RETRIEVAL_DOCUMENTS[0]), str(RETRIEVAL_DOCUMENTS[-1])

    for llm_model in MODELS:

        std_result = data.NestedListItemResult(f"results/standard/{'-'.join(dataset)}_{llm_model}.json")

        for indexing_strategy in INDEXING:
            # Same lower-cased key the generation cell used when writing the results.
            indexing_strategy_desc = ','.join(str(term).lower() for term in indexing_strategy)

            index_results = std_result[indexing_strategy_desc]

            # common_count: correct at both k values; std_count: correct at the low k.
            common_count, std_count = 0, 0
            # NOTE(review): pairing by position assumes both k-levels list the
            # question ids in the same order — confirm for NestedListItemResult.
            for low_k_record, high_k_record in zip(index_results[low_k].values(), index_results[high_k].values()):
                if low_k_record is not None and high_k_record is not None:
                    common_count += low_k_record['result']['EM'] * high_k_record['result']['EM']
                    std_count += low_k_record['result']['EM']

            # `or 1` guards the division when nothing was correct at the low k.
            noise_robustness = common_count / (std_count or 1)

            records.append({
                'model': llm_model,
                'index_type': indexing_strategy[0],
                # Only three-element strategies carry a chunk size as their last term.
                'chunk_size': indexing_strategy[-1] if len(indexing_strategy) == 3 else '',
                'noise_robustness': noise_robustness
            })

    # Emit one table per dataset. With the print outside this loop only the last
    # dataset was printed, and an empty DATASETS reused stale `records`.
    print(pandas.DataFrame.from_records(records).sort_values(by=['index_type', 'chunk_size']).to_latex(index=False))

\begin{tabular}{lllr}
\toprule
model & index_type & chunk_size & noise_robustness \\
\midrule
LLaMa-2-Chat-7B & basic & 100 & 0.770909 \\
LLaMa-3-Chat-8B & basic & 100 & 0.400000 \\
LLaMa-2-Chat-13B & basic & 100 & 0.765060 \\
Mistral-Instruct-7B-v2 & basic & 100 & 0.615385 \\
Zephyr-SFT-7B & basic & 100 & 0.696429 \\
Zephyr-Beta-7B & basic & 100 & 0.166667 \\
GPT-3.5-U & basic & 100 & 0.867299 \\
LLaMa-2-Chat-7B & basic & 250 & 0.763033 \\
LLaMa-3-Chat-8B & basic & 250 & 0.272727 \\
LLaMa-2-Chat-13B & basic & 250 & 0.737179 \\
Mistral-Instruct-7B-v2 & basic & 250 & 0.655172 \\
Zephyr-SFT-7B & basic & 250 & 0.727273 \\
Zephyr-Beta-7B & basic & 250 & 0.500000 \\
GPT-3.5-U & basic & 250 & 0.883065 \\
LLaMa-2-Chat-7B & semantic &  & 0.754310 \\
LLaMa-3-Chat-8B & semantic &  & 0.270270 \\
LLaMa-2-Chat-13B & semantic &  & 0.754601 \\
Mistral-Instruct-7B-v2 & semantic &  & 0.652174 \\
Zephyr-SFT-7B & semantic &  & 0.714286 \\
Zephyr-Beta-7B & semantic &  & 0.250000 \\
GPT-3.5-U & semantic & 