In [1]:
%load_ext autoreload
%autoreload 2

from rag_components import *
from llama_index.core.retrievers import VectorIndexRetriever

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## Baseline-Specific Components


In [10]:
def answer_reading_comprehension_baseline(
    question, context_title, context_text, chunk_size=1024, chunk_overlap=200, top_k=2, qa_llm=mistral_large
):
    """
    Answer a question from a context using plain top-k chunk retrieval.

    The context is indexed with ``create_index_from_text_with_ids``, the chunks most
    similar to the question are retrieved, their text is joined with single spaces,
    and the joined text is passed to ``answer_reading_comprehension``.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context
        context_text (str): The text of the context
        chunk_size (int, optional): Chunk size forwarded to the index builder. Defaults to 1024.
        chunk_overlap (int, optional): Chunk overlap forwarded to the index builder. Defaults to 200.
        top_k (int, optional): Passed to the retriever as ``similarity_top_k``. Defaults to 2.
        qa_llm (optional): LLM forwarded to ``answer_reading_comprehension``. Defaults to ``mistral_large``.

    Returns:
        tuple: ``(response, additional_info)``. ``response`` is the value returned by
        ``answer_reading_comprehension``; ``additional_info`` is a dict with the keys
        ``raw_text_chunk_count``, ``summary_chunk_count`` and ``top_chunks_text``.
    """
    text_index = create_index_from_text_with_ids(
        context_text, context_title, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Query the vector store to find the top matching chunks
    retriever = VectorIndexRetriever(index=text_index, similarity_top_k=top_k)
    top_chunks = retriever.retrieve(question)

    # Keep the per-chunk texts (returned to the caller) and a space-joined version (fed to the LLM)
    top_chunks_text = [chunk.node.text for chunk in top_chunks]
    top_chunks_text_combined = " ".join(top_chunks_text)

    # Count retrieved nodes whose id carries the "text_chunk_" prefix
    raw_text_chunk_count = sum(chunk.node.id_.startswith("text_chunk_") for chunk in top_chunks)
    # Nothing in this function counts summary chunks, so this is always 0; the key is
    # kept so the returned dict retains its shape.
    summary_chunk_count = 0

    # Answer the question using only the retrieved text as context
    response = answer_reading_comprehension(question, top_chunks_text_combined, qa_llm=qa_llm)
    additional_info = {
        "raw_text_chunk_count": raw_text_chunk_count,
        "summary_chunk_count": summary_chunk_count,
        "top_chunks_text": top_chunks_text,
    }
    return response, additional_info

## Inference


In [None]:
# Sweep configuration. NOTE(review): `debug_lim` is forwarded to `test_longdep_qa`;
# the "100" in the output file name presumably mirrors it — keep them in sync.
debug_lim = 100
chunk_sizes = [64, 128, 256, 512, 1024, 2048]
topks = [32, 16, 8, 4, 2, 1]
chunk_overlaps = [10, 25, 50, 100, 200, 400]

# The three lists are parallel: position i of each describes one configuration.
# top_k halves as chunk_size doubles, so chunk_size * top_k is 2048 in every run.
assert len(chunk_sizes) == len(topks) == len(chunk_overlaps), "sweep lists must have equal length"

for chunk_size, top_k, chunk_overlap in zip(chunk_sizes, topks, chunk_overlaps):
    print("Generating output for chunk size", chunk_size)
    output_file = f"output/baseline_mistral_large_100_chunksize{chunk_size}.jsonl"
    test_longdep_qa(
        answer_reading_comprehension_baseline,
        output_file=output_file,
        debug_lim=debug_lim,
        qa_llm=mistral_large,
        chunk_size=chunk_size,
        top_k=top_k,
        chunk_overlap=chunk_overlap,
    )

In [7]:
# Score the output file of every sweep configuration.
# Relies on `chunk_sizes` defined in the inference cell; file names must match that cell.
for chunk_size in chunk_sizes:
    output_file = f"output/baseline_mistral_large_100_chunksize{chunk_size}.jsonl"
    rouge_metrics = get_rouge_metrics(output_file)
    print("results for chunk size", chunk_size)
    print("Rouge Metrics:", rouge_metrics)

    # Score the same output file with `llm_self_score`, using `gpt4` as the LLM
    self_score = llm_self_score(output_file, llm=gpt4)
    print("LLM Self-Score:", self_score)

results for chunk size 64
Rouge Metrics: {'rouge1': 0.216419454278109, 'rouge2': 0.0934216343717468, 'rougeL': 0.17430455397773975, 'rougeLsum': 0.18084025603042386}
LLM Self-Score: 0.43
results for chunk size 128
Rouge Metrics: {'rouge1': 0.209221089562154, 'rouge2': 0.08783360287943325, 'rougeL': 0.16709132020090461, 'rougeLsum': 0.17575407845835672}
LLM Self-Score: 0.46
results for chunk size 256
Rouge Metrics: {'rouge1': 0.24940623044942137, 'rouge2': 0.12639135866964885, 'rougeL': 0.2060339176221932, 'rougeLsum': 0.21397718587540734}
LLM Self-Score: 0.48
results for chunk size 512
Rouge Metrics: {'rouge1': 0.22266186555191453, 'rouge2': 0.10801840979532562, 'rougeL': 0.18916734041343425, 'rougeLsum': 0.19464604319670148}
LLM Self-Score: 0.48
results for chunk size 1024
Rouge Metrics: {'rouge1': 0.21212331689168298, 'rouge2': 0.07781696303614188, 'rougeL': 0.16698583498762326, 'rougeLsum': 0.17219841649735634}
LLM Self-Score: 0.39
results for chunk size 2048
Rouge Metrics: {'rouge1