In [1]:
%load_ext autoreload
%autoreload 2


from rag_components import *
from llama_index.core.retrievers import VectorIndexRetriever

import re

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
def answer_reading_comprehension_with_union_question_only(
    question, context_title, context_text, top_k=2, chunk_size=1024, chunk_overlap=200, qa_llm=gpt4
):
    """
    Answer a question by retrieving over the union of the raw-text index and
    the generated-question index of a context.

    Retrieved features (raw chunks or generated questions) are mapped back to
    the id of the raw text chunk they belong to, de-duplicated in retrieval
    order, and the text of the first ``top_k`` distinct chunks is concatenated
    and handed to ``answer_reading_comprehension``.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context; used to look up the
            raw index (``context_title``) and the question index
            (``context_title + "_questions"``)
        context_text (str): The text of the context (not read by this function)
        top_k (int): The number of distinct raw text chunks to include
        chunk_size (int): Not read by this function; accepted so it can be
            called with the same keyword arguments as sibling strategies
        chunk_overlap (int): Not read by this function; see ``chunk_size``
        qa_llm: Forwarded to ``answer_reading_comprehension`` as ``qa_llm``

    Returns:
        tuple: ``(response, additional_info)`` where ``response`` is whatever
        ``answer_reading_comprehension`` returns and ``additional_info`` is a
        dict with a ``"top_chunks_info"`` list holding one diagnostic record
        per included chunk.
    """
    # Several features (a chunk and its generated questions) can map to the
    # same raw chunk, so many more candidates than ``top_k`` are fetched to
    # leave enough distinct chunks after de-duplication.
    oversample_factor = 32

    raw_index = get_index_by_title(context_title)
    # The "_summaries" index is intentionally not part of the union here.
    question_index = get_index_by_title(context_title + "_questions")
    combined_index = index_union([raw_index, question_index])
    retriever = VectorIndexRetriever(
        index=combined_index,
        similarity_top_k=top_k * oversample_factor,
    )
    top_nodes = retriever.retrieve(question)

    def text_chunk_id(feature_id):
        """Map a feature node id onto the id of its raw text chunk."""
        if "text_chunk_" in feature_id:
            return feature_id
        if "summary_" in feature_id:
            return feature_id.replace("summary_", "text_chunk_")
        if "question_" in feature_id:
            # Question ids carry one extra trailing "_<suffix>" component
            # after the chunk part; drop it to recover the chunk id.
            return "_".join(feature_id.replace("question_", "text_chunk_").split("_")[:-1])
        # Unrecognised id pattern: same implicit result as before.
        return None

    # Keep the best-ranked feature for each distinct raw chunk, in rank order.
    seen_raw_text_ids = set()
    unique_top_k_raw_text_ids = []
    unique_top_k_nodes = []
    for scored_node in top_nodes:
        raw_text_id = text_chunk_id(scored_node.node.id_)
        if raw_text_id in seen_raw_text_ids:
            continue
        seen_raw_text_ids.add(raw_text_id)
        unique_top_k_raw_text_ids.append(raw_text_id)
        unique_top_k_nodes.append(scored_node)
        if len(unique_top_k_nodes) == top_k:
            break

    nodes_to_include = [get_node_by_id(raw_index, raw_text_id) for raw_text_id in unique_top_k_raw_text_ids]
    texts_to_include = [raw_node.text for raw_node in nodes_to_include]
    texts_to_include_combined = " ".join(texts_to_include)

    response = answer_reading_comprehension(question, texts_to_include_combined, qa_llm=qa_llm)

    top_chunks_info = []
    for rank, (feature_node, raw_text_id, raw_node) in enumerate(
        zip(unique_top_k_nodes, unique_top_k_raw_text_ids, nodes_to_include)
    ):
        feature_id = feature_node.node.id_
        if "summary_" in feature_id:
            feature_type = "summary"
        elif "question_" in feature_id:
            feature_type = "question"
        else:
            feature_type = "text"
        top_chunks_info.append(
            {
                "feature_type": feature_type,
                "feature_score": feature_node.score,
                "feature_rank": rank,
                # Score the raw chunk belonging to THIS entry. The previous
                # code read the loop variable leaked from the de-duplication
                # loop, so every entry was scored against the same chunk.
                "text_score": similarity_score(question, context_title, raw_text_id),
                "feature": feature_node.node.text,
                "text": raw_node.text,
            }
        )

    additional_info = {
        "top_chunks_info": top_chunks_info,
    }
    return response, additional_info

In [3]:
# Parallel configuration lists: position i of each list describes one setup
# (chunk size, number of chunks to retrieve, chunk overlap).
chunk_sizes = [64, 128, 256, 512, 1024, 2048]
chunk_overlaps = [10, 25, 50, 100, 200, 400]
topks = [32, 16, 8, 4, 2, 1]
debug_lim = 100  # forwarded to test_longdep_qa as debug_lim

# Only the configuration at position 2 is generated in this run.
for idx in (2,):
    print("Generating output for chunk size", chunk_sizes[idx])
    output_file = f"output/testset_question_union_mistral_large_chunksize{chunk_sizes[idx]}.jsonl"
    test_longdep_qa(
        answer_reading_comprehension_with_union_question_only,
        qa_llm=mistral_large,
        top_k=topks[idx],
        chunk_size=chunk_sizes[idx],
        chunk_overlap=chunk_overlaps[idx],
        debug_lim=debug_lim,
        output_file=output_file,
    )

Generating output for chunk size 256


Answering questions:   0%|          | 0/100 [00:00<?, ?it/s]

In [4]:
# Evaluate the answers stored for the selected configuration(s). The path is
# rebuilt from chunk_sizes, which must already be defined in the kernel.
for idx in [2]:
    output_file = f"output/testset_question_union_mistral_large_chunksize{chunk_sizes[idx]}.jsonl"
    rouge_metrics = get_rouge_metrics(output_file)
    print("results for chunk size", chunk_sizes[idx])
    print("Rouge Metrics:", rouge_metrics)

    # Second metric: score the same output file with gpt4 passed as the llm.
    self_score = llm_self_score(output_file, llm=gpt4)
    print("LLM Self-Score:", self_score)

results for chunk size 256
Rouge Metrics: {'rouge1': 0.2270189121590305, 'rouge2': 0.10720936430684744, 'rougeL': 0.18327147539298372, 'rougeLsum': 0.19207667627133662}


  0%|          | 0/100 [00:00<?, ?it/s]

LLM Self-Score: 0.46
