In [None]:
%load_ext autoreload
%autoreload 2


from rag_components import *
from llama_index.core.retrievers import VectorIndexRetriever

import re

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
def answer_reading_comprehension_with_union(
    question, context_title, context_text, top_k=2, chunk_size=1024, chunk_overlap=200, qa_llm=gpt4
):
    """
    Answer a question by retrieving over the union of three indices.

    The raw-text, summary ("<title>_summaries") and question
    ("<title>_questions") indices for ``context_title`` are merged, the
    question is matched against all of them, and every hit is mapped back to
    the id of the raw text chunk it refers to. The first ``top_k`` distinct
    raw chunks (in retrieval order) are concatenated and passed to
    ``answer_reading_comprehension``.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context; used to look up the indices
        context_text (str): The text of the context (not read by this function;
            retrieval goes through the pre-built indices)
        top_k (int): The number of distinct raw text chunks to keep
        chunk_size (int): Not read by this function; kept for interface compatibility
        chunk_overlap (int): Not read by this function; kept for interface compatibility
        qa_llm: The LLM handle forwarded to ``answer_reading_comprehension``

    Returns:
        tuple: ``(response, additional_info)`` where ``response`` is whatever
        ``answer_reading_comprehension`` returns and ``additional_info`` is a
        dict with a single key ``"top_chunks_info"``: one dict per selected
        chunk describing the matched feature and the raw text it maps to.
    """

    raw_index = get_index_by_title(context_title)
    summary_index = get_index_by_title(context_title + "_summaries")
    question_index = get_index_by_title(context_title + "_questions")
    combined_index = index_union([raw_index, summary_index, question_index])
    retriever = VectorIndexRetriever(
        index=combined_index,
        # Over-fetch: several hits can collapse onto the same raw chunk, so
        # more than top_k candidates are needed to find top_k distinct ones.
        similarity_top_k=top_k * 32,
    )
    top_nodes = retriever.retrieve(question)

    seen_raw_text_ids = set()
    selected_hits = []  # (raw_text_id, feature_type, hit), in retrieval order
    for hit in top_nodes:
        feature_id = hit.node.id_
        if "summary_" in feature_id:
            feature_type = "summary"
            raw_text_id = feature_id.replace("summary_", "text_chunk_")
        elif "question_" in feature_id:
            feature_type = "question"
            # Question ids carry one extra trailing "_<suffix>" segment
            # (presumably the per-chunk question number); drop it to recover
            # the chunk id.
            raw_text_id = feature_id.replace("question_", "text_chunk_").rsplit("_", 1)[0]
        else:
            feature_type = "text"
            raw_text_id = feature_id

        if raw_text_id in seen_raw_text_ids:
            continue
        seen_raw_text_ids.add(raw_text_id)
        selected_hits.append((raw_text_id, feature_type, hit))
        if len(selected_hits) == top_k:
            break

    nodes_to_include = [get_node_by_id(raw_index, raw_text_id) for raw_text_id, _, _ in selected_hits]
    texts_to_include_combined = " ".join(raw_node.text for raw_node in nodes_to_include)

    response = answer_reading_comprehension(question, texts_to_include_combined, qa_llm=qa_llm)

    top_chunks_info = [
        {
            "feature_type": feature_type,
            "feature_score": hit.score,
            "feature_rank": rank,
            # Score each entry against its OWN raw text chunk. The previous
            # code read a stale loop variable here, so every entry was scored
            # against the last retrieved hit.
            # NOTE(review): assumes similarity_score expects a raw-index node id - confirm.
            "text_score": similarity_score(question, context_title, raw_text_id),
            "feature": hit.node.text,
            "text": raw_node.text,
        }
        for rank, ((raw_text_id, feature_type, hit), raw_node) in enumerate(zip(selected_hits, nodes_to_include))
    ]

    additional_info = {
        "top_chunks_info": top_chunks_info,
    }
    return response, additional_info