In [1]:
%load_ext autoreload
%autoreload 2


from rag_components import *
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import Settings

import re

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## Question Generation Specific Components


In [3]:
def generate_questions_from_chunks(chunks, question_gen_llm=Settings.llm, text_title=None, num_questions_per_1k_tokens=32):
    """
    Generate reading comprehension questions for each text chunk.

    For every chunk the LLM is prompted to write a numbered list of questions.
    The number of questions requested scales with the chunk length as measured
    by `count_tokens`, with a minimum of one question per chunk.

    Args:
        chunks (list[str]): The text chunks to generate questions for.
        question_gen_llm: LLM used to write the questions. It must provide
            `.complete(prompt)` returning an object with a `.text` attribute.
            NOTE: the default `Settings.llm` is evaluated once, when this
            function is defined; later changes to `Settings.llm` are only
            picked up if the LLM is passed explicitly.
        text_title (str | None): Label shown in the progress bar.
            Defaults to "chunks".
        num_questions_per_1k_tokens (int): Number of questions to request per
            1024 tokens of chunk text.

    Returns:
        list[list[str]]: One list of question strings per chunk, in the same
        order as `chunks`, with the "N." list numbering and blank lines removed.
    """
    if text_title is None:
        text_title = "chunks"

    question_sets = []
    for chunk in tqdm(chunks, desc=f'Generating questions for "{text_title}"', leave=False):
        # Always ask for at least one question: for short chunks the rounded
        # value is 0, which would produce the prompt "Generate 0 ... questions".
        num_questions = max(1, round(count_tokens(chunk) * num_questions_per_1k_tokens / 1024))
        prompt = f"""\
            <s>[INST]{chunk}

            Generate {num_questions} reading comprehension questions based on the text above. \
            Each question should be unique and should require a thoughtful answer. \
            Each question should begin with a number followed by a period and a space, denoting its position in the list. \
            Separate each question with a line break. [/INST]\

            Questions: \
        """
        # Collapse the source-code indentation (and any other runs of spaces,
        # including those inside the chunk text) to single spaces.
        prompt = re.sub(r" +", " ", prompt)
        question_set_raw = question_gen_llm.complete(prompt).text.strip()

        question_set = []
        for line in question_set_raw.split("\n"):
            # Strip BEFORE removing the "N." prefix so that indented list items
            # (e.g. "  3. ...") lose their numbering as well.
            question = re.sub(r"^\d+\.", "", line.strip()).strip()
            if question:
                question_set.append(question)
        question_sets.append(question_set)
    return question_sets


def generate_questions_from_index(text_title, overwrite_existing=False, question_gen_llm=Settings.llm):
    """
    Build (or reuse) an index of LLM-generated questions for an indexed text.

    Args:
        text_title (str): Title of the text index whose chunks the questions are
            generated from. The question index is stored under the title
            f"{text_title}_questions".
        overwrite_existing (bool): If False and a question index is already
            found under that title, it is returned without generating anything.
            The flag is also forwarded to create_index_from_chunks_with_ids.
        question_gen_llm: LLM forwarded to generate_questions_from_chunks.

    Returns:
        The question index: either the one found by get_index_by_title or the
        one returned by create_index_from_chunks_with_ids. Each question is
        stored under the id "question_<chunk number>_<position in its chunk>".
    """
    questions_title = f"{text_title}_questions"

    if not overwrite_existing:
        cached_index = get_index_by_title(questions_title)
        if cached_index:
            return cached_index

    text_index = get_index_by_title(text_title)
    chunk_ids = get_ids_from_index(text_index)
    chunk_texts = [get_text_by_id(text_index, chunk_id) for chunk_id in chunk_ids]
    question_sets = generate_questions_from_chunks(chunk_texts, question_gen_llm, text_title=text_title)

    # Flatten the per-chunk question lists while building the matching ids.
    questions = []
    question_ids = []
    for chunk_id, question_set in zip(chunk_ids, question_sets):
        if not question_set:
            # Nothing to store for this chunk; its id is never parsed.
            continue
        # The chunk number is the last "_"-separated field of the chunk id.
        chunk_number = int(chunk_id.split("_")[-1])
        for position, generated_question in enumerate(question_set):
            questions.append(generated_question)
            question_ids.append(f"question_{chunk_number}_{position}")

    return create_index_from_chunks_with_ids(
        questions, question_ids, questions_title, overwrite_existing=overwrite_existing
    )


def answer_reading_comprehension_with_question_generation(
    question, context_title, context_text, top_k=2, chunk_size=1024, chunk_overlap=200, qa_llm=gpt4
):
    """
    Answer a question about a context, retrieving text chunks via generated questions.

    The user's question is matched against questions that were generated for
    each text chunk; the chunks behind the best-matching generated questions
    are then concatenated and passed to the QA model.

    Args:
        question (str): The question to answer.
        context_title (str): Title under which the context's indices are stored.
        context_text (str): The text of the context.
        top_k (int): Maximum number of distinct text chunks passed to the QA model.
        chunk_size (int): Forwarded to create_index_from_text_with_ids.
        chunk_overlap (int): Forwarded to create_index_from_text_with_ids.
        qa_llm: LLM forwarded to answer_reading_comprehension.

    Returns:
        tuple: `(response, additional_info)`.
            response: whatever answer_reading_comprehension returns.
            additional_info (dict): `{"top_chunks_info": [...]}` with one dict
            per selected chunk, holding the generated question that selected
            the chunk (its text, rank and score) and the chunk's own text,
            similarity score and rank in direct text retrieval (`None` if the
            chunk is not among the top `top_k * 10` directly retrieved chunks).
    """
    # Make sure the raw-text index is available before generating questions:
    # generate_questions_from_index fetches it via get_index_by_title(context_title).
    text_index = create_index_from_text_with_ids(
        context_text, context_title, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Retrieve the generated questions most similar to the user's question.
    # More than top_k are fetched (top_k * 32), presumably because several
    # generated questions can point at the same text chunk.
    question_index = generate_questions_from_index(context_title, question_gen_llm=mixtral)
    question_retriever = VectorIndexRetriever(
        index=question_index,
        similarity_top_k=top_k * 32,
    )
    top_question_chunks = question_retriever.retrieve(question)

    # Map every retrieved question to its source chunk and remember the rank of
    # the FIRST retrieved question for each chunk. Question ids have the form
    # "question_<chunk number>_<j>", so the chunk number is the second-to-last
    # "_"-separated field. Dicts preserve insertion order, so the chunks stay
    # in order of first appearance.
    first_question_rank_by_text_id = {}
    for question_rank, retrieved_question in enumerate(top_question_chunks):
        text_id = f"text_chunk_{retrieved_question.node.id_.split('_')[-2]}"
        first_question_rank_by_text_id.setdefault(text_id, question_rank)
    selected = list(first_question_rank_by_text_id.items())[:top_k]

    # Fetch the corresponding raw text chunks.
    corresponding_chunks = [get_node_by_id(text_index, text_id) for text_id, _ in selected]

    # Retrieve the top_k * 10 raw text chunks directly and look up where the
    # selected chunks rank in that list (None if they are not in it).
    text_retriever = VectorIndexRetriever(
        index=text_index,
        similarity_top_k=top_k * 10,
    )
    retrieved_texts = text_retriever.retrieve(question)
    direct_rank_by_id = {}
    for direct_rank, retrieved_text in enumerate(retrieved_texts):
        direct_rank_by_id.setdefault(retrieved_text.node.id_, direct_rank)
    text_chunk_ranks = [direct_rank_by_id.get(chunk.id_) for chunk in corresponding_chunks]

    corresponding_chunks_text_combined = " ".join(chunk.text for chunk in corresponding_chunks)

    response = answer_reading_comprehension(question, corresponding_chunks_text_combined, qa_llm=qa_llm)

    # Report, for each selected chunk, the generated question that actually
    # selected it. Indexing top_question_chunks by the chunk's position in the
    # de-duplicated list would pair chunks with unrelated questions whenever
    # several retrieved questions share a chunk.
    top_chunks_info = [
        {
            # Key name kept as-is; the value is the generated question's retrieval score.
            "summary_score": top_question_chunks[question_rank].score,
            "generated_question_rank": question_rank,
            "text_score": similarity_score(question, context_title, chunk.id_),
            "text_rank": text_rank,
            "generated_question": top_question_chunks[question_rank].node.text,
            "text": chunk.text,
        }
        for (_, question_rank), chunk, text_rank in zip(selected, corresponding_chunks, text_chunk_ranks)
    ]

    additional_info = {
        "top_chunks_info": top_chunks_info,
    }
    return response, additional_info

## Inference


In [4]:
# Experiment grid: position i of each list below belongs to the same configuration.
debug_lim = 100
chunk_sizes = [64, 128, 256, 512, 1024, 2048]
topks = [32, 16, 8, 4, 2, 1]
chunk_overlaps = [10, 25, 50, 100, 200, 400]

# Only the configuration at position 2 of the grid is run here.
for idx in [2]:
    chunk_size = chunk_sizes[idx]
    print("Generating output for chunk size", chunk_size)
    output_file = f"output/testset_questions_mistral_large_chunksize{chunk_size}.jsonl"
    run_options = dict(
        output_file=output_file,
        debug_start=105,
        debug_lim=debug_lim,
        qa_llm=mistral_large,
        chunk_size=chunk_size,
        top_k=topks[idx],
        chunk_overlap=chunk_overlaps[idx],
    )
    test_longdep_qa(answer_reading_comprehension_with_question_generation, **run_options)

Generating output for chunk size 256


Answering questions:   0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Score the answers stored in the output file of each selected configuration.
for idx in [2]:
    chunk_size = chunk_sizes[idx]
    output_file = f"output/testset_questions_mistral_large_chunksize{chunk_size}.jsonl"

    # ROUGE metrics as returned by get_rouge_metrics for this output file.
    rouge_metrics = get_rouge_metrics(output_file)
    print("results for chunk size", chunk_size)
    print("Rouge Metrics:", rouge_metrics)

    # Score from llm_self_score, using gpt4 as the scoring model.
    self_score = llm_self_score(output_file, llm=gpt4)
    print("LLM Self-Score:", self_score)

results for chunk size 256
Rouge Metrics: {'rouge1': 0.19030744769001515, 'rouge2': 0.08193443045093564, 'rougeL': 0.16445576181142998, 'rougeLsum': 0.16977922983544602}


  0%|          | 0/100 [00:00<?, ?it/s]

LLM Self-Score: 0.52
