In [2]:
%load_ext autoreload
%autoreload 2


from rag_components import *
from aws_mixtral import mixtral
from llama_index.core.retrievers import VectorIndexRetriever
from concurrent.futures import ThreadPoolExecutor

from rich.progress import track

import re

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## Summarization-Specific Components


In [9]:
def summarize_chunks(chunks, summarizer_llm=Settings.llm, text_title=None):
    """
    Produce note-style summaries, one per text chunk, using summarizer_llm.

    Each chunk is embedded in a note-taking prompt (bullet points, 250-350 words).
    Runs of spaces in the prompt are collapsed to a single space before the prompt
    is sent, and the stripped completion text is collected in input order.

    Args:
        chunks (list[str]): A list of text chunks to be summarized.
        summarizer_llm: Object whose complete(prompt) result exposes a .text string.
            The default, Settings.llm, is evaluated once, when this function is defined.
        text_title (str | None): Label shown in the progress bar; "chunks" is used
            when it is None.

    Returns:
        list[str]: Summaries of the text chunks.
    """
    label = "chunks" if text_title is None else text_title

    def _take_notes(chunk):
        # The prompt is indented for readability in source; the regex below squeezes
        # that indentation (and any other run of spaces) down to a single space.
        prompt = f"""\
            {chunk}

            Take notes on the passage above. \
            You will be quizzed on the information later, and you must have every detail in your notes because you will be asked about very specific facts. \
            However, your notes should be concise and easy to read. \
            The notes should be between 250 and 350 words, and they should be formatted as bullet points. \

            Summary: \
        """
        prompt = re.sub(r" +", " ", prompt)
        return summarizer_llm.complete(prompt).text.strip()

    progress = tqdm(chunks, desc=f'Summarizing "{label}"', leave=False)
    return [_take_notes(chunk) for chunk in progress]


def summarize_index(text_title, overwrite_existing=False, summarizer_llm=Settings.llm):
    """
    Build (or reuse) the index of per-chunk summaries for a stored text.

    The summaries index is stored under the title f"{text_title}_summaries". Each
    summary receives the id "summary_<n>", where <n> is the integer found after the
    last underscore of the id of the chunk it summarizes.

    Args:
        text_title (str): Title passed to get_index_by_title to locate the text index.
        overwrite_existing (bool): When False, a truthy existing summaries index is
            returned as-is and nothing is summarized. The flag is also forwarded to
            create_index_from_chunks_with_ids.
        summarizer_llm: LLM forwarded to summarize_chunks.

    Returns:
        The existing summaries index, or the index returned by
        create_index_from_chunks_with_ids for the freshly generated summaries.
    """
    summaries_title = f"{text_title}_summaries"

    # Reuse earlier work unless the caller explicitly asks for a rebuild.
    if not overwrite_existing:
        cached_index = get_index_by_title(summaries_title)
        if cached_index:
            return cached_index

    source_index = get_index_by_title(text_title)
    chunk_ids = get_ids_from_index(source_index)

    # Carry the numeric suffix of every chunk id over to its summary id so the
    # summary can later be traced back to the chunk it came from.
    summary_ids = []
    for chunk_id in chunk_ids:
        suffix = int(chunk_id.split("_")[-1])
        summary_ids.append(f"summary_{suffix}")

    chunk_texts = [get_text_by_id(source_index, chunk_id) for chunk_id in chunk_ids]
    summaries = summarize_chunks(chunk_texts, summarizer_llm, text_title=text_title)

    return create_index_from_chunks_with_ids(
        summaries, summary_ids, summaries_title, overwrite_existing=overwrite_existing
    )


def answer_reading_comprehension_with_summarization(
    question, context_title, context_text, top_k=2, chunk_size=1024, chunk_overlap=200, qa_llm=gpt4
):
    """
    Answer a question using the raw text chunks whose summaries best match it.

    Retrieval runs over the summaries index. Every retrieved summary is mapped to
    the raw chunk "text_chunk_<n>" that shares its id suffix, and those raw chunks
    (not the summaries) are handed to qa_llm as context.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context
        context_text (str): The text of the context
        top_k (int): The number of top matching summary chunks to retrieve
        chunk_size (int): Forwarded to create_index_from_text_with_ids
        chunk_overlap (int): Forwarded to create_index_from_text_with_ids
        qa_llm: LLM whose complete(prompt).text is returned as the answer

    Returns:
        tuple: (response, additional_info). response is the answer text.
        additional_info["top_chunks_info"] holds one dict per retrieved summary with
        its score and rank, the matching raw chunk's score and rank (rank is None
        when the chunk is not among the top_k * 10 raw-text hits), and both texts.
    """

    # Retrieve the top_k summaries most similar to the question.
    # NOTE(review): summarize_index looks up the text index by title before the text
    # index is (re)created below -- confirm that ordering is safe on a first run.
    summary_index = summarize_index(context_title, summarizer_llm=mixtral)
    summary_retriever = VectorIndexRetriever(index=summary_index, similarity_top_k=top_k)
    summary_hits = summary_retriever.retrieve(question)

    # Translate every summary id into the id of the raw chunk it summarizes
    raw_chunk_ids = [f"text_chunk_{hit.node.id_.split('_')[-1]}" for hit in summary_hits]

    # Fetch those raw chunks from the text index
    text_index = create_index_from_text_with_ids(
        context_text, context_title, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    raw_nodes = [get_node_by_id(text_index, raw_id) for raw_id in raw_chunk_ids]

    # Retrieve ten times as many raw chunks directly, purely to report where the
    # summary-selected chunks would have ranked under plain raw-text retrieval.
    text_retriever = VectorIndexRetriever(index=text_index, similarity_top_k=top_k * 10)
    text_hits = text_retriever.retrieve(question)

    def _rank_of(node_id):
        # Position of the first raw-text hit with this id, or None if it is absent
        return next((pos for pos, hit in enumerate(text_hits) if hit.node.id_ == node_id), None)

    text_chunk_ranks = [_rank_of(node.id_) for node in raw_nodes]

    corresponding_chunks_text_combined = " ".join(node.text for node in raw_nodes)

    prompt = f"""Consider the following context with depth and thoughtfulness: {corresponding_chunks_text_combined}\n\n\
        Respond to the following question with insight and nuance. Answer concisely, often in one \
        sentence or less and sometimes in the form of a list or structured text. If the question \
        asks you to order events, refer to the events by their number (e.g. "1. third event, 2. second \
        event, 3. first event" -> "3, 2, 1"). Answer multiple choice questions using the number which \
        corresponds to the correct answer (e.g. "1. A, 2. B, 3. C" -> "2"). Do not include the \
        question in your answer. \
        \n\n\
        Question: {question}\n\n\
        Answer: Considering the context above, """
    response = qa_llm.complete(prompt).text

    # Per-chunk diagnostics comparing summary-based and raw-text retrieval
    top_chunks_info = []
    for position, raw_node in enumerate(raw_nodes):
        summary_hit = summary_hits[position]
        top_chunks_info.append(
            {
                "summary_score": summary_hit.score,
                "summary_rank": position,
                "text_score": similarity_score(question, context_title, raw_node.id_),
                "text_rank": text_chunk_ranks[position],
                "summary": summary_hit.node.text,
                "text": raw_node.text,
            }
        )

    return response, {"top_chunks_info": top_chunks_info}

## Inference


In [13]:
# Value forwarded to test_longdep_qa as debug_lim (presumably a cap on the number of
# questions evaluated -- the progress bar below counts to 100; confirm in rag_components).
debug_lim = 100
# Results path shared with the later cells, which read it back for scoring and
# for the Markdown export.
output_file = "output/summarization_with_corresponding_in_context.jsonl"
# Run the benchmark with the summarization-based answerer, using gpt4 as the QA model.
test_longdep_qa(answer_reading_comprehension_with_summarization, output_file=output_file, debug_lim=debug_lim, qa_llm=gpt4)

Answering questions:   0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
# Score the answers stored in output_file with ROUGE.
rouge_metrics = get_rouge_metrics(output_file)
print("Rouge Metrics:", rouge_metrics)

# Second score from llm_self_score with gpt4 passed as the llm
# (presumably gpt4 acts as the grader -- confirm in rag_components).
self_score = llm_self_score(output_file, llm=gpt4)
print("LLM Self-Score:", self_score)

Rouge Metrics: {'rouge1': 0.4339674877074614, 'rouge2': 0.14931325911875848, 'rougeL': 0.3785455331636343, 'rougeLsum': 0.38248379399744403}
LLM Self-Score: 0.44


In [15]:
# Best: 0.44 – output/summarization_with_corresponding_in_context.jsonl

In [32]:
def dict_to_markdown(d, level=2):
    """
    Render a (possibly nested) dict as Markdown text, one heading per key.

    Args:
        d (dict): Mapping to render. Dict values become nested sections one heading
            level deeper; list values become bullet items, except dict items, which
            are rendered as nested sections; any other value is written as a paragraph.
        level (int): Number of "#" characters used for the headings of this dict.

    Returns:
        str: The Markdown text ("" for an empty dict).
    """
    heading = "#" * level
    parts = []
    for key, value in d.items():
        title = f"{heading} {key}\n\n"
        if isinstance(value, dict):
            parts.append(title + dict_to_markdown(value, level + 1))
        elif isinstance(value, list):
            parts.append(title)
            parts.extend(
                dict_to_markdown(item, level + 1) if isinstance(item, dict) else f"- {item}\n"
                for item in value
            )
            # Blank line closing the list
            parts.append("\n")
        else:
            parts.append(f"{title}{value}\n\n")

    return "".join(parts)


# Convert the saved question/answer records (one JSON object per line of
# output_file) into a single Markdown report under output_md/.
import json
from pathlib import Path

# Blank lines (such as a trailing newline at the end of the file) are skipped
# rather than passed to json.loads, which would raise on them.
with open(output_file, "r", encoding="utf-8") as f:
    question_objects = [json.loads(line) for line in f if line.strip()]

markdowns = [dict_to_markdown(obj) for obj in question_objects]

# The report is named after the results file: its base name up to the first ".".
markdown_dir = Path("output_md")
# open(..., "w") does not create missing directories, so make sure it exists.
markdown_dir.mkdir(parents=True, exist_ok=True)
markdown_path = markdown_dir / f"{output_file.split('/')[-1].split('.')[0]}.md"

# Explicit UTF-8 so non-ASCII characters in the model output are written the
# same way on every platform instead of depending on the locale encoding.
with open(markdown_path, "w", encoding="utf-8") as f:
    f.write("\n\n".join(markdowns))