In [18]:
!pip install tiktoken



In [19]:
%load_ext autoreload
%autoreload 2


from rag_components import *
from aws_mixtral import mixtral
from llama_index.core.retrievers import VectorIndexRetriever

from rich.progress import track

import re

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Summarization-Specific Components


In [29]:
def summarize_chunks(chunks, summarizer_llm=Settings.llm, text_title=None):
    """
    Generate a summary for each text chunk using the summarizer_llm.

    The first chunk is summarized on its own; every later chunk is summarized
    with the summary of the chunk immediately before it supplied as context.

    Args:
        chunks (list[str]): A list of text chunks to be summarized. Summaries
            are chained in list order.
        summarizer_llm: Object whose ``complete(prompt)`` returns a response
            exposing a ``text`` attribute. The default is ``Settings.llm`` as it
            was when this function was defined (default arguments are evaluated
            once, at definition time).
        text_title (str | None): Label shown in the progress bar; falls back to
            "chunks" when None.

    Returns:
        list[str]: One whitespace-stripped summary per chunk, in input order.
        An empty ``chunks`` yields an empty list.
    """
    summaries = []
    if text_title is None:
        text_title = "chunks"
    for chunk in tqdm(chunks, desc=f'Summarizing "{text_title}"', leave=False):
        if not summaries:
            # Nothing has been summarized yet, so there is no context to pass along.
            prompt = f"""Summarize the following text conceptually. \
                The summary should paraphrase the original text, be significantly \
                shorter, retain all propositions, and be able to replace the \
                original text. Here is the text to be summarized:\n\n{chunk}"""
        else:
            # Only the most recent summary is used as context, not the whole history.
            prompt = f"""Summarize the following text conceptually in the context of the text that precedes it. \
                The summary should paraphrase the original text, be significantly \
                shorter, retain all propositions, and be able to replace the \
                original text. Here is the context:\n\n{summaries[-1]}\n\n\
                Summarize the following text:\n\n{chunk}"""

        # Collapse every whitespace run to one space. This removes the source
        # indentation embedded in the f-string above, but note it is applied to
        # the whole prompt, so line breaks inside the chunk and the previous
        # summary are flattened as well.
        prompt = re.sub(r"\s+", " ", prompt)
        response = summarizer_llm.complete(prompt).text.strip()
        summaries.append(response)
    return summaries


def summarize_index(text_title, overwrite_existing=False, summarizer_llm=Settings.llm):
    """
    Build, or reuse, the index of per-chunk summaries for a stored text.

    The summaries index is stored under the title ``f"{text_title}_summaries"``.
    Each summary receives the id ``summary_<n>``, where ``<n>`` is the integer
    suffix (the part after the last underscore) of the source chunk's id.

    Args:
        text_title (str): Title under which the source text index can be looked
            up with ``get_index_by_title``.
        overwrite_existing (bool): When False, an already existing summaries
            index is returned as-is and no summarization is performed. The flag
            is also forwarded to ``create_index_from_chunks_with_ids``.
        summarizer_llm: LLM handed to ``summarize_chunks``. The default is
            ``Settings.llm`` as it was when this function was defined.

    Returns:
        The summaries index: either the existing one, or the one returned by
        ``create_index_from_chunks_with_ids`` (presumably a VectorStoreIndex;
        confirm against that helper).
    """
    summaries_title = f"{text_title}_summaries"

    # Reuse previous work unless the caller explicitly asked for a rebuild.
    if not overwrite_existing:
        cached_index = get_index_by_title(summaries_title)
        if cached_index:
            return cached_index

    source_index = get_index_by_title(text_title)
    chunk_ids = get_ids_from_index(source_index)

    # Mirror the numeric suffix of each chunk id onto its summary id.
    summary_ids = ["summary_" + str(int(chunk_id.split("_")[-1])) for chunk_id in chunk_ids]

    chunk_texts = [get_text_by_id(source_index, chunk_id) for chunk_id in chunk_ids]
    chunk_summaries = summarize_chunks(chunk_texts, summarizer_llm, text_title=text_title)

    return create_index_from_chunks_with_ids(
        chunk_summaries, summary_ids, summaries_title, overwrite_existing=overwrite_existing
    )


def answer_reading_comprehension_with_summarization(
    question, context_title, context_text, top_k=2, chunk_size=1024, chunk_overlap=200, qa_llm=qa_llm
):
    """
    Answer a question about a context using retrieved chunk summaries.

    The context is indexed, its chunks are summarized, the summaries most
    similar to the question are retrieved, and as many of them as fit in a
    budget of ``top_k * chunk_size`` tokens are passed to ``qa_llm``.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context
        context_text (str): The text of the context
        top_k (int): The number of top matching chunks (or equivalent tokens) to retrieve
        chunk_size (int): Chunk size used when indexing the context; also the
            per-chunk unit of the token budget.
        chunk_overlap (int): Overlap between chunks, forwarded to the indexer.
        qa_llm: LLM used to produce the final answer; must expose
            ``complete(prompt)`` returning an object with a ``text`` attribute.

    Returns:
        tuple[str, dict]: The raw answer text, and a dict with the keys
        ``raw_text_chunk_count``, ``summary_chunk_count`` and
        ``top_chunks_text`` describing the context that was used.
    """
    # Called for its effect only; the returned index is not used here.
    # NOTE(review): presumably this stores the index under `context_title` so
    # that summarize_index can look it up by title - confirm in rag_components.
    create_index_from_text_with_ids(
        context_text, context_title, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    # Summaries are always produced with mixtral, independent of `qa_llm`.
    summary_index = summarize_index(context_title, summarizer_llm=mixtral)

    # Only the summary index is searched; raw text chunks are not included.
    combined_index = index_union([summary_index])

    # Over-fetch candidates; the token budget below decides how many are kept.
    retriever = VectorIndexRetriever(
        index=combined_index,
        similarity_top_k=20,
    )
    candidates = retriever.retrieve(question)

    max_tokens = top_k * chunk_size

    # Keep candidates in rank order until adding the next one would exceed the
    # budget. The running text carries a trailing space after every chunk, as
    # before, so the token counts used for the cut-off are unchanged.
    top_chunks = []
    budget_text = ""
    for candidate in candidates:
        candidate_text = candidate.node.text
        if count_tokens(budget_text) + count_tokens(candidate_text) > max_tokens:
            break
        top_chunks.append(candidate)
        budget_text += candidate_text + " "

    # Extract and combine the text from the top matching chunks for use as context
    top_chunks_text = [chunk.node.text for chunk in top_chunks]
    top_chunks_text_combined = " ".join(top_chunks_text)

    # Tally where the selected chunks came from, based on their id prefix.
    # The two prefixes cannot both match the same id, so the counts are disjoint.
    raw_text_chunk_count = sum(1 for chunk in top_chunks if chunk.node.id_.startswith("text_chunk_"))
    summary_chunk_count = sum(1 for chunk in top_chunks if chunk.node.id_.startswith("summary_"))

    # Use the combined context of top chunks to generate an answer to the question
    # Construct a prompt that guides the LLM to consider the context and answer the question subjectively or conceptually
    prompt = f"""Consider the following context with depth and thoughtfulness: {top_chunks_text_combined}\n\n\
        Respond to the following question with insight and nuance. Answer concisely, often in one \
        sentence or less and sometimes in the form of a list or structured text. If the question \
        asks you to order events, refer to the events by their number (e.g. "1. third event, 2. second \
        event, 3. first event" -> "3, 2, 1"). Answer multiple choice questions using the number which \
        corresponds to the correct answer (e.g. "1. A, 2. B, 3. C" -> "2"). Do not include the \
        question in your answer. \
        \n\n\
        Question: {question}\n\n\
        Answer: """
    response = qa_llm.complete(prompt).text
    additional_info = {
        "raw_text_chunk_count": raw_text_chunk_count,
        "summary_chunk_count": summary_chunk_count,
        "top_chunks_text": top_chunks_text,
    }
    return response, additional_info

## Inference


In [30]:
# Forwarded to test_longdep_qa as `debug_lim`.
# NOTE(review): presumably caps the number of questions evaluated - confirm in rag_components.
debug_lim = 100
# Build the results path from debug_lim so the filename cannot drift from the
# sample size actually used (yields "output/mixtral_summary_only_100.jsonl" here).
output_file = f"output/mixtral_summary_only_{debug_lim}.jsonl"
test_longdep_qa(
    answer_reading_comprehension_with_summarization,
    output_file=output_file,
    debug_lim=debug_lim,
    qa_llm=mixtral,
)

Answering questions:   0%|          | 0/100 [00:00<?, ?it/s]

In [31]:
# Score the answers stored in `output_file` with ROUGE and print the result.
rouge_metrics = get_rouge_metrics(output_file)
print("Rouge Metrics:", rouge_metrics)

# Score the same answers with an LLM-based scorer.
# NOTE(review): the answers were generated with `mixtral`, but scoring uses
# `qa_llm` - confirm this is the intended judge model.
self_score = llm_self_score(output_file, llm=qa_llm)
print("LLM Self-Score:", self_score)

Rouge Metrics: {'rouge1': 0.20306477452481553, 'rouge2': 0.05884763085108288, 'rougeL': 0.16059295343233992, 'rougeLsum': 0.17164550855100139}
LLM Self-Score: 0.32


In [9]:
def dict_to_markdown(d, level=2):
    """
    Render a (possibly nested) dict as Markdown text.

    Every key becomes a heading made of ``level`` ``#`` characters. The value
    under it is rendered according to its type:

    * dict - rendered recursively one heading level deeper;
    * list - dict items are rendered recursively one level deeper, any other
      item becomes a ``- item`` bullet, and a blank line closes the list;
    * anything else - written as a paragraph followed by a blank line.

    Args:
        d (dict): The mapping to render.
        level (int): Heading depth used for the keys of ``d``.

    Returns:
        str: The Markdown text; an empty string for an empty dict.
    """
    hashes = "#" * level
    sections = []
    for key, value in d.items():
        heading = f"{hashes} {key}\n\n"

        if isinstance(value, dict):
            sections.append(heading + dict_to_markdown(value, level + 1))
            continue

        if isinstance(value, list):
            rendered_items = [
                dict_to_markdown(item, level + 1) if isinstance(item, dict) else f"- {item}\n"
                for item in value
            ]
            sections.append(heading + "".join(rendered_items) + "\n")
            continue

        sections.append(f"{heading}{value}\n\n")

    return "".join(sections)


# Imported here so this cell does not rely on names leaking in through
# `from rag_components import *`.
import json
import os

# Load every JSON-lines record from `output_file`, one object per line.
question_objects = []
with open(output_file, "r") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line) instead of failing in json.loads.
        if line.strip():
            question_objects.append(json.loads(line))

markdowns = [dict_to_markdown(obj) for obj in question_objects]

# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs("output_md", exist_ok=True)

# The report is named after the results file: basename up to its first ".".
with open(f"output_md/{output_file.split('/')[-1].split('.')[0]}.md", "w") as f:
    f.write("\n\n".join(markdowns))