In [254]:
import os
import dotenv
import yaml
import random
import time
import json
from langchain import hub
from openai import OpenAI
from scipy.spatial.distance import cosine
from langchain_community.document_loaders import TextLoader
from typing_extensions import Literal
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

In [255]:
dotenv.load_dotenv('.env')

os.environ["OPENAI_API_KEY"] = os.environ.get('OPENAI_API_KEY')
key = os.environ.get('OPENAI_API_KEY')

In [256]:
client = OpenAI()

## Indexing: Load the Data

In [257]:
# Use the LangChain loader to load a narrative text
loader = TextLoader("try-to-remember.txt")
docs = loader.load()

In [258]:
# Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, chunk_overlap=500, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

In [259]:
# Print the number of splits
print(f"There are {len(all_splits)} splits\n")

# Print the first split
print("First split:\n")
print(f"{all_splits[0].page_content}")

There are 80 splits

First split:

Every mind on earth capable of understanding the problem was focused on the spaceship and the ultimatum delivered by its occupants. Talk or Die! blared the newspaper headlines.

The suicide rate was up and still climbing. Religious cults were having a field day. A book by a science fiction author: "What the Deadly Inter-Galactic Spaceship Means to You!" had smashed all previous best-seller records. And this had been going on for a frantic seven months.

The ship had flapped out of a gun-metal sky over Oregon, its shape that of a hideously magnified paramecium with edges that rippled like a mythological flying carpet. Its five green-skinned, froglike occupants had delivered the ultimatum, one copy printed on velvety paper to each major government, each copy couched faultlessly in the appropriate native tongue:

"You are requested to assemble your most gifted experts in human communication. We are about to submit a problem. We will open five identical r

## Indexing: Store the Data

Store the document chunks in a vector database using the Chroma module from the LangChain library:

In [260]:
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

This completes the Indexing portion of the pipeline. At this point we have a query-able vector store containing the chunked contents of the narrative text. 

## Retrieval 

In [7]:
number_of_chunks = 6

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": number_of_chunks})

In [8]:
retrieved_docs = retriever.invoke("Who is Ohashi?")

In [None]:
len(retrieved_docs)

6

In [22]:
print(retrieved_docs[0].page_content)

She thought about Hiko Ohashi: a strange man. He was fifty and didn't look a day over thirty. He had grown children. His wife had died of cholera eight years ago. Francine wondered what it would be like married to an Oriental, and she found herself thinking that he wasn't really Oriental with his Princeton education and Occidental ways. Then she realized that this attitude was a kind of white snobbery.

The door in the corner of the room opened softly. Ohashi came in, closed the door. "You awake?" he whispered.

She turned her head without lifting it from the chairback. "Yes."

"I'd hoped you might fall asleep for a bit," he said. "You looked so tired when I left."

Francine glanced at her wristwatch. "It's only three-thirty. What's the day like?"

"Hot and windy."

Ohashi busied himself inserting film into the projector at the rear of the room. Presently, he went to his chair, trailing the remote control cable for the projector.

"Ready?" he asked.


## Generation

In [9]:
llm = ChatOpenAI(model="gpt-4o")

In [10]:
prompt = hub.pull("rlm/rag-prompt")

In [11]:
example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()
example_messages

[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: filler question \nContext: filler context \nAnswer:")]

In [12]:
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


In [13]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
for chunk in rag_chain.stream("Who did the Galactics threaten to destroy if humanity failed?"):
    print(chunk, end="", flush=True)

The Galactics threatened to destroy humanity if it failed to learn to communicate unmistakably, implying the complete destruction of the human race.

## Applying the pipeline to the dataset

First, let's load the Q&A dataset:

In [261]:
with open('literature-qa-dataset.yml', 'r') as file:
    dataset = yaml.safe_load(file)

Next, let's define a schema for the answers, using the Pydantic `BaseModel` class:

In [262]:
class Answer(BaseModel):
    """The answer to a multiple-choice question"""

    answer: Literal["A", "B", "C", "D"] = Field(description="The answer to the question")

Finally, let's define a couple of helper functions to reuse between the different pipelines:

In [264]:
def shuffle_answers(wrong_answers, correct_answer):
    # Shuffle the answers
    answers = wrong_answers + [correct_answer]
    random.shuffle(answers)
    
    # Identify the correct answer
    correct_index = answers.index(q['correct_answer'])
    correct_letter = ['A', 'B', 'C', 'D'][correct_index]

    return answers, correct_letter

### Baseline LLM

First, we need to  create a [ChatPromptTemplate](https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html).

The template contains a system message which instructs the model to extract the right answer:

In [265]:
prompt_baseline_llm = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm."
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        ("human", """Answer the following question:
         
        {text}"""),
    ]
)

Next, we iterate over the dataset and apply the pipeline to each question:

In [19]:
for q in dataset['questions']:
    # Create the runnable using the prompt, LLM and schema for the answer
    runnable = prompt_baseline_llm | llm.with_structured_output(schema=Answer)
    
    # Shuffle the answers and get the letter of the correct answer
    answers, correct_letter = shuffle_answers(q['wrong_answers'], q['correct_answer'])
    
    # Create the text with question and answers
    text = f"""{q['question']}
    A) {answers[0]}
    B) {answers[1]}
    C) {answers[2]}
    D) {answers[3]}
    """
    
    # Invoke the runnable
    response = runnable.invoke(
        {
            "text": text
        }
    )
    
    # Print the results
    answer_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    print(f"Question: {q['question']}")
    print(f"Answer: {response.answer} ({answers[answer_mapping[response.answer]]})")
    print(f"The correct answer: {correct_letter} ({q['correct_answer']})\n")
    print(f"The answer was correct: {response.answer == correct_letter}\n")

Question: What threat did the newspaper headlines blare?
Answer: A (Listen or Die!)
The correct answer: B (Talk or Die!)

The answer was correct: False

Question: What is the name of the protagonist?
Answer: C (Theodore Zakheim)
The correct answer: D (Francine Millar)

The answer was correct: False



### Baseline LLM with full book as context

In [152]:
with open("try-to-remember.txt", "r") as file:
    full_book = file.read()

In [169]:
prompt_rag = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm."
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        ("human", """Answer the following question:
         
        {text}
         
        Based your answer on the following narrative text:
        
        {context}"""),
    ]
)

In [170]:
for q in dataset['questions']:
    runnable = prompt_rag | llm.with_structured_output(schema=Answer)
    
    # Shuffle the answers and get the letter of the correct answer
    answers, correct_letter = shuffle_answers(q['wrong_answers'], q['correct_answer'])
    
    # Create the text with question and answers
    text = f"""{q['question']}

    Answer with only the letter of the correct answer:
    A) {answers[0]}
    B) {answers[1]}
    C) {answers[2]}
    D) {answers[3]}
    """
    
    # Invoke the runnable
    response = runnable.invoke(
        {
        "text": text,
        "context": full_book
        }
    )

    # Print the results
    answer_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    print(f"Question: {q['question']}")
    print(f"Answer: {response.answer} ({answers[answer_mapping[response.answer]]})")
    print(f"The correct answer: {correct_letter} ({q['correct_answer']})\n")
    print(f"The answer was correct: {response.answer == correct_letter}\n")

Question: What threat did the newspaper headlines blare?
Answer: A (Talk or Die!)
The correct answer: A (Talk or Die!)

The answer was correct: True

Question: What is the name of the protagonist?
Answer: D (Francine Millar)
The correct answer: D (Francine Millar)

The answer was correct: True

Question: In which US state did the spaceship initially descend?
Answer: A (Oregon)
The correct answer: A (Oregon)

The answer was correct: True

Question: How many extraterrestrial beings were on the spaceship?
Answer: A (Five)
The correct answer: A (Five)

The answer was correct: True



### Naive RAG

In [69]:
count_correct = 0
for q in dataset['questions']:
    runnable = prompt_rag | llm.with_structured_output(schema=Answer)
    
    # Shuffle the answers and get the letter of the correct answer
    answers, correct_letter = shuffle_answers(q['wrong_answers'], q['correct_answer'])

    # Retrieve relevant chunks and concatenate them
    retrieved_docs = retriever.invoke(q['question'])
    chunks = "\n".join(doc.page_content for doc in retrieved_docs)
    
    # Create the text with question and answers
    text = f"""{q['question']}
    A) {answers[0]}
    B) {answers[1]}
    C) {answers[2]}
    D) {answers[3]}
    """
    
    # Invoke the runnable
    response = runnable.invoke(
        {
        "text": text,
        "context": chunks
        }
    )

    # Print the results
    answer_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    print(f"Question: {q['question']}")
    print(f"Answer: {response.answer} ({answers[answer_mapping[response.answer]]})")
    print(f"The correct answer: {correct_letter} ({q['correct_answer']})\n")
    print(f"The answer was correct: {response.answer == correct_letter}\n")
    if response.answer == correct_letter:
        count_correct += 1

print(f"Correct answers: {count_correct}\n"
    f"Total questions: {len(dataset['questions'])}\n"
    f"Accuracy: {count_correct/len(dataset['questions'])}\n")

Question: What threat did the newspaper headlines blare?
Answer: D (Talk or Die!)
The correct answer: D (Talk or Die!)

The answer was correct: True

Question: What is the name of the protagonist?
Answer: C (Francine Millar)
The correct answer: C (Francine Millar)

The answer was correct: True

Question: In which US state did the spaceship initially descend?
Answer: A (Oregon)
The correct answer: A (Oregon)

The answer was correct: True

Question: How many extraterrestrial beings were on the spaceship?
Answer: B (Five)
The correct answer: B (Five)

The answer was correct: True

Question: Which site was used by the aliens to demonstrate their destructive capabilities?
Answer: C (Eniwetok atoll)
The correct answer: C (Eniwetok atoll)

The answer was correct: True

Question: What field does Francine Millar specialize in?
Answer: C (Clinical psychology)
The correct answer: C (Clinical psychology)

The answer was correct: True

Question: What did Francine have with her when she walked throu

## Advanved RAG

In [235]:
prompt_hyde = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a hypothetical document creator."
            "Given a question, you generate one to three sentences of narrative text that contains answers for the question."
        ),
        ("human", """Here is a question:
         
        {question}
         
        Generate a narrative text that contains answers to the question."""),
    ]
)

In [240]:
correct_answers = 0

retrieved_docs = retriever.invoke("")
sample_text = "\n".join(doc.page_content for doc in retrieved_docs)
print(sample_text)

for q in dataset['questions']:
    runnable = prompt_rag | llm.with_structured_output(schema=Answer)
    hyde = prompt_hyde | llm
    
    # Shuffle the answers and get the letter of the correct answer
    answers, correct_letter = shuffle_answers(q['wrong_answers'], q['correct_answer'])

    hyde_response = hyde.invoke(
        {
        "question": q['question']
        }
    )

    # Print the results
    embedding_question = client.embeddings.create(
        input=q['question'],
        model="text-embedding-3-small"
    ).data[0].embedding

    for i, doc in enumerate(retrieved_docs):
        embedding_chunk = client.embeddings.create(
            input=doc.page_content,
            model="text-embedding-3-small"
        ).data[0].embedding
        print(f"Chunk {i}: {doc.page_content}\n")
        print(f"Chunk {i} similarity: {cosine(embedding_question, embedding_chunk)}\n")

    # Retrieve relevant chunks and concatenate them
    retrieved_docs = retriever.invoke(hyde_response.content)
    chunks = "\n".join(doc.page_content for doc in retrieved_docs)

    # Create the text with question and answers
    text = f"""{q['question']}
    A) {answers[0]}
    B) {answers[1]}
    C) {answers[2]}
    D) {answers[3]}
    """
    
    # Invoke the runnable
    response = runnable.invoke(
        {
        "text": text,
        "context": chunks
        }
    )

    # Print the results
    answer_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    print(f"Question: {q['question']}")
    print(f"Answer: {response.answer} ({answers[answer_mapping[response.answer]]})")
    print(f"The correct answer: {correct_letter} ({q['correct_answer']})\n")
    print(f"The answer was correct: {response.answer == correct_letter}\n")
    if response.answer == correct_letter:
        correct_answers += 1
print(f"Correct answers: {correct_answers}\n"
    f"Total questions: {len(dataset['questions'])}\n"
    f"Accuracy: {correct_answers/len(dataset['questions'])}\n")

The ship had flapped out of a gun-metal sky over Oregon, its shape that of a hideously magnified paramecium with edges that rippled like a mythological flying carpet. Its five green-skinned, froglike occupants had delivered the ultimatum, one copy printed on velvety paper to each major government, each copy couched faultlessly in the appropriate native tongue:
A scuffing sound intruded as the five green-skinned figures shuffled forward. They were trembling, and Francine saw glistening drops of wetness below their crests. Their eyes blinked. She sensed the aura of sadness about them, and new tears welled in her eyes.
Anger coursed through her. She stopped on the steps, stood there shivering. A new feeling of futility replaced the anger. Tears blurred her vision. What can one lone woman do against such ruthless schemers?
The doors of the spaceship opened. Five green-skinned figures emerged. They stopped, stood staring at her, their shoulders slumped. Simultaneously, Francine felt the thi

## One pipeline for all four systems

Define a function for the LLM baseline:

In [266]:
def answer_llm_baseline(runnable, question_prompt):
    
    # Start a timer
    start_time = time.time()

    # Invoke the runnable
    response = runnable.invoke(
        {
            "text": question_prompt
        }
    )

    # Calculate the processing time
    processing_time = time.time() - start_time

    return response.answer, processing_time

Define a function for the LLM with full book as context:

In [267]:
def answer_llm_full_context(runnable, question_prompt):
    
    # Start a timer
    start_time = time.time()

    # Invoke the runnable, with the full book as context
    response = runnable.invoke(
        {
        "text": question_prompt,
        "context": full_book
        }
    )

    # Calculate the processing time
    processing_time = time.time() - start_time

    return response.answer, processing_time

Define a function for the Naive RAG:

In [268]:
def answer_naive_rag(runnable, question_prompt, question):

    # Start a timer
    start_time = time.time()

    # Retrieve most relevant document chunks and concatenate them
    retrieved_docs = retriever.invoke(question)
    chunks = "\n".join(doc.page_content for doc in retrieved_docs)
    
    # Invoke the runnable with the question and chunks
    response = runnable.invoke(
        {
        "text": question_prompt,
        "context": chunks
        }
    )

    # Calculate the processing time
    processing_time = time.time() - start_time

    return response.answer, processing_time

Define a function for the Advanced RAG:

In [269]:
def answer_advanced_rag(runnable, runnable_hyde, question_prompt, question):

    # Start a timer
    start_time = time.time()
    
    # Generate a hypothetical document based on the question
    hyde_response = runnable_hyde.invoke(
        {
        "question": question,
        }
    )

    # Retrieve relevant chunks based on the hypothetical document
    retrieved_docs = retriever.invoke(hyde_response.content)
    chunks = "\n".join(doc.page_content for doc in retrieved_docs)
    
    # Invoke the runnable, with the question and chunks
    response = runnable.invoke(
        {
        "text": question_prompt,
        "context": chunks
        }
    )

    # Calculate the processing time
    processing_time = time.time() - start_time

    return response.answer, processing_time

Finally, we can apply the pipeline to the dataset:

In [270]:
# Create the LLM runnable without context
runnable_no_context = prompt_baseline_llm | llm.with_structured_output(schema=Answer)

# Create the runnable for RAG
runnable_rag = prompt_rag | llm.with_structured_output(schema=Answer)

# Create the runnable for HyDE
runnable_hyde = prompt_hyde | llm

results = {
    "correct_answers": [],
    "llm_baseline": [],
    "llm_full_context": [],
    "naive_rag": [],
    "advanced_rag": []
}

total_processing_time_llm_baseline = []
total_processing_time_llm_full_context = []
total_processing_time_naive_rag = []
total_processing_time_advanced_rag = []

for q in dataset['questions']:

    # Shuffle the answers and store the correct answer
    answers, correct_letter = shuffle_answers(q['wrong_answers'], q['correct_answer'])
    results["correct_answers"].append(correct_letter)

    # Create the text with question and answers
    question_prompt = f"""{q['question']}

    A) {answers[0]}
    B) {answers[1]}
    C) {answers[2]}
    D) {answers[3]}
    \nAnswer with only the letter of the correct answer. If you are unsure, take your best guess.
    """

    # Run the LLM baseline
    llm_baseline_answer, llm_baseline_processing_time = answer_llm_baseline(runnable_no_context, question_prompt)
    results['llm_baseline'].append(llm_baseline_answer)
    total_processing_time_llm_baseline.append(llm_baseline_processing_time)

    # Run the LLM with full context
    llm_full_context_answer, llm_full_context_processing_time = answer_llm_full_context(runnable_rag, question_prompt)
    results['llm_full_context'].append(llm_full_context_answer)
    total_processing_time_llm_full_context.append(llm_full_context_processing_time)

    # Run the naive RAG
    naive_rag_answer, naive_rag_processing_time = answer_naive_rag(runnable_rag, question_prompt, q['question'])
    results['naive_rag'].append(naive_rag_answer)
    total_processing_time_naive_rag.append(naive_rag_processing_time)

    # Run the advanced RAG
    advanced_rag_answer, advanced_rag_processing_time = answer_advanced_rag(runnable_rag, runnable_hyde, question_prompt, q['question'])
    results['advanced_rag'].append(advanced_rag_answer)
    total_processing_time_advanced_rag.append(advanced_rag_processing_time)

In [271]:
# Calculate the classification report for each method
from sklearn.metrics import classification_report

print("LLM baseline:")
print(classification_report(results['correct_answers'], results['llm_baseline']))

print("LLM full context:")
print(classification_report(results['correct_answers'], results['llm_full_context']))

print("Naive RAG:")
print(classification_report(results['correct_answers'], results['naive_rag']))

print("Advanced RAG:")
print(classification_report(results['correct_answers'], results['advanced_rag']))

# Store the raw results, classification reports and processing times to a JSON file
results_dict = {
    "results": results,
    "classification_reports": {
        "llm_baseline": classification_report(results['correct_answers'], results['llm_baseline'], output_dict=True),
        "llm_full_context": classification_report(results['correct_answers'], results['llm_full_context'], output_dict=True),
        "naive_rag": classification_report(results['correct_answers'], results['naive_rag'], output_dict=True),
        "advanced_rag": classification_report(results['correct_answers'], results['advanced_rag'], output_dict=True)
    },
    "processing_times": {
        "llm_baseline": total_processing_time_llm_baseline,
        "llm_full_context": total_processing_time_llm_full_context,
        "naive_rag": total_processing_time_naive_rag,
        "advanced_rag": total_processing_time_advanced_rag
    }
}

current_time = time.strftime("%Y%m%d-%H%M%S")
filename = f"results-{current_time}.json"

with open(filename, "w") as file:
    json.dump(results_dict, file, indent=2)

LLM baseline:
              precision    recall  f1-score   support

           A       0.14      0.14      0.14         7
           B       0.27      0.33      0.30         9
           C       0.64      0.64      0.64        14
           D       0.38      0.30      0.33        10

    accuracy                           0.40        40
   macro avg       0.36      0.35      0.35        40
weighted avg       0.41      0.40      0.40        40

LLM full context:
              precision    recall  f1-score   support

           A       0.71      0.71      0.71         7
           B       0.89      0.89      0.89         9
           C       0.81      0.93      0.87        14
           D       1.00      0.80      0.89        10

    accuracy                           0.85        40
   macro avg       0.85      0.83      0.84        40
weighted avg       0.86      0.85      0.85        40

Naive RAG:
              precision    recall  f1-score   support

           A       0.67      0.5

In [272]:
# Print the average, minimum and maximum processing time for each method
print(f"Average processing time LLM baseline: {sum(total_processing_time_llm_baseline)/len(total_processing_time_llm_baseline)}")
print(f"Minimum processing time LLM baseline: {min(total_processing_time_llm_baseline)}")
print(f"Maximum processing time LLM baseline: {max(total_processing_time_llm_baseline)}")

print(f"Average processing time LLM full context: {sum(total_processing_time_llm_full_context)/len(total_processing_time_llm_full_context)}")
print(f"Minimum processing time LLM full context: {min(total_processing_time_llm_full_context)}")
print(f"Maximum processing time LLM full context: {max(total_processing_time_llm_full_context)}")

print(f"Average processing time Naive RAG: {sum(total_processing_time_naive_rag)/len(total_processing_time_naive_rag)}")
print(f"Minimum processing time Naive RAG: {min(total_processing_time_naive_rag)}")
print(f"Maximum processing time Naive RAG: {max(total_processing_time_naive_rag)}")

print(f"Average processing time Advanced RAG: {sum(total_processing_time_advanced_rag)/len(total_processing_time_advanced_rag)}")
print(f"Minimum processing time Advanced RAG: {min(total_processing_time_advanced_rag)}")
print(f"Maximum processing time Advanced RAG: {max(total_processing_time_advanced_rag)}")

Average processing time LLM baseline: 0.623898035287857
Minimum processing time LLM baseline: 0.4306309223175049
Maximum processing time LLM baseline: 1.773648977279663
Average processing time LLM full context: 4.025880527496338
Minimum processing time LLM full context: 1.5085179805755615
Maximum processing time LLM full context: 74.44881200790405
Average processing time Naive RAG: 1.0835974156856536
Minimum processing time Naive RAG: 0.6783907413482666
Maximum processing time Naive RAG: 2.6651787757873535
Average processing time Advanced RAG: 2.6325348913669586
Minimum processing time Advanced RAG: 1.8137741088867188
Maximum processing time Advanced RAG: 5.87543511390686


### Calculate performance per question type

In [279]:
# Split the results into 4 groups, the first 10, the second, the thurd and fourth
direct_retrieval_results = {key: value[:10] for key, value in results.items()}
paraphrased_retrieval_results = {key: value[10:20] for key, value in results.items()}
inference_results = {key: value[20:30] for key, value in results.items()}
thematic_insight_results = {key: value[30:] for key, value in results.items()}

# Calculate the classification report for each method

print("DIRECT RETRIEVAL")
print("Baseline LLM:")
print(classification_report(direct_retrieval_results['correct_answers'], direct_retrieval_results['llm_baseline']))
print("Full context LLM:")
print(classification_report(direct_retrieval_results['correct_answers'], direct_retrieval_results['llm_full_context']))
print("Naive RAG:")
print(classification_report(direct_retrieval_results['correct_answers'], direct_retrieval_results['naive_rag']))
print("Advanced RAG:")
print(classification_report(direct_retrieval_results['correct_answers'], direct_retrieval_results['advanced_rag']))

print("PARAPHRASED RETRIEVAL")
print("Baseline LLM:")
print(classification_report(paraphrased_retrieval_results['correct_answers'], paraphrased_retrieval_results['llm_baseline']))
print("Full context LLM:")
print(classification_report(paraphrased_retrieval_results['correct_answers'], paraphrased_retrieval_results['llm_full_context']))
print("Naive RAG:")
print(classification_report(paraphrased_retrieval_results['correct_answers'], paraphrased_retrieval_results['naive_rag']))
print("Advanced RAG:")
print(classification_report(paraphrased_retrieval_results['correct_answers'], paraphrased_retrieval_results['advanced_rag']))

print("INFERENCE")
print("Baseline LLM:")
print(classification_report(inference_results['correct_answers'], inference_results['llm_baseline']))
print("Full context LLM:")
print(classification_report(inference_results['correct_answers'], inference_results['llm_full_context']))
print("Naive RAG:")
print(classification_report(inference_results['correct_answers'], inference_results['naive_rag']))
print("Advanced RAG:")
print(classification_report(inference_results['correct_answers'], inference_results['advanced_rag']))

print("THEMATIC INSIGHT")
print("Baseline LLM:")
print(classification_report(thematic_insight_results['correct_answers'], thematic_insight_results['llm_baseline']))
print("Full context LLM:")
print(classification_report(thematic_insight_results['correct_answers'], thematic_insight_results['llm_full_context']))
print("Naive RAG:")
print(classification_report(thematic_insight_results['correct_answers'], thematic_insight_results['naive_rag']))
print("Advanced RAG:")
print(classification_report(thematic_insight_results['correct_answers'], thematic_insight_results['advanced_rag']))

DIRECT RETRIEVAL
Baseline LLM:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00         2
           B       0.00      0.00      0.00         3
           C       0.00      0.00      0.00         3
           D       0.33      0.50      0.40         2

    accuracy                           0.10        10
   macro avg       0.08      0.12      0.10        10
weighted avg       0.07      0.10      0.08        10

Full context LLM:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           B       1.00      1.00      1.00         3
           C       1.00      1.00      1.00         3
           D       1.00      1.00      1.00         2

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Naive RAG:
              precision    recall  f1-score   support

           A   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
