# RAG Experiments

We start by defining a few constants and imports.

In [305]:
# Root folder of the books dataset; the vector DB and evaluation CSV paths are derived from it.
DATASET_DIR = "../datasets/books/"

In [306]:
import os

# Directory handed to Chroma as `persist_directory` by both retrievers.
DB_PATH = os.path.join(DATASET_DIR, "vector_db")
# CSV that the Evaluation section loads into `eval_df`.
EVAL_PATH = os.path.join(DATASET_DIR, "eval.csv")

In [307]:
from typing import Dict, List

from langchain.chains import LLMChain, RetrievalQA, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings, OpenAI, ChatOpenAI
from langchain_chroma import Chroma

In [308]:
import getpass
import os

# Prompt for the key only when it is not already available. Passing
# getpass.getpass() as the default argument of os.environ.get() evaluates it
# eagerly, so the prompt appeared even when OPENAI_API_KEY was already set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass()

 ········


---

## Retrieval

### Naive

Standard retriever from Vector Store

In [309]:
# Embedding model used by the naive retriever.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [310]:
# Retriever over the persisted Chroma store, with `embeddings` as its embedding function.
# NOTE: `embeddings` is rebound to the HyDE embedder in the next section, so re-running
# this cell after that one would build this retriever on the HyDE embedder instead.
naive_retriever = Chroma(persist_directory=DB_PATH, embedding_function=embeddings).as_retriever()

### HyDE (Hypothetical Document Embeddings)

HyDE retriever from Vector Store

In [311]:
# Embedding model that the HyDE embedder wraps (same model name as the naive retriever uses).
base_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [312]:
# Prompt asking the LLM to answer the question directly; the chain built from it
# is passed to HypotheticalDocumentEmbedder as `llm_chain` in the next cell.
prompt_template = "Please answer the question \nQuestion: {question}\nAnswer:"
prompt = PromptTemplate(input_variables=["question"], template=prompt_template)

# Model that writes the answer for the prompt above.
generative_model = OpenAI(model="gpt-3.5-turbo-instruct")

# NOTE: `prompt_template` and `prompt` are rebound in the Generation section;
# this chain keeps the prompt object it was constructed with.
retriever_chain = LLMChain(llm=generative_model, prompt=prompt)

In [313]:
# Combine the answer-drafting chain with the base embedding model.
# NOTE: this rebinds `embeddings`, which until now held the plain OpenAIEmbeddings
# instance used to build `naive_retriever`.
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=retriever_chain, base_embeddings=base_embeddings
)

In [314]:
# Same persisted store as the naive retriever, but with the HyDE embedder as the embedding function.
hyde_retriever = Chroma(persist_directory=DB_PATH, embedding_function=embeddings).as_retriever()

---

## Generation

Retriever uses Query to get Context Documents from the Vector Store. Both Query and Context are used to prompt a generative model for the Answer.

Note that HyDE seemingly retrieves more relevant documents for this question, but this is a sample size of 1; the Evaluation section below compares both retrievers on 50 sampled questions.

In [315]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [316]:
# Answer-generation prompt: filled with the question and the retrieved context.
# NOTE: rebinds `prompt_template` / `prompt`, which earlier held the HyDE answer-drafting prompt.
prompt_template = '''You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:'''

prompt = PromptTemplate(input_variables=["question", "context"], template=prompt_template)

# Chat model shared by the naive and HyDE RAG chains.
# NOTE(review): temperature=0.7 presumably makes answers vary between runs, which
# would add noise to the evaluation scores — confirm this is intended.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

In [317]:
def get_chain(retriever, llm, prompt):
    """Build a RetrievalQA chain for the given retriever, model and prompt.

    Args:
        retriever: Retriever the chain uses to fetch context documents.
        llm: Model that generates the answer.
        prompt: Prompt forwarded to the chain via ``chain_type_kwargs``.

    Returns:
        The chain created by ``RetrievalQA.from_chain_type``, configured with
        ``return_source_documents=True``.
    """
    return RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs=dict(prompt=prompt),
    )

### Naive output

In [318]:
# Cost/compute intensive: runs retrieval and answer generation for one question.

question = "What is a good book for adults with depression?"
# `naive_rag` is kept around; the Evaluation section reuses it for the batch run.
naive_rag = get_chain(naive_retriever, llm, prompt)
result = naive_rag.invoke({"query": question})
# Only the answer text is displayed here.
result["result"]

'Reasons to Stay Alive by Matt Haig is a good book for adults with depression. It is an inspiring memoir of overcoming depression and finding reasons to appreciate life. The book provides encouragement and insight for those struggling with mental illness.'

### HyDE output

In [319]:
# Cost/compute intensive: same question as the naive run, answered through the HyDE retriever.

question = "What is a good book for adults with depression?"
# `hyde_rag` is kept around; the Evaluation section reuses it for the batch run.
hyde_rag = get_chain(hyde_retriever, llm, prompt)
# NOTE: rebinds `result`, replacing the naive output from the previous cell.
result = hyde_rag.invoke({"query": question})
result["result"]

'The Mindful Way through Depression: Freeing Yourself from Chronic Unhappiness is a good book for adults with depression as it offers authoritative, clinically proven self-help methods to reduce chronic unhappiness. Dianetics: The Modern Science of Mental Health also provides insight into eradicating the source of stress, anxiety, and depression for a happier life. Reasons to Stay Alive is an accessible and inspiring memoir of overcoming depression, offering hope and encouragement for those struggling with mental illness.'

---

## Evaluation

Ragas evaluators are used to measure the performance of both retrievers on a synthetic dataset.

### Metrics

In [320]:
import nest_asyncio
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# One evaluator chain per ragas metric, keyed by the metric's `name`;
# `evaluate` iterates over this mapping.
eval_chains = {
    m.name: EvaluatorChain(metric=m) 
    for m in [faithfulness, answer_relevancy, context_precision, context_recall]
}

In [345]:
# evaluate
def evaluate(sample):
    """Score one sample with every chain registered in ``eval_chains``.

    Args:
        sample: Input passed unchanged to each evaluator chain. Callers in this
            notebook pass a dict with "question", "answer", "contexts" and
            "ground_truth" keys.

    Returns:
        dict mapping ``"<metric name>_score"`` to the value stored under the
        metric's name in that chain's output.
    """
    # Use .invoke(), as the RAG chains in this notebook do; calling a LangChain
    # chain directly (``chain(inputs)``) is the deprecated form.
    return {
        f"{name}_score": eval_chain.invoke(sample)[name]
        for name, eval_chain in eval_chains.items()
    }

In [None]:
#collect metrics

from tqdm.auto import tqdm

# NOTE(review): presumably required so the evaluator chains can run async code
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

def get_metrics(results: List[Dict]) -> List[Dict]:
    """Run ``evaluate`` on every RAG result and collect the per-sample scores.

    Args:
        results: One dict per evaluated question. Each entry must provide
            "query", "result", "source_documents" (objects exposing
            ``page_content``) and "ground_truth".

    Returns:
        A list with one ``evaluate`` result per entry of ``results``, in the
        same order.
    """
    metrics = []
    for res in tqdm(results):
        # Rename the chain's output keys to the ones the evaluators are given.
        sample = {
            "question": res["query"],
            "answer": res["result"],
            "contexts": [doc.page_content for doc in res["source_documents"]],
            "ground_truth": res["ground_truth"],
        }
        metrics.append(evaluate(sample))
    return metrics

### Dataset

In [322]:
import pandas as pd

# The CSV was written together with its index, which otherwise comes back as a
# stray "Unnamed: 0" column; read that first column as the index instead.
eval_df = pd.read_csv(EVAL_PATH, index_col=0)
eval_df.head()

Unnamed: 0,context,question,answer,source_doc
0,Book: Jami' At-Tirmidhi\nDescription:,What is the name of the book described?\n,Jami' At-Tirmidhi,../datasets/books/data.csv
1,Book: Permanent Record\nDescription: Edward Sn...,"Who is the author of the book ""Permanent Recor...",Edward Snowden,../datasets/books/data.csv
2,Book: A Walk Toward Jesus: Coming Through the ...,"Who is the author of the book ""A Walk Toward J...",Pamela S. Valerio,../datasets/books/data.csv
3,Description: The Works of Edgar Allen Poe incl...,What famous authors' works are included alongs...,"Mark Twain, Robert Louis Stevenson, Charles Di...",../datasets/books/data.csv
4,Description: **** 5 out of 5 Star Rating from ...,"Who is the author of the book ""The Weapon of M...",Ambrose V. Bruno,../datasets/books/data.csv


In [337]:
# Pull the question and reference-answer columns out as plain Python lists.
eval_questions = eval_df["question"].to_list()
eval_answers = eval_df["answer"].to_list()

# One input dict per row: "query" is the chain input, "ground_truth" is read
# back later when the evaluation samples are assembled.
examples = [
    dict(query=query, ground_truth=truth)
    for query, truth in zip(eval_questions, eval_answers)
]

In [338]:
import random

# Size of the evaluation subset and the seed used to draw it.
N_EVAL_SAMPLES = 50
SAMPLE_SEED = 42

# A dedicated, seeded generator makes the subset reproducible across kernel
# restarts; min() avoids the ValueError that random.sample raises when fewer
# examples are available than requested.
examples = random.Random(SAMPLE_SEED).sample(
    examples, min(N_EVAL_SAMPLES, len(examples))
)

### Naive RAG

In [339]:
# Cost/compute intensive: runs the naive chain on every sampled example.
# NOTE: `results` is rebound by the HyDE run further down.

results = naive_rag.batch(examples)

In [None]:
# Cost/compute intensive: scores every naive result with all evaluator chains.
# NOTE: `metrics` is rebound by the HyDE run further down.

metrics = get_metrics(results)

In [341]:
# Per-metric mean over the naive run (pandas' mean() skips NaN values by default).
# NOTE: relies on `metrics` still holding the naive scores; the HyDE section rebinds it.
naive_metrics_df = pd.DataFrame(metrics)
naive_metrics_df.mean()

faithfulness_score         0.943478
answer_relevancy_score     0.959450
context_precision_score    0.951667
context_recall_score       0.920000
dtype: float64

### HyDE RAG

In [346]:
# Cost/compute intensive: runs the HyDE chain on the same sampled examples.
# NOTE: rebinds `results`, replacing the naive outputs.

results = hyde_rag.batch(examples)

In [None]:
# Cost/compute intensive: scores every HyDE result with all evaluator chains.
# NOTE: rebinds `metrics`, replacing the naive scores.

metrics = get_metrics(results)

In [343]:
# Cost/compute intensive
# Inline evaluation of the current `results`.
# NOTE(review): this loop repeats the body of `get_metrics` and recomputes what
# the preceding `metrics = get_metrics(results)` cell assigns, so a full
# top-to-bottom run evaluates the HyDE results twice — consider keeping only one.

from tqdm.auto import tqdm

nest_asyncio.apply()

metrics = []
for res in tqdm(results):
    # Rename the chain's output keys to the ones the evaluators are given.
    sample= {
        "question": res["query"],
        "answer": res["result"],
        "contexts": [context.page_content for context in res["source_documents"]],
        "ground_truth": res["ground_truth"],
    }
    metrics.append(evaluate(sample))
    # eval_result

  0%|          | 0/50 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.


In [344]:
# Per-metric mean over the HyDE run. pandas' mean() skips NaN by default, so any
# sample whose score is NaN is left out of that metric's average.
# NOTE(review): the "No statements were generated from the answer." messages in
# the evaluation output suggest some scores may be missing for this run —
# confirm before comparing these means with the naive ones.
hyde_metrics_df = pd.DataFrame(metrics)
hyde_metrics_df.mean()

faithfulness_score         0.840635
answer_relevancy_score     0.856820
context_precision_score    0.848333
context_recall_score       0.820000
dtype: float64