<a href="https://colab.research.google.com/github/kavyajeetbora/nlp_rag/blob/master/end_to_end/03_RAG_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Evaluation

In [1]:
!pip install -q -U transformers python-dotenv langchain langchain-openai langchain_community langchain_huggingface unstructured pdfminer randomname chromadb langchain_ollama accelerate bitsandbytes
!pip install --upgrade --quiet opik

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... 

In [1]:
import os
from glob import glob
import shutil
import randomname
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings, ChatHuggingFace
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

## Evaluation Framework
import opik
from opik import track
from opik.integrations.langchain import OpikTracer

Load env variables:

In [13]:
if os.path.exists(".env"):
    os.remove(".env")

from google.colab import files
uploaded = files.upload()
if uploaded:
    if load_dotenv(".env"):
        print("Uploaded and Loaded Sucessfully")

from huggingface_hub import login
login(token=os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Saving .env to .env
Uploaded and Loaded Sucessfully


In [14]:
# Initialize the tracer
opik_tracer = OpikTracer()
opik.configure(use_local=False)

OPIK: Existing Opik clients will not use updated values for "url", "api_key", "workspace".
OPIK: Opik is already configured. You can check the settings by viewing the config file at /root/.opik.config


In [15]:
@track
def myfunc(num:int):
    return num**2

myfunc(2)

4

## Downloading the LLM model from HuggingFace

In [16]:
%%time

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

config = BitsAndBytesConfig(
    load_in_8bit = True
)

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = model_id,
    low_cpu_mem_usage=True
    #attn_implementation="flash_attention_2", # if you have an ampere GPU
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

model_size = model.get_memory_footprint()/1e9
print(f"Model size of {model_id} : {model_size:.2f} GB")

num_params = model.num_parameters()/1e9
print(f"Number of parameters: {num_params:.2} Billon")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


Model size of TinyLlama/TinyLlama-1.1B-Chat-v1.0 : 4.40 GB
Number of parameters: 1.1 Billon
CPU times: user 2.05 s, sys: 6.69 s, total: 8.73 s
Wall time: 17.9 s


## Running Small Language model locally

first create the langchain llm pipeline:

In [17]:
llm = HuggingFacePipeline(pipeline = pipe)

In [18]:
query = "Explain the concept of quantum entanglement in simple terms."
expected_output =  "Quantum entanglement is a phenomenon where two particles become interconnected, so the state of one instantly influences the state of the other, no matter how far apart they are."

simple_prompt = '''
You are an AI chatbot. Answer the following query:

{question}
'''

simple_prompt_template = PromptTemplate.from_template(simple_prompt)

simple_chain = simple_prompt_template | llm | StrOutputParser()

simple_chain.invoke({"question": query}, callbacks=[opik_tracer])

"\nYou are an AI chatbot. Answer the following query:\n\nExplain the concept of quantum entanglement in simple terms.\n\nA: Quantum entanglement is a phenomenon where two particles, such as photons or electrons, become connected in such a way that their states are correlated even when separated by great distances. This means that the state of one particle can be predicted with certainty from the state of the other particle, even if they are not directly connected.\n\nIn the context of AI chatbots, this means that the state of the AI's internal state, such as its memory or processing capabilities, can be predicted with certainty from the state of the user's input, even if the AI is not directly connected to the user. This is because the AI's internal state is correlated with the user's input, and the state of the user's input can be predicted with certainty from the state of the AI's internal state.\n\nFor example, if the AI is trained on a large dataset of user input and internal state

## Data Preprocessing

### Ingest the data



In [19]:
!wget -q https://www.niti.gov.in/sites/default/files/2023-02/Annual-Report-2022-2023-English_1.pdf -O annual_report.pdf

In [20]:
doc_path = "annual_report.pdf"

if os.path.exists(doc_path):
    loader = PyPDFLoader(doc_path)
    data = loader.load()
    print("PDF is loaded")
else:
    print("PDF file not found")

PDF is loaded


In [21]:
data[10]

Document(metadata={'source': 'annual_report.pdf', 'page': 10, 'page_label': '11'}, page_content='MoH&FW Ministry of Health and Family Welfare\nMoHUA Ministry of Housing and Urban Affairs\nMoSPI Ministry of Statistics and Programme Implementation\nNAS National Achievement Survey\nNDC Nationally Determined Contributions\nNEP National Education Policy\nNER North-Eastern Region\nNFHS National Family Health Survey\nNILERD National Institute of Labour Economics Research and Development\nNITI National Institution for Transforming India\nNMP National Monetisation Pipeline\nNOTP National Organ Transplant Programme\nOOMF Output–Outcome Monitoring Framework\nOoSC Out-of-School Children\nPLI Production Linked Incentive\nPMJAY Pradhan Mantri Jan Arogya Yojana\nPSHICMI Policy and Strategy for Health Insurance Coverage of India\nSATH-E Sustainable Action for Transforming Human Capital-Education\nSC-NEC Sub-Committee of National Executive Committee\nSDG Sustainable Development Goals\nSECI State Energy

### Chunking the information

In [22]:
## Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

chunks = text_splitter.split_documents(data)
print(f"The documents has been splitted into {len(chunks)} chunks")

The documents has been splitted into 820 chunks


In [23]:
print(chunks[120].page_content)

such review meetings have been conducted during 2020-21 and 2021-22 respectively. As of 18 January 
2023, around 30 meetings had been completed for the fiscal year 2022-23.
Further, to improve the quality of OOMF a continuous capacity building exercise and systematic review 
of the framework and indicators of all CS/CSS schemes is undertaken throughout the year. It has 
also been DMEO’s constant endeavour to improve the capacity of officials working at different level


In [24]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
)

create the vector database

In [25]:
@track
def create_vector_database(chunks: list,name="vector-db"):

    os.makedirs("db", exist_ok=True)

    random_suffix = randomname.get_name()

    persistent_directory = f"db/chroma-{name}-({random_suffix})"

    ## If already there, delete and create a new one

    for folder in glob(f"db/chroma-{name}*"):
        if os.path.exists(folder):
            shutil.rmtree(folder)

    os.mkdir(persistent_directory)

    vector_db = Chroma.from_documents(
        documents = chunks,
        collection_name = name,
        embedding = embeddings,
        persist_directory = persistent_directory
    )

    return vector_db

In [26]:
vector_db = create_vector_database(chunks=chunks, name="annual-report")

## Retrieval

In [30]:
retriever = vector_db.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 5}
)

@track
def get_relevant_docs(query):
    docs = retriever.invoke(query)
    return docs

query = "What is the full form of ATL and what is the grant amount ?"
docs = get_relevant_docs(query)

## Prompt Engineering

In [31]:
prompt = '''
You are a helpful and friendly assistant.
Answer the user's questions based solely on the context of the provided below:
{context}
If the information is not available in the context, respond with "I don't know."
And reply not more than 3 sentences strictly.

User: {question}
<<Assistant Reply>>:
'''

prompt_template = PromptTemplate(
    input_variables=["context","question"],
    template=prompt,
)

In [32]:
input = "What is python ?"
context = "It is a programming language"

print(prompt_template.invoke({"context": context,"question": input}).text)


You are a helpful and friendly assistant.
Answer the user's questions based solely on the context of the provided below:
It is a programming language
If the information is not available in the context, respond with "I don't know."
And reply not more than 3 sentences strictly.

User: What is python ?
<<Assistant Reply>>:



## RAG Pipeline

In [33]:
def format_response(response):
    delimiter = "<<Assistant Reply>>:"
    cleaned_response = response.split(delimiter)[-1].strip()
    return cleaned_response

@track
def get_response(query):
    rag_chain = (
        {"context": vector_db.as_retriever(), "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | StrOutputParser()
    )

    response = rag_chain.invoke(query, callbacks=[opik_tracer])
    formatted_response = format_response(response)
    return formatted_response

In [34]:
%%time

query = "What is ATL and what is the grant amount ?"
response = get_response(query)

print(response)
print("\n"*1)

ATL (Accelerated Technology Learning) is a program designed by the Ministry of Human Resource Development (MHRD) to provide a platform for students to explore and experiment with technology. The grant amount for ATL is up to 20 lakhs.


CPU times: user 52.9 s, sys: 1.21 s, total: 54.1 s
Wall time: 56.1 s


## Evaluation Using the opik framework

In [35]:
from opik.evaluation import evaluate
from opik.evaluation.metrics import ContextPrecision, ContextRecall

### 1. First Define a client

more on [`opik.Opik`](https://www.comet.com/docs/opik/python-sdk-reference/Opik.html) class

In [36]:
opik_client = opik.Opik(
    project_name = os.environ['OPIK_PROJECT_NAME'],
    api_key = os.environ['OPIK_API_KEY'],
    workspace = os.environ['OPIK_WORKSPACE']
)

### 2. Now define a evaluation dataset

For RAG system, we want the following:
1. Prompt
2. Context
3. Expected output

In [37]:
dataset = opik_client.get_or_create_dataset("RAG evaluation dataset")
dataset.insert([
    {
        "input": "What are the key features of Python?",
        "context": "Python is known for its simplicity and readability. Key features include dynamic typing, automatic memory management, and an extensive standard library.",
        "expected_output": "Python's key features include dynamic typing, automatic memory management, and an extensive standard library."
    },
    {
        "input": "How does garbage collection work in Python?",
        "context": "Python uses reference counting and a cyclic garbage collector. When an object's reference count drops to zero, it is deallocated.",
        "expected_output": "Python uses reference counting for garbage collection. Objects are deallocated when their reference count reaches zero."
    }
])

### 3. Now Evaluate using different metrics

Here is an overview of different metrics:
https://www.comet.com/docs/opik/evaluation/metrics/overview/


**Note:**
1. By default, Opik uses GPT-4 from OpenAI as the LLM to evaluate the output of other LLMs1. This model is used for various evaluation metrics, such as checking for hallucinations, relevance, and moderation.
2. However, you can easily switch to another LLM provider by specifying a different model in the model_name parameter of each LLM as a Judge metric

#### Get the responses

In [44]:
@track
def get_response_from_context_query(context, query):
    chain = prompt_template | llm | StrOutputParser()
    raw_response = chain.invoke({'context': context, "question": query})

    ## Format the response
    output = format_response(raw_response)
    return output

In [None]:
item = dataset.get_items()[0]
prompt = prompt_template.invoke({'context': item['context'], "question": item['input']}).text
llm.invoke(prompt)

In [48]:
%%time
get_response_from_context_query(context=item['context'], query=item['input'])

KeyboardInterrupt: 

In [61]:
for item in dataset.get_items():
    query = item['input']
    context = item['expected_output']

{'id': '019488c7-aef6-7b41-998a-080b9fba60c0', 'input': 'How does garbage collection work in Python?', 'expected_output': 'Python uses reference counting for garbage collection. Objects are deallocated when their reference count reaches zero.', 'context': "Python uses reference counting and a cyclic garbage collector. When an object's reference count drops to zero, it is deallocated."}
{'id': '019488c7-aef5-7e36-bb0a-e0b956f4aa44', 'input': 'What are the key features of Python?', 'expected_output': "Python's key features include dynamic typing, automatic memory management, and an extensive standard library.", 'context': 'Python is known for its simplicity and readability. Key features include dynamic typing, automatic memory management, and an extensive standard library.'}


In [51]:
## Define the evaluation metrics and choose the LLM that will judge the results
scoring_metrics=[
    ContextPrecision(model='gpt-4o-mini'),
    ContextRecall(model='gpt-4o-mini')
]

In [48]:
%%time

## define the task
def rag_task(item):
    # Simulate RAG pipeline
    chain = prompt_template | llm | StrOutputParser()
    raw_response = chain.invoke({'context': item['context'], "question": item['input']})

    ## Format the response
    output = format_response(raw_response)
    return {
        "output": output
    }

# Run the evaluation
result = evaluate(
    dataset=dataset,
    task=rag_task,
    scoring_metrics=[
        ContextPrecision(),
        ContextRecall()
    ],
    experiment_name="rag_evaluation"
)



Evaluation:   0%|          | 0/2 [07:26<?, ?it/s]
Evaluation:   0%|          | 0/2 [24:39<?, ?it/s]

KeyboardInterrupt

