<a href="https://colab.research.google.com/github/kavyajeetbora/nlp_rag/blob/master/end_to_end/03_RAG_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Evaluation

In [1]:
!pip install -q -U transformers python-dotenv langchain langchain-openai langchain_community langchain_huggingface unstructured pdfminer randomname chromadb langchain_ollama accelerate bitsandbytes
!pip install --upgrade --quiet opik

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... 

In [2]:
import os
import shutil
from glob import glob

import randomname
import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings, ChatHuggingFace
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

## Evaluation Framework
import opik
from opik import track
from opik.integrations.langchain import OpikTracer

Load the environment variables (API keys and tokens) from an uploaded `.env` file:

In [3]:
# Remove any .env left over from a previous run so the uploaded file is saved
# under the plain name ".env".
if os.path.exists(".env"):
    os.remove(".env")

from google.colab import files
uploaded = files.upload()
if uploaded:
    # override=True: values from the freshly uploaded file replace variables
    # that an earlier run of this cell already put into os.environ. Without
    # it, re-uploading a corrected .env would have no effect in a live kernel.
    if load_dotenv(".env", override=True):
        print("Uploaded and Loaded Sucessfully")
    else:
        print("Upload received, but no variables were loaded from .env")
else:
    print("No file was uploaded; environment variables were not changed")

from huggingface_hub import login
# Fail with an actionable message instead of a bare KeyError when the token
# is missing (the exception type is unchanged).
if not os.environ.get("HF_TOKEN"):
    raise KeyError("HF_TOKEN is not set; upload a .env file that defines HF_TOKEN")
login(token=os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Saving .env to .env
Uploaded and Loaded Sucessfully


In [4]:
# Initialize the tracer
opik_tracer = OpikTracer()
opik.configure(use_local=False)

OPIK: Opik is already configured. You can check the settings by viewing the config file at /root/.opik.config


In [5]:
@track
def myfunc(num:int):
    """Return ``num`` raised to the power of two.

    A minimal tracked function used to check that the ``@track`` decorator
    is wired up.
    """
    return pow(num, 2)

# Smoke test of the tracked function; the cell displays the returned value
myfunc(2)

OPIK: Started logging traces to the "NITI-AAYOG-EVAL" project at https://www.comet.com/opik/kavyajeetbora/redirect/projects?name=NITI-AAYOG-EVAL.


4

## Downloading the LLM model from HuggingFace

In [6]:
%%time

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

config = BitsAndBytesConfig(
    load_in_8bit = True
)

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = model_id,
    low_cpu_mem_usage=True
    #attn_implementation="flash_attention_2", # if you have an ampere GPU
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

model_size = model.get_memory_footprint()/1e9
print(f"Model size of {model_id} : {model_size:.2f} GB")

num_params = model.num_parameters()/1e9
print(f"Number of parameters: {num_params:.2} Billon")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


Model size of TinyLlama/TinyLlama-1.1B-Chat-v1.0 : 4.40 GB
Number of parameters: 1.1 Billon
CPU times: user 5.85 s, sys: 7.55 s, total: 13.4 s
Wall time: 1min 1s


# Running a small language model locally

First, create the LangChain LLM wrapper around the Hugging Face pipeline:

In [7]:
# Wrap the transformers `pipe` object so it can be used as an LLM step in LangChain chains
llm = HuggingFacePipeline(pipeline = pipe)

In [10]:
# Baseline without retrieval: send a single question straight to the local LLM.
query = "Explain the concept of quantum entanglement in simple terms."
# NOTE(review): expected_output is not used by any cell visible here; presumably a
# reference answer for a later evaluation step. Confirm or remove.
expected_output =  "Quantum entanglement is a phenomenon where two particles become interconnected, so the state of one instantly influences the state of the other, no matter how far apart they are."

# Prompt with one placeholder, {question}
simple_prompt = '''
You are an AI chatbot. Answer the following query:

{question}
'''

simple_prompt_template = PromptTemplate.from_template(simple_prompt)

# prompt -> LLM -> plain string
simple_chain = simple_prompt_template | llm | StrOutputParser()

# The Opik tracer is passed as a callback for this run; the result is displayed as the cell output
simple_chain.invoke({"question": query}, callbacks=[opik_tracer])

"\nYou are an AI chatbot. Answer the following query:\n\nExplain the concept of quantum entanglement in simple terms.\n\nA: Quantum entanglement is a phenomenon where two particles, such as photons or electrons, become connected in such a way that their states are correlated even when separated by great distances. This means that the state of one particle can be predicted with certainty from the state of the other particle, even if they are not directly connected.\n\nIn the context of AI chatbots, this means that the state of the AI's internal state, such as its memory or processing capabilities, can be predicted with certainty from the state of the user's input, even if the AI is not directly connected to the user. This is because the AI's internal state is correlated with the user's input, and the state of the user's input can be predicted with certainty from the state of the AI's internal state.\n\nFor example, if the AI is trained on a large dataset of user input and internal state

## Let's build a local RAG System

### Ingest the data



In [11]:
!wget -q https://www.niti.gov.in/sites/default/files/2023-02/Annual-Report-2022-2023-English_1.pdf -O annual_report.pdf

In [13]:
doc_path = "annual_report.pdf"

# Stop here when the PDF is unusable, instead of printing a message and letting
# a later cell fail on an undefined (or stale) `data`. The size check matters
# because `wget -O` creates the output file even when the download fails.
if not os.path.exists(doc_path) or os.path.getsize(doc_path) == 0:
    raise FileNotFoundError(f"PDF file not found or empty: {doc_path}")

# One Document per PDF page
loader = PyPDFLoader(doc_path)
data = loader.load()
print("PDF is loaded")

PDF is loaded


In [14]:
# Inspect one loaded document (index 10) to see its metadata and page_content
data[10]

Document(metadata={'source': 'annual_report.pdf', 'page': 10, 'page_label': '11'}, page_content='MoH&FW Ministry of Health and Family Welfare\nMoHUA Ministry of Housing and Urban Affairs\nMoSPI Ministry of Statistics and Programme Implementation\nNAS National Achievement Survey\nNDC Nationally Determined Contributions\nNEP National Education Policy\nNER North-Eastern Region\nNFHS National Family Health Survey\nNILERD National Institute of Labour Economics Research and Development\nNITI National Institution for Transforming India\nNMP National Monetisation Pipeline\nNOTP National Organ Transplant Programme\nOOMF Output–Outcome Monitoring Framework\nOoSC Out-of-School Children\nPLI Production Linked Incentive\nPMJAY Pradhan Mantri Jan Arogya Yojana\nPSHICMI Policy and Strategy for Health Insurance Coverage of India\nSATH-E Sustainable Action for Transforming Human Capital-Education\nSC-NEC Sub-Committee of National Executive Committee\nSDG Sustainable Development Goals\nSECI State Energy

### Chunking the information

In [15]:
## Break the loaded pages into overlapping chunks (size 500, overlap 100)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)
chunks = text_splitter.split_documents(data)

print(f"The documents has been splitted into {len(chunks)} chunks")

The documents has been splitted into 820 chunks


In [16]:
# Spot-check the text of one chunk (index 120)
print(chunks[120].page_content)

such review meetings have been conducted during 2020-21 and 2021-22 respectively. As of 18 January 
2023, around 30 meetings had been completed for the fiscal year 2022-23.
Further, to improve the quality of OOMF a continuous capacity building exercise and systematic review 
of the framework and indicators of all CS/CSS schemes is undertaken throughout the year. It has 
also been DMEO’s constant endeavour to improve the capacity of officials working at different level


In [17]:
# Sentence-transformers embedding model, pinned to the CPU via model_kwargs.
# `embeddings` is read as a global by create_vector_database.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Create the vector database:

In [20]:
@track
def create_vector_database(chunks: list,name="vector-db"):
    """Build a fresh persistent Chroma store from ``chunks``.

    Any store previously created by this function for the same ``name`` is
    deleted first. The new store lives in ``db/chroma-<name>-(<random suffix>)``
    and is embedded with the module-level ``embeddings`` object.

    Args:
        chunks: Documents to index; must not be empty.
        name: Collection name, also used in the directory name.

    Returns:
        The Chroma vector store returned by ``Chroma.from_documents``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Local import: the notebook only imports `glob.glob` at the top.
    from glob import escape

    if not chunks:
        raise ValueError("chunks is empty; there is nothing to index")

    os.makedirs("db", exist_ok=True)

    # Delete earlier stores for exactly this name. The pattern is anchored on
    # the "-(<suffix>)" part so that name="annual" cannot wipe the store of
    # "annual-report", and the name is escaped so glob metacharacters in it
    # are matched literally.
    for folder in glob(f"db/chroma-{escape(name)}-(*)"):
        if os.path.isdir(folder):
            shutil.rmtree(folder)

    # A random suffix gives every run a new directory
    random_suffix = randomname.get_name()
    persistent_directory = f"db/chroma-{name}-({random_suffix})"
    os.mkdir(persistent_directory)

    vector_db = Chroma.from_documents(
        documents = chunks,
        collection_name = name,
        embedding = embeddings,
        persist_directory = persistent_directory
    )

    return vector_db

In [21]:
# Index the report chunks; create_vector_database removes existing db/chroma-annual-report* folders first
vector_db = create_vector_database(chunks=chunks, name="annual-report")

## Retrieval

In [35]:
# Similarity-search retriever over the vector store, configured with k=5
retriever = vector_db.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 5}
)

@track
def get_relevant_docs(query):
    """Return whatever the module-level ``retriever`` yields for ``query``."""
    return retriever.invoke(query)

# Example retrieval call; note that `query` is reassigned here and again in a later cell
query = "What is the full form of ATL and what is the grant amount ?"
docs = get_relevant_docs(query)

## Define the RAG chain

In [24]:
# RAG prompt. It has two placeholders: {context} (retrieved text) and
# {question} (the user query). The "<<Assistant Reply>>:" marker is what
# format_response splits on to strip the echoed prompt from the model output.
prompt = '''
You are a helpful and friendly assistant.
Answer the user's questions based solely on the context of the provided and not more than 3 sentences
{context}
If the information is not available in the context, respond with "I don't know."

User: {question}
<<Assistant Reply>>:
'''

# Declare both variables the template actually uses; previously only
# "question" was listed although the template also contains {context}.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt,
)

## RAG with the local Hugging Face model

In [32]:
def format_response(response):
    """Return the text that follows the last "<<Assistant Reply>>:" marker.

    Leading and trailing whitespace is stripped. When the marker does not
    occur, the whole response is returned (stripped).
    """
    _, _, reply = response.rpartition("<<Assistant Reply>>:")
    return reply.strip()

def _format_docs(docs):
    """Join the page_content of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)

@track
def get_response(query):
    """Answer ``query`` with the RAG chain and return only the assistant's reply.

    The chain retrieves context with the module-level ``retriever``, so that
    generation sees the same documents as ``get_relevant_docs``. It fills
    ``prompt_template``, runs the local ``llm`` and parses the output to a
    string. The echoed prompt is then removed by ``format_response``.

    Args:
        query: The user question as a string.

    Returns:
        The cleaned model reply as a string.
    """
    rag_chain = (
        {
            # Pass plain text into the prompt rather than a list of Document
            # objects, whose repr (including metadata) would otherwise be
            # stringified into the context.
            "context": retriever | _format_docs,
            "question": RunnablePassthrough(),
        }
        | prompt_template
        | llm
        | StrOutputParser()
    )

    response = rag_chain.invoke(query, callbacks=[opik_tracer])
    return format_response(response)

In [None]:
%%time

# End-to-end RAG call: retrieve context, generate, and strip the echoed prompt
query = "What is ATL and what is the grant amount ?"
response = get_response(query)

print(response)
# Three extra newlines separate the answer from the %%time report
print("\n"*3)