<a href="https://colab.research.google.com/github/kavyajeetbora/nlp_rag/blob/master/end_to_end/03_RAG_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

`colab-xterm` allows you to open a terminal in a cell.

In [None]:
!pip install -q -U transformers python-dotenv langchain langchain-openai langchain_community langchain_huggingface unstructured pdfminer randomname chromadb langchain_ollama accelerate bitsandbytes

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.3/54.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m46.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from glob import glob
import shutil
import randomname
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings, ChatHuggingFace
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.retrievers import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

Load env variables:

In [None]:
# Remove any previous .env before uploading a new one
# (presumably so the upload is not saved under a different name — confirm).
if os.path.exists(".env"):
    os.remove(".env")

from google.colab import files
uploaded = files.upload()
# override=True: when this cell is re-run, values from the freshly uploaded
# file replace variables already present in the kernel's environment.
if uploaded and load_dotenv(".env", override=True):
    print("Uploaded and Loaded Sucessfully")

# Fail with an actionable message instead of a bare KeyError('HF_TOKEN').
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise KeyError("HF_TOKEN is not set; upload a .env file that defines it")

from huggingface_hub import login
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Saving .env to .env
Uploaded and Loaded Sucessfully


## Downloading the LLM model from HuggingFace

In [None]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 8-bit weight quantization through bitsandbytes.
config = BitsAndBytesConfig(
    load_in_8bit = True
)

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path = model_id,
    # Without this argument the config above is unused and the weights are
    # loaded unquantized.
    quantization_config = config,
    # Let accelerate place the quantized weights on the available device(s).
    device_map = "auto",
    low_cpu_mem_usage=True
    #attn_implementation="flash_attention_2", # if you have an ampere GPU
)
# Text-generation pipeline around the loaded model; wrapped for LangChain below.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Report the in-memory size (GB) and the parameter count (billions).
model_size = model.get_memory_footprint()/1e9
print(f"Model size of {model_id} : {model_size:.2f} GB")

num_params = model.num_parameters()/1e9
print(f"Number of parameters: {num_params:.2} Billon")

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


Model size of TinyLlama/TinyLlama-1.1B-Chat-v1.0 : 4.40 GB
Number of parameters: 1.1 Billon


# Running a small language model locally

First, create the LangChain LLM pipeline:

In [None]:
# Wrap the transformers pipeline so it can be used as the LLM step in LangChain chains.
llm = HuggingFacePipeline(pipeline = pipe)

In [None]:
# Smoke test of the local model without retrieval: prompt -> LLM -> plain string.
query = "Explain the concept of quantum entanglement in simple terms."
expected_output = "Quantum entanglement is a phenomenon where two particles become interconnected, so the state of one instantly influences the state of the other, no matter how far apart they are."

simple_prompt = '''
You are an AI chatbot. Answer the following query:

{question}
'''

simple_prompt_template = PromptTemplate.from_template(simple_prompt)

simple_chain = simple_prompt_template | llm | StrOutputParser()

# `return_only_outputs` belongs to the legacy Chain call API; an LCEL runnable
# is invoked with just the input (and an optional config), so it is dropped here.
simple_chain.invoke({"question": query})

"\nYou are an AI chatbot. Answer the following query:\n\nExplain the concept of quantum entanglement in simple terms.\n\nA: Quantum entanglement is a phenomenon where two particles, such as photons or electrons, become connected in such a way that their states are correlated even when separated by great distances. This means that the state of one particle can be predicted with certainty from the state of the other particle, even if they are not directly connected.\n\nIn the context of AI chatbots, this means that the state of the AI's internal state, such as its memory or processing capabilities, can be predicted with certainty from the state of the user's input, even if the AI is not directly connected to the user. This is because the AI's internal state is correlated with the user's input, and the state of the user's input can be predicted with certainty from the state of the AI's internal state.\n\nFor example, if the AI is trained on a large dataset of user input and internal state

## Let's build a local RAG System

### Ingest the data



In [None]:
# Download the annual report PDF (2022-23, per the URL) and save it as annual_report.pdf.
# -q silences wget's progress output.
!wget -q https://www.niti.gov.in/sites/default/files/2023-02/Annual-Report-2022-2023-English_1.pdf -O annual_report.pdf

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [None]:
doc_path = "annual_report.pdf"

# Stop here if the download did not produce the file: every later cell needs
# `data`, so continuing would only surface as a NameError further down.
if not os.path.exists(doc_path):
    raise FileNotFoundError(f"PDF file not found: {doc_path}")

loader = PyPDFLoader(doc_path)
data = loader.load()
print("PDF is loaded")

PDF is loaded


In [None]:
# Inspect the loaded entry at index 10 to sanity-check the extracted text and metadata.
data[10]

Document(metadata={'source': 'annual_report.pdf', 'page': 10, 'page_label': '11'}, page_content='MoH&FW Ministry of Health and Family Welfare\nMoHUA Ministry of Housing and Urban Affairs\nMoSPI Ministry of Statistics and Programme Implementation\nNAS National Achievement Survey\nNDC Nationally Determined Contributions\nNEP National Education Policy\nNER North-Eastern Region\nNFHS National Family Health Survey\nNILERD National Institute of Labour Economics Research and Development\nNITI National Institution for Transforming India\nNMP National Monetisation Pipeline\nNOTP National Organ Transplant Programme\nOOMF Output–Outcome Monitoring Framework\nOoSC Out-of-School Children\nPLI Production Linked Incentive\nPMJAY Pradhan Mantri Jan Arogya Yojana\nPSHICMI Policy and Strategy for Health Insurance Coverage of India\nSATH-E Sustainable Action for Transforming Human Capital-Education\nSC-NEC Sub-Committee of National Executive Committee\nSDG Sustainable Development Goals\nSECI State Energy

### Chunking the information

In [None]:
# Cut the loaded documents into overlapping character windows for embedding.
splitter_options = {"chunk_size": 500, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

chunks = text_splitter.split_documents(data)
chunk_count = len(chunks)
print(f"The documents has been splitted into {chunk_count} chunks")

The documents has been splitted into 820 chunks


In [None]:
# Spot-check the text of a single chunk (index 120).
print(chunks[120].page_content)

such review meetings have been conducted during 2020-21 and 2021-22 respectively. As of 18 January 
2023, around 30 meetings had been completed for the fiscal year 2022-23.
Further, to improve the quality of OOMF a continuous capacity building exercise and systematic review 
of the framework and indicators of all CS/CSS schemes is undertaken throughout the year. It has 
also been DMEO’s constant endeavour to improve the capacity of officials working at different level


In [None]:
# Embedding model used to vectorize the chunks; it is asked to run on the GPU.
embedding_model_kwargs = {'device': 'cuda'}
embeddings = HuggingFaceEmbeddings(
    model_kwargs=embedding_model_kwargs,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

Create the vector database:

In [None]:
def create_vector_database(chunks: list, name="vector-db"):
    """Embed `chunks` into a new Chroma store persisted under ``db/``.

    The store is written to ``db/chroma-<name>-(<random suffix>)``. Stores
    created earlier for the same `name` are removed first, so only the newest
    one remains on disk.

    Args:
        chunks: Documents to embed and index.
        name: Collection name; also part of the persist directory name.

    Returns:
        The Chroma vector store built from `chunks`, embedded with the
        module-level `embeddings` model.
    """
    # Local import: the notebook's import cell only brings in `glob.glob`.
    from glob import escape

    os.makedirs("db", exist_ok=True)

    random_suffix = randomname.get_name()
    persistent_directory = f"db/chroma-{name}-({random_suffix})"

    # Delete previous stores of this collection only. Matching on the full
    # "chroma-<name>-(" prefix (with `name` glob-escaped) keeps collections
    # whose name merely starts with `name` from being removed as well.
    for folder in glob(f"db/chroma-{escape(name)}-(*)"):
        if os.path.isdir(folder):
            shutil.rmtree(folder)

    os.mkdir(persistent_directory)

    vector_db = Chroma.from_documents(
        documents = chunks,
        collection_name = name,
        embedding = embeddings,
        persist_directory = persistent_directory
    )

    return vector_db

In [None]:
# Index the report chunks in a fresh Chroma store whose collection is named "annual-report".
vector_db = create_vector_database(chunks=chunks, name="annual-report")

## Retrieval

In [None]:
# Prompt asking the LLM to rewrite a user question into several retrieval queries.
prompt_template = """You are an AI language model assistant.
Generate 1-5 sub-questions or alternate versions of the given user question to
retrieve relevant documents from a vector database.
Break down questions with multiple concepts into distinct sub-questions.
Provide these alternative questions separated by newlines between XML tags.

Original question: {question}
"""

prompt = PromptTemplate(input_variables=["question"], template=prompt_template)

In [None]:
# Multi-query retrieval (driven by `prompt` above) is currently disabled:
#
#   retriever = MultiQueryRetriever.from_llm(
#       retriever = vector_db.as_retriever(),
#       llm = llm,
#       prompt = prompt
#   )
#
# Another option that was noted here: similarity_search_with_score.

# Plain similarity search returning the 5 closest chunks.
search_kwargs = {"k": 5}
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

query = "What is the full form of RSNA ?"
docs = retriever.invoke(query)
docs

[Document(metadata={'page': 118, 'page_label': '119', 'source': 'annual_report.pdf'}, page_content='Department of Consumer Affairs, Department of Food and Public Distribution System, Department of \nPersonnel and Training, Department of Pensions and Pensioners’ Welfare, Department of Administrative \nReforms and Public Grievances, Department of Legal Affairs and Department of Justice. The Research \nDivision oversees the Research Scheme of NITI Aayog (or RSNA), which aims at supporting various \nresearch studies.\nCoverage and Beneficiary under National Food Security Act (NFSA), 2013'),
 Document(metadata={'page': 120, 'page_label': '121', 'source': 'annual_report.pdf'}, page_content='New Research Scheme of NITI Aayog (RSNA) Guidelines 2021\nIn line with NITI’s mandate to position itself as a knowledge and innovation hub, new set of guidelines, \nviz. ‘Research Scheme of NITI Aayog 2021’ has been launched. The revamped guidelines aim at broad-\nbasing research work including Institutio

## Define the RAG chain

In [None]:
chat_prompt_template = '''
You are a helpful and friendly assistant.
Answer the user's questions based solely on the context of the provided and not more than 3 sentences
{context}
If the information is not available in the context, respond with "I don't know."

User: {question}
<<Assistant Reply>>:
'''

chat_prompt = ChatPromptTemplate.from_template(chat_prompt_template)


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieve -> format context -> fill prompt -> generate -> plain string.
# The retriever returns Document objects; piping them through `format_docs`
# puts only their text into {context} rather than the repr of a list of
# Document objects (metadata included).
rag_chain = (
    {"context": vector_db.as_retriever() | format_docs, "question": RunnablePassthrough()}
    | chat_prompt
    | llm
    | StrOutputParser()
)

In [None]:
def format_response(response: str, delimiter: str = "<<Assistant Reply>>:") -> str:
    """Return the part of `response` that follows the last `delimiter`.

    Args:
        response: Raw text returned by the chain.
        delimiter: Marker that precedes the reply. Defaults to the marker
            used by this notebook's chat prompt.

    Returns:
        The text after the last occurrence of `delimiter`, stripped of
        surrounding whitespace. If the delimiter does not occur, the whole
        response is returned stripped.
    """
    # rpartition yields the full string as the tail when the marker is absent.
    _, _, reply = response.rpartition(delimiter)
    return reply.strip()

In [None]:
%%time

query = 'What does ATL stands for and what is grant amount ?'
response = rag_chain.invoke(query, return_only_outputs=True)

CPU times: user 38.6 ms, sys: 2.83 ms, total: 41.4 ms
Wall time: 702 ms


In [None]:
# Keep only the text after the "<<Assistant Reply>>:" marker before printing.
cleaned_response = format_response(response)
print(cleaned_response)

ATL stands for Aadhaar Thought Lab, and the grant amount is up to 20 lakhs.


## RAG with OpenAI models

In [None]:
# OpenAI chat model used as an alternative generator.
# NOTE(review): presumably authenticates via OPENAI_API_KEY from the uploaded .env — confirm.
llm_model = ChatOpenAI(model='gpt-4o-mini')
# Quick connectivity check.
llm_model.invoke("Hi there")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_72ed7ab54c', 'finish_reason': 'stop', 'logprobs': None}, id='run-f2f2321a-9d49-47cd-b10b-dcfb6154db45-0', usage_metadata={'input_tokens': 9, 'output_tokens': 10, 'total_tokens': 19, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [None]:
# Rebuild the RAG chain around the OpenAI chat model, reusing `chat_prompt`
# and the vector store retriever.
rag_chain = (
    {
        # Put only the text of the retrieved documents into {context}, not
        # the repr of a list of Document objects.
        "context": vector_db.as_retriever()
        | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    }
    | chat_prompt
    | llm_model
    | StrOutputParser()
)

# `return_only_outputs` is a legacy Chain argument that LCEL runnables do not use.
response = rag_chain.invoke(query)
print(response)

ATL stands for Atal Tinkering Labs. The grant-in-aid provided to schools selected for setting up the ATL is up to twenty lakhs.
