# RAG using LlamaIndex

In [None]:
# Source PDF (its file name matches the document loaded from ../doc/ below):
# https://policyholder.gov.in/documents/37343/931203/UNIHLIP23006V032223.pdf/a78a1bbf-533e-247a-b9a0-136768029158?version=1.0&t=1669354136950&download=true

In [73]:
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter

from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from llama_index.core import StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably OPENAI_API_KEY for the OpenAI embedding/LLM clients — confirm).
load_dotenv()


True

## Simple RAG Architecture

In [76]:
# End-to-end RAG pipeline over the policy PDF, condensed into a single cell.
# The same steps are repeated one by one in the "Breakdown per Component" section.

# 1. Load the information
docs = PDFReader().load_data("../doc/UNIHLIP23006V032223.pdf")

# 2a. Split the documents into smaller chunks
text_splitter = SentenceSplitter(chunk_size=200, chunk_overlap=10)
nodes = text_splitter.get_nodes_from_documents(documents=docs)

# 2b. Select embedding strategy/type
openai_embedding = OpenAIEmbedding()

# 2c. Create the vector store
chroma_client = chromadb.EphemeralClient()
# get_or_create=True returns the existing collection instead of raising when
# "my_collection" already exists.
# NOTE(review): ephemeral Chroma clients in one process appear to share state,
# so re-running this cell may append the same chunks again — confirm and, if so,
# delete the collection (or use a fresh name) before re-indexing.
chroma_collection = chroma_client.create_collection("my_collection", get_or_create=True)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Pass the embedding model selected in step 2b instead of constructing a
# second, separate OpenAIEmbedding instance.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context, embed_model=openai_embedding)

# [Optional in LlamaIndex] Create the retriever
# (not used by the query engine below; shown for reference)
retriever = index.as_retriever()

# 3. Create the query engine
query_engine = index.as_query_engine(llm=OpenAI(model="gpt-3.5-turbo-0125"))

# [Optional in LlamaIndex] Define a custom prompt.
# {context_str} and {query_str} are the template variables of the
# text_qa_template being replaced.
prompt = """Based on the data provided to you here: {context_str}. 
Please answer this question: {query_str}"""

custom_prompt = PromptTemplate(prompt)

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": custom_prompt}
)

# 4. Run the query; the Response object is displayed as the cell output
query_engine.query('What medical expenses are covered for in-patient treatment?')

Response(response='The medical expenses covered for in-patient treatment include expenses for organ donor treatment, ambulance services, dental treatment in case of accident, AYUSH benefit (Ayurveda, Unani, Sidha, Homeopathy), daily cash for accompanying an insured child, vaccination, and out-patient treatment.', source_nodes=[NodeWithScore(node=TextNode(id_='2d037b74-55dd-43e2-b3f3-087d35fa3a1f', embedding=None, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9557f1da-ff02-4daf-be5c-6350b96b65f9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, hash='e3d87c2238a10636d56456bc4f1d720c095c0f13e03977f36f9189963f9d6e63')}, text='3 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nexpenses for up to 60 days in accordance with ( Section

## Breakdown per Component

### 1. Load the Information

In [77]:
# Read the policy PDF into a list of LlamaIndex Document objects.
docs = PDFReader().load_data("../doc/UNIHLIP23006V032223.pdf")

# Preview only the first document: displaying the whole list dumps the full
# text of the PDF into the cell output.
docs[:1]

[Document(id_='0b1f5341-75db-4d5e-9cce-194a3a481c82', embedding=None, metadata={'page_label': '1', 'file_name': 'UNIHLIP23006V032223.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=' \n1 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nCOMPLETE  HEALTHCARE  INSURANCE  \n \nPOLICY SCHEDULE  \nPREAMBLE  \nThis policy is a contract of insurance between You and Universal Sompo General Insurance \nCompany (hereinafter called the `Company’) and contains all the details of the cover that we \nprovide.  \n \nYour policy comprises:  \n \n• The preamble [the current part] which introduces the policy document, describes the structure \nof the document and sets the general rule s; \n• The policy wording which lists and details the available coverage, benefits, claims and \ngrievance redressal procedure, exclusions and other terms and conditions of cover;  \n• The proposal, which is the information You provide 

### 2a. Split the documents into smaller chunks

In [79]:
# Split every loaded document into small overlapping chunks (nodes).
text_splitter = SentenceSplitter(chunk_size=200, chunk_overlap=10)
nodes = text_splitter.get_nodes_from_documents(documents=docs)

# Preview only the first node: displaying the whole list floods the cell
# output with every chunk of the PDF.
nodes[:1]

[TextNode(id_='b758fe31-1959-4983-9dc1-930cc34b131f', embedding=None, metadata={'page_label': '1', 'file_name': 'UNIHLIP23006V032223.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0b1f5341-75db-4d5e-9cce-194a3a481c82', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'UNIHLIP23006V032223.pdf'}, hash='1395dc2c8838efcfb4025be18943a58003c93756ef543ae390567366d58b25ba'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9839eecf-be0c-4d0e-aea7-3c06d559d7b8', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a498daa8b8e591ef43c52a8ff3c12f61339e60a401b0e6912ae7ba1b21294d41')}, text='1 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nCOMPLETE  HEALTHCARE  INSURANCE  \n \nPOLICY SCHEDULE  \nPREAMBLE  \nThis policy is a contract of insurance between You and Universal Sompo General Insurance \nCompany (hereinafter called

### 2b. Select embedding strategy

In [80]:
# Embedding model used to vectorise the nodes; constructed with its default settings.
openai_embedding = OpenAIEmbedding()

### 2c. Create the vectorstore and index

In [81]:
# Create a Chroma collection and wrap it as a LlamaIndex vector store.
chroma_client = chromadb.EphemeralClient()
# get_or_create=True returns the existing collection instead of raising when
# "my_collection" already exists.
# NOTE(review): ephemeral Chroma clients in one process appear to share state,
# so this may be the same collection the single-cell pipeline above already
# filled, and indexing `nodes` again would store duplicate chunks — confirm.
chroma_collection = chroma_client.create_collection("my_collection", get_or_create=True)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embed the nodes with the model selected in step 2b (instead of building a
# second OpenAIEmbedding instance) and store them in the Chroma-backed index.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context, embed_model=openai_embedding)

In [101]:
# Inspect the Chroma client object created above.
chroma_client

<chromadb.api.client.Client at 0x139626400>

In [102]:
# Inspect the Chroma collection that backs the vector store.
chroma_collection

<chromadb.api.models.Collection.Collection at 0x1395815b0>

In [103]:
# Inspect the ChromaVectorStore wrapper around the collection.
vector_store

ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})

In [104]:
# Inspect the StorageContext built from the vector store.
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x13a9ac460>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x1396be3d0>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x1686fcfa0>, property_graph_store=None)

In [95]:
## Some available methods on index

# index.as_chat_engine()
# index.as_query_engine()
# index.as_retriever()

### Create the retriever [Optional in LlamaIndex]

In [97]:
# Build a retriever from the index and fetch the nodes it ranks as most
# relevant to the question; each result is a NodeWithScore.
retriever = index.as_retriever()
retriever.retrieve('What medical expenses are covered for in-patient treatment?')

[NodeWithScore(node=TextNode(id_='2d037b74-55dd-43e2-b3f3-087d35fa3a1f', embedding=None, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9557f1da-ff02-4daf-be5c-6350b96b65f9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, hash='e3d87c2238a10636d56456bc4f1d720c095c0f13e03977f36f9189963f9d6e63')}, text='3 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nexpenses for up to 60 days in accordance with ( Section I. C1) above, and No payment will be \nmade if the condition for which You require medical treatment is:  \n1) Asthma;  \n2) Bronchitis;  \n3) Chronic Nephritis and Nephrotic Syndrome;  \n4) Diarrhoea and all type of Dysenteries including Gatro -enterities;  \n5) Diabetes Mellitus Insipidus;  \n6) Epilepsy;  \n7) Hypertension

### 3. Create Query Engine

In [110]:
# Create a query engine on top of the index, answering with the given OpenAI
# chat model, and ask the question using the default prompts.
# NOTE(review): the recorded answer for this cell lists conditions (Asthma,
# Bronchitis, ...) that the retrieved chunk introduces with "No payment will be
# made if the condition ... is", i.e. apparently exclusions — check the
# retrieved context / chunking before trusting this answer.
query_engine = index.as_query_engine(llm=OpenAI(model="gpt-3.5-turbo-0125"))
query_engine.query('What medical expenses are covered for in-patient treatment?')

Response(response='The medical expenses covered for in-patient treatment include expenses related to Asthma, Bronchitis, Chronic Nephritis, Diarrhoea, Diabetes Mellitus Insipidus, Epilepsy, Hypertension, Influenza, Psychiatric or Psychosomatic Disorders, Pyrexia of unknown origin, Tonsillitis, Upper Respiratory Tract Infection, Arthritis, Gout, and Rheumatism.', source_nodes=[NodeWithScore(node=TextNode(id_='2d037b74-55dd-43e2-b3f3-087d35fa3a1f', embedding=None, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9557f1da-ff02-4daf-be5c-6350b96b65f9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, hash='e3d87c2238a10636d56456bc4f1d720c095c0f13e03977f36f9189963f9d6e63')}, text='3 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nexpen

### Define Custom Prompt [Optional in LlamaIndex]

In [111]:
# Default prompt: show the prompt templates the query engine currently uses,
# keyed by name (e.g. 'response_synthesizer:text_qa_template').

query_engine.get_prompts()

{'response_synthesizer:text_qa_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.QUESTION_ANSWER: 'text_qa'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Context information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: '), conditionals=[(<function is_chat_model at 0x1128fe550>, ChatPromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, message_templates=[ChatMessage(role=<MessageRole.SYS

In [113]:
# Custom question-answering template. {context_str} and {query_str} are the
# template variables of the text_qa_template it replaces.
prompt = (
    "Based on the data provided to you here: {context_str}. \n"
    "Please answer this question: {query_str}"
)
custom_prompt = PromptTemplate(prompt)

# Swap the response synthesizer's text_qa_template for the custom one.
query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})

In [114]:
# Updated prompt: list the templates again to confirm that
# 'response_synthesizer:text_qa_template' now holds the custom PromptTemplate.

query_engine.get_prompts()

{'response_synthesizer:text_qa_template': PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='Based on the data provided to you here: {context_str}. \nPlease answer this question: {query_str}'),
 'response_synthesizer:refine_template': SelectorPromptTemplate(metadata={'prompt_type': <PromptType.REFINE: 'refine'>}, template_vars=['query_str', 'existing_answer', 'context_msg'], kwargs={}, output_parser=None, template_var_mappings={}, function_mappings={}, default_template=PromptTemplate(metadata={'prompt_type': <PromptType.REFINE: 'refine'>}, template_vars=['query_str', 'existing_answer', 'context_msg'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template="The original query is as follows: {query_str}\nWe have provided an existing answer: {existing_answer}\nWe have the opportunity to refine the

### 4. Run the query on RAG

In [108]:
# Ask the question again; when the cells are run top to bottom, the engine now
# answers with the custom text_qa_template installed via update_prompts.
query_engine.query('What medical expenses are covered for in-patient treatment?')

Response(response='The medical expenses covered for in-patient treatment include expenses for organ donor treatment, ambulance services, dental treatment in case of an accident, AYUSH benefit (Ayurveda, Unani, Sidha, Homeopathy), daily cash for accompanying an insured child, vaccination, and out-patient treatment.', source_nodes=[NodeWithScore(node=TextNode(id_='2d037b74-55dd-43e2-b3f3-087d35fa3a1f', embedding=None, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9557f1da-ff02-4daf-be5c-6350b96b65f9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '3', 'file_name': 'UNIHLIP23006V032223.pdf'}, hash='e3d87c2238a10636d56456bc4f1d720c095c0f13e03977f36f9189963f9d6e63')}, text='3 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nexpenses for up to 60 days in accordance with ( Sect