# RAG using Langchain

In [19]:
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from operator import itemgetter
from dotenv import load_dotenv

load_dotenv()

True

## Simple RAG Archictecture

In [32]:
# 1. Load the Information
docs = PyPDFLoader("../doc/UNIHLIP23006V032223.pdf").load()

# 2a. Split the documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, 
                                               chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# 2b. Select embedding strategy/ type
openai_embedding = OpenAIEmbeddings()

# 2c. Create the vectorstore
vectorstore = Chroma.from_documents(documents=splits, embedding=openai_embedding)

# 3. Create the retriever
retriever = vectorstore.as_retriever()

# 4. Create the Prompt [NOT optional in LangChain]
prompt = """Based on the data provided to you here: {context}. 
Please answer this question: {question}"""

custom_prompt = PromptTemplate.from_template(prompt)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# 5. Create Chain using LangChain Expression Language [Another LangChain-specific part]
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_prompt
    | ChatOpenAI(model="gpt-3.5-turbo-0125")
    | StrOutputParser()
)

rag_chain.invoke('What medical expenses are covered for in-patient treatment?')

'The medical expenses covered for in-patient treatment include room rent, boarding expenses, nursing services, intensive care unit charges, medical practitioner fees, anaesthesia, blood, oxygen, operation theatre charges, surgical appliances, and medicines, drugs, and consumables.'

## Breakdown per Component

### 1. Load the Information

In [23]:
docs = PyPDFLoader("../doc/UNIHLIP23006V032223.pdf").load()

### 2a. Split the documents into smaller chunks

In [24]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, 
                                               chunk_overlap=200)
splits = text_splitter.split_documents(docs)
splits

[Document(page_content='1 Policy Wording - Complete Healthcare Insurance           UIN : UNIHLIP23006V032223  \n \n \nCOMPLETE  HEALTHCARE  INSURANCE  \n \nPOLICY SCHEDULE  \nPREAMBLE  \nThis policy is a contract of insurance between You and Universal Sompo General Insurance \nCompany (hereinafter called the `Company’) and contains all the details of the cover that we \nprovide.  \n \nYour policy comprises:  \n \n• The preamble [the current part] which introduces the policy document, describes the structure \nof the document and sets the general rule s; \n• The policy wording which lists and details the available coverage, benefits, claims and \ngrievance redressal procedure, exclusions and other terms and conditions of cover;  \n• The proposal, which is the information You provide to us and which forms the basis for this \ninsurance cover;  \n• The policy schedule - a separate document customized for you showing the cover details \nopted for by You and offered by Us to You.  It is to 

### 2b. Select embedding strategy

In [25]:
openai_embedding = OpenAIEmbeddings()

### 2c. Create the vectorstore


In [28]:
vectorstore = Chroma.from_documents(documents=splits, embedding=openai_embedding)
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x15a477af0>

### 3. Create the retriever


In [30]:
retriever = vectorstore.as_retriever()
retriever.invoke("What medical expenses are covered for in-patient treatment?")

## Unlike in Llama-index, we can directly perform retrieval from the vectorstore

[Document(page_content='C1. In-patient Treatment  \nThe Medical Expenses for:  \n• Room Rent, boarding Expenses  \n• Nursing  \n• Intensive Care Unit \n• Medical Practitioner(s)  \n• Anaesthesia, blood, oxygen, operation theatre charges, surgical appliances  \n• Medicines, drugs and consumables', metadata={'page': 0, 'source': '../doc/UNIHLIP23006V032223.pdf'}),
 Document(page_content='C1. In-patient Treatment  \nThe Medical Expenses for:  \n• Room Rent, boarding Expenses  \n• Nursing  \n• Intensive Care Unit \n• Medical Practitioner(s)  \n• Anaesthesia, blood, oxygen, operation theatre charges, surgical appliances  \n• Medicines, drugs and consumables', metadata={'page': 0, 'source': '../doc/UNIHLIP23006V032223.pdf'}),
 Document(page_content='C1. In-patient Treatment  \nThe Medical Expenses for:  \n• Room Rent, boarding Expenses  \n• Nursing  \n• Intensive Care Unit \n• Medical Practitioner(s)  \n• Anaesthesia, blood, oxygen, operation theatre charges, surgical appliances  \n• Medicin

### 4. Create the Prompt

In [31]:
# default template is also available, but prompt declaration is a must on LangChain

prompt = """Based on the data provided to you here: {context}. 
Please answer this question: {question}"""

custom_prompt = PromptTemplate.from_template(prompt)

### 5. Create Chain using LangChain Expression Language [Another LangChain-specific part]

In [34]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_prompt
    | ChatOpenAI(model="gpt-3.5-turbo-0125")
    | StrOutputParser()
)

### 6. Run the query on RAG

In [35]:
rag_chain.invoke('What medical expenses are covered for in-patient treatment?')

'The medical expenses covered for in-patient treatment include room rent, boarding expenses, nursing care, intensive care unit costs, medical practitioner fees, anesthesia, blood, oxygen, operation theatre charges, surgical appliances, and medicines, drugs, and consumables.'