In [10]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py PyMuPDF sentence_transformers python-dotenv




In [11]:
## index

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document

# Load PDF documents
pdf_file_paths = ["./data/Medical Policy FY 23-24.pdf", "./data/Provident fund policy.pdf"]

docs_list = []
for pdf_path in pdf_file_paths:
    try:
        # The loader is built inside the `try` so that a failure in the constructor
        # (e.g. a path it rejects) is reported per file like any other load error,
        # instead of aborting the whole cell.
        loader = PyMuPDFLoader(pdf_path)
        loaded_docs = loader.load()
        docs_list.extend(loaded_docs)
    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

# Split documents into token-based chunks (250 tokens each, no overlap between chunks)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=0)
doc_splitter = text_splitter.split_documents(docs_list)

# Filter and clean metadata: keep only scalar metadata values (str/int/float/bool)
# and drop everything else before the chunks are handed to the vector store.
filtered_doc = []
for doc in doc_splitter:
    if isinstance(doc, Document):
        clean_metadata = {k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))}
        filtered_doc.append(Document(page_content=doc.page_content, metadata=clean_metadata))

# Add to vectorDB
# `allow_download` is passed as a real boolean; the previous string 'True' only worked
# because any non-empty string is truthy (the string 'False' would have enabled it too).
embedding = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf", gpt4all_kwargs={'allow_download': True})
vectorstore = Chroma.from_documents(
    documents=filtered_doc,
    collection_name="rag-chroma",
    embedding=embedding,
)

# Retriever used by every chain defined in the cells below
retriever = vectorstore.as_retriever()

In [12]:
# Model name passed to ChatOllama in the next cell.
# Other candidates (currently disabled): "gemma", "mistral".
local_llm = "llama3"

In [13]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser

# Parser that reduces the chat model's message output to a plain string.
output_parser = StrOutputParser()

# Chat model shared by all chains in this notebook; temperature is fixed at 0.
llm = ChatOllama(
    temperature=0,
    model=local_llm,
)


In [14]:
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain.schema.runnable import RunnablePassthrough

# System prompt for retrieval-augmented answering; {context} receives the retrieved
# text and {question} the user's question.
context_template_str = """system You are an assistant for question-answering tasks. Use the following context to answer the question. Avoid phrases like "Based on the provided context". Explain the answer in the end. and make a heading with paragraph.
Question: {question}
Context: {context}
Answer: assistant"""

context_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context", "question"],
        template=context_template_str,
    )
)

context_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["question"],
        template="{question}",
    )
)

context_messages = [context_system_prompt, context_human_prompt]

context_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=context_messages,
)


def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the list of Document objects itself is interpolated into the
    prompt, so the model sees the list's repr (including metadata) instead of the
    plain chunk text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The input question is fanned out: once through the retriever (then flattened to
# text) to fill {context}, and once unchanged to fill {question}.
context_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | context_prompt_template
    | llm
    | output_parser
)

context_question = "Does original receipts must be submitted?"
context_answer = context_chain.invoke(context_question)
print(context_answer)

**Original Receipts Submission**

According to the Provident Fund Policy V2.0, for construction on a plot, "original receipts must be submitted within 30 days of loan disbursement". This is stated in point 2 under the section "Documents to be submitted".

Additionally, in the Medical Policy FY 23-24, it is mentioned that for medicines, tests, procedures, etc., "original receipts" are required.

In summary, yes, original receipts must be submitted in certain circumstances, specifically for construction on a plot and for medical expenses.


In [30]:
# System prompt for plain question answering, with no retrieved context.
no_context_system_template_str = """system You are an assistant for question-answering tasks. Answer the question concisely and directly. If you dont know the answer the say dont know.
Question: {question}
Answer: assistant"""

# Both message templates take a single variable, {question}.
no_context_system_prompt = SystemMessagePromptTemplate.from_template(no_context_system_template_str)
no_context_human_prompt = HumanMessagePromptTemplate.from_template("{question}")

no_context_messages = [no_context_system_prompt, no_context_human_prompt]
no_context_prompt_template = ChatPromptTemplate.from_messages(no_context_messages)

# The raw input string is passed through as {question}; the model's reply is
# reduced to a plain string by the output parser.
no_context_chain = (
    {"question": RunnablePassthrough()}
    | no_context_prompt_template
    | llm
    | output_parser
)

no_context_question = "my name is haris"
no_context_answer = no_context_chain.invoke(no_context_question)
print(no_context_answer)

Nice to meet you, Haris!


In [32]:
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage, AIMessage

# Prompt that turns the conversation so far plus the latest user input into a
# search query for the retriever.
prompt_search_query = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        ("user",
         "Given the above conversation, generate a search query to look up to get information relevant to the conversation")

    ]
)

# Prompt that answers the latest user input from the retrieved {context}.
# The separator is a real blank line ("\n\n"); the previous "\\n\\n" placed the
# literal characters backslash-n into the system message.
prompt_get_answer = ChatPromptTemplate.from_messages([
    ("system", "Answer the user's questions based on the below context:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
])

# Retriever wrapper built from the LLM, the base retriever and the query prompt.
retriever_chain = create_history_aware_retriever(llm, retriever, prompt_search_query)

# Chain that feeds the retrieved documents into `prompt_get_answer` as {context}.
document_chain = create_stuff_documents_chain(llm, prompt_get_answer)

retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

# Sample prior conversation passed in as chat history.
chat_history = [HumanMessage(content="Does DASA allow anybody to be certified?"), AIMessage(content="Yes")]

response = retrieval_chain.invoke({
    "input": "hello?", "chat_history": chat_history
})

print(response['answer'])


I'm here! However, I don't see any information about DASA or certification in the provided context. The context only mentions a Provident fund policy and its date of issuance. If you have any questions related to this policy or HR department, I'd be happy to help!


In [31]:
from langchain.chains import ConversationChain, ConversationalRetrievalChain

# Follow-up question whose answer is only present in the chat history.
ques = "what is my name?"
# History is given as (human message, assistant reply) pairs.
chathis = [("my name is haris", "Nice to meet you, Haris!")]

# verbose=True makes the chain print the formatted prompts of its intermediate steps.
chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, verbose=True)

# Run through `.invoke`, consistent with the other chains in this notebook;
# calling the chain object directly (`chain({...})`) is the deprecated form.
res = chain.invoke({"question": ques, "chat_history": chathis})

print(res)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: my name is haris
Assistant: Nice to meet you, Haris!
Follow Up Input: what is my name?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
HEALTH POLICY V1.0 
 
2 
 
Contents 
Version history 
..................................................................................................................................... 
3 
Eligibility .......................................................................................