In [17]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py PyMuPDF sentence_transformers python-dotenv langchain_chroma


Collecting langchain_chroma
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_chroma-0.1.2-py3-none-any.whl (9.3 kB)
Installing collected packages: langchain_chroma
Successfully installed langchain_chroma-0.1.2


In [43]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    MessagesPlaceholder
)
from langchain.chains import create_history_aware_retriever
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage

In [38]:
# local_llm = 'gemma'
local_llm = 'llama3'
# local_llm = 'mistral'

In [39]:
# Load PDF documents
pdf_file_paths = ["./data/Medical Policy FY 23-24.pdf", "./data/Provident fund policy.pdf"]

docs_list = []
for pdf_path in pdf_file_paths:
    loader = PyMuPDFLoader(pdf_path)
    try:
        loaded_docs = loader.load()
        docs_list.extend(loaded_docs)
    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

# Split documents
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=0)
doc_splitter = text_splitter.split_documents(docs_list)

# Filter and clean metadata
filtered_doc = []
for doc in doc_splitter:
    if isinstance(doc, Document) and hasattr(doc, 'metadata'):
        clean_metadata = {k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))}
        filtered_doc.append(Document(page_content=doc.page_content, metadata=clean_metadata))

# Add to vectorDB
embedding = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf", gpt4all_kwargs={'allow_download': 'True'})
vectorstore = Chroma.from_documents(
    documents=filtered_doc,
    collection_name="rag-chroma",
    embedding=embedding,
)

retriever = vectorstore.as_retriever()

In [40]:
llm = ChatOllama(model=local_llm, temperature=0)

In [51]:
context_template_str = """system You are an assistant for question-answering tasks. Use the following context to answer the question. Avoid phrases like "Based on the provided context". Explain the answer in the end. and make a heading with paragraph.
Question: {question}
Context: {context}
Answer: assistant"""

context_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context", "question"],
        template=context_template_str,
    )
)

context_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["question"],
        template="{question}",
    )
)

context_messages = [context_system_prompt, context_human_prompt]

context_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=context_messages,
)

output_parser = StrOutputParser()

rag_chain = {"context": retriever,
                 "question": RunnablePassthrough()} | context_prompt_template | llm | output_parser

rag_chain.invoke("Does original receipts must be submitted?")



'**Submission of Original Receipts**\n\nAccording to the Provident Fund Policy V2.0, for construction on a plot, **original receipts must be submitted within 30 days of loan disbursement**.\n\nThis is stated in point 2 under "Documents to be submitted" on page 6 of the policy document:\n\n"For construction on a plot, original receipts must be submitted within 30 days of loan disbursement, while evidence of payment for other withdrawals should also be provided within the same timeframe. Failure to submit original receipts within the stipulated time will result in the deduction of the entire loan amount from the following months’ salaries."\n\nTherefore, yes, original receipts must be submitted within 30 days of loan disbursement for construction on a plot.'

In [77]:
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [78]:
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [90]:
chat_history = []

question = "What is the employee contribution rate for the provident fund?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})


print(ai_msg_1)
chat_history.extend([HumanMessage(content=question), ai_msg_1["answer"]])

{'input': 'What is the employee contribution rate for the provident fund?', 'chat_history': [], 'context': [Document(metadata={'author': '', 'creationDate': "D:20240111181114+05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': './data/Provident fund policy.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240111181114+05'00'", 'page': 3, 'producer': 'Microsoft® Word for Microsoft 365', 'source': './data/Provident fund policy.pdf', 'subject': '', 'title': '', 'total_pages': 7, 'trapped': ''}, page_content='PROVIDENT FUND POLICY V2.0 \n \n4 \n \nOVERVIEW \nProvident Fund is a benefit created by the employer for its employees. Both the employee and the \nemployer contribute to this fund.  \nThe fund is managed according to rules set by the Government of Pakistan and other regulatory \nand tax authorities. \nThe amounts collected in the fund are invested and the profits are accumulated for employees’ \nbenefit. As per law, and since the company (or Trust) that ma

In [84]:

second_question = "can you multiply by 5"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

The answer to 5+5 is 10.

As for multiplying by 5, I can do that!

10 x 5 = 50


In [89]:
chat_history

[HumanMessage(content='What is the employee contribution rate for the provident fund?'),
 "According to the provided context, the employee's contribution rate for the Provident Fund is 8.33% of their basic salary."]