In [28]:
import os
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py PyMuPDF

Collecting langchain-core<0.3,>=0.1.46 (from langchain-nomic)
  Using cached langchain_core-0.2.21-py3-none-any.whl.metadata (6.0 kB)
Using cached langchain_core-0.2.21-py3-none-any.whl (372 kB)
Installing collected packages: langchain-core
Successfully installed langchain-core


In [29]:
# Name of the Ollama model that the ChatOllama instances below are created with.
local_llm = "gemma"

In [30]:
## index

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.document_loaders import FireCrawlLoader, PyMuPDFLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document

urls = [
    "https://www.ai-jason.com/learning-ai/how-to-reduce-llm-cost",
    # "https://www.ai-jason.com/learning-ai/gpt5-llm",
    # "https://www.ai-jason.com/learning-ai/how-to-build-ai-agent-tutorial-3"
]

# The Firecrawl key is read from the environment instead of being committed
# with the notebook. Set FIRECRAWL_API_KEY before running this cell.
# NOTE(review): the key that used to be hardcoded here should be revoked.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")

# Load documents from URLs (best effort: a failing URL is reported and skipped).
# The loader is built inside the `try` so that a missing key is reported the
# same way as any other load failure.
docs_list = []
for url in urls:
    try:
        loader = FireCrawlLoader(api_key=firecrawl_api_key, url=url, mode='scrape')
        docs_list.extend(loader.load())
    except Exception as e:
        print(f"Error loading {url}: {e}")

# Load PDF documents (same best-effort policy as the URLs)
pdf_file_paths = ["data/Medical Policy FY 23-24.pdf", "data/Provident fund policy.pdf"]

for pdf_path in pdf_file_paths:
    try:
        loader = PyMuPDFLoader(pdf_path)
        docs_list.extend(loader.load())
    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

# Split documents into chunks of at most 250 tokens with no overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=0)
doc_splitter = text_splitter.split_documents(docs_list)

# Filter and clean metadata: keep only str/int/float/bool values and rebuild
# each chunk as a fresh Document carrying the reduced metadata.
filtered_doc = [
    Document(
        page_content=chunk.page_content,
        metadata={k: v for k, v in chunk.metadata.items() if isinstance(v, (str, int, float, bool))},
    )
    for chunk in doc_splitter
    if isinstance(chunk, Document)
]


# Load text file content
def load_text_file(file_path):
    """Read a UTF-8 text file and wrap its full content in one Document.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A Document whose ``page_content`` is the file's text and whose
        metadata holds the path under the ``"source"`` key.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    source_info = {"source": file_path}
    return Document(page_content=text, metadata=source_info)


# Path to your text file
text_file_path = "useful_questions.txt"
text_doc = load_text_file(text_file_path)

# Split the text file content with the same splitter used for the URL/PDF documents
text_doc_splitter = text_splitter.split_documents([text_doc])

# Filter and clean metadata for the text file content: keep only
# str/int/float/bool values and rebuild each chunk as a fresh Document.
filtered_text_doc = [
    Document(
        page_content=chunk.page_content,
        metadata={k: v for k, v in chunk.metadata.items() if isinstance(v, (str, int, float, bool))},
    )
    for chunk in text_doc_splitter
    if isinstance(chunk, Document)
]

# Combine URL documents, PDF documents, and text file documents
combined_docs = filtered_doc + filtered_text_doc

# Add to vectorDB.
# `allow_download` is passed as a real boolean; the string 'True' only worked
# because any non-empty string is truthy (so 'False' would have enabled it too).
embedding = GPT4AllEmbeddings(
    model_name="all-MiniLM-L6-v2.gguf2.f16.gguf",
    gpt4all_kwargs={'allow_download': True},
)
vectorstore = Chroma.from_documents(
    documents=combined_docs,
    collection_name="rag-chroma",
    embedding=embedding,
)

retriever = vectorstore.as_retriever()


In [34]:
### Generate

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama

# Prompt
# Prompt that asks the model to answer `question` using the supplied `context`.
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="""system You are an assistant for question-answering tasks.
    Use the following context to answer the question directly. If you dont understand the answer, ask another question.
    Question: {question} 
    Context: {context} 
    Answer: assistant""",
)


# Chat model served by Ollama, selected by `local_llm`, run at temperature 0.
llm = ChatOllama(temperature=0, model=local_llm)


# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents in order; empty if ``docs`` is empty.
    """
    contents = [entry.page_content for entry in docs]
    separator = "\n\n"
    return separator.join(contents)


# Chain: prompt -> local chat model -> plain string output
rag_chain = prompt | llm | StrOutputParser()

# Run
question = "Hello"
docs = retriever.invoke(question)
# The prompt expects plain text for `context`. Passing the Document list
# directly would interpolate the objects' repr (metadata included), so the
# retrieved chunks are flattened with format_docs first.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)

I am unable to answer the question as it is not related to the provided context.


In [52]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model for semantic similarity
# Sentence-embedding model used by is_relevant to compare question and context.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Prompt used when the retrieved context is judged relevant to the question.
context_prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="""system You are an assistant for question-answering tasks.
    Use the following context to answer the question directly. Avoid phrases like "Based on the provided context.and explain answer in the end".
    Question: {question}
    Context: {context}
    Answer: assistant""",
)

# Prompt used when the retrieved context is not relevant: question only.
no_context_prompt = PromptTemplate(
    input_variables=["question"],
    template="""system You are an assistant for question-answering tasks.
    Answer the question concisely and directly.
    Question: {question}
    Answer: assistant""",
)

# Chat model served by Ollama, selected by `local_llm`, run at temperature 0.
llm = ChatOllama(temperature=0, model=local_llm)

# Function to format documents
def format_docs(docs):
    """Concatenate the text of the given documents into a single string.

    Each document contributes its ``page_content``; consecutive contents are
    separated by two newlines. An empty input yields an empty string.
    """
    merged = "\n\n".join(map(lambda item: item.page_content, docs))
    return merged

# Function to check relevance using semantic similarity
def is_relevant(question, context_docs, threshold=0.5):
    """Decide whether the retrieved documents are semantically close to the question.

    The question and the concatenated document text (see ``format_docs``) are
    embedded with the module-level ``model`` and compared by cosine similarity.

    Args:
        question: The user's question.
        context_docs: Documents returned by the retriever; may be empty or None.
        threshold: Similarity that must be strictly exceeded. Defaults to 0.5.

    Returns:
        True when the similarity is greater than ``threshold``; False otherwise,
        including when there are no documents to compare against.
    """
    # Nothing retrieved means nothing can be relevant; skip the embedding work
    # rather than scoring the question against an empty string.
    if not context_docs:
        return False

    # NOTE(review): all chunks are encoded as one string; a long combined
    # context may exceed the encoder's input limit - confirm this is acceptable.
    context_text = format_docs(context_docs)
    question_embedding = model.encode(question, convert_to_tensor=True)
    context_embedding = model.encode(context_text, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(question_embedding, context_embedding)
    return similarity.item() > threshold

# Function to get the appropriate chain based on relevance
def get_chain(question, context_docs):
    """Build the chain suited to the question.

    Uses ``context_prompt`` when ``is_relevant`` accepts the documents and
    ``no_context_prompt`` otherwise; either prompt is piped into ``llm`` and a
    ``StrOutputParser``.
    """
    chosen_prompt = context_prompt if is_relevant(question, context_docs) else no_context_prompt
    return chosen_prompt | llm | StrOutputParser()

# Main function to run the question-answering process
def answer_question(question):
    """Answer a question, using retrieved context only when it is relevant.

    Documents are fetched with the module-level ``retriever``. If
    ``is_relevant`` accepts them, the question is answered with
    ``context_prompt`` and the documents' text; otherwise with
    ``no_context_prompt`` and the question alone.

    Args:
        question: The user's question.

    Returns:
        The model's answer as a string.
    """
    docs = retriever.invoke(question)

    # Relevance is evaluated once, so the chosen prompt and the inputs given
    # to it cannot disagree and the embeddings are not computed twice.
    if is_relevant(question, docs):
        rag_chain = context_prompt | llm | StrOutputParser()
        # The prompt needs plain text; passing the Document list would
        # interpolate the objects' repr instead of their content.
        inputs = {"context": format_docs(docs), "question": question}
    else:
        rag_chain = no_context_prompt | llm | StrOutputParser()
        inputs = {"question": question}

    return rag_chain.invoke(inputs)

# Example: send one question through answer_question and print the reply
question = 'will I get reimbursement if I submit receipt after 25?'
answer = answer_question(question=question)
print(answer)


No, receipts submitted after 25th of any month will not be eligible for reimbursement with the current month's salary. They will be reimbursed with the next month's salary.
