In [1]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py PyMuPDF sentence_transformers python-dotenv langchain_chroma python-docx


Collecting langgraph
  Downloading langgraph-0.1.16-py3-none-any.whl.metadata (13 kB)
Downloading langgraph-0.1.16-py3-none-any.whl (102 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 kB[0m [31m610.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: langgraph
  Attempting uninstall: langgraph
    Found existing installation: langgraph 0.1.15
    Uninstalling langgraph-0.1.15:
      Successfully uninstalled langgraph-0.1.15
Successfully installed langgraph-0.1.16

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder
)
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage
from docx import Document as DocxDocument

In [3]:
# local_llm = 'gemma'
# local_llm = 'llama3'
local_llm = 'llama3.1'
# local_llm = 'mistral'

In [4]:
# Directory to check
cascade_directory = "./data/cascade"
policy_directory = "./data/policy"

directories = [cascade_directory, policy_directory]


# Lists to store file paths
pdf_file_paths = []
docx_file_paths = []

# Check for files in the directory
for directory in directories:
    for filename in os.listdir(directory):
        print(filename)
        if filename.endswith(".pdf"):
            pdf_file_paths.append(os.path.join(directory, filename))
        elif filename.endswith(".docx"):
            docx_file_paths.append(os.path.join(directory, filename))

docs_list = []

# Load PDF files
for pdf_path in pdf_file_paths:
    loader = PyMuPDFLoader(pdf_path)
    try:
        loaded_docs = loader.load()
        docs_list.extend(loaded_docs)
    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

# Load DOCX files
for docx_path in docx_file_paths:
    try:
        docx = DocxDocument(docx_path)
        full_text = []
        for para in docx.paragraphs:
            full_text.append(para.text)
        docx_text = '\n'.join(full_text)
        docs_list.append(Document(page_content=docx_text, metadata={"source": docx_path}))
    except Exception as e:
        print(f"Error loading {docx_path}: {e}")

# Split documents
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=0)
doc_splitter = text_splitter.split_documents(docs_list)

# Filter and clean metadata
filtered_doc = []
for doc in doc_splitter:
    if isinstance(doc, Document) and hasattr(doc, 'metadata'):
        clean_metadata = {k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))}
        filtered_doc.append(Document(page_content=doc.page_content, metadata=clean_metadata))

# Add to vectorDB
embedding = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf", gpt4all_kwargs={'allow_download': 'True'})
vectorstore = Chroma.from_documents(
    documents=filtered_doc,
    collection_name="rag-chroma",
    embedding=embedding,
)

retriever = vectorstore.as_retriever()


.DS_Store
Cascade features.pdf
CASCADE_USERS.docx
CASCADE_ADMINISTRATION_GUIDE.docx
CASCADE_USER_GUIDE_V6 (USER JOURNEYS).docx
DCL_WORKFLOW.pdf
MORTGAGE_GUIDE-compressed.pdf
DCF_WORKFLOW.pdf
CASCADE_FEATURES.docx
Medical Policy FY 23-24.pdf
.DS_Store
Provident fund policy.pdf


In [5]:
directories

['./data/cascade', './data/policy']

In [6]:
llm = ChatOllama(model=local_llm, temperature=0)

In [7]:
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [8]:
qa_system_prompt = """system You are an assistant for question-answering tasks. Use the following context to answer the question. Avoid phrases like "Based on the provided context". Explain the answer in the end. and make a heading with paragraph.
Question: {input}
Context: {context}
Answer: assistant"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [9]:
chat_history = []

In [10]:
question = "Explain the significance of document management within Cascade™."
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg_1["answer"]])

In [15]:
print(ai_msg_1["answer"])

**Document Management in Cascade**

The Document Management module in Cascade is a crucial feature that enables users to manage documents end-to-end, from creation to storage, archiving, and retrieval using indexing. This module's flexibility allows it to be enabled at any stage of the loan application lifecycle.

**Key Features and Significance**

1. **Document Upload**: Users can upload documents using file upload or browser extension, making it easy to add relevant documents to the system.
2. **Document Versioning and Retrieval**: The module allows users to track document versions and retrieve previous versions, ensuring that all changes are recorded and accessible.
3. **Document Tagging**: Documents can be tagged with respect to their type, making it easier to categorize and search for specific documents.
4. **Document Preview**: Users can preview documents with features like rotate, zoom, and search, allowing them to quickly review document content.
5. **Document Thumbnail and Sea

In [12]:
second_question = "What are common ways of doing it?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

**Common Ways of Managing Documents in Cascade**

The Document Management module in Cascade offers several features to manage documents effectively. Some common ways of managing documents in Cascade include:

1. **Uploading documents**: Users can upload documents using file upload or browser extension, making it easy to add relevant documents to the system.
2. **Versioning and retrieval**: The module allows users to track document versions and retrieve previous versions, ensuring that all changes are recorded and accessible.
3. **Tagging documents**: Documents can be tagged with respect to their type, making it easier to categorize and search for specific documents.
4. **Previewing documents**: Users can preview documents with features like rotate, zoom, and search, allowing them to quickly review document content.
5. **Searching and retrieving documents**: The module provides a thumbnail view of documents and enables searching, making it easy to locate specific documents.

These commo

In [13]:
chat_history

[HumanMessage(content='Explain the significance of document management within Cascade™.'),
 "**Document Management in Cascade**\n\nThe Document Management module in Cascade is a crucial feature that enables users to manage documents end-to-end, from creation to storage, archiving, and retrieval using indexing. This module's flexibility allows it to be enabled at any stage of the loan application lifecycle.\n\n**Key Features and Significance**\n\n1. **Document Upload**: Users can upload documents using file upload or browser extension, making it easy to add relevant documents to the system.\n2. **Document Versioning and Retrieval**: The module allows users to track document versions and retrieve previous versions, ensuring that all changes are recorded and accessible.\n3. **Document Tagging**: Documents can be tagged with respect to their type, making it easier to categorize and search for specific documents.\n4. **Document Preview**: Users can preview documents with features like rotat

In [14]:
for document in ai_msg_1["context"]:
    print(document)

page_content='This module covers the complete functionality of document management in Cascade suite. It includes end to end management of document, along with its lifecycle, storage archiving and retrieval using indexing. Due to flexible nature of the module, it can be enabled using configuration at any stage of the loan application lifecycle. This module has the following features which enable users to utilize complete functionality while working on Cascade suite.
Document upload using file upload or browser extension.
Document versioning and retrieval
Document tagging with respect to the type.
Document preview with rotate, zoom and search.
Document thumbnail and search for easy access
Association of documents with checklist and discrepancy resolution' metadata={'source': './data/cascade/CASCADE_FEATURES.docx'}
page_content='or a similar integration channel. SAF (store and forward) mechanism ensures robustness at this stage. 
In case of limit creation, relevant entries are passed in t