In [11]:
! pip install -U langchain_community PyMuPDF python-docx tiktoken sentence_transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:
import os

from docx import Document as DocxDocument
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder
)
from langchain_community.chat_models import ChatOllama
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AIMessage, HumanMessage
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [19]:
# Name of the model handed to ChatOllama further down; keep exactly one line uncommented.
# NOTE(review): presumably the chosen model must already be available in the local Ollama install — confirm.
# local_llm = 'gemma'
local_llm = 'llama3'
# local_llm = 'llama3.1'
# local_llm = 'mistral'

In [21]:
# Source folders that make up the knowledge base.
cascade_directory = "./data/cascade"
policy_directory = "./data/policy"

directories = [cascade_directory, policy_directory]


# Paths collected per file type; any other entry (e.g. .DS_Store) is printed but ignored.
pdf_file_paths = []
docx_file_paths = []

for directory in directories:
    try:
        # os.listdir() returns entries in arbitrary order; sorting keeps ingestion order reproducible.
        filenames = sorted(os.listdir(directory))
    except OSError as e:
        # Report and keep going, the same way the per-file loaders in this notebook do,
        # so one missing or unreadable folder does not discard the others.
        print(f"Error listing {directory}: {e}")
        continue

    for filename in filenames:
        print(filename)
        # Match the extension case-insensitively so "REPORT.PDF" or "Guide.Docx" are not skipped.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            pdf_file_paths.append(os.path.join(directory, filename))
        elif lowered.endswith(".docx"):
            docx_file_paths.append(os.path.join(directory, filename))

# Accumulates everything the PDF loader returns plus one Document per DOCX file.
docs_list = []

# PDFs: parsing is delegated to PyMuPDFLoader; a file that fails to load is reported and skipped.
for pdf_path in pdf_file_paths:
    loader = PyMuPDFLoader(pdf_path)
    try:
        docs_list.extend(loader.load())
    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

# DOCX: paragraph texts are joined with newlines into a single Document whose
# metadata records the originating path under "source".
for docx_path in docx_file_paths:
    try:
        docx_text = '\n'.join(para.text for para in DocxDocument(docx_path).paragraphs)
        docs_list.append(Document(page_content=docx_text, metadata={"source": docx_path}))
    except Exception as e:
        print(f"Error loading {docx_path}: {e}")

# Chunk the corpus with a tiktoken-based length function: chunk_size=250, no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0,
)
doc_splitter = text_splitter.split_documents(docs_list)

# Rebuild every chunk, keeping only metadata entries whose value is a str, int, float or bool.
# NOTE(review): presumably the vector store rejects other metadata value types — confirm.
filtered_doc = [
    Document(
        page_content=doc.page_content,
        metadata={
            k: v
            for k, v in doc.metadata.items()
            if isinstance(v, (str, int, float, bool))
        },
    )
    for doc in doc_splitter
    if isinstance(doc, Document) and hasattr(doc, 'metadata')
]

# Embed the cleaned chunks with a sentence-transformers model and index them in Chroma.
# NOTE(review): re-running this cell in the same kernel may add the same chunks to the
# "rag-chroma" collection a second time — confirm and restart the kernel if so.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = Chroma.from_documents(
    collection_name="rag-chroma",
    embedding=embedding,
    documents=filtered_doc,
)

# Retriever view of the store (default settings); the chains below query through it.
retriever = vectorstore.as_retriever()


.DS_Store
Cascade features.pdf
CASCADE_USERS.docx
CASCADE_ADMINISTRATION_GUIDE.docx
CASCADE_USER_GUIDE_V6 (USER JOURNEYS).docx
DCL_WORKFLOW.pdf
MORTGAGE_GUIDE-compressed.pdf
DCF_WORKFLOW.pdf
CASCADE_FEATURES.docx
Medical Policy FY 23-24.pdf
.DS_Store
Provident fund policy.pdf


In [22]:
# Chat model wrapper for the Ollama model selected in `local_llm`.
# NOTE(review): temperature=0 presumably makes answers as deterministic as the backend allows — confirm.
llm = ChatOllama(model=local_llm, temperature=0)


In [23]:
# System instruction for the question-rewriting step: turn a follow-up that leans on
# earlier turns into a question that stands on its own, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message order: the instruction, then the prior turns, then the newest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Combine the LLM, the base retriever and the rewrite prompt into one retriever
# (see LangChain's create_history_aware_retriever for the exact behaviour).
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [24]:
# System message for the answering step. It carries only the instructions and the
# retrieved context: the question itself arrives once, through the final human message,
# and message roles are conveyed by the chat-message structure, so no literal
# "system"/"assistant" role words or duplicate "Question:" line belong in this text.
# `{context}` must stay: create_stuff_documents_chain fills it with the retrieved documents.
qa_system_prompt = """You are an assistant for question-answering tasks. Use the following context to answer the question. Avoid phrases like "Based on the provided context". Format the answer as a heading followed by a paragraph, and explain the answer at the end.

Context: {context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Chain that stuffs the retrieved documents into `qa_prompt` and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full pipeline: history-aware retrieval first, then answer generation.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [25]:
# Conversation so far, passed to the chain under the "chat_history" key; empty before the first question.
chat_history = []


In [26]:
question = "Explain the significance of document management within Cascade™."
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})

# Record both turns with explicit roles. A bare string in the history is coerced to a
# human message, which would replay the assistant's answer as if the user had said it.
# Note: re-running this cell appends the same exchange to the history again.
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

In [10]:
# Show the generated text stored under the "answer" key of the chain result.
print(ai_msg_1["answer"])

**Document Management Significance in Cascade™**

The document management module within Cascade™ is a crucial feature that enables users to manage documents efficiently throughout their lifecycle, from upload and storage to retrieval and archiving. This module's significance can be summarized as follows:

1. **End-to-End Document Management**: The document management module provides a comprehensive solution for managing documents, covering all stages of the document lifecycle, including creation, storage, retrieval, and archiving.
2. **Flexibility and Customization**: The module's flexible nature allows users to configure it at any stage of the loan application lifecycle, making it an essential tool for streamlining workflows and improving productivity.
3. **Version Control and Retrieval**: Document versioning enables users to track changes and maintain a record of all versions, ensuring that the most up-to-date information is always accessible.
4. **Tagging and Search**: The ability t