In [1]:
import langchain
# Record the installed LangChain version alongside the outputs of this run.
print(langchain.__version__)

0.3.13


In [2]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment; the value
# displayed under the cell is load_dotenv()'s return value.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

### Call LLM

In [18]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from typing import List
from langchain_core.documents import Document

# Splitter shared by the rest of the notebook. Sizes are measured with len,
# i.e. in characters: chunk_size=2048 with chunk_overlap=128.
_splitter_options = dict(chunk_size=2048, chunk_overlap=128, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(**_splitter_options)

In [19]:
# 1. Function to load documents from a folder
def load_documents(folder_path: str) -> List[Document]:
    """Load every supported file found directly inside ``folder_path``.

    Files whose name ends in ``.pdf`` are read with ``PyPDFLoader`` and files
    ending in ``.docx`` with ``Docx2txtLoader``. The extension check ignores
    case, so ``REPORT.PDF`` is loaded too. Any other entry is reported on
    stdout and skipped.

    Args:
        folder_path: Directory whose entries are scanned (no recursion).

    Returns:
        The concatenated results of each loader's ``load()`` call, in sorted
        filename order.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document (and therefore chunk) order stable across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

# Load documents from a folder
folder_path = "./dataset"
documents = load_documents(folder_path)

# NOTE(review): the PDF loader output seen later carries a 'page' entry in its
# metadata, so this count is probably per page rather than per file — confirm.
print(f"Loaded {len(documents)} documents from the folder.")
# Cut the loaded documents into chunks with the splitter configured above;
# `splits` is what gets embedded and stored in the vector store.
splits = text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")

Unsupported file type: combined_documents.md
Loaded 20 documents from the folder.
Split the documents into 46 chunks.


### Create and persist Chroma vector store

In [20]:
from langchain_chroma import Chroma

import hashlib

embedding_function = OpenAIEmbeddings()
collection_name = "rag_service_collection_nb"

# Give every chunk a deterministic id derived from its source, page and text.
# Without explicit ids each run of this cell appends the same chunks again under
# fresh random ids, and the retriever then returns several identical hits.
# langchain_chroma upserts by id, so re-running with stable ids overwrites
# instead of duplicating.
# NOTE: chunks stored by earlier runs under random ids are not removed by this;
# delete ./chroma_db once to clear them.
chunks_by_id = {
    hashlib.sha256(
        f"{chunk.metadata.get('source')}|{chunk.metadata.get('page')}|{chunk.page_content}".encode("utf-8")
    ).hexdigest(): chunk
    for chunk in splits
}

vectorstore = Chroma.from_documents(
    collection_name=collection_name,
    documents=list(chunks_by_id.values()),
    ids=list(chunks_by_id.keys()),
    embedding=embedding_function,
    persist_directory="./chroma_db",
)

print("Vector store created and persisted to './chroma_db'")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Vector store created and persisted to './chroma_db'


In [6]:
# Sample question reused by the retrieval, reranking and single-turn RAG cells.
query = "How to understand documents visually?"

In [21]:
# Retriever over the vector store; k=10 requests ten chunks per query, which
# later serve as the candidate set for the reranker.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
retriever.invoke(query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(metadata={'page': 2, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='and visual document features like a human would.\n2.2 Integrating Visual features\nContrastive Vision Language Models. Mapping\nlatent representations of textual content to corre-\nsponding representations of visual content has been\ndone by aligning disjoint visual and text encoders\nthrough contrastive losses (Radford et al., 2021;\nZhai et al., 2023). While some OCR capabilities\nexist in these models, the visual component is often\nnot optimized for text understanding. The Fine-\ngrained Interactive Language-Image Pre-training\n(Yao et al., 2021) framework extends the late inter-\naction mechanism to cross-modal vision-language\nmodels, relying on max similarity operations be-\ntween text tokens and image patches.\nVisually Rich Document Understanding. To\ngo beyond text, some document-focused models\njointly encode text tokens alongside visual or docu-\nment layout features (Appalaraju et

### Reranking

In [22]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

# Rerank the candidates returned by `retriever` with a FlashRank model and keep
# only the best ones. In the saved run, 3 documents come back from the 10 candidates.
# NOTE(review): the number kept is the compressor's default here — confirm.
compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [23]:
%%time
# Retrieve and rerank in one call; the cell magic above (which must stay on the
# first line of the cell) reports how long this takes.
reranked_docs = compression_retriever.invoke(query)
len(reranked_docs)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)


CPU times: user 7.54 s, sys: 617 ms, total: 8.16 s
Wall time: 1.3 s


3

In [24]:
# Show id, a 256-character preview and the relevance score of each reranked hit,
# followed by a divider line and a blank line.
divider = "-" * 80
for doc in reranked_docs:
    meta = doc.metadata
    print("id: {}\n".format(meta['id']))
    print("text: {}\n".format(doc.page_content[:256]))
    print("score: {}".format(meta['relevance_score']))
    print(divider, end="\n\n")

id: 1

text: document retrieval settings, in which user queries
may require both textual and visual understanding
to be correctly matched to relevant documents. We
highlight the shortcomings of current text-centric
systems in these settings.1
Contribution 2: ColPali. W

score: 0.9555143713951111
--------------------------------------------------------------------------------

id: 2

text: document retrieval settings, in which user queries
may require both textual and visual understanding
to be correctly matched to relevant documents. We
highlight the shortcomings of current text-centric
systems in these settings.1
Contribution 2: ColPali. W

score: 0.9555143713951111
--------------------------------------------------------------------------------

id: 3

text: document retrieval settings, in which user queries
may require both textual and visual understanding
to be correctly matched to relevant documents. We
highlight the shortcomings of current text-centric
systems in these settings.1

In [25]:
from langchain_core.prompts import ChatPromptTemplate
# Single-turn prompt with two placeholders: {context} receives the retrieved
# text and {question} the user's question. The model is told to answer from
# the context only.
template = """You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.

Given the context below:
{context}

And the question:
{question}

Provide a precise and concise answer based solely on the provided context. Do not include any information that is not explicitly present in the context.

Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [26]:
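# Disabled experiment, kept because the saved output below comes from it:
# piping the retriever straight into the prompt puts the raw list of Document
# objects (metadata and all) into {context}.
# from langchain.schema.runnable import RunnablePassthrough
# rag_chain = (
#     {"context": compression_retriever, "question": RunnablePassthrough()} | prompt
# )
# rag_chain.invoke(query)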

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


ChatPromptValue(messages=[HumanMessage(content="You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.\n\nGiven the context below:\n[Document(metadata={'page': 2, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='and visual document features like a human would.\\n2.2 Integrating Visual features\\nContrastive Vision Language Models. Mapping\\nlatent representations of textual content to corre-\\nsponding representations of visual content has been\\ndone by aligning disjoint visual and text encoders\\nthrough contrastive losses (Radford et al., 2021;\\nZhai et al., 2023). While some OCR capabilities\\nexist in these models, the visual component is often\\nnot optimized for text understanding. The Fine-\\ngrained Interactive Language-Image Pre-training\\n(Yao et al., 2021) framework extends the late inter-\\naction mechanism to cross-modal vision-language\\nmodels, relying on m

In [27]:
def docs2str(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [28]:
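# Disabled experiment, kept because the saved output below comes from it:
# with docs2str in the pipeline, {context} receives only the page text of the
# retrieved documents.
# from langchain.schema.runnable import RunnablePassthrough
# rag_chain = (
#     {"context": compression_retriever | docs2str, "question": RunnablePassthrough()} | prompt
# )
# rag_chain.invoke(query)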

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_du

ChatPromptValue(messages=[HumanMessage(content='You are a highly capable assistant specializing in answering questions from visually rich documents. Consider both textual and visual elements as context.\n\nGiven the context below:\ndocument retrieval settings, in which user queries\nmay require both textual and visual understanding\nto be correctly matched to relevant documents. We\nhighlight the shortcomings of current text-centric\nsystems in these settings.1\nContribution 2: ColPali. We propose a novel\nmodel architecture and training strategy based on\nVision Language Models (VLMs) to efficiently in-\ndex documents purely from their visual features,\nallowing for subsequent fast query matching with\n\ndocument retrieval settings, in which user queries\nmay require both textual and visual understanding\nto be correctly matched to relevant documents. We\nhighlight the shortcomings of current text-centric\nsystems in these settings.1\nContribution 2: ColPali. We propose a novel\nmodel

In [29]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Chat model used by every chain below.
llm = ChatOpenAI(model="gpt-4o-mini")

In [30]:
from langchain.schema.runnable import RunnablePassthrough
# Single-turn RAG pipeline: the incoming question is passed through unchanged
# as {question} and also sent to the reranking retriever, whose documents are
# flattened to text by docs2str for {context}. The filled prompt goes to the
# chat model and the reply is reduced to a plain string.
rag_chain = (
    {"context": compression_retriever | docs2str, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
# `question` and `response` are reused below to seed the chat history.
question = query
response = rag_chain.invoke(question)
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_dump to serialize <class 'langchain_core.documents.base.Document'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'numpy.float32'>)
ERROR:langsmith._internal._serde:Failed to use model_du

To understand documents visually, one can utilize a model architecture and training strategy based on Vision Language Models (VLMs) that efficiently index documents purely from their visual features. This approach allows for fast query matching by leveraging both textual and visual understanding to correctly match user queries to relevant documents.


### Conversational RAG

#### Handling Follow Up Questions

In [None]:
# Example conversation
from langchain_core.messages import HumanMessage, AIMessage
# Seed the history with the single-turn exchange: the question asked and the
# answer the chain returned.
chat_history = [
    HumanMessage(content=question),
    AIMessage(content=response),
]

In [None]:
# Display the seeded history (one human message followed by one AI message).
chat_history

In [None]:
from langchain_core.prompts import MessagesPlaceholder
# System instruction for the question-rewriting step: turn a follow-up that
# leans on earlier turns into a standalone question, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: system instruction, then the prior messages supplied under
# "chat_history", then the new user input supplied under "input".
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# The history-aware retriever that uses this prompt is built in the next cell;
# here the rewriting step is run on its own to inspect the reformulated question.
contextualize_chain = contextualize_q_prompt | llm | StrOutputParser()
contextualize_chain.invoke({"input": "Where it is headquartered?", "chat_history": chat_history})

In [None]:
from langchain.chains import create_history_aware_retriever
# Retriever that takes {"input", "chat_history"} and uses the LLM with
# contextualize_q_prompt before querying the underlying retriever.
# NOTE(review): this wraps the plain `retriever`, not `compression_retriever`,
# so the reranking step is not part of the conversational chain — confirm
# that is intended.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)
history_aware_retriever.invoke({"input": "Where it is headquartered?", "chat_history": chat_history})

In [None]:
# For comparison: the same follow-up sent to the plain retriever, without the
# chat history or the question-rewriting step.
retriever.invoke("Where it is headquartered?")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Answering prompt: a fixed system instruction, the retrieved documents under
# {context}, the prior turns under "chat_history", then the new user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# NOTE: this rebinds `rag_chain`. From here on it expects a dict with "input"
# and "chat_history" (see the invoke calls below) instead of the bare question
# string the single-turn chain took, and the cells below read the reply from
# the 'answer' key of its result.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
# Ask the follow-up through the conversational chain, supplying the seeded history.
rag_chain.invoke({"input": "Where it is headquartered?", "chat_history":chat_history})

### Building Multi User Chatbot

In [None]:
import sqlite3
from datetime import datetime

# SQLite file (relative to the working directory) that stores the chat logs.
DB_NAME = "rag_app.db"

def get_db_connection():
    """Open a new connection to ``DB_NAME`` whose rows are ``sqlite3.Row`` objects.

    ``sqlite3.Row`` lets callers read columns by name (``row['user_query']``).
    The caller is responsible for closing the returned connection.
    """
    connection = sqlite3.connect(DB_NAME)
    connection.row_factory = sqlite3.Row
    return connection

def create_application_logs():
    """Create the ``application_logs`` table if it does not exist yet.

    Columns: autoincrement ``id``, ``session_id``, ``user_query``,
    ``gpt_response``, ``model`` and ``created_at`` (defaults to the insertion
    time via ``CURRENT_TIMESTAMP``). Calling this repeatedly is harmless
    because of ``IF NOT EXISTS``.
    """
    conn = get_db_connection()
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
                    (id INTEGER PRIMARY KEY AUTOINCREMENT,
                     session_id TEXT,
                     user_query TEXT,
                     gpt_response TEXT,
                     model TEXT,
                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
        # Explicit commit so the DDL is persisted regardless of the
        # connection's transaction mode.
        conn.commit()
    finally:
        # Release the connection even when the statement fails.
        conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    """Append one question/answer exchange to ``application_logs``.

    Args:
        session_id: Identifier of the conversation the exchange belongs to.
        user_query: The user's question as sent to the chain.
        gpt_response: The answer text returned by the chain.
        model: Name of the model that produced the answer.
    """
    conn = get_db_connection()
    try:
        # Values are bound as parameters, never formatted into the SQL text.
        conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
                     (session_id, user_query, gpt_response, model))
        conn.commit()
    finally:
        # Release the connection even when the insert or commit fails.
        conn.close()

def get_chat_history(session_id):
    """Return the logged conversation for ``session_id`` as role/content dicts.

    Each stored row contributes two entries, ``{"role": "human", ...}``
    followed by ``{"role": "ai", ...}``, oldest exchange first. A session with
    no rows yields an empty list.
    """
    conn = get_db_connection()
    try:
        # created_at is filled by CURRENT_TIMESTAMP, which has one-second
        # resolution; the autoincrement id breaks ties so exchanges logged
        # within the same second still come back in insertion order.
        rows = conn.execute(
            'SELECT user_query, gpt_response FROM application_logs '
            'WHERE session_id = ? ORDER BY created_at, id',
            (session_id,),
        ).fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    messages = []
    for row in rows:
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']}
        ])
    return messages

# Initialize the database: make sure the application_logs table exists before
# any session reads from or writes to it.
create_application_logs()

In [None]:
import uuid
# Start a conversation under a fresh random session id; nothing has been
# logged for it yet, so the first question is asked with whatever (normally
# empty) history the lookup returns.
session_id = str(uuid.uuid4())
chat_history = get_chat_history(session_id)
print(chat_history)
question1 = "When was GreenGrow Innovations founded?"
answer1 = rag_chain.invoke({"input": question1, "chat_history":chat_history})['answer']
# Take the model name from the llm object so the log always matches the model
# that actually produced the answer.
insert_application_logs(session_id, question1, answer1, llm.model_name)
print(f"Human: {question1}")
print(f"AI: {answer1}\n")

In [None]:
# Follow-up in the same session: the history is re-read from the database so
# it now contains the first exchange.
question2 = "Where it is headquartered?"
chat_history = get_chat_history(session_id)
print(chat_history)
answer2 = rag_chain.invoke({"input": question2, "chat_history":chat_history})['answer']
# Take the model name from the llm object so the log always matches the model
# that actually produced the answer.
insert_application_logs(session_id, question2, answer2, llm.model_name)
print(f"Human: {question2}")
print(f"AI: {answer2}\n")

#### New User

In [None]:
# A different user: a new session id means the history lookup does not see
# the exchanges logged for the previous session.
session_id = str(uuid.uuid4())
question = "What is GreenGrow"
chat_history = get_chat_history(session_id)
print(chat_history)
answer = rag_chain.invoke({"input": question, "chat_history":chat_history})['answer']
# Take the model name from the llm object so the log always matches the model
# that actually produced the answer.
insert_application_logs(session_id, question, answer, llm.model_name)
print(f"Human: {question}")
print(f"AI: {answer}\n")