In [1]:
from IPython.display import IFrame

In [2]:
import langchain
# Record the installed LangChain version next to the notebook outputs.
print(langchain.__version__)

0.3.13


In [3]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI clients used later — confirm.
load_dotenv()

True

### Call LLM

In [30]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from typing import List
from langchain_core.documents import Document

# Splitter shared by the loading cells: chunks of at most 512 characters
# (length measured with len) and a 32-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32, length_function=len)

# Try the pipeline on a single PDF first.
pdf_loader = PyPDFLoader("./dataset/ColPali_2407.01449v3.pdf")
documents = pdf_loader.load()
print(len(documents))

splits = text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")

20
Split the documents into 142 chunks.


In [31]:
# 1. Function to load documents from a folder
def load_documents(folder_path: str) -> List[Document]:
    """Load every supported file directly inside ``folder_path``.

    ``.pdf`` files are read with ``PyPDFLoader`` and ``.docx`` files with
    ``Docx2txtLoader``; any other entry is reported on stdout and skipped.
    Extensions are matched case-insensitively (``REPORT.PDF`` is loaded too),
    and entries are visited in sorted name order so the resulting document
    order is reproducible across runs and machines.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        The documents returned by each loader, concatenated in file-name order.
    """
    documents = []
    # os.listdir makes no ordering promise; sort so chunk order is deterministic.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

# Ingest everything under ./dataset, then chunk it with the shared splitter.
folder_path = "./dataset"
documents = load_documents(folder_path)
splits = text_splitter.split_documents(documents)

print(f"Loaded {len(documents)} documents from the folder.")
print(f"Split the documents into {len(splits)} chunks.")

Loaded 20 documents from the folder.
Split the documents into 142 chunks.


In [10]:
# embeddings = OpenAIEmbeddings()
# document_embeddings = embeddings.embed_documents([split.page_content for split in splits])
# print(f"Created embeddings for {len(document_embeddings)} document chunks using {embeddings.model} model")


Created embeddings for 90 document chunks using text-embedding-ada-002 model


In [18]:
# # from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

# embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# document_embeddings = embedding_function.embed_documents([split.page_content for split in splits])

### Create and persist Chroma vector store

In [32]:
from langchain_chroma import Chroma

# Embed the chunks with OpenAI embeddings and store them in a Chroma collection
# backed by a directory on disk.
embedding_function = OpenAIEmbeddings()
collection_name = "rag_service_collection_nb"
persist_directory = "./chroma_db"

# Deterministic ids make this cell safe to re-run: Chroma upserts by id, so a chunk
# is overwritten in place instead of being stored again under a fresh random id
# (which would make duplicate chunks show up in similarity search results).
chunk_ids = [
    f"{split.metadata.get('source', 'unknown')}#{index}"
    for index, split in enumerate(splits)
]

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    ids=chunk_ids,
    collection_name=collection_name,
    persist_directory=persist_directory,
)

print(f"Vector store created and persisted to '{persist_directory}'")

Vector store created and persisted to './chroma_db'


In [28]:
# Sample question reused by the retrieval and RAG cells below.
query = "How to understand documents visually?"

In [34]:
# Retrieve the chunks most similar to the query and show where each one came from.
top_k = 5
search_results = vectorstore.similarity_search(query, k=top_k)

# The heading reports the number of results actually returned, so it always agrees
# with the list printed below (it can be smaller than top_k for a tiny collection).
print(f"\nTop {len(search_results)} most relevant chunks for the query: '{query}'\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()


Top 2 most relevant chunks for the query: 'How to understand documents visually?'

Result 1:
Source: ./dataset/ColPali_2407.01449v3.pdf
Content: Visually Rich Document Understanding. To
go beyond text, some document-focused models
jointly encode text tokens alongside visual or docu-
ment layout features (Appalaraju et al., 2021; Kim
et al., 2021; Huang et al., 2022; Tang et al., 2022).

Result 2:
Source: ./dataset/ColPali_2407.01449v3.pdf
Content: manuel.faysse@centralesupelec.fr
Abstract
Documents are visually rich structures that con-
vey information through text, as well as tables,
figures, page layouts, or fonts. While mod-
ern document retrieval systems exhibit strong

Result 3:
Source: ./dataset/ColPali_2407.01449v3.pdf
Content: consider the context and visual elements of the doc-
uments to be retrieved. To this end, we create and
openly release ViDoRe, a comprehensive bench-
mark to evaluate systems on page-level document
retrieval with a wide coverage of domains, visual

Resul

In [36]:
# Wrap the vector store as a retriever configured with k=5 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# Bare last expression: the retrieved Document list is shown as the cell output.
retriever.invoke(query)

[Document(metadata={'page': 2, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='Visually Rich Document Understanding. To\ngo beyond text, some document-focused models\njointly encode text tokens alongside visual or docu-\nment layout features (Appalaraju et al., 2021; Kim\net al., 2021; Huang et al., 2022; Tang et al., 2022).'),
 Document(metadata={'page': 0, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='manuel.faysse@centralesupelec.fr\nAbstract\nDocuments are visually rich structures that con-\nvey information through text, as well as tables,\nfigures, page layouts, or fonts. While mod-\nern document retrieval systems exhibit strong'),
 Document(metadata={'page': 1, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='consider the context and visual elements of the doc-\numents to be retrieved. To this end, we create and\nopenly release ViDoRe, a comprehensive bench-\nmark to evaluate systems on page-level document\nretrieval with a wide cover

In [37]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt that tells the model to answer only from the supplied context.
# {context} and {question} are the two variables the chain must provide.
template = """Answer the question based only on the following context:
{context}

Question: {question}

Answer: """
prompt = ChatPromptTemplate.from_template(template)

In [38]:
from langchain.schema.runnable import RunnablePassthrough
# First draft of the chain: the retriever output is placed into {context} as-is and
# the raw input string is passed through as {question}. No model is called yet; the
# cell output is the filled prompt, which still contains the Document reprs.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | prompt
)
rag_chain.invoke(query)

ChatPromptValue(messages=[HumanMessage(content="Answer the question based only on the following context:\n[Document(metadata={'page': 2, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='Visually Rich Document Understanding. To\\ngo beyond text, some document-focused models\\njointly encode text tokens alongside visual or docu-\\nment layout features (Appalaraju et al., 2021; Kim\\net al., 2021; Huang et al., 2022; Tang et al., 2022).'), Document(metadata={'page': 0, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='manuel.faysse@centralesupelec.fr\\nAbstract\\nDocuments are visually rich structures that con-\\nvey information through text, as well as tables,\\nfigures, page layouts, or fonts. While mod-\\nern document retrieval systems exhibit strong'), Document(metadata={'page': 1, 'source': './dataset/ColPali_2407.01449v3.pdf'}, page_content='consider the context and visual elements of the doc-\\numents to be retrieved. To this end, we create and\\nopenly r

In [39]:
def docs2str(docs):
    """Merge retrieved documents into a single context string.

    Takes each document's ``page_content`` in order and joins the pieces with
    one blank line between consecutive documents. An empty input yields "".
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [40]:
# Same chain, but the retrieved documents are piped through docs2str first, so
# {context} receives plain text instead of Document reprs.
rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()} | prompt
)
rag_chain.invoke(query)

ChatPromptValue(messages=[HumanMessage(content='Answer the question based only on the following context:\nVisually Rich Document Understanding. To\ngo beyond text, some document-focused models\njointly encode text tokens alongside visual or docu-\nment layout features (Appalaraju et al., 2021; Kim\net al., 2021; Huang et al., 2022; Tang et al., 2022).\n\nmanuel.faysse@centralesupelec.fr\nAbstract\nDocuments are visually rich structures that con-\nvey information through text, as well as tables,\nfigures, page layouts, or fonts. While mod-\nern document retrieval systems exhibit strong\n\nconsider the context and visual elements of the doc-\numents to be retrieved. To this end, we create and\nopenly release ViDoRe, a comprehensive bench-\nmark to evaluate systems on page-level document\nretrieval with a wide coverage of domains, visual\n\nrely on visual elements to more efficiently convey\ninformation to human readers, text-only systems\nbarely tap into these visual cues.\nTo our knowle

In [42]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# Chat model used by the answering chain and by the question-reformulation chain.
llm = ChatOpenAI(model="gpt-4o-mini")

In [43]:
# Full RAG pipeline: retrieve -> format context -> fill prompt -> call model -> plain string.
rag_chain = {
    "context": retriever | docs2str,
    "question": RunnablePassthrough(),
} | prompt | llm | StrOutputParser()

# `question` and `response` are reused to seed the chat history further down.
question = query
response = rag_chain.invoke(question)
print(response)

To understand documents visually, one should consider the various visual elements that contribute to the overall information conveyed, such as tables, figures, page layouts, and font styles. Document-focused models can aid in this process by jointly encoding text tokens alongside these visual features. By leveraging these visual cues, one can gain a more comprehensive understanding of the document's content, as traditional text-only systems often overlook the significance of these visual components.


### Conversational RAG

#### Handling Follow-Up Questions

In [None]:
# Example conversation
from langchain_core.messages import HumanMessage, AIMessage
# Seed the history with the question/answer pair produced by the RAG chain.
chat_history = [
    HumanMessage(content=question),
    AIMessage(content=response),
]

In [None]:
chat_history

[HumanMessage(content='When was GreenGrow Innovations founded?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='GreenGrow Innovations was founded in 2010.', additional_kwargs={}, response_metadata={})]

In [None]:
from langchain_core.prompts import MessagesPlaceholder
# System instruction for the reformulation step: rewrite a follow-up so it can be
# understood without the chat history, and never answer it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: the system instruction, the prior turns injected through the
# "chat_history" placeholder, then the latest user message as {input}.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# history_aware_retriever = create_history_aware_retriever(
#     llm, retriever, contextualize_q_prompt
# )
# Run only the reformulation step so the standalone question can be inspected;
# the bare last expression shows the rewritten question as the cell output.
contextualize_chain = contextualize_q_prompt | llm | StrOutputParser()
contextualize_chain.invoke({"input": "Where it is headquartered?", "chat_history": chat_history})

'Where is GreenGrow Innovations headquartered?'

In [None]:
from langchain.chains import create_history_aware_retriever
# Combine the reformulation prompt, the model and the base retriever into one
# retriever that takes {"input", "chat_history"} and returns documents.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)
history_aware_retriever.invoke({"input": "Where it is headquartered?", "chat_history": chat_history})

[Document(metadata={'source': '/content/docs/GreenGrow Innovations_ Company History.docx'}, page_content="The company's breakthrough came in 2018 with the introduction of the EcoHarvest System, an integrated solution that combined smart irrigation, soil monitoring, and automated harvesting techniques. This system caught the attention of large-scale farmers across the United States, propelling GreenGrow to national prominence.\n\n\n\nToday, GreenGrow Innovations employs over 200 people and has expanded its operations to include offices in California and Iowa. The company continues to focus on developing sustainable agricultural technologies, with ongoing projects in vertical farming, drought-resistant crop development, and AI-powered farm management systems.\n\n\n\nDespite its growth, GreenGrow remains committed to its original mission of promoting sustainable farming practices. The company regularly partners with universities and research institutions to advance the field of agricultur

In [None]:
# Baseline for comparison: the plain retriever gets the follow-up without any chat history.
retriever.invoke("Where it is headquartered?")

[Document(metadata={'source': '/content/docs/Company_ QuantumNext Systems.docx'}, page_content='Company: QuantumNext Systems\n\nHeadquarters: QuantumNext Systems is headquartered in Bangalore, Karnataka, India. The company, specializing in quantum computing and advanced data processing, is situated in the bustling tech metropolis of Bangalore, often referred to as the "Silicon Valley of India." From this technology capital, QuantumNext Systems is well-positioned to tap into India\'s rich pool of engineering talent and growing tech ecosystem, enabling it to push the boundaries of computational innovation.'),
 Document(metadata={'source': '/content/docs/Company_ TechWave Innovations.docx'}, page_content='Company: TechWave Innovations\n\nHeadquarters: TechWave Innovations is headquartered in San Francisco, California, USA. As a leader in cutting-edge AI and machine learning solutions, the company thrives in the heart of Silicon Valley, benefiting from its proximity to tech giants and a dy

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Answering prompt: system instructions, the retrieved documents as {context},
# the prior turns, and finally the user's current message as {input}.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
    #  ("system", "Tell me joke on Programming"),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

# Chain that places the retrieved documents into qa_prompt and calls the model.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# NOTE: this rebinds `rag_chain`. From here on it is invoked with a dict holding
# "input" and "chat_history", and the cells below read the 'answer' key of its result.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
# Ask the follow-up through the conversational chain; the returned dict is the cell output.
rag_chain.invoke({"input": "Where it is headquartered?", "chat_history":chat_history})

{'input': 'Where it is headquartered?',
 'chat_history': [HumanMessage(content='When was GreenGrow Innovations founded?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='GreenGrow Innovations was founded in 2010.', additional_kwargs={}, response_metadata={})],
 'context': [Document(metadata={'source': '/content/docs/GreenGrow Innovations_ Company History.docx'}, page_content="The company's breakthrough came in 2018 with the introduction of the EcoHarvest System, an integrated solution that combined smart irrigation, soil monitoring, and automated harvesting techniques. This system caught the attention of large-scale farmers across the United States, propelling GreenGrow to national prominence.\n\n\n\nToday, GreenGrow Innovations employs over 200 people and has expanded its operations to include offices in California and Iowa. The company continues to focus on developing sustainable agricultural technologies, with ongoing projects in vertical farming, drought-resistant

### Building a Multi-User Chatbot

In [None]:
import sqlite3
from datetime import datetime

# SQLite file that stores the per-session chat logs.
DB_NAME = "rag_app.db"


def get_db_connection():
    """Open a new connection to ``DB_NAME`` whose rows support access by column name.

    The caller owns the returned connection and is responsible for closing it.
    """
    connection = sqlite3.connect(DB_NAME)
    # sqlite3.Row lets callers write row['column'] instead of positional indexes.
    connection.row_factory = sqlite3.Row
    return connection

def create_application_logs():
    """Create the ``application_logs`` table if it does not already exist.

    Each row stores one exchange: the session it belongs to, the user's query,
    the model's response, a model label, and an insertion timestamp that
    defaults to ``CURRENT_TIMESTAMP``.
    """
    conn = get_db_connection()
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
                    (id INTEGER PRIMARY KEY AUTOINCREMENT,
                     session_id TEXT,
                     user_query TEXT,
                     gpt_response TEXT,
                     model TEXT,
                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
        conn.commit()
    finally:
        # Release the connection even when the statement fails.
        conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    """Append one question/answer exchange to ``application_logs``.

    Args:
        session_id: Identifier of the conversation the exchange belongs to.
        user_query: The user's message.
        gpt_response: The model's reply to that message.
        model: Label of the model that produced the reply.
    """
    conn = get_db_connection()
    try:
        conn.execute(
            'INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
            (session_id, user_query, gpt_response, model),
        )
        conn.commit()
    finally:
        # Release the connection even when the insert or commit fails.
        conn.close()

def get_chat_history(session_id):
    """Return the stored conversation for ``session_id``, oldest exchange first.

    Args:
        session_id: Identifier of the conversation to load.

    Returns:
        A flat list of ``{"role": ..., "content": ...}`` dicts: for every logged
        exchange a ``"human"`` entry followed by the matching ``"ai"`` entry.
        An unknown session yields an empty list.
    """
    conn = get_db_connection()
    try:
        # created_at comes from CURRENT_TIMESTAMP, which only has one-second
        # resolution; the autoincrement id breaks ties so that exchanges logged
        # within the same second still come back in insertion order.
        rows = conn.execute(
            'SELECT user_query, gpt_response FROM application_logs '
            'WHERE session_id = ? ORDER BY created_at, id',
            (session_id,),
        ).fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    messages = []
    for row in rows:
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']}
        ])
    return messages

# Initialize the database: ensure the application_logs table exists before any cell logs to it.
create_application_logs()

In [None]:
import uuid
# Start a fresh session under a new random id and load whatever history is stored for it.
session_id = str(uuid.uuid4())
chat_history = get_chat_history(session_id)
print(chat_history)
question1 = "When was GreenGrow Innovations founded?"
answer1 = rag_chain.invoke({"input": question1, "chat_history":chat_history})['answer']
# Log the model that actually produced the answer instead of a hand-typed label
# ("gpt-4-o-mini" is not the name the chain's model is configured with).
insert_application_logs(session_id, question1, answer1, llm.model_name)
print(f"Human: {question1}")
print(f"AI: {answer1}\n")

[]
Human: When was GreenGrow Innovations founded?
AI: GreenGrow Innovations was founded in 2010.



In [None]:
# Follow-up in the same session: reload the stored history so the chain can resolve "it".
question2 = "Where it is headquartered?"
chat_history = get_chat_history(session_id)
print(chat_history)
answer2 = rag_chain.invoke({"input": question2, "chat_history":chat_history})['answer']
# Log the model the chain really uses; `llm` is configured as gpt-4o-mini, not gpt-3.5-turbo.
insert_application_logs(session_id, question2, answer2, llm.model_name)
print(f"Human: {question2}")
print(f"AI: {answer2}\n")

[{'role': 'human', 'content': 'When was GreenGrow Innovations founded?'}, {'role': 'ai', 'content': 'GreenGrow Innovations was founded in 2010.'}]
Human: Where it is headquartered?
AI: GreenGrow Innovations is headquartered in Portland, Oregon. Additionally, the company has expanded its operations to include offices in California and Iowa.



New User

In [None]:
# A different user gets a new session id, so the history loaded for it is separate
# from the earlier conversation.
session_id = str(uuid.uuid4())
question = "What is GreenGrow"
chat_history = get_chat_history(session_id)
print(chat_history)
answer = rag_chain.invoke({"input": question, "chat_history":chat_history})['answer']
# Log the model the chain really uses; `llm` is configured as gpt-4o-mini, not gpt-3.5-turbo.
insert_application_logs(session_id, question, answer, llm.model_name)
print(f"Human: {question}")
print(f"AI: {answer}\n")