A simple rag application that takes a book (Algorithems to live by) then converts it into vector store (vector embeddings -> retriever) and uses a RAG pipeline to answer user query. 

In [1]:
# Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma # vector store
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os

from dotenv import load_dotenv

In [2]:

load_dotenv('../../.env')


loader = PyPDFLoader('~/Books/algorithms_to_live_by.pdf')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=224)

document = loader.load()
document = text_splitter.split_documents(document)

db = Chroma.from_documents(document, OpenAIEmbeddings(api_key=os.getenv('OPENAI_API_KEY')))

In [3]:
OPENAI_API_KEY =os.getenv('OPENAI_API_KEY')

In [4]:
query = 'Who are the authors of the book'
result = db.similarity_search(query=query)
result

[Document(metadata={'creationdate': '2019-07-04T07:03:28+00:00', 'author': 'Brian Christian', 'total_pages': 384, 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]', 'page': 2, 'title': 'Algorithms to Live By', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'page_label': '3', 'producer': 'calibre 3.42.0 [https://calibre-ebook.com]'}, page_content='Begin\tReading\nTable\tof\tContents\nAbout\tthe\tAuthors\nCopyright\tPage\n\t\nThank\tyou\tfor\tbuying\tthis\nHenry\tHolt\tand\tCompany\tebook.\n\t\nTo\treceive\tspecial\toffers,\tbonus\tcontent,\nand\tinfo\ton\tnew\treleases\tand\tother\tgreat\treads,\nsign\tup\tfor\tour\tnewsletters.\n\t\nOr\tvisit\tus\tonline\tat\nus.macmillan.com/newslettersignup\n\t\nFor\temail\tupdates\ton\tBrian\tChristian,\tclick\t\nhere\n.\nFor\temail\tupdates\ton\tTom\tGriffiths,\tclick\t\nhere\n.'),
 Document(metadata={'page': 351, 'page_label': '352', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'title': 'Algorithms to Live By', 'total

In [5]:
result[0]

Document(metadata={'creationdate': '2019-07-04T07:03:28+00:00', 'author': 'Brian Christian', 'total_pages': 384, 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]', 'page': 2, 'title': 'Algorithms to Live By', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'page_label': '3', 'producer': 'calibre 3.42.0 [https://calibre-ebook.com]'}, page_content='Begin\tReading\nTable\tof\tContents\nAbout\tthe\tAuthors\nCopyright\tPage\n\t\nThank\tyou\tfor\tbuying\tthis\nHenry\tHolt\tand\tCompany\tebook.\n\t\nTo\treceive\tspecial\toffers,\tbonus\tcontent,\nand\tinfo\ton\tnew\treleases\tand\tother\tgreat\treads,\nsign\tup\tfor\tour\tnewsletters.\n\t\nOr\tvisit\tus\tonline\tat\nus.macmillan.com/newslettersignup\n\t\nFor\temail\tupdates\ton\tBrian\tChristian,\tclick\t\nhere\n.\nFor\temail\tupdates\ton\tTom\tGriffiths,\tclick\t\nhere\n.')

In [6]:
# using lanceDB
from langchain_community.vectorstores import LanceDB

emb = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
lance_db = LanceDB(
    uri='./lancedb',
    embedding=emb
)
vector_store = lance_db.from_documents(document, emb)

In [7]:
resutl = vector_store.similarity_search('summerize the first chapter')
result

[Document(metadata={'creationdate': '2019-07-04T07:03:28+00:00', 'author': 'Brian Christian', 'total_pages': 384, 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]', 'page': 2, 'title': 'Algorithms to Live By', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'page_label': '3', 'producer': 'calibre 3.42.0 [https://calibre-ebook.com]'}, page_content='Begin\tReading\nTable\tof\tContents\nAbout\tthe\tAuthors\nCopyright\tPage\n\t\nThank\tyou\tfor\tbuying\tthis\nHenry\tHolt\tand\tCompany\tebook.\n\t\nTo\treceive\tspecial\toffers,\tbonus\tcontent,\nand\tinfo\ton\tnew\treleases\tand\tother\tgreat\treads,\nsign\tup\tfor\tour\tnewsletters.\n\t\nOr\tvisit\tus\tonline\tat\nus.macmillan.com/newslettersignup\n\t\nFor\temail\tupdates\ton\tBrian\tChristian,\tclick\t\nhere\n.\nFor\temail\tupdates\ton\tTom\tGriffiths,\tclick\t\nhere\n.'),
 Document(metadata={'page': 351, 'page_label': '352', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'title': 'Algorithms to Live By', 'total

## Retriever and Chain
using LLM along with prompt to query vector store - using lancedb 

In [8]:
from langchain_openai import ChatOpenAI

openai = ChatOpenAI(model='gpt-4o-mini', api_key=OPENAI_API_KEY)

In [9]:
openai

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x7269471896a0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x7268fffd2cc0>, root_client=<openai.OpenAI object at 0x72694718b830>, root_async_client=<openai.AsyncOpenAI object at 0x7268fffd26f0>, model_name='gpt-4o-mini', model_kwargs={}, openai_api_key=SecretStr('**********'), stream_usage=True)

In [10]:
# prompt
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(template="""
You are a helpful bot answering user queries
Try to get accurate results for the user query
from the provide context - from below.

<context>
{context}
<context>
                                      
question: {question}
""", )

In [11]:
# retriever
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 3}
)

In [12]:
def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

In [13]:
content = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough()
}


rag_chain = content | prompt | openai | StrOutputParser()
    

In [20]:
query = 'summerize the first chapter in detail'

In [21]:
result = rag_chain.invoke(query)
result

'The provided context does not contain specific details about the first chapter. It only mentions that "intractable" problems will be discussed in detail in chapter 8 and describes the beginnings of scheduling theory through Selmer Johnson’s contributions. It also highlights the ambition to explore the broader landscape of scheduling theory rather than just finding individual solutions.\n\nTo summarize the first chapter in detail, more specific information or content from that chapter is needed. If you have access to the text of the first chapter or can provide additional context, I would be happy to help with a detailed summary.'