A simple rag application that takes a book (Algorithems to live by) then converts it into vector store (vector embeddings -> retriever) and uses a RAG pipeline to answer user query. 

In [26]:
# Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma # vector store
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os

from dotenv import load_dotenv


In [27]:

load_dotenv('../.env')


loader = PyPDFLoader('~/Books/algorithms_to_live_by.pdf')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=224)

document = loader.load()
document = text_splitter.split_documents(document)

db = Chroma.from_documents(document, OpenAIEmbeddings(api_key=os.getenv('OPENAI_API_KEY')))

In [28]:
OPENAI_API_KEY =os.getenv('OPENAI_API_KEY')

In [29]:
query = 'Who are the authors of the book'
result = db.similarity_search(query=query)
result

[Document(metadata={'producer': 'calibre 3.42.0 [https://calibre-ebook.com]', 'page_label': '3', 'page': 2, 'title': 'Algorithms to Live By', 'creationdate': '2019-07-04T07:03:28+00:00', 'total_pages': 384, 'author': 'Brian Christian', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]'}, page_content='Begin\tReading\nTable\tof\tContents\nAbout\tthe\tAuthors\nCopyright\tPage\n\t\nThank\tyou\tfor\tbuying\tthis\nHenry\tHolt\tand\tCompany\tebook.\n\t\nTo\treceive\tspecial\toffers,\tbonus\tcontent,\nand\tinfo\ton\tnew\treleases\tand\tother\tgreat\treads,\nsign\tup\tfor\tour\tnewsletters.\n\t\nOr\tvisit\tus\tonline\tat\nus.macmillan.com/newslettersignup\n\t\nFor\temail\tupdates\ton\tBrian\tChristian,\tclick\t\nhere\n.\nFor\temail\tupdates\ton\tTom\tGriffiths,\tclick\t\nhere\n.'),
 Document(metadata={'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'page_label': '3', 'total_pages': 384, 'page': 2, 'producer': 'calibre 3.4

In [30]:
result[0]

Document(metadata={'producer': 'calibre 3.42.0 [https://calibre-ebook.com]', 'page_label': '3', 'page': 2, 'title': 'Algorithms to Live By', 'creationdate': '2019-07-04T07:03:28+00:00', 'total_pages': 384, 'author': 'Brian Christian', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]'}, page_content='Begin\tReading\nTable\tof\tContents\nAbout\tthe\tAuthors\nCopyright\tPage\n\t\nThank\tyou\tfor\tbuying\tthis\nHenry\tHolt\tand\tCompany\tebook.\n\t\nTo\treceive\tspecial\toffers,\tbonus\tcontent,\nand\tinfo\ton\tnew\treleases\tand\tother\tgreat\treads,\nsign\tup\tfor\tour\tnewsletters.\n\t\nOr\tvisit\tus\tonline\tat\nus.macmillan.com/newslettersignup\n\t\nFor\temail\tupdates\ton\tBrian\tChristian,\tclick\t\nhere\n.\nFor\temail\tupdates\ton\tTom\tGriffiths,\tclick\t\nhere\n.')

In [31]:
# using lanceDB
from langchain_community.vectorstores import LanceDB

emb = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
lance_db = LanceDB(
    uri='./lancedb',
    embedding=emb
)
vector_store = lance_db.from_documents(document, emb)

In [32]:
resutl = vector_store.similarity_search('summerize the first chapter')
result

[Document(metadata={'producer': 'calibre 3.42.0 [https://calibre-ebook.com]', 'page_label': '3', 'page': 2, 'title': 'Algorithms to Live By', 'creationdate': '2019-07-04T07:03:28+00:00', 'total_pages': 384, 'author': 'Brian Christian', 'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'creator': 'calibre 3.42.0 [https://calibre-ebook.com]'}, page_content='Begin\tReading\nTable\tof\tContents\nAbout\tthe\tAuthors\nCopyright\tPage\n\t\nThank\tyou\tfor\tbuying\tthis\nHenry\tHolt\tand\tCompany\tebook.\n\t\nTo\treceive\tspecial\toffers,\tbonus\tcontent,\nand\tinfo\ton\tnew\treleases\tand\tother\tgreat\treads,\nsign\tup\tfor\tour\tnewsletters.\n\t\nOr\tvisit\tus\tonline\tat\nus.macmillan.com/newslettersignup\n\t\nFor\temail\tupdates\ton\tBrian\tChristian,\tclick\t\nhere\n.\nFor\temail\tupdates\ton\tTom\tGriffiths,\tclick\t\nhere\n.'),
 Document(metadata={'source': '/home/bitcot/Books/algorithms_to_live_by.pdf', 'page_label': '3', 'total_pages': 384, 'page': 2, 'producer': 'calibre 3.4

## Retriever and Chain
using LLM along with prompt to query vector store - using lancedb 

In [33]:
from langchain_openai import ChatOpenAI

openai = ChatOpenAI(model='gpt-4o-mini', api_key=OPENAI_API_KEY)

In [34]:
openai

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x76e91338f170>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x76e9a8008410>, root_client=<openai.OpenAI object at 0x76e9a81d2810>, root_async_client=<openai.AsyncOpenAI object at 0x76e91338d1c0>, model_name='gpt-4o-mini', model_kwargs={}, openai_api_key=SecretStr('**********'), stream_usage=True)

In [None]:
# prompt
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(template="""
You are a helpful bot answering user queries
Try to get accurate results for the user query
from the provide context - from below.

<context>
{context}
<context>
                                      
question: {question}
""", )

In [36]:
# retriever
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 3}
)

In [37]:
def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

In [40]:
content = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough()
}


rag_chain = content | prompt | openai | StrOutputParser()
    

In [45]:
query = 'how many chapters does the book contain'

In [46]:
result = rag_chain.invoke(query)
result

'The book contains 11 chapters.'