In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

llama_loader = PyPDFLoader("llama2.pdf").load()  # Load the PDF document
llama_loader

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pdf_documents = text_splitter.split_documents(llama_loader)  # Split the document into chunks
pdf_documents

openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")  # Initialize embeddings

# Create a new vector store for the PDF documents
vector_store_pdf = Chroma(
    collection_name="llama2_collection",  # Name of the collection
    embedding_function=openai_embeddings,  # Embedding function
    persist_directory="./chroma_pdf_db"    # Directory to persist the database
)

vector_store_pdf.add_documents(pdf_documents)  # Add the PDF documents to the vector store





  vector_store_pdf = Chroma(


['12ec8a3e-091c-4d6a-b425-d03a8f8b07c8',
 '3f802d7d-65c1-4b09-be8e-ce14a566b8da',
 '6a429522-889a-40ad-9b44-7c4dcfc01e05',
 '914e1555-f1bc-4132-8f03-3ea9cc7011ec',
 'b9a65d56-3ebb-42d6-a239-2e9c4ac80037',
 'bc965cf6-582b-48bd-a0d2-6b62ec9cefc3',
 '18d9bfe0-d645-4799-9e00-d5964d847e70',
 '0e6d5c56-f67c-448a-8883-e644b4db7834',
 'fd2e6cbb-9b75-43e8-9f7b-802f7372df30',
 '4d2585af-23b0-4665-bd2d-84021e472948',
 '08486735-b233-4d9e-b19e-505e9d668f36',
 '881c2455-7bb4-4e02-8814-18c083d104c5',
 '67542448-5a9d-4b17-b269-e9f55ea60c2c',
 '6b7da7ff-542b-4ecc-8cc6-9ef5e8da065b',
 '7c5677e9-8b85-464a-a298-c222e76ea882',
 '9c5a5e71-0627-47e3-8f41-815048e9da3f',
 'd4f12f76-fe94-4ed6-a946-8820562cd32f',
 '9feef9fa-c6d6-43ff-9056-ef9c05b8bfe1',
 '16338e16-9c80-4604-822c-f3f53eb9d742',
 'ba733a1f-afb3-4c25-b32d-518e7d92978c',
 '3e4a5b93-cde0-41c2-81cc-5085c1acaf39',
 'e4f98c24-1724-44b9-be7f-a375f3024e26',
 'fb0cb4be-b1b4-4e01-ad98-9365af9165ae',
 '0551669e-3a6b-4927-8fde-f81401c9ffb0',
 '48d52320-0373-

In [2]:
rag_retriever = vector_store_pdf.as_retriever(search_kwargs={"k": 5})  # Create a retriever

In [3]:
results = rag_retriever.invoke("what is llama model?")  # Invoke the retriever with a query
results

[Document(metadata={'source': 'llama2.pdf', 'subject': '', 'title': '', 'author': '', 'moddate': '2023-07-20T00:30:36+00:00', 'keywords': '', 'producer': 'pdfTeX-1.40.25', 'page_label': '4', 'creationdate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'creator': 'LaTeX with hyperref', 'trapped': '/False', 'total_pages': 77, 'page': 3}, page_content='1. Llama 2, an updated version ofLlama 1, trained on a new mix of publicly available data. We also\nincreased the size of the pretraining corpus by 40%, doubled the context length of the model, and\nadopted grouped-query attention (Ainslie et al., 2023). We are releasing variants ofLlama 2with\n7B, 13B, and 70B parameters. We have also trained 34B variants, which we report on in this paper\nbut are not releasing.§\n2. Llama 2-Chat, a fine-tuned version ofLlama 2that is optimized for dialogue use cases. We release\nvariants of this model with 7B, 13B

In [1]:
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# import pprint

prompt = hub.pull("rlm/rag-prompt")
# pprint.pprint(prompt.messages) 
llm = ChatOpenAI(model="gpt-4o", temperature=0)  # Initialize the language model


In [2]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    

In [3]:
# chain = rag_retriever | prompt | llm | StrOutputParser  # Create a chain with the prompt, LLM, and retriever


chain = ({"context": rag_retriever | format_docs , "question": RunnablePassthrough()}  # Define the chain structure with context and question
| prompt 
| llm 
| StrOutputParser())  # Create a chain with the prompt, LLM, and retriever




NameError: name 'rag_retriever' is not defined

In [None]:
chain.invoke("what is llama model?")  # Invoke the chain with a query