In [57]:
import getpass
import os

from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_community.embeddings import GPT4AllEmbeddings, HuggingFaceEmbeddings, OllamaEmbeddings
from langchain_community.llms.ollama import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [27]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

In [3]:
DOCUMENT_PATH = "../kirsch_book.pdf"

In [18]:
llm = Ollama(model="llama3")

In [30]:
embedding_model = OllamaEmbeddings(model="llama3")

In [24]:
# model_name = "sentence-transformers/all-mpnet-base-v2"
# model_kwargs = {'device': 'cpu'}
# encode_kwargs = {'normalize_embeddings': False}
# embedding_model = HuggingFaceEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# loader = PyPDFLoader(DOCUMENT_PATH)
loader = PyMuPDFLoader(DOCUMENT_PATH)
# pages = loader.load_and_split()
data = loader.load()

2275

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(data)

In [40]:
vectorstore = Chroma.from_documents(documents=all_splits[:200], embedding=embedding_model, persist_directory="kirsch_book_vectorstore")

In [35]:
# vectorstore = Chroma(persist_directory="kirsch_book_vectorstore", embedding_function=embedding_model)

In [46]:
results = vectorstore.similarity_search("Python", k=5)

In [45]:
results[0]

Document(page_content='(assignment, while loop, if-then-else, procedure call, and return) and standard arithmetic (+, -, *,\n/, %) and comparison (==, !=, <, >, <=, >=) operators over variables and procedure calls as well\nas integer, character, and string literals. C* includes the unary * operator for dereferencing\npointers hence the name but excludes data types other than uint64_t and uint64_t* (int is\nbootstrapped to uint64_t), bitwise and Boolean operators, and many other features. The C*', metadata={'author': 'Christoph Kirsch', 'creationDate': "D:20231114172245Z00'00'", 'creator': 'Marked 2', 'file_path': '../kirsch_book.pdf', 'format': 'PDF 1.4', 'keywords': '', 'modDate': "D:20231114172245Z00'00'", 'page': 13, 'producer': 'macOS Version 14.1 (Build 23B74) Quartz PDFContext', 'source': '../kirsch_book.pdf', 'subject': '', 'title': 'Elementary Computer Science', 'total_pages': 359, 'trapped': ''})

In [48]:
retriever = vectorstore.as_retriever()

In [54]:
prompt = hub.pull("rlm/rag-prompt")

In [55]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [58]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [59]:
rag_chain.invoke("What is the name of the author of the book?")

"I don't know. The provided context does not mention the author of a book. It appears to be discussing computer programming and machine code, but it does not reference a specific book or its author."