# Ingesting PDF

In [6]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [227]:
local_path = "Materials/CSCatalog.pdf"

# Local PDF file uploads
# Fail fast when no path is configured: every later cell needs `data`, so only
# printing a hint here would defer the failure to a NameError on the preview
# line below.
if not local_path:
    raise ValueError("Upload a PDF file")

# Parse the PDF into LangChain documents.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

# Preview the extracted text of the first document.
data[0].page_content

'Northeastern University 2023-2024 Undergraduate Day Catalog\n\nComputer Science, BSCS\n\nThe Bachelor of Science in Computer Science focuses on the fundamentals of program design, software development, computer organization, systems and networks, theories of computation, principles of languages, and advanced algorithms and data.\n\nProgram Requirements Complete all courses listed below unless otherwise indicated. Also complete any corequisite labs, recitations, clinicals, or tools courses where speciﬁed and complete any additional courses needed beyond speciﬁc college and major requirements to satisfy graduation credit requirements.\n\nUniversitywide Requirements All undergraduate students are required to complete the Universitywide Requirements (p. 128).\n\nNUpath Requirements All undergraduate students are required to complete the NUpath Requirements (p. 111).\n\nComputer Science Requirements Code\n\nTitle\n\nComputer Science Overview\n\nCS 1200\n\nFirst Year Seminar\n\nCS 1210\n\nP

# Vector Embedding

In [230]:
# Shell out to the Ollama CLI: download the `nomic-embed-text` model that the
# embedding step of this notebook uses, then list the locally available models.
!ollama pull nomic-embed-text
!ollama list
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

[?25lpulling manifest ⠋ [?25h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest 
pulling 970aa74c0a90... 100% ▕████████████████▏ 274 MB                         
pulling c71d239df917... 100% ▕████████████████▏  11 KB                         
pulling ce4a164fc046... 100% ▕████████████████▏   17 B                         
pulling 31df23ea7daa... 100% ▕████████████████▏  420 B                         
verifying sha256 digest 
writing manifest 
success [?25h
NAME                       ID              SIZE      MODIFIED               
nomic-embed-text:latest    0a109f422b47    274 MB    Less than a second ago    
mistral:latest             f974a74358d6    4.1 GB    45 seconds ago            


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [232]:
# Cut the loaded documents into overlapping pieces so each one can be
# embedded on its own (sizes are in characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=7500,
)
chunks = text_splitter.split_documents(documents=data)

In [251]:
# Add to vector database
persist_directory = "./chroma_db"

# Give every chunk a stable, position-based id. Without explicit ids a fresh
# random id is generated per chunk on each run, so re-executing this cell
# against the persisted collection stores another copy of every chunk.
# With fixed ids a re-run overwrites the existing entries instead.
# NOTE(review): if a later run produces fewer chunks, the trailing entries of
# the previous run stay in the collection - delete `persist_directory` when
# switching to a different PDF.
chunk_ids = [f"local-rag-{index}" for index in range(len(chunks))]

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    ids=chunk_ids,
    collection_name="local-rag",
    persist_directory=persist_directory,
)

OllamaEmbeddings: 100%|███████████████████████████| 2/2 [00:00<00:00,  2.04it/s]


# Retrieval

In [255]:

from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [257]:
# LLM from Ollama: the chat model handed to the multi-query retriever and used
# as the answering step of the RAG chain further down in this notebook.
local_model = "mistral"
llm = ChatOllama(model=local_model)

In [259]:
# Prompt given to the LLM by the multi-query retriever. It asks for five
# rephrasings of the user's question, one per line, to widen what the
# similarity search can match.
# NOTE(review): the continuation lines of the template carry their leading
# indentation into the prompt text; presumably harmless - confirm.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [261]:
# Wrap the vector store retriever so the LLM first expands the user's
# question with QUERY_PROMPT before the store is searched.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: restricts the answer to the retrieved context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template=template)

In [263]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents; each must expose `page_content`.

    Returns:
        The `page_content` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: retrieve -> fill the prompt -> query the LLM -> plain string.
# The retriever output is piped through `_format_docs` so that `{context}`
# receives the document text itself rather than the Python repr of a list of
# Document objects (which includes metadata and escaped newlines).
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [265]:
# Read the question interactively. The input box is labelled so it is clear
# the cell is waiting for the user, and blank input is rejected up front
# instead of being sent through the retriever and the LLM.
question = input("Question: ").strip()
if not question:
    raise ValueError("Please enter a non-empty question")

# Last expression of the cell, so the answer is displayed as the cell output.
chain.invoke(question)

 what is this document about


OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 15.85it/s]
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 58.93it/s]
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 37.44it/s]
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 45.84it/s]
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 52.29it/s]
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


' This document appears to be a list of courses, labs, and their corresponding titles for various subjects such as Earth Sciences (Environmental Science), Mathematics, Physics, Computer Science, and General Electives at a University. The document also indicates the requirements and concentrations available within the Computer Science major, specifically mentioning a Concentration in Artificial Intelligence (CS 4100). It seems to be a program or course catalog for an educational institution.'