## Ingesting PDF

In [None]:
%pip install --q unstructured langchain
%pip install --q "unstructured[all-docs]"

In [499]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [500]:
from pathlib import Path

# PDF to ingest; a relative path is resolved against the current working directory.
local_path = "WEF_The_Global_Cooperation_Barometer_2024.pdf"

# Fail fast with a clear message instead of printing a hint and leaving `data`
# undefined, which would only surface later as a NameError in downstream cells.
if not local_path or not Path(local_path).is_file():
    raise FileNotFoundError(
        f"PDF not found: {local_path!r}. Put the file in the working directory "
        "or update `local_path`."
    )

# Parse the local PDF into a list of LangChain Document objects.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

In [520]:
# Preview the beginning of the first loaded document. Only a slice is displayed
# so the cell output stays short even if the loader returned the whole PDF as a
# single Document.
data[0].page_content[:1000]



## Vector Embeddings

In [None]:
# Download the embedding model that OllamaEmbeddings is configured with below.
# NOTE(review): the chat model ("mistral") used in the Retrieval section is not
# pulled anywhere in this notebook — presumably it must already be available locally.
!ollama pull nomic-embed-text

In [505]:
# List the models known to the local Ollama install to confirm that
# nomic-embed-text is now present.
!ollama list

NAME                    	ID          	SIZE  	MODIFIED       
codellama:7b-code       	fc84f39375bc	3.8 GB	2 months ago  	
gemma:latest            	430ed3535049	5.2 GB	5 weeks ago   	
llama2:latest           	78e26419b446	3.8 GB	2 hours ago   	
llama2-uncensored:latest	44040b922233	3.8 GB	3 months ago  	
llava:latest            	8dd30f6b0cb1	4.7 GB	2 months ago  	
mistral:latest          	4d9f4b269c33	4.1 GB	3 months ago  	
nomic-embed-text:latest 	0a109f422b47	274 MB	17 seconds ago	


In [507]:
%pip install --q chromadb
%pip install --q langchain-text-splitters

In [508]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [509]:
# Break the loaded document(s) into overlapping chunks:
# at most 7500 characters each, with 100 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=7500,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Embed every chunk with the local nomic-embed-text model and index the
# resulting vectors in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(show_progress=True, model="nomic-embed-text")
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=embedding_model,
    documents=chunks,
)

## Retrieval

In [None]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [511]:
# Chat model served by the local Ollama instance; it is handed to both the
# multi-query retriever and the final answering chain.
local_model = "mistral"
llm = ChatOllama(
    model=local_model,
)

In [512]:
# Prompt that asks the LLM to rephrase the user's question five different ways,
# one per line; it is passed to the multi-query retriever.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [513]:
# Wrap the vector store's retriever so the LLM first rewrites each question
# using QUERY_PROMPT before the store is searched.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Answering prompt: the model is told to rely solely on the retrieved context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [516]:
# Assemble the RAG pipeline: the incoming question is sent to the retriever to
# fill {context} and passed through unchanged to fill {question}; the filled
# prompt goes to the LLM and the reply is reduced to a plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [518]:
# Ask an ad-hoc question. The prompt text tells the reader what to type, and a
# blank entry is rejected instead of being sent through retrieval and the LLM.
question = input("Ask a question about the PDF: ").strip()
if not question:
    raise ValueError("Please enter a non-empty question.")
chain.invoke(question)

 what is this about?


OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.15s/it]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 36.58it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 14.64it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 23.34it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 23.14it/s]


' This document is the Insight Report of The Global Cooperation Barometer 2024 by the World Economic Forum in collaboration with McKinsey & Company. It provides an analysis of the state of global cooperation across five pillars: trade and capital, innovation and technology, climate and natural capital, health and wellness, and peace and security. The report examines trends in cooperative actions and their outcomes to determine the overall level of global cooperation in each area. It also includes recommendations for leaders on how to reimagine global cooperation in a new era.'

In [519]:
# Fixed example question run through the same chain.
pillars_question = "What are the 5 pillars of global cooperation?"
chain.invoke(pillars_question)

OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.33s/it]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 26.36it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 36.23it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 49.43it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 63.03it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 58.14it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 59.76it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 56.69it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 48.34it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 51.85it/s]


' The 5 pillars of global cooperation are:\n\n1. Trade and capital\n2. Innovation and technology\n3. Climate and natural capital\n4. Health and wellness\n5. Peace and security.'

In [None]:
# Clean up: delete the collection behind `vector_db` (created above as "local-rag").
# NOTE(review): presumably the embedding cell must be re-run before querying again — confirm.
vector_db.delete_collection()