## Ingesting PDF

%pip install --q unstructured langchain
%pip install --q "unstructured[all-docs]"

#%pip install git+https://github.com/bigcat88/pillow_heif.git


%pip install tesseract

In [None]:
from langchain_community.document_loaders.pdf import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [None]:
local_path = "WEF_The_Global_Cooperation_Barometer_2024.pdf"

# Load the local PDF into `data`, the document list that the
# "Vector Embeddings" section passes to `text_splitter.split_documents(data)`.
if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    # This call used to be commented out, which left `data` undefined and made
    # the later splitting cell fail with a NameError.
    data = loader.load()
else:
    print("Upload a PDF file")

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Alternative sample input, kept for reference:
# file_path = (
#     "../../docs/integrations/document_loaders/example_data/layout-parser-paper.pdf"
# )
# Rebind `loader` to a PyPDFLoader for the same PDF path and load the file as
# a list of already-split Document objects.
loader = PyPDFLoader(local_path)
pages = loader.load_and_split()

# Echo one element as a spot check of the extracted text and metadata.
pages[10]

Document(metadata={'source': 'WEF_The_Global_Cooperation_Barometer_2024.pdf', 'page': 9}, page_content='2012-2022: Moderate growth \nin cooperation, followed by \npandemic-related shake-ups\nGlobal cooperation in the trade and capital pillar \nincreased at an average annual rate of 0.9% \nbetween 2012-2020, driven by growth in the stock \nof foreign direct investment (FDI) positions across \nmany countries. Globally, FDI stock rose from 30% \nin 2012 to 49% in 2020, as a share of GDP . \nTrade presents a more mixed story in this period. \nServices flows (as a percentage of GDP) rose \nsteadily between 2012-2019, when a collapse in \ntravel triggered a fall in services from 13.7% of GDP \nin 2019 to 11.8% in 2020. Goods flows declined \nfrom 49.3% of GDP in 2012 to 41.9% in 2020.   \nThe COVID-19 pandemic and the years that \nfollowed shook up trade and capital flows. Despite \nthis, there has been a continued net increase, \nthough it has been moderate. Trade flows fell \nin the immedi

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader
# Path of the PDF to parse, repeated so this cell can run on its own.
local_path = "WEF_The_Global_Cooperation_Barometer_2024.pdf"

# Alternative sample input:
# file_path = "./example_data/layout-parser-paper.pdf"

# Load the PDF in "elements" mode and display the first returned document.
loader = UnstructuredPDFLoader(file_path=local_path, mode="elements")
docs = loader.load()
docs[0]

CalledProcessError: Command '['tesseract', '--version']' died with <Signals.SIGABRT: 6>.

In [None]:
# Quietly (-q) install or upgrade (-U) the loader packages used above.
%pip install -qU langchain-community unstructured



## Vector Embeddings

In [None]:
# Fetch the embedding model through the Ollama CLI; the vector-database cell
# below requests it by this exact name.
!ollama pull nomic-embed-text

In [None]:
# Show the models known to the local Ollama installation.
!ollama list

In [None]:
# Packages for the vector store (chromadb) and for chunking documents
# (langchain-text-splitters).
%pip install --q chromadb
%pip install --q langchain-text-splitters

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [None]:
# Cut the documents in `data` into overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Embed every chunk with the Ollama embedding model and store the result in a
# Chroma collection
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(
        model="nomic-embed-text",
        show_progress=True,
    ),
    collection_name="local-rag",
)

## Retrieval

In [None]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# LLM from Ollama
# `local_model` is the Ollama model name handed to the chat wrapper; the same
# `llm` object is used below for query rewriting and for answering.
local_model = "mistral"
llm = ChatOllama(model=local_model)

In [None]:
# Prompt that asks the LLM for five newline-separated rephrasings of the user
# question; it is passed to MultiQueryRetriever.from_llm below.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [None]:
# Multi-query retriever: wraps the vector store's retriever and is given the
# LLM together with QUERY_PROMPT for producing question variants
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Prompt for the answering step: retrieved context plus the user's question
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
# RAG pipeline: the input question is routed both to the retriever (filling
# {context}) and straight through (filling {question}); the filled prompt is
# then sent to the LLM and the reply is parsed to a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Read a question typed by the user (empty input prompt) and run it through
# the chain.
chain.invoke(input(""))

In [None]:
# Run the chain on a fixed example question about the ingested report.
chain.invoke("What are the 5 pillars of global cooperation?")

In [None]:
# Clean up: delete the collection behind `vector_db` (created above with
# collection_name="local-rag").
vector_db.delete_collection()