## Import Libraries


In [1]:
import os
import warnings

# Select the pure-Python protobuf implementation. protobuf reads this variable
# when it is first imported, so it is set before the third-party imports below
# (any of which may load protobuf transitively).
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from IPython.display import display, Markdown
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.chat_models import ChatOllama
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Suppress every warning category for the rest of the session to keep cell
# output readable. Remove this line when debugging library deprecations.
warnings.filterwarnings('ignore')


## Load PDF

In [2]:
local_path = "unstructructed.pdf"

# Check the file on disk rather than the truthiness of the path string (a
# non-empty string is always truthy), and stop here with a clear error so that
# later cells never run with `data` undefined.
if not os.path.isfile(local_path):
    raise FileNotFoundError(
        f"PDF not found: {local_path!r}. Upload a PDF file or update `local_path`."
    )

# Parse the PDF into LangChain documents; `data` feeds the text splitter.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()
print(f"PDF loaded successfully: {local_path}")

PDF loaded successfully: unstructructed.pdf


## Split text into chunks

In [3]:
# Break the loaded documents into chunks of at most 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(data)
print(f"Text split into {len(chunks)} chunks")

Text split into 7 chunks


## Create vector database

In [4]:
# Embed each chunk with the "nomic-embed-text" Ollama model and store the
# vectors in the Chroma collection named "local-rag".
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    documents=chunks,
)
print("Vector database created successfully")

Vector database created successfully


## Set up LLM and Retrieval

In [5]:
# Name of the Ollama chat model used for both query rewriting and answering.
local_model = "llama3.2"
# Chat model client shared by the multi-query retriever and the answer chain.
llm = ChatOllama(model=local_model)

In [6]:
# Prompt that asks the LLM to rephrase the user's question into alternative
# versions, one per line, to widen what the similarity search can match.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Retriever that uses `llm` with QUERY_PROMPT to produce query variants and
# looks them up through the vector store's own retriever.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

## Create chain

In [7]:
# Answer prompt: restricts the model to the retrieved context. `{context}` is
# filled from the retriever's results and `{question}` with the user's question
# when the chain is invoked.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
# Wrap the raw template string as a chat prompt usable in the chain.
prompt = ChatPromptTemplate.from_template(template)

In [8]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The documents' page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the input question is sent to the retriever (whose documents
# are flattened to plain text so the prompt does not receive the repr of
# Document objects) and passed through unchanged as `question`; the filled
# prompt goes to the LLM and the reply is reduced to a plain string.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

## Chat with PDF

In [9]:
def chat_with_pdf(question: str):
    """Ask the RAG chain a question and render the answer as Markdown.

    Args:
        question: Question text handed to ``chain.invoke``.

    Returns:
        The result of the ``display`` call used to render the answer.
    """
    answer = chain.invoke(question)
    rendered = Markdown(answer)
    return display(rendered)

In [15]:
# Example query: ask for a summary; the answer is rendered below as Markdown.
chat_with_pdf("Give me short summary of entire pdf")

The PDF document appears to be a collection of examples and explanations on various types of charts and graphs used in different fields such as economics, education, psychology, urban affairs, and everyday life. 

These charts include pie charts, bar graphs, line graphs, and tables that visualize data, such as yearly GDP by industry or family budget allocations. Each chart comes with an explanation on when to use each type of graph, how they are used, and examples from different fields.

The document also includes a discussion on the importance of using visual representations of data, citing the phrase "a picture tells a thousand words."

In [13]:
# Example query: ask for the table on a specific page.
# NOTE(review): retrieval goes through the vector store, and nothing visible in
# this notebook attaches page numbers to chunks -- verify the returned table
# against the PDF before relying on it.
chat_with_pdf("From page 6 get the tabular data")

Here is the tabular data from page 6:

| Year | All Industries | Manufacturing | Finance, Insurance, Real Estate, Rental, Leasing | Arts, Entertainment, Recreation, Accommodation, and Food Service |
| --- | --- | --- | --- | --- |
| 2010 | $26,093,515 | $49,925,221 | $45,422,451 | $9,463,032 |
| 2011 | $27,535,971 | $55,819,842 | $46,186,781 | $10,152,538 |
| 2012 | $28,662,346 | $58,416,508 | $47,973,313 | $10,873,921 |
| 2013 | $29,601,191 | $59,532,899 | $49,018,081 | $11,589,419 |

Let me know if you'd like me to help with anything else!

## Clean up (optional)

In [16]:
# Delete the Chroma collection behind `vector_db`. Presumably the vector
# database cell must be re-run before asking further questions -- confirm.
vector_db.delete_collection()
print("Vector database deleted successfully")

Vector database deleted successfully
