In [1]:
# %pip install --quiet unstructured langchain
# %pip install --quiet "unstructured[all-docs]"

In [2]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [3]:
# Path of the PDF to index.
local_path = "budget_speech.pdf"

# Load the local PDF into LangChain documents. Fail fast when no path is set:
# printing a hint and carrying on would leave `data` undefined, and the next
# cell that reads it would stop with a NameError that hides the real cause.
if local_path:
  loader = UnstructuredPDFLoader(file_path=local_path)
  data = loader.load()
else:
  raise ValueError("Upload a PDF file: local_path is empty, so there is nothing to load")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Preview the beginning of the first loaded Document. The recorded output runs
# from the title page through the contents list into the speech itself, so
# data[0] is not a single page; show only the first 1000 characters to keep
# the cell output short.
data[0].page_content[:1000]

'GOVERNMENT OF INDIA\n\nBUDGET 2024-2025\n\nSPEECH\n\nOF NIRMALA SITHARAMAN MINISTER OF FINANCE\n\nJuly 23, 2024\n\nCONTENTS\n\nPART – A\n\nIntroduction\n\nGlobal Context\n\nInterim Budget\n\nBudget Theme\n\nBudget Priorities\n\n(i)\n\nProductivity and resilience in Agriculture\n\n(ii)\n\nEmployment & Skilling\n\n(iii)\n\nInclusive Human Resource Development and Social Justice\n\n(iv) Manufacturing & Services\n\n(v)\n\nUrban Development\n\n(vi)\n\nEnergy Security\n\n(vii)\n\nInfrastructure\n\n(viii)\n\nInnovation, Research & Development\n\n(ix) Next Generation Reforms\n\nBudget Estimates 2024-25\n\nPART – B\n\nIndirect taxes\n\nDirect Taxes\n\nAnnexure to Part-A\n\nAnnexure to Part-B\n\nPage No.\n\n1\n\n1\n\n2\n\n2\n\n2\n\n20\n\n22\n\n25\n\n31\n\n36\n\nBudget 2024-2025\n\nSpeech of\n\nNirmala Sitharaman\n\nMinister of Finance\n\nJuly 23, 2024\n\nHon’ble Speaker,\n\nI present the Budget for 2024-25.\n\nIntroduction\n\n1.\n\nThe people of India have reposed their faith in the government 

In [5]:
# "nomic-embed-text" is the model name later passed to OllamaEmbeddings;
# uncomment the pull below if it is not installed yet.
# !ollama pull nomic-embed-text
# Show the models currently available to the local Ollama installation.
!ollama list

NAME                   	ID          	SIZE  	MODIFIED      
nomic-embed-text:latest	0a109f422b47	274 MB	6 minutes ago	
llama3.1:latest        	75382d0899df	4.7 GB	3 hours ago  	


In [6]:
# Download the "mistral" model; a later cell passes this name to ChatOllama.
!ollama pull mistral

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest 
pulling ff82381e2bea...  34% ▕█████           ▏ 1.4 GB/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling ff82381e2bea...  34% ▕█████           ▏ 1.4 GB/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling ff82381e2bea...  34% ▕█████           ▏ 1.4 GB/4.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling ff82381e2bea...  34% ▕█████           

In [7]:
# List the models again to confirm "mistral" is now installed.
!ollama list

NAME                   	ID          	SIZE  	MODIFIED               
mistral:latest         	f974a74358d6	4.1 GB	Less than a second ago	
nomic-embed-text:latest	0a109f422b47	274 MB	19 minutes ago        	
llama3.1:latest        	75382d0899df	4.7 GB	3 hours ago           	


In [8]:
# %pip install --quiet chromadb

In [9]:
# %pip install --quiet langchain-text-splitters

In [10]:
from langchain_community.embeddings import OllamaEmbeddings

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [12]:
# Cut the loaded documents into overlapping chunks ready for embedding.
splitter_settings = {"chunk_size": 7500, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(data)

In [13]:
# Embed every chunk with the local Ollama embedding model and store the
# resulting vectors in a Chroma collection.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=embedding_model,
    documents=chunks,
)

OllamaEmbeddings: 100%|█████████████████████████| 13/13 [00:07<00:00,  1.75it/s]


In [14]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [15]:
# Chat model served by Ollama; `local_model` is the Ollama model name to use.
local_model = "mistral"
llm = ChatOllama(
    model=local_model,
)

In [16]:
# Instruction text that asks the LLM to rewrite the user's question five ways,
# one per line. `{question}` is the only placeholder.
_query_rewrite_template = """You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}"""

QUERY_PROMPT = PromptTemplate(
    template=_query_rewrite_template,
    input_variables=["question"],
)

In [17]:
# Retriever built from the vector store, the chat model and QUERY_PROMPT, so
# the LLM can rephrase the question before the similarity search runs.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Prompt for the answering step: the model is told to rely only on the
# supplied context when responding to the question.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

In [18]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents, each exposing `page_content`.

    Returns:
        The documents' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: retrieve documents for the question, turn them into plain text,
# fill the answering prompt, call the chat model and return its reply as a str.
# The retriever output is formatted first because interpolating the raw list
# into `{context}` would insert the Python repr of the Document objects
# (metadata, escaped newlines) rather than the document text itself.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [19]:
# Ask the RAG chain for a high-level overview of the speech.
summary_question = "Summary of the budget 2024"
chain.invoke(summary_question)

OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.45s/it]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 35.35it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 51.80it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 42.77it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 41.03it/s]


'1. The budget proposes a reduction of customs duty on various inputs for the manufacturing of shrimp and fish feed, as well as on real down filling material from duck or goose for the leather and textile sectors to enhance their competitiveness in exports.\n\n2. To rectify inversion in duty, there is a proposal to reduce BCD on methylene diphenyl diisocyanate (MDI) for manufacture of spandex yarn from 7.5 to 5 per cent.\n\n3. The export duty structure on raw hides, skins, and leather will be simplified and rationalized.\n\n4. Customs duties on gold and silver are proposed to be reduced to 6 per cent and that on platinum to 6.4 per cent to enhance domestic value addition in gold and precious metal jewellery. Ferro Nickel and blister copper are also proposed to be made duty-free for production purposes.\n\n5. To increase the value addition in the domestic electronics industry, oxygen-free copper for manufacture of resistors will be made duty-free, subject to conditions, and certain part

In [20]:
# Ask the RAG chain for the largest spending areas and their amounts.
# NOTE(review): "ruppee" is a misspelling of "rupee"; left unchanged so the
# query matches the one that produced the recorded output.
spend_question = "What are the top spend areas in the budget 2024 and mention the amount in ruppee"
chain.invoke(spend_question)

OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:01<00:00,  1.43s/it]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 22.32it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 29.63it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 39.13it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 40.27it/s]


'1. Agriculture & Allied Sector: INR 1,52,00,00,00,000 (INR 1.52 Lakh Crore)\n  2. Employment Linked Incentive: Not specified in the provided text.'