In [15]:
import os
from dotenv import load_dotenv
load_dotenv()

# API keys for the external services used in this notebook (None when unset).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# LangSmith logging/tracing for chains, prompts, and LLM calls.
# os.environ only accepts string values, so assigning the result of os.getenv()
# directly raises TypeError when the variable is missing. Each value is therefore
# written only when it is actually present.
_langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if _langchain_api_key:
    os.environ["LANGCHAIN_API_KEY"] = _langchain_api_key
    # 'true' starts recording chain/agent/LLM activity for debugging and evaluation.
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    # Without a key there is nothing to authenticate traces with, so tracing stays off.
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is left disabled.")

# The project name groups traces in the LangSmith dashboard.
# Environment variable names are case-sensitive on Linux/macOS, so the key must be
# spelled LANGCHAIN_PROJECT (a mixed-case 'Langchain_Project' is a different variable there).
_langchain_project = os.getenv("LANGCHAIN_PROJECT")
if _langchain_project:
    os.environ["LANGCHAIN_PROJECT"] = _langchain_project

In [None]:
# Import the Pinecone hybrid retriever from LangChain community integrations
from langchain_community.retrievers import PineconeHybridSearchRetriever
# Import Pinecone client and serverless deployment specification
from pinecone import Pinecone, ServerlessSpec

# Connect to Pinecone with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Fetch the project's index listing once; the same listing drives the
# existence check below, which avoids a second round trip to the API.
existing_indexes = pc.list_indexes()

# Name of the index used for hybrid (dense + sparse) search.
index_name = "langchain-hybrid-search-with-pinecone"

# Create the index only when it is missing, so re-running this cell does not
# attempt to create an index that already exists.
if index_name not in existing_indexes.names():
    pc.create_index(
        index_name,
        dimension=1536,  # must equal the length of the dense embedding vectors
        metric="dotproduct",  # dot product is used because it supports sparse + dense hybrid scoring
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # serverless deployment: cloud provider and region
    )

In [17]:
# Dense vector embeddings (the semantic half of hybrid search).
from langchain.embeddings import OpenAIEmbeddings
# The model is pinned explicitly instead of relying on the library default:
# text-embedding-ada-002 produces 1536-dimensional vectors, which is the
# dimension the Pinecone index is created with. A silent change of the default
# model would otherwise make upserts fail with a dimension mismatch.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Import BM25Encoder from Pinecone's text module
# We are using BM25 to generate sparse vector embeddings for keyword-based search
from pinecone_text.sparse import BM25Encoder

# Build a BM25 encoder with the library's default parameters.
# `default` is a factory that returns a ready-made encoder, so it is called on
# the class itself rather than on a throwaway `BM25Encoder()` instance.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x268bfadeae0>

In [26]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Read the budget speech PDF into LangChain Document objects.
loader = PyPDFLoader("indian_budget_speech_2025.pdf")
document = loader.load()

# Splitter that breaks the loaded documents into chunks of at most 1000
# characters, with 100 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

# Resulting list of chunked Documents; these are what get embedded and indexed.
docs = text_splitter.split_documents(document)

# Preview the first few chunks.
docs[:5]

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-02-01T03:54:26+05:30', 'author': 'hss', 'moddate': '2025-02-01T03:56:01+05:30', 'title': '', 'source': 'indian_budget_speech_2025.pdf', 'total_pages': 60, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2025-2026\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2025'),
 Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-02-01T03:54:26+05:30', 'author': 'hss', 'moddate': '2025-02-01T03:56:01+05:30', 'title': '', 'source': 'indian_budget_speech_2025.pdf', 'total_pages': 60, 'page': 2, 'page_label': '3'}, page_content='CONTENTS \n \nPART – A \n Page No. \nIntroduction 1 \nBudget Theme 1 \nAgriculture as the 1st engine 3 \nMSMEs as the 2nd engine 6 \nInvestment as the 3rd engine 8 \nA. Investing in People 8 \nB. Investing in the Economy 10 \nC. Investing in Innovation 14 \nExp

BM25Encoder is fitted on a list of plain text strings, not on structured Document objects or their metadata.

docs is a list of Document objects with attributes like .page_content, .metadata.

So we need to extract just the .page_content string from each Document before passing the text to the encoder.

In [None]:
# Pull the raw text out of every chunk.
# Despite the name, each entry of `sentences` is a whole text chunk
# (the `page_content` of one Document in `docs`), not a single sentence.
sentences = list(map(lambda chunk: chunk.page_content, docs))

# Preview the first few text chunks.
sentences[:5]

['GOVERNMENT OF INDIA\nBUDGET 2025-2026\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2025',
 'CONTENTS \n \nPART – A \n Page No. \nIntroduction 1 \nBudget Theme 1 \nAgriculture as the 1st engine 3 \nMSMEs as the 2nd engine 6 \nInvestment as the 3rd engine 8 \nA. Investing in People 8 \nB. Investing in the Economy 10 \nC. Investing in Innovation 14 \nExports as the 4th engine 15 \nReforms as the Fuel 16 \nFiscal Policy 18 \n \n \nPART – B \nIndirect taxes 20 \nDirect Taxes  23 \n \nAnnexure to Part-A 29 \nAnnexure to Part-B 31',
 'Budget 2025-2026 \n \nSpeech of \nNirmala Sitharaman \nMinister of Finance \nFebruary 1, 2025 \nHon’ble Speaker,  \n I present the Budget for 2025-26. \nIntroduction \n1. This Budget continues our Government’s efforts to: \na) accelerate growth,  \nb) secure inclusive development,  \nc) invigorate private sector investments,  \nd) uplift household sentiments, and \ne) enhance spending power of India’s rising middle class.  \n2. Together, 

In [None]:
# Fit the BM25 encoder on the text chunks extracted from the PDF and persist
# the fitted parameters to 'bm25_values.json' in one step (fit hands back the
# encoder itself, so the dump can be chained onto it).
bm25_encoder.fit(sentences).dump('bm25_values.json')

# Rebuild the encoder from the saved JSON file. In later sessions this load
# step alone is enough, so the encoder does not have to be refitted each time.
bm25_encoder = BM25Encoder().load('bm25_values.json')

  0%|          | 0/134 [00:00<?, ?it/s]

In [None]:
# Handle to the Pinecone index created earlier; used for upserts and queries.
index = pc.Index(index_name)

# Hybrid retriever: dense vectors come from `embeddings`, sparse vectors from
# the fitted BM25 encoder, and both are sent to the same Pinecone index.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
    alpha=0.5,  # weighting between the dense and sparse scores; tune as needed
    top_k=4,  # number of results returned per query; tune as needed
)

# Upload the text chunks to the index so they can be searched
# with the hybrid (dense + sparse) mechanism.
retriever.add_texts(sentences)

  0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  # or HuggingFaceHub or AzureOpenAI

In [35]:
# Chat model that writes the final answer from the retrieved context.
# temperature=0 keeps sampling randomness to a minimum.
# The API key is passed explicitly, consistent with how the OpenAIEmbeddings
# client in this notebook is configured, instead of relying on the ambient
# OPENAI_API_KEY environment variable alone.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4",  # Or "gpt-3.5-turbo"
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Question-answering chain: for each query the hybrid retriever fetches the
# relevant chunks and the language model generates an answer from them.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    # Optional: also return the retrieved chunks, to see where the answer came from.
    return_source_documents=True,
)

In [37]:
# Ask a high-level question about the document.
query = "What is the speech about?"
# Calling the chain object directly (`qa_chain({...})`) is deprecated in
# LangChain; `.invoke` is the supported entry point and returns the same dict.
result = qa_chain.invoke({"query": query})

print("Answer:", result["result"])

Answer: The speech is about the Government of India's budget for the fiscal year 2025-2026. It was delivered by Nirmala Sitharaman, the Minister of Finance, on February 1, 2025. The budget aims to accelerate growth, secure inclusive development, invigorate private sector investments, uplift household sentiments, and enhance the spending power of India's rising middle class. The speech also discusses the government's efforts towards digitalization, resolving income tax disputes, promoting investment and employment, and providing tax certainty for electronics manufacturing schemes. The budget's development measures focus on ten areas including agricultural growth, rural prosperity, and inclusive growth.


In [38]:
# Ask which sectors the budget covers.
query = "What sectors have been announced for the budget?"
# Calling the chain object directly (`qa_chain({...})`) is deprecated in
# LangChain; `.invoke` is the supported entry point and returns the same dict.
result = qa_chain.invoke({"query": query})

print("Answer:", result["result"])

Answer: The sectors announced for the budget include rural piped water supply schemes, urban sector reforms, tourism, medical tourism, research, development and innovation, indirect taxes, pension sector, KYC simplification, company mergers, bilateral investment treaties, and regulatory reforms.


In [39]:
# Ask about the announced tax benefits.
# Spelling matters here: the sparse (BM25) half of hybrid search matches on
# keywords, so a misspelled "anounced" cannot match "announced" in the corpus.
query = "What tax benefits have been announced?"
# Calling the chain object directly (`qa_chain({...})`) is deprecated in
# LangChain; `.invoke` is the supported entry point and returns the same dict.
result = qa_chain.invoke({"query": query})

print("Answer:", result["result"])

Answer: The tax benefits announced include a tax rebate for taxpayers with up to ` 12 lakh of normal income, such that there is no tax payable by them. This is in addition to the benefit due to slab rate reduction. For example, a taxpayer with an income of ` 12 lakh will get a benefit of ` 80,000 in tax. A person with an income of ` 18 lakh will get a tax benefit of ` 70,000, and a person with an income of ` 25 lakh gets a benefit of ` 1,10,000. 

Additionally, the benefits of the existing tonnage tax scheme are proposed to be extended to inland vessels registered under the Indian Vessels Act, 2021. 

For start-ups, the period of incorporation is proposed to be extended by 5 years to allow the benefit available to start-ups which are incorporated before 1.4.2030. 

Specific benefits are also proposed for ship-leasing units, insurance offices and treasury centres of global companies which are set up in the International Financial Services Centre (IFSC). 

Finally, it is proposed to redu