In [22]:
# import Libraries
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

In [23]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
import os

In [24]:
## Lets Read the document
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [25]:
doc=read_doc('documents/')
len(doc)

58

In [26]:
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return doc  # Return the chunked documents

In [27]:
# Chunk the documents
chunked_doc = chunk_data(docs=doc)
len(chunked_doc)

141

In [28]:
chunked_doc

[Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'),
 Document(metadata={'source': 'documents\\budget_speech.pdf', 'page': 2}, page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0

In [29]:
## Embedding Technique Of OPENAI
embeddings=OpenAIEmbeddings(api_key=os.environ.get("OPEN_API_KEY"))
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001B8B4AB0AD0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001B8B4AD0BD0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [30]:
vectors=embeddings.embed_query("How are you?")
print(vectors)
len(vectors)

[-0.016785908490419388, -0.012151270173490047, 0.006627965718507767, -0.026018159464001656, -0.016168780624866486, 0.01762520521879196, -0.011114493943750858, -0.009923434816300869, -0.018131250515580177, -0.010417137295007706, 0.02786954678595066, 0.001650820137001574, -0.007337663788348436, -0.011651395820081234, 0.007238923106342554, -0.015391197986900806, 0.028363250195980072, -0.011830363422632217, 0.013959458097815514, -0.020599765703082085, 0.002528686076402664, 0.006344086490571499, 0.0009997490560635924, -0.008263357914984226, -0.015884900465607643, -0.007794339209794998, 0.02511715143918991, -0.012404292821884155, 0.02230304293334484, -0.025154178962111473, 0.005609702784568071, 0.007695598993450403, -0.01316953357309103, 0.0040144240483641624, 0.008757060393691063, -0.022290699183940887, 0.004020595457404852, -0.010435651987791061, 0.020328229293227196, -0.006337915081530809, 0.027030250057578087, 0.0012558575253933668, -0.005239425227046013, -0.014218652620911598, -0.041520

1536

In [16]:
os.environ['PINECONE_API_KEY'] = os.environ.get("PINECONE_API_KEY")
index_name = "langchainvector"

vectorstore_from_docs = PineconeVectorStore.from_documents(
        chunked_doc,
        index_name=index_name,
        embedding=embeddings
)

In [17]:
## Cosine Similarity Retreive Results from VectorDB
def retrieve_query(query,k=2):
    matching_results=vectorstore_from_docs.similarity_search(query,k=k)
    return matching_results

In [20]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI

In [31]:
llm=OpenAI(openai_api_key = os.environ.get("OPEN_API_KEY"),temperature=0.5)
chain=load_qa_chain(llm,chain_type="stuff")

In [32]:
## Search answers from VectorDB
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [33]:
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)

[Document(id='4f4818d7-fee5-4125-869b-68450aa4bab4', metadata={'page': 10.0, 'source': 'documents\\budget_speech.pdf'}, page_content="7 \n \n \n farmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.   \n22. Now to make India a global hub for ' Shree Anna' , the Indian Institute \nof Millet Research, Hyderabad  will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activities of \nfishermen, fish vendors, and micro & small enterprises, improve value chain \nefficiencies, and expand the market. \nCooperation"), Document(id='1410c44b-9dba-49ff-a953-20e6cd5fc31e', metadata={'page': 13.0, '

  response=chain.run(input_documents=doc_search,question=query)


 The agriculture credit target will be increased to ` 20 lakh crore.
