In [70]:
# import Libraries

import openai
import langchain
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Load environment variables (presumably from a local .env file — confirm) so the
# os.getenv(...) lookups in the following cells can find the API keys and index name.
load_dotenv()

True

In [None]:
# Credentials and Pinecone settings, all read from the process environment.
# Any variable that is not set comes back as None.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")

# Echo only the (non-secret) index name so the reader can confirm the target.
print(f"Using Pinecone index: {PINECONE_INDEX_NAME}")

Using Pinecone index: ai-chatbot-2


In [78]:
# Wire up the OpenAI and Pinecone clients from the keys loaded above.

# Name of the Pinecone index the vector store will write to and query.
index_name = PINECONE_INDEX_NAME

# Module-level key for the `openai` package.
openai.api_key = OPENAI_API_KEY

# Pinecone client handle.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Note: the index object itself is not fetched here;
# PineconeVectorStore.from_documents takes care of that from the index name.

In [None]:
## OpenAI embeddings client
# If this raises RateLimitError, check the OpenAI account usage at
# https://platform.openai.com/account/usage
embeddings = OpenAIEmbeddings(
    api_key=os.environ.get('OPENAI_API_KEY'),
)
embeddings

# Free alternative: HuggingFace embeddings
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [62]:
## Read the source documents
def read_doc(directory):
    """Load documents from `directory` with PyPDFDirectoryLoader.

    Args:
        directory: Path handed to PyPDFDirectoryLoader.

    Returns:
        Whatever the loader's ``load()`` returns (the loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [63]:
# Load everything under documents/ and show how many documents came back.
doc = read_doc("documents/")
len(doc)

28

In [65]:
## Split the loaded documents into chunks
### Splitter reference:
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split `docs` into chunks with RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [66]:
# Chunk the loaded documents with the default settings and show the chunk count.
documents = chunk_data(docs=doc)
len(documents)

45

In [56]:
# vectors=embeddings.embed_query("How are you?")
# len(vectors)

In [79]:
# Build the Pinecone-backed vector store: the chunked documents (not the raw
# `doc` list) are passed together with the embeddings client and the index name.
print("index_name: ", index_name)
vector_store = PineconeVectorStore.from_documents(
    documents, embedding=embeddings, index_name=index_name
)

index_name:  ai-chatbot-2


In [80]:
## Similarity search against the vector DB
def retrieve_query(query, k=2):
    """Return the `k` results of a similarity search for `query`.

    Args:
        query: Text to search for.
        k: Number of results requested from ``vector_store``.

    Returns:
        The value of ``vector_store.similarity_search(query, k=k)``.
    """
    # NOTE(review): the distance metric (e.g. cosine) is set by the Pinecone
    # index, not here — confirm against the index configuration.
    return vector_store.similarity_search(query, k=k)

In [None]:
# LLM + question-answering chain.
# text-davinci-003 has been shut down by OpenAI, so requests to it fail;
# gpt-3.5-turbo-instruct is the completions-API replacement and is used through
# the same `OpenAI` LLM wrapper.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.5)

# chain_type="stuff": the retrieved documents are placed into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
## Answer a question from the vector DB
def retrieve_answers(query):
    """Answer `query` using the documents retrieved for it.

    The matches from ``retrieve_query(query)`` are printed, then passed to
    ``chain.run`` together with the question.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chain.run``.
    """
    matches = retrieve_query(query)
    print(matches)  # show which chunks were retrieved for this question
    return chain.run(input_documents=matches, question=query)

In [None]:
# Sample question: run it through retrieve_answers (which also prints the
# retrieved chunks) and print the answer the chain produced.
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)