In [None]:
!pip install unstructured tiktoken pinecone-client pypdf pandas openai langchain numpy python-dotenv

In [3]:
import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

  from tqdm.autonotebook import tqdm


In [14]:
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
HF_API_KEY = userdata.get('HF_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [15]:
import os
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HF_API_KEY
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

Read the document:

In [6]:
def read_doc(directory):
  file_loader = PyPDFDirectoryLoader(directory)
  documents = file_loader.load()
  return documents

In [None]:
doc = read_doc('/content/drive/MyDrive/Colab Notebooks/documents/')
doc

In [8]:
len(doc)

58

Divide document into text chunks:

In [9]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  doc = text_splitter.split_documents(docs)
  return docs

In [None]:
documents = chunk_data(doc)
documents

In [11]:
len(documents)

58

Embedding techniques of OpenAI:

In [12]:
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7cdbba5321a0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7cdbba4d6bf0>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, http_client=None)

In [13]:
vectors = embeddings.embed_query("How are you?")
len(vectors)

1536

Vector Seach DB in Pinecone:

In [16]:
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment="gcp-starter"
)

# Database name
index_name = "langchainvector"

Store docs in Pinecone:

In [None]:
# from langchain.vectorstores import Pinecone
index = Pinecone.from_documents(doc, embeddings, index_name=index_name)

Cosine similarity to retrieve results from vectorDB:

In [18]:
def retrieve_query(query, k=2):
  matching_results = index.similarity_search(query, k=k)
  return matching_results

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

In [20]:
llm = OpenAI(model_name="text-davinci-003", temperature=0.3)
chain = load_qa_chain(llm, chain_type="stuff")

In [21]:
# search answers froom VectorDB
def retrieve_answers(query):
  doc_search = retrieve_query(query)
  print(doc_search)
  response = chain.run(input_documents=doc_search, question=query)
  return response

In [22]:
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)

[Document(page_content="7 \n \n \n farmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.   \n22. Now to make India a global hub for ' Shree Anna' , the Indian Institute \nof Millet Research, Hyderabad  will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activities of \nfishermen, fish vendors, and micro & small enterprises, improve value chain \nefficiencies, and expand the market. \nCooperation \n25. For farmers, especially small and marginal farmers, and other \nmarginalised sections, the government is promoting cooperative-based \neconomic development model. A new Ministry of Coo

In [23]:
our_query = "How much the agriculture doing?"
answer2 = retrieve_answers(our_query)
print(answer2)

[Document(page_content="7 \n \n \n farmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.   \n22. Now to make India a global hub for ' Shree Anna' , the Indian Institute \nof Millet Research, Hyderabad  will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activities of \nfishermen, fish vendors, and micro & small enterprises, improve value chain \nefficiencies, and expand the market. \nCooperation \n25. For farmers, especially small and marginal farmers, and other \nmarginalised sections, the government is promoting cooperative-based \neconomic development model. A new Ministry of Coo