<a href="https://colab.research.google.com/github/hyperionhex/RAG_Model/blob/main/RAG_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Retrieval Augmentation


In [None]:
!pip install -qU \
  langchain==0.0.162 \
  openai==0.27.7 \
  tiktoken==0.4.0 \
  "pinecone-client[grpc]"==2.2.1 \
  pinecone_datasets=='0.5.0rc10'

# Building the Knowledge Base

In [None]:
import pinecone_datasets

# Load the pre-embedded dataset by name, then preview its first rows.
dataset = pinecone_datasets.load_dataset(
    'wikipedia-simple-text-embedding-ada-002-100K'
)
dataset.head()

In [None]:
# Number of records in the dataset as loaded (before trimming).
len(dataset)

100000

In [None]:
# Replace the original `metadata` column with the `blob` column (renamed to
# `metadata`). The guard makes this cell safe to re-run: once `blob` has been
# renamed, running the drop again would delete the renamed column and the
# rename would silently do nothing, losing the data.
if 'blob' in dataset.documents.columns:
    dataset.documents.drop(columns=['metadata'], inplace=True)
    dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)

# Keep only the first 30,000 rows; this is a no-op when already trimmed.
dataset.documents.drop(dataset.documents.index[30_000:], inplace=True)
len(dataset)

30000

# Vector Database

In [None]:
# Name of the Pinecone index that the rest of the notebook creates and queries.
index_name = "canopy--document-uploader"

In [None]:
import os
import pinecone

# Pinecone credentials: taken from the environment when set (and non-empty),
# otherwise the placeholder / default below is used.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or 'YOUR_API_KEY'
PINECONE_ENVIRONMENT = os.environ.get('PINECONE_ENVIRONMENT') or 'gcp-starter'

# Configure the Pinecone client with those settings.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Create the index only when it is not already listed, so re-running is harmless.
if index_name not in pinecone.list_indexes():
    # 1536 is the vector size of text-embedding-ada-002 embeddings.
    pinecone.create_index(name=index_name, dimension=1536, metric='cosine')

Next, we connect to the index.

In [None]:
import time

# Open a gRPC client handle to the index named above.
index = pinecone.GRPCIndex(index_name)
# Pause for one second before asking for stats
# (presumably to let the connection settle — TODO confirm it is still needed).
time.sleep(1)

# Show the index statistics as they stand before uploading.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.30137,
 'namespaces': {'': {'vector_count': 30137}},
 'total_vector_count': 30137}

In [None]:
# Upload the dataset's documents to the index, 100 documents per request.
for doc_batch in dataset.iter_documents(batch_size=100):
    index.upsert(doc_batch)

Now check the number of vectors in our index:

In [None]:
# Show the index statistics again, after the upload.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.30137,
 'namespaces': {'': {'vector_count': 30137}},
 'total_vector_count': 30137}

# Creating a Vector Store and Querying

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# OpenAI key: taken from the environment when set (and non-empty),
# otherwise the placeholder below is used.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or 'YOUR_API_KEY'

# Embedding model used for queries.
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

Initialize the vector store:

In [None]:
from langchain.vectorstores import Pinecone

# Key passed to the vector store as its text field
# (presumably the metadata entry holding each document's text — verify).
text_field = "text"

# LangChain is given the standard (non-gRPC) index client, so rebind `index`.
index = pinecone.Index(index_name)

# Vector store wired to the index, the query-embedding function and the text key.
vectorstore = Pinecone(index, embed.embed_query, text_field)

Run a similarity search for a sample query:

In [None]:
# Sample question, reused by the later question-answering cells.
query = " What are the key benefits of Tata AIA Life Insurance Smart Income Plus?"

# Fetch the 3 stored documents most similar to the question.
vectorstore.similarity_search(query, k=3)

[Document(page_content="as they now come with guaranteed payouts. We present to you, Tata AIA Life Insurance Smart Income Plus,  a limited pay income plan that meets tomorrow's requirements  along with protecting your loved ones and dreams as it ensures  you of guaranteed returns for the money invested. Investment  in  this  plan  helps  you  fulfill  your  medium  to  long  term goals such as Child’s Education/ Marriage/ Business  start-up and Retirement planning. Key Beneﬁts (cid:127)  Flexibility  to  choose  between  Regular  Income  or  Endowment options (cid:127)  Receive Guaranteed Payouts ranging from 120% to 160%  of  the  Annualised  Premium  under  Regular  Income  Option  based upon premium payment term chosen (cid:127)  Pay  for  7/10/12  years,  get  guaranteed  life  cover  for  15/21/25 years (cid:127)  Enhance your protection with optional Riders (cid:127)  Higher benefit for female lives  (cid:127)  Receive  tax  benefits  u/s  80C  and", metadata={'document_id': 'a5e

# Generative Q&A

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [None]:
# Run the question-answering chain on the sample question and show its answer.
qa.run(query)

'The key benefits of Tata AIA Life Insurance Smart Income Plus are:\n\n1. Flexibility to choose between Regular Income or Endowment options.\n2. Guaranteed Payouts ranging from 120% to 160% of the Annualised Premium under the Regular Income Option based upon the premium payment term chosen.\n3. Pay for 7/10/12 years and get guaranteed life cover for 15/21/25 years.\n4. Optional Riders to enhance your protection.\n5. Higher benefit for female lives.\n6. Tax benefits under section 80C.'

# Including the Sources of Information

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

# Same chat model and retriever as before, but using the chain variant that
# also returns a `sources` entry with each answer.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [None]:
# Call the chain directly; the result is a dict with question, answer and sources.
qa_with_sources(query)

{'question': ' What are the key benefits of Tata AIA Life Insurance Smart Income Plus?',
 'answer': 'The key benefits of Tata AIA Life Insurance Smart Income Plus include flexibility to choose between Regular Income or Endowment options, guaranteed payouts ranging from 120% to 160% of the Annualised Premium, guaranteed life cover for 15/21/25 years, optional Riders for enhanced protection, higher benefit for female lives, and tax benefits under section 80C. \n',
 'sources': 'example.pdf'}