### Installing the Required packages

In [1]:
%%capture
!pip install transformers faiss-cpu sentence-transformers langchain pypdf

### Importing the required packages

In [2]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
import faiss
import numpy as np

### Using the all-mpnet-base-v2 or all-MiniLM-L6-v2 models the embeddings for the document text/chunks will be generated.
 all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality.

In [3]:
encoder = SentenceTransformer("all-mpnet-base-v2")
# encoder = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### This function takes in the knowledge base and creates index of the embeddings

In [4]:
def create_embedding_index(k_base):
  vectors = encoder.encode(k_base)
  print(vectors.shape)

  vector_dimension = vectors.shape[1]
  index = faiss.IndexFlatL2(vector_dimension)
  faiss.normalize_L2(vectors)
  index.add(vectors)
  return index

### The answer_question() function takes the question, the array of embeddings and the number of results wanted, then searches the array that best fits the questions.

In [10]:
def answer_question(question,index,results_len):
  """
  This function takes a question and uses RAG to answer it with Faiss for retrieval.

  Args:
      question: The user's question as a string.

  Returns:
      A dictionary containing the answer and retrieved passage.
  """
  search_vector = encoder.encode(question)
  _vector = np.array([search_vector])
  faiss.normalize_L2(_vector)

  # # # Encode the question
  # question_encoding = tokenizer(question, return_tensors="pt",truncation=True,padding=True)["input_ids"]
  # # Retrieve relevant passages using Faiss
  # question_vec = question_encoding.cpu().numpy()
  distances, retrieved_idxs = index.search(_vector, results_len)

  # Extract the answer and passage based on the retrieved index
  top_passage_idx = retrieved_idxs.ravel()[0]
  answer = knowledge_base[top_passage_idx]

  # Return the answer and retrieved passage for transparency
  return {"answer": answer, "retrieved_passage": knowledge_base[top_passage_idx]}

### Loading the contents of a PDF and converting them to chunks to form the embeddings

In [6]:
knowledge_base = []
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 30,
    length_function = len
)
file = PdfReader("/content/Adverity1.pdf") # <------Make changes in the PDF file path that you want to use
for i in range(0,len(file.pages)):
  page = file.pages[i]
  text = page.extract_text()
  knowledge_base.append(recursive_splitter.split_text(text))

embed_index = create_embedding_index(knowledge_base)

(11, 768)


### Generating the answers/chunks related to the question from the array of embeddings

In [9]:
# Example usage
question = "What is termination period of the contract?"


answer_dict = answer_question(question,embed_index,1)

print(f"Question: {question}")
for i in range(len(answer_dict['answer'])):
  print(f"Answer: \n {answer_dict['answer'][i]}")
print(f"Retrieved Passage: {answer_dict['retrieved_passage']}")

Question: What is termination period of the contract?
Answer: 
 Adverity shall not be liable for any loss of, or damage
to, data or programs to the extent that such loss or
damage would have been avoided or mitigated by
adequate preventative measures of Customer.
IX.6. Application of Direct Claims
The foregoing limitations of liability shall also apply to
any direct damage claims which Customer may have
against employees or representatives of Adverity.
IX.7. Insurance
Adverity undertakes to maintain adequate insurance
Answer: 
 cover for potential liability claims which may arise
under or in connection with this MSA.
X. TERM AND TERMINATION
X.1. Term of Agreement
The term of this MSA is governed by the Subscription
granted by the Commercial Agreement. The
Commercial Agreement commences on the Effective
Date and continues until all Subscriptions granted in
accordance with the Commercial Agreement have
expired or been terminated (“Term”).
X.2. Term of Subscriptions
Subscriptions to the A

### Minimal Example of Knowledge base

In [11]:
knowledge_base = ["The sun rises from east and sets in west",
                  "The Mount Everest is the tallest mountain in the world, and is located in Nepal",
                  "The Mount Everest is also the tallest mountain in Asia."
                  "Kindness is virtue",
                  "CEO of google is Sundar Pichai",
                  "On Sept. 15, 2017, the Cassini spacecraft made a fateful plunge into Saturn's atmosphere, ending the mission just one month shy of its 20th launch anniversary.",
                  "Sixty-six million years ago, dinosaurs had the ultimate bad day. With a devastating asteroid impact, a reign that had lasted 180 million years was abruptly ended.",
                  "AWP is a slow but a powerfull rifle and can be of advantage in a battle field"]

In [12]:
embed_index = create_embedding_index(knowledge_base)

(7, 768)


### Here we ask question about the about knowledge base and in return we get the sentence that closely aligns to the question. This is how RAG is implemented

In [20]:
# question = "Which mountain is the tallest?"
question = input("Ask you question related to the Knowledge base defined above \n")

answer_dict = answer_question(question,embed_index,2)

print(f"Question: {question}")
print(f"Answer: \n {answer_dict['answer']}")
print(f"Retrieved Passage: {answer_dict['retrieved_passage']}")

Ask you question related to the Knowledge base defined above 
Tell me a space related fun fact
Question: Tell me a space related fun fact
Answer: 
 The sun rises from east and sets in west
Retrieved Passage: The sun rises from east and sets in west
