In [3]:
%%capture
!pip install transformers faiss-cpu sentence-transformers langchain==0.0.354 pypdf openai==1.3.9 python-dotenv==1.0.0 PyMuPDF==1.24.2 tqdm

In [4]:
from transformers import pipeline,BertTokenizer
from sentence_transformers import SentenceTransformer,util
from langchain.text_splitter import RecursiveCharacterTextSplitter
import fitz
import faiss
import numpy as np
import openai
from dotenv import load_dotenv
import numpy as np
import os
from tqdm import tqdm

load_dotenv()

os.environ["OPENAI_API_TYPE"] = os.getenv("OPENAI_API_TYPE")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
os.environ["OPENAI_API_VERSION"] = os.getenv("OPENAI_API_VERSION")

model_name = os.getenv("AZURE_OPEN_AI_MODEL")
embedding_name = os.getenv("LE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")

### Using the all-mpnet-base-v2 or all-MiniLM-L6-v2 models the embeddings for the document text/chunks will be generated.
 all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality.

In [5]:
encoder = SentenceTransformer("all-MiniLM-L6-v2")
# encoder = SentenceTransformer("allenai/longformer-base-4096")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### This function takes in the knowledge base and creates index of the embeddings

In [6]:
def create_embedding_index(k_base):
  vectors = encoder.encode(k_base)
  vector_dimension = vectors.shape[1]
  index = faiss.IndexFlatL2(vector_dimension)
  faiss.normalize_L2(vectors)
  index.add(vectors)

  return index

### The answer_question() function takes the question, the array of embeddings and the number of results wanted, then searches the array that best fits the questions.

In [7]:
def answer_question(question,index,results_len):
  answer = []
  """
  This function takes a question and uses RAG to answer it with Faiss for retrieval.

  Args:
      question: The user's question as a string.

  Returns:
      A dictionary containing the answer and retrieved passage.
  """
  search_vector = encoder.encode(question)
  _vector = np.array([search_vector])
  faiss.normalize_L2(_vector)

  # # # Encode the question
  # question_encoding = tokenizer(question, return_tensors="pt",truncation=True,padding=True)["input_ids"]
  # # Retrieve relevant passages using Faiss
  # question_vec = question_encoding.cpu().numpy()
  distances, retrieved_idxs = index.search(_vector, results_len)
  print(len(retrieved_idxs.ravel()))
  # Extract the answer and passage based on the retrieved index
  for i in range(len(retrieved_idxs.ravel())):
    answer.append(knowledge_base[retrieved_idxs.ravel()[i]])

  # Return the answer and retrieved passage for transparency
  return {"answer": answer}

### Loading the contents of a PDF and converting them to chunks to form the embeddings

In [8]:
def create_document_embedding(pdf):
  knowledge_base = []
  recursive_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 20,
      chunk_overlap = 5,
      length_function = len
  )
  file = fitz.open(pdf) # <------Make changes in the PDF file path that you want to use
  for page in file:
    text = page.get_text()
    text = recursive_splitter.split_text(text)
    knowledge_base.append(text)
  embed_index = create_embedding_index(knowledge_base)
  return knowledge_base, embed_index

### Generating the answers/chunks related to the question from the array of embeddings

In [30]:
def return_RAG_passage(question,embed_index):
  RAG_passage = ''
  answer_dict = answer_question(question,embed_index,10)
  for i in range(len(answer_dict['answer'])):
    RAG_passage += '<Context'+str(i)+'>'+' '.join(answer_dict['answer'][i])
    RAG_passage += '</Context'+str(i)+'>'+'\n\n'

  print(RAG_passage)
  return RAG_passage

### Now we format the prompt to include the RAG chunks along with the question

In [31]:
def CallOpenAI(user,system):
  response = openai.chat.completions.create(
              model= model_name, # model = "deployment_name".
              temperature= 0,
              top_p= 0,
              messages=[
                  {"role": "system", "content": system},
                  {"role": "user", "content": user}
              ]
          )
  return response

## First we try analysing a with less content

In [32]:
knowledge_base, embed_index = create_document_embedding("/content/AWS1.pdf")

In [33]:
question = "What is the governing courts for Amazon Web Services South Africa ProprietaryLimited"

rag_passage = return_RAG_passage(question,embed_index)

10
<Context0>Learn About AWS Resources for AWS Getting Started Training and and Certification Developers on AWS Developer Center SDKs & Tools
Help Help
Contact Us Get Expert Help “Indirect Taxes” means applicable taxes and duties, including, without limitation, VAT, VAT, service tax, tax, GST, excise taxes, sales and and transactions taxes, and gross receipts tax. “Intellectual Property License” means the separate license terms that that apply to your your access to and and use of AWS AWS Content and and Services located at https://aws.amazon. azon.com/legal/aws-i aws-ip-license-terms terms (and any successor or or related locations designated by us), us), as may be be updated by us us from time to to time. “Losses” means any any claims, damages, losses, liabilities, costs, and expenses (including reasonable attorneys’ fees). “Policies” means the Acceptable Use Use Policy, Privacy Notice, the Site Site Terms, the the Service Terms, the AWS Trademark Guidelines, all all restrictions des

In [34]:
full_prompt_SD = rag_passage +"\n\n" +question

In [35]:
response = CallOpenAI(full_prompt_SD,"You are a Professional lawyer who can analyse documents thorougly")

In [36]:
print(response.choices[0].message.content)

The governing courts for Amazon Web Services South Africa Proprietary Limited are the South Gauteng High Court, Johannesburg.


## Now we try analysing a with large content

In [37]:
knowledge_base, embed_index = create_document_embedding("/content/PROFRAC HOLDINGS, LLC credit agreement.pdf")

In [42]:
question = "What is the Acknowledgement Regarding Any Supported QFCs?"

rag_passage = return_RAG_passage(question,embed_index)

10
<Context0>7.23  FCPA
   118 7.24  Sanctioned Persons 118 7.25  Designation of Senior Debt 118 7.26  Insurance 118 7.27  FTS Assets 118 ARTICLE VIII AFFIRMATIVE AND AND NEGATIVE COVENANTS 8.1   Taxes 118 8.2   Legal Existence and Good Good Standing 119 8.3   Compliance with Law; Law; Maintenance of of Licenses 119 8.4   Maintenance of Property, Inspection 119 8.5   Insurance 120 8.6   Environmental Laws 121 8.7   Compliance with ERISA 121 8.8   Dispositions 121 8.9   Mergers, Consolidations, etc 121 8.10  Distributions 122 8.11  Investments 126 
8.12  Debt 126 8.13  Prepayments of Debt 130 8.14  Transactions with Affiliates 131 8.15  Business Conducted 134 
8.16  Liens 134 8.17  Restrictive Agreements 134 8.18  Restrictions on FTS Acquisition Transactions 136 8.19  Fiscal Year; Accounting 136 8.20  Financial Covenants 137 8.21  Information Regarding Collateral 138 8.22  Ratings 138 8.23  Additional Obligors; Covenant to Give Security 138 8.24  Use of of Proceeds 140 8.25  Further Ass

In [43]:
full_prompt_LD = rag_passage +"\n\n" +question

In [44]:
response = CallOpenAI(full_prompt_LD,"You are a Professional lawyer who can analyse documents thorougly")

In [45]:
print(response.choices[0].message.content)

The Acknowledgement Regarding Any Supported QFCs is a provision in the document that states that if any liability of an Affected Financial Institution (such as a bank) arising under any Loan Document is unsecured, it may be subject to write-down and conversion powers of the applicable Resolution Authority. The provision also states that the affected party agrees to be bound by the application of any Write-Down and Conversion Powers by the Resolution Authority and the effects of any Bail-In Action on the liability. This provision is included to address the potential impact of resolution regimes on the obligations of the parties involved in the loan agreement.
