In [10]:
%%capture
!pip install openai==1.3.9 python-dotenv==1.0.0 PyMuPDF==1.24.2 PyMuPDFb==1.24.1 tqdm tiktoken

In [13]:
import openai
from dotenv import load_dotenv
import fitz
from tqdm import tqdm
import os
import tiktoken

# Read key/value pairs from a .env file (if one is found) into the process environment.
load_dotenv()


def _require_env(name):
  """Return the value of environment variable `name`.

  Raises:
    RuntimeError: if the variable is unset or empty, naming the missing variable.
  """
  value = os.getenv(name)
  if not value:
    raise RuntimeError(
        f"Required environment variable {name!r} is not set. "
        "Add it to your .env file or export it before running this notebook."
    )
  return value


# os.getenv returns None for an unset variable and os.environ only accepts strings,
# so validate each setting up front and fail with a message that names the missing one.
for _required_name in (
    "OPENAI_API_TYPE",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "OPENAI_API_VERSION",
):
  os.environ[_required_name] = _require_env(_required_name)

# Passed as the `model` argument of every chat-completion request.
model_name = _require_env("AZURE_OPEN_AI_MODEL")

# Tokenizer used to count tokens in the extracted documents.
token_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [14]:
def CallOpenAI(user, system):
  """Send one system message and one user message to the chat-completions API.

  Args:
    user: Content of the user message.
    system: Content of the system message.

  Returns:
    The object returned by ``openai.chat.completions.create``, unmodified.
  """
  # The system message goes first, followed by the user message.
  conversation = [
      dict(role="system", content=system),
      dict(role="user", content=user),
  ]
  request_kwargs = dict(
      model=model_name,  # module-level setting; i.e. the "deployment_name"
      temperature=0,
      top_p=0,
      messages=conversation,
  )
  return openai.chat.completions.create(**request_kwargs)

## Let's take a contract and try to analyse it without much instruction

In [19]:
def extract_text(pdf_path):
  """Extract the text of every page of a PDF and report its token count.

  Args:
    pdf_path: Path of the PDF file; passed to ``fitz.open``.

  Returns:
    str: The text of all pages, concatenated in page order.

  Side effects:
    Prints the number of tokens in the extracted text, as counted by the
    module-level ``token_encoding``.
  """
  # Open the document with a context manager so it is closed even if
  # extracting a page raises.
  with fitz.open(pdf_path) as pdf:
    # join() builds the result once instead of re-copying a growing string per page.
    text = ''.join(page.get_text() for page in pdf)

  num_tokens = len(token_encoding.encode(text))
  print("Number of tokens in the entire Document: ", num_tokens)
  return text

In [24]:
# Extract the full text of the AWS1 PDF; extract_text also prints its token count.
short_document = extract_text("/content/AWS1.pdf")

Number of tokens in the entire Document:  11590


In [28]:
# Question to ask about the short document.
Question = "What is the governing courts for Amazon Web Services South Africa ProprietaryLimited"

# Prompt layout: the whole document text, a blank line, then the question.
full_prompt_SD = "\n\n".join([short_document, Question])

In [29]:
# CallOpenAI takes the user message first and the system message second.
response = CallOpenAI(full_prompt_SD,"You are a Professional lawyer who can analyse documents thorougly")

In [30]:
# Show only the message text of the first choice in the response.
print(response.choices[0].message.content)

The governing courts for Amazon Web Services South Africa Proprietary Limited would be the South Gauteng High Court in Johannesburg, South Africa.


## Now let's load a document that has more than 16,384 tokens, which is the context limit of the GPT-3.5-Turbo model used here

In [31]:
# Extract the full text of the credit agreement PDF; extract_text also prints its token count.
long_document = extract_text("/content/PROFRAC HOLDINGS, LLC credit agreement.pdf")

Number of tokens in the entire Document:  163227


In [32]:
# Question to ask about the long document.
Question = "What does Covered Entity mean in the Document given above?"

# Prompt layout: the whole document text, a blank line, then the question.
full_prompt_LD = "\n\n".join([long_document, Question])

## As you can see below, when the message length exceeds the model's context limit, the API returns an error.
### The next lab addresses this with Retrieval Augmented Generation (RAG), which enables us to analyse documents of any length.

In [37]:
# The request is expected to be rejected because the prompt is larger than the
# model's context window. Catch only the HTTP 400 error that signals this, so
# unrelated failures (bad credentials, network problems, coding mistakes) are
# not mistaken for the behaviour being demonstrated.
try:
  response = CallOpenAI(full_prompt_LD,"You are a Professional lawyer who can analyse documents thorougly")
except openai.BadRequestError as e:
  print(str(e))

Error code: 400 - {'error': {'message': "This model's maximum context length is 16384 tokens. However, your messages resulted in 163261 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
