In [42]:
import textract
import os
import openai
import tiktoken
import time

from openai import OpenAI



In [50]:
import os
# Shared OpenAI client used by every API call in this notebook. The key is
# read from the OPENAI_API_KEY environment variable (an empty string is
# passed when the variable is unset), so it is never hardcoded here.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY", ""),
)

In [51]:
# Extract the text of the regulations PDF using textract's "pdfminer" method
# and decode the returned bytes as UTF-8.
text = textract.process(
    "../uploads/fia_f1_power_unit_financial_regulations_issue_1_-_2022-08-16.pdf",
    method="pdfminer",
).decode("utf-8")
#text = textract.process("../uploads/uber_2021.pdf",method="pdfminer").decode("utf-8")

# Normalise the text in three passes. Net effect: existing double spaces are
# collapsed once, every newline becomes two spaces, and every semicolon
# (original or inserted by the second pass) becomes a single space.
clean_text = text.replace("  ", " ")
clean_text = clean_text.replace("\n", "; ")
clean_text = clean_text.replace(";", " ")

In [52]:
# Placeholder marker; extract_chunk substitutes the same literal '<document>'
# inside the template with the text of one chunk.
documet = '<document>'

# Extraction prompt, one entry per output line. It ends with a worked answer
# for question 0 followed by "1." and a trailing newline.
template_promt = "\n".join([
    'Extract key pieces of information from this regulation document.',
    'If a particular piece of information is not present, output "Not specified".',
    'When you extract a key piece of information, include the closest page number.',
    'Use the following format:',
    '0. Who is the author',
    '1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR',
    '2. What is the value of External Manufacturing Costs in USD',
    '3. What is the Capital Expenditure Limit in USD',
    '',
    'Document: """<document>"""',
    '',
    '0. Who is the author: Tom Anderson (Page 1)',
    '1.',
    '',
])
print(template_promt)

Extract key pieces of information from this regulation document.
If a particular piece of information is not present, output "Not specified".
When you extract a key piece of information, include the closest page number.
Use the following format:
0. Who is the author
1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR
2. What is the value of External Manufacturing Costs in USD
3. What is the Capital Expenditure Limit in USD

Document: """<document>"""

0. Who is the author: Tom Anderson (Page 1)
1.



In [53]:
def create_chunk(text, n, tokenizer):
    """Yield successive token chunks of roughly ``n`` tokens from ``text``.

    The text is tokenized once. Each chunk starts as a window of up to
    ``int(1.5 * n)`` tokens and is shrunk one token at a time, down to
    ``int(0.5 * n)`` tokens, until its decoded text ends with "." or a
    newline. If no such cut is found, the chunk is cut at ``n`` tokens
    (or at the end of the input, whichever comes first).

    Args:
        text: String to split.
        n: Target chunk size in tokens; must be at least 1.
        tokenizer: Object providing ``encode(text)`` and ``decode(tokens)``.

    Yields:
        Consecutive slices of the encoded token sequence (not decoded
        strings). Taken in order they cover every token exactly once.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    # With n < 1 the cursor below would never advance and the generator
    # would yield empty chunks forever.
    if n < 1:
        raise ValueError("n must be at least one")

    tokens = tokenizer.encode(text)
    shortest = int(0.5 * n)  # smallest window still searched for a clean cut
    longest = int(1.5 * n)   # initial (largest) window
    i = 0
    while i < len(tokens):
        j = min(i + longest, len(tokens))
        # Walk the cut point backwards looking for a sentence or line ending.
        while j > i + shortest:
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith((".", "\n")):
                break
            j -= 1

        # Reaching the lower bound means no clean cut was found: fall back
        # to a plain n-token chunk.
        if j == i + shortest:
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j


def extract_chunk(document, template_promt, max_retries=3, backoff_time=20,
                  model="gpt-3.5-turbo", prefix="1."):
    """Fill the extraction template with one chunk and query the chat model.

    Args:
        document: Text of the chunk; substituted for the literal
            '<document>' marker in the template.
        template_promt: Prompt template containing '<document>'.
        max_retries: Maximum number of attempts when the API reports a
            rate limit.
        backoff_time: Seconds to wait after the first rate-limit error;
            doubled after every further one.
        model: Chat model name sent with the request.
        prefix: String prepended to the reply. The default "1." is what
            this function has always prepended; pass "" to get the reply
            unchanged (the recorded outputs show replies that already
            carry their own numbering, giving lines such as "1.0. ...").

    Returns:
        ``prefix`` followed by the content of the first choice. A reply
        whose content is ``None`` is treated as an empty string.

    Raises:
        RuntimeError: If every attempt hit a rate limit; chained to the
            last ``openai.RateLimitError``. Other exceptions from the
            client are not caught here.
    """
    prompt = template_promt.replace('<document>', document)

    # The payload is identical for every attempt, so build it once.
    messages = [
        {"role": "system", "content": "You help extract information from documents."},
        {"role": "user", "content": prompt},
    ]

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
                max_tokens=1500,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )
        except openai.RateLimitError as e:
            last_error = e
            # No point sleeping when there is no attempt left to make.
            if attempt == max_retries:
                break
            print(f"Rate limit exceeded. Waiting for {backoff_time} seconds before retrying.")
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponential backoff
        else:
            content = response.choices[0].message.content
            return prefix + (content or "")

    # If we reach this point, all retries have failed
    raise RuntimeError("Max retries reached. Unable to complete the operation.") from last_error

In [54]:
# Tokenizer for the "cl100k_base" encoding; used both to split the document
# and to turn each token chunk back into text.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Split the cleaned document with a target of 1000 tokens per chunk, then
# decode every token chunk to a string.
chunks = create_chunk(clean_text, 1000, tokenizer)
text_chunks = list(map(tokenizer.decode, chunks))

# Run the extraction prompt over the chunks in order, echoing each answer as
# soon as it is available.
results = []
for chunk in text_chunks:
    answer = extract_chunk(chunk, template_promt)
    results.append(answer)
    print(answer)

1.0. Who is the author: Tom Anderson (Page 1)
1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR: Not specified
2. What is the value of External Manufacturing Costs in USD: Not specified
3. What is the Capital Expenditure Limit in USD: Not specified
1.1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR: 
   - USD: $95,000,000 for Full Year Reporting Periods ending on 31 December 2023, 31 December 2024, and 31 December 2025; $130,000,000 for the Full Year Reporting Period ending on 31 December 2026 and subsequent periods (Page 1)
   - GBP: Not specified
   - EUR: Not specified

2. What is the value of External Manufacturing Costs in USD: Not specified

3. What is the Capital Expenditure Limit in USD: Not specified
1.1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR: Not specified
2. What is the value of External Manufacturing Costs in USD: Not specified
3. What is the Capital Expenditure Limit in USD: Not specified
1.0. Who is 

In [55]:
# One list of lines per chunk answer.
groups = [r.split('\n') for r in results]

# Walk the answers position by position so that lines sitting at the same
# position in different answers end up next to each other. zip(*groups) would
# stop at the shortest answer and silently drop the remaining lines of every
# longer one, so the positions are indexed explicitly instead.
# Blank lines and lines containing "Not specified" or "__" are skipped.
longest_answer = max((len(group) for group in groups), default=0)
zipped = [
    group[position]
    for position in range(longest_answer)
    for group in groups
    if position < len(group)
    and group[position].strip()
    and "Not specified" not in group[position]
    and "__" not in group[position]
]
zipped

['1.0. Who is the author: Tom Anderson (Page 1)',
 '1.1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR: ',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '1.0. Who is the author: Tom Anderson (Page 1)',
 '   - USD: $95,000,000 for Full Year Reporting Periods ending on 31 December 2023, 31 December 2024, and 31 December 2025; $130,000,000 for the Full Year Reporting Period ending on 31 December 2026 and subsequent periods (Page 1)',
 '2. What is the value of External Manufacturing Costs in USD: $20,000,000 (Page 10)']

In [56]:
# Extraction prompt for the overspend-breach questions, one entry per output
# line. It ends with a worked answer for question 0 followed by "1." (no
# trailing newline).
template_prompt = "\n".join([
    'Extract key pieces of information from this regulation document.',
    'If a particular piece of information is not present, output "Not specified".',
    'When you extract a key piece of information, include the closest page number.',
    'Use the following format:',
    '0. Who is the author',
    '1. How is a Minor Overspend Breach calculated',
    '2. How is a Major Overspend Breach calculated',
    '3. Which years do these financial regulations apply to',
    '',
    'Document: """<document>"""',
    '',
    '0. Who is the author: Tom Anderson (Page 1)',
    '1.',
])
print(template_prompt)

Extract key pieces of information from this regulation document.
If a particular piece of information is not present, output "Not specified".
When you extract a key piece of information, include the closest page number.
Use the following format:
0. Who is the author
1. How is a Minor Overspend Breach calculated
2. How is a Major Overspend Breach calculated
3. Which years do these financial regulations apply to

Document: """<document>"""

0. Who is the author: Tom Anderson (Page 1)
1.


In [57]:
# Re-run the extraction over the same chunks with the overspend-breach prompt.
results = []

for chunk in text_chunks:
    results.append(extract_chunk(chunk, template_prompt))

# One list of lines per chunk answer.
groups = [r.split('\n') for r in results]

# Walk the answers position by position so that lines sitting at the same
# position in different answers end up next to each other. zip(*groups) would
# stop at the shortest answer and silently drop the remaining lines of every
# longer one, so the positions are indexed explicitly instead.
# Blank lines and lines containing "Not specified" or "__" are skipped.
longest_answer = max((len(group) for group in groups), default=0)
zipped = [
    group[position]
    for position in range(longest_answer)
    for group in groups
    if position < len(group)
    and group[position].strip()
    and "Not specified" not in group[position]
    and "__" not in group[position]
]
zipped

['1.1. How is a Minor Overspend Breach calculated: If the Cost Cap Administration determines that a Power Unit Manufacturer has committed a Minor Overspend Breach, they may enter into an accepted breach agreement (an "ABA") with the relevant Power Unit Manufacturer. (Page 18)',
 '1.1. How is a Minor Overspend Breach calculated: A "Minor Overspend Breach" arises when a Power Unit Manufacturer submits its Full Year Reporting Documentation and Relevant Costs reported therein exceed the Power Unit Cost Cap by less than 5% (Page 25)']

In [14]:
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type

# Model name sent to the embeddings endpoint by default (see get_embedding).
EMBEDDING_MODEL = 'text-embedding-3-small'
# Default token budget per embedding request: used as the truncation length in
# truncate_text_tokens and as the chunk size in len_safe_get_embedding.
# NOTE(review): presumably the model's maximum input length — confirm.
EMBEDDING_CTX_LENGTH = 8191
# Name of the tiktoken encoding used to tokenize text before embedding.
EMBEDDING_ENCODING = 'cl100k_base'


# A bare @retry() retries forever, with no delay, on any exception. Bound the
# retries, back off between attempts, and do not retry requests the API
# rejects as invalid, since repeating them cannot succeed.
@retry(
    wait=wait_random_exponential(min=1, max=20),
    stop=stop_after_attempt(6),
    retry=retry_if_not_exception_type(openai.BadRequestError),
    reraise=True,
)
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    """Return the embedding vector for one input.

    Args:
        text_or_tokens: The input passed as ``input`` to the embeddings
            endpoint (the notebook passes token sequences).
        model: Embedding model name.

    Returns:
        The ``embedding`` of the first item in the response data.

    Raises:
        openai.BadRequestError: Immediately, without retrying.
        Exception: Any other error from the client, re-raised as-is once
            six attempts with randomized exponential backoff have failed.
    """
    return client.embeddings.create(input=text_or_tokens, model=model).data[0].embedding

In [16]:
def truncate_text_tokens(text, encoding_name=EMBEDDING_ENCODING, max_tokens=EMBEDDING_CTX_LENGTH):
    """Encode ``text`` and keep only the first ``max_tokens`` tokens.

    The text is tokenized with the tiktoken encoding named by
    ``encoding_name``. The return value is the truncated token sequence,
    not a string.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return token_ids[:max_tokens]

In [18]:
# A deliberately long input: 'AGI ' repeated 5000 times.
long_text = 'AGI ' * 5000

# Disabled experiment: embed long_text as-is and print the error if the
# request is rejected.
# try:
#     get_embedding(long_text)
# except openai.BadRequestError as e:
#     print(e)

# Cut the input down to the default token budget, embed the truncated tokens,
# and display the dimensionality of the resulting vector.
truncated = truncate_text_tokens(long_text)
truncated_embedding = get_embedding(truncated)
len(truncated_embedding)

1536

In [20]:
from itertools import islice

def batched(iterable, n):
    """Yield consecutive tuples of up to ``n`` items taken from ``iterable``.

    Every tuple has exactly ``n`` items except possibly the last one, e.g.
    batched('ABCDEFG', 3) --> ABC DEF G. Raises ValueError when ``n`` is
    smaller than one (on first iteration, since this is a generator).
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        group = tuple(islice(source, n))
        if not group:
            # The source is exhausted.
            return
        yield group

In [21]:
def chunked_tokens(text, encoding_name, chunk_length):
    """Tokenize ``text`` and yield the tokens in consecutive batches.

    The text is encoded with the tiktoken encoding named ``encoding_name``;
    each yielded item is a tuple of at most ``chunk_length`` tokens, as
    produced by ``batched``.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    for token_batch in batched(token_ids, chunk_length):
        yield token_batch

In [23]:
import numpy as np

def len_safe_get_embedding(text, model=EMBEDDING_MODEL, max_tokens=EMBEDDING_CTX_LENGTH, encoding_name=EMBEDDING_ENCODING, average=True):
    """Embed ``text`` of any length by embedding it chunk by chunk.

    The text is split into token chunks of at most ``max_tokens`` tokens and
    each chunk is embedded separately with ``get_embedding``.

    Returns:
        With ``average=False``: the list of per-chunk embeddings.
        With ``average=True``: a single embedding as a list, computed as the
        average of the chunk embeddings weighted by chunk token count and
        then scaled to unit length.
    """
    per_chunk_vectors = []
    per_chunk_sizes = []
    token_chunks = chunked_tokens(text, encoding_name=encoding_name, chunk_length=max_tokens)
    for chunk in token_chunks:
        per_chunk_vectors.append(get_embedding(chunk, model=model))
        per_chunk_sizes.append(len(chunk))

    if not average:
        return per_chunk_vectors

    # Longer chunks contribute proportionally more to the combined vector.
    combined = np.average(per_chunk_vectors, axis=0, weights=per_chunk_sizes)
    unit_vector = combined / np.linalg.norm(combined)  # normalizes length to 1
    return unit_vector.tolist()

In [24]:
# Embed the same long text twice: once collapsed into a single averaged
# vector, once keeping one vector per chunk.
average_embedding_vector = len_safe_get_embedding(long_text, average=True)
chunks_embedding_vectors = len_safe_get_embedding(long_text, average=False)

# Report the size of each result, one message per line.
print(
    f"Setting average=True gives us a single {len(average_embedding_vector)}-dimensional embedding vector for our long text.",
    f"Setting average=False gives us {len(chunks_embedding_vectors)} embedding vectors, one for each of the chunks.",
    sep="\n",
)


Setting average=True gives us a single 1536-dimensional embedding vector for our long text.
Setting average=False gives us 2 embedding vectors, one for each of the chunks.


In [28]:
import ast
import pandas as pd
from scipy import spatial


# models
# NOTE: this rebinds EMBEDDING_MODEL after get_embedding and
# len_safe_get_embedding were defined. Their `model=EMBEDDING_MODEL` defaults
# were evaluated when those functions were defined, so they keep the earlier
# value unless the defining cells are run again.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Chat model used for the question-answering request that follows.
GPT_MODEL = "gpt-3.5-turbo"


In [29]:
# Ask the chat model a question directly, with a system message but no
# supporting documents, and print its answer.
query = 'Which athletes won the gold medal in curling at the 2022 Winter Olympics?'

response = client.chat.completions.create(
    model=GPT_MODEL,
    temperature=0,
    messages=[
        dict(role='system', content='You answer questions about the 2022 Winter Olympics.'),
        dict(role='user', content=query),
    ],
)

print(response.choices[0].message.content)

The Swedish men's curling team won the gold medal in the men's curling event at the 2022 Winter Olympics. The Swedish team defeated Great Britain in the final to claim the gold medal.
