In [10]:
## Install and Import
import fitz  # PyMuPDF
import os
from openai import OpenAI
import requests
import json
import tiktoken
from concurrent.futures import ThreadPoolExecutor

## Approach 02/11

1) Extract text from PDF
2) Remove elements that are unnecessary (tables or other non-essential elements)
3) Truncate (to limit size)
4) Create prompt to classify
5) Use JSON mode (classify this set of pages in these categories)
6) Create few shot prompt in JSON strings

In [11]:
## Set Client and Diretory to find documents in (local)
client = OpenAI(api_key="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")

def set_directory_for_input_document():
    # Change the working directory my local one
    target_directory = r'/Users/jd/Documents/Coding/AcquiSolar/Metadata_extraction/input/J_test'
    os.chdir(target_directory)
    print("Current working directory:", os.getcwd())

## Extract Text from PDF

In [12]:
# ## New Version: adds a function to read pages parallel to speed up the process and make it less expensive
# ## and should it make easier to read different kinds of documents

# def extract_text_from_page(doc, page_num):
#     page = doc.load_page(page_num) ## doc = pdf object
#     text = page.get_text()
#     # Apply any necessary text processing here
#     return text

# def extract_text_from_pdf_parallel(pdf_name):
#     doc = fitz.open(pdf_name)
#     full_text = ""
#     with ThreadPoolExecutor(max_workers=4) as executor:
#         # Map each page to the executor
#         results = executor.map(lambda p: extract_text_from_page(doc, p), range(len(doc)))
#         for text in results:
#             full_text += text + "\n"  # Concatenate the results with newlines
#     doc.close()
#     return full_text

In [13]:
import re
from concurrent.futures import ThreadPoolExecutor

def process_extracted_text(text):
    # Remove headers/footers
    text = re.sub(r'(?m)^(?:\d+|[A-Z]+)\s*(\r?\n)\1', '', text)
    # Handle hyphenation at the end of lines
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
    # Remove line breaks within a paragraph
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Remove page numbers
    text = re.sub(r'(?m)^\s*\d+\s*\n', '', text)
    return text

def extract_text_from_page(doc, page_num):
    page = doc.load_page(page_num)  # doc = pdf object
    text = page.get_text()
    # Apply the text processing here
    processed_text = process_extracted_text(text)
    return processed_text

def extract_text_from_pdf_parallel(pdf_name):
    doc = fitz.open(pdf_name)
    full_text = ""
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each page to the executor
        results = executor.map(lambda p: extract_text_from_page(doc, p), range(len(doc)))
        for text in results:
            full_text += text + "\n"  # Concatenate the results with newlines
    doc.close()
    return full_text

# Example of using the function:
# pdf_text = extract_text_from_pdf_parallel('path_to_your_pdf.pdf')


## Truncation

In [15]:
### Version 3
from transformers import GPT2Tokenizer  


def truncate_query_to_fit_context(query, max_length=2000):
    """
    Truncate a query using GPT-2 tokenizer to fit within a specified maximum length.
    
    Parameters:
    - query (str): The text query to be truncated.
    - max_length (int): The maximum allowed length in tokens.
    
    Returns:
    - str: Truncated query.
    """
    # Initialize GPT-4 tokenizer (using GPT-2 as a placeholder)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 
    
    # Tokenize the query and truncate if needed
    tokens = tokenizer.encode(query, return_tensors="pt")
    num_tokens = tokens.size(1)
    
    if num_tokens > max_length:
        # Truncate the tokens to the maximum length
        truncated_tokens = tokens[:, :max_length].tolist()[0]
        # Try to find the last complete sentence to avoid cutting in the middle of a sentence
        end_of_sentence_indices = [idx for idx, token_id in enumerate(truncated_tokens) if tokenizer.decode([token_id]) in '.!?']
        if end_of_sentence_indices:
            last_sentence_index = end_of_sentence_indices[-1] + 1
            truncated_tokens = truncated_tokens[:last_sentence_index]
        # Decode tokens back to text
        truncated_query = tokenizer.decode(truncated_tokens, clean_up_tokenization_spaces=True)
    else:
        truncated_query = query
    
    return truncated_query


In [27]:
def construct_query(extracted_text):
    return f"""
Extract the following fields from the document text provided and format the response as JSON:
- "Document date" in the format '3 letter month name-DD, YYYY'.
- "Document summary" limited to a maximum of 3 sentences, tailored for a solar M&A analyst. It should state what kind of document it is, but also what its implicatoins are or what state it is in. It should assume the analyst knows about the M&A process.
- "Document type", which should be either 'PPA' or 'Interconnection document' or 'email' or 'site control'.
- "Suggested title" in the format 'MM-DD-YYYY max 5 word document title (state)' the state field is optional. It can read "main" if it is said to be the main document of its type, it can read (redacted) if it is redacted.
- "Suggested title v2" in same format as "suggested title" but with different wording
- "Suggested title v3" in same format as "suggested title" but with different wording
- "Suggested folder" from the selection: "PPA", "interconnection", "uncategorized", "site control"
- "Reasoning": Give a one sentence reasoning for your suggestion of the folder selection

The provided document text is:
{extracted_text}
"""


In [28]:
def output_extracted_text_to_file(extracted_text, filename="Extracted_text.txt"):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(extracted_text)

In [32]:
def save_json_with_pdf_name(json_str, pdf_name):
    """
    Saves a JSON string to a file with the same base name as the input PDF file but with a .json extension.

    Parameters:
    - json_str (str): The JSON string to save.
    - pdf_name (str): The filename of the PDF, used to derive the JSON filename.

    Returns:
    - None
    """
    # Extract the base filename without the extension
    base_name = os.path.splitext(pdf_name)[0]
    # Construct the JSON filename
    json_filename = f"{base_name}.json"
    
    try:
        # Convert the JSON string to a Python dictionary
        data = json.loads(json_str)
        # Open the file in write mode and save the JSON
        with open(json_filename, 'w') as file:
            json.dump(data, file, indent=4)  # Pretty print the JSON
        print(f"JSON data successfully saved to {json_filename}")
    except Exception as e:
        print(f"Error saving JSON to file: {e}")

set_directory_for_input_document()                      # Set directory of where to find pdf
pdf_name = 'PPA.pdf'                                    # Define the name of the file
extracted_text = extract_text_from_pdf_parallel(pdf_name)        # turn PDF into text
output_extracted_text_to_file(extracted_text)           # View result of PDF to txt conversion
query = construct_query(extracted_text)                 # Merge question with text for prompt
truncated_query = truncate_query_to_fit_context(query)  # truncate query to fit in context length. not optimal

# API call
completion = client.chat.completions.create(
  model="gpt-3.5-turbo-0125",
  #model="gpt-3.5-turbo-0125" is the best available --- https://platform.openai.com/docs/models/gpt-3-5-turbo
  messages=[
    {"role": "system", "content": "You are a solar M&A analyst and great at extracting summaries and text from M&A documentation. Under no circumstances do you halucinate, instead you say that you leave a field blank if you cannot answer"},
    {"role": "user", "content": truncated_query}
  ]
)
output_json = completion.choices[0].message.content     # get message contents from api call

print(output_json)                                      # print results
save_json_with_pdf_name(output_json, pdf_name)          # save results to JSON file with same name as PDF

Current working directory: /Users/jd/Documents/Coding/AcquiSolar/Metadata_extraction/input/J_test


Token indices sequence length is longer than the specified maximum sequence length for this model (21409 > 1024). Running this sequence through the model will result in indexing errors


```json
{
    "Document date": "AUG-03, 2010",
    "Document summary": "This document is an application for the approval of a Power Purchase Agreement (PPA) for as-available energy with Kapaa Solar LLC by Kauai Island Utility Cooperative. The application is in the stage of requesting approval from the Hawaii Public Utilities Commission for the PPA and related energy rate adjustments.",
    "Document type": "PPA",
    "Suggested title": "08-03-2010 PPA Document (main)",
    "Suggested title v2": "08-03-2010 Power Purchase Agreement Request",
    "Suggested title v3": "08-03-2010 Energy Rate Adjustment Application",
    "Suggested folder": "PPA",
    "Reasoning": "The document primarily revolves around the approval and terms of a Power Purchase Agreement, hence falling under the 'PPA' category."
}
```
Error saving JSON to file: Expecting value: line 1 column 1 (char 0)


In [31]:
def categorize_document(text: str, client: OpenAI) -> str:
    """
    Categorize a document based on its content using GPT-3.5.

    Args:
    text (str): The processed text of the document.
    client (OpenAI): The OpenAI API client.

    Returns:
    str: The category of the document.
    """
    prompt = f"""Please categorize the following document into one of these categories:
    PPA, interconnection, uncategorized, site control.

    Document:
    {text}

    Which category does it belong to?
    """

    response = client.create_completion(
        model="gpt-3.5-turbo-0125",
        prompt=prompt,
        temperature=0.5,  # Adjust as needed for creativity vs. specificity
        max_tokens=60  # Adjust based on expected length of response
    )

    # Assuming the response directly provides the category
    category = response.choices[0].text.strip()

    # Additional logic might be needed here to validate or parse the response

    return category
