In [1]:
## Install and Import
import fitz  # PyMuPDF
import os
from openai import OpenAI
import requests
import json
import tiktoken
from concurrent.futures import ThreadPoolExecutor

In [2]:
## Set Client and Diretory to find documents in (local)
client = OpenAI(api_key="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")

def set_directory_for_input_document():
    # Change the working directory my local one
    target_directory = r'/Users/jd/Documents/Coding/AcquiSolar/Metadata_extraction/input/J_test'
    os.chdir(target_directory)
    print("Current working directory:", os.getcwd())

## Extract Text from PDF

In [3]:
import re
from concurrent.futures import ThreadPoolExecutor

## How to process the text
def process_extracted_text(text):
    # Remove headers/footers
    text = re.sub(r'(?m)^(?:\d+|[A-Z]+)\s*(\r?\n)\1', '', text)
    # Handle hyphenation at the end of lines
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
    # Remove line breaks within a paragraph
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Remove page numbers
    text = re.sub(r'(?m)^\s*\d+\s*\n', '', text)
    return text

## How to extract the text
def extract_text_from_page(doc, page_num):
    page = doc.load_page(page_num)  # doc = pdf object
    text = page.get_text()
    # Apply the text processing here
    processed_text = process_extracted_text(text)
    return processed_text

## How to handle different pages
def extract_text_from_pdf_parallel(pdf_name):
    doc = fitz.open(pdf_name)
    full_text = ""
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each page to the executor
        results = executor.map(lambda p: extract_text_from_page(doc, p), range(len(doc)))
        for text in results:
            full_text += text + "\n"  # Concatenate the results with newlines
    doc.close()
    return full_text


## Truncation

In [4]:
# ## New Version



# from transformers import GPT2Tokenizer  # Placeholder for GPT-4 tokenizer

# def truncate_query_to_fit_context(query, max_length=2000):
#     """
#     Truncate a query using GPT-4 tokenizer to fit within a specified maximum length.
    
#     Parameters:
#     - query (str): The text query to be truncated.
#     - max_length (int): The maximum allowed length in tokens.
    
#     Returns:
#     - str: Truncated query.
#     """
#     # Initialize GPT-4 tokenizer (using GPT-2 as a placeholder)
#     # For actual implementation, replace 'gpt2' with the appropriate GPT-4 identifier
#     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # This should be replaced with GPT-4's tokenizer
    
#     # Tokenize the query
#     tokens = tokenizer.encode(query, return_tensors="pt")
    
#     # Ensure the token array does not exceed max_length
#     if tokens.size(1) > max_length:
#         # Truncate the tokens to the maximum length
#         truncated_tokens = tokens[:, :max_length]
#         # Decode tokens back to text
#         truncated_query = tokenizer.decode(truncated_tokens[0], clean_up_tokenization_spaces=True)
#     else:
#         truncated_query = query

#     return truncated_query


In [5]:
### Version 3
from transformers import GPT2Tokenizer  


def truncate_query_to_fit_context(query, max_length=2000):
    """
    Truncate a query using GPT-2 tokenizer to fit within a specified maximum length.
    
    Parameters:
    - query (str): The text query to be truncated.
    - max_length (int): The maximum allowed length in tokens.
    
    Returns:
    - str: Truncated query.
    """
    # Initialize GPT-4 tokenizer (using GPT-2 as a placeholder)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 
    
    # Tokenize the query and truncate if needed
    tokens = tokenizer.encode(query, return_tensors="pt")
    num_tokens = tokens.size(1)
    
    if num_tokens > max_length:
        # Truncate the tokens to the maximum length
        truncated_tokens = tokens[:, :max_length].tolist()[0]
        # Try to find the last complete sentence to avoid cutting in the middle of a sentence
        end_of_sentence_indices = [idx for idx, token_id in enumerate(truncated_tokens) if tokenizer.decode([token_id]) in '.!?']
        if end_of_sentence_indices:
            last_sentence_index = end_of_sentence_indices[-1] + 1
            truncated_tokens = truncated_tokens[:last_sentence_index]
        # Decode tokens back to text
        truncated_query = tokenizer.decode(truncated_tokens, clean_up_tokenization_spaces=True)
    else:
        truncated_query = query
    
    return truncated_query


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# def construct_query(extracted_text):
#     return f"""
# Extract the following fields from the document text provided and format the response as JSON:
# - "Document date" in the format '3 letter month name-DD, YYYY'.
# - "Document summary" limited to a maximum of 3 sentences, tailored for a solar M&A analyst. It should state what kind of document it is, but also what its implicatoins are or what state it is in. It should assume the analyst knows about the M&A process.
# - "Document type", which should be either 'PPA' or 'Interconnection document' or 'email' or 'site control'.
# - "Suggested title" in the format 'MM-DD-YYYY max 5 word document title (state)' the state field is optional. It can read "main" if it is said to be the main document of its type, it can read (redacted) if it is redacted.
# - "Suggested title v2" in same format as "suggested title" but with different wording
# - "Suggested title v3" in same format as "suggested title" but with different wording
# - "Suggested folder" from the selection: "PPA", "interconnection", "uncategorized", "site control"

# The provided document text is:
# {extracted_text}
# """

In [7]:
# def construct_query(extracted_text):
#     return f"""
# Please analyze the provided document text and classify it into the most appropriate folder based on the content and context. The folders are "PPA", "interconnection", "uncategorized", "site control". Consider the following when classifying:

# - Document date in 'MMM-DD, YYYY' format.
# - A brief summary, focusing on implications and status relevant to solar M&A processes.
# - Document type: "PPA", "Interconnection document", "email", "site control".
# - Suggested titles in 'MM-DD-YYYY max 5 word title (state)' format. Include reasoning for state indication if applicable.
# - Identify key phrases or keywords that strongly indicate a specific folder.
# - Explain your reasoning for the folder suggestion, including any indicators or document characteristics that influenced your choice.
# - If applicable, suggest multiple folders and specify the confidence level of each suggestion.

# Your classification will be reviewed and used to iteratively improve our document management process.

# Provided document text:
# {extracted_text}
# """


In [8]:
def construct_metadata_query(extracted_text):
    return f"""
Extract the following fields from the document text provided and format the response as JSON:
- "Document date" in the format '3 letter month name-DD, YYYY'.
- "Document summary" limited to a maximum of 3 sentences, tailored for a solar M&A analyst.
- "Document type", which should be either 'PPA', 'Interconnection document', 'email', or 'site control'.
- "Suggested title", "Suggested title v2", and "Suggested title v3" in the format 'MM-DD-YYYY max 5 word document title (state)'.
"""

def construct_classification_query(metadata, document_summary):
    # Summarize the key extracted information
    summary = f"Document Date: {metadata['Document date']}, Type: {metadata['Document type']}, Summary: {document_summary}"
    
    # Provide explicit instructions for classification
    instructions = """
    Based on the summary above and the detailed document content provided below, classify the document into one of the following folders: "PPA", "interconnection", "uncategorized", "site control". Consider the following criteria for each category:
    - PPA: Documents related to Power Purchase Agreements.
    - Interconnection: Documents dealing with the connection of solar power facilities to the grid.
    - Site Control: Documents related to the ownership, lease, or control of sites for solar development.
    - Uncategorized: Documents that do not fit into the other categories or lack enough information for a clear classification.

    If uncertain, classify as 'uncategorized' but note the reason for uncertainty.
    If PPA, interconnection or site control note reason for why it was classified this way
    """

    # Combine the elements into the final prompt
    prompt = f"{summary}\n\n{instructions}\n\nThe provided document text is:\n{extracted_text}"
    return prompt



In [9]:
def output_extracted_text_to_file(extracted_text, filename="Extracted_text.txt"):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(extracted_text)

In [10]:
# def save_json_with_pdf_name(json_str, pdf_name):
#     """
#     Saves a JSON string to a file with the same base name as the input PDF file but with a .json extension.

#     Parameters:
#     - json_str (str): The JSON string to save.
#     - pdf_name (str): The filename of the PDF, used to derive the JSON filename.

#     Returns:
#     - None
#     """
#     # Extract the base filename without the extension
#     base_name = os.path.splitext(pdf_name)[0]
#     # Construct the JSON filename
#     json_filename = f"{base_name}.json"
    
#     try:
#         if json_str.strip():  # Check if json_str is not empty
#             # Convert the JSON string to a Python dictionary
#             data = json.loads(json_str)
#             # Open the file in write mode and save the JSON
#             with open(json_filename, 'w') as file:
#                 json.dump(data, file, indent=4)  # Pretty print the JSON
#             print(f"JSON data successfully saved to {json_filename}")
#         else:
#             print("Received empty JSON string, skipping file saving.")
#     except Exception as e:
#         print(f"Error saving JSON to file: {e}")

# set_directory_for_input_document()                      # Set directory of where to find pdf
# pdf_name = 'PPA.pdf'                                    # Define the name of the file
# extracted_text = extract_text_from_pdf_parallel(pdf_name)        # turn PDF into text
# output_extracted_text_to_file(extracted_text)           # View result of PDF to txt conversion
# query = construct_query(extracted_text)                 # Merge question with text for prompt
# truncated_query = truncate_query_to_fit_context(query)  # truncate query to fit in context length. not optimal

# # API call
# completion = client.chat.completions.create(
#   model="gpt-3.5-turbo-0125",
#   #model="gpt-3.5-turbo-0125" is the best available --- https://platform.openai.com/docs/models/gpt-3-5-turbo
#   messages=[
#     {"role": "system", "content": "You are a solar M&A analyst and great at extracting summaries and text from M&A documentation. Under no circumstances do you halucinate, instead you say that you leave a field blank if you cannot answer"},
#     {"role": "user", "content": truncated_query}
#   ]
# )
# output_json = completion.choices[0].message.content     # get message contents from api call

# print(output_json)                                      # print results
# save_json_with_pdf_name(output_json, pdf_name)          # save results to JSON file with same name as PDF

In [11]:
def process_completion(completion):
    """
    Processes the completion response from the OpenAI API to extract relevant information.

    The function is designed to parse the API response and extract the necessary data for further processing or decision-making. It should handle variations in the response format gracefully and ensure that the extracted information is accurate and usable.

    Parameters:
    - completion: The response object from the OpenAI API call. This object contains the model's output and possibly other metadata related to the request.

    Returns:
    - A structured representation of the extracted information, which could be a dictionary, a list, or a simple string, depending on the expected format of the response and the needs of the application.

    This function must be implemented with an understanding of the response structure returned by the specific OpenAI model being used. For instance, if the model returns JSON-formatted strings containing the desired data, the function should parse this JSON and extract the relevant fields.

    It is also responsible for error handling, ensuring that any issues with the API response, such as unexpected formats or missing data, are caught and managed appropriately.

    Example Usage:
    Suppose the API response includes a text field with JSON content. In that case, the function might look like this:

        try:
            response_data = json.loads(completion.choices[0].text.strip())
            extracted_data = {
                'metadata': response_data['metadata'],
                'classification': response_data['classification']
            }
            return extracted_data
        except (KeyError, json.JSONDecodeError) as e:
            print(f"Error processing completion response: {e}")
            return None

    Note: The above code is an example. The actual implementation will depend on the format of the API response and the specific data you need to extract.
    """
    pass  # Implementation goes here


In [12]:
## LAST PROCESS

def extract_metadata_and_classify(extracted_text):
    # First, extract metadata
    metadata_query = construct_metadata_query(extracted_text)
    metadata_completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "system", "content": "Extract metadata"}, {"role": "user", "content": metadata_query}]
    )
    metadata = process_completion(metadata_completion)  # You'll need to implement this based on your data structure

    # Then, classify the document
    classification_query = construct_classification_query(metadata, extracted_text)
    classification_completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[{"role": "system", "content": "Classify document"}, {"role": "user", "content": classification_query}]
    )
    classification = process_completion(classification_completion)  # Implement based on your needs

    return metadata, classification

def save_json_with_pdf_name(json_str, pdf_name):
    """
    Saves a JSON string to a file with the same base name as the input PDF file but with a .json extension.

    Parameters:
    - json_str (str): The JSON string to save.
    - pdf_name (str): The filename of the PDF, used to derive the JSON filename.

    Returns:
    - None
    """
    # Extract the base filename without the extension
    base_name = os.path.splitext(pdf_name)[0]
    # Construct the JSON filename
    json_filename = f"{base_name}.json"

    if not json_str.strip():
        print("Received empty or invalid JSON string, skipping file saving.")
        return
    
    try:
        # Directly save the JSON string to a file
        with open(json_filename, 'w') as file:
            file.write(json_str)  # Assuming json_str is already a valid JSON string
        print(f"JSON data successfully saved to {json_filename}")
    except Exception as e:
        print(f"Error saving JSON to file: {e}")


In [13]:
### LAST PART

# Set the directory where to find the PDF
set_directory_for_input_document()

# Define the name of the file
pdf_name = 'PPA.pdf'

# Extract text from the PDF
extracted_text = extract_text_from_pdf_parallel(pdf_name)

# Optionally, save extracted text to a file
output_extracted_text_to_file(extracted_text)

# Extract metadata and classify the document
metadata, classification = extract_metadata_and_classify(extracted_text)

# Combine metadata and classification into a single JSON object for saving
combined_results = json.dumps({
    "metadata": metadata,
    "classification": classification
}, indent=4)

# Save the combined results to a JSON file with the same name as the PDF
save_json_with_pdf_name(combined_results, pdf_name)
       # save results to JSON file with same name as PDF

Current working directory: /Users/jd/Documents/Coding/AcquiSolar/Metadata_extraction/input/J_test


TypeError: 'NoneType' object is not subscriptable

In [None]:

# def extract_metadata_and_classify(extracted_text):
#     # First, extract metadata
#     metadata_query = construct_metadata_query(extracted_text)
#     metadata_completion = client.chat.completions.create(
#         model="gpt-3.5-turbo-0125",
#         messages=[{"role": "system", "content": "Extract metadata"}, {"role": "user", "content": metadata_query}]
#     )
#     metadata = process_completion(metadata_completion)  # You'll need to implement this based on your data structure

#     # Then, classify the document
#     classification_query = construct_classification_query(metadata, extracted_text)
#     classification_completion = client.chat.completions.create(
#         model="gpt-3.5-turbo-0125",
#         messages=[{"role": "system", "content": "Classify document"}, {"role": "user", "content": classification_query}]
#     )
#     classification = process_completion(classification_completion)  # Implement based on your needs

#     return metadata, classification

# def save_results(metadata, classification, pdf_name):
#     # Combine metadata and classification into one JSON object
#     combined_results = {
#         "metadata": metadata,
#         "classification": classification
#     }
#     json_str = json.dumps(combined_results, indent=4)
#     save_json_with_pdf_name(json_str, pdf_name)

# # Workflow
# set_directory_for_input_document()  # Set directory of where to find pdf
# pdf_name = 'PPA.pdf'  # Define the name of the file
# extracted_text = extract_text_from_pdf_parallel(pdf_name)  # Extract text from PDF
# output_extracted_text_to_file(extracted_text)  # Optionally, save extracted text to file

# # Extract metadata and classify
# metadata, classification = extract_metadata_and_classify(extracted_text)
# save_results(metadata, classification, pdf_name)  # Save results to JSON file with the same name as PDF
