#### gpt_vision_demo.py -- James Sayre, jsayre@ucdavis.edu

##### This code demonstrates use of OpenAI's API to process imagery and extract structured data. It extracts an image from a PDF, then shows how one can use on-demand (quick, but more expensive) and batch processing (slow, but cheaper) functionalities to extract data to a pre-defined file.

In [7]:
import os
import pandas as pd
import openai
### For handling PDF and image files
import fitz  # PyMuPDF
from PIL import Image
from io import BytesIO
import base64
import json

### Credentials
# The API key is read from the OPENAI_API_KEY environment variable (None when it is unset)
openai.api_key = os.getenv("OPENAI_API_KEY")

### Directories
### Define your own directories here

### Inputs
# PDF that the demo cells below open and crop
test_doc = "test_doc.pdf"

### Parameters
### Prompt engineering
# Define expected fields for structured output
# Function-calling schema: one function whose parameters are the fields to extract.
# Every property is also listed under "required" so that each field is always returned.
str_functions = [
    {
        "name": "extract_soil_certificate_info",  # Name the API call must force
        "description": "Extracts data from soil certificate",  # Description for function
        "parameters": {
            "type": "object",
            "properties": {
                "predominant_soil": {"type": "string", "description": "The predominant soil."},
                "crop": {"type": "string", "description": "Crop."},
                # Typo fixed ("hihglighted" -> "highlighted"): this text is sent to the model
                "baseline_township": {"type": "string", "description": "Region highlighted in the image under Baseline, Township"},
                "range_south": {"type": "string", "description": "degrees south"},
                "range_east": {"type": "string", "description": "degrees east"},
                "practice": {"type": "string", "description": "Category highlighted in the image under Practice"}
            },
            # This tells the model to always output the following columns/info
            "required": ["predominant_soil", "crop", "baseline_township", "range_south", "range_east", "practice"]
        }
    }
]

# Prompt text with your custom message
# Built from adjacent string literals; the second sentence keeps its leading space
prompt_text = (
    "You are SoilGPT, and I want you to read an image."
    " Please extract the value in each field outlined by a partial rectangle in the image."
)

### Programs
### Take a page of PDF, check for text string in it, and return cropped image as base64 object
def imagetize_pdf(pdf_doc, page_num, check_for_text="", save_png=False):
    """Render a fixed crop of one PDF page and return it as a PNG data URL.

    Args:
        pdf_doc: Open document object; only its ``load_page`` method is used
            (presumably a PyMuPDF document -- confirm against callers).
        page_num: Page index handed straight to ``pdf_doc.load_page``.
        check_for_text: When non-empty, the page is skipped unless its
            extracted text contains this substring.
        save_png: When true, also write the crop to ``page_{page_num}.png``
            in the current working directory.

    Returns:
        A ``data:image/png;base64,...`` string, or ``None`` when
        ``check_for_text`` is non-empty and absent from the page text.
    """
    page = pdf_doc.load_page(page_num)
    if check_for_text != "" and check_for_text not in page.get_text("text"):
        return None

    page_width = page.rect.width
    page_height = page.rect.height

    # Crop window, hard-coded as fractions of the page dimensions
    clip = fitz.Rect(
        page_width * (1.5/20),     # left edge
        page_height * (3/12),      # top edge
        page_width * (11.5/20),    # right edge
        page_height * (8/20),      # bottom edge
    )

    # Render at 3x in both directions for a higher-resolution image
    scale = 3
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), clip=clip)

    # Go through PIL so the pixels can be written out as PNG
    picture = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    if save_png:
        picture.save(f"page_{page_num}.png")

    png_buffer = BytesIO()
    picture.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
    return "data:image/png;base64," + encoded

There are two ways to make calls to the OpenAI API. The first is to make immediate (on-demand) calls to the API. This method is the simplest: you make a call, wait for the response from the server, and immediately store the output. However, you will pay more for the privilege of receiving on-demand output. Code implementing this first method is below.

In [5]:
### Function to analyze the information in the image file based on a prompt and structured output information
def analyze_gpt_text(base_img, prompt, structured_functions):
    """Send a prompt and one image to the chat model and return the extracted fields.

    The request forces a call to the first function in ``structured_functions``,
    so the model's answer arrives as that function's JSON arguments.

    Args:
        base_img: Image as a ``data:...;base64,...`` URL string, or ``None``.
        prompt: Instruction text sent as the first user message.
        structured_functions: Non-empty list of function schemas; the ``name``
            of the first entry is the function the model is forced to call.

    Returns:
        A one-row ``pandas.DataFrame`` built from the returned arguments.
        On failure a human-readable error string is returned instead (callers
        should check the type): when ``base_img`` is ``None``, or when the
        response carries no parsable function-call arguments.

    Side effects:
        Writes the decoded image to ``debug_cropped_image.png`` in the current
        working directory so the exact crop sent to the model can be inspected.
    """
    if base_img is None:
        return "No image generated. Check PDF text and page number."

    # Decode the payload after the "data:...;base64," prefix and save it to inspect
    img_data = base64.b64decode(base_img.split(",")[1])
    with open("debug_cropped_image.png", "wb") as f:
        f.write(img_data)

    # Take the forced function's name from the schema that was passed in, so the
    # request cannot drift out of sync with `structured_functions`.
    forced_function = structured_functions[0]["name"]

    # Text instruction and image go in as two separate user messages
    content = {
        "role": "user",
        "content": prompt,
    }
    img_content = {
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": base_img
            }
        }]
    }

    # NOTE(review): `functions`/`function_call` look like the older function-calling
    # parameters; consider migrating to `tools`/`tool_choice` -- confirm against the API docs.
    response = openai.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[content, img_content],
        functions=structured_functions,
        function_call={"name": forced_function},
    )

    # Parse the structured output
    try:
        structured_output = response.choices[0].message.function_call.arguments
        data_dict = json.loads(structured_output)  # Convert JSON string to dictionary
        return pd.DataFrame([data_dict])  # One row, one column per extracted field
    except (IndexError, KeyError, AttributeError, TypeError, json.JSONDecodeError) as e:
        # AttributeError/TypeError cover a response whose `function_call` (or its
        # arguments) is None, which previously escaped as an unhandled exception.
        return f"Error parsing response: {e}"

In [8]:
# Open the demo PDF and run the on-demand extraction on page index 17
pdf_document = fitz.open(test_doc)
chatgptoutput = analyze_gpt_text(
    base_img=imagetize_pdf(pdf_doc=pdf_document, page_num=17),
    prompt=prompt_text,
    structured_functions=str_functions,
)

Once we've run this code, we can confirm that the output is what we expect.

In [9]:
# Display the result: a one-row DataFrame on success, or an error string otherwise
chatgptoutput

Unnamed: 0,predominant_soil,crop,baseline_township,range_south,range_east,practice
0,Loamy Sand,Squash,Mt. Diablo,17S,23E,SURFACE IRRIGATION (Without a tailwater recove...


The second method is to make batch calls to the OpenAI API. You upload the images or text you intend to process as a batch, which will be computed within 24 hours (often much less). This method has two advantages: you do not need to keep a long-running script open on your computer while it calls the API, and batch processing costs less than on-demand processing. However, the documentation for batch processing is much sparser.

In [15]:

    
# Function to create a JSONL entry for a given page with structured output request
def create_jsonl_entry(page_num, base_img, prompt, structured_functions):
    """Build one batch request (one JSONL line) for a single page image.

    The request body mirrors the on-demand chat-completions call: same model,
    temperature, two user messages (prompt text, then image) and a forced
    function call that returns the structured fields.

    Args:
        page_num: Page identifier; used to build the ``custom_id`` ``task-{page_num}``
            that ties each batch result back to its page.
        base_img: Image as a data URL string, placed in an ``image_url`` content part.
        prompt: Instruction text sent as the first user message.
        structured_functions: Non-empty list of function schemas; the ``name``
            of the first entry is the function the model is forced to call.

    Returns:
        A JSON-serializable dict with ``custom_id``, ``method``, ``url`` and ``body``.
    """
    # Take the forced function's name from the schema that was passed in rather
    # than hard-coding it, so the two cannot drift apart.
    forced_function = structured_functions[0]["name"]

    content = {
        "role": "user",
        "content": prompt,
    }
    img_content = {
        "role": "user",
        "content": [{
            "type": "image_url",
            "image_url": {
                "url": base_img
            }
        }]
    }

    return {
        "custom_id": f"task-{page_num}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # This is what you would have in your Chat Completions API call.
            # No `response_format` here: the forced function call already returns
            # JSON arguments, and JSON mode additionally expects the messages to
            # mention JSON (which the prompt does not), so sending it risks every
            # request in the batch being rejected.
            "model": "gpt-4o",
            "temperature": 0,
            "messages": [content, img_content],
            "functions": structured_functions,
            "function_call": {"name": forced_function},
        },
    }


# Function to generate JSONL file for batch processing with structured output
def generate_jsonl_file(pdf_path, page_numbers, prompt, structured_functions, output_file="input.jsonl"):
    """Write a JSONL batch-input file with one request per usable page.

    Args:
        pdf_path: Either a filesystem path to the PDF (``str`` or ``os.PathLike``),
            which is opened here and closed again afterwards, or an already-open
            document object, which is used as-is and left open for the caller.
        page_numbers: Iterable of page indices to include.
        prompt: Instruction text placed in every request.
        structured_functions: Function schemas placed in every request.
        output_file: Destination path of the JSONL file (overwritten if present).

    Returns:
        None. Pages for which no image is produced are skipped without error.
    """
    # Despite its name, this parameter used to work only with an open document;
    # accept a real path too and make sure anything opened here gets closed.
    opened_here = isinstance(pdf_path, (str, os.PathLike))
    pdf_doc = fitz.open(pdf_path) if opened_here else pdf_path

    try:
        entries = []
        for page_num in page_numbers:
            base_img = imagetize_pdf(pdf_doc, page_num)
            if base_img:
                entries.append(create_jsonl_entry(page_num, base_img, prompt, structured_functions))
    finally:
        if opened_here:
            pdf_doc.close()

    # Write only after every page has been rendered, one JSON object per line
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(entry) + "\n" for entry in entries)

    print(f"JSONL file created: {output_file}")

def upload_jsonl_file(file_path):
    """Upload a JSONL batch-input file and return the ID assigned to it.

    Args:
        file_path: Path of the JSONL file to upload.

    Returns:
        The ``id`` attribute of the upload response, for use when creating the batch.
    """
    # Use a context manager so the handle is closed even if the upload fails
    with open(file_path, "rb") as jsonl_handle:
        response = openai.files.create(
            file=jsonl_handle,
            purpose="batch",  # Marks the file as batch input
        )
    file_id = response.id
    print(f"File uploaded successfully. File ID: {file_id}")
    return file_id

# Function to create a batch job
def create_batch_job(file_id, model="gpt-4o", completion_window="24h"):
    """Submit an uploaded JSONL file as a batch against the chat-completions endpoint.

    Args:
        file_id: ID of the previously uploaded input file.
        model: Accepted but not forwarded to the API call.
        completion_window: Passed through as the batch's completion window.

    Returns:
        The full object returned by ``openai.batches.create`` (not just its ID).
    """
    batch_request = {
        "input_file_id": file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": completion_window,
        "metadata": {"description": "Batch processing for soil certificate analysis"},
    }
    return openai.batches.create(**batch_request)

Now we can run a batch on two pages of this pdf.

In [None]:
pdf_document = fitz.open(test_doc)

# Path of the JSONL batch-input file to write and then upload
jsonl_file_path = "input.jsonl"
generate_jsonl_file(
    pdf_document,
    [17, 18],
    prompt_text,
    str_functions,
    output_file=jsonl_file_path,
)
# Upload the JSONL file, then submit it as a batch
file_id = upload_jsonl_file(file_path=jsonl_file_path)
# Despite the name, this holds the whole object returned by create_batch_job
batch_id = create_batch_job(file_id=file_id)

In [25]:
### Now we can check the status of the most recent batch jobs submitted. Only the last
### expression of a notebook cell is displayed automatically, so print this one explicitly.
print(openai.batches.list(limit=10))
### Or check the status of a specific batch job. `batch_id` holds the object returned when
### the batch was created, which does not update itself, so fetch a fresh copy by its ID.
openai.batches.retrieve(batch_id.id)

Once the batch is complete, retrieve the contents of its output file.

In [35]:
### `batch_id.id` identifies the batch, not a file. The results live in the batch's output
### file, so fetch the current batch object and download the file it points to.
finished_batch = openai.batches.retrieve(batch_id.id)
if finished_batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {finished_batch.id} has no output file yet (status: {finished_batch.status})"
    )
file_response = openai.files.content(finished_batch.output_file_id)