In [1]:
import base64
import mimetypes
import os

import pandas as pd
import pdfplumber
import requests
import yaml

In [2]:
# Read the notebook settings from the YAML configuration file.
# safe_load is used, so only plain YAML data types are constructed.
with open("config.yml", mode="r") as settings_stream:
    config = yaml.safe_load(settings_stream)

In [3]:
# Azure OpenAI settings, read from the loaded configuration.
# A missing key raises KeyError here, before any request is attempted.
(
    AZURE_OPENAI_ENDPOINT,
    AZURE_OPENAI_API_KEY,
    AZURE_MODEL_NAME,
    AZURE_OPENAI_API_VERSION,
) = (
    config[setting_name]
    for setting_name in (
        'azure_openai_endpoint',
        'azure_openai_key',
        'azure_openai_model_deployment',
        'azure_openai_api_version',
    )
)

In [4]:
def describe_image_with_gpt4(image_path, timeout=120):
    """
    Describe an image using Azure OpenAI.

    The image is read from disk, base64-encoded and sent as a data URL to the
    chat-completions endpoint of the deployment named by ``AZURE_MODEL_NAME``,
    together with a system prompt and a text instruction.

    Parameters
    ----------
    image_path : str or os.PathLike
        Path of the image file to describe.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. Without it a stalled connection
        would block the notebook indefinitely. Defaults to 120 seconds.

    Returns
    -------
    str
        The ``content`` of the first choice's message in the JSON response.

    Raises
    ------
    OSError
        If the image file cannot be opened or read.
    requests.exceptions.RequestException
        If the HTTP request fails or times out.
    RuntimeError
        If the service answers with a status code other than 200.
    """
    # Read inside a context manager so the file handle is closed right away.
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("ascii")

    # Derive the MIME type from the file extension instead of hard-coding it,
    # so PNG and other formats are labelled correctly. Fall back to JPEG when
    # the extension is unknown or does not map to an image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # System prompt and user instruction sent alongside the image.
    prompt = "You are a useful agent in charge of analyzing technical drawings:\n\n"
    command = "Please describe the following image as part of a technical document:\n\n"

    headers = {
        "Content-Type": "application/json",
        "api-key": AZURE_OPENAI_API_KEY
    }
    payload = {
        "messages": [
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{encoded_image}"
                        },
                    },
                    {"type": "text", "text": command},
                ],
            },
        ],
        "temperature": 0.5,
        "top_p": 1.0,
        "max_tokens": 500,
        "stream": False,
        "model": AZURE_MODEL_NAME,
    }

    # Strip a trailing slash so a configured endpoint such as
    # "https://host/" does not produce "https://host//openai/...".
    endpoint = AZURE_OPENAI_ENDPOINT.rstrip("/")
    url = (
        f"{endpoint}/openai/deployments/{AZURE_MODEL_NAME}"
        f"/chat/completions?api-version={AZURE_OPENAI_API_VERSION}"
    )

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        # RuntimeError is a subclass of Exception, so callers that catch
        # Exception keep working; the status code is added to ease debugging.
        raise RuntimeError(
            f"Error calling GPT-4 model (HTTP {response.status_code}): {response.text}"
        )
    return response.json()["choices"][0]["message"]["content"]

In [5]:
# Function to extract text, tables, and images from the PDF
def extract_text_tables_and_image(pdf_path, image_page=9, image_dir="data/images/"):
    """
    Extract the text, the tables and one page's images from a PDF.

    Every page contributes a ``--- Page N ---`` header followed by its text.
    Every table found is converted to a DataFrame whose column names are the
    table's first row. Images are cropped and saved only for ``image_page``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to read with pdfplumber.
    image_page : int, optional
        1-based number of the page whose images are saved. Defaults to 9.
    image_dir : str, optional
        Directory that receives the saved images; created if missing.
        Defaults to ``"data/images/"``.

    Returns
    -------
    tuple
        ``(all_text, tables, image_path)`` where ``all_text`` is the
        concatenated page text, ``tables`` is a list of
        ``(page_number, DataFrame)`` pairs and ``image_path`` is the path of
        the last image saved, or ``None`` when no image was saved (page
        missing, page without images, or every image failed).
    """
    text_parts = []  # joined once at the end instead of repeated str +=
    tables = []
    image_path = None  # defined up front so the return never hits an unbound name
    os.makedirs(image_dir, exist_ok=True)

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() may yield None for a page without a text layer;
            # treat that as empty text rather than failing on None + str.
            text_parts.append(f"--- Page {page_number} ---\n")
            text_parts.append((page.extract_text() or "") + "\n")

            for table in page.extract_tables():
                if not table:
                    # No header row to use as column names; skip it.
                    continue
                df = pd.DataFrame(table[1:], columns=table[0])  # First row as headers
                tables.append((page_number, df))

            if page_number != image_page:
                continue

            for image_index, image in enumerate(page.images, start=1):
                try:
                    # Bounding box of the image in page coordinates
                    bbox = (image['x0'], image['top'], image['x1'], image['bottom'])

                    # Render only the region of the page covered by the image
                    cropped_image = page.within_bbox(bbox).to_image()

                    # NOTE(review): depending on the pdfplumber version, save()
                    # may write PNG data regardless of the ".jpg" extension;
                    # confirm the on-disk format if it matters downstream.
                    image_output_path = os.path.join(
                        image_dir,
                        f"extracted_image_page_{image_page}_{image_index}.jpg",
                    )
                    cropped_image.save(image_output_path)
                    image_path = image_output_path  # keep the last image saved
                except KeyError as e:
                    print(f"Missing expected key in image metadata: {e}")
                except Exception as e:
                    # Best effort: one bad image must not abort the extraction.
                    print(f"An error occurred while processing image: {e}")

    return "".join(text_parts), tables, image_path

In [6]:
# Path to the input PDF (relative to the notebook's working directory);
# passed to extract_text_tables_and_image below.
pdf_path = "data/PECEELPF447001A1.PDF"

In [7]:
# Run the extraction on the input PDF:
#   text       - concatenated page text with "--- Page N ---" headers
#   tables     - list of (page_number, DataFrame) pairs
#   image_path - path of the last image saved from the image page
text, tables, image_path = extract_text_tables_and_image(pdf_path)

In [8]:
# Display the path of the extracted image that is sent for description below
image_path

'data/images/extracted_image_page_9_2.jpg'

In [9]:
# Send the extracted image to the configured Azure OpenAI deployment and
# keep the description text it returns
image_description = describe_image_with_gpt4(image_path)

In [10]:
# Place the generated description right after the section heading.
# str.replace touches every occurrence of the heading and leaves the text
# unchanged when the heading is not present.
section_heading = "4.4.2.1 Cable Trays Filling Report"
heading_with_description = f"{section_heading}\n\n{image_description}\n\n"
updated_text = text.replace(section_heading, heading_with_description)

In [11]:
# Persist the text that now contains the image description (UTF-8 encoded)
with open('data/text_with_image_description.txt', 'w', encoding='utf-8') as output_stream:
    output_stream.write(updated_text)

# Optional, currently disabled: export each extracted table to its own CSV file
#for idx, (page, table) in enumerate(tables):
#    table.to_csv(f'table_page_{page}_table_{idx + 1}.csv', index=False)

print("Text updated with image description and files saved.")

Text updated with image description and files saved.


In [12]:
# Display the description text returned by the model for manual review
image_description

'The image presents two technical drawings, labeled "BATA" and "CODE," which illustrate a schematic layout of a system involving multiple levels and connections.\n\n### Top Section: BATA\n- **Connections**: Indicated with labeled lines, showing how different elements are interconnected.\n- **Levels**: Four distinct levels are marked (Level 1 to Level 4), suggesting a multi-tiered structure.\n- **MIBD and TEAL**: These terms appear to denote specific components or sections of the layout, although their exact meanings are not defined in the image.\n\n### Bottom Section: CODE\n- **Hexagonal Shape**: The lower section features a hexagonal outline, which may represent a specific area of interest or a component within the overall system.\n- **Levels**: Similar to the top section, multiple levels are indicated (Level 1 to Level 3), providing a clear hierarchy or structure.\n- **Connections**: Lines connecting various components are also present, emphasizing the interrelations within the syste