In [2]:
# @title Install Libaries

!pip install openai
!pip install IPython
!pip install Pillow
!pip install PyMuPDF
!pip install pandas
!pip install tqdm # for progress bar



In [3]:
# @title Imports

# Standard library
import base64
import json
import mimetypes
import os
import re
import shutil
import time

# Third-party
import fitz # PyMuPDF
import pandas as pd
from IPython.display import Image, Markdown, display
from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam
from tqdm import tqdm

# Keep this import last: it rebinds `Image` from IPython.display.Image to the
# PIL module, which is what the `Image.MAX_IMAGE_PIXELS` cell relies on.
from PIL import Image

In [4]:
# @title Overwrite PIL max image size

# `Image` is PIL's module here: the PIL import in the Imports cell comes after
# the IPython.display one and shadows it. Setting the limit to None lifts
# Pillow's cap on image pixel count.
# NOTE(review): none of the visible cells open images through PIL (pages are
# written by PyMuPDF and read back as raw bytes) — confirm this is still needed.
Image.MAX_IMAGE_PIXELS = None

In [5]:
# @title Mount Google Drive

# Colab-only step: exposes Google Drive under /content/drive/ so the project
# directory and config.json referenced by the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
# @title CD to Project Directory

# Change the directory to where your project files are located.
# Later cells use relative paths ('PDFs', 'Images', 'menu_items.csv'), so those
# folders and the output CSV are resolved against this directory.
%cd "/content/drive/MyDrive/RAGTraining/RagTrainingZM/Image2ExcelGenAI/"

/content/drive/MyDrive/RAGTraining/RagTrainingZM/Image2ExcelGenAI


In [7]:
# @title Load API Key

# JSON config holding the credentials. Expected keys:
#   "API_KEY"          - required, exported as OPENAI_API_KEY
#   "OPENAI_API_BASE"  - optional, exported as OPENAI_BASE_URL when present
file_name = '/content/drive/MyDrive/RAGTraining/RagTrainingZM/Image2ExcelGenAI/config.json'

with open(file_name, 'r', encoding='utf-8') as file:
  config = json.load(file)

# Fail with a clear message instead of the opaque TypeError that os.environ
# raises when it is handed None for a missing key.
api_key = config.get("API_KEY")
if not api_key:
  raise KeyError(f"'API_KEY' is missing or empty in '{file_name}'.")
os.environ['OPENAI_API_KEY'] = api_key

# The base URL is only needed when a non-default endpoint is used, so a config
# without it is accepted and the variable is simply left unset.
api_base = config.get("OPENAI_API_BASE")
if api_base:
  os.environ["OPENAI_BASE_URL"] = api_base

In [8]:
# @title Define Folder Paths

# Relative to the project directory selected in the "CD to Project Directory" cell.
pdf_folder = 'PDFs'               # input: PDFs to convert
images_folder = 'Images'          # output: one PNG per PDF page (wiped on every run of this cell)
output_csv_file = 'menu_items.csv'  # output: extracted menu items

# create the pdf input folder if it doesn't exist
if not os.path.exists(pdf_folder):
  os.makedirs(pdf_folder)
  print(f"Created the input folder: '{pdf_folder}'.")

# Remove old images folder to ensure a clean slate, so images from a previous
# run are never mixed into the next extraction.
if os.path.exists(images_folder):
  shutil.rmtree(images_folder)
  print(f"Removed old output folder: '{images_folder}'.")

# Create new images output folder
os.makedirs(images_folder)
print(f"Created new output folder: '{images_folder}'.")

# Report the configured folder name rather than a hard-coded one so the
# instructions stay correct if `pdf_folder` is changed above.
print(f"\nSetup complete.  Please upload your PDF files to the '{pdf_folder}' folder.")
print("Once uploaded, run the next cell to begin the conversion.")

Created new output foder: 'Images'.

Setup complete.  Please upload your PDF files to the 'PDFs' folder.
Once uploaded, run the next cell to begin the conversion.


In [9]:
# @title PDF Conversion to Images with resizing

# Render every page of every PDF in `pdf_folder` to a PNG stored under
# `images_folder/<pdf name>/<pdf name>_page_<n>.png`.

# PDF coordinates are 72 units per inch, so scaling by DPI / 72 renders the
# page at the requested resolution.
RENDER_DPI = 300
render_matrix = fitz.Matrix(RENDER_DPI / 72, RENDER_DPI / 72)

# Check for PDFs in the input folder (sorted so runs are reproducible;
# os.listdir returns entries in arbitrary order).
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))

if not pdf_files:
    print("No PDF files found in the 'PDFs' folder. Please upload your files and try again.")
else:
    print(f"Found {len(pdf_files)} PDF(s) to convert. Starting conversion...")

    failed_pdfs = []  # names of PDFs that raised during conversion

    # Iterate through all PDF files with a progress bar
    for pdf_file in tqdm(pdf_files, desc="Converting PDFs"):
        pdf_path = os.path.join(pdf_folder, pdf_file)

        # Each PDF gets its own subfolder, e.g. 'Images/My_Document/'
        base_filename = os.path.splitext(pdf_file)[0]
        output_subfolder = os.path.join(images_folder, base_filename)

        try:
            os.makedirs(output_subfolder, exist_ok=True)

            # The context manager closes the document even if a page fails to
            # render or save, which the explicit close() after the loop did not.
            with fitz.open(pdf_path) as doc:
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)

                    # Render the page as a pixmap (image) at RENDER_DPI
                    pix = page.get_pixmap(matrix=render_matrix)

                    # Page numbers in filenames are 1-based
                    image_filename = f"{base_filename}_page_{page_num + 1}.png"
                    pix.save(os.path.join(output_subfolder, image_filename))

        except Exception as e:
            # Best effort: report the failure and continue with the next PDF
            failed_pdfs.append(pdf_file)
            print(f"\nError processing '{pdf_file}': {e}")

    # Only claim full success when every PDF was actually converted
    if failed_pdfs:
        print(f"\nConversion finished with errors: {len(failed_pdfs)} of {len(pdf_files)} PDF(s) failed: {failed_pdfs}")
    else:
        print("\nConversion complete! All images have been saved to the 'Images' folder.")
    print("You can view or download the images from the file browser on the left.")

Found 1 PDF(s) to convert. Starting conversion...


Converting PDFs: 100%|██████████| 1/1 [00:08<00:00,  8.51s/it]


Conversion complete! All images have been saved to the 'Images' folder.
You can view or download the images from the file browser on the left.





### **CONVERT IMAGES TO CSV**

In [10]:
# @title Create System Prompt

# Instructions for the model: which fields to extract for each menu item and the
# JSON array format the reply must use. The extraction cell pulls the array out
# of the reply with a regex that expects `[ { ... } ]`, so the example format
# below should stay a JSON array of objects.
system_prompt = """

Analyze the content of this menu page image. Extract all menu items and their details.
For each item, provide the following information:
- "Category": The category of the dish (e.g., "Appetizers", "Entrees", "Desserts").
- "Dish Title": The name of the dish.
- "Description": A brief description of the dish.
- "Price": The price of the dish, as a number (without the dollar sign).

Present the extracted data as a single JSON array, where each element is a dictionary for a menu item.
If a field is not present, use a null value.

Example of the desired JSON format:
[
  {
    "Category": "Appetizers",
    "Dish Title": "Taco Topped Queso & Chips",
    "Description": null,
    "Price": 6.99
  },
  {
    "Category": "Drinks",
    "Dish Title": "Strawberry Lemonade",
    "Description": "A refreshing mix of strawberry and lemonade.",
    "Price": 3.49
  }
]

"""

In [15]:
# @title Create User Prompt

# Request text sent together with each menu page image. It defers to
# `system_prompt` for the field list and output format and only adds the
# instruction to be exhaustive.
user_prompt = """

Please analyze this image of a menu page. Follow the instructions in the system prompt to extract all menu items and their details.  Go through the menu step-by-step being sure to include all items on the menu.

"""

In [17]:
# API client used by the extraction loop below. Only the key is passed
# explicitly; OPENAI_BASE_URL is exported by the "Load API Key" cell and is
# presumably picked up from the environment by the SDK — TODO confirm.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define folder paths (adjust if needed)
# NOTE: these repeat the values set in the "Define Folder Paths" cell; keep the
# two in sync if either is changed.
images_folder = 'Images'
output_csv_file = 'menu_items.csv'

# The target data structure is described in `system_prompt`: the model is asked
# to format its response as a JSON array of menu-item objects.

# Helper: read an image from disk and return it as base64 text
def encode_image(image_path):
    """Return the contents of the file at `image_path` as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a data URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# --- PROCESSING ---
# Sends every page image under `images_folder` to the model, collects the menu
# items it returns, and writes them to `output_csv_file`.
# Uses `client`, `encode_image`, `system_prompt` and `user_prompt` defined above.

MODEL_NAME = "gpt-4o"
MAX_RETRIES = 3             # attempts per image before it is skipped
MAX_OUTPUT_TOKENS = 4096    # raised from 2000 so a dense menu page is less likely to be cut off mid-JSON
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _natural_sort_key(path):
    """Sort key that compares embedded numbers numerically (page_2 before page_10)."""
    # re.split with a capturing group alternates text/number tokens, so the
    # same list position always holds the same type and comparison is safe.
    return [int(tok) if tok.isdigit() else tok.lower() for tok in re.split(r'(\d+)', path)]


def _extract_items_from_image(client, image_path):
    """Ask the model for the menu items on one page image.

    Retries up to MAX_RETRIES times (with exponential backoff) on API errors or
    malformed JSON. A reply containing no JSON array at all is not retried.

    Returns:
        A list of menu-item dicts; empty if nothing could be extracted.
    """
    # Declare the real media type: the pages produced by the conversion cell
    # are PNGs, so labelling everything image/jpeg was wrong.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
    data_url = f"data:{mime_type};base64,{encode_image(image_path)}"

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    # Instructions go in a system message; the user message
                    # carries the per-image request and the image itself.
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": user_prompt},
                            {"type": "image_url", "image_url": {"url": data_url}},
                        ],
                    },
                ],
                max_tokens=MAX_OUTPUT_TOKENS,
            )

            choice = response.choices[0]
            raw_content = choice.message.content
            if raw_content is None:
                # Raise a readable error (handled and retried below) instead of
                # letting re.search fail with an obscure TypeError.
                raise ValueError("the model returned no text content")

            if choice.finish_reason == "length":
                # Truncated output usually means the JSON array is incomplete.
                print(f"\nWarning: response for '{image_path}' was cut off at {MAX_OUTPUT_TOKENS} tokens.")

            # Find the JSON array within the response
            json_match = re.search(r'\[\s*\{.*\}\s*\]', raw_content, re.DOTALL)
            if json_match is None:
                # If no JSON is found, no point in retrying.
                print(f"\nCould not find a valid JSON array in the response for '{image_path}'.")
                print(f"Raw content: {raw_content[:200]}...")
                return []

            try:
                extracted_data = json.loads(json_match.group(0))
            except json.JSONDecodeError as e:
                # Fall through to the retry handling below
                print(f"\nError decoding JSON for '{image_path}': {e}")
            else:
                # Keep only object entries so stray values cannot corrupt the table
                return [item for item in extracted_data if isinstance(item, dict)]

        except Exception as e:
            # Catch general errors (e.g., network issues, API errors)
            print(f"\nError processing '{image_path}': {e}")

        if attempt < MAX_RETRIES:
            # Exponential backoff: 2s, 4s, ...
            print(f"Retrying... ({attempt}/{MAX_RETRIES})")
            time.sleep(2 ** attempt)
        else:
            print(f"Max retries reached for '{image_path}'. Skipping this image.")

    return []


all_items = []

# Collect all image file paths. Extensions are matched case-insensitively
# (consistent with the PDF check) and paths are sorted naturally so rows appear
# in page order; os.walk yields files in arbitrary order.
image_files = sorted(
    (
        os.path.join(root, name)
        for root, _dirs, names in os.walk(images_folder)
        for name in names
        if name.lower().endswith(IMAGE_EXTENSIONS)
    ),
    key=_natural_sort_key,
)

if not image_files:
    print(f"No image files found in '{images_folder}'. Please run the previous cells first.")
else:
    print(f"Found {len(image_files)} image(s) to process. Starting data extraction...")

    # Iterate through each image with a progress bar
    for image_path in tqdm(image_files, desc="Extracting data with GPT"):
        all_items.extend(_extract_items_from_image(client, image_path))

    # --- SAVE TO CSV ---
    if all_items:
        df = pd.DataFrame(all_items)
        df.to_csv(output_csv_file, index=False)
        print(f"\nExtraction complete! Data saved to '{output_csv_file}'.")
    else:
        print("\nNo data was extracted. The output CSV file was not created.")

Found 1 image(s) to process. Starting data extraction...


Extracting data with GPT: 100%|██████████| 1/1 [00:50<00:00, 50.56s/it]


Extraction complete! Data saved to 'menu_items.csv'.



