In [None]:
!pip install openpyxl
!pip install --upgrade open-interpreter
!pip install pytesseract

In [2]:
# Databricks notebook source
import base64
import json
import os
import os.path
import re
import sys
import zipfile
from datetime import datetime

import openai
import openpyxl
import pandas as pd
import tiktoken
from langchain.docstore.document import Document



### Encode the image file and run QA pipeline

In [None]:
import base64
import os
from mimetypes import guess_type

import openai
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables via python-dotenv before the key is read below.
load_dotenv()

# The API key is read from the OPENAI_API_KEY environment variable (None when
# it is unset) and is both stored on the module-level `openai.api_key`
# attribute and passed explicitly to the client used for every request.
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai.api_key)
# Model name passed to client.chat.completions.create in the loop below.
model = "gpt-4o"


# Function to encode a local image into data URL
def local_image_to_data_url(image_path):
    """Return the file at ``image_path`` encoded as a base64 ``data:`` URL.

    Args:
        image_path: Path of the file to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``. The MIME type
        is guessed from the file name; ``application/octet-stream`` is used
        when no type can be guessed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    guessed_type = guess_type(image_path)[0]
    mime_type = "application/octet-stream" if guessed_type is None else guessed_type

    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    payload = base64.b64encode(raw_bytes).decode("utf-8")

    return f"data:{mime_type};base64,{payload}"


# Send each selected image under <base_dir>/figures to the chat model together
# with a fixed extraction prompt and collect the raw text replies in
# `responses` as {"image": <file name>, "response": <model reply>} dicts.
base_dir = "data/raw/"
data_urls = []
responses = []
path = os.path.join(base_dir, "figures")

# Number of images to send; kept at 1 as in the original exploratory run.
max_images = 1

# NOTE(review): the system prompt names `gpt-4-vision-preview` while `model`
# is set elsewhere in this notebook (to "gpt-4o") -- confirm whether the
# wording should be updated. It is left unchanged here.
system_prompt = """
            You are `gpt-4-vision-preview`, the OpenAI model that is trained
            to interpret images and can describe images provided by the user
            in detail. The user has attached an image to this message for
            you to answer a question, there is definitely an image attached,
            you will never reply saying that you cannot see the image
            because the image is absolutely and always attached to this
            message. Answer the question asked by the user based on the
            image provided. Do not give any further explanation. Do not
            reply saying you can't answer the question. The answer has to be
            in a JSON format. If the image provided does not contain the
            necessary data to answer the question, return 'null' for that
            key in the JSON to ensure consistent JSON structure. 
            
            """

# Fixes relative to the previous prompt text:
#   * the field list now says "drink" (it said "dish"/"appetizer dish" while
#     the example and the downstream columns use "drink");
#   * the example uses single braces -- this is a plain string, not an
#     f-string/format template, so "{{" was sent to the model literally;
#   * the trailing comma that made the example invalid JSON is gone;
#   * the duplicated "The output must be in JSON format" sentence is removed.
# NOTE(review): the "SALAD&SOUP" guideline looks like a leftover from another
# prompt -- confirm it is intended for a drinks extraction.
user_prompt = """ 
        
            You are tasked with accurately interpreting detailed charts and
            text from the images provided to you. You will focus on extracting the price for all the DRINKS from the menu.

            Guidelines:
            - Include all the drinks in the menu
            - Do not skip any drinks in the SALAD&SOUP category
            Response Format: 

            The output must be in JSON format, with the following structure and fields strictly adhered to: 
            - drink: the name of the drink
            - price: the price of the drink
            - currency: the currency
        
            Example of the answer:
                [{
                "drink": "Orange Juice",
                "price": 6,
                "currency":"$"
                }]
            
                
            """

if os.path.isdir(path):  # Ensure it's a directory
    # Sort for a reproducible selection (os.listdir order is arbitrary) and
    # keep regular files only so a sub-directory is never sent as an image.
    image_files = sorted(
        name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))
    )
    for image_file in image_files[:max_images]:
        image_path = os.path.join(path, image_file)
        try:
            print(f"processing image: {image_path}")
            data_url = local_image_to_data_url(image_path)
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": user_prompt},
                            {"type": "image_url", "image_url": {"url": data_url}},
                        ],
                    },
                ],
                max_tokens=3000,
            )
            content = response.choices[0].message.content

            responses.append({"image": image_file, "response": content})
        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f"error processing image {image_path}: {e}")
responses

processing image: /dbfs/FileStore/projects/WSsustainability/data/validation/ISA/adobe_raw/gpt-are-gpt/figures/f83e9d37272532debb6009ab38ee7450.jpg
[{'image': 'f83e9d37272532debb6009ab38ee7450.jpg',
  'response': '```json\n[\n    {\n        "drink": "Purified Water",\n        "price": 3.99,\n        "currency": "$"\n    },\n    {\n        "drink": "Sparkling Water",\n        "price": 3.99,\n        "currency": "$"\n    },\n    {\n        "drink": "Soda In A Bottle",\n        "price": 4.50,\n        "currency": "$"\n    },\n    {\n        "drink": "Orange Juice",\n        "price": 6.00,\n        "currency": "$"\n    },\n    {\n        "drink": "Fresh Lemonade",\n        "price": 7.50,\n        "currency": "$"\n    }\n]\n```'}]

### Parsing the output

In [67]:
def format_output_qa(output, debug=False):
    """Parse a model reply that contains JSON into a pandas DataFrame.

    The reply may be bare JSON or JSON wrapped in a Markdown code fence
    (```json ... ``` or ``` ... ```), optionally surrounded by prose. When a
    complete fenced block is present only its contents are parsed; otherwise
    the fence markers are stripped from the whole reply. Newlines are removed
    before parsing in both cases.

    Args:
        output: Raw text returned by the model.
        debug: When exactly ``True``, return the cleaned JSON text instead of
            a DataFrame.

    Returns:
        A DataFrame built from the parsed JSON. A single flat JSON object
        (all values scalar) becomes a one-row frame. If anything fails, a
        one-row DataFrame with a single ``error`` column holding the
        exception message is returned. With ``debug=True`` the cleaned
        string is returned instead.
    """
    print(f"Raw model output: {output}")
    try:
        # Prefer the contents of a complete fenced block so that any prose
        # the model adds before/after the fence does not break json.loads.
        fenced = re.search(r"```(?:json)?(.*?)```", output, flags=re.DOTALL)
        if fenced is not None:
            output_text = fenced.group(1).replace("\n", "")
        else:
            # No complete fence (e.g. bare JSON or a truncated reply):
            # strip whatever fence markers are present.
            output_text = output.replace("\n", "")
            output_text = output_text.replace("```json", "")
            output_text = output_text.replace("```", "")
        if debug is True:
            return output_text
        # Now load it into Python objects
        output_dict = json.loads(output_text)
        # pd.DataFrame rejects a dict whose values are all scalars unless an
        # index is given; treat such an object as a single record.
        if (
            isinstance(output_dict, dict)
            and output_dict
            and not any(isinstance(v, (dict, list)) for v in output_dict.values())
        ):
            output_dict = [output_dict]
        # Create a df
        df = pd.DataFrame(output_dict)
    except Exception as e:
        print(f"Error processing output: {e}")
        df = pd.DataFrame({"error": str(e)}, index=[0])
    return df


# Parse every collected response and stack the per-image frames into a single
# DataFrame. Frames are gathered in a list and concatenated once, which avoids
# re-copying the accumulated frame on every iteration and avoids concatenating
# onto an empty seed DataFrame.
parsed_frames = []
for response_dict in responses:
    response = response_dict["response"]
    df = format_output_qa(response)
    # Tag each row with the image file it was extracted from.
    df["image"] = response_dict["image"]
    parsed_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_output = (
    pd.concat(parsed_frames, ignore_index=True) if parsed_frames else pd.DataFrame()
)

df_output

Unnamed: 0,drink,price,currency,image
0,Purified Water,3.99,$,f83e9d37272532debb6009ab38ee7450.jpg
1,Sparkling Water,3.99,$,f83e9d37272532debb6009ab38ee7450.jpg
2,Soda In A Bottle,4.5,$,f83e9d37272532debb6009ab38ee7450.jpg
3,Orange Juice,6.0,$,f83e9d37272532debb6009ab38ee7450.jpg
4,Fresh Lemonade,7.5,$,f83e9d37272532debb6009ab38ee7450.jpg
