In [12]:
import sys
from pathlib import Path
import os
import base64
from groq import Groq

from dotenv import load_dotenv
import json
import re
from datetime import date

# Load settings via python-dotenv (presumably from a local .env file — confirm
# where the file lives) before anything reads the environment.
load_dotenv()

# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is assumed to be started from a sub-directory of the project.
project_root = Path().resolve().parent

# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries in a long-lived kernel.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Receipt photo used as the input image for the rest of the notebook.
IMAGE_PATH = project_root / "data/raw/lidl/WhatsApp Image 2025-09-09 at 12.48.55.jpeg"


In [None]:
# Reference: Groq vision (image input) documentation — https://console.groq.com/docs/vision

In [3]:
# Report whether IMAGE_PATH points at an existing regular file before it is read.
print(
    "Valid: file exists"
    if os.path.isfile(IMAGE_PATH)
    else "Invalid: not a file or doesn’t exist"
)

Valid: file exists


In [4]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode and read in full; the bytes are
    base64-encoded and the result is decoded to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [5]:
# Read the receipt image at IMAGE_PATH once and keep its base64 text; it is
# embedded in the data URL of the prompt's image_url content part.
base64_image = encode_image(IMAGE_PATH)

In [34]:
# Groq and OpenAI both follow the Chat Completions API format, which is
# structured around a list of messages. The structure is nested lists and dicts:
#   - outer list   -> the messages that make up the prompt
#   - each message -> a dict with a "role" and a "content" entry
#   - content list -> the content parts of that message (text, image, etc.)

prompt = [
    # System message: sets the assistant's task.
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are a helpful assistant that extracts structured data from receipts.",
            },
        ],
    },
    # User message: the extraction instructions followed by the receipt image.
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                # The instruction fragments are joined with single spaces into one string.
                "text": " ".join(
                    [
                        "Extract all items from this receipt and return the data in a JSON table format",
                        "with the following columns: product, quantity, price_unit, price_total, discount_amount, discount_ind.",
                        "If a field is not present, use null.",
                        "If a line represents a discount, add the value to the discount_amount of the item above and mark the discount_ind = 'J' for the item above. Do not change other values of the item above.",
                        "Do not include discount lines as separate rows.",
                        "Do not include any extra text, just return the JSON.",
                    ]
                ),
            },
            {
                "type": "image_url",
                # The image travels inline as a base64 data URL.
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}",
                },
            },
        ],
    },
]

In [35]:
# Create the Groq client with no explicit arguments (presumably it picks up its
# API key from the environment — confirm), then send the prompt to the model.
client = Groq()
chat_completion = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    messages=prompt,
)

# Keep only the text of the first returned choice; it is parsed in a later cell.
message_content = chat_completion.choices[0].message.content

In [9]:
def parse_str(message_content: str):
    """Decode the model's reply as JSON, tolerating Markdown code fences.

    Two readings of the reply are tried in order, and the first one that
    decodes wins:

    1. The whole reply with an optional leading fence/label (e.g. "```json")
       and an optional trailing fence removed.
    2. The contents of the first fenced block found anywhere in the reply,
       which covers replies that wrap the JSON in extra prose.

    Args:
        message_content: Raw text returned by the model. A non-string value
            (e.g. ``None``) is treated as a parse failure.

    Returns:
        The decoded JSON value on success. On failure the string
        "There was some error parsing the model's output" is returned instead
        of raising, so callers should check the result type before indexing.
    """
    error_message = "There was some error parsing the model's output"

    if not isinstance(message_content, str):
        # Nothing to parse; report it the same way as any other parse failure
        # instead of raising AttributeError on .strip().
        return error_message

    text = message_content.strip()

    # str.strip("```json") removes any of the characters `, j, s, o, n from
    # both ends (so a bare "null" became "ull"); match the fence literally.
    unfenced = re.sub(r"\A`*(?:json)?", "", text, flags=re.IGNORECASE)
    unfenced = re.sub(r"`*\Z", "", unfenced)
    candidates = [unfenced]

    # Fallback: a fenced block embedded in surrounding prose. The closing
    # fence is optional so a truncated reply can still be decoded.
    embedded = re.search(
        r"```(?:json)?(.*?)(?:```|\Z)", text, flags=re.DOTALL | re.IGNORECASE
    )
    if embedded:
        candidates.append(embedded.group(1))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Try the next reading of the reply, if there is one.
            continue
        except Exception as e:
            print(f"Unexpected error: {e}")
            return error_message

    return error_message

In [36]:
# Decode the model reply. Note that parse_str returns an error *string* (not a
# list) when decoding fails, in which case the index below yields one character.
parsed_output = parse_str(message_content)
# Spot-check a single extracted row; index 5 looks like an arbitrary pick — TODO confirm.
print(parsed_output[5])

{'product': 'Rasp kaas jong bele', 'quantity': 2, 'price_unit': 2.99, 'price_total': 5.98, 'discount_amount': None, 'discount_ind': None}


In [38]:
# Loop over all receipts lines and write to database
for receipt_line in parsed_output:
    
    # define variables to insert into database
    filename = "x"
    receipt_date = "9999-12-31"
    store_cod = "LIDL"
    load_date = date.today()
    submitted_by = "jjvanderuitenbeek"

    # transform values to database types
    quantity = (receipt_line['quantity'] or 1)
    product = receipt_line['product'].strip()
    price_unit = receipt_line['price_unit']
    price_total = quantity * price_unit
    discount_ind = (receipt_line['discount_ind'] or 'N')