In [None]:
from ollama import chat
from pydantic import BaseModel
from typing import List, Literal, Optional
import numpy as np
import os
from dotenv import load_dotenv
from PIL import Image
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

# Load variables from a local .env file (python-dotenv) before reading the environment.
load_dotenv()

# Root directory of the project; every other path below is derived from it.
PROJECT_DIR = os.getenv("PROJECT_DIR")
if PROJECT_DIR is None:
    # Without this check the first os.path.join(None, ...) below fails with an
    # opaque TypeError that does not mention the missing variable.
    raise RuntimeError(
        "Environment variable PROJECT_DIR is not set; define it in the environment or in a .env file."
    )

DEAL_ANNOTATION_PATH = os.path.join(PROJECT_DIR, "information_extraction", "labeled_deals_all.csv")
LEAFLET_DIR = os.path.join(PROJECT_DIR, "crawled_leaflets")
# NOTE: the name is misspelled ("DABASE"); kept unchanged because other cells or
# notebooks may reference it.
DABASE_PATH = os.path.join(LEAFLET_DIR, "supermarket_leaflets.db")
DEALS_DIR = os.path.join(PROJECT_DIR, "deals")
MODELS_DIR = os.path.join(PROJECT_DIR, "models")

In [None]:
# Collect the deal images under DEALS_DIR that have a row in the annotation CSV.
all_images = []
labeled_deals = pd.read_csv(DEAL_ANNOTATION_PATH)["img_name"].tolist()
# Set copy for O(1) membership checks; `labeled_deals` itself stays a list.
labeled_deal_names = set(labeled_deals)

for root, dirs, files in os.walk(DEALS_DIR):
    for file in files:
        # Only raw PNG files; files with "annotated" in the name are skipped.
        if not file.endswith(".png") or "annotated" in file:
            continue
        # img_name may be stored with or without the extension. Strip only the
        # trailing ".png" (str.replace would also remove it from the middle of a name).
        stem = file[: -len(".png")]
        if file in labeled_deal_names or stem in labeled_deal_names:
            all_images.append(os.path.join(root, file))

# os.walk yields entries in filesystem order; sort so the list (and everything
# derived from it) is the same on every run and machine.
all_images.sort()

print(f"Total images: {len(all_images)}")

In [None]:
# Draw one path at random from the collected images (no seed is set, so the
# pick differs between runs).
rnd_image = np.random.choice(all_images)
# Open the chosen file and convert it to RGB mode for display.
loaded_image = (
    Image.open(rnd_image)
    .convert("RGB")
)

In [None]:
class DealDescription(BaseModel):
    """Structured fields extracted from one supermarket deal image."""

    # Every field defaults to None, so every annotation is Optional: the type
    # (and the JSON schema generated from it) then admits the "not available"
    # value instead of declaring a plain str/float that can never be None.
    brand: Optional[str] = None
    productname: Optional[str] = None
    # Price without discount; exported as "original_price" by process_data.
    unbinding_price_recommendation: Optional[float] = None
    deal_price: Optional[float] = None
    # Free-text amount, e.g. "500g" or "1 piece", hence a string.
    weight: Optional[str] = None

In [None]:
def process_data(img_paths, model):
    """Extract structured deal fields from each image by sending it to a vision model via `chat`.

    Args:
        img_paths: Iterable of image file paths; each path is sent as the only
            image of its own chat request.
        model: Model name passed through to `chat` (e.g. "llama3.2-vision").

    Returns:
        pd.DataFrame with one row per image, in input order, and the columns
        img_name, brand, productname, original_price, deal_price, weight.
        `original_price` holds the parsed `unbinding_price_recommendation`.
        An empty `img_paths` yields an empty frame that still has these columns.

    Raises:
        Nothing is caught here: any error raised by `chat` or by
        `DealDescription.model_validate_json` propagates and no partial
        frame is returned.
    """
    columns = ("img_name", "brand", "productname", "original_price", "deal_price", "weight")
    # Pre-create every column so the result has a stable layout even for empty input.
    data_dict = {column: [] for column in columns}

    # Loop-invariant: the response schema is identical for every request.
    response_schema = DealDescription.model_json_schema()

    # The whitespace inside this literal is part of the prompt text sent to the
    # model; it is kept exactly as it was.
    prompt = """
                            You are an advanced vision language model specializing in structured data extraction from an image of a deal. Your task is to extract structured information from a supermarket deal.

                            Extraction Fields:
                                - brand: The brand name, if available (e.g. "Coca Cola", "Milka", "Nestle", "Müller", "Iglo").
                                - productname: The name of the product without mentioning the brand and description.
                                - unbinding_price_recommendation: The price of the product without discount, if given (e.g. 2.99, 3.50, 1.99).
                                - deal_price: The price of the product on sale (e.g. 1.99, 2.50, 0.99). Never negative.
                                - weight: The amount of the product, if given (e.g. 500g, 1kg, 1 piece).
                            """

    for img_path in tqdm(img_paths):
        response = chat(
            model=model,
            format=response_schema,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                    "images": [img_path],
                },
            ],
            options={"temperature": 0},
        )
        image_description = DealDescription.model_validate_json(response.message.content)

        # Append only after the request and validation succeeded, so all
        # columns always have the same length.
        data_dict["img_name"].append(os.path.basename(img_path))
        data_dict["brand"].append(image_description.brand)
        data_dict["productname"].append(image_description.productname)
        data_dict["original_price"].append(image_description.unbinding_price_recommendation)
        data_dict["deal_price"].append(image_description.deal_price)
        data_dict["weight"].append(image_description.weight)

    return pd.DataFrame(data_dict).reset_index(drop=True)

In [None]:
# Batch run with the "llama3.2-vision" tag over all collected images; the
# predictions are written to a CSV in the current working directory and displayed.
llama3_2_vision_df = process_data(img_paths=all_images, model="llama3.2-vision")
llama3_2_vision_df.to_csv(path_or_buf="llama3_2-vision_results.csv", index=False)
display(llama3_2_vision_df)

In [None]:
# Batch run with the "minicpm-v" model over all collected images; the
# predictions are written to a CSV in the current working directory and displayed.
minicpm_v_df = process_data(img_paths=all_images, model="minicpm-v")
minicpm_v_df.to_csv(path_or_buf="minicpm-v_results.csv", index=False)
display(minicpm_v_df)

In [None]:
# Batch run with the "llama3.2-vision:11b-instruct-q8_0" tag over all collected
# images; the predictions are written to a CSV in the current working directory
# and displayed.
llama3_2_vision_11b_instuct_q8_df = process_data(
    img_paths=all_images,
    model="llama3.2-vision:11b-instruct-q8_0",
)
llama3_2_vision_11b_instuct_q8_df.to_csv(
    path_or_buf="llama3_2-vision_11b-instruct-q8_0_results.csv",
    index=False,
)
display(llama3_2_vision_11b_instuct_q8_df)

In [None]:
# Single-image check: run "llama3.2-vision" on the randomly chosen image (rnd_image),
# print the parsed result and show the image below it.
response = chat(
    model="llama3.2-vision",
    format=DealDescription.model_json_schema(),  # Pass in the schema for the response
    messages=[
        {
            "role": "user",
            # The field names in the prompt must match the keys of the schema
            # passed via `format`: the schema has `unbinding_price_recommendation`,
            # not `original_price`.
            "content": (
                """
                You are an advanced vision language model specializing in structured data extraction from an image of a deal. Your task is to extract structured information from a supermarket deal.

                Extraction Fields:
                    - brand: The brand name, if available (e.g. "Coca Cola", "Milka", "Nestle", "Müller", "Iglo").
                    - productname: The name of the product without mentioning the brand and description.
                    - unbinding_price_recommendation: The original price of the product, if given (often strikethrough or the UVP).
                    - deal_price: The price of the product on sale (e.g. 1.99, 2.50, 0.99). Never negative.
                    - weight: The amount of the product, if given (e.g. 500g, 1kg, 1 piece).
                """
            ),
            "images": [rnd_image],
        },
    ],
    options={"temperature": 0}, # Set the temperature to 0 to get deterministic results
)

# Parse and validate the JSON reply against the DealDescription model.
image_description = DealDescription.model_validate_json(response.message.content)
print(image_description)
loaded_image

In [None]:
# Single-image check: run "minicpm-v" on the randomly chosen image (rnd_image),
# print the parsed result and show the image below it.
response = chat(
    model="minicpm-v",
    format=DealDescription.model_json_schema(),  # Pass in the schema for the response
    messages=[
        {
            "role": "user",
            # The field names in the prompt must match the keys of the schema
            # passed via `format`: the schema has `unbinding_price_recommendation`,
            # not `original_price`.
            "content": (
                """
                You are an advanced vision language model specializing in structured data extraction from an image of a deal. Your task is to extract structured information from a supermarket deal.

                Extraction Fields:
                    - brand: The brand name, if available (e.g. "Coca Cola", "Milka", "Nestle", "Müller", "Iglo").
                    - productname: The name of the product without mentioning the brand and description.
                    - unbinding_price_recommendation: The original price of the product, if given (often strikethrough or the UVP).
                    - deal_price: The price of the product on sale (e.g. 1.99, 2.50, 0.99). Never negative.
                    - weight: The amount of the product, if given (e.g. 500g, 1kg, 1 piece).
                """
            ),
            "images": [rnd_image],
        },
    ],
    options={"temperature": 0}, # Set the temperature to 0 to get deterministic results
)

# Parse and validate the JSON reply against the DealDescription model.
image_description = DealDescription.model_validate_json(response.message.content)
print(image_description)
loaded_image

In [None]:
# Single-image check: run "llama3.2-vision:11b-instruct-q8_0" on the randomly
# chosen image (rnd_image), print the parsed result and show the image below it.
response = chat(
    model="llama3.2-vision:11b-instruct-q8_0",
    format=DealDescription.model_json_schema(),  # Pass in the schema for the response
    messages=[
        {
            "role": "user",
            # The field names in the prompt must match the keys of the schema
            # passed via `format`: the schema has `unbinding_price_recommendation`,
            # not `original_price`.
            "content": (
                """
                You are an advanced vision language model specializing in structured data extraction from an image of a deal. Your task is to extract structured information from a supermarket deal.

                Extraction Fields:
                    - brand: The brand name, if available (e.g. "Coca Cola", "Milka", "Nestle", "Müller", "Iglo").
                    - productname: The name of the product without mentioning the brand and description.
                    - unbinding_price_recommendation: The original price of the product, if given (often strikethrough or the UVP).
                    - deal_price: The price of the product on sale (e.g. 1.99, 2.50, 0.99). Never negative.
                    - weight: The amount of the product, if given (e.g. 500g, 1kg, 1 piece).
                """
            ),
            "images": [rnd_image],
        },
    ],
    options={"temperature": 0}, # Set the temperature to 0 to get deterministic results
)

# Parse and validate the JSON reply against the DealDescription model.
image_description = DealDescription.model_validate_json(response.message.content)
print(image_description)
loaded_image