In [1]:
import easyocr
import fitz  # PyMuPDF
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
from groq import Groq
import os
from dotenv import load_dotenv


from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders.csv_loader import CSVLoader


from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_ollama.llms import OllamaLLM
from typing import Optional


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
load_dotenv()

## load the Groq API key
groq_api_key=os.environ['GROQ_API_KEY']
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")


In [3]:
flyer_content = ""
for page_num in range(24):
    file_path = f"../outputs/extracted_products_ocr/page_1/{page_num}.txt"
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            flyer_content += file.read()
    else:
        print(f"can not find a {file_path}")

can not find a ../outputs/extracted_products_ocr/page_1/4.txt
can not find a ../outputs/extracted_products_ocr/page_1/9.txt
can not find a ../outputs/extracted_products_ocr/page_1/13.txt
can not find a ../outputs/extracted_products_ocr/page_1/14.txt
can not find a ../outputs/extracted_products_ocr/page_1/18.txt
can not find a ../outputs/extracted_products_ocr/page_1/19.txt


In [5]:
for text in flyer_content.split("\n"):
    print(text)

```
IGA

Prix imbattables

KETCHUP AUX TOMATES
HEINZ (750 ml ou 1 L)
OU TARTINADE
KRAFT
MIRACLE WHIP (500 ml)
```L’heure  
du BBQ,  
c’est sacré.  

Contient  
4 sacs  
de 680 g  ```
Encore plus
d'offres en ligne
Even more offers in our digital flyer

Scène
PRIX CHOC
```I'm unable to extract text from the image as it is not visible to me. If you can provide the text or describe the content, I'd be happy to help with that!```
KETCHUP AUX TOMATES
HEINZ (750 mL ou 1 L)
ou TARTINADE
MIRACLE WHIP (890 mL)
TOMATO KETCHUP
OR SPREAD

4 97

CRÈME GLACÉE
HAAGEN-DAZS (400 mL ou 500 mL)
ou FRANDAISES GLACÉES
HAAGEN-DAZS
DESSERT GLACÉ
SORBET OU GELATO
NOSTRE DÉLICES
ICE CREAM, FROZEN DESSERTS,
SORBET OR GELATO
SÉLECTION VARIÉE

4 94

CAFÉ MOULU
MAXWELL HOUSE
GROUND COFFEE
``````
4 sacs de 680 g

ÉCONOCAISSE
CREVETTES NORDIQUES
SURGELÉES
CARAVELLE
Grosseur 250-350 size
FROZEN NORDIC
SHRIMP ÉCONOCAISSE
4 x 680 g

40$

Limite de 1 par client
Limit of 1 per client

Sans carte Scène*
Without Scene* card

In [6]:
model_local = OllamaLLM(model="llama3",
                   temperature=0.0)

model_groq=ChatGroq(groq_api_key=groq_api_key,
             model_name="llama-3.1-70b-versatile",
             temperature=0.0)



template = """
Please analyze the following text and organize it to identify products and their promotional prices. In this text, product descriptions may span multiple lines, and each product usually ends with a price listed on a new line. Some products also have descriptive details like sizes or variants on additional lines.

Please merge any split product descriptions and list each product with its corresponding price. If a price appears isolated, assign it to the preceding product description. Organize the results in a clear list format like this:

Product Name and Description: Price
Example:

Pepsi ou Coca-Cola, 12 x 355 ml Canettes: 8.97
Eau de source naturelle: 6.27
Here is the text:

{query}

Return the products with their descriptions and prices in a clear list format as shown.
"""


template_new = """
Please analyze the following text from a bilingual flyer (containing both English and French) to 
organize and extract product information. For each product, please provide:

Product Name or Brand Logo: Clearly indicate the product name or brand.
Price: Identify the price for each product.
Promotions or Discounts: If any special offers, discounts, or promotions are mentioned, include these details.
Category: Classify each product into one of the following categories:
Chicken
Fish
Pork
Beef
Vegetable
Fruit
Others (for products that don’t fit into the other categories).
Ensure that for each product:

Both English and French details are included when available, listing each language version for clarity.
All related information for the product (such as size or variant) is grouped together in one entry.
Please return the results in a structured list format, for example:

Example Format:

Product Name and Description (English / French):
Category: [Category]
Price: [Price]
Promotions: [Promotions or Discounts, if any]

Example:

Pepsi ou Coca-Cola, 12 x 355 ml Canettes
Category: Others
Price: 8.97
Promotions: None

Here is the text to analyze:

{query}


"""

prompt = PromptTemplate(
    template=template_new,
    input_variables=["query"],
    
)

prompt_and_model = prompt | model_groq

output = prompt_and_model.invoke({"query": flyer_content})
print(output.content)

extracted_text_path = f"../outputs/flyer_new.txt"
# Save the text to a file
with open(extracted_text_path, 'w') as file:
    file.write(output.content)

Here's the extracted product information in the requested format:

1. KETCHUP AUX TOMATES HEINZ (750 mL ou 1 L) ou TARTINADE MIRACLE WHIP (890 mL)
Category: Others
Price: 4.97
Promotions: None

2. CRÈME GLACÉE HAAGEN-DAZS (400 mL ou 500 mL) ou FRANDAISES GLACÉES HAAGEN-DAZS
Category: Others
Price: 4.94
Promotions: None

3. CAFÉ MOULU MAXWELL HOUSE
Category: Others
Price: 4.94
Promotions: None

4. ÉCONOCAISSE CREVETTES NORDIQUES SURGELÉES CARAVELLE
Category: Fish
Price: 40.00 (with Scene card), 50.00 (without Scene card)
Promotions: Limit of 1 per client, minimum 30 cases per order

5. CÔTELETTES DE LONGE DE VEAU DE LAIT FRAIS
Category: Beef
Price: 22.02/kg, 9.99/lb
Promotions: None

6. PEPSI OU COCA-COLA
Category: Others
Price: 8.97
Promotions: None

7. EAU DE SOURCE NATURELLE
Category: Others
Price: 6.27
Promotions: None

8. POULET ENTIERS FRAIS
Category: Chicken
Price: 4.39/kg (less than 2 kg)
Promotions: None

9. EAU DE SOURCE NATURELLE ESKA
Category: Others
Price: 3.27
Promotions: 

FileNotFoundError: [Errno 2] No such file or directory: './outputs/flyer_new.txt'

In [7]:
extracted_text_path = f"../outputs/flyer_new.txt"
# Save the text to a file
with open(extracted_text_path, 'w') as file:
    file.write(output.content)