In [1]:
import os
import cv2
import sys
import re
import subprocess
import pytesseract
import numpy as np
import torch
import torch.nn as nn
# import torch.optim as optim
from PIL import Image
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
# import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from torchvision import transforms
import json
import csv
import traceback
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import logging
import openfoodfacts
import openfoodfacts.products as products
from typing import List

In [2]:
# Set up Tesseract OCR
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the folder containing PNG images
folder_path = "C:/Users/hseit/Desktop/SummerMCREU/images/"
folder_path_after = "C:/Users/hseit/Desktop/SummerMCREU/imagesAfter/"
extracted_data = []
global keywords
keywords = {
    "product_type": {
        "food": ["food", "nutrition", "diet"],
        "skincare": ["skincare", "beauty", "cosmetics"],
        "medication": ["medication", "drug", "pharmaceutical"]
    },
    "directions": ["directions", "usage", "how to use"],
    "supplements_or_elements": ["supplements", "drug facts", "ingredients"],
    "warnings": ["warnings", "cautions", "precautions"]
}

In [3]:
class CNN(nn.Module):
    def __init__(self, hidden_size):
        super(CNN, self).__init__()
        # Define the CNN layers
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(9472, hidden_size)

    def forward(self, x):
        x = x.float()  # Convert the input tensor to float
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        input_size = x.size(1)  # Get the output size of the last convolutional layer
        x = self.fc(x)
        return x, input_size

# Define the RNN architecture for sequence recognition
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes+1)  # Increase num_classes by 1 to account for the 'unseen' label

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# Define the CRNN model integrating the CNN and RNN components
class CRNN(nn.Module):
    def __init__(self, cnn, rnn):
        super(CRNN, self).__init__()
        self.cnn = cnn
        self.rnn = rnn

    def forward(self, x):
        features = self.cnn(x)
        features = features[0].view(features[0].size(0), -1, features[0].size(1))
        output = self.rnn(features)
        return output

In [4]:
class ImageTextDataset(Dataset):
    def __init__(self, data):
        if isinstance(data, list) and all(isinstance(item, dict) for item in data):
            self.data = data
        else:
            raise ValueError("Input data should be a list of dictionaries.")

        self.transform = transforms.ToTensor()

    def __getitem__(self, index):
        image_path = self.data[index]['image_path']
        extracted_text = self.data[index]['extracted_text']
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        image = cv2.resize(image, (100, 75))
        image = self.transform(image)

        extracted_dict = {
            "image_path": image_path,
            "extracted_text": extracted_text,
            "product_type": "",
            "directions": "",
            "supplements_or_elements": "",
            "warnings": ""
        }

        lines = extracted_text.split('\n')

        for category, category_keywords in keywords.items():
            if category == 'product_type':
                for product_type, product_type_keywords in category_keywords.items():
                    for line in lines:
                        if any(keyword.lower() in line.lower() for keyword in product_type_keywords):
                            extracted_dict[category] = product_type
                            lines.remove(line)
                            break
            else:
                for keyword in category_keywords:
                    for line in lines:
                        if keyword.lower() in line.lower():
                            extracted_dict[category] = line
                            lines.remove(line)
                            break

        extracted_dict['extracted_text'] = '\n'.join(lines)

        return image, extracted_dict


In [5]:
def collate_fn(batch):
    # Initialize lists to store images and raw labels
    images = []
    raw_labels = []

    # Iterate over each (img, label) pair in the batch
    for img, label in batch:
        # Convert the tensor to a PIL image if it's a tensor
        if isinstance(img, torch.Tensor):
            img = transforms.ToPILImage()(img)

        # Convert the image to grayscale
        img = img.convert("L")

        # Resize the image to a fixed width while maintaining the aspect ratio
        img = img.resize((32, img.height))

        # Convert the image to a numpy array
        img = np.array(img)

        # Expand dimensions if the image is grayscale
        if img.ndim == 2:
            img = np.expand_dims(img, axis=2)
        # Concatenate the image along the third axis if it has a single channel
        elif img.shape[2] == 1:
            img = np.concatenate([img] * 3, axis=2)

        # Transpose the image to match the expected input shape of the model
        img = img.transpose((2, 0, 1))

        # Convert the numpy array to a PyTorch tensor
        img = torch.from_numpy(img)

        # Add the image tensor to the list of images
        images.append(img)

        # Store the raw labels in the raw_labels list
        raw_labels.append(label)

    # Stack the image tensors along the batch dimension
    images = torch.stack(images, dim=0)

    # Preprocess the raw labels by applying the preprocess_label function and filtering out unseen labels
    processed_labels = [preprocess_label(label) for label in raw_labels if preprocess_label(label) in label_encoder.classes_]

    # Transform the processed labels to numerical encodings using the label_encoder
    labels = label_encoder.transform(processed_labels)

    # Convert the labels to a PyTorch tensor of type Long
    labels = torch.tensor(labels, dtype=torch.long)

    # Return the batch of images and labels
    return images, labels


In [6]:
def get_product_info(product_name):
    search_results = openfoodfacts.products.search(product_name)
    if search_results and 'products' in search_results:
        products = search_results['products']
        if products:
            return products[0]  # Return the first product
    return {}  # Return an empty dictionary if no results or unexpected structure

# Create a function to process the extracted text and retrieve additional product information
def process_extracted_text(extracted_text):
    # Clean the extracted text
    cleaned_text = clean_text(extracted_text, None)

    # Retrieve product information from Open Food Facts based on the extracted text
    product_info = get_product_info(cleaned_text)

    return product_info

In [7]:
# Initialize lemmatizer and define stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text, category):
    if pd.isna(text):
        return ''
    text = str(text)
    text = text.lower()

    # List of keywords to preserve for 'warnings' category
    warnings_keywords = ['broken skin', 'do not apply']

    if category == 'warnings':
        # Exclude specific keywords from the general cleaning process
        for keyword in warnings_keywords:
            text = text.replace(keyword, f'__{keyword}__')
    else:
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Remove special symbols
        text = re.sub(r'[^\w\s]', '', text)
        
        # Remove special symbols
        text = text.encode('ascii', 'ignore').decode('ascii')

        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        text = " ".join(tokens)

    return text

In [8]:
# Create output folders if they don't exist
output_extracted_folder = os.path.join(folder_path_after, "extracted_images")
output_not_extracted_folder = os.path.join(folder_path_after, "not_extracted_images")
os.makedirs(output_extracted_folder, exist_ok=True)
os.makedirs(output_not_extracted_folder, exist_ok=True)

In [9]:
# sharpen_kernel = np.array([[0, -1, 0],
#                            [-1, 5, -1],
#                            [0, -1, 0]], dtype=np.float64)

# better
sharpen_kernel = np.array([[0, -0.5, 0],
                           [-0.5, 3, -0.5],
                           [0, -0.5, 0]], dtype=np.float64)
# less but worse
# sharpen_kernel = np.array([[0, -0.3, 0],
#                            [-0.3, 2, -0.3],
#                            [0, -0.3, 0]], dtype=np.float64)
global text_threshold
text_threshold = 8

keywords = {
    "product_type": {
        "food": ["food", "nutrition", "diet"],
        "skincare": ["skincare", "beauty", "cosmetics"],
        "medication": ["medication", "drug", "pharmaceutical"]
    },
    "directions": ["directions", "usage", "how to use"],
    "supplements_or_elements": ["supplements", "drug facts", "ingredients"],
    "warnings": ["warnings", "cautions", "precautions"]
}

# Assign numerical code to product type
product_type_map = {
    'food': 1,
    'skincare': 2,
    'medication': 3
}

# Set up logging
logging.basicConfig(filename='error.log', level=logging.ERROR)

In [10]:
def image_save_textLength(extracted_text, image, image_path, text_threshold, folder_path_after):
    text_length = len(extracted_text)

    if text_length >= text_threshold:
        # Save the image in the 'extracted_images' folder
        folder = "extracted_images"
    else:
        # Save the image in the 'not_extracted_images' folder
        folder = "not_extracted_images"
    
    output_folder = os.path.join(folder_path_after, folder)
    output_path = os.path.join(output_folder, os.path.basename(image_path))
    cv2.imwrite(output_path, image)

# def extract_data(keywords, lines, image_path, extracted_text):
    
#     extracted_dict = {}
#     # Match keywords to determine the feature type
#     for feature, feature_keywords in keywords.items():
#         if feature == 'product_type':
#             for product_type, product_type_keywords in feature_keywords.items():
#                 for line in lines:
#                     if any(keyword.lower() in line.lower() for keyword in product_type_keywords):
#                         extracted_dict[feature] = product_type
#                         break
#         else:
#             for keyword in feature_keywords:
#                 for line in lines:
#                     if keyword.lower() in line.lower():
#                         extracted_dict[feature] = line
#                         break

#     # Store the extracted data
#     data = {
#         "image_path": image_path,
#         "extracted_text": extracted_text,
#         **extracted_dict
#     }

#     return data
def extract_data(keywords, lines, image_path, extracted_text):
    extracted_dict = {}

    # Match keywords to determine the feature type
    for feature, feature_keywords in keywords.items():
        if feature == 'product_type':
            for product_type, product_type_keywords in feature_keywords.items():
                for line in lines:
                    # Check if any keyword in the current line matches the product type keywords
                    if any(keyword.lower() in line.lower() for keyword in product_type_keywords):
                        # Assign the product type to the feature
                        extracted_dict[feature] = product_type
                        break
        else:
            # Combine all lines into a single string
            all_lines_text = ' '.join(lines)
            
            # Find the first occurrence of the keyword in the combined text
            for keyword in feature_keywords:
                if keyword.lower() in all_lines_text.lower():
                    # Extract the substring after the keyword
                    start_index = all_lines_text.lower().index(keyword.lower()) + len(keyword)
                    extracted_value = all_lines_text[start_index:].strip()
                    
                    # Assign the extracted value to the feature
                    extracted_dict[feature] = extracted_value
                    break

    # Store the extracted data
    data = {
        "image_path": image_path,
        "extracted_text": extracted_text,
        **extracted_dict
    }

    return data


In [25]:
def process_image(image_path):
    try:
        print("Processing image:", image_path)

        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        target_size = max(image.shape[0], image.shape[1])
        scaling_factor = target_size / max(image.shape[0], image.shape[1])
        resized_image = cv2.resize(image, (int(image.shape[1] * scaling_factor), int(image.shape[0] * scaling_factor)))

        grayscale_image = cv2.cvtColor(resized_image, cv2.COLOR_RGB2GRAY)
        
        # Calculate the ratio of black pixels
        black_ratio = np.count_nonzero(grayscale_image <= 1) / (grayscale_image.shape[0] * grayscale_image.shape[1])
        
        if black_ratio > 0.95:  # If the image is mostly black and white
            
            # Contrast enhancement
            contrast_img = cv2.convertScaleAbs(grayscale_image, alpha=1.5, beta=30)
            
            # Noise removal
            filtered_img = cv2.medianBlur(contrast_img, 5)
            
            # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) for even lighting
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
            equalized_image = clahe.apply(filtered_img)
            # Otsu's thresholding
            _, thresh_img = cv2.threshold(equalized_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            
            # Dilation and Erosion
            kernel = np.ones((1, 1), np.uint8)
            thresh_img = cv2.dilate(thresh_img, kernel, iterations=1)
            thresh_img = cv2.erode(thresh_img, kernel, iterations=1)
            extracted_text = pytesseract.image_to_string(thresh_img, config='--psm 6 alphanumeric').strip()
            extracted_text = clean_text(extracted_text)
            
            if len(extracted_text) < text_threshold:
                inverted_image = cv2.bitwise_not(resized_image)
                inverted_extracted_text = pytesseract.image_to_string(inverted_image).strip()
                inverted_extracted_text = clean_text(inverted_extracted_text)
                extracted_text = inverted_extracted_text
                image = inverted_image
                
            image_save_textLength(extracted_text, image, image_path, 10, folder_path_after)

#             Here we assume that `extracted_text` contains a unique identifier for the product
            product_info = products.get(extracted_text)  # Fetch the product info from OpenFoodFacts

            # Add the fetched product info to the data dictionary
            data = {
                "image_path": image_path,
                "extracted_text": extracted_text,
                "product_info": product_info
            }
            return grayscale_image, data
        else:  # For color images
            
            # Convert the image to LAB color space
            lab_img = cv2.cvtColor(resized_image, cv2.COLOR_RGB2Lab)
            
            # Split the LAB image into L, A and B channels
            l, a, b = cv2.split(lab_img)
            
            # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) for even lighting
            # Apply CLAHE to L channel
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
            cl = clahe.apply(l)
            
            # Merge the CLAHE enhanced L channel with the A and B channels
            limg = cv2.merge((cl,a,b))
            
            # Convert image from LAB Color model to RGB model
            enhanced_img = cv2.cvtColor(limg, cv2.COLOR_Lab2RGB)
            gray = cv2.cvtColor(enhanced_img, cv2.COLOR_RGB2GRAY)
            _, thresholded = cv2.threshold(gray, 170, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            
            # Perform text detection
            contours, _ = cv2.findContours(thresholded.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            text_regions = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if w > 10 and h > 10:  # Filter out small contours
                    text_regions.append((x, y, w, h))

            # Find the contour with maximum area
            if text_regions:
                max_contour = max(text_regions, key=lambda r: r[2] * r[3])
                x, y, w, h = max_contour
                cropped_image = thresholded[y:y+h, x:x+w]
                extracted_text = pytesseract.image_to_string(cropped_image).strip()
                text_length = len(extracted_text)

                image_save_textLength(extracted_text, image, image_path, 10, folder_path_after)
                lines = extracted_text.split('\n')
                data = extract_data(keywords, lines, image_path, extracted_text)

                return cropped_image, data

        return None
    except Exception as e:
        error_msg = f"An exception occurred in process_image: {e}"
        print(error_msg)
        logging.error(error_msg)
        traceback.print_exc()


In [26]:
def process_images_multithreading(image_paths):
    # Local Lists to store the cropped images and extracted data
    cropped_images = []
    extracted_data = []
    
    # Process each image using ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_image, image_paths)

    for result in results:
        if result:
            cropped_image, data = result
            cropped_images.append(cropped_image)
            extracted_data.append(data)

    return cropped_images, extracted_data

def save_and_display_results(cropped_images, extracted_data):
    if cropped_images:
        output_folder = os.path.join(folder_path_after, "cropped_images")

        for i, cropped_image in enumerate(cropped_images):
            output_path = os.path.join(folder_path_after, f"cropped_image_{i}.jpg")
            cv2.imwrite(output_path, cropped_image)

        # Open the saved image using the default image viewer
        if sys.platform.startswith('darwin'):  # macOS
            subprocess.run(["open", output_path])
        elif sys.platform.startswith('win32'):  # Windows
            subprocess.run(["start", output_path], shell=True)
        else:  # Linux
            subprocess.run(["xdg-open", output_path])

    output_folder = os.path.join(folder_path_after)
    output_path = os.path.join(output_folder, "extracted_data.json")
    with open(output_path, "w") as file:
        json.dump(extracted_data, file, indent=4)

    output_path_csv = os.path.join(folder_path_after, 'extracted_data.csv')
    with open(output_path_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(extracted_data)

def folderOCR(directory_path):
    try:
        global extracted_data
        image_paths = [os.path.join(directory_path, filename) for filename in os.listdir(directory_path)
                       if filename.endswith((".jpg", ".png"))]
        
        cropped_images, extracted_data = process_images_multithreading(image_paths)
        save_and_display_results(cropped_images, extracted_data)

    except Exception as e:
        error_msg = f"An exception occurred in folderOCR: {e}"
        print(error_msg)
        logging.error(error_msg)
        traceback.print_exc()

In [27]:
folderOCR(folder_path)

Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/cockies.jpgProcessing image: C:/Users/hseit/Desktop/SummerMCREU/images/darkchoc.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/deodorant.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/FoodLabel-Matrix.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/front_en.536.400.jpg

Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/front_en.797.400.jpgProcessing image: C:/Users/hseit/Desktop/SummerMCREU/images/groundbeef2.jpg

Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/heinz.jpg
Processing image: Processing image:Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4230.jpg
C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4228.jpg Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4231.jpg

C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4229.jpg
Processing image: C:/User

# There are two versions, 1 for the whole world products, and 1 for only the product in english
# world has 703
# english about 100
# + my database (about 60) per case

In [28]:
# Initialize the Open Food Facts products object
products = openfoodfacts.products

# Define a mapping of your categories to Open Food Facts categories
category_mapping = {
    "food": ["dairy", "beverages", "groceries", "Desserts", "Frozen foods"],
    "skincare": ["beauty", "cosmetics"],
    "medication": ["medicine"]
}

# Create a list to store the properly structured extracted data
image_extracted_data = []

# Iterate over each item in extracted_data
for item in extracted_data:
    # Check if the required keys are present in the item
    if all(key in item for key in ['image_path', 'extracted_text']):
        # Add the item to image_extracted_data
        image_extracted_data.append({
            'Product Name': item.get('image_path', ''),
            'Product Category': item.get('product_type', ''),
            'Supplements/Ingredients': item.get('supplements_or_elements', '')
        })
    else:
        print(f'Item missing keys: {item}')

# Create a DataFrame from the properly structured extracted data
df = pd.DataFrame(image_extracted_data)

# Iterate through the DataFrame and map 'Product Type' to 'Product Category'
for index, row in df.iterrows():
    for key, value in category_mapping.items():
        if row['Product Category'] in value:
            df.at[index, 'Product Category'] = key

# Extract the last two parts of the image path
df['Product Name'] = df['Product Name'].apply(lambda path: '/'.join(path.split('/')[-2:]))

# There are two versions, 1 for the whole world products, and 1 for only the product in english
# world has 703
# english about 100
def fetch_products(category, max_pages_per_category):
    page = 1
    products_list = []

    while page <= max_pages_per_category:
#         products = openfoodfacts.products.get_by_category(category, page=page, locale='world')
#         if not products:
#             break
#         products_list.extend(products)
        products = openfoodfacts.products.get_by_category(category, page=page, locale='world')
        english_products = [product for product in products if product.get('lang', '') == 'en']
        if not english_products:
            break
        products_list.extend(english_products)
        page += 1

    return products_list


def get_product_data(df: pd.DataFrame, categories: List[str], max_pages_per_category: int = 5):
    df_list = []

    with ThreadPoolExecutor() as executor:
        # Submit tasks for fetching products in each category concurrently
        futures = [executor.submit(fetch_products, category, max_pages_per_category) for category in categories]

        # Iterate over the completed futures
        for index, future in enumerate(futures):
            category = categories[index]
            products_list = future.result()

            # Create a list of dictionaries for the products in the current category
            product_dicts = [
                {
                    'Product Category': category,
                    'Product Name': product.get('product_name', ''),
                    'Supplements/Ingredients': product.get('ingredients_text', '')
                }
                for product in products_list
            ]

            # Create a DataFrame from the list of dictionaries
            df_page = pd.DataFrame(product_dicts)

            # Append the DataFrame to the list
            df_list.append(df_page)

    # Concatenate the list of DataFrames into a single DataFrame
    df_products = pd.concat(df_list, ignore_index=True)

    # Extract the last two parts of the image path
    df_products['Product Name'] = df_products['Product Name'].apply(lambda path: '/'.join(path.split('/')[-2:]))

    # Concatenate the extracted data DataFrame and the Open Food Facts DataFrame
    df_combined = pd.concat([df, df_products], ignore_index=True)

    # Reorder the columns
    df_combined = df_combined[['Product Category', 'Product Name', 'Supplements/Ingredients']]

    return df_combined

# Specify the categories of interest
categories = ['dairy', 'beverages', 'medicine', 'groceries', 'desserts', 'frozen foods']

# Get product data
product_data = get_product_data(df, categories)

# Set option to display all rows
pd.set_option('display.max_rows', None)

# Display the DataFrame
print("\nProduct Information:")
product_data

# other info, where is the rest of info.
# template matching


Product Information:


Unnamed: 0,Product Category,Product Name,Supplements/Ingredients
0,,images/brtbeefjpg.jpg,
1,,images/cockies.jpg,
2,,images/darkchoc.jpg,
3,,images/deodorant.jpg,
4,,images/FoodLabel-Matrix.jpg,
5,,images/front_en.536.400.jpg,
6,,images/front_en.797.400.jpg,
7,food,images/groundbeef2.jpg,
8,,images/heinz.jpg,
9,,images/IMG-4228.jpg,


# Slower version without multithreading

In [29]:
# # Create a list to store the properly structured extracted data
# image_extracted_data = []

# # Iterate over each item in extracted_data
# for item in extracted_data:
#     # Check if the required keys are present in the item
#     if all(key in item for key in ['image_path', 'extracted_text']):
#         # Add the item to image_extracted_data
#         image_extracted_data.append({
#             'Product Name': item.get('image_path', ''),
#             'Product Category': item.get('product_type', ''),
#             'Supplements/Ingredients': item.get('supplements_or_elements', '')
#         })
#     else:
#         print(f'Item missing keys: {item}')

# # Create a DataFrame from the properly structured extracted data
# df = pd.DataFrame(image_extracted_data)

# # Iterate through the DataFrame and map 'Product Type' to 'Product Category'
# for index, row in df.iterrows():
#     for key, value in category_mapping.items():
#         if row['Product Category'] in value:
#             df.at[index, 'Product Category'] = key

# # Extract the last two parts of the image path
# df['Product Name'] = df['Product Name'].apply(lambda path: '/'.join(path.split('/')[-2:]))

# def get_product_data(df: pd.DataFrame, categories: List[str], max_pages_per_category: int = 5):
#     # Iterate over each category
#     for category in categories:
# #         print(f"Processing {category} category.")

#         # Initialize page counter
#         page = 1

#         # Fetch products until the maximum number of pages is reached
#         while page <= max_pages_per_category:
#             # Retrieve products in the category from Open Food Facts
#             products_list = products.get_by_category(category, page=page, locale='world')

#             # Filter English products
#             english_products = [product for product in products_list if product.get('lang', '') == 'en']

#             # Check if any products are found in the category
#             if not english_products:
#                 print("No more products found in this category.")
#                 break

#             for product in english_products:
#                 # Extract the relevant information from the product
#                 product_name = product.get('product_name', '')
#                 product_category = category
#                 supplements_ingredients = product.get('ingredients_text', '')

#                 # Add the product information to the DataFrame
#                 df = df.append({'Product Name': product_name,
#                                 'Product Category': product_category,
#                                 'Supplements/Ingredients': supplements_ingredients},
#                                ignore_index=True)

#             # Increment the page counter
#             page += 1

#     # Reorder the columns
#     df = df[['Product Category', 'Product Name', 'Supplements/Ingredients']]

#     # Extract the last two parts of the image path
#     df['Product Name'] = df['Product Name'].apply(lambda path: '/'.join(path.split('/')[-2:]))

#     return df

# # Specify the categories of interest
# categories = ['dairy', 'beverages', 'medicine', 'groceries', 'desserts', 'frozen foods']

# # Get product data
# product_data = get_product_data(df, categories)

# # Display the DataFrame
# print("\nProduct Information:")
# product_data


In [30]:
# Set option to display all rows
pd.set_option('display.max_rows', None)

# Display the DataFrame
print("\nProduct Information:")
product_data


Product Information:


Unnamed: 0,Product Category,Product Name,Supplements/Ingredients
0,,images/brtbeefjpg.jpg,
1,,images/cockies.jpg,
2,,images/darkchoc.jpg,
3,,images/deodorant.jpg,
4,,images/FoodLabel-Matrix.jpg,
5,,images/front_en.536.400.jpg,
6,,images/front_en.797.400.jpg,
7,food,images/groundbeef2.jpg,
8,,images/heinz.jpg,
9,,images/IMG-4228.jpg,


In [31]:
import pprint

pprint.pprint(extracted_data)

[{'extracted_text': 'essential\n7 \\Naitrose® .\nbritish beef ~',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg'},
 {'extracted_text': 'Sars, Sin? (5) Sat.Fal6y 30% Fiber 0g\n'
                    'Sunane Ctaler® eat. amy 19% Sapare Op\n'
                    'Sedha',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/cockies.jpg'},
 {'extracted_text': 'NOIR PRODIGIEUX',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/darkchoc.jpg'},
 {'extracted_text': 'PURE SPORT\n\n———HIGH ENDURANCE®',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/deodorant.jpg'},
 {'extracted_text': 'of which saturates\nCarbohydrate\nof which sugars',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/FoodLabel-Matrix.jpg'},
 {'extracted_text': '',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/front_en.536.400.jpg'},
 {'extracted_text': '',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/front_en.797.400.jpg'},
 {'extracted_t

                'PHENOXYETHANOL, DISODIUM EDTA, TETRASODIUM EDTA, HYDROLYZED '
                'HYALURONIC ACID, PHYTOSPHINGOSINE, XANTHAN GUM, '
                'ETHYLHEXYLGLYCERIN (CODE F.I.L. D230397/1]  CeraVe@LLc, ned '
                'York, NY 10001  ein USA of US and/o Imported Ingredients ier '
                'WWW.Cerave.com  Questions or c omm ? 1-898. 168. 291% ents?  '
                '3612623334935  48x 504',
  'extracted_text': 'CeraVe” Renowhi\n'
                    '\n'
                    'Developed with dermatologists, its unique formula -\n'
                    'with 3 essential ceramides ~ exfoliates to remove dirt\n'
                    '& oil while softening and smoothing skin.\n'
                    '\n'
                    'Free of Salicylic Acid\n'
                    '\n'
                    'Mechanical Exfoliates and\n'
                    'Exfoliants softens to smooth\n'
                    'Gently smooths skin rough skin\n'
                    '\n'
       

                    '\n'
                    'White petrolatum USP\n'
                    '\n'
                    'Uses m temporarily protects m\n'
                    'e cuts « scrapes » burns il temporari\n'
                    'chapped or cracked skin ane\n'
                    'ect from the drying effects\n'
                    '\n'
                    '+ {Do not use on Ml deep or puncture wounds\n'
                    '\n'
                    'ly protects’sg% animal bites i serious burns\n'
                    '\n'
                    'Keep out of reach of children.\n'
                    '\n'
                    '[if swallowed, get medical help or contact a\n'
                    '\n'
                    'a Poison Control Center right away.\n'
                    '\n'
                    '00,\n'
                    '\n'
                    'lips Mi helps prot\n'
                    'wind and cold weather\n'
                    '\n'
                    'Gaticfacti teed = For\n'
 

In [19]:
# Template matching, inverting the img, 