In [1]:
import os
import cv2
import sys
import re
import subprocess
import pytesseract
import numpy as np
import torch
import torch.nn as nn
# import torch.optim as optim
from PIL import Image
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from torchvision import transforms
import json
import csv
import traceback
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import logging
import openfoodfacts
import openfoodfacts.products as products
from openfoodfacts import get_product
from typing import List
from openfoodfacts.products import get_product

In [2]:
# Set up Tesseract OCR
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the folder containing PNG images
folder_path = "C:/Users/hseit/Desktop/SummerMCREU/images/"
folder_path_after = "C:/Users/hseit/Desktop/SummerMCREU/imagesAfter/"
extracted_data = []
global keywords
keywords = {
    "product_type": {
        "food": ["food", "nutrition", "diet"],
        "skincare": ["skincare", "beauty", "cosmetics"],
        "medication": ["medication", "drug", "pharmaceutical"]
    },
    "directions": ["directions", "usage", "how to use"],
    "supplements_or_elements": ["supplements", "drug facts", "ingredients"],
    "warnings": ["warnings", "cautions", "precautions"]
}

In [3]:
class CNN(nn.Module):
    def __init__(self, hidden_size):
        super(CNN, self).__init__()
        # Define the CNN layers
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(9472, hidden_size)

    def forward(self, x):
        x = x.float()  # Convert the input tensor to float
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        input_size = x.size(1)  # Get the output size of the last convolutional layer
        x = self.fc(x)
        return x, input_size

# Define the RNN architecture for sequence recognition
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes+1)  # Increase num_classes by 1 to account for the 'unseen' label

    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# Define the CRNN model integrating the CNN and RNN components
class CRNN(nn.Module):
    def __init__(self, cnn, rnn):
        super(CRNN, self).__init__()
        self.cnn = cnn
        self.rnn = rnn

    def forward(self, x):
        features = self.cnn(x)
        features = features[0].view(features[0].size(0), -1, features[0].size(1))
        output = self.rnn(features)
        return output

In [4]:
class ImageTextDataset(Dataset):
    def __init__(self, data):
        if isinstance(data, list) and all(isinstance(item, dict) for item in data):
            self.data = data
        else:
            raise ValueError("Input data should be a list of dictionaries.")

        self.transform = transforms.ToTensor()

    def __getitem__(self, index):
        image_path = self.data[index]['image_path']
        extracted_text = self.data[index]['extracted_text']
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        image = cv2.resize(image, (100, 75))
        image = self.transform(image)

        extracted_dict = {
            "image_path": image_path,
            "extracted_text": extracted_text,
            "product_type": "",
            "directions": "",
            "supplements_or_elements": "",
            "warnings": ""
        }

        lines = extracted_text.split('\n')

        for category, category_keywords in keywords.items():
            if category == 'product_type':
                for product_type, product_type_keywords in category_keywords.items():
                    for line in lines:
                        if any(keyword.lower() in line.lower() for keyword in product_type_keywords):
                            extracted_dict[category] = product_type
                            lines.remove(line)
                            break
            else:
                for keyword in category_keywords:
                    for line in lines:
                        if keyword.lower() in line.lower():
                            extracted_dict[category] = line
                            lines.remove(line)
                            break

        extracted_dict['extracted_text'] = '\n'.join(lines)

        return image, extracted_dict


In [5]:
# # Initialize lemmatizer and define stop words
# lemmatizer = nltk.WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

# def get_product_info(product_name):
#     search_results = openfoodfacts.products.search(product_name)
#     if search_results and 'products' in search_results:
#         products = search_results['products']
#         if products:
#             return products[0]  # Return the first product
#     return {}  # Return an empty dictionary if no results or unexpected structure

# # Create a function to process the extracted text and retrieve additional product information
# def process_extracted_text(extracted_text):
#     # Clean the extracted text
#     cleaned_text = clean_text(extracted_text, None)

#     # Retrieve product information from Open Food Facts based on the extracted text
#     product_info = get_product_info(cleaned_text)

#     return product_info

In [6]:
# Initialize lemmatizer and define stop words
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_product_info(product_name):
    search_results = openfoodfacts.products.search(product_name)
    if search_results and 'products' in search_results:
        products = search_results['products']
        if products:
            return products[0]  # Return the first product
    return {}  # Return an empty dictionary if no results or unexpected structure

# Create a function to process the extracted text and retrieve additional product information
def process_extracted_text(data):
    products = []
    for product in data:
        product_dict = {}
        text = product['extracted_text'].lower()

        # find ingredients
        ingredients_marker = "ingredients:"
        start = text.find(ingredients_marker)
        if start != -1:
            start += len(ingredients_marker)
            ingredients = text[start:].split('\n')[0]  # Assuming ingredients are on the same line
            product_dict['ingredients'] = ingredients.split(',')
        else:
            product_dict['ingredients'] = []

        # find warnings
        warnings_marker = "warnings:"
        start = text.find(warnings_marker)
        if start != -1:
            start += len(warnings_marker)
            warnings = text[start:].split('\n')[0]  # Assuming warnings are on the same line
            product_dict['warnings'] = warnings.split(',')
        else:
            product_dict['warnings'] = []

        products.append(product_dict)
    return products


In [7]:
# Create output folders if they don't exist
output_extracted_folder = os.path.join(folder_path_after, "extracted_images")
output_not_extracted_folder = os.path.join(folder_path_after, "not_extracted_images")
os.makedirs(output_extracted_folder, exist_ok=True)
os.makedirs(output_not_extracted_folder, exist_ok=True)

In [8]:
# better
sharpen_kernel = np.array([[0, -0.5, 0],
                           [-0.5, 3, -0.5],
                           [0, -0.5, 0]], dtype=np.float64)

global text_threshold
text_threshold = 10

keywords = {
    "product_type": {
        "food": ["food", "nutrition", "diet"],
        "skincare": ["skincare", "beauty", "cosmetics"],
        "medication": ["medication", "drug", "pharmaceutical"]
    },
    "directions": ["directions", "usage", "how to use"],
    "supplements_or_elements": ["supplements", "drug facts", "ingredients"],
    "warnings": ["warnings", "cautions", "precautions"]
}
# Set up logging
logging.basicConfig(filename='error.log', level=logging.ERROR)


In [9]:
# Initialize lemmatizer and define stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text, category):
    if pd.isna(text):
        return ''
    text = str(text)
    text = text.lower()

    # List of keywords to preserve for 'warnings' category
    warnings_keywords = ['broken skin', 'do not apply']

    if category == 'warnings':
        # Exclude specific keywords from the general cleaning process
        for keyword in warnings_keywords:
            text = text.replace(keyword, f'__{keyword}__')
    else:
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Remove special symbols
        text = re.sub(r'[^\w\s]', '', text)
        
        # Remove special symbols
        text = text.encode('ascii', 'ignore').decode('ascii')

        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        text = " ".join(tokens)

    return text

def image_save_textLength(extracted_text, image, image_path, text_threshold, folder_path_after):
    # Clean the extracted_text by replacing multiple whitespaces with a single one, 
    # removing newline characters and trimming leading/trailing spaces
    cleaned_text = " ".join(extracted_text.split())
    text_length = len(cleaned_text)

    if text_length >= text_threshold:
        # Save the image in the 'extracted_images' folder
        folder = "extracted_images"
    else:
        # Save the image in the 'not_extracted_images' folder
        folder = "not_extracted_images"

    try:
        output_folder = os.path.join(folder_path_after, folder)
        output_path = os.path.join(output_folder, os.path.basename(image_path))
        cv2.imwrite(output_path, image)
    except Exception as e:
        error_msg = f"An exception occurred in image_save_textLength: {e}"
        print(error_msg)
        logging.error(error_msg)
        traceback.print_exc()


def extract_data(keywords, lines, image_path, extracted_text):
    extracted_dict = {}

    # Match keywords to determine the feature type
    for feature, feature_keywords in keywords.items():
        if feature == 'product_type':
            for product_type, product_type_keywords in feature_keywords.items():
                for line in lines:
                    # Check if any keyword in the current line matches the product type keywords
                    if any(keyword.lower() in line.lower() for keyword in product_type_keywords):
                        # Assign the product type to the feature
                        extracted_dict[feature] = product_type
                        break
        else:
            # Combine all lines into a single string
            all_lines_text = ' '.join(lines)
            
            # Find the first occurrence of the keyword in the combined text
            for keyword in feature_keywords:
                if keyword.lower() in all_lines_text.lower():
                    # Extract the substring after the keyword
                    start_index = all_lines_text.lower().index(keyword.lower()) + len(keyword)
                    extracted_value = all_lines_text[start_index:].strip()
                    
                    # Assign the extracted value to the feature
                    extracted_dict[feature] = extracted_value
                    break

    # Store the extracted data
    data = {
        "image_path": image_path,
        "extracted_text": extracted_text,
        **extracted_dict
    }

    return data

def orb_feature_matching(target_image_path, scene_image):
    orb = cv2.ORB_create()

    # Read the target image and compute its keypoints and descriptors
    target_img = cv2.imread(target_image_path, 0)  # The '0' flag reads the image in grayscale
    kp1, des1 = orb.detectAndCompute(target_img, None)

    # Compute the keypoints and descriptors of the scene image
    kp2, des2 = orb.detectAndCompute(scene_image, None)

    # Create a BFMatcher object and match the descriptors
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)

    # Sort the matches based on distance (the lower the distance, the better the match)
    matches = sorted(matches, key=lambda x: x.distance)

    return matches


ORB (Oriented FAST and Rotated BRIEF) using for feature matching, first identifies keypoints in each image. Keypoints are points of interest within an image, which are typically corners, edges, or other areas with high contrast.

Once ORB has identified keypoints, it generates a descriptor for each keypoint. A descriptor is a vector that describes the keypoints' surrounding areas. ORB then tries to match the descriptors from the target image to the descriptors of the other image (scene image).

When you see "Found X matches with the target image" in the output, it means that ORB has found X pairs of descriptors (one from the target image and one from the scene image) that it believes correspond to the same feature.

when "Found 102 matches with the target image" it means that for that specific scene image, ORB found 102 descriptors that match with descriptors in the target image. This doesn't mean there are 102 images that match, but rather 102 features within one image that match features in the target image.

In [12]:
def process_images_multithreading(image_paths):
    # Local Lists to store the cropped images and extracted data
    cropped_images = []
    extracted_data = []
    
    # Process each image using ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        results = executor.map(process_image, image_paths)

    for result in results:
        if result:
            cropped_image, data = result
            cropped_images.append(cropped_image)
            extracted_data.append(data)

    return cropped_images, extracted_data

def save_and_display_results(cropped_images, extracted_data):
    if cropped_images:
        output_folder = os.path.join(folder_path_after, "cropped_images")

        for i, cropped_image in enumerate(cropped_images):
            output_path = os.path.join(folder_path_after, f"cropped_image_{i}.jpg")
            cv2.imwrite(output_path, cropped_image)

        # Open the saved image using the default image viewer
        if sys.platform.startswith('darwin'):  # macOS
            subprocess.run(["open", output_path])
        elif sys.platform.startswith('win32'):  # Windows
            subprocess.run(["start", output_path], shell=True)
        else:  # Linux
            subprocess.run(["xdg-open", output_path])

    output_folder = os.path.join(folder_path_after)
    output_path = os.path.join(output_folder, "extracted_data.json")
    with open(output_path, "w") as file:
        json.dump(extracted_data, file, indent=4)

    output_path_csv = os.path.join(folder_path_after, 'extracted_data.csv')
    with open(output_path_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(extracted_data)

def folderOCR(directory_path):
    try:
        global extracted_data
        image_paths = [os.path.join(directory_path, filename) for filename in os.listdir(directory_path)
                       if filename.endswith((".jpg", ".png"))]
        
        cropped_images, extracted_data = process_images_multithreading(image_paths)
        save_and_display_results(cropped_images, extracted_data)

    except Exception as e:
        error_msg = f"An exception occurred in folderOCR: {e}"
        print(error_msg)
        logging.error(error_msg)
        traceback.print_exc()

# Method to find highest (top 3) matches to the image provided
def orb_matching(image_paths, target_image):
    orb = cv2.ORB_create()
    orb_results = {}

    for image_path in image_paths:
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        matches = orb_feature_matching(target_image, image)
        orb_results[image_path] = len(matches)
        print(f'Found {len(matches)} matches with the target image for {image_path}')

    # Sort orb_results by matches
    sorted_results = sorted(orb_results.items(), key=lambda x: x[1], reverse=True)
    
    # Get top 3 matches
    top_3_matches = sorted_results[:3]
    
    # Plot results
    image_names = [match[0] for match in top_3_matches]
    match_values = [match[1] for match in top_3_matches]

    plt.bar(image_names, match_values)
    plt.xlabel('Image')
    plt.ylabel('Matches')
    plt.title('Top 3 images with most ORB matches')
    plt.show()

    return orb_results

In [13]:
def process_image(image_path):
    try:
        print("Processing image:", image_path)

        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        target_size = max(image.shape[0], image.shape[1])
        scaling_factor = target_size / max(image.shape[0], image.shape[1])
        resized_image = cv2.resize(image, (int(image.shape[1] * scaling_factor), int(image.shape[0] * scaling_factor)))
        
        # Convert the image to LAB color space
        lab_img = cv2.cvtColor(resized_image, cv2.COLOR_RGB2Lab)

        # Split the LAB image into L, A and B channels
        l, a, b = cv2.split(lab_img)

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to the L channel with different parameters
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(10,10))
        cl = clahe.apply(l)

        # Merge the CLAHE enhanced L channel with the A and B channels
        limg = cv2.merge((cl,a,b))

        # Convert image from LAB Color model to RGB model
        enhanced_img = cv2.cvtColor(limg, cv2.COLOR_Lab2RGB)
        
        # Apply sharpening
        sharpened_image = cv2.filter2D(enhanced_img, -1, sharpen_kernel)

        # Convert the sharpened image to grayscale
        grayscale_image = cv2.cvtColor(sharpened_image, cv2.COLOR_RGB2GRAY)

        # Denoise the grayscale image using a bilateral filter
#         denoised_image = cv2.bilateralFilter(grayscale_image, 9, 75, 75)
        
        # Noise removal
        _, thresholded = cv2.threshold(grayscale_image, 170, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Calculate the ratio of black pixels
        black_ratio = np.count_nonzero(grayscale_image <= 1) / (grayscale_image.shape[0] * grayscale_image.shape[1])
        white_ratio = np.count_nonzero(grayscale_image >= 254) / (grayscale_image.shape[0] * grayscale_image.shape[1])

        if black_ratio > 0.90:  # If the image is mostly black and white
            thresholded = cv2.bitwise_not(thresholded)
            print(f"The image {image_path} is about 90% black")
        elif white_ratio > 0.90:  # If the image is mostly white
            thresholded = cv2.bitwise_not(thresholded)
            print(f"The image {image_path} is about 90% white")
        else:  # For color images

            # Find contours
            contours, _ = cv2.findContours(thresholded.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            text_regions = []  # List to store the results
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if w > 12 and h > 12:  # Filter out small contours
                    text_regions.append((x, y, w, h))

            # Find the contour with maximum area
            if text_regions:
                max_contour = max(text_regions, key=lambda r: r[2] * r[3])
                x, y, w, h = max_contour
                cropped_image = thresholded[y:y+h, x:x+w]
                extracted_text = pytesseract.image_to_string(cropped_image).strip()
#                 extracted_text = clean_text(extracted_text, None)

                image_save_textLength(extracted_text, image, image_path, text_threshold, folder_path_after)
                lines = extracted_text.split('\n')
                data = extract_data(keywords, lines, image_path, extracted_text)

                # Here we assume that `extracted_text` contains a unique identifier for the product
#                 product_info = get_product(extracted_text)  # Fetch the product info from OpenFoodFacts
                # Process the extracted text
                product_info = process_extracted_text([{'extracted_text': extracted_text}])

                # Add the fetched product info to the data dictionary
                data["product_info"] = product_info
                
                return cropped_image, data
        return None
    except Exception as e:
        error_msg = f"An exception occurred in process_image: {e}"
        print(error_msg)
        logging.error(error_msg)
        traceback.print_exc()


In [14]:
# import nltk
# nltk.data.path.append("C:\\Users\\hseit\\AppData\\Roaming\\nltk_data")
# https://github.com/wolfgarbe/SymSpell
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hseit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
folderOCR(folder_path)

Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/cockies.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/darkchoc.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/deodorant.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/FoodLabel-Matrix.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/groundbeef2.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/heinz.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4228.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4229.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4230.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4231.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4232.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4233.jpg
Processing image: C:/Users/hseit/Desktop/

In [16]:
import pprint

pprint.pprint(extracted_data)

[{'extracted_text': "british beef\ndiced'steak",
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg',
 {'extracted_text': 'Serv. Sire 202. (569), “Sat. Faby: 90%. Fiber dg\n'
                    '\n'
                    'Sarvs orate aa en\n'
                    '\n'
                    'Tue aye Sodlum 780mg 33% Protein 7g\n'
                    'alain det, Vitara A= Wianin GOs Goi Osean Pe',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/cockies.jpg',
 {'extracted_text': 'EXCELLENCE',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/darkchoc.jpg',
 {'extracted_text': '',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/deodorant.jpg',
 {'extracted_text': 'of which saturates\n'
                    'Carbohydrate\n'
                    '\n'
                    'of which sugars\n'
                    'Protein',
  'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/FoodLabel-Matrix.jpg',
 {'extracted_text': '‘Atmountisoning\n'
    

There are two versions, 1 for the whole world products, and 1 for only the product in english

world has way more then english version

In [17]:
def fetch_products(category: str, max_pages: int) -> List[dict]:
    products = []
    for page in range(1, max_pages + 1):
        result = openfoodfacts.products.get_by_facets({'category': category}, page=page)
        products.extend(result)  # changed this line
    return products

# Initialize the Open Food Facts products object
products = openfoodfacts.products

# Define a mapping of your categories to Open Food Facts categories
category_mapping = {
    "food": ["dairy", "beverages", "groceries", "Desserts", "Frozen foods"],
    "skincare": ["beauty", "cosmetics"],
    "medication": ["medicine"]
}

# Create a list to store the properly structured extracted data
image_extracted_data = []

# Iterate over each item in extracted_data
for item in extracted_data:
    # Get all other keys
    other_keys = {k:v for k, v in item.items() if k not in ['image_path', 'product_type', 'supplements_or_elements']}
    image_extracted_data.append({
        'Product Name': item.get('image_path', ''),
        'Product Category': item.get('product_type', 'Unknown'),
        'Supplements/Ingredients': item.get('supplements_or_elements', 'Unknown'),
        'Other': other_keys
    })

# Create a DataFrame from the properly structured extracted data
df = pd.DataFrame(image_extracted_data)

# Convert the dictionary in the 'Other' column into string format
df['Other'] = df['Other'].apply(lambda x: ', '.join(f'{k}: {v}' for k, v in x.items()) if isinstance(x, dict) else x)

# Iterate through the DataFrame and map 'Product Type' to 'Product Category'
for index, row in df.iterrows():
    for key, value in category_mapping.items():
        if row['Product Category'] in value:
            df.at[index, 'Product Category'] = key

# Extract the last two parts of the image path
df['Product Name'] = df['Product Name'].apply(lambda path: '/'.join(path.split('/')[-2:]))

def get_product_data(df: pd.DataFrame, categories: List[str], max_pages_per_category: int = 5):
    df_list = []

    with ThreadPoolExecutor() as executor:
        # Submit tasks for fetching products in each category concurrently
        futures = [executor.submit(fetch_products, category, max_pages_per_category) for category in categories]

        # Iterate over the completed futures
        for index, future in enumerate(futures):
            category = categories[index]
            products_list = future.result()

            # Create a list of dictionaries for the products in the current category
            product_dicts = [
                {
                    'Product Category': category,
                    'Product Name': product.get('product_name', ''),
                    'Supplements/Ingredients': product.get('ingredients_text', ''),
                    'Other': {'Brands': product.get('brands', ''), 'Countries': product.get('countries', '')}
                }
                for product in products_list
            ]

            # Create a DataFrame from the list of dictionaries
            df_page = pd.DataFrame(product_dicts)

            # Append the DataFrame to the list
            df_list.append(df_page)

    # Concatenate the list of DataFrames into a single DataFrame
    df_products = pd.concat(df_list, ignore_index=True)

    # Extract the last two parts of the image path
    df_products['Product Name'] = df_products['Product Name'].apply(lambda path: '/'.join(path.split('/')[-2:]))

    # Concatenate the extracted data DataFrame and the Open Food Facts DataFrame
    df_combined = pd.concat([df, df_products], ignore_index=True)

    # Convert the dictionary in the 'Other' column into string format
    df_combined['Other'] = df_combined['Other'].apply(lambda x: ', '.join(f'{k}: {v}' for k, v in x.items()) if isinstance(x, dict) else x)

    # Reorder the columns
    df_combined = df_combined[['Product Category', 'Product Name', 'Supplements/Ingredients', 'Other']]

    return df_combined

def update_product_data(df):
    df = df.fillna('')  # fills NaNs with an empty string
    df.replace('Unknown', '', inplace=True)  # replaces 'Unknown' with an empty string

    # removes 'extracted_text:' and 'brands:' prefixes from the 'Others' column
    df['Other'] = df['Other'].str.replace('extracted_text:', '').str.replace('Brands:', '')

    return df

# Specify the categories of interest
categories = ['dairy', 'beverages', 'medicine', 'groceries', 'desserts', 'frozen foods']

# Get product data
product_data = get_product_data(df, categories)


# Clean the DataFrame
product_data = update_product_data(product_data)

# Set option to display all rows
pd.set_option('display.max_rows', None)

# Display the DataFrame
print("\nProduct Information:")
product_data



Product Information:


Unnamed: 0,Product Category,Product Name,Supplements/Ingredients,Other
0,,images/brtbeefjpg.jpg,,"british beef\ndiced'steak, product_info: [{'i..."
1,,images/cockies.jpg,,"Serv. Sire 202. (569), “Sat. Faby: 90%. Fiber..."
2,,images/darkchoc.jpg,,"EXCELLENCE, product_info: [{'ingredients': []..."
3,,images/deodorant.jpg,,", product_info: [{'ingredients': [], 'warning..."
4,,images/FoodLabel-Matrix.jpg,,of which saturates\nCarbohydrate\n\nof which ...
5,,images/groundbeef2.jpg,,‘Atmountisoning\n\nTotal\n20) gy | Oe\nSOE TO...
6,,images/heinz.jpg,,", product_info: [{'ingredients': [], 'warning..."
7,,images/IMG-4228.jpg,,"Lu\nLu\nox\nLL.\n*\n=\n=)\nra\n=\n5\nol\n<, p..."
8,,images/IMG-4229.jpg,": DIPROPYLENE GLYCOL, WATER, PROPYLENE GLYCOL,...",CONTAINS ODOR-FIGHTING “ATOMIC ROBOTS” THAT “...
9,medication,images/IMG-4230.jpg,Active ingredient Purpose (per tablet) Calcium...,Drug Facts\n\nActive ingredient Purpose\n(per...


In [18]:
# Set option to display all rows
pd.set_option('display.max_rows', None)

# Display the DataFrame
print("\nProduct Information:")
product_data


Product Information:


Unnamed: 0,Product Category,Product Name,Supplements/Ingredients,Other
0,,images/brtbeefjpg.jpg,,"british beef\ndiced'steak, product_info: [{'i..."
1,,images/cockies.jpg,,"Serv. Sire 202. (569), “Sat. Faby: 90%. Fiber..."
2,,images/darkchoc.jpg,,"EXCELLENCE, product_info: [{'ingredients': []..."
3,,images/deodorant.jpg,,", product_info: [{'ingredients': [], 'warning..."
4,,images/FoodLabel-Matrix.jpg,,of which saturates\nCarbohydrate\n\nof which ...
5,,images/groundbeef2.jpg,,‘Atmountisoning\n\nTotal\n20) gy | Oe\nSOE TO...
6,,images/heinz.jpg,,", product_info: [{'ingredients': [], 'warning..."
7,,images/IMG-4228.jpg,,"Lu\nLu\nox\nLL.\n*\n=\n=)\nra\n=\n5\nol\n<, p..."
8,,images/IMG-4229.jpg,": DIPROPYLENE GLYCOL, WATER, PROPYLENE GLYCOL,...",CONTAINS ODOR-FIGHTING “ATOMIC ROBOTS” THAT “...
9,medication,images/IMG-4230.jpg,Active ingredient Purpose (per tablet) Calcium...,Drug Facts\n\nActive ingredient Purpose\n(per...


In [19]:
# import pprint

# target_filename = 'C:/Users/hseit/Desktop/SummerMCREU/images/image01.jpg'

# for image_data in extracted_data:
#     if image_data['image_path'] == target_filename:
#         pprint.pprint(image_data)
#         break
