In [1]:
import os
import cv2
import sys
import re
# import subprocess
import pytesseract
# import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
# from concurrent.futures import ThreadPoolExecutor

# Set up Tesseract OCR
# NOTE(review): this is a hard-coded, Windows-specific install location. The
# notebook only runs as-is on machines where tesseract.exe lives at exactly
# this path; adjust it (or make it configurable) before running elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [2]:
class CNN(nn.Module):
    """Two-stage convolutional feature extractor with a linear projection head.

    Each stage is ``Conv2d(3x3, stride 1, padding 1) -> ReLU -> MaxPool2d(2, 2)``,
    so the spatial height and width are halved (rounded down) twice. The pooled
    feature map is flattened per sample and projected to ``hidden_size`` values.

    Args:
        hidden_size: Number of output features produced by the final linear layer.
        flattened_size: Number of values per sample after the second pooling
            stage, i.e. ``32 * (H // 4) * (W // 4)`` for a ``1 x H x W`` input.
            The default of 9472 corresponds to ``1 x 150 x 32`` inputs
            (``32 * 37 * 8``) and preserves the original hard-coded behaviour.
    """

    def __init__(self, hidden_size, flattened_size=9472):
        super().__init__()
        # Define the CNN layers
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        # The linear layer's input width must equal the flattened feature-map size.
        self.fc = nn.Linear(flattened_size, hidden_size)

    def forward(self, x):
        """Run the feature extractor.

        Args:
            x: Batch of single-channel images shaped ``(batch, 1, H, W)``; any
                dtype that can be cast to float.

        Returns:
            A tuple ``(features, input_size)`` where ``features`` has shape
            ``(batch, hidden_size)`` and ``input_size`` is the per-sample size of
            the flattened feature map that was fed into the linear layer.
        """
        x = x.float()  # Convert the input tensor to float
        x = self.maxpool(self.relu(self.conv1(x)))
        x = self.maxpool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten everything except the batch axis
        input_size = x.size(1)  # Flattened size produced by the last pooling stage
        x = self.fc(x)
        return x, input_size

# Define the RNN architecture for sequence recognition
class RNN(nn.Module):
    """Single-layer GRU followed by a linear classifier on the last time step.

    Args:
        input_size: Accepted for interface compatibility but not used: the GRU's
            input width is ``hidden_size``.
        hidden_size: Width of both the GRU input features and its hidden state.
        num_classes: Number of known classes; the classifier emits
            ``num_classes + 1`` logits to leave room for the 'unseen' label.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes+1)  # Increase num_classes by 1 to account for the 'unseen' label

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor shaped ``(batch, seq_len, hidden_size)``.

        Returns:
            Logits shaped ``(batch, num_classes + 1)`` computed from the GRU
            output at the final time step.
        """
        # Allocate the initial hidden state on the same device as the input
        # instead of relying on a module-level `device` global, which may not
        # exist yet (or may differ from where the input actually lives).
        h0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# CRNN: chains a CNN feature extractor into an RNN sequence classifier
class CRNN(nn.Module):
    """Wrapper that feeds the CNN's feature vector to the RNN as a sequence.

    Args:
        cnn: Callable whose result's first element is a 2-D feature tensor
            ``(batch, features)``.
        rnn: Callable applied to the reshaped ``(batch, -1, features)`` tensor.
    """

    def __init__(self, cnn, rnn):
        super(CRNN, self).__init__()
        self.cnn, self.rnn = cnn, rnn

    def forward(self, x):
        """Return the RNN output for the CNN features of ``x``."""
        # Only the first element of the CNN result (the feature tensor) is used.
        embedded = self.cnn(x)[0]
        batch_dim = embedded.size(0)
        feature_dim = embedded.size(1)
        # Insert a sequence axis between the batch and feature axes.
        sequence = embedded.view(batch_dim, -1, feature_dim)
        return self.rnn(sequence)


class ImageTextDataset(Dataset):
    """Dataset of (image tensor, OCR text) pairs backed by a list of dicts.

    Each record must provide the keys ``'image_path'`` (path readable by
    ``cv2.imread``) and ``'extracted_text'`` (the label returned as-is).

    Args:
        data: List of dictionaries, one per sample.

    Raises:
        ValueError: If ``data`` is not a list made up solely of dictionaries.
    """

    def __init__(self, data):
        # Reject anything that is not a list of dictionaries up front
        if not (isinstance(data, list) and all(isinstance(item, dict) for item in data)):
            raise ValueError("Input data should be a list of dictionaries.")

        self.data = data
        self.transform = transforms.ToTensor()

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Load, resize and tensorise the image at ``index``.

        Returns:
            Tuple ``(image, label)``: the transformed grayscale image and the
            record's ``'extracted_text'`` value.

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        record = self.data[index]
        image_path = record['image_path']
        label = record['extracted_text']

        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail with a clear message instead of an opaque resize error.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # cv2.resize takes (width, height): the result is 150 rows x 200 columns
        image = cv2.resize(image, (200, 150))
        image = self.transform(image)
        return image, label

In [3]:
# Compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Folder holding the label images, and the list that collects the OCR results
folder_path = "C:/Users/hseit/Desktop/SummerMCREU/images/"
extracted_data = list()

# Feature name -> phrases whose presence in the OCR text signals that feature
keywords = dict(
    product_title=["product name", "name", "product"],
    directions=["directions", "usage", "how to use"],
    supplements_or_elements=["supplements", "drug facts", "ingredients"],
    warnings=["warnings", "cautions", "precautions"],
)

In [4]:
# Stop-word set and lemmatizer are built once and reused by preprocess_label.
_LABEL_NLP_CACHE = {}


def _label_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use."""
    if not _LABEL_NLP_CACHE:
        # Build both before storing so a failure never leaves a half-filled cache.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _LABEL_NLP_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _LABEL_NLP_CACHE["stop_words"], _LABEL_NLP_CACHE["lemmatizer"]


def preprocess_label(label):
    """Normalise a raw text label and map unknown results to ``"unseen"``.

    The label is lower-cased, stripped, has punctuation replaced by spaces,
    loses English stop words and is lemmatized token by token. Digits and
    underscores are kept. Any error during these steps is printed and
    processing stops at whatever intermediate value had been reached.

    Args:
        label: Raw label text (expected to be a string).

    Returns:
        The normalised label if it is one of ``label_encoder.classes_``,
        otherwise the string ``"unseen"``.

    Note:
        Relies on a module-level ``label_encoder`` that must already be fitted
        when this function is called.
    """
    # Initialize the processed_label variable
    processed_label = ""

    try:
        # Lowercase and remove leading/trailing whitespace
        processed_label = label.lower().strip()
        # Replace punctuation with spaces; \w matches letters, digits and
        # underscores, so those characters are all preserved.
        processed_label = re.sub(r'[^\w\s]', ' ', processed_label)
        # Tokenize the label
        tokens = processed_label.split()
        # Drop stop words, then lemmatize what is left
        stop_words, lemmatizer = _label_nlp_resources()
        lemmatized_tokens = [
            lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
        ]
        # Join the tokens back into a string
        processed_label = ' '.join(lemmatized_tokens)

    except Exception as e:
        # Best-effort: report the problem and fall through to the class check
        print(f"An error occurred during label preprocessing: {e}")

    # Assign a special label for unseen instances
    unseen_label = "unseen"
    if processed_label not in label_encoder.classes_:
        processed_label = unseen_label

    return processed_label


def collate_fn(batch):
    """Collate ``(image, label)`` samples into aligned image and label tensors.

    Every image is converted to 8-bit grayscale and resized to a width of 32
    pixels while keeping its height, then stacked into a ``(batch, 1, H, 32)``
    uint8 tensor (all images must therefore share the same height). Labels are
    normalised with ``preprocess_label`` and encoded with the module-level
    ``label_encoder``.

    Samples whose processed label is not in ``label_encoder.classes_`` are
    dropped from BOTH the images and the labels, so the two returned tensors
    always have the same first dimension. Since ``preprocess_label`` already
    maps unknown labels to ``"unseen"``, this only happens when ``"unseen"``
    itself is not one of the encoder's classes.

    Args:
        batch: Iterable of ``(image, label)`` pairs; each image is a tensor or
            a PIL image.

    Returns:
        Tuple ``(images, labels)`` where ``labels`` is a 1-D ``torch.long``
        tensor of encoded class indices.
    """
    to_pil = transforms.ToPILImage()  # Built once instead of once per image
    images = []
    raw_labels = []

    for img, label in batch:
        if isinstance(img, torch.Tensor):
            img = to_pil(img)  # Convert the tensor to a PIL image

        img = img.convert("L")  # Convert the image to grayscale
        # Force a fixed width of 32 and keep the height (aspect ratio is NOT preserved)
        img = img.resize((32, img.height))

        # A mode-"L" image always yields a 2-D (H, W) array; add the channel
        # axis in front to obtain (1, H, W).
        img = np.expand_dims(np.array(img), axis=0)
        images.append(torch.from_numpy(img))

        raw_labels.append(label)  # Store the raw labels

    images = torch.stack(images, dim=0)

    # Preprocess every label exactly once, then keep image/label pairs together.
    processed_labels = [preprocess_label(label) for label in raw_labels]
    keep = [
        idx for idx, processed in enumerate(processed_labels)
        if processed in label_encoder.classes_
    ]
    if len(keep) != len(processed_labels):
        images = images[torch.tensor(keep, dtype=torch.long)]
        processed_labels = [processed_labels[idx] for idx in keep]

    labels = label_encoder.transform(processed_labels)
    labels = torch.tensor(labels, dtype=torch.long)

    return images, labels

In [5]:
def process_image(image_path):
    """Run OCR on the largest detected region of one image and tag its features.

    The image is resized to 600x600, min-max normalised, denoised and converted
    to grayscale. External contours are extracted from the grayscale image, the
    bounding box with the largest area (among boxes wider and taller than 10
    pixels) is cropped and passed to Tesseract. The recognised text is then
    matched case-insensitively against the module-level ``keywords`` table.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A dict with the keys ``image_path``, ``extracted_text``,
        ``product_title``, ``directions``, ``supplements_or_elements`` and
        ``warnings``. Each feature key holds the full extracted text when one of
        its keywords occurs in it, otherwise ``""``. Returns ``None`` when the
        image cannot be read or no sufficiently large region is found.
    """
    print("Processing image:", image_path)

    # Load the label image
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for missing/unreadable files; skip this image
        # instead of letting cv2.resize abort the whole folder scan.
        print("Could not read image, skipping:", image_path)
        return None

    # Preprocess the image
    resized_image = cv2.resize(image, (600, 600))  # Resize the image
    normalized_image = cv2.normalize(resized_image, None, 0, 255, cv2.NORM_MINMAX)  # Normalize pixel values
    denoised_image = cv2.fastNlMeansDenoisingColored(normalized_image, None, 10, 10, 7, 21)  # Denoise the image
    gray = cv2.cvtColor(denoised_image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale

    # Perform text detection
    # NOTE(review): the grayscale image is not thresholded before findContours.
    contours, _ = cv2.findContours(gray.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    text_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 10 and h > 10:  # Filter out small contours
            text_regions.append((x, y, w, h))

    if not text_regions:
        return None

    # Pick the bounding box with the largest area and crop to it
    x, y, w, h = max(text_regions, key=lambda r: r[2] * r[3])
    cropped_image = gray[y:y+h, x:x+w]

    # Perform text recognition on the cropped image
    extracted_text = pytesseract.image_to_string(cropped_image).strip()

    # Match keywords to determine the feature type (lower-case the text once)
    lowered_text = extracted_text.lower()
    extracted_dict = {}
    for feature, feature_keywords in keywords.items():
        if any(keyword.lower() in lowered_text for keyword in feature_keywords):
            extracted_dict[feature] = extracted_text

    # Store the extracted data
    data = {
        "image_path": image_path,
        "extracted_text": extracted_text,
        "product_title": extracted_dict.get("product_title", ""),
        "directions": extracted_dict.get("directions", ""),
        "supplements_or_elements": extracted_dict.get("supplements_or_elements", ""),
        "warnings": extracted_dict.get("warnings", "")
    }
    return data

def folderOCR(directory_path, batch_size=10):
    """OCR every ``.jpg``/``.png`` image in a folder and collect the results.

    Files are taken in ``os.listdir`` order and handed to ``process_image`` in
    consecutive groups of ``batch_size``; images for which it returns ``None``
    are left out. The extension check ignores case, so ``.JPG`` and ``.PNG``
    files are included as well. Each result's path and text is printed,
    followed by the total count.

    Args:
        directory_path: Folder to scan (not recursive).
        batch_size: Number of images handled per group; must be at least 1.

    Returns:
        The list of result dictionaries. The same list is also stored in the
        module-level ``extracted_data`` variable, replacing its previous value.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    global extracted_data  # Results are published through this module-level list

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    image_paths = [
        os.path.join(directory_path, filename)
        for filename in os.listdir(directory_path)
        if filename.lower().endswith((".jpg", ".png"))
    ]

    extracted_data = []
    for start_idx in range(0, len(image_paths), batch_size):
        batch_image_paths = image_paths[start_idx:start_idx + batch_size]
        batch_results = [process_image(image_path) for image_path in batch_image_paths]
        extracted_data.extend(result for result in batch_results if result is not None)

    # Display the results
    for data in extracted_data:
        print("Image Path:", data["image_path"])
        print("Extracted Text:")
        print(data["extracted_text"])
        print()

    print("Total Images Processed:", len(extracted_data))
    return extracted_data

In [6]:
# Run the OCR pipeline over every image in `folder_path`; the results are
# stored in the module-level `extracted_data` list.
folderOCR(folder_path)

Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/cropped_image.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4228.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4229.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4230.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4231.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4232.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4233.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4234.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4235.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4236.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4237.jpg
Processing image: C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4239.jpg
Processing image: C:/Users/hseit/Desktop/Sum

[{'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg',
  'extracted_text': 'essential\n\nWaitrose®\n\nbritish beef ~\nMw diced steak',
  'product_title': '',
  'directions': '',
  'supplements_or_elements': '',
 {'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/cropped_image.jpg',
  'extracted_text': 'FBA Box 1 of 1 - 1Ib\n\nSHIP FROM SHIP TO\n\nJames Bond FBA. dnest+sta012\n\n333 Boren Ave N Amazon.com Services, Inc.\n\nSeattle, WA 98109 4255 Anson Blvd\n\nUnited States Whitestown, IN 46075-4412\nUnited States\n\nFBA (10/30/19 10:55 AM) - 1\n\nae\npea\n\nFBA15JD9C5R9U000001\nMixed SKUs pnd\n| |\n\nKM-SSHL-KJON\nQty 1\n\nPLEASE LEAVE THIS LABEL UNCOVERED',
  'product_title': '',
  'directions': '',
  'supplements_or_elements': '',
 {'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4228.jpg',
  'extracted_text': 'ALUMINUM + FREE\n\n,\nOld Spice-\n\nPURE SPORT |\n\n——-HIGH ENDURANCE® ——\n\nDEODORANT\n\nNET WE\n2.402 (689',
  'product_title': '',
 

In [9]:
# Display the OCR records collected by folderOCR.
# NOTE(review): this cell's execution count (9) is higher than the following
# cell's (7), so the notebook was last run out of order; confirm it still
# works under Restart Kernel -> Run All.
extracted_data

[{'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/brtbeefjpg.jpg',
  'extracted_text': 'essential\n\nWaitrose®\n\nbritish beef ~\nMw diced steak',
  'product_title': '',
  'directions': '',
  'supplements_or_elements': '',
 {'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/cropped_image.jpg',
  'extracted_text': 'FBA Box 1 of 1 - 1Ib\n\nSHIP FROM SHIP TO\n\nJames Bond FBA. dnest+sta012\n\n333 Boren Ave N Amazon.com Services, Inc.\n\nSeattle, WA 98109 4255 Anson Blvd\n\nUnited States Whitestown, IN 46075-4412\nUnited States\n\nFBA (10/30/19 10:55 AM) - 1\n\nae\npea\n\nFBA15JD9C5R9U000001\nMixed SKUs pnd\n| |\n\nKM-SSHL-KJON\nQty 1\n\nPLEASE LEAVE THIS LABEL UNCOVERED',
  'product_title': '',
  'directions': '',
  'supplements_or_elements': '',
 {'image_path': 'C:/Users/hseit/Desktop/SummerMCREU/images/IMG-4228.jpg',
  'extracted_text': 'ALUMINUM + FREE\n\n,\nOld Spice-\n\nPURE SPORT |\n\n——-HIGH ENDURANCE® ——\n\nDEODORANT\n\nNET WE\n2.402 (689',
  'product_title': '',
 

In [7]:
# One row per OCR result
dataframe = pd.DataFrame(extracted_data)

# Shorten every image path to its trailing "<folder>/<file>" part
dataframe['image_path'] = [
    '/'.join(full_path.split('/')[-2:]) for full_path in dataframe['image_path']
]

# Row-wise records (one dictionary per image) for the dataset
train_data = dataframe.to_dict(orient='records')

dataframe

Unnamed: 0,image_path,extracted_text,product_title,directions,supplements_or_elements,warnings
0,images/brtbeefjpg.jpg,essential\n\nWaitrose®\n\nbritish beef ~\nMw d...,,,,
1,images/cropped_image.jpg,FBA Box 1 of 1 - 1Ib\n\nSHIP FROM SHIP TO\n\nJ...,,,,
2,images/IMG-4228.jpg,"ALUMINUM + FREE\n\n,\nOld Spice-\n\nPURE SPORT...",,,,
3,images/IMG-4229.jpg,CONTAINS ODOR-FIGHTING “ATOMIG ROBOTS” THAT “s...,CONTAINS ODOR-FIGHTING “ATOMIG ROBOTS” THAT “s...,,CONTAINS ODOR-FIGHTING “ATOMIG ROBOTS” THAT “s...,
4,images/IMG-4230.jpg,ong racks a\nActive ingredient Purpose\n\n(per...,ong racks a\nActive ingredient Purpose\n\n(per...,ong racks a\nActive ingredient Purpose\n\n(per...,ong racks a\nActive ingredient Purpose\n\n(per...,ong racks a\nActive ingredient Purpose\n\n(per...
5,images/IMG-4231.jpg,,,,,
6,images/IMG-4232.jpg,NATROL\nMelatonin 10=:\n\nSLEEP\n\n@ Fall Asle...,,,,
7,images/IMG-4233.jpg,DIRECTIONS: Take 2 gummies Manufactured in fac...,DIRECTIONS: Take 2 gummies Manufactured in fac...,DIRECTIONS: Take 2 gummies Manufactured in fac...,DIRECTIONS: Take 2 gummies Manufactured in fac...,
8,images/IMG-4234.jpg,Cera\n\nDaily\nMoisturizing\nLotion\n\nFor Nor...,,,,
9,images/IMG-4235.jpg,a\n\nCeraVe” Daily Moisturizing Lotion\n\nDeve...,,a\n\nCeraVe” Daily Moisturizing Lotion\n\nDeve...,a\n\nCeraVe” Daily Moisturizing Lotion\n\nDeve...,
