In [None]:
import os
import cv2
import pytesseract
import pandas as pd
from tabulate import tabulate
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor
import tensorflow as tf
import numpy as np

# Set up Tesseract OCR: tell pytesseract where the Tesseract executable lives.
# The path is a hard-coded Windows install location and must be adjusted on
# any machine where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Accumulators filled in by the image loop: one dict per processed image,
# plus a running count of the files that were picked up.
extracted_data = []
amount = 0

# Folder that holds the label images to run OCR on
folder_path = "C:/Users/hseit/Desktop/SummerMCREU/images/"

# Feature name -> phrases whose presence in the OCR text marks that feature
keywords = dict(
    product_title=["product name", "name", "product"],
    directions=["directions", "usage", "how to use"],
    supplements_or_elements=["supplements", "elements", "ingredients"],
    warnings=["warnings", "cautions", "precautions"],
)

# Iterate over the JPEG label images in the folder
for filename in os.listdir(folder_path):
    # Case-insensitive match so files saved as ".JPG" are not silently skipped
    if not filename.lower().endswith(".jpg"):
        continue

    amount += 1
    print("File found: " + str(amount))
    # Load the label image
    image_path = os.path.join(folder_path, filename)
    image = cv2.imread(image_path)
    print()

    # cv2.imread signals an unreadable/corrupt file by returning None rather
    # than raising, which would otherwise crash cv2.resize below.
    if image is None:
        print("Skipping unreadable image: " + image_path)
        continue

    # Preprocess the image
    resized_image = cv2.resize(image, (800, 600))  # Resize the image
    normalized_image = cv2.normalize(resized_image, None, 0, 255, cv2.NORM_MINMAX)  # Normalize pixel values
    denoised_image = cv2.fastNlMeansDenoisingColored(normalized_image, None, 10, 10, 7, 21)  # Denoise the image
    gray = cv2.cvtColor(denoised_image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale

    # Perform text detection
    # NOTE(review): the grayscale image is passed to findContours without
    # thresholding, so every non-zero pixel counts as foreground and the
    # largest region is likely to span most of the image - confirm intended.
    contours, _ = cv2.findContours(gray.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    bounding_boxes = (cv2.boundingRect(contour) for contour in contours)
    # Filter out small contours
    text_regions = [(x, y, w, h) for x, y, w, h in bounding_boxes if w > 10 and h > 10]

    # Find the region with maximum area; fall back to the whole image when no
    # region survives the size filter (max() would raise on an empty list).
    if text_regions:
        x, y, w, h = max(text_regions, key=lambda r: r[2] * r[3])
    else:
        x, y = 0, 0
        h, w = gray.shape[:2]

    # Crop the image using the region coordinates
    cropped_image = gray[y:y+h, x:x+w]

    # Perform text recognition on the cropped image
    extracted_text = pytesseract.image_to_string(cropped_image).strip()

    # Match keywords to determine the feature type. The whole OCR text is
    # stored under every feature whose keyword list has at least one hit.
    lowered_text = extracted_text.lower()
    extracted_dict = {
        feature: extracted_text
        for feature, feature_keywords in keywords.items()
        if any(keyword.lower() in lowered_text for keyword in feature_keywords)
    }

    # Store the extracted data (empty string for features that did not match)
    data = {
        "image_path": image_path,
        "product_title": extracted_dict.get("product_title", ""),
        "directions": extracted_dict.get("directions", ""),
        "supplements_or_elements": extracted_dict.get("supplements_or_elements", ""),
        "warnings": extracted_dict.get("warnings", ""),
    }
    extracted_data.append(data)

    # Draw the detected regions onto the resized image. The annotated image
    # is not shown or saved inside this loop.
    for region_x, region_y, region_w, region_h in text_regions:
        cv2.rectangle(
            resized_image,
            (region_x, region_y),
            (region_x + region_w, region_y + region_h),
            (0, 255, 0),
            2,
        )

    # Print extracted text
    print("Extracted Text:")
    print(extracted_text)

# Turn the per-image result dicts into a table: one row per image,
# one column per dict key.
df = pd.DataFrame(data=extracted_data)

# Define a function to truncate long strings
def truncate_string(text, max_length: int = 30):
    """Shorten *text* to at most *max_length* characters plus an ellipsis.

    Args:
        text: Value to shorten. Anything that is not a ``str`` (for example
            ``None`` or a NaN cell when the function is mapped over a
            DataFrame) is returned unchanged instead of raising on ``len()``.
        max_length: Number of leading characters to keep before ``'...'``
            is appended.

    Returns:
        ``text`` itself when it is not a string or is no longer than
        ``max_length``; otherwise its first ``max_length`` characters
        followed by ``'...'``.
    """
    if not isinstance(text, str):
        return text
    if len(text) > max_length:
        return text[:max_length] + '...'
    return text

# Keep the untruncated text for training. Truncation only exists to keep the
# printed table readable; encoding the truncated values would merge every
# string that shares the same leading characters into a single label (image
# paths with a long common folder prefix all become identical, for example).
raw_df = df.copy()

# Apply string truncation to the DataFrame (this is the copy that gets displayed)
df = df.applymap(truncate_string)

# Prepare the data for LSTM training
if raw_df.empty:
    raise ValueError("No images were processed; there is no data to train on.")

encoder = LabelEncoder()
df_encoded = pd.DataFrame()
for column in raw_df.columns:
    # The same encoder object is re-fitted for each column, so once the loop
    # finishes `encoder.classes_` describes the last column only.
    df_encoded[column] = encoder.fit_transform(raw_df[column])

# One integer sequence per image: that row's encoded column values
sequences = df_encoded.values.tolist()
# Sequence length is the number of encoded columns, not the number of rows
max_length = max(len(sequence) for sequence in sequences)
# Produces a 2-D integer array of shape (number of images, max_length)
padded_sequences = pad_sequences(sequences, maxlen=max_length)


# Split the data into training and testing sets (70% train, 30% test)
# sparse_categorical_crossentropy needs one integer label per sample, so the
# target has to be a single column. `encoder` was last fitted on the final
# column of the frame, and both the output-layer size and the label decoding
# further down rely on `encoder`, so that column is used as the target.
# NOTE(review): the target column is also part of the input sequences, which
# leaks the label into the features - confirm which field should be predicted.
target_column = df_encoded.columns[-1]
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df_encoded[target_column],
    test_size=0.3,
    random_state=42,
)

# Convert the target labels to numpy arrays
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Get the number of classes for the output layer
num_classes = len(encoder.classes_)


# Build the LSTM model
# The Embedding layer's input_dim must be larger than every index it is fed.
# The input sequences carry the encoded values of all columns, whereas
# `encoder.classes_` only covers the column it was fitted on last, so the
# vocabulary size is taken from the largest encoded value across the frame.
vocab_size = int(df_encoded.to_numpy().max()) + 1
embedding_dim = 50
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One softmax unit per target class
    Dense(num_classes, activation='softmax'),
])

# Configure the optimizer and loss, then fit for 10 epochs using the
# held-out split as validation data
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
)

# Score the trained model on the test split
loss, accuracy = model.evaluate(X_test, y_test)

# Report the test accuracy
print("Accuracy:", accuracy)

# Get the predicted labels for the test set
y_pred = model.predict(X_test)
predicted_labels = encoder.inverse_transform(np.argmax(y_pred, axis=1))

# LabelEncoder.inverse_transform only accepts 1-D input. If y_test still
# carries one column per encoded field, keep only the last column, which is
# the one `encoder` was fitted on last and can therefore decode.
true_encoded = np.asarray(y_test)
if true_encoded.ndim > 1:
    true_encoded = true_encoded[:, -1]
true_labels = encoder.inverse_transform(true_encoded)

# Create a DataFrame with the predicted labels and true labels
results_df = pd.DataFrame({'Predicted': predicted_labels, 'True': true_labels})

# Display the DataFrame
print("\nModel Predictions:")
print(tabulate(results_df, headers='keys', tablefmt='psql'))

print()

# Render the extracted-data table as plain text, without the index column
print("\nExtracted Data:")
extracted_table = df.to_string(index=False)
print(extracted_table)


File found: 1

Extracted Text:
essential
Waitrose®
british beef ~
diced steak

500ge
File found: 2

Extracted Text:
FBA Box 1 of 1 - 1lb

SHIP FROM SHIP TO

James Bond FBA. dnest+sta012

333 Boren Ave N Amazon.com Services. Inc.
Seattle, WA 98109 4255 Anson Blvd

United States Whitestown, IN 46075-4412

United States
FBA (10/30/19 10:55 AM)

zoe
Po
_ 2
=
—

15JD9C5R9U000001

Mixed SKUs
KM-SSHL-KJ9N
Oty 1

PLEASE LEAVE THIS LABEL UNCOVERED
File found: 3

Extracted Text:
ALUMINUM + FREE

O41 Spéce-

HIGH ENDURANCE® ——
DEODORANT

NETWT.
2.402 682)
File found: 4

Extracted Text:
CONTAINS ODOR-FIGHTING “ATOMIC ROBOTS” THAT “SHooT
LASERS” AT YOUR “STENCH MONSTERS” AND REPLACES.
THEM WITH FRESH, CLEAN, MASCULINE “SCENT ELVES”

48 HOUR ODOR PROTECTION.
DIRECTIONS: Twist up product. Apply to underarms only.
Use dally for bost results.
MARHINGS: DO WOT APPLY TO BROKEN SKIN. I RASH OF IRRITATION OEVELOPS
DSCONTINUE USE. USE ONLY AS DIRECTED.
KEEP OUT OF REACH OF CHILDREN.

INGREDIENTS: DIPROPYLEN