In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from PIL import Image
import os
import pandas as pd

In [12]:
sample_test=pd.read_excel('D:/STUDY/MS/Amazon ML Hackathon 2024/student_resource 3/dataset/new_sample_train.xls.xlsx')

In [13]:
# Step 1: Image Preprocessing
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize images to 128x128
    transforms.ToTensor(),          # Convert images to PyTorch tensors
    transforms.Normalize((0.5,), (0.5,))  # Normalize to range [-1, 1]
])

# Load images into PyTorch tensor format
def load_image(image_path):
    image = Image.open(image_path)  # Open the image
    image = transform(image)        # Apply the transformations
    return image

# Load the images (assuming you have the file paths and entity names)
image_paths = sample_test['image_link'].tolist()  # Ensure this contains the paths to your downloaded images
image_dir = 'D:/STUDY/MS/Amazon ML Hackathon 2024/student_resource 3/images2'

images = []
for img_url in image_paths:
    img_name = img_url.split('/')[-1]
    local_image_path = os.path.join(image_dir, img_name)
    if os.path.exists(local_image_path):
        images.append(load_image(local_image_path))  # Preprocess each image

# Stack images into a tensor
X = torch.stack(images)

y = torch.randint(0, 2, (len(images),))  # Random binary labels for text region presence/absence


In [17]:
# Step 2: CNN Model for Text Region Detection
class SimpleTextRegionCNN(nn.Module):
    def __init__(self):
        super(SimpleTextRegionCNN, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # 3 input channels (RGB), 16 output channels
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(32 * 32 * 32, 128)  # Fully connected layer
        self.fc2 = nn.Linear(128, 2)  # Output layer (2 classes: text-heavy region vs. non-text-heavy region)
    
    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(-1, 32 * 32 * 32)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Step 3: Initialize the Model, Loss Function, and Optimizer
model = SimpleTextRegionCNN()
criterion = nn.CrossEntropyLoss()  # Use cross-entropy loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer


In [18]:
# Step 4: Training the CNN
num_epochs = 5  # Keep it small for testing
batch_size = 8

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for i in range(0, len(X), batch_size):
        inputs = X[i:i+batch_size]
        labels = y[i:i+batch_size]
        
        optimizer.zero_grad()  # Zero the parameter gradients

        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Calculate loss

        loss.backward()  # Backpropagate
        optimizer.step()  # Optimize

        running_loss += loss.item()
        if i % (batch_size * 2) == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / (i+1):.4f}")

print('Finished Training')

Epoch [1/5], Loss: 0.7044
Epoch [1/5], Loss: 0.1461
Epoch [1/5], Loss: 0.1195
Epoch [1/5], Loss: 0.1097
Epoch [1/5], Loss: 0.1165
Epoch [1/5], Loss: 0.1093
Epoch [1/5], Loss: 0.1095
Epoch [1/5], Loss: 0.1106
Epoch [1/5], Loss: 0.1076
Epoch [1/5], Loss: 0.1054
Epoch [1/5], Loss: 0.1041
Epoch [1/5], Loss: 0.1025
Epoch [1/5], Loss: 0.1012
Epoch [2/5], Loss: 0.6459
Epoch [2/5], Loss: 0.1156
Epoch [2/5], Loss: 0.0969
Epoch [2/5], Loss: 0.0917
Epoch [2/5], Loss: 0.0898
Epoch [2/5], Loss: 0.0870
Epoch [2/5], Loss: 0.0851
Epoch [2/5], Loss: 0.0831
Epoch [2/5], Loss: 0.0829
Epoch [2/5], Loss: 0.0816
Epoch [2/5], Loss: 0.0816
Epoch [2/5], Loss: 0.0810
Epoch [2/5], Loss: 0.0806
Epoch [3/5], Loss: 0.5384
Epoch [3/5], Loss: 0.0899
Epoch [3/5], Loss: 0.0683
Epoch [3/5], Loss: 0.0687
Epoch [3/5], Loss: 0.0690
Epoch [3/5], Loss: 0.0671
Epoch [3/5], Loss: 0.0747
Epoch [3/5], Loss: 0.0762
Epoch [3/5], Loss: 0.0741
Epoch [3/5], Loss: 0.0718
Epoch [3/5], Loss: 0.0720
Epoch [3/5], Loss: 0.0697
Epoch [3/5],

In [23]:
import cv2
import numpy as np
from PIL import Image
import pytesseract

# Function to preprocess the image for OCR
def preprocess_image_for_ocr(image):
    # Convert the image to grayscale
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    
    # Apply binary thresholding (convert image to black and white)
    _, thresholded_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)
    
    # Optional: Apply Gaussian blur to reduce noise
    denoised_image = cv2.GaussianBlur(thresholded_image, (5, 5), 0)
    
    return denoised_image

# Dictionary to store the results {image_link: extracted_text}
results = {}

# Use the CNN to get predictions for text-heavy regions
model.eval()  # Set the model to evaluation mode

# Loop over all images in X and apply preprocessing + OCR to detected text regions
for i in range(len(X)):
    input_image = X[i].unsqueeze(0)  # Add a batch dimension
    output = model(input_image)  # Get the CNN output
    _, predicted = torch.max(output, 1)  # Predicted label (1: text-heavy, 0: no text)
    
    img_path = image_paths[i]  # Get the image link corresponding to the current image
    img_name = img_path.split('/')[-1]
    local_image_path = os.path.join(image_dir, img_name)

    if predicted.item() == 1:  # If text-heavy region detected
        print(f"Text region detected in image {i}.")
        
        # Open the image using PIL
        image = Image.open(local_image_path)
        
        # Preprocess the image for OCR
        preprocessed_image = preprocess_image_for_ocr(image)
        
        # Use Tesseract to extract text from the preprocessed image
        extracted_text = pytesseract.image_to_string(preprocessed_image)
        print(f"Extracted Text from Image {i}: {extracted_text}")
        
        # Store the result in the dictionary, associating the image link with the extracted text
        results[img_path] = extracted_text
    else:
        print(f"No text detected in image {i}.")
        results[img_path] = "No text detected."

# At the end, `results` will contain each `image_link` as the key and the `extracted_text` as the value


Text region detected in image 0.
Extracted Text from Image 0: PROPSS

NATURE

Acie 100% pure et naturelle, la tere de
Soameiéres px tsente des propwidnes abs “s


No text detected in image 1.
No text detected in image 2.
Text region detected in image 3.
Extracted Text from Image 3: 
Text region detected in image 4.
Extracted Text from Image 4: PSYLLIUM

HUSK

FRE


Text region detected in image 5.
Extracted Text from Image 5: * Naturally-Sourced Psyllium

 ———,,

* High strength 1400mg
per serving
NS

LANTAGO
Plant SEEDS

* Suitable for Vegans & Vegetarians

,

Horbaach

Text region detected in image 6.
Extracted Text from Image 6: Directions: For adults, take two (2) vegan capsules
daily, preferably with a meal. Do not exceed stated dose.

Nutrition Information

HIGH ST “Typleally Per Daily Dose
Psyllium Husk Powder 1400mg

Ingredients: Psyllium Husk Powder, Capsule Shell (Hy-
droxypropylmethylcellulose), Anti-Caking Agents (Mag-

nesium Salts of Fatty Acids, Silicon Dioxide).
May con

In [22]:
extracted_text

'Amount Per Serving (2 Gummies)\n\nCreatine Monohydrate\n* NVR = Nutrient Reference Value\n\ntT =NVR Not Establoshed\n\nINGREDIENTS:\n\nCreatine Monohydrate, Water, Pectin, Citric\nAcid, Soduim Citrate, Natural Flavour, Natural\nColour, Carnauba Wax, Xylitol, Maltitol.\n\nSer,.\nOns wy\n\n3G\n\nLY\n\n'

In [25]:
results_df = pd.DataFrame(list(results.items()), columns=['image_link','extracted_text'])
print(results_df)

                                            image_link  \
0    https://m.media-amazon.com/images/I/61I9XdN6OF...   
1    https://m.media-amazon.com/images/I/71gSRbyXmo...   
2    https://m.media-amazon.com/images/I/61BZ4zrjZX...   
3    https://m.media-amazon.com/images/I/612mrlqiI4...   
4    https://m.media-amazon.com/images/I/617Tl40LOX...   
..                                                 ...   
190  https://m.media-amazon.com/images/I/718M5ODio0...   
191  https://m.media-amazon.com/images/I/61lpwH0qHb...   
192  https://m.media-amazon.com/images/I/712sZRVe98...   
193  https://m.media-amazon.com/images/I/71d+dz7ogk...   
194  https://m.media-amazon.com/images/I/51WSQa1ygM...   

                                        extracted_text  
0    PROPSS\n\nNATURE\n\nAcie 100% pure et naturell...  
1                                    No text detected.  
2                                    No text detected.  
3                                                       
4                 

In [29]:
pd.set_option('display.max_colwidth', None)
results_df.head(10)

Unnamed: 0,image_link,extracted_text
0,https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg,"PROPSS\n\nNATURE\n\nAcie 100% pure et naturelle, la tere de\nSoameiéres px tsente des propwidnes abs “s\n\n"
1,https://m.media-amazon.com/images/I/71gSRbyXmoL.jpg,No text detected.
2,https://m.media-amazon.com/images/I/61BZ4zrjZXL.jpg,No text detected.
3,https://m.media-amazon.com/images/I/612mrlqiI4L.jpg,
4,https://m.media-amazon.com/images/I/617Tl40LOXL.jpg,PSYLLIUM\n\nHUSK\n\nFRE\n\n
5,https://m.media-amazon.com/images/I/61QsBSE7jgL.jpg,"* Naturally-Sourced Psyllium\n\n ———,,\n\n* High strength 1400mg\nper serving\nNS\n\nLANTAGO\nPlant SEEDS\n\n* Suitable for Vegans & Vegetarians\n\n,\n\nHorbaach\n"
6,https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg,"Directions: For adults, take two (2) vegan capsules\ndaily, preferably with a meal. Do not exceed stated dose.\n\nNutrition Information\n\nHIGH ST “Typleally Per Daily Dose\nPsyllium Husk Powder 1400mg\n\nIngredients: Psyllium Husk Powder, Capsule Shell (Hy-\ndroxypropylmethylcellulose), Anti-Caking Agents (Mag-\n\nnesium Salts of Fatty Acids, Silicon Dioxide).\nMay contain Sesame Seeds & Mustard. For allergens,\nsee the ingredients in bold.\n\nNotice: Take this product with 220ml of fluids. Taking\nthis product without adequate fluid may cause the pos-\n\nsibility of choking. Do not use this product if you have\n1 4 @) @) MG PLANTAGO OVATA difficulty swallowing. If you experience chest pain, vom-\nPLANT SEEDS iting or difficulty in swallowing or breathing after taking\n\nthis product, seek immediate medical attention. Do not\ntake before sleeping. Fibre products can affect the ab-\nsorption of many medications. Do not take this product\nwithin 1.5 hours of taking medication.\n\nFree From: Artificial Colour, Artificial Flavour, Artificial\nSweetener, Preservatives.\n\nRr\n\nx re} 6 : SUITABLE FOR VEGETARIANS & VEGANS\n\nHU16790 C\n\nHorbaach\n"
7,https://m.media-amazon.com/images/I/71DiLRHeZdL.jpg,No text detected.
8,https://m.media-amazon.com/images/I/91Cma3RzseL.jpg,Horbaach\n\n100%\n\nHIGHEST\nQUALITY\n\nPSYLLIUM\nHUSK\n\n_ PLANTAGO OVATA\n\nPLANT SEEDS\n3652\n: CAPSULES:\n\nHorbaach\n
9,https://m.media-amazon.com/images/I/71jBLhmTNlL.jpg,NEW LOOK.\nSAME TRUSTED QUALITY.\n\nHIGH STRENGTH\n\nPSYLLIUM Wiow $1\nHUSK PSYLLIUM\nHUSK\n\nys Eee\n\nFOOD\nSUPPLEMENT oP QD) ad CAPSULES\n\n~ Horbadach’\n
