### Basic library imports

In [3]:
import os
import pandas as pd

### Read Dataset

In [4]:
# Folder holding the dataset CSVs, relative to the current working directory.
DATASET_FOLDER = '../dataset/'

# Read every CSV into its own DataFrame (same files, same order as before).
train, test, sample_test, sample_test_out = (
    pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_test.csv', 'sample_test_out.csv')
)

### Run Sanity check using src/sanity.py

In [5]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [6]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'cup', 'milligram', 'millimetre', 'microlitre', 'ounce', 'watt', 'volt', 'fluid ounce', 'gram', 'pint', 'kilovolt', 'millivolt', 'centilitre', 'cubic foot', 'pound', 'imperial gallon', 'metre', 'kilogram', 'cubic inch', 'microgram', 'quart', 'yard', 'litre', 'inch', 'gallon', 'centimetre', 'decilitre', 'millilitre', 'kilowatt', 'ton', 'foot'}


### Stratified sampling and image download setup

In [7]:
from utils import download_images
from sklearn.model_selection import StratifiedShuffleSplit

In [8]:
cd ..

/workspaces/Solutions/student_resource 3


In [9]:
%pwd

'/workspaces/Solutions/student_resource 3'

In [10]:
# Load the train data again, this time through a path relative to the new
# working directory (the `cd ..` cell above moved up one level, so the earlier
# '../dataset/' prefix no longer applies).
train_data = pd.read_csv('dataset/train.csv')

In [11]:
# Count the rows per entity_name to see how imbalanced the classes are.
train_data["entity_name"].value_counts()

entity_name
item_weight                      102786
depth                             45127
width                             44183
height                            43597
voltage                            9466
wattage                            7755
item_volume                        7682
maximum_weight_recommendation      3263
Name: count, dtype: int64

In [12]:
# Define a single-split shuffle splitter that holds out exactly 50 rows
# (an integer test_size is an absolute row count). The column to stratify on
# is not set here; it is supplied as the labels argument of .split().
splitter = StratifiedShuffleSplit(n_splits=1, test_size=50, random_state=42)

In [13]:
# The splitter is configured with n_splits=1, so it yields exactly one
# (train, sample) index pair; take it and keep the rows of the held-out sample,
# stratified on the 'entity_name' column.
train_idx, sample_idx = next(splitter.split(train_data, train_data['entity_name']))
stratified_sample = train_data.iloc[sample_idx]

In [14]:
# Check the per-entity counts of the sample to confirm the class proportions
# of the full training set carried over.
stratified_sample["entity_name"].value_counts()

entity_name
item_weight                      20
depth                             9
width                             8
height                            8
voltage                           2
item_volume                       1
wattage                           1
maximum_weight_recommendation     1
Name: count, dtype: int64

In [15]:
# Save the stratified sample to a new CSV file in the current working directory
# (index column omitted). The OCR cells below read this file by name.
stratified_sample.to_csv('stratified_sample.csv', index=False)

In [16]:
# The folder where images will be downloaded.
# NOTE(review): the "Usage" cells below pass the literal "downloaded_images"
# rather than this variable, and a later cell rebinds this name.
download_folder = 'downloaded_images/'

In [17]:
import cv2
from multiprocessing import Pool
from utils import download_images, parse_string, common_mistake
import constants
import pytesseract
from PIL import Image
from tqdm import tqdm
from functools import partial 

In [18]:
# Step 1: Preprocessing function for images
def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    Pipeline: read with OpenCV -> grayscale -> Otsu binarization -> median
    blur with aperture size 3.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The processed image, or None when the file cannot be read or any
        OpenCV step fails (an error message is printed in that case).
    """
    try:
        image = cv2.imread(image_path)
        # cv2.imread reports a missing/unreadable file by returning None rather
        # than raising; say so directly instead of letting cvtColor fail with
        # an opaque assertion message.
        if image is None:
            print(f"Error in preprocessing image {image_path}: file is missing or not a readable image")
            return None
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # With THRESH_OTSU the threshold is computed from the image and the
        # `thresh` argument is ignored, so pass the conventional 0.
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Apply median blur to remove speckle noise left by thresholding
        processed_image = cv2.medianBlur(thresh, 3)
        return processed_image
    except Exception as e:
        print(f"Error in preprocessing image {image_path}: {e}")
        return None

## Pytesseract

In [24]:
def extract_text_pytesseract(image_path):
    """Run Tesseract OCR on the preprocessed version of an image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text, stripped of surrounding whitespace, or "" when
        preprocessing yields nothing or any step raises (the error is printed).
    """
    try:
        binarized = preprocess_image(image_path)
        if binarized is not None:
            # Pytesseract is handed a PIL image built from the preprocessed one.
            return pytesseract.image_to_string(Image.fromarray(binarized)).strip()
        return ""
    except Exception as err:
        print(f"OCR extraction failed for {image_path}: {err}")
        return ""

In [25]:
def process_images_pytesseract(stratified_sample_file, download_folder, output_file):
    """Download the sample's images, OCR them with Pytesseract, and save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: CSV path that receives the input rows plus an
            'Extracted' column holding the OCR text.
    """
    df = pd.read_csv(stratified_sample_file)
    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    print("Extracting text using Pytesseract...")
    # The local file name is assumed to be the URL's basename - TODO confirm
    # against utils.download_images.
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_pytesseract(os.path.join(download_folder, os.path.basename(link))))

    # Persist the results before touching the file system, so a cleanup
    # problem can never discard finished OCR work.
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    # Clean up images after extraction. Only regular files are removed:
    # os.remove raises on a directory, which used to abort the whole run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

In [26]:
# Usage: OCR the stratified sample with Pytesseract (with preprocessing) and
# write the rows plus extracted text to output_pytesseract.csv.
process_images_pytesseract("stratified_sample.csv", "downloaded_images", "output_pytesseract.csv")

Downloading images...


100%|██████████| 50/50 [00:00<00:00, 151.15it/s]


Extracting text using Pytesseract...
Results saved to output_pytesseract.csv


## Pytesseract without preprocessing

In [37]:
import os
import pandas as pd
import pytesseract
from PIL import Image
from tqdm import tqdm
from utils import download_images

# Step 1: OCR extraction without any preprocessing
def extract_text_from_image(image_path):
    """OCR an image file with Pytesseract, without any preprocessing.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with leading/trailing whitespace removed, or ""
        if the image cannot be opened or OCR fails (the error is printed).
    """
    try:
        # Open through a context manager so the file handle is released as
        # soon as OCR is done; the images are deleted right after extraction,
        # and a lingering handle can leak or block that deletion.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()  # Return the extracted text without leading/trailing spaces
    except Exception as e:
        print(f"OCR extraction failed for {image_path}: {e}")
        return ""

# Step 2: Process each row to download the image and extract text
def process_row(row, download_folder):
    """Return the raw OCR text for the image referenced by one dataset row.

    Args:
        row: Mapping/Series with an 'image_link' entry.
        download_folder: Folder in which the image is expected to be stored
            under the basename of its link.

    Returns:
        Whatever extract_text_from_image returns for that local path.
    """
    # Local path = download folder + file name taken from the end of the link.
    local_path = os.path.join(download_folder, os.path.basename(row['image_link']))
    return extract_text_from_image(local_path)

# Step 3: Download images, extract text, and clean up images
def process_images_and_extract_text(stratified_sample_file, download_folder, output_file=None):
    """Download the sample's images, OCR them without preprocessing, save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: Optional CSV path for the results. When omitted, the
            input file is overwritten in place (the original behaviour); pass
            a separate path to keep the sample file untouched for other cells.
    """
    target_file = stratified_sample_file if output_file is None else output_file

    # Load the stratified sample dataset from the CSV file
    df = pd.read_csv(stratified_sample_file)

    # Step 3.1: Download images using the URLs in the image_link column
    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    # Step 3.2: Loop through each row to extract text from images
    print("Extracting raw text from images...")
    df['Extracted'] = df.apply(lambda row: process_row(row, download_folder), axis=1)

    # Step 3.3: Save first, so a cleanup problem cannot discard finished OCR work
    df.to_csv(target_file, index=False)
    print(f"Updated {target_file} with extracted text.")

    # Step 3.4: Clean up images from the local folder. Only regular files are
    # removed: os.remove raises on a directory, which used to abort the run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

# In a notebook kernel __name__ is "__main__", so this guard is true and the
# cell runs the extraction as soon as it is executed (see the output below).
# NOTE: the two assignments are module-level, so they rebind the earlier
# `download_folder` variable.
if __name__ == "__main__":
    stratified_sample_file = "stratified_sample.csv"  # Your file name for the dataset
    download_folder = "downloaded_images"  # Folder to store downloaded images
    
    # Process images, extract text, and update stratified_sample.csv
    # (the input CSV itself is overwritten with the added 'Extracted' column)
    process_images_and_extract_text(stratified_sample_file, download_folder)

Downloading images...


100%|██████████| 1000/1000 [00:19<00:00, 50.27it/s]


Extracting raw text from images...
Updated stratified_sample.csv with extracted text.


## Easy-OCR

In [20]:
import easyocr

In [21]:
# Initialize the EasyOCR reader once (English only, 'dbnet18' text detector)
# so the same reader object is reused for every image.
reader_easyocr = easyocr.Reader(['en'], detector='dbnet18')

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


In [22]:
def extract_text_easyocr(image_path):
    """Run EasyOCR on the preprocessed version of an image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized fragments joined with single spaces and stripped, or ""
        when preprocessing yields nothing or any step raises (the error is printed).
    """
    try:
        binarized = preprocess_image(image_path)
        if binarized is not None:
            # detail=0: the result is joined directly, i.e. treated as plain strings.
            fragments = reader_easyocr.readtext(binarized, detail=0, batch_size=32)
            return ' '.join(fragments).strip()
        return ""
    except Exception as err:
        print(f"EasyOCR extraction failed for {image_path}: {err}")
        return ""

In [23]:
def process_images_easyocr(stratified_sample_file, download_folder, output_file):
    """Download the sample's images, OCR them with EasyOCR, and save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: CSV path that receives the input rows plus an
            'Extracted' column holding the OCR text.
    """
    df = pd.read_csv(stratified_sample_file)
    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    print("Extracting text using EasyOCR...")
    # The local file name is assumed to be the URL's basename - TODO confirm
    # against utils.download_images.
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_easyocr(os.path.join(download_folder, os.path.basename(link))))

    # Persist the results before touching the file system, so a cleanup
    # problem can never discard finished OCR work.
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    # Clean up images after extraction. Only regular files are removed:
    # os.remove raises on a directory, which used to abort the whole run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

In [25]:
# Usage: OCR the stratified sample with EasyOCR and write the rows plus
# extracted text to output_easyocr.csv.
process_images_easyocr("stratified_sample.csv", "downloaded_images", "output_easyocr.csv")

Downloading images...


100%|██████████| 50/50 [00:00<00:00, 152.12it/s]


Extracting text using EasyOCR...
Results saved to output_easyocr.csv


## Keras-OCR

In [19]:
# Set TensorFlow's C++ log-level variable to '1' to reduce log noise.
# NOTE(review): this presumably has to run before TensorFlow/keras_ocr is
# imported to have any effect - confirm the cell order.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 

In [23]:
%pip install -q keras-ocr

Note: you may need to restart the kernel to use updated packages.


In [17]:
import keras_ocr

2024-09-14 14:41:20.378290: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-14 14:41:20.382330: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-14 14:41:20.393947: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-14 14:41:20.405401: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-14 14:41:20.409396: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-14 14:41:20.419704: I tensorflow/core/platform/cpu_feature_gu

In [18]:
# Initialize the Keras-OCR pipeline once with its default arguments so the
# same pipeline object is reused for every image (the log below shows the
# weight files it looks for on first use).
pipeline_kerasocr = keras_ocr.pipeline.Pipeline()

Looking for /home/codespace/.keras-ocr/craft_mlt_25k.h5
Looking for /home/codespace/.keras-ocr/crnn_kurapan.h5
Downloading /home/codespace/.keras-ocr/crnn_kurapan.h5


In [19]:
def extract_text_kerasocr(image_path):
    """Run Keras-OCR on the preprocessed version of an image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized words joined with single spaces and stripped, or ""
        when preprocessing yields nothing or any step raises (the error is printed).
    """
    try:
        binarized = preprocess_image(image_path)
        if binarized is not None:
            # The pipeline is fed a 3-channel image, so expand the grayscale one.
            rgb_image = cv2.cvtColor(binarized, cv2.COLOR_GRAY2RGB)
            # One image in -> first prediction group out; each entry unpacks as (word, box).
            words = [word for word, _box in pipeline_kerasocr.recognize([rgb_image])[0]]
            return ' '.join(words).strip()
        return ""
    except Exception as err:
        print(f"KerasOCR extraction failed for {image_path}: {err}")
        return ""

In [20]:
def process_images_kerasocr(stratified_sample_file, download_folder, output_file):
    """Download the sample's images, OCR them with Keras-OCR, and save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: CSV path that receives the input rows plus an
            'Extracted' column holding the OCR text.
    """
    df = pd.read_csv(stratified_sample_file)
    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    print("Extracting text using KerasOCR...")
    # The local file name is assumed to be the URL's basename - TODO confirm
    # against utils.download_images.
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_kerasocr(os.path.join(download_folder, os.path.basename(link))))

    # Persist the results before touching the file system, so a cleanup
    # problem can never discard finished OCR work.
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    # Clean up images after extraction. Only regular files are removed:
    # os.remove raises on a directory, which used to abort the whole run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

In [21]:
# Usage: OCR the stratified sample with Keras-OCR and write the rows plus
# extracted text to output_kerasocr.csv.
process_images_kerasocr("stratified_sample.csv", "downloaded_images", "output_kerasocr.csv")

Downloading images...


100%|██████████| 50/50 [00:00<00:00, 165.21it/s]


Extracting text using KerasOCR...


2024-09-14 14:43:14.089716: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 48096048 exceeds 10% of free system memory.
2024-09-14 14:43:14.134769: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 48096048 exceeds 10% of free system memory.


Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.


2024-09-14 14:43:14.562991: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1026049024 exceeds 10% of free system memory.
2024-09-14 14:43:15.161781: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1026049024 exceeds 10% of free system memory.
2024-09-14 14:43:17.249428: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 256512256 exceeds 10% of free system memory.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 845ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 851ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 17s/

## Paddle-OCR

In [40]:
!git clone https://github.com/PaddlePaddle/PaddleOCR.git

Cloning into 'PaddleOCR'...
remote: Enumerating objects: 51247, done.[K
remote: Counting objects: 100% (2471/2471), done.[K
remote: Compressing objects: 100% (1190/1190), done.[K
remote: Total 51247 (delta 1283), reused 2259 (delta 1234), pack-reused 48776 (from 1)[K
Receiving objects: 100% (51247/51247), 385.66 MiB | 19.71 MiB/s, done.
Resolving deltas: 100% (35672/35672), done.
Updating files: 100% (2390/2390), done.


In [41]:
from paddleocr import PaddleOCR

In [42]:
# Initialize the PaddleOCR model once (English language, angle classifier
# enabled) so the same object is reused for every image.
ocr_paddleocr = PaddleOCR(use_angle_cls=True, lang='en')

[2024/09/14 11:10:49] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/codespace/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/codespace/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_t

In [43]:
def extract_text_paddleocr(image_path):
    """Run PaddleOCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized lines joined with single spaces and stripped, or ""
        when the image cannot be preprocessed, no text is found, or any step
        raises (the error is printed).
    """
    try:
        # Preprocessing serves only as a readability check here: the OCR call
        # below is given the original file, not the preprocessed image.
        if preprocess_image(image_path) is None:
            return ""
        result = ocr_paddleocr.ocr(image_path)
        # "No text detected" can come back as an empty result or a None page
        # entry; return empty text instead of letting the iteration below
        # raise and be reported as an OCR failure.
        if not result or not result[0]:
            return ""
        # line[1][0] is the recognized text of each detected line.
        extracted_text = ' '.join([line[1][0] for line in result[0]])
        return extracted_text.strip()
    except Exception as e:
        print(f"PaddleOCR extraction failed for {image_path}: {e}")
        return ""

In [44]:
def process_images_paddleocr(stratified_sample_file, download_folder, output_file):
    """Download the sample's images, OCR them with PaddleOCR, and save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: CSV path that receives the input rows plus an
            'Extracted' column holding the OCR text.
    """
    df = pd.read_csv(stratified_sample_file)
    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    print("Extracting text using PaddleOCR...")
    # The local file name is assumed to be the URL's basename - TODO confirm
    # against utils.download_images.
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_paddleocr(os.path.join(download_folder, os.path.basename(link))))

    # Persist the results before touching the file system, so a cleanup
    # problem can never discard finished OCR work.
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    # Clean up images after extraction. Only regular files are removed:
    # os.remove raises on a directory, which used to abort the whole run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

In [45]:
# Usage: OCR the stratified sample with PaddleOCR and write the rows plus
# extracted text to output_paddleocr.csv.
process_images_paddleocr("stratified_sample.csv", "downloaded_images", "output_paddleocr.csv")

Downloading images...


100%|██████████| 50/50 [00:00<00:00, 63.53it/s]


Extracting text using PaddleOCR...
[2024/09/14 11:10:56] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.24773788452148438
[2024/09/14 11:10:56] ppocr DEBUG: cls num  : 2, elapsed : 0.028303861618041992
[2024/09/14 11:10:56] ppocr DEBUG: rec_res num  : 2, elapsed : 0.09077668190002441
[2024/09/14 11:10:56] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.16480779647827148
[2024/09/14 11:10:56] ppocr DEBUG: cls num  : 5, elapsed : 0.024401426315307617
[2024/09/14 11:10:57] ppocr DEBUG: rec_res num  : 5, elapsed : 0.16092491149902344
[2024/09/14 11:10:57] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.16276264190673828
[2024/09/14 11:10:57] ppocr DEBUG: cls num  : 7, elapsed : 0.03212404251098633
[2024/09/14 11:10:57] ppocr DEBUG: rec_res num  : 7, elapsed : 0.5664801597595215
[2024/09/14 11:10:57] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.16255569458007812
[2024/09/14 11:10:58] ppocr DEBUG: cls num  : 4, elapsed : 0.021373748779296875
[2024/09/14 11:10:58] ppocr DEBUG: rec_res num  : 4, elapsed : 0.1

## docTR-OCR

In [20]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import os
import pandas as pd
from tqdm import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Initialize the docTR OCR predictor once with pretrained weights
# ('db_resnet50' detection architecture, 'crnn_vgg16_bn' recognition architecture).
# NOTE(review): nothing in this call selects the PyTorch backend explicitly;
# presumably it follows from the installed framework - confirm.
ocr_model_doctr = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

In [27]:
def extract_text_doctr(image_path):
    """Run the docTR predictor on an image file and return the recognized words.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The words of the first page joined with single spaces and stripped, or
        "" when the image cannot be preprocessed or any step raises (the error
        is printed).
    """
    try:
        # Preprocessing doubles as a readability check; docTR itself is given
        # the original file, not the preprocessed image.
        if preprocess_image(image_path) is None:
            return ""

        document = DocumentFile.from_images([image_path])
        first_page = ocr_model_doctr(document).pages[0]

        # Flatten the page -> block -> line -> word hierarchy in reading order.
        words = [
            word_obj.value
            for block in first_page.blocks
            for line in block.lines
            for word_obj in line.words
        ]
        return ' '.join(words).strip()
    except Exception as err:
        print(f"docTR OCR extraction failed for {image_path}: {err}")
        return ""

In [28]:
def process_images_doctr(stratified_sample_file, download_folder, output_file):
    """Download the sample's images, OCR them with docTR, and save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: CSV path that receives the input rows plus an
            'Extracted' column holding the OCR text.
    """
    # Load the stratified sample dataset
    df = pd.read_csv(stratified_sample_file)

    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    # Extract text using docTR. The local file name is assumed to be the URL's
    # basename - TODO confirm against utils.download_images.
    print("Extracting text using docTR...")
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_doctr(os.path.join(download_folder, os.path.basename(link))))

    # Save the results before touching the file system, so a cleanup problem
    # can never discard finished OCR work.
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    # Clean up downloaded images. Only regular files are removed: os.remove
    # raises on a directory, which used to abort the whole run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

In [29]:
# Usage: OCR the stratified sample with docTR and write the rows plus
# extracted text to output_doctr.csv.
process_images_doctr("stratified_sample.csv", "downloaded_images", "output_doctr.csv")

Downloading images...


100%|██████████| 50/50 [00:00<00:00, 83088.43it/s]


Extracting text using docTR...
Results saved to output_doctr.csv


## Rapid-OCR

In [28]:
import os
import pandas as pd
import cv2
from tqdm import tqdm
from rapidocr_onnxruntime import RapidOCR

# Initialize RapidOCR once with its default arguments; the same engine object
# is reused for every image.
ocr_rapidocr = RapidOCR()

# Preprocessing function for images
def preprocess_image(image_path):
    """Load an image and binarize it for OCR.

    Same pipeline as the preprocess_image defined earlier in this notebook
    (this cell redefines the name): read with OpenCV -> grayscale -> Otsu
    binarization -> median blur with aperture size 3.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The processed image, or None when the file cannot be read or any
        OpenCV step fails (an error message is printed in that case).
    """
    try:
        image = cv2.imread(image_path)
        # cv2.imread reports a missing/unreadable file by returning None rather
        # than raising; say so directly instead of letting cvtColor fail with
        # an opaque assertion message.
        if image is None:
            print(f"Error in preprocessing image {image_path}: file is missing or not a readable image")
            return None
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # With THRESH_OTSU the threshold is computed from the image and the
        # `thresh` argument is ignored, so pass the conventional 0.
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Median blur removes speckle noise left by thresholding
        processed_image = cv2.medianBlur(thresh, 3)
        return processed_image
    except Exception as e:
        print(f"Error in preprocessing image {image_path}: {e}")
        return None

# Function to extract text using RapidOCR
def extract_text_rapidocr(image_path):
    """Run RapidOCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text fragments joined with single spaces and stripped,
        or "" when the image cannot be preprocessed, nothing is recognized, or
        any step raises (the error is printed).
    """
    try:
        # Preprocessing serves only as a readability check here.
        if preprocess_image(image_path) is None:
            return ""

        # Perform OCR using RapidOCR on the original image path, not the
        # preprocessed image.
        result, _ = ocr_rapidocr(image_path)

        # The engine returns a falsy result (None/empty) when nothing is found.
        if not result:
            return ""

        fragments = []
        for item in result:
            # rapidocr_onnxruntime items are [box, text, score]. The previous
            # item[1][0] therefore kept only the FIRST CHARACTER of each text.
            # A (text, score) pair in position 1 is still accepted.
            entry = item[1]
            if not entry:
                continue  # Safeguard for None/empty values
            fragments.append(entry if isinstance(entry, str) else entry[0])
        return ' '.join(fragments).strip()
    except Exception as e:
        print(f"RapidOCR extraction failed for {image_path}: {e}")
        return ""

# Function to process images and extract text
def process_images_rapidocr(stratified_sample_file, download_folder, output_file):
    """Download the sample's images, OCR them with RapidOCR, and save the text.

    Args:
        stratified_sample_file: CSV path; must contain an 'image_link' column.
        download_folder: Directory the images are downloaded into. Every
            regular file directly inside it is deleted after the results are saved.
        output_file: CSV path that receives the input rows plus an
            'Extracted' column holding the OCR text.
    """
    # Load the stratified sample dataset
    df = pd.read_csv(stratified_sample_file)

    print("Downloading images...")
    download_images(df['image_link'], download_folder)

    # The local file name is assumed to be the URL's basename - TODO confirm
    # against utils.download_images.
    print("Extracting text using RapidOCR...")
    df['Extracted'] = df['image_link'].apply(lambda link: extract_text_rapidocr(os.path.join(download_folder, os.path.basename(link))))

    # Save the results before touching the file system, so a cleanup problem
    # can never discard finished OCR work.
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

    # Clean up downloaded images. Only regular files are removed: os.remove
    # raises on a directory, which used to abort the whole run.
    if os.path.isdir(download_folder):
        for file in os.listdir(download_folder):
            file_path = os.path.join(download_folder, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

# Usage: OCR the stratified sample with RapidOCR and write the rows plus
# extracted text to output_rapidocr.csv.
process_images_rapidocr("stratified_sample.csv", "downloaded_images", "output_rapidocr.csv")


Downloading images...


100%|██████████| 50/50 [00:00<00:00, 99.78it/s]


Extracting text using RapidOCR...
Results saved to output_rapidocr.csv
