In [None]:
# Smoke test: run EasyOCR on one training image and print the recognized text.
# Requires: pip install easyocr
from warnings import filterwarnings
# Silence all warnings. This runs before `import easyocr`, presumably so that
# warnings emitted while importing the library are suppressed as well.
filterwarnings("ignore")
import easyocr
reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory
# detail=0 requests plain text output; the result is joined as a list of strings below.
result = reader.readtext('train_images/619cPZMqL7L.jpg',detail=0)
# Collapse the recognized fragments into one space-separated string for display.
result_string = ' '.join(result)
print(result_string)

In [None]:
import os
import pytesseract
from PIL import Image
import pandas as pd
import easyocr
from tqdm import tqdm  # Import tqdm for the progress bar

# Tell pytesseract which Tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Input side: the image directory and the file suffixes treated as images.
dataset_dir = "/raid/ai23resch11003/Adversarial/amazon-ml/train_images"
image_extensions = [".png", ".jpg", ".jpeg"]

# Output side: where the CSV goes; make sure the directory exists up front.
output_dir = "/raid/ai23resch11003/Adversarial/amazon-ml/outputs"
output_filename = "output_new.csv"
os.makedirs(output_dir, exist_ok=True)

# Shared EasyOCR reader for English, created once with GPU use requested;
# extract_text_easyocr reads this module-level object.
reader = easyocr.Reader(["en"], gpu=True)

def extract_text_tesseract(image_path):
    """Run Tesseract OCR on a single image file.

    Args:
        image_path: Path of the image to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``, or ``None`` if
        opening or recognizing the image raised an exception. The error is
        printed rather than re-raised so that one bad image does not abort a
        batch run.
    """
    try:
        # Pillow opens files lazily; the context manager guarantees the file
        # handle is released when we are done, including when OCR fails
        # part-way, instead of waiting for garbage collection.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        print(f"Error extracting text from {image_path} using Tesseract: {e}")
        return None

def extract_text_easyocr(image_path):
    """OCR one image with the module-level EasyOCR ``reader``.

    Args:
        image_path: Path of the image; handed to ``reader.readtext`` as-is.

    Returns:
        The recognized fragments joined by single spaces, or ``None`` (after
        printing the error) when anything raises.
    """
    try:
        # detail=0 asks readtext for plain text output only.
        fragments = reader.readtext(image_path, detail=0)
        return " ".join(fragments)
    except Exception as err:
        print(f"Error extracting text from {image_path} using EasyOCR: {err}")
        return None

def process_dataset(dataset_dir):
    """OCR every image in a directory with both Tesseract and EasyOCR.

    Args:
        dataset_dir: Directory to scan (non-recursively) for files whose
            name ends, case-insensitively, with one of the module-level
            ``image_extensions``.

    Returns:
        A list of dicts with keys ``image_name``, ``tesseract_text`` and
        ``easyocr_text``, ordered by filename. An image contributes a row
        only when both engines returned non-empty text; images where either
        engine failed or found nothing are left out.
    """
    suffixes = tuple(image_extensions)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting CSV reproducible from run to run.
    filenames = sorted(
        f for f in os.listdir(dataset_dir) if f.lower().endswith(suffixes)
    )

    data = []
    # Use tqdm to show the progress bar
    for filename in tqdm(filenames, desc="Processing images", unit="image"):
        image_path = os.path.join(dataset_dir, filename)

        tesseract_text = extract_text_tesseract(image_path)
        if not tesseract_text:
            # The row would be dropped regardless of the EasyOCR result, so
            # skip the second OCR pass entirely.
            continue

        easyocr_text = extract_text_easyocr(image_path)
        if not easyocr_text:
            continue

        data.append({
            'image_name': filename,
            'tesseract_text': tesseract_text,
            'easyocr_text': easyocr_text,
        })

    return data

# Save data to CSV in the output directory
def save_to_csv(data, output_dir, output_filename):
    """Write result rows to a CSV file and print where it was saved.

    Args:
        data: Row data accepted by ``pandas.DataFrame`` (the pipeline passes
            a list of dicts, one per image).
        output_dir: Destination directory; created if it does not exist.
        output_filename: Name of the CSV file inside ``output_dir``.
    """
    # Create the directory here so the function works for any output_dir a
    # caller passes, not only one that was prepared beforehand.
    os.makedirs(output_dir, exist_ok=True)

    csv_filepath = os.path.join(output_dir, output_filename)

    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(data).to_csv(csv_filepath, index=False)
    print(f"Data saved to {csv_filepath}")

# Run the pipeline end to end: OCR the dataset, then write the rows to CSV.
data = process_dataset(dataset_dir=dataset_dir)
save_to_csv(data=data, output_dir=output_dir, output_filename=output_filename)
