In [39]:
import os
import pytesseract
from pytesseract import Output
from PIL import Image
import cv2
import pandas as pd
from matplotlib import pyplot as plt

In [40]:
# Machine-specific locations. Every default below is the original hard-coded
# Windows path, so behaviour is unchanged when no override is set. To run on a
# different machine, set the named environment variable before starting the
# kernel instead of editing this cell.

# Path of the Tesseract executable used by pytesseract (override: TESSERACT_CMD).
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe')

# TESSDATA_PREFIX is still assigned unconditionally, exactly as before; only the
# value written can be overridden (override: TESSDATA_DIR).
os.environ["TESSDATA_PREFIX"] = os.environ.get(
    "TESSDATA_DIR", r'C:\Program Files\Tesseract-OCR\tessdata')

# Image folders (overrides: TRAIN_IMAGES_DIR / TEST_IMAGES_DIR).
train_images_dir = os.environ.get(
    "TRAIN_IMAGES_DIR", r'E:\AI ML DL\Amazon_ML_Challenge\data\raw\images\train')
test_images_dir = os.environ.get(
    "TEST_IMAGES_DIR", r'E:\AI ML DL\Amazon_ML_Challenge\data\raw\images\test')

In [41]:
def load_and_preprocess_image(image_path, target_size=(224, 224)):
    """Read an image with OpenCV and resize it to ``target_size``.

    Args:
        image_path: Path handed to ``cv2.imread``.
        target_size: Size passed unchanged to ``cv2.resize``; defaults to
            ``(224, 224)``.

    Returns:
        The value returned by ``cv2.resize`` for the loaded image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` yields ``None`` for ``image_path``.
    """
    loaded = cv2.imread(image_path)
    if loaded is not None:
        return cv2.resize(loaded, target_size)
    # imread signals failure by returning None rather than raising.
    raise FileNotFoundError(f"Image not found: {image_path}")

In [42]:
def extract_text_from_image(image_path):
    """Run Tesseract OCR on one image and report the script Tesseract detects.

    The image is loaded through ``load_and_preprocess_image``, converted from
    OpenCV's BGR channel order to RGB, and passed to
    ``pytesseract.image_to_string``. Script detection
    (``pytesseract.image_to_osd``) runs separately, so a detection failure
    does not discard text that was already extracted.

    Args:
        image_path: Path of the image file to process.

    Returns:
        tuple: ``(detected_languages, text)``. ``detected_languages`` is the
        ``'script'`` value from Tesseract's OSD output, or ``'unknown'`` when
        detection fails or reports no script. ``text`` is the OCR output with
        surrounding whitespace stripped (possibly ``''``). ``(None, None)`` is
        returned when loading the image or running OCR raises.
    """
    try:
        # Load and preprocess the image
        # NOTE(review): the loader's default resizes to 224x224 before OCR,
        # which may hurt recognition of small text - confirm this is intended.
        image = load_and_preprocess_image(image_path)

        # OpenCV returns BGR, while pytesseract treats arrays as RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        ocr_result = pytesseract.image_to_string(image)

        # OSD reports a *script* (e.g. "Latin") under the 'script' key; there
        # is no 'language' key. OSD also raises on images with too little
        # text, so it gets its own handler and falls back to 'unknown'.
        try:
            osd = pytesseract.image_to_osd(image, output_type=Output.DICT)
            detected_languages = osd.get('script', 'unknown')
        except Exception as osd_error:
            print(f"Script detection failed for {os.path.basename(image_path)}: {osd_error}")
            detected_languages = 'unknown'

        print(f"Detected languages: {detected_languages}")
        if ocr_result.strip():
            print(f"Text extracted from {os.path.basename(image_path)}:\n{ocr_result}\n")
        else:
            print(f"No text found in {os.path.basename(image_path)}")

        return detected_languages, ocr_result.strip()

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this image.
        print(f"Error processing {image_path}: {e}")
        return None, None

In [43]:
def process_image_with_error_handling(image_path):
    """Extract text from one image, collapsing every failure to ``(None, None)``.

    Args:
        image_path: Path forwarded to ``extract_text_from_image``.

    Returns:
        tuple: ``(detected_languages, ocr_text)`` when the extracted text is
        truthy. Otherwise an error line is printed and ``(None, None)`` is
        returned - both when no text was extracted and when extraction raised.
    """
    try:
        languages, text = extract_text_from_image(image_path)
        if text:
            return languages, text
        # Empty/None text is reported through the same path as a real error.
        failure = ValueError("No text extracted.")
    except Exception as exc:
        failure = exc

    print(f"Error: {failure} - Check if the necessary language files are installed.")
    return None, None

In [44]:
def process_all_images(image_dir):
    """OCR every JPEG/PNG file directly inside ``image_dir``.

    Files are visited in sorted name order so the resulting rows are
    reproducible across runs. Extensions are matched case-insensitively, so
    ``photo.JPG`` and ``scan.jpeg`` are processed as well as ``.jpg``/``.png``.

    Args:
        image_dir: Directory whose entries are listed with ``os.listdir``.

    Returns:
        pandas.DataFrame: One row per image that yielded text, with columns
        ``image_file``, ``languages`` and ``extracted_text``. The columns are
        present even when no image yielded text.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    column_names = ['image_file', 'languages', 'extracted_text']

    results = []
    for image_file in sorted(os.listdir(image_dir)):
        if not image_file.lower().endswith(image_extensions):
            continue

        print(f"Processing image: {image_file}")
        image_path = os.path.join(image_dir, image_file)
        languages, text = process_image_with_error_handling(image_path)

        # Images without extracted text (or that failed) are left out.
        if text:
            results.append({
                'image_file': image_file,
                'languages': languages,
                'extracted_text': text
            })

    # Explicit columns keep the schema stable for an empty result, so a later
    # CSV export still writes a header row.
    return pd.DataFrame(results, columns=column_names)

In [45]:
def save_results_to_csv(results_df, output_path):
    """Write ``results_df`` to ``output_path`` as CSV without the index column.

    The parent directory is created first when ``output_path`` is a path, so a
    missing output folder does not fail the save after the OCR run.

    Args:
        results_df: DataFrame to export.
        output_path: Destination passed to ``DataFrame.to_csv``.
    """
    # Only derive a directory from real paths; other targets are passed
    # through to to_csv untouched.
    if isinstance(output_path, (str, os.PathLike)):
        output_dir = os.path.dirname(output_path)
        if output_dir:  # empty for a bare file name in the current directory
            os.makedirs(output_dir, exist_ok=True)

    results_df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")

In [None]:
# Main execution: OCR every image in the train folder and save the results as CSV.
if __name__ == "__main__":
    # Process all images in the train folder.
    # `train_results` gets one row per image for which text was extracted and
    # is read again by the visualization code further down.
    print("Starting OCR processing for train images...")
    train_results = process_all_images(train_images_dir)

    # Save the results to a CSV file (absolute, machine-specific output path)
    output_csv_path = r'E:\AI ML DL\Amazon_ML_Challenge\results\train_ocr_results.csv'
    save_results_to_csv(train_results, output_csv_path)

    print("OCR processing completed.")

In [None]:
def display_image_with_text(image_path, text):
    """Show an image with a preview of its extracted text as the plot title.

    The title contains at most the first 50 characters of ``text``; an
    ellipsis is appended only when the text was actually cut. Dollar signs are
    escaped so matplotlib renders them literally instead of treating the text
    between them as a math expression.

    Args:
        image_path: Path of the image, loaded via ``load_and_preprocess_image``.
        text: Extracted text to preview in the title.
    """
    max_title_chars = 50

    image = load_and_preprocess_image(image_path)

    preview = text[:max_title_chars]
    if len(text) > max_title_chars:
        preview += "..."
    # OCR text such as prices can contain '$', which matplotlib would
    # otherwise parse as mathtext delimiters.
    preview = preview.replace("$", r"\$")

    # OpenCV images are BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(f"Extracted Text: {preview}")
    plt.axis('off')
    plt.show()

# Example: display the first image that produced OCR text, if there is one
if not train_results.empty:
    first_result = train_results.iloc[0]
    first_image_path = os.path.join(train_images_dir, first_result['image_file'])
    display_image_with_text(first_image_path, first_result['extracted_text'])