In [6]:
import cv2
import pytesseract
from pytesseract import Output
from PIL import Image
import re
import os
import zipfile
import io
import numpy as np

In [21]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Change this to your Tesseract path

In [34]:
def preprocess_image(image, method='default'):
    """Preprocess the image based on the selected method."""
    img = np.array(image)
    
    if method == 'grayscale':
        # Convert to grayscale
        return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    
    elif method == 'threshold':
        # Convert to grayscale and apply binary thresholding
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return thresh
    
    elif method == 'denoise':
        # Convert to grayscale, apply thresholding, and denoise
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        denoised = cv2.fastNlMeansDenoising(thresh, None, 30, 7, 21)
        return denoised
    
    else:
        # Default - no preprocessing
        return img

In [35]:
def extract_text_from_image(image, psm=6, preprocess_method='default'):
    """Extracts text from an image using Tesseract OCR with different PSM modes and preprocessing."""
    img = preprocess_image(image, method=preprocess_method)
    # Configure Tesseract options
    config = f'--psm {psm}'
    text = pytesseract.image_to_string(img, config=config)
    return text

In [37]:
def extract_details(text):
    """Extract specific details from the OCR text using improved regex patterns."""
    details = {
        'Date': re.findall(r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\b', text),
        'Name': re.findall(r'Name:\s*([\w\s]+)', text),
        'NRIC': re.findall(r'[STFG]\d{7}[A-Z]', text),
        'Shop Name': re.findall(r'Shop Name:\s*([\w\s]+)', text),
        'Address': re.findall(r'Address:\s*([\w\s,.-]+)', text),
        'Total Amount': re.findall(r'\$\s*\d+\.\d{2}', text)  # Adjust for variations
    }
    return details

In [40]:
# Specify the path to the zip file containing the images
zip_file_path = r'C:\Users\hp\dsproject\MLProject\AItest\ai_soluton_design\large-receipt-image-dataset-SRD.zip'  # Replace with your actual zip file path

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # List all JPG images in the zip file
    all_receipts = [file for file in zip_ref.namelist() if file.lower().endswith('.jpg')]
    
    print(f"Found {len(all_receipts)} receipt images in the zip file.")
    
    # Create a list to hold all extracted data
    extracted_data = []

    # Process each receipt image in the zip file
    for receipt in all_receipts:
        with zip_ref.open(receipt) as file:
            # Open the image using PIL directly from the zip file
            image = Image.open(file)
            text = extract_text_from_image(image)
            details = extract_details(text)
            extracted_data.append(details)

    # Display extracted data for each sample
    for idx, data in enumerate(extracted_data):
        print(f"Receipt {idx + 1}: {data}")

Found 200 receipt images in the zip file.
Receipt 1: {'Date': ['2/19/19'], 'Name': [], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': ['$12.95', '$12.95', '$1.23', '$14.18', '$14.18', '$2.33', '$16.51', '$2.59', '$16.77']}
Receipt 2: {'Date': ['5/4/2017'], 'Name': [], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': []}
Receipt 3: {'Date': ['28-08-2018'], 'Name': [], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': []}
Receipt 4: {'Date': [], 'Name': [], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': ['$2.25', '$6.75', '$1.50', '$4.00', '$22.25', '$2.22', '$24.47', '$4.40', '$4.89', '$6.12']}
Receipt 5: {'Date': [], 'Name': [], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': ['$21.73', '$21.13', '$0.00']}
Receipt 6: {'Date': [], 'Name': [], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': []}
Receipt 7: {'Date': ['08/03/18'], 'Name': ['HWANG '], 'NRIC': [], 'Shop Name': [], 'Address': [], 'Total Amount': ['$24.00', 

In [41]:
import pandas as pd

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(extracted_data)

# Save the DataFrame to a CSV file
df.to_csv('extracted_receipt_data.csv', index=False)

print("Extracted data has been saved to 'extracted_receipt_data.csv'.")

Extracted data has been saved to 'extracted_receipt_data.csv'.
