In [1]:
pip install pytesseract opencv-python pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [73]:
# Basic libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2
from PIL import Image
import re

# OCR libraries.
import pytesseract
from pytesseract import Output
!pip install keras_ocr
import keras_ocr
from keras_ocr.detection import Detector
from keras_ocr.recognition import Recognizer
from keras_ocr import pipeline

# XML reader.
import xml.etree.ElementTree as ET

Collecting keras_ocr
  Downloading keras_ocr-0.9.3-py3-none-any.whl.metadata (8.6 kB)
Collecting editdistance (from keras_ocr)
  Downloading editdistance-0.8.1-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Collecting efficientnet==1.0.0 (from keras_ocr)
  Downloading efficientnet-1.0.0-py3-none-any.whl.metadata (6.1 kB)
Collecting essential_generators (from keras_ocr)
  Downloading essential_generators-1.0-py3-none-any.whl.metadata (14 kB)
Collecting imgaug (from keras_ocr)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from keras_ocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Collecting shapely (from keras_ocr)
  Downloading shapely-2.0.7-cp311-cp311-win_amd64.whl.metadata (7.1 kB)
Collecting keras-applications<=1.0.8,>=1.0.7 (from efficientnet==1.0.0->keras_ocr)
  Downloading Keras_Applications-1.0.8-py3-none-any.whl.metadata (1.7 kB)
Downloading keras_ocr-0.9.3-py3-none-any.whl (42 kB)
   ------------------

In [58]:
# Path to Tesseract OCR executable (Modify this based on your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [75]:
IMAGE_FOLDER = r"C:\Users\minor\Downloads\receipts_img" 
# Store names to prioritize (add more as needed)
STORE_NAMES = ["WALMART", "TRADER JOE'S", "COSTCO", "TARGET", "KROGER", "WHOLE FOODS"]

# Function to preprocess image for better OCR
def preprocess_image(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    blurred = cv2.GaussianBlur(image, (5, 5), 0)
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
    return thresh

# Function to extract text from an image
def extract_text(image_path):
    processed_img = preprocess_image(image_path)
    text = pytesseract.image_to_string(processed_img, config='--psm 6')
    return text

# Function to extract store name
def extract_store_name(text):
    for store in STORE_NAMES:
        if re.search(store, text, re.IGNORECASE):
            return store
    lines = text.split("\n")
    for line in lines:
        clean_line = re.sub(r'[^a-zA-Z0-9 &]', '', line).strip()
        if clean_line and len(clean_line) > 3:
            return clean_line
    return "Not Found"

# Function to extract date
def extract_date(text):
    date_patterns = [
        r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b",  # 12/31/2023 or 31-12-23
        r"\b\d{4}-\d{2}-\d{2}\b",  # 2023-12-31
        r"\b\d{2}[.]\d{2}[.]\d{2,4}\b"  # 12.31.23 or 31.12.2023
    ]
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not Found"

# Function to extract total amount
def extract_total_amount(text):
    amount_pattern = r"\b\d{1,4}\.\d{2}\b"
    amounts = re.findall(amount_pattern, text)
    return max(amounts, key=float) if amounts else "Not Found"

# Process all images
receipt_data = []
for filename in os.listdir(IMAGE_FOLDER):
    if filename.lower().endswith((".png", ".jpg", ".jpeg")):
        image_path = os.path.join(IMAGE_FOLDER, filename)
        text = extract_text(image_path)

        store_name = extract_store_name(text)
        date = extract_date(text)
        total_amount = extract_total_amount(text)

        receipt_data.append([filename, store_name, date, total_amount])

# Save results to Excel
df = pd.DataFrame(receipt_data, columns=["File Name", "Store Name", "Date", "Total Amount"])
df


Unnamed: 0,File Name,Store Name,Date,Total Amount
0,0.jpg,7 ee ALWAYS LOW PRICES i,Not Found,Not Found
1,1.jpg,gp aes 3 ee a a ote,0-28-2014,40.00
2,10.jpg,ee eee Ry IgG COPE oe OO,Not Found,28.82
3,11.jpg,wPh7R WHOLE Rage,Not Found,9.99
4,12.jpg,teen el Bd be yee,10/20/07,18.75
5,13.jpg,WALMART,Not Found,10.00
6,14.jpg,SSASNSSASS S,Not Found,1.25
7,15.jpg,settee Saree,Not Found,0.00
8,16.jpg,WALMART,Not Found,23.19
9,17.jpg,WALMART,Not Found,70.64


In [76]:
# List of known store names (can be expanded)
KNOWN_STORES = ["WALMART", "TARGET", "TRADER JOE'S", "COSTCO", "WHOLE FOODS"]

# Function to extract text from an image
def extract_text(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

# Function to extract relevant details
def parse_receipt(text):
    lines = text.split("\n")
    store_name = "Not Found"
    total_amount = "Not Found"
    date = "Not Found"

    # Extract store name (Match known stores or uppercase words)
    for line in lines:
        line_clean = re.sub(r'[^a-zA-Z0-9 &]', '', line).strip()
        if any(store in line_clean.upper() for store in KNOWN_STORES):
            store_name = line_clean.upper()
            break
        elif line_clean.isupper() and len(line_clean) > 3 and not re.search(r'survey|feedback|see back', line_clean, re.IGNORECASE):
            store_name = line_clean
            break

    # Extract date (Match multiple formats and choose the first occurrence)
    date_patterns = [
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',  # MM/DD/YYYY or MM/DD/YY
        r'\b\d{1,2}-\d{1,2}-\d{2,4}\b',  # DD-MM-YYYY or DD-MM-YY
        r'\b\d{4}-\d{1,2}-\d{1,2}\b'   # YYYY-MM-DD
    ]
    for line in lines:
        for pattern in date_patterns:
            match = re.search(pattern, line)
            if match:
                date = match.group()
                break
        if date != "Not Found":
            break

    # Extract total amount (Look for currency-like values)
    amount_pattern = r'\b\d{1,4}\.\d{2}\b'
    for line in lines:
        if re.search(r'Total|Subtotal', line, re.IGNORECASE):
            match = re.search(amount_pattern, line)
            if match:
                total_amount = match.group()
                break

    return store_name, date, total_amount

# Process all images in the folder
receipt_data = []
for filename in os.listdir(IMAGE_FOLDER):
    if filename.lower().endswith((".png", ".jpg", ".jpeg")):
        image_path = os.path.join(IMAGE_FOLDER, filename)
        text = extract_text(image_path)
        store_name, date, total_amount = parse_receipt(text)
        receipt_data.append([filename, store_name, date, total_amount])

# Save results to Excel
df = pd.DataFrame(receipt_data, columns=["File Name", "Store Name", "Date", "Total Amount"])
df

Unnamed: 0,File Name,Store Name,Date,Total Amount
0,0.jpg,WALMART,Not Found,5.11
1,1.jpg,TRADER JOES,06-28-2014,Not Found
2,10.jpg,Not Found,Not Found,Not Found
3,11.jpg,WHOLE,Not Found,Not Found
4,12.jpg,WALMART,1/20/07,18.75
5,13.jpg,WALMART,Not Found,Not Found
6,14.jpg,ID 7LORX4K8PUX,05/04/17,26.60
7,15.jpg,WALMART 7,07/22/16,Not Found
8,16.jpg,ID 7L2T9WJM25F,11/13/17,21.74
9,17.jpg,ID 7K2GF1SL2FF,01/18/17,38.68


In [77]:
# Folder containing images
IMAGE_FOLDER = r"C:\Users\minor\Downloads\receipts_img"

# List of known store names (can be expanded)
KNOWN_STORES = ["WALMART", "TARGET", "TRADER JOE'S", "COSTCO", "WHOLE FOODS"]

# Function to extract text from an image
def extract_text(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

# Function to extract relevant details
def parse_receipt(text):
    lines = text.split("\n")
    store_name = "Not Found"
    total_amount = "Not Found"
    date = "Not Found"

    # Extract store name (Match known stores or uppercase words)
    for line in lines:
        line_clean = re.sub(r'[^a-zA-Z0-9 &]', '', line).strip()
        if any(store in line_clean.upper() for store in KNOWN_STORES):
            store_name = line_clean.upper()
            break
        elif line_clean.isupper() and len(line_clean) > 3 and not re.search(r'survey|feedback|see back', line_clean, re.IGNORECASE):
            store_name = line_clean
            break

    # Extract date (Match multiple formats and choose the first occurrence)
    date_patterns = [
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',  # MM/DD/YYYY or MM/DD/YY
        r'\b\d{1,2}-\d{1,2}-\d{2,4}\b',  # DD-MM-YYYY or DD-MM-YY
        r'\b\d{4}-\d{1,2}-\d{1,2}\b'   # YYYY-MM-DD
    ]
    for line in lines:
        for pattern in date_patterns:
            match = re.search(pattern, line)
            if match:
                date = match.group()
                break
        if date != "Not Found":
            break

    # Extract total amount (Look for currency-like values)
    amount_pattern = r'\b\d{1,4}\.\d{2}\b'
    for line in lines:
        if re.search(r'Total|Subtotal', line, re.IGNORECASE):
            match = re.search(amount_pattern, line)
            if match:
                total_amount = match.group()
                break

    return store_name, date, total_amount

# Process all images in the folder
receipt_data = []
for filename in os.listdir(IMAGE_FOLDER):
    if filename.lower().endswith((".png", ".jpg", ".jpeg")):
        image_path = os.path.join(IMAGE_FOLDER, filename)
        text = extract_text(image_path)
        store_name, date, total_amount = parse_receipt(text)
        receipt_data.append([filename, store_name, date, total_amount])

# Save results to Excel
df = pd.DataFrame(receipt_data, columns=["File Name", "Store Name", "Date", "Total Amount"])
df

Unnamed: 0,File Name,Store Name,Date,Total Amount
0,0.jpg,WALMART,Not Found,5.11
1,1.jpg,TRADER JOES,06-28-2014,Not Found
2,10.jpg,Not Found,Not Found,Not Found
3,11.jpg,WHOLE,Not Found,Not Found
4,12.jpg,WALMART,1/20/07,18.75
5,13.jpg,WALMART,Not Found,Not Found
6,14.jpg,ID 7LORX4K8PUX,05/04/17,26.60
7,15.jpg,WALMART 7,07/22/16,Not Found
8,16.jpg,ID 7L2T9WJM25F,11/13/17,21.74
9,17.jpg,ID 7K2GF1SL2FF,01/18/17,38.68


In [61]:
# Function to extract text from an image
def extract_text(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

In [68]:
# Function to extract relevant details
def parse_receipt(text):
    lines = text.split("\n")
    store_name = "Not Found"
    total_amount = "Not Found"
    date = "Not Found"

    # Extract store name (Prioritize uppercase words, avoid common receipt text)
    for line in lines:
        line_clean = re.sub(r'[^a-zA-Z0-9 &]', '', line).strip()
        if line_clean and len(line_clean) > 3 and not re.search(r'survey|feedback|see back', line_clean, re.IGNORECASE):
            store_name = line_clean
            break

    # Extract date (Match multiple date formats)
    date_patterns = [r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b']
    for line in lines:
        for pattern in date_patterns:
            match = re.search(pattern, line)
            if match:
                date = match.group()
                break
        if date != "Not Found":
            break

    # Extract total amount (Look for currency-like values)
    amount_pattern = r'\b\d{1,4}\.\d{2}\b'
    for line in lines:
        if "Total" in line or "Subtotal" in line:
            match = re.search(amount_pattern, line)
            if match:
                total_amount = match.group()
                break

    return store_name, date, total_amount

In [63]:
# Process all images in the folder
receipt_data = []
for filename in os.listdir(IMAGE_FOLDER):
    if filename.lower().endswith((".png", ".jpg", ".jpeg")):
        image_path = os.path.join(IMAGE_FOLDER, filename)
        text = extract_text(image_path)
        store_name, date, total_amount = parse_receipt(text)
        receipt_data.append([filename, store_name, date, total_amount])

In [71]:
# Save results to Excel
df = pd.DataFrame(receipt_data, columns=["File Name", "Store Name", "Date", "Total Amount"])

In [72]:
df.head(15)

Unnamed: 0,File Name,Store Name,Date,Total Amount
0,0.jpg,WAL*MART,Not Found,Not Found
1,1.jpg,TRADER JOE'S,Not Found,Not Found
2,10.jpg,4,Not Found,Not Found
3,11.jpg,WHOLE,Not Found,Not Found
4,12.jpg,WAL*MART,Not Found,Not Found
5,13.jpg,TD i AASWOVBKCH,Not Found,Not Found
6,14.jpg,See back of receipt for your chance,Not Found,Not Found
7,15.jpg,Ac ack of receipt For our ee ase,Not Found,Not Found
8,16.jpg,See back of receip’,Not Found,Not Found
9,17.jpg,See back of receipt for your chance,Not Found,Not Found


In [None]:
df.to_excel("Extracted_Receipt_Data.xlsx", index=False)

print("Extraction complete! Data saved in Extracted_Receipt_Data.xlsx")