In [1]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.6 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (978 kB)
[2K   

In [2]:
import easyocr
import cv2
import matplotlib.pyplot as plt
import numpy as np
import re


In [14]:
!unzip receipt_validation.zip -d data/

Archive:  receipt_validation.zip
   creating: data/receipt_validation/
   creating: data/receipt_validation/train/
   creating: data/receipt_validation/train/non_receipt/
  inflating: data/receipt_validation/train/non_receipt/1.jpg  
  inflating: data/receipt_validation/train/non_receipt/2.jpg  
  inflating: data/receipt_validation/train/non_receipt/22903.jpg  
  inflating: data/receipt_validation/train/non_receipt/22907.jpg  
  inflating: data/receipt_validation/train/non_receipt/22908.jpg  
  inflating: data/receipt_validation/train/non_receipt/22911.jpg  
  inflating: data/receipt_validation/train/non_receipt/22912.jpg  
  inflating: data/receipt_validation/train/non_receipt/22913.jpg  
  inflating: data/receipt_validation/train/non_receipt/22915.jpg  
  inflating: data/receipt_validation/train/non_receipt/22917.jpg  
  inflating: data/receipt_validation/train/non_receipt/22919.jpg  
  inflating: data/receipt_validation/train/non_receipt/22920.jpg  
  inflating: data/receipt_validat

In [15]:
# Sample receipt image used by the demo cells below.
# NOTE(review): this is an absolute path under /content, while the archive is
# unpacked into the relative data/ directory - presumably the notebook's
# working directory is /content; confirm before running elsewhere.
IMAGE_PATH = "/content/data/receipt_validation/train/receipt/X51007846301.jpg"

In [5]:
def run_ocr(image_path: str, lang: list = None):
    """
    Runs EasyOCR on an image and returns raw OCR results.

    Args:
        image_path (str): Path to receipt image.
        lang (list): Language codes for OCR. Defaults to ["en"] when omitted
            or None.

    Returns:
        results (list): List of tuples:
            [
              (bounding_box, text, confidence),
              ...
            ]

    Raises:
        ValueError: If the image cannot be read from image_path.
    """
    # A list literal used as a default would be one object shared by every
    # call; build a fresh list per call instead.
    if lang is None:
        lang = ["en"]

    image = cv2.imread(image_path)

    # A None result means the image could not be loaded; fail loudly here
    # rather than letting the colour conversion fail on it.
    if image is None:
        raise ValueError(f"Could not read image at path: {image_path}")

    # Convert OpenCV's BGR channel order to RGB before handing it to the reader.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    reader = easyocr.Reader(lang, gpu=False)
    return reader.readtext(image_rgb)


In [6]:
def filter_ocr_by_confidence(results, min_conf: float = 0.5):
    """
    Keeps only the OCR detections whose confidence meets a threshold.

    Args:
        results (list): OCR entries, each a (bbox, text, confidence) triple.
        min_conf (float): Inclusive lower bound on confidence.

    Returns:
        list: (bbox, text, confidence) tuples with confidence >= min_conf,
        in their original order.
    """
    return [
        (box, label, score)
        for box, label, score in results
        if score >= min_conf
    ]


In [7]:
def extract_text_lines(results):
    """
    Returns the recognised text ordered top-to-bottom.

    Each entry is positioned by the y-coordinate of the first corner of its
    bounding box; entries sharing the same y keep their incoming order.

    Args:
        results (list): OCR results (bbox, text, confidence)

    Returns:
        lines (list of str): Ordered text lines
    """
    # bbox = [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] -> key on y1
    keyed = [(box[0][1], content) for box, content, _score in results]

    # sorted() is stable, so ties on y preserve detection order
    ordered = sorted(keyed, key=lambda pair: pair[0])

    return [content for _y, content in ordered]


In [34]:
def normalize_line(line: str):
    """
    Normalizes one OCR text line for downstream pattern matching.

    Steps, in order:
      1. Lowercase and strip surrounding whitespace.
      2. Collapse whitespace runs to a single space.
      3. Treat a "1" sitting between two letters as a misread "l".
      4. Treat "o" inside number-like tokens (e.g. "rmo.oo", "8,5o") as a
         misread "0".
      5. Treat "|" as a misread "1".

    Args:
        line (str): Raw OCR text line.

    Returns:
        str: The normalized line.
    """
    line = line.lower().strip()
    line = re.sub(r"\s+", " ", line)
    line = re.sub(r'(?<=[a-z])1(?=[a-z])', 'l', line)

    # Rewrite "o" -> "0" only inside tokens shaped like a decimal amount:
    # digits/"o" on both sides of a "." or ",". The token must start either
    # right after the "rm" currency prefix or at a non-alphanumeric boundary,
    # and must not run into further letters/digits. This fixes every "o" in
    # the token (including those after the decimal mark) while leaving
    # ordinary words such as "no." or "porto," untouched.
    line = re.sub(
        r"(?:(?<=rm)|(?<![a-z0-9]))[0-9o]+[.,][0-9o]+(?![a-z0-9])",
        lambda m: m.group(0).replace("o", "0"),
        line,
    )

    line = line.replace("|", "1")

    return line


def normalize_lines(lines: list):
    """Applies normalize_line to every entry and returns the results as a new list."""
    return list(map(normalize_line, lines))


In [9]:
def extract_amount_from_line(line: str):
    """
    Extracts a numeric currency amount from a line of text.

    Handles formats like:
    - rm8.50
    - rm8,50        (comma used as the decimal mark)
    - rm 8.50
    - rm1,234.50    (comma used as a thousands separator)

    The "rm" prefix is matched case-insensitively. The amount must carry
    exactly two decimal digits; a candidate followed by a further digit is
    rejected rather than truncated.

    Args:
        line (str): Text line to search.

    Returns:
        float or None: The first amount found, or None if there is none.
    """

    # First alternative: thousands-grouped amount with a dot decimal mark.
    # Second alternative: plain digits with "." or "," as the decimal mark.
    # The trailing (?!\d) stops "1,234" from being read as "1,23".
    pattern = r"rm\s*(\d{1,3}(?:,\d{3})+\.\d{2}|\d+[.,]\d{2})(?!\d)"

    match = re.search(pattern, line, flags=re.IGNORECASE)
    if not match:
        return None

    amount_str = match.group(1)

    if "." in amount_str:
        # The dot is the decimal mark, so any commas are thousands separators.
        amount_str = amount_str.replace(",", "")
    else:
        # No dot present: the comma is acting as the decimal mark.
        amount_str = amount_str.replace(",", ".")

    # The regex guarantees a well-formed decimal literal at this point.
    return float(amount_str)


def extract_final_total(lines: list):
    """
    Extracts the final payable total from OCR text lines.

    Strategy:
    1. Walk the keywords in priority order (nett total, grand total, ...);
       for each keyword, return the amount on the first line that contains
       it and carries a parsable amount.
    2. Fallback to largest currency amount found on any line.

    Args:
        lines (list): Normalized OCR text lines.

    Returns:
        float or None: The detected total, or None if no amount was found.
    """

    keyword_priority = [
        "nett total",
        "grand total",
        "total payable",
        "amount payable"
    ]

    # Keyword-based extraction. Keywords are the OUTER loop so that a
    # higher-priority keyword anywhere in the receipt beats a lower-priority
    # one that merely appears on an earlier line.
    for keyword in keyword_priority:
        for line in lines:
            if keyword in line:
                amount = extract_amount_from_line(line)
                if amount is not None:
                    return amount

    # Fallback to largest amount
    amounts = [
        amount
        for amount in map(extract_amount_from_line, lines)
        if amount is not None
    ]

    return max(amounts) if amounts else None


In [10]:
def extract_date(lines: list):
    """
    Finds the first date-looking substring in the OCR text lines.

    Lines are scanned in order; within a line the formats are tried in the
    order listed below, and the first hit is returned as matched.

    Recognised formats:
    - 10 jun 2018   (lowercase three-letter month; a trailing time such as
                     "18.23" is not part of the returned value)
    - 10/06/2018
    - 2018-06-10

    Returns:
        str or None: The matched date text, or None when nothing matches.
    """
    date_regexes = [
        re.compile(r"\b\d{1,2}\s+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{4}\b"),
        re.compile(r"\b\d{1,2}/\d{1,2}/\d{4}\b"),
        re.compile(r"\b\d{4}-\d{1,2}-\d{1,2}\b"),
    ]

    # Lazily yields matches line-by-line, format-by-format; we only need the first.
    hits = (
        found.group(0)
        for text in lines
        for regex in date_regexes
        if (found := regex.search(text))
    )

    return next(hits, None)


In [24]:
def vendor_score(line: str):
    """
    Scores how likely a text line is to be the vendor (business) name.

    Args:
        line (str): A single OCR text line.

    Returns:
        int: -1 when the line contains a blacklisted term; otherwise a
        heuristic score where higher means more vendor-like (may be 0 or
        negative for digit-heavy lines).
    """
    score = 0

    # Keyword checks run on a lowercased copy so they also work on
    # un-normalized input.
    lowered = line.lower()

    # Strong negative signals. Kept as substring checks so inflected forms
    # ("invoices", "totals") are rejected too.
    blacklist = ["invoice", "total", "gst", "tax", "receipt"]
    if any(b in lowered for b in blacklist):
        return -1

    # Alphabet dominance
    alpha_ratio = sum(c.isalpha() for c in line) / max(len(line), 1)
    if alpha_ratio > 0.6:
        score += 2

    # Length matters
    if len(line) >= 10:
        score += 2
    elif len(line) >= 6:
        score += 1

    # Multi-word bonus
    if len(line.split()) >= 2:
        score += 2

    # Business suffix bonus. Matched against whole words, not substrings, so
    # that e.g. "inc" does not fire on "prince" or "since". The long forms
    # "incorporated"/"corporation" are listed explicitly so they still count.
    business_suffixes = {
        "ltd", "limited", "llp", "inc", "incorporated",
        "corp", "corporation", "sdn", "bhd",
    }
    words = set(re.findall(r"[a-z]+", lowered))
    if words & business_suffixes:
        score += 3

    # Penalize digit-heavy lines
    if sum(c.isdigit() for c in line) > 3:
        score -= 2

    return score



def extract_vendor(lines: list):
    """
    Picks the most vendor-like line from the top of the receipt.

    Only the first six lines are considered. Each is scored with
    vendor_score; lines scoring 0 or less are discarded. Among the rest the
    highest score wins, with the earliest line winning ties.

    Returns:
        str or None: The chosen line, or None if no line scores above 0.
    """
    header_region = lines[:6]  # top region only

    viable = [
        (points, text)
        for points, text in ((vendor_score(text), text) for text in header_region)
        if points > 0
    ]

    if not viable:
        return None

    # max() returns the first maximal element, i.e. the earliest top scorer
    best = max(viable, key=lambda pair: pair[0])
    return best[1]


In [25]:
def extract_receipt_entities(lines: list):
    """
    Bundles the individual extractors into one structured result.

    Runs, in order, the vendor, date and final-total extractors over the
    same list of lines.

    Args:
        lines (list): Normalized OCR text lines (ordered top-to-bottom)

    Returns:
        dict: Keys "vendor", "date" and "total", each holding the value the
        corresponding extractor returned.
    """
    return {
        "vendor": extract_vendor(lines),
        "date": extract_date(lines),
        "total": extract_final_total(lines),
    }


In [26]:
# Run OCR on the sample receipt and preview the first five raw detections.
results = run_ocr(IMAGE_PATH)
results[:5]



[([[np.int32(2667), np.int32(1203)],
   [np.int32(3098), np.int32(1203)],
   [np.int32(3098), np.int32(1361)],
   [np.int32(2667), np.int32(1361)]],
  'Hoboz',
  np.float64(0.5182740976238969)),
 ([[np.int32(1630), np.int32(1455)],
   [np.int32(2984), np.int32(1455)],
   [np.int32(2984), np.int32(1559)],
   [np.int32(1630), np.int32(1559)]],
  'UNIHAKKA INTERNATIONAL SDN BHD',
  np.float64(0.9856849153772583)),
 ([[np.int32(2059), np.int32(1536)],
   [np.int32(2555), np.int32(1536)],
   [np.int32(2555), np.int32(1620)],
   [np.int32(2059), np.int32(1620)]],
  '10 Jun 2018 18.23',
  np.float64(0.9902464752058203)),
 ([[np.int32(2146), np.int32(1621)],
   [np.int32(2469), np.int32(1621)],
   [np.int32(2469), np.int32(1712)],
   [np.int32(2146), np.int32(1712)]],
  '(867388-U)',
  np.float64(0.7024909255879769)),
 ([[np.int32(1729), np.int32(1702)],
   [np.int32(2891), np.int32(1702)],
   [np.int32(2891), np.int32(1802)],
   [np.int32(1729), np.int32(1802)]],
  '12, Jalan Tampoi 7/4,Kawas

In [27]:
# Drop detections below 0.5 confidence, then show the counts before and after.
filtered_results = filter_ocr_by_confidence(results, min_conf=0.5)
len(results), len(filtered_results)

(43, 41)

In [28]:
# Order the remaining detections top-to-bottom and print each text entry.
lines = extract_text_lines(filtered_results)
for line in lines:
    print(line)

Hoboz
UNIHAKKA INTERNATIONAL SDN BHD
10 Jun 2018 18.23
(867388-U)
12, Jalan Tampoi 7/4,Kawasan Perindustrian
Tampoi,81200 Johor Bahru,Johor
TAX INVOICE
OR18061002160368
Invoice #
Total
Qty
Item
SR 100100000171-Meat Dish
RM2.83
RM2.83
SR 100100000031
3 Vege
RM4.15
SR /00100000170- Imported Veggies
RM1.50
RM1.50
Total Amount: RM8.48
GST @O%: RMO.OO
Rounding: RMO.02
Nett Total: RM8.50
Amount
Payment Mode_
RM8,50
CASH
RMO.00
Change
TaxRM)
Amount(RM)
GST Summary
8.48
SR = GST @0%
GST REG #000656195584
BAR WANG RICE@PERMAS JAYA
Thank You & Come Againl
Like and Follow Us on Facebookl
Facebook com/BarWangRice


In [35]:
# Normalize every line (lowercasing, whitespace and OCR-confusion fixes) and
# print the result for comparison with the raw lines above.
clean_lines = normalize_lines(lines)
for l in clean_lines:
    print(l)


hoboz
unihakka international sdn bhd
10 jun 2018 18.23
(867388-u)
12, jalan tampoi 7/4,kawasan perindustrian
tampoi,81200 johor bahru,johor
tax invoice
or18061002160368
invoice #
total
qty
item
sr 100100000171-meat dish
rm2.83
rm2.83
sr 100100000031
3 vege
rm4.15
sr /00100000170- imported veggies
rm1.50
rm1.50
total amount: rm8.48
gst @o%: rm0.oo
rounding: rm0.02
nett total: rm8.50
amount
payment mode_
rm8,50
cash
rm0.00
change
taxrm)
amount(rm)
gst summary
8.48
sr = gst @0%
gst reg #000656195584
bar wang rice@permas jaya
thank you & come againl
like and follow us on facebookl
facebook com/barwangrice


In [36]:
# Extract the final payable total from the normalized lines and display it.
final_total = extract_final_total(clean_lines)
final_total

8.5

In [38]:
# Extract the first date found in the normalized lines and display it.
date = extract_date(clean_lines)
date

'10 jun 2018'

In [37]:
# Pick the most vendor-like line from the top of the receipt and display it.
vendor = extract_vendor(clean_lines)
vendor

'unihakka international sdn bhd'

In [39]:
# Run all three extractors at once and display the combined result.
entities = extract_receipt_entities(clean_lines)
entities

{'vendor': 'unihakka international sdn bhd',
 'date': '10 jun 2018',
 'total': 8.5}