## Automated Expense Extraction - Receipt Parsing Using YOLO and OCR
### Evaluation of YOLO + Tesseract

In [None]:
!pip install -q ultralytics pytesseract textdistance dateparser
# !sudo apt-get install -y tesseract-ocr

In [None]:
import cv2
import json
import os
import pandas as pd
import numpy as np
import pytesseract
import dateparser
from difflib import SequenceMatcher
from ultralytics import YOLO
from pathlib import Path
from tqdm.notebook import tqdm
import re

In [None]:
# Resolve DATA_PATH for the current runtime.
# The presence of the COLAB_GPU environment variable is used as the Colab marker.
if 'COLAB_GPU' not in os.environ:
    # Local environment: data lives in the relative ../data folder
    DATA_PATH = Path('../data')
else:
    # Google Colab: mount Google Drive, then point at the data folder inside it
    from google.colab import drive

    drive.mount('/content/drive')
    DATA_PATH = Path('/content/drive/MyDrive/data')

In [None]:
# Load Model
# Builds the YOLO detector from a weights file located under DATA_PATH.
# Exactly one `model_path` assignment should be active; the commented-out lines
# point at other weight files (presumably other training runs - verify before switching).
# model_path = DATA_PATH / "models/yolo_receipts/weights/best.pt"
model_path = DATA_PATH / "models/yolo_receipts_highres_nano/weights/best.pt"
# model_path = DATA_PATH / "models/yolo_receipts_highres_small/weights/best.pt"
# `model` is the module-level detector used by predict_single_image (predictions and class names).
model = YOLO(model_path)

In [None]:
# PART 1: ADVANCED PREPROCESSING

def clean_crop_vendor(crop_img):
    """Lightly clean a vendor/logo crop before OCR.

    The crop is converted to grayscale when it has a channel axis, enlarged
    2x with bicubic interpolation and smoothed with a median filter of
    aperture 3. No thresholding is applied.

    Returns the processed single-channel image.
    """
    has_channels = len(crop_img.shape) == 3
    grayscale = cv2.cvtColor(crop_img, cv2.COLOR_BGR2GRAY) if has_channels else crop_img
    enlarged = cv2.resize(grayscale, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    return cv2.medianBlur(enlarged, 3)

def clean_crop_numeric(crop_img):
    """Binarize a numeric crop (date/total) before OCR.

    Steps: grayscale conversion (only when the crop has a channel axis),
    2x bicubic upscale, 5x5 Gaussian blur, then Gaussian adaptive
    thresholding with block size 31 and constant 10.

    Returns the binary (0/255) single-channel image.
    """
    has_channels = len(crop_img.shape) == 3
    grayscale = cv2.cvtColor(crop_img, cv2.COLOR_BGR2GRAY) if has_channels else crop_img
    enlarged = cv2.resize(grayscale, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    smoothed = cv2.GaussianBlur(enlarged, (5, 5), 0)
    binary = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31,
        10,
    )
    return binary

def preprocess_full_page(original_img):
    """Binarize a whole receipt page for the full-page OCR fallback.

    Pipeline: grayscale -> 2x bicubic upscale -> 5x5 Gaussian blur ->
    Gaussian adaptive threshold (block size 31, C = 10).

    The original author's note reports that this simple adaptive-threshold
    variant gave the best results (approx 60% accuracy) on the SROIE test set.

    Returns the thresholded single-channel image.
    """
    # Grayscale conversion is only needed when a channel axis is present
    page = original_img
    if len(page.shape) == 3:
        page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)

    # Doubling the size thickens small text such as the date line
    page = cv2.resize(page, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # A slight blur keeps scanner grain and dust from surviving the threshold
    page = cv2.GaussianBlur(page, (5, 5), 0)

    # Large neighbourhood (31) to ignore small stains; C = 10 pushes the background to white
    return cv2.adaptiveThreshold(
        page, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10
    )

In [None]:
# PART 2: STRICT REGEX FALLBACKS

# Single compiled date pattern shared by validation and extraction, so the two
# helpers can never disagree about what counts as a date.
# Alternatives, tried in this order at each position:
#   1. D/M/Y-style: 1-2 digits, '/' or '-', 1-2 digits, '/' or '-', 2-4 digits
#   2. Y/M/D-style: 4 digits, '/' or '-', 1-2 digits, '/' or '-', 1-2 digits
#   3. Day, one whitespace, month name starting with a 3-letter abbreviation,
#      one whitespace, 2-4 digit year (e.g. "12 DEC 2023")
_DATE_PATTERN = re.compile(
    r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}'
    r'|\d{4}[/-]\d{1,2}[/-]\d{1,2}'
    r'|\d{1,2}\s(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[a-z]*\s\d{2,4})',
    re.IGNORECASE,
)


def is_valid_date(text):
    """Return True if `text` contains something shaped like a date.

    Only the textual shape is checked (see `_DATE_PATTERN`); the day/month
    values themselves are not validated.
    """
    return _DATE_PATTERN.search(text) is not None


def fallback_find_date(text):
    """Return the first date-shaped substring of `text`, or None if there is none."""
    # search() stops at the leftmost match, which is exactly what findall()[0] gave
    first = _DATE_PATTERN.search(text)
    return first.group(1) if first else None

# Fallback heuristics for the total amount and the vendor name
def fallback_find_total(text):
    """Return the largest money-like amount found in `text`, formatted with 2 decimals.

    An amount is a run of digits, optionally followed by '.'/',' separated
    groups of three digits, and ending in a '.'/',' plus exactly two decimals.
    Both "1,234.56" and "1.234,56" are normalised to 1234.56.

    Returns:
        The maximum amount as a string such as "1234.56", or None when no
        amount is found.
    """
    # The lookbehind plus `\d+` makes the match start at the beginning of the
    # number; without them "12345.67" was silently truncated to "345.67".
    amount_pattern = r'(?<!\d)(\d+(?:[.,]\d{3})*[.,]\d{2})\b'
    matches = re.findall(amount_pattern, text)
    if not matches:
        return None

    amounts = []
    for raw in matches:
        # Treat every separator as '.', then keep only the last one (the decimal point)
        normalized = raw.replace(',', '.')
        separator_count = normalized.count('.')
        if separator_count > 1:
            normalized = normalized.replace('.', '', separator_count - 1)
        try:
            amounts.append(float(normalized))
        except ValueError:
            # Not expected given the pattern; skip the value instead of aborting the scan
            continue

    if not amounts:
        # Best effort: hand back the last raw match untouched
        return matches[-1]
    return "{:.2f}".format(max(amounts))

def fallback_find_vendor(text):
    """Guess the vendor name from the top of the full-page OCR text.

    Considers the first six lines longer than three characters (after
    stripping) and returns the first one that does not start a word with a
    boilerplate keyword and contains at least one letter.

    Returns:
        The stripped line, or None when no candidate qualifies.
    """
    # A keyword only counts when it is not preceded by a letter, so "TEL: 03-..."
    # and "TELEPHONE" are still rejected while "HOTEL ..." is no longer thrown
    # away because it happens to contain "tel".
    boilerplate = re.compile(
        r'(?<![a-z])(?:welcome|receipt|tax invoice|gst|tel|fax)',
        re.IGNORECASE,
    )
    candidates = [line.strip() for line in text.split('\n') if len(line.strip()) > 3]
    for line in candidates[:6]:
        if boilerplate.search(line):
            continue
        if any(ch.isalpha() for ch in line):
            return line
    return None

def get_full_page_text(original_img):
    """OCR the entire receipt image and return the recognised text.

    The page is binarized with `preprocess_full_page` and passed to Tesseract
    with page segmentation mode 3.
    """
    binarized_page = preprocess_full_page(original_img)
    page_text = pytesseract.image_to_string(binarized_page, config='--psm 3')
    return page_text



In [None]:
# PART 3: INFERENCE PIPELINE

def predict_single_image(image_path):
    """Extract vendor, date and total from one receipt image.

    Stage A runs the YOLO detector, crops every detected box (with 5 px of
    padding) and OCRs the crop. Stage B is a safety net: for every required
    field that stage A did not produce, the whole page is OCR'd once and the
    regex/heuristic fallbacks are applied.

    Args:
        image_path: Path to the receipt image.

    Returns:
        A dict with any of the keys 'vendor', 'date', 'total' (plus any other
        label the detector emits). Empty dict when the image cannot be read.
    """
    # Read the image before running the detector so an unreadable file is
    # rejected up front instead of after (or during) inference.
    original_img = cv2.imread(str(image_path))
    if original_img is None:
        return {}

    results = model.predict(image_path, conf=0.10, verbose=False)
    result = results[0]
    h, w = original_img.shape[:2]  # loop-invariant, used to clamp padded crops

    extracted = {}

    # A. Process YOLO
    for box in result.boxes:
        cls_id = int(box.cls[0])
        label = model.names[cls_id]
        x1, y1, x2, y2 = map(int, box.xyxy[0])

        # Pad the box by 5 px on each side, clamped to the image bounds
        crop = original_img[max(0, y1 - 5):min(h, y2 + 5), max(0, x1 - 5):min(w, x2 + 5)]
        if crop.size == 0:
            continue

        # Preprocess & OCR: gentle cleaning for names, hard binarization plus a
        # character whitelist for numeric fields
        if label in ('company', 'vendor'):
            final_crop = clean_crop_vendor(crop)
            config = '--psm 6'
        else:
            final_crop = clean_crop_numeric(crop)
            config = '--psm 7 -c tessedit_char_whitelist=0123456789./:-RM'

        text = pytesseract.image_to_string(final_crop, config=config).strip()
        if label == 'total':
            text = text.replace("RP", "RM").replace("Rm", "RM")

        # --- VALIDATION STEP ---
        # An empty OCR result carries no information; storing it would mark the
        # field as "found" and stop the full-page fallback from running.
        if not text:
            continue
        # A detected date that does not look like a date is dropped for the same reason.
        if label == 'date' and not is_valid_date(text):
            continue

        # Keep the first accepted detection per label
        if label not in extracted:
            extracted[label] = text

    if 'company' in extracted:
        extracted['vendor'] = extracted.pop('company')

    # B. SAFETY NET (runs if a field is missing OR its validation failed)
    required = ['vendor', 'date', 'total']
    missing = [field for field in required if field not in extracted]

    if missing:
        full_text = get_full_page_text(original_img)
        fallbacks = {
            'date': fallback_find_date,
            'total': fallback_find_total,
            'vendor': fallback_find_vendor,
        }
        for field in missing:
            val = fallbacks[field](full_text)
            if val:
                extracted[field] = val

    return extracted



In [None]:
# PART 4: METRICS & LOOP

def clean_amount(s):
    """Convert an amount string such as "RM 1,234.50" to a float.

    Currency markers ("RM"/"RP", case-insensitive), commas and spaces are
    removed before conversion.

    Returns:
        The parsed value, or 0.0 when `s` is missing (NaN/None) or cannot be
        parsed as a number.
    """
    if pd.isna(s):
        return 0.0
    cleaned = str(s).upper()
    for token in ("RM", "RP", ",", " "):
        cleaned = cleaned.replace(token, "")
    try:
        return float(cleaned)
    except ValueError:
        # Only a failed numeric parse is expected here; anything else should surface
        return 0.0

def is_date_match(pred, gt):
    """Return True when `pred` and `gt` parse to the same calendar date.

    Both values are stringified and parsed with `dateparser`; only the date
    part is compared. Missing values, unparseable strings and parser errors
    all yield False.
    """
    if pd.isna(pred) or pd.isna(gt):
        return False
    settings = {'STRICT_PARSING': False}
    try:
        d_pred = dateparser.parse(str(pred), settings=settings)
        if d_pred is None:
            # Nothing to compare against; skip parsing the ground truth
            return False
        d_gt = dateparser.parse(str(gt), settings=settings)
    except Exception:
        # Parser failure counts as a mismatch. `Exception` (not a bare except)
        # so that Ctrl-C can still interrupt the evaluation loop.
        return False
    if d_gt is None:
        return False
    return d_pred.date() == d_gt.date()

def get_similarity(s1, s2):
    """Case-insensitive similarity ratio in [0, 1] between two values.

    Both arguments are converted to lowercase strings and compared with
    `difflib.SequenceMatcher`.
    """
    left = str(s1).lower()
    right = str(s2).lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()


In [None]:
# PART 5: MAIN LOOP

test_img_dir = DATA_PATH / "raw/SROIE2019/test/img"
test_gt_dir = DATA_PATH / "raw/SROIE2019/test/entities"

# One row per evaluated image: file id plus a correctness flag per field
results = []
print("🚀 Starting Final Optimized Evaluation...")

for img_file in tqdm(sorted(test_img_dir.glob("*.jpg"))):
    file_id = img_file.stem

    # Load the ground truth first: images without usable ground truth are
    # skipped anyway, so there is no point paying for detection + OCR on them.
    gt_file = test_gt_dir / f"{file_id}.txt"
    if not gt_file.exists():
        continue
    try:
        with open(gt_file, 'r', encoding='utf-8') as f:
            # Drop a trailing comma before the closing brace so the file parses as JSON
            gt_data = json.loads(f.read().replace(",\n}", "}"))
    except (OSError, ValueError):
        # Unreadable, undecodable or malformed ground truth: skip this image
        continue

    preds = predict_single_image(img_file)

    gt_vendor = gt_data.get('company', "")
    gt_date = gt_data.get('date', "")
    gt_total = gt_data.get('total', "")

    pred_vendor = preds.get('vendor', "")
    pred_date = preds.get('date', "")
    pred_total = preds.get('total', "")

    results.append({
        "File": file_id,
        # Vendor: fuzzy match, similarity ratio above 0.7
        "Vendor_Correct": get_similarity(pred_vendor, gt_vendor) > 0.7,
        # Date: both sides must parse to the same calendar date
        "Date_Correct": is_date_match(pred_date, gt_date),
        # Total: amounts must agree to within 0.10
        "Total_Correct": abs(clean_amount(pred_total) - clean_amount(gt_total)) < 0.10
    })


In [None]:
# PART 6: RESULTS TABLE

# Explicit columns keep the frame well-formed even when no image was evaluated
df = pd.DataFrame(
    results,
    columns=["File", "Vendor_Correct", "Date_Correct", "Total_Correct"],
)

print("\n" + "="*40)
print("   FINAL OPTIMIZED RESULTS (STRICT REGEX)")
print("="*40)
if df.empty:
    # Avoid reporting meaningless accuracies on an empty evaluation
    print("No images were evaluated.")
else:
    # Mean of a boolean column is the fraction of correct predictions
    print(f"Vendor Accuracy: {df['Vendor_Correct'].mean():.2%}")
    print(f"Date Accuracy:   {df['Date_Correct'].mean():.2%}")
    print(f"Total Accuracy:  {df['Total_Correct'].mean():.2%}")
print("="*40)

csv_path = DATA_PATH / "processed/final_optimized_results.csv"
# Create the output folder on first run so to_csv does not fail on a missing directory
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)