In [None]:
!pip install transformers==4.36.2
!pip install torch torchvision
!pip install Pillow

# Donut model

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Run the Donut receipt parser (CORD-v2 fine-tune) on one image and print the
# structured result.
import re  # local to this cell: only used to strip the task start token below

# Load pretrained Donut receipt model (CORD dataset)
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")

# Load a receipt image
image = Image.open("/content/5098.png").convert("RGB")

# Prepare input.
# The CORD-v2 checkpoint is prompted with its own task start token; it must be
# tokenized without extra special tokens and handed to the decoder explicitly.
# Passing it via `processor(image, text=...)` would attach it as `labels`,
# which `generate` does not use as a decoder prompt.
task_prompt = "<s_cord-v2>"
pixel_values = processor(image, return_tensors="pt").pixel_values
decoder_input_ids = processor.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
).input_ids

# Run inference
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=512,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit <unk>
    return_dict_in_generate=True,
)

# Decode, drop padding/EOS markers and the leading task token, then convert the
# tag sequence into a nested dict.
result = processor.batch_decode(outputs.sequences)[0]
result = result.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
result = re.sub(r"<.*?>", "", result, count=1).strip()
result_json = processor.token2json(result)

print(result)
print(result_json)

# Extracting the receipt output

In [None]:
# Install dependencies
!pip install python-doctr transformers torch torchvision --quiet

from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from transformers import pipeline
import re, json

# ===== 1. OCR Extraction =====
ocr_model = ocr_predictor(pretrained=True)
doc = DocumentFile.from_images("/content/5096.png")  # Change image path
ocr_result = ocr_model(doc)

# Flatten the page -> block -> line -> word hierarchy into plain text lines
lines = [
    " ".join(word.value for word in ocr_line.words)
    for page in ocr_result.pages
    for block in page.blocks
    for ocr_line in block.lines
]

text = " ".join(lines)

# ===== 2. NER Model (BERT) =====
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
entities = ner(text)

# ===== 3. Prepare Output JSON =====
output = {
    "date": None,
    "supplier_name": None,
    "items_purchased": [],
    "payment_type": None,
    "total_balance": None
}

# Fill from detected entities (first hit of each kind wins)
for ent in entities:
    if "DATE" in ent['entity_group'] and not output["date"]:
        output["date"] = ent["word"]
    elif "ORG" in ent['entity_group'] and not output["supplier_name"]:
        output["supplier_name"] = ent["word"]

# dslim/bert-base-NER is a CoNLL-2003 model (PER/ORG/LOC/MISC) and never emits a
# DATE entity, so fall back to a numeric date pattern. The year-first branch
# keeps ISO dates such as 2023-03-17 intact instead of clipping them to 23-03-17.
if not output["date"]:
    date_match = re.search(
        r"\b(?:\d{4}[/-]\d{1,2}[/-]\d{1,2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b", text
    )
    if date_match:
        output["date"] = date_match.group(0)

# ===== 4. Detect Total Balance =====
# \b keeps "Subtotal" from matching and the lookbehind rejects "Sub Total" /
# "Sub-Total"; the amount group accepts grouped digits such as 1,234.56.
total_match = re.search(
    r"(?<!sub[\s\-])\b(?:total|amount|balance)\s*[:\-]?\s*(\$?\d+(?:[.,]\d+)*)",
    text,
    re.I,
)
if total_match:
    output["total_balance"] = total_match.group(1)

# ===== 5. Detect Payment Type =====
# Word boundaries stop "cash" from matching inside "Cashier".
pay_match = re.search(r"\b(credit\s*card|debit\s*card|cash|upi|net\s*banking)\b", text, re.I)
if pay_match:
    output["payment_type"] = pay_match.group(0)

# ===== 6. Extract Items Purchased (structured) =====
items = []

for ocr_text in lines:
    # Stop when reaching the total/amount/balance section
    if re.search(r"(total|amount|balance)", ocr_text, re.I):
        break
    # An item line contains at least one letter and a price with two decimals
    price_match = re.search(r"\d+[.,]\d{2}", ocr_text)
    if price_match and re.search(r"[A-Za-z]", ocr_text):
        price = price_match.group(0)
        item_name = ocr_text.replace(price, "").strip(" -:")  # clean name
        items.append({"item": item_name, "price": price})

output["items_purchased"] = items

# ===== 7. Save to JSON =====
with open("receipt_output.json", "w") as f:
    json.dump(output, f, indent=2)

print(json.dumps(output, indent=2))


# Reconstructing the receipt from the JSON output

In [None]:
# Install dependencies
!pip install python-doctr transformers torch torchvision --quiet

from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import re, json, os

# ===== 1. OCR Extraction =====
# Build the pretrained docTR predictor once at module level; parse_receipt()
# below reads this global for every image it processes.
ocr_model = ocr_predictor(pretrained=True)

# Function to parse any receipt image
def parse_receipt(image_path, output_path="receipt_output.json"):
    """OCR a receipt image and extract a few fields with regex heuristics.

    Args:
        image_path: Path of the receipt image, handed to
            ``DocumentFile.from_images``.
        output_path: Where the parsed JSON is written. Defaults to
            ``"receipt_output.json"`` (the previous hard-coded location);
            pass ``None`` to skip writing a file.

    Returns:
        ``(output, reconstructed)`` where ``output`` is a dict with the keys
        ``date``, ``supplier_name``, ``items_purchased``, ``payment_type`` and
        ``total_balance``, and ``reconstructed`` is a plain-text rendering of
        those fields.

    Side effects:
        Prints the JSON and the reconstructed receipt, and writes the JSON
        file unless ``output_path`` is ``None``. Uses the module-level
        ``ocr_model``.
    """
    doc = DocumentFile.from_images(image_path)
    ocr_result = ocr_model(doc)

    # Flatten the page -> block -> line -> word hierarchy into text lines
    lines = []
    for page in ocr_result.pages:
        for block in page.blocks:
            for ocr_line in block.lines:
                lines.append(" ".join(word.value for word in ocr_line.words))

    text = " ".join(lines)

    # ===== 2. Output JSON Structure =====
    output = {
        "date": None,
        "supplier_name": None,
        "items_purchased": [],
        "payment_type": None,
        "total_balance": None
    }

    # ===== 3. Supplier (first line usually store name) =====
    if lines:
        output["supplier_name"] = lines[0].strip()

    # ===== 4. Date detection =====
    # The year-first alternative and the word boundaries keep an ISO date such
    # as 2023-03-17 from being reported as 23-03-17.
    date_match = re.search(
        r"\b(?:\d{4}[/-]\d{1,2}[/-]\d{1,2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b", text
    )
    if date_match:
        output["date"] = date_match.group(0)

    # ===== 5. Total Balance =====
    # \b stops the keyword from matching inside "Subtotal"; the lookbehind
    # rejects "Sub Total" / "Sub-Total". The amount group accepts grouped
    # digits (1,234.56) and, as before, excludes the "$" sign.
    total_match = re.search(
        r"(?<!sub[\s\-])\b(?:total|amount|balance)\s*[:\-]?\s*\$?(\d+(?:[.,]\d+)*)",
        text,
        re.I,
    )
    if total_match:
        output["total_balance"] = total_match.group(1)

    # ===== 6. Payment Type =====
    # Word boundaries stop "cash" from matching inside "Cashier".
    pay_match = re.search(r"\b(credit\s*card|debit\s*card|cash|upi|net\s*banking)\b", text, re.I)
    if pay_match:
        output["payment_type"] = pay_match.group(0)

    # ===== 7. Extract Items =====
    items = []
    for ocr_text in lines:
        # Everything from the first total/amount/balance line on is not an item
        if re.search(r"(total|amount|balance)", ocr_text, re.I):
            break
        # An item line has at least one letter and a price with two decimals
        price_match = re.search(r"\d+[.,]\d{2}", ocr_text)
        if price_match and re.search(r"[A-Za-z]", ocr_text):
            price = price_match.group(0)
            item_name = ocr_text.replace(price, "").strip(" -:")
            items.append({"item": item_name, "price": price})

    output["items_purchased"] = items

    # ===== 8. Save JSON =====
    if output_path is not None:
        with open(output_path, "w") as f:
            json.dump(output, f, indent=2)

    # ===== 9. Reconstruct Receipt =====
    reconstructed = "\n=== Reconstructed Receipt ===\n"
    reconstructed += f"Store: {output['supplier_name'] or 'N/A'}\n"
    reconstructed += f"Date: {output['date'] or 'N/A'}\n"
    reconstructed += "-" * 30 + "\n"
    for it in output["items_purchased"]:
        reconstructed += f"{it['item']:<20} {it['price']:>7}\n"
    reconstructed += "-" * 30 + "\n"
    reconstructed += f"Payment: {output['payment_type'] or 'N/A'}\n"
    reconstructed += f"Total: {output['total_balance'] or 'N/A'}\n"

    print("=== Parsed JSON Output ===")
    print(json.dumps(output, indent=2))
    print(reconstructed)

    return output, reconstructed


# ===== Run on any receipt =====
# The call prints the parsed JSON and the reconstructed receipt and writes
# receipt_output.json; it returns the (output, reconstructed) pair.
parse_receipt("/content/20230317_193638.jpg")  # Change image path for other receipts


# Extracting individual fields from a single receipt

In [None]:
! pip install python-doctr[torch] rapidfuzz python-dateutil


In [None]:
import os
import re
from dateutil import parser as dateparser
from rapidfuzz import fuzz, process
from pathlib import Path
import torch
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# ----------------------------
# Config, regex, and helpers
# ----------------------------
# Device the OCR predictor is moved to: "cuda" when torch reports it available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Label spellings per amount field; parse_single_file passes each list to
# parse_amount_by_label, which fuzzy-matches them against the totals lines.
KW = {
    "sub_total": ["subtotal", "sub total", "sub-total", "amount before tax"],
    "total_discount": ["total discount", "discount total", "discounts"],
    "net_amount": ["net", "net amount", "amount due"],
    "total_amount": ["total", "grand total", "invoice total", "total amount", "balance"],
    "vat_amount": ["vat amount", "tax amount", "vat", "igst", "cgst", "sgst"],
    "sale_amount": ["sale amount", "sales amount"],
}
# Substrings that mark a line as date-bearing in parse_dates (compared lowercased).
DATE_HINTS = ["date", "receipt date", "payment date", "txn date", "time", "invoice date"]
# Currency code -> markers searched by find_currency; dict order is the
# priority order because the first code with a hit is returned.
CURRENCY_SIGNS = {
    "USD": ["$", "usd"], "EUR": ["€", "eur"], "GBP": ["£", "gbp"],
    "INR": ["₹", "rs", "inr"], "JPY": ["¥", "jpy"]
}
# A price with an optional currency prefix. The single capture group holds only
# the number, so findall() returns bare numeric strings. First alternative:
# optional minus, 1-3 digits, comma/space-separated thousands groups and a
# mandatory 1-2 digit decimal part; second alternative: plain digits with an
# optional decimal part.
RE_PRICE = re.compile(r"(?<![A-Za-z0-9])(?:Rs\.?|₹|€|\$|£|¥)?\s?(-?\d{1,3}(?:[, ]\d{3})*(?:\.\d{1,2})|\d+(?:\.\d{1,2})?)(?![A-Za-z])")
# Percentage: captures a 1-2 digit number (optional decimals) directly before "%".
RE_PERCENT = re.compile(r"(\d{1,2}(?:\.\d+)?)\s?%")
# Tax registration code: group 1 is the tax keyword, group 2 the code after it.
RE_VAT_CODE = re.compile(r"\b(VAT|GST|IGST|CGST|SGST)\s*[:\-]?\s*([A-Z0-9\-]+)\b", re.I)
# Item code: captures the identifier following SKU / Item Code / Code / PLU / EAN.
RE_ITEM_CODE = re.compile(r"\b(?:SKU|Item\s*Code|Code|PLU|EAN)\s*[:\-]?\s*([A-Z0-9\-]+)\b", re.I)
# Receipt number: captures the identifier following Receipt No / Invoice No / Bill No / Txn ID.
RE_RECEIPT_NO = re.compile(r"\b(?:Receipt\s*No\.?|Invoice\s*No\.?|Bill\s*No\.?|Txn\s*ID)\s*[:\-]?\s*([A-Z0-9\-\/]+)\b", re.I)
# A whole item row, anchored at both ends: name (lazy), quantity (1-5 digits),
# unit price and line amount, separated by whitespace. The price groups may
# include a currency symbol, which norm_money strips.
RE_ITEM_LINE = re.compile(
    r"^(?P<name>.+?)\s{1,}(?P<qty>\d{1,5})\s{1,}(?P<unit>(?:₹|€|\$|£|¥)?\s?\d+(?:[,\s]\d{3})*(?:\.\d{1,2})?)\s{1,}(?P<amount>(?:₹|€|\$|£|¥)?\s?\d+(?:[,\s]\d{3})*(?:\.\d{1,2})?)$"
)

def norm_money(s):
    """Convert a price string such as ``"$1,234.50"`` to a float.

    Commas, spaces and the currency symbols ₹ $ € £ ¥ are removed before
    conversion. Numeric input is accepted and returned as a float.

    Args:
        s: Price text, a number, or ``None``.

    Returns:
        The parsed float, or ``None`` when ``s`` is ``None`` or cannot be
        converted.
    """
    if s is None:
        return None
    if isinstance(s, str):
        # Delete thousands separators, spaces and currency symbols in one pass.
        s = s.translate(str.maketrans("", "", ", ₹$€£¥"))
    try:
        return float(s)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a number"; anything else propagates.
        return None

def find_currency(lines):
    """Guess the receipt currency from markers listed in ``CURRENCY_SIGNS``.

    Symbol markers (``$``, ``₹`` ...) match anywhere in the text. Alphabetic
    markers (``usd``, ``rs`` ...) must stand alone, i.e. not be surrounded by
    other ASCII letters, so that "burgers" no longer counts as "rs".

    Args:
        lines: OCR text lines.

    Returns:
        The first currency code (in ``CURRENCY_SIGNS`` order) with a marker in
        the text, or ``None`` if nothing matches.
    """
    joined = " ".join(lines).lower()
    for code, signs in CURRENCY_SIGNS.items():
        for sig in signs:
            marker = sig.lower()
            if marker.isalpha():
                # Letter-only markers need letter boundaries; digits and
                # punctuation next to them ("rs.250", "rs100") are still fine.
                if re.search(rf"(?<![a-z]){re.escape(marker)}(?![a-z])", joined):
                    return code
            elif marker in joined:
                return code
    return None

def extract_first_date(text):
    """Best-effort date extraction with dateutil's fuzzy parser.

    Args:
        text: Free text that may contain a date.

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or ``None`` when the text is
        empty or the parser fails or returns nothing.
    """
    try:
        if not text or not text.strip():
            return None
        # dayfirst=False: ambiguous numeric dates are read month-first.
        dt = dateparser.parse(text, fuzzy=True, dayfirst=False)
        return dt.date().isoformat() if dt else None
    except Exception:
        # Stay best-effort for any parser error, but (unlike a bare except)
        # let KeyboardInterrupt / SystemExit through.
        return None

# OCR predictors already built, keyed by device, so repeated calls reuse them.
_OCR_MODELS = {}


def _get_ocr_model():
    """Return the pretrained OCR predictor for ``DEVICE``, building it once."""
    model = _OCR_MODELS.get(DEVICE)
    if model is None:
        model = ocr_predictor(pretrained=True).to(DEVICE)
        _OCR_MODELS[DEVICE] = model
    return model


def run_ocr_lines_from_images_or_pdf(path):
    """Run OCR on an image or PDF and return its non-empty text lines.

    Args:
        path: File path. A ``.pdf`` suffix (case-insensitive) is loaded with
            ``DocumentFile.from_pdf``; anything else with
            ``DocumentFile.from_images``.

    Returns:
        A list of strings, one per OCR line in export order, each made of the
        line's words joined by single spaces. Empty lines are dropped.
    """
    model = _get_ocr_model()
    if Path(path).suffix.lower() == ".pdf":
        doc = DocumentFile.from_pdf(path)
    else:
        doc = DocumentFile.from_images(path)
    result = model(doc)

    lines = []
    for page in result.export()["pages"]:
        for block in page["blocks"]:
            for line in block["lines"]:
                txt = " ".join(word["value"] for word in line["words"]).strip()
                if txt:
                    lines.append(txt)
    return lines

def split_header_body_totals(lines):
    """Split OCR lines into header, body and totals sections.

    The header is the first ``max(1, n // 7)`` lines. The totals section
    starts at the first line *after the header* that mentions a totals keyword
    (total, subtotal, vat, tax, grand) and runs to the end. The body is
    whatever lies between. Keyword hits inside the header (for example a
    "TAX INVOICE" title) are ignored, so they can no longer pull the totals
    start to the top and leave the body empty.

    Args:
        lines: OCR text lines in reading order.

    Returns:
        ``(header, body, totals)`` as three list slices that together cover
        ``lines`` without overlap.
    """
    keywords = ("total", "subtotal", "vat", "tax", "grand")
    n = len(lines)
    header_end = max(1, n // 7)
    header = lines[:header_end]

    tail_start = n  # no keyword found -> empty totals section
    for i in range(header_end, n):
        lowered = lines[i].lower()
        if any(k in lowered for k in keywords):
            tail_start = i
            break

    body = lines[header_end:tail_start]
    totals = lines[tail_start:]
    return header, body, totals

def parse_supplier(header):
    """Return the first header line with surrounding whitespace removed.

    Gives ``None`` when ``header`` is empty or ``None``.
    """
    if not header:
        return None
    first_line = header[0]
    return first_line.strip()
def extract_with_regex(lines, pattern):
    """Search the lines in order and report the first regex hit.

    Args:
        lines: Iterable of text lines.
        pattern: Compiled regular expression.

    Returns:
        For the first matching line, the tuple of capture groups if it is
        non-empty, otherwise a one-element tuple holding the whole match.
        ``None`` when no line matches.
    """
    for text_line in lines:
        found = pattern.search(text_line)
        if not found:
            continue
        captured = found.groups()
        if captured:
            return captured
        return (found.group(0),)
    return None

from rapidfuzz import fuzz
def parse_amount_by_label(totals_lines, label_list):
    """Find the line that best matches one of the labels and return its amount.

    Every (line, label) pair is scored with ``fuzz.partial_ratio``. A pair in
    which the label occurs as a whole word outranks any pair that only matches
    fuzzily or inside a longer word. Without that preference the label "total"
    scores 100 on "Subtotal 10.00" and, being first, that line would win over
    the real "Total" line. Ties keep the earlier line.

    Args:
        totals_lines: Candidate text lines (typically the totals section).
        label_list: Accepted spellings of the label.

    Returns:
        The last price on the best line as a float (via ``norm_money``), or
        ``None`` when no line scores at least 70 or the line has no price.
    """
    best_line, best_rank = None, None
    for ln in totals_lines:
        lowered = ln.lower()
        for lab in label_list:
            needle = lab.lower()
            score = fuzz.partial_ratio(needle, lowered)
            whole_word = re.search(
                rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])", lowered
            ) is not None
            rank = (whole_word, score)
            if best_rank is None or rank > best_rank:
                best_rank, best_line = rank, ln
    if best_line is None or best_rank[1] < 70:
        return None
    prices = RE_PRICE.findall(best_line)
    # The amount is taken from the right-most number on the line.
    return norm_money(prices[-1]) if prices else None

def parse_vat_percentage(lines):
    """Return the largest percentage found on tax-related lines.

    A line counts as tax-related when its lowercase text contains one of
    vat, gst, tax, igst, cgst, sgst. Every ``RE_PERCENT`` hit on such a line
    is collected.

    Args:
        lines: Text lines to scan.

    Returns:
        The maximum percentage as a float, or ``None`` if none was found.
    """
    tax_words = ("vat", "gst", "tax", "igst", "cgst", "sgst")
    vals = []
    for ln in lines:
        lowered = ln.lower()
        if not any(word in lowered for word in tax_words):
            continue
        for pct in RE_PERCENT.findall(ln):
            try:
                vals.append(float(pct))
            except (TypeError, ValueError):
                # Skip a token that is not a number; other errors propagate.
                continue
    return max(vals) if vals else None

def parse_dates(lines):
    """Extract the receipt date and the payment date from OCR lines.

    The receipt date comes from the first line containing a ``DATE_HINTS``
    word that yields a date; if none does, the whole text joined by newlines
    is tried. The payment date comes from the first line containing
    "payment date", "paid on", "settled on" or "txn date" that yields a date.

    Returns:
        ``(receipt_date, payment_date)``; either may be ``None``.
    """
    hint_words = [hint.lower() for hint in DATE_HINTS]
    payment_words = ["payment date", "paid on", "settled on", "txn date"]

    def first_date_in(candidates):
        # First truthy result of extract_first_date over the candidates.
        for candidate in candidates:
            found = extract_first_date(candidate)
            if found:
                return found
        return None

    hinted_lines = [
        text_line for text_line in lines
        if any(word in text_line.lower() for word in hint_words)
    ]
    receipt_date = first_date_in(hinted_lines)
    if not receipt_date:
        receipt_date = extract_first_date("\n".join(lines))

    payment_lines = [
        text_line for text_line in lines
        if any(word in text_line.lower() for word in payment_words)
    ]
    payment_date = first_date_in(payment_lines)

    return receipt_date, payment_date

def parse_items(body_lines):
    """Turn body lines that look like item rows into item dicts.

    Each stripped line is matched against ``RE_ITEM_LINE``; lines that do not
    match are skipped.

    Returns:
        A list of dicts with the keys "item name", "item quantity" (int),
        "unit price" and "item amount" (both passed through ``norm_money``).
    """
    parsed = []
    for raw_line in body_lines:
        hit = RE_ITEM_LINE.match(raw_line.strip())
        if not hit:
            continue
        quantity_digits = re.sub(r"[^\d]", "", hit.group("qty"))
        row = {
            "item name": hit.group("name").strip(),
            "item quantity": int(quantity_digits),
            "unit price": norm_money(hit.group("unit")),
            "item amount": norm_money(hit.group("amount")),
        }
        parsed.append(row)
    return parsed

def sum_items(items):
    """Total the amounts and quantities of the parsed items.

    Missing (falsy) amounts or quantities count as 0.

    Returns:
        ``(amount_sum, quantity_sum)``, or ``(None, None)`` when ``items`` is
        empty or ``None``.
    """
    if not items:
        return (None, None)
    amounts = [entry["item amount"] or 0 for entry in items]
    quantities = [entry["item quantity"] or 0 for entry in items]
    return (sum(amounts), sum(quantities))

def parse_single_file(file_path):
    """OCR one receipt (image or PDF) and assemble a flat record of its fields.

    The per-item keys of the record ("item name", "item quantity",
    "unit price", "item amount") describe the first parsed item; the complete
    list is available under "_items".

    Args:
        file_path: Path of the image or PDF to parse.

    Returns:
        A dict with supplier, item, receipt number, currency, VAT, amount and
        date fields, plus "_items" (all parsed item rows) and "_raw_lines"
        (the OCR lines). Fields that could not be found are ``None``.
    """
    lines = run_ocr_lines_from_images_or_pdf(file_path)
    header, body, totals = split_header_body_totals(lines)

    supplier = parse_supplier(header)
    currency = find_currency(lines)
    # extract_with_regex returns a tuple of groups (or None); take the first.
    receipt_no = (extract_with_regex(lines, RE_RECEIPT_NO) or (None,))[0]
    item_code = (extract_with_regex(lines, RE_ITEM_CODE) or (None,))[0]
    # RE_VAT_CODE yields (keyword, code); the code is the second group.
    vc = extract_with_regex(lines, RE_VAT_CODE)
    vat_code = vc[1] if vc and len(vc) > 1 else None
    receipt_date, payment_date = parse_dates(lines)

    items = parse_items(body)
    sub_total_calc, total_item_count_calc = sum_items(items)

    # Printed amounts win; computed or related values are only fallbacks.
    sub_total = parse_amount_by_label(totals, KW["sub_total"]) or sub_total_calc
    total_discount = parse_amount_by_label(totals, KW["total_discount"])
    vat_amount = parse_amount_by_label(totals, KW["vat_amount"])
    net_amount = parse_amount_by_label(totals, KW["net_amount"])
    total_amount = parse_amount_by_label(totals, KW["total_amount"]) or net_amount
    sale_amount = parse_amount_by_label(totals, KW["sale_amount"])
    vat_percentage = parse_vat_percentage(totals)

    # First body line mentioning a discount that also carries a price.
    item_discount = None
    for ln in body:
        if "disc" in ln.lower():
            prices = RE_PRICE.findall(ln)
            if prices:
                item_discount = norm_money(prices[-1])
                break

    # Derive missing amounts from the ones that were found.
    if net_amount is None and total_amount is not None and total_discount is not None:
        net_amount = round(total_amount - total_discount, 2)
    if sub_total is None and total_amount is not None and vat_amount is not None:
        sub_total = round(total_amount - vat_amount, 2)

    # `items` is a list of dicts, so the single-item fields must be read from
    # its first element; indexing the list with a string raises TypeError.
    first_item = items[0] if items else {}

    record = {
        "supplier name": supplier,
        "item name": first_item.get("item name"),
        "item quantity": first_item.get("item quantity"),
        "unit price": first_item.get("unit price"),
        "item amount": first_item.get("item amount"),
        "receipt no": receipt_no,
        "currency": currency,
        "item code": item_code,
        "vat code": vat_code,
        "vat percentage": vat_percentage,
        "vat amount": vat_amount,
        "total amount": total_amount,
        "receipt date": receipt_date,
        "item discount": item_discount,
        "total discount": total_discount,
        "net amount": net_amount,
        "sub total": sub_total,
        "total item count": total_item_count_calc,
        "sale amount": sale_amount,
        "payment date": payment_date,
        "_items": items,
        "_raw_lines": lines,
    }
    return record

# Driver: parse one receipt and pretty-print the resulting record as JSON.
if __name__ == "__main__":
    # Set this to your specific file (image or PDF)
    input_path = r"/content/5131.png"  # any image file, or a .pdf
    result = parse_single_file(input_path)

    import json
    # ensure_ascii=False keeps non-ASCII characters (e.g. currency symbols) readable
    print(json.dumps(result, indent=2, ensure_ascii=False))
