In [1]:
pip install torch torchvision transformers datasets pytesseract pdf2image opencv-python matplotlib tabulate pandas pillow


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re 
import cv2
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
from tabulate import tabulate
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch
import numpy as np

# --- Configure paths ---
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Set this path on your system
pdf_path = r"C:\Users\HP\Desktop\Bank Project\Balraj\Sales Invoices\Yes Group-Ashok -Inv-August-2014.pdf"
output_csv_path = "extracted_transactions.csv"

# --- Step 1: Convert PDF to Images ---
def pdf_to_images(pdf_path, dpi=300):
    return convert_from_path(pdf_path, dpi=dpi)

# --- Step 2: OCR using Tesseract ---
def extract_text_and_boxes(image):
    rgb = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)
    data = pytesseract.image_to_data(rgb, output_type=pytesseract.Output.DICT)
    n_boxes = len(data['level'])
    lines = []
    for i in range(n_boxes):
        if int(data['conf'][i]) > 60:  # Confidence threshold
            lines.append({
                'text': data['text'][i],
                'conf': data['conf'][i],
                'left': data['left'][i],
                'top': data['top'][i],
                'width': data['width'][i],
                'height': data['height'][i],
            })
    return lines

# # --- Step 3: Identify transaction table using regex/keyword rules ---
# def identify_transaction_lines(ocr_lines):
#     rows = []
#     buffer = []
#     keywords = ['date', 'deposit', 'withdrawal', 'balance', 'narration', 'description']

#     for line in ocr_lines:
#         text = line['text'].strip().lower()
#         if any(k in text for k in keywords):
#             if buffer:
#                 rows.append(" ".join(buffer))
#                 buffer = []
#         elif len(text) > 0:
#             buffer.append(text)
#     if buffer:
#         rows.append(" ".join(buffer))
#     return rows

# # --- Step 4: Parse text into structured transaction rows ---
# def parse_transaction_rows(rows):
#     # Define rules / patterns (you can expand)
#     pattern = r"(\d{2}[-/]\d{2}[-/]\d{4})"  # date format: dd-mm-yyyy or dd/mm/yyyy
#     parsed_data = []

# ✅ Improved: Identify likely transaction lines using keywords + date pattern
def identify_transaction_lines(lines):
    rows = []
    pattern = r"\d{2}[-/]\d{2}[-/]\d{4}"  # Match date pattern

    for line in lines:
        text = line['text'].strip()
        if re.search(pattern, text):
            rows.append(text)
    return rows


# ✅ Improved: Use regex to parse transaction-like rows
def parse_transaction_rows(rows):
    parsed_data = []
    for row in rows:
        parts = row.split()
        if len(parts) >= 3:  # At least date + narration + amount
            parsed_data.append(parts)

    if not parsed_data:
        print("⚠️ No parsable transaction rows found.")
        return pd.DataFrame()  # Return empty DataFrame safely

    # pad rows to same length
    max_len = max(len(r) for r in parsed_data)
    cleaned_data = [r + [''] * (max_len - len(r)) for r in parsed_data]
    return pd.DataFrame(cleaned_data)



    for row in rows:
        if re.search(pattern, row):
            parsed_data.append(row.split())

    df = pd.DataFrame(parsed_data)
    return df

# --- Step 5: Fallback if no structure ---
def fallback_to_text(image):
    text = pytesseract.image_to_string(image)
    return text

# --- Step 6: Main runner ---
def process_bank_statement(pdf_path):
    pages = pdf_to_images(pdf_path)
    all_rows = []

    for i, page in enumerate(pages):
        print(f"Processing page {i+1}...")
        ocr_lines = extract_text_and_boxes(page)
        rows = identify_transaction_lines(ocr_lines)
        all_rows.extend(rows)

    df = parse_transaction_rows(all_rows)

    if df.empty:
        print("Fallback to raw OCR.")
        raw_text = "\n".join(fallback_to_text(p) for p in pages)
        with open("fallback_ocr_output.txt", "w", encoding="utf-8") as f:
            f.write(raw_text)
        return None

    df.to_csv(output_csv_path, index=False)
    print("✅ Extracted transactions saved to:", output_csv_path)
    return df

# --- Step 7: Run and Display ---
df_result = process_bank_statement(pdf_path)
if df_result is not None:
    print("\n📊 Final Extracted Transactions Table:\n")
    print(tabulate(df_result.head(10), headers='keys', tablefmt='github'))
else:
    print("⚠️ No structured table extracted. Please check fallback_ocr_output.txt")


Processing page 1...
⚠️ No parsable transaction rows found.
Fallback to raw OCR.
⚠️ No structured table extracted. Please check fallback_ocr_output.txt
