## Extract text using OCR

In [1]:
import pdfplumber
import pytesseract
from PIL import Image

def ocr_pdf_to_text(pdf_path: str) -> str:
    """
    Run OCR over every page of a PDF and return the combined text.

    Each page is rendered to an image with ``resolution=300``, the rendered
    image is passed to ``pytesseract.image_to_string``, and every page's text
    is followed by a newline in the returned string.
    """
    with pdfplumber.open(pdf_path) as document:
        # Render and OCR each page while the PDF is still open.
        page_texts = [
            pytesseract.image_to_string(pg.to_image(resolution=300).original)
            for pg in document.pages
        ]
    # One trailing newline per page, in page order.
    return "".join(chunk + "\n" for chunk in page_texts)

## Parsing OCR text into transactions

In [2]:
import re
import pandas as pd

# Column order shared by every bank branch, so that even an empty result
# exposes the same schema to downstream code.
_TRANSACTION_COLUMNS = ['Date', 'Description', 'Debit', 'Credit', 'Balance']

# Regex for ICICI based on sample data, adjusted for multi-line remarks.
# Only rows whose description starts with "UPI" are matched.
_ICICI_PATTERN = re.compile(
    r'(\d+)\s+' # S.No
    r'(\d{2}/\d{2}/\d{4})\s+' # Value Date
    r'(\d{2}/\d{2}/\d{4})\s+' # Transaction Date
    r'(UPI.*?)(?:\r?\n|\s+)(\d[\d,.]*?)\s+(\d[\d,.]*?)\s+([\d,.]+)' # Description, Withdrawal, Deposit, Balance
    , re.DOTALL
)

# Regex for HDFC.
_HDFC_PATTERN = re.compile(
    r'(\d{2} [A-Z][a-z]{2} \d{4})\s+'  # Date
    r'(.+?)\s+'  # Narration (Description) - a bit broad, but works here
    r'(\d{10,}|[A-Z0-9]+)\s+' # Cheque/Ref. No.
    r'(\d{2} [A-Z][a-z]{2} \d{4})\s+'  # Value Date
    r'([\d,]+\.\d{2}|)?\s*'  # Withdrawal (optional)
    r'([\d,]+\.\d{2}|)?\s*'  # Deposit (optional)
    r'([\d,]+\.\d{2})'  # Closing Balance
)

# Two amounts closer than this are treated as equal (half a cent/paisa).
_AMOUNT_TOLERANCE = 0.005


def _to_amount(raw) -> float:
    """Convert a captured amount such as '1,234.50' to float; empty/None becomes 0.0."""
    return float(raw.replace(',', '')) if raw else 0.0


def _parse_icici(text: str) -> list:
    """Return one transaction dict per ICICI row matched in *text*."""
    rows = []
    for match in _ICICI_PATTERN.finditer(text):
        _s_no, _val_date, txn_date, desc, withdraw, deposit, balance = match.groups()
        rows.append({
            'Date': txn_date,
            'Description': desc.strip(),
            'Debit': _to_amount(withdraw),
            'Credit': _to_amount(deposit),
            'Balance': _to_amount(balance),
        })
    return rows


def _parse_hdfc(text: str) -> list:
    """Return one transaction dict per HDFC row matched in *text*."""
    rows = []
    prev_balance = None
    for match in _HDFC_PATTERN.finditer(text):
        date, desc, _ref, _val_date, withdraw, deposit, balance = match.groups()
        debit = _to_amount(withdraw)
        credit = _to_amount(deposit)
        closing = _to_amount(balance)

        # Flat OCR text carries no column positions: when a row has a single
        # amount before the balance, the regex always captures it in the
        # withdrawal group. If the closing balance rose by exactly that amount
        # relative to the previous matched row, the amount was a deposit.
        # When there is no previous row, or the balances do not reconcile,
        # the amount stays a debit as captured.
        single_amount = bool(withdraw) and not deposit
        if single_amount and prev_balance is not None:
            if abs((closing - prev_balance) - debit) < _AMOUNT_TOLERANCE:
                debit, credit = 0.0, debit

        rows.append({
            'Date': date,
            'Description': desc.strip(),
            'Debit': debit,
            'Credit': credit,
            'Balance': closing,
        })
        prev_balance = closing
    return rows


def parse_bank_statement(text: str, bank: str) -> pd.DataFrame:
    """
    Parse OCR'd bank-statement text into a DataFrame of transactions.

    Args:
        text: Raw statement text (for example the output of OCR).
        bank: Bank name. 'icici' and 'hdfc' are recognised, ignoring case and
            surrounding whitespace.

    Returns:
        A DataFrame with the columns Date, Description, Debit, Credit and
        Balance, one row per regex match. Amounts are floats with thousands
        separators removed. When the bank is not recognised or nothing
        matches, the DataFrame is empty but still has these columns.
    """
    bank_key = bank.strip().lower()

    if bank_key == 'icici':
        transactions = _parse_icici(text)
    elif bank_key == 'hdfc':
        transactions = _parse_hdfc(text)
    else:
        # No parser for this bank: fall through to an empty result.
        transactions = []

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(transactions, columns=_TRANSACTION_COLUMNS)

## Inference

In [None]:
# Statement to process. parse_bank_statement only has branches for "HDFC" and
# "ICICI" (case-insensitive); any other bank name yields an empty DataFrame.
bank = "HDFC"
pdf_path = "/Users/karthiksagar/Expense-Classification/Statements/hdfc.pdf"

# OCR the PDF, then turn the recognised text into a transactions table.
text = ocr_pdf_to_text(pdf_path=pdf_path)
df = parse_bank_statement(text=text, bank=bank)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,18 Jan 2024,18 Jan,200.0,300.0,20.0


In [4]:
# Display the full parsed transactions DataFrame as the cell output.
df

Unnamed: 0,Date,Description,Debit,Credit,Balance
0,18 Jan 2024,18 Jan,200.0,300.0,20.0
