
# Secure Bank Document Extraction — Colab Prototype (Option A)

**What this notebook does (safe to run on Colab with synthetic data):**

- Installs required libraries (Tesseract OCR + Python packages)
- Generates **synthetic** bank statement PDFs (no real data)
- Renders PDFs to images and runs Tesseract OCR
- Demonstrates simple regex-based PII redaction (masks account numbers, Aadhaar-like numbers, and phone numbers)
- Runs a rule-based extractor to produce structured JSON from OCR text
- Shows example outputs and next steps

**Warning:** Do **NOT** upload real bank or KYC documents to Colab. This notebook is for prototyping only with synthetic data.


In [None]:

# 1) Setup: install system & python dependencies (run this cell on Colab)
# On local machines you may need different install steps.
import sys
print('Python', sys.version)
# Install packages
!apt-get update -qq && apt-get install -y -qq tesseract-ocr poppler-utils
!pip install -q reportlab faker pillow pdf2image pytesseract tqdm

# verify tesseract
!tesseract --version | head -n 1


In [None]:

# 2) Generate synthetic bank-statement PDFs and labels (uses reportlab + faker)
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm
from faker import Faker
import random, os, json, datetime

# Root folder for everything this notebook generates, with one sub-folder for
# the rendered PDFs and one for their ground-truth JSON labels.
OUT_DIR = 'synthetic_data'
PDF_DIR, LABEL_DIR = (os.path.join(OUT_DIR, sub) for sub in ('pdfs', 'labels'))
for _dir in (PDF_DIR, LABEL_DIR):
    # exist_ok=True keeps the cell safe to re-run.
    os.makedirs(_dir, exist_ok=True)

# Shared Faker instance used to invent names, addresses and bank names.
fake = Faker()

def mask_account(num):
    """Return ``num`` with all but its last four characters replaced by 'X'.

    Strings of four characters or fewer are returned unchanged, because there
    is nothing in front of the last four characters to hide.
    """
    hidden, visible = num[:-4], num[-4:]
    return 'X' * len(hidden) + visible

def generate_transactions(num=8):
    """Build ``num`` random synthetic transactions.

    Each transaction is a dict with keys ``'date'`` (ISO ``YYYY-MM-DD``
    string), ``'desc'`` (one of a fixed set of descriptions) and ``'amount'``
    (float rounded to 2 decimals, drawn uniformly from 100..20000).

    Dates start 180 days before today and move forward by a random step that
    grows with the row index. The step can exceed the 180-day window for later
    rows (e.g. 15 * 16 = 240 days for the 16th row), so each date is clamped
    to today to avoid transactions dated after the statement date.
    """
    today = datetime.date.today()
    base = today - datetime.timedelta(days=180)
    descriptions = ['CREDIT SALARY','ATM WITHDRAWAL','NEFT TRANSFER','DEBIT CARD PAYMENT','UPI RECEIVED','EMI PAYMENT','ONLINE PURCHASE']
    txns = []
    for i in range(num):
        offset = datetime.timedelta(days=random.randint(1, 15) * (i + 1))
        # Never emit a transaction dated in the future.
        d = min(base + offset, today)
        amt = round(random.uniform(100, 20000), 2)
        desc = random.choice(descriptions)
        txns.append({'date': d.isoformat(), 'desc': desc, 'amount': amt})
    return txns

def draw_statement(path, name, acct, masked, txns):
    """Render a synthetic bank statement for one customer to a PDF file.

    Args:
        path: Destination path of the PDF to write.
        name: Customer name printed under the bank header.
        acct: Full account number. Accepted for interface compatibility but
            never drawn; only ``masked`` appears on the page.
        masked: Masked account number printed after 'Account No:'.
        txns: Iterable of dicts with keys 'date', 'desc' and 'amount'
            (amount must be numeric; it is formatted with thousands
            separators and two decimals).

    The bank name and address are invented with the module-level ``fake``
    Faker instance. Rows continue on a new page when they get close to the
    bottom margin; the disclaimer footer is drawn on the final page only.
    """
    c = canvas.Canvas(path, pagesize=A4)
    width, height = A4
    margin = 20*mm
    y = height - margin  # current baseline, moved down as content is drawn

    # Header: bank name on the left, statement date on the right.
    bank = fake.company() + ' Bank'
    c.setFont('Helvetica-Bold', 16)
    c.drawString(margin, y, bank)
    c.setFont('Helvetica', 10)
    c.drawString(width - 90*mm, y, f"Statement Date: {datetime.date.today().isoformat()}")

    # Customer block: name, single-line address, masked account number and PAN.
    y -= 12*mm
    c.setFont('Helvetica-Bold', 11)
    c.drawString(margin, y, name)
    c.setFont('Helvetica', 9)
    y -= 6*mm
    c.drawString(margin, y, fake.address().replace('\n', ', '))
    y -= 8*mm
    c.drawString(margin, y, f'Account No: {masked}')
    # lexify only substitutes the '?' placeholders; the '9999' digits stay literal.
    c.drawString(width - 90*mm, y, f'PAN: {fake.lexify(text="?????9999?")}')

    # Transaction table header followed by a horizontal rule.
    y -= 12*mm
    c.setFont('Helvetica-Bold', 9)
    c.drawString(margin, y, 'Date')
    c.drawString(margin+40*mm, y, 'Description')
    c.drawString(width - 60*mm, y, 'Amount')
    y -= 6*mm
    c.line(margin, y, width - margin, y)
    y -= 6*mm

    # Transaction rows, one per line, amounts right-aligned to the margin.
    c.setFont('Helvetica', 9)
    for t in txns:
        if y < margin + 40*mm:
            c.showPage()
            # showPage() discards canvas state such as the current font, so
            # re-apply the row font before drawing on the new page.
            c.setFont('Helvetica', 9)
            y = height - margin
        c.drawString(margin, y, t['date'])
        c.drawString(margin+40*mm, y, t['desc'])
        c.drawRightString(width - margin, y, f"{t['amount']:,.2f}")
        y -= 6*mm

    # Footer disclaimer at the bottom margin of the last page.
    c.setFont('Helvetica-Oblique', 8)
    c.drawString(margin, margin, 'Synthetic sample for model training. Not a real bank statement.')
    c.save()

# Create N synthetic PDFs, each with a matching JSON label file.
# Seed both random sources (stdlib `random` and Faker's shared generator) so
# that names, account numbers and amounts are reproducible across runs.
# Transaction dates are still relative to the day the cell is executed.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

N = 20
manifest = []
for i in range(1, N+1):
    name = fake.name()
    # 12 random digits; only the masked form is ever written to the PDF or label.
    acct = ''.join(str(random.randint(0,9)) for _ in range(12))
    masked = mask_account(acct)
    txns = generate_transactions(num=random.randint(6,16))
    fname = f'statement_{i:03d}.pdf'
    path = os.path.join(PDF_DIR, fname)
    draw_statement(path, name, acct, masked, txns)
    # Ground-truth label for this PDF. The timestamp is timezone-aware UTC;
    # datetime.utcnow() is deprecated and returns a naive datetime.
    label = {
        'filename': fname,
        'fields': {'customer_name': name, 'account_masked': masked, 'transactions': txns},
        'meta': {'created': datetime.datetime.now(datetime.timezone.utc).isoformat()},
    }
    with open(os.path.join(LABEL_DIR, fname.replace('.pdf','.json')), 'w', encoding='utf-8') as f:
        json.dump(label, f, indent=2)
    manifest.append(fname)

print('Created', len(manifest), 'synthetic PDFs at', PDF_DIR)


In [None]:

# 3) Convert PDFs to images (pdf2image) and run Tesseract OCR
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import glob, os, json

# Folder that receives the rendered page images.
IMG_DIR = os.path.join('synthetic_data','images')
os.makedirs(IMG_DIR, exist_ok=True)

# Render every page of one sample PDF to a PNG named after the PDF itself.
sample_pdf = os.path.join('synthetic_data','pdfs','statement_001.pdf')
sample_stem = os.path.splitext(os.path.basename(sample_pdf))[0]
pages = convert_from_path(sample_pdf, dpi=200)
print('Pages:', len(pages))
if not pages:
    # Fail with a clear message instead of a NameError further down.
    raise RuntimeError(f'No pages were rendered from {sample_pdf}')

image_paths = []
for i,p in enumerate(pages):
    out = os.path.join(IMG_DIR, f'{sample_stem}_p{i+1}.png')
    p.save(out, 'PNG')
    image_paths.append(out)
    print('Saved image', out)

# Run OCR on the first page (not on whichever page happened to be saved last).
img_path = image_paths[0]
with Image.open(img_path) as img:  # close the file handle once OCR is done
    ocr_text = pytesseract.image_to_string(img)
print('--- OCR Extract (first 500 chars) ---')
print(ocr_text[:500])


In [None]:

# 4) Simple PII redaction functions (regex-based). Replace/extend for production.
import re

def mask_account_numbers(text):
    """Mask every standalone run of 6-20 digits, leaving only its last four digits visible.

    Runs shorter than 6 or longer than 20 digits, and digit runs glued to
    letters or underscores, are left untouched.
    """
    def _keep_last_four(match):
        digits = match.group(1)
        return 'X' * (len(digits) - 4) + digits[-4:]

    return re.sub(r'\b(\d{6,20})\b', _keep_last_four, text)

def redact_aadhaar(text):
    """Replace Aadhaar-like numbers with the placeholder '[REDACTED_AADHAAR]'.

    A match is 12 digits written as three groups of four, with at most one
    whitespace character between groups, bounded by word boundaries.
    """
    aadhaar_like = re.compile(r'\b\d{4}\s?\d{4}\s?\d{4}\b')
    return aadhaar_like.sub('[REDACTED_AADHAAR]', text)

def mask_phone(text):
    """Mask standalone 10-digit numbers: keep the first six digits, replace the last four with 'XXXX'."""
    def _hide_tail(match):
        number = match.group(0)
        return number[:6] + 'XXXX'

    return re.sub(r'\b\d{10}\b', _hide_tail, text)

def redact_all(text):
    """Apply every redaction rule to ``text`` and return the redacted string.

    Rules run from most specific to most generic: Aadhaar-like 12-digit
    numbers, then 10-digit phone numbers, then any remaining 6-20 digit run.
    Running the generic digit-run mask first would rewrite 10- and 12-digit
    numbers before the Aadhaar and phone rules could see them, so those rules
    would never match contiguous numbers.

    Trade-off: a contiguous 12-digit account number is replaced by the
    Aadhaar placeholder rather than shown with its last four digits.
    """
    t = redact_aadhaar(text)
    t = mask_phone(t)
    t = mask_account_numbers(t)
    return t

# Preview the first few OCR lines before and after redaction, both printed
# line by line so the two samples can be compared directly.
PREVIEW_LINES = 6
print('Sample before redaction:\n', '\n'.join(ocr_text.splitlines()[:PREVIEW_LINES]))
print('\nSample after redaction:\n', '\n'.join(redact_all(ocr_text).splitlines()[:PREVIEW_LINES]))


In [None]:

# 5) Simple rule-based information extractor from OCR text.
# This is a demo. For production use layout-aware models (LayoutLM, Donut) or LLM prompts.
import re, json

def extract_name(text):
    """Guess the customer name from OCR text, or return None.

    Only the first 10 non-blank lines (stripped) are considered. Lines that
    contain 'Bank' or 'Statement' are treated as header lines and skipped.
    The first remaining line that is 3-60 characters long and made up solely
    of ASCII letters, spaces and dots is returned.
    """
    header_words = ('Bank', 'Statement')
    name_pattern = re.compile(r'^[A-Za-z .]{3,60}$')
    candidates = [ln.strip() for ln in text.splitlines() if ln.strip()]
    for candidate in candidates[:10]:
        if any(word in candidate for word in header_words):
            continue
        if name_pattern.match(candidate):
            return candidate
    return None

def extract_account_masked(text):
    """Return the masked account number found in OCR text, or None.

    First looks for a case-insensitive 'Account No' label followed by 6-20
    characters drawn from X/x, digits, hyphens and spaces, and returns that
    run stripped of surrounding whitespace. If there is no label, falls back
    to the first run of four or more 'X' followed by 2-4 digits.
    """
    labelled = re.search(r'Account\s*No[:\s]*([Xx\d\- ]{6,20})', text, re.IGNORECASE)
    if labelled is not None:
        return labelled.group(1).strip()
    # Fallback: a bare masked sequence anywhere in the text.
    bare = re.search(r'(X{4,}\d{2,4})', text)
    return bare.group(1) if bare is not None else None

def extract_transactions(text):
    """Find OCR lines that look like transactions (a date plus an amount).

    Returns a list of dicts, one per matching line, with keys:
        'line'    - the stripped line,
        'dates'   - every ``YYYY-MM-DD`` or ``DD/MM/YYYY`` date on the line,
        'amounts' - every number on the rest of the line, such as '250.00'
                    or '1,234.56'.

    Dates are blanked out before amounts are searched. Otherwise the month
    and day fragments of a date (e.g. '01' and '15') match the amount pattern,
    which pollutes 'amounts' and lets lines with no amount at all through.
    """
    date_re = re.compile(r'\b(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})\b')
    amt_re = re.compile(r'\b(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?)\b')
    txns = []
    for raw in text.splitlines():
        l = raw.strip()
        if not l:
            continue
        dates = date_re.findall(l)
        if not dates:
            continue
        # Search for amounts only in what is left once the dates are removed.
        amts = amt_re.findall(date_re.sub(' ', l))
        if amts:
            txns.append({'line': l, 'dates': dates, 'amounts': amts})
    return txns

# Run the three extractors on the sample OCR text and pretty-print the result.
# Only the first 10 transaction candidates are kept to keep the output short.
sample_text = ocr_text
ex_result = dict(
    name=extract_name(sample_text),
    account_masked=extract_account_masked(sample_text),
    transactions=extract_transactions(sample_text)[:10],
)
print('Extraction result (pretty):')
print(json.dumps(ex_result, indent=2))


In [None]:

# 6) Run OCR + extraction on all synthetic PDFs and save results as JSON
from pdf2image import convert_from_path
import pytesseract, os, json
# Input PDFs and the folder that receives one JSON result per PDF.
PDF_DIR = os.path.join('synthetic_data','pdfs')
OUT = os.path.join('synthetic_data','ocr_results')
os.makedirs(OUT, exist_ok=True)

files = [entry for entry in os.listdir(PDF_DIR) if entry.endswith('.pdf')]
for f in files:
    # Render every page, OCR each one, and join the page texts with newlines.
    pdf_path = os.path.join(PDF_DIR, f)
    pages = convert_from_path(pdf_path, dpi=180)
    page_texts = [pytesseract.image_to_string(page) for page in pages]
    full_text = '\n'.join(page_texts)

    # Keep only the first 800 characters of the raw and redacted text, plus
    # the rule-based extraction capped at 10 transaction candidates.
    raw_snippet = full_text[:800]
    redacted_snippet = redact_all(full_text)[:800]
    extracted = {
        'name': extract_name(full_text),
        'account_masked': extract_account_masked(full_text),
        'transactions': extract_transactions(full_text)[:10]
    }
    out = {
        'filename': f,
        'ocr_text_snippet': raw_snippet,
        'redacted_snippet': redacted_snippet,
        'extracted': extracted
    }

    json_path = os.path.join(OUT, f.replace('.pdf','.json'))
    with open(json_path, 'w', encoding='utf-8') as fh:
        json.dump(out, fh, indent=2)
print('Processed', len(files), 'files. Results in', OUT)



## Next steps (if you like this prototype)

- Replace rule-based extractor with a Layout-aware model (e.g., LayoutLMv3 / Donut) or LLM prompt approach.
- Improve OCR accuracy with preprocessing (deskew, denoise) or a commercial OCR API (for production, on-prem only).
- Add unit tests and synthetic edge cases (multilingual dates, multi-page statements).
- Implement stronger PII detection using NER models (spaCy, HuggingFace) in addition to regex.
- For production, **never** process real bank docs on Colab or any public cloud without explicit approval — use on-prem or VPC-secured services with audit logs.
