In [28]:
import cv2
import numpy as np
import pytesseract
from PIL import Image
import spacy
import re
import os

# --- Tesseract Configuration (Crucial for Windows) ---
# Replace this path with your EXACT Tesseract installation path if it's different.
tesseract_path = r'C:\Users\kulde\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = tesseract_path

# --- spaCy PII Model ---
# Load the small English model for PII Detection
nlp = spacy.load("en_core_web_sm") 

print("Dependencies loaded and Tesseract/spaCy configured.")

Dependencies loaded and Tesseract/spaCy configured.


In [29]:
# Function to apply image enhancements for better OCR results
def preprocess_image(image_path):
    """
    Applies image processing techniques (Grayscale, Binarization, Noise Reduction)
    to enhance the image quality for OCR.
    """
    # 1. Load the image using OpenCV
    # The image path is based on the file you uploaded
    img = cv2.imread(image_path)
    
    if img is None:
        print(f"Error: Could not load image at {image_path}")
        return None

    # 2. Convert to Grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # 3. Apply Binarization (Thresholding) using OTSU's method
    # This converts the image to pure black and white based on optimal thresholding
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # 4. Apply Gaussian Blur (Optional: Helps smooth edges and reduce salt-and-pepper noise)
    denoised = cv2.GaussianBlur(binary, (3, 3), 0)
    
    # We will skip complex deskewing (tilt correction) for now, relying on Tesseract's robustness.
    return denoised

In [30]:
def extract_text(processed_image):
    """
    Performs OCR on the pre-processed image using a config suitable for forms.
    We remove '-oem 1' as Tesseract 5 uses LSTM by default and the argument is causing an error.
    """
    if processed_image is None:
        return ""
        
    # We will only use PSM 4, which is better for reading down a structured form.
    # Tesseract 5 automatically uses the LSTM engine, making -oem 1 redundant.
    config_string = '--psm 4'
    
    raw_text = pytesseract.image_to_string(processed_image, config=config_string)
    
    return raw_text.strip()

In [31]:
def clean_text(raw_text):
    """
    Performs basic text cleaning: removing non-printable characters and extra spaces.
    """
    # 1. Remove non-ASCII characters and control characters (often noise from OCR)
    cleaned = re.sub(r'[^\x00-\x7F]+', ' ', raw_text)
    
    # 2. Replace multiple newlines/spaces with a single space
    # This flattens the form structure into a continuous block of text for PII detection
    cleaned = re.sub(r'\s+', ' ', cleaned)
    
    # Remove leading/trailing spaces
    return cleaned.strip()

In [32]:
# --- Pipeline Execution and Test ---
# Ensure 'sample_doc.jpg' is in the same folder as your notebook
image_file_path = 'doc_c.jpg' 

print(f"--- Running Pipeline on: {image_file_path} ---")

# 1. Pre-process
processed_img = preprocess_image(image_file_path)

# 2. Extract Text (OCR)
raw_text = extract_text(processed_img)

# 3. Clean Text
cleaned_text = clean_text(raw_text)

print("\n--- Raw Extracted Text (Before Cleaning) ---")
print(raw_text)

print("\n--- Cleaned Text (Ready for PII Detection) ---")
print(cleaned_text)

# We will use this variable in the next step
# The final variable that holds the text we care about
extracted_document_text = cleaned_text

--- Running Pipeline on: doc_c.jpg ---

--- Raw Extracted Text (Before Cleaning) ---
INSTITUTE OF MEDICAL SCIENCES & SUM HOSPITAL

(Faculty of Medical Sciences)
SIKSHA ‘0’ ANUSANDHAN
(DEEMED TO BE UNIVERSITY) |
| K-8, Kalinga Nagar. Bhubaneswar

| PROGRESS REPORT

| Panent Name: entesh Prvdten lessees sees Age: 2G . Sex: MA.

INSTITUTE OF MEDICAL SCIENCES & SUM HOSPITAL iti

(Faculty of Medical Sciences)
SIKSHA ‘O' ANUSANDHAN
(DEEMED TO BE UNIVERSITY)

K-8, Kalinga Nagar. Bhubaneswar a
PROGRESS REPORT Co |
A n Post Feadbon vee Age: 4 EY. Sex AT.

Patient Name? wire...
-)" . a - o <2 +) ; A, -
pono: ASE 254 Unione: 26258 WUC

iPO No:  UHIONG LEASE UU CITT Beano: JC!

DATE &TIME SIGNATURE

DATE &TIME | PROGRESS NOTES TREATMENT:ADVICE | SIGNATURE OF DOCTOR

~ -_———_ ‘ OF BOC" DOCTOR

Velox hs) Cuba! cal Rap by | — | Regatea
—- 6

-y Qn Obnewvediian -  wW sack!

Moret va
- Srrebl dy pate

PROGRESS NOTES TREATMENT/ADVICE

ORAL PISoRVER PUT
Anh aan SINDEN, Lore

THE. Sup STANCE

Ry.
Troy THI

In [33]:
# --- PII DETECTION FUNCTION ---

def detect_pii(text):
    """
    Uses the loaded spaCy model (nlp) to detect Named Entities that represent PII.
    We focus on common PII categories: PERSON, DATE, ORG, GPE (Geo-Political Entity/Location).
    """
    # Process the text using the spaCy model
    doc = nlp(text)
    
    pii_entities = []
    
    # Iterate over all entities found by spaCy
    for ent in doc.ents:
        # Check for common PII categories
        if ent.label_ in ["PERSON", "DATE", "ORG", "GPE", "LOC"]:
            pii_entities.append({
                "entity": ent.text, 
                "label": ent.label_,
                "start_char": ent.start_char
            })
            
    return pii_entities

# --- MULTI-FILE EXECUTION ---

# List of the files you need to process
input_files = ['doc_a.jpg', 'doc_b.jpg', 'doc_c.jpg']
results = {}

for filename in input_files:
    print(f"\n=======================================================")
    print(f"| PROCESSING FILE: {filename}")
    print(f"=======================================================")
    
    try:
        # 1. Pre-process Image
        processed_img = preprocess_image(filename)
        
        # 2. Extract Text (OCR)
        raw_text = extract_text(processed_img)
        
        # 3. Clean Text
        cleaned_text = clean_text(raw_text)
        
        # 4. PII Detection (NEW STAGE)
        pii_list = detect_pii(cleaned_text)
        
        # Store results
        results[filename] = {
            "cleaned_text": cleaned_text,
            "pii_data": pii_list,
        }
        
        # Display Output
        print("\n--- Cleaned Extracted Text (Snippet) ---")
        # Display only the first 300 characters for readability
        print(cleaned_text[:300] + ('...' if len(cleaned_text) > 300 else ''))
        
        print("\n--- Detected PII ---")
        if pii_list:
            for pii in pii_list:
                print(f"    - {pii['entity']} ({pii['label']})")
        else:
            print("    - No PII entities detected.")
            
    except Exception as e:
        print(f"*** ERROR PROCESSING {filename}: {e} ***")
        results[filename] = {"error": str(e)}



| PROCESSING FILE: doc_a.jpg

--- Cleaned Extracted Text (Snippet) ---
IMS Ae OEM OL INSTITUTE OF MEDICAL SCIENCES & SUM HOSPITAL (Faculty of Medical Sciences) SIKSHA O ANUSANDHAN (DEEMED TO BE UNIVERSITY) K-8, Kalinga Nagar, Bhubaneswar PROGRESS REPORT > CIWA-M Patient Name: reas co ee enen cue sence eee AGG! ceoeiee t... SOX! ae bene 6 s nQAN? UAK \ v ae 4 ad vecuenc...

--- Detected PII ---
    - IMS (ORG)
    - OEM OL INSTITUTE OF MEDICAL SCIENCES & SUM HOSPITAL (Faculty of Medical Sciences (ORG)
    - ANUSANDHAN (ORG)
    - DEEMED TO BE UNIVERSITY (ORG)
    - Kalinga Nagar (PERSON)
    - SOX (ORG)
    - UAK (ORG)
    - ROME (ORG)
    - tN MS VS0G5 DEPEMPER OE (ORG)
    - PAN (PERSON)
    - BD (ORG)
    - TIM Fc (PERSON)
    - Tey Tin AMINE (PERSON)
    - Le THMINI-O - F v (ORG)
    - Mtr Consunant (PERSON)

| PROCESSING FILE: doc_b.jpg

--- Cleaned Extracted Text (Snippet) ---
INS PRE INE) ee SS ees es es TH ROMROPHin Hehe etter 1 eS a A A Dose 0 et Direction D 7 q pare eee ny Fe