In [124]:
from pdf2image import convert_from_path
import os
import pytesseract
from pytesseract import Output
import pandas as pd
from dotenv import load_dotenv
import numpy as np
import cv2

load_dotenv()

POPPLER_PATH = os.getenv("POPPLER_PATH")
pytesseract.pytesseract.tesseract_cmd = os.getenv("PYTESSERACT_PATH")

In [90]:
class Token:
    def __init__(self, type, value, bbox):
        self.type = type   # e.g., "FIELD_LABEL"
        self.value = value # e.g., "Name:"
        self.bbox = bbox   # (x, y, w, h)

In [91]:
pages = convert_from_path(
    "./forms/forms.pdf", 
    300,
    poppler_path=POPPLER_PATH
)

data = pytesseract.image_to_data(pages[2], output_type=Output.DICT)

In [164]:
def process_ocr_data(data, gap_threshold=27):
    """
    Groups Tesseract words into lines based on Block+Paragraph+Line ID,
    then splits them horizontally if a large gap is detected.
    """
    raw_lines = {}
    n_boxes = len(data['text'])
    
    for i in range(n_boxes):
        # Skip empty text or low confidence garbage
        if int(data['conf'][i]) == -1 or not data['text'][i].strip():
            continue
        
        # Creating an ID (key) for each word
        line_id = f"{data['block_num'][i]}_{data['par_num'][i]}_{data['line_num'][i]}"
        
        word_info = {
            "text": (data['text'][i].replace("_", "")).strip(),
            "left": data['left'][i],
            "top": data['top'][i],
            "width": data['width'][i],
            "right": data['left'][i] + data['width'][i],
            "height": data['height'][i]
        }
        
        if line_id not in raw_lines:
            raw_lines[line_id] = []
        raw_lines[line_id].append(word_info)


    # 2. Process each line: Merge words, but split on huge gaps
    final_tokens = []

    for line_id, words in raw_lines.items():
        # Ensure words are sorted left-to-right (Tesseract usually does this, but be safe)
        words.sort(key=lambda w: w['left'])
        
        # Initialize the first phrase with the first word
        current_phrase_text = words[0]['text']
        current_phrase_bbox = [words[0]['left'], words[0]['top'], words[0]['width'], words[0]['height']]
        last_word_right = words[0]['right']

        for i in range(1, len(words)):
            word = words[i]
            
            # Calculate the gap between the end of the last word and start of this one
            gap = word['left'] - last_word_right
            
            # DECISION: Is this a new field on the same line?
            if gap > gap_threshold:
                # YES: The gap is huge (e.g., between "Name:" and "Phone:")
                # 1. Save the previous phrase as a complete token
                
                token_type = ""
                token_id=0

                if (current_phrase_text.strip().endswith(":")):
                    token_type = "FIELD_LABEL"
                    token_id = 3
                else:
                    token_type = "NOTE"
                    token_id = 5
                  
                final_tokens.append({
                    "id": token_id,
                    "type": token_type,
                    "value": current_phrase_text,
                    "x": current_phrase_bbox[0],
                    "y": current_phrase_bbox[1],
                    "w": current_phrase_bbox[2],
                    "h": current_phrase_bbox[3]
                })
                
                # 2. Start a completely new phrase for the current word
                current_phrase_text = word['text']
                # Reset bbox to this new word's geometry
                current_phrase_bbox = [word['left'], word['top'], word['width'], word['height']]
                
            else:
                # NO: The gap is small. It's part of the same sentence.
                current_phrase_text += " " + word['text']
                
                # Expand the width of the current phrase to include this word
                # New Width = (Word Right Edge) - (Phrase Left Edge)
                current_phrase_bbox[2] = word['right'] - current_phrase_bbox[0]
                
                # Update height to be the max height seen (optional, but good for bounding boxes)
                current_phrase_bbox[3] = max(current_phrase_bbox[3], word['height'])
            
            # Update tracker for the next iteration
            last_word_right = word['right']


        # Append the final phrase of the line after the loop finishes
        token_type = ""
        token_id = 0

        if (current_phrase_text.strip().endswith(":")):
            token_type = "FIELD_LABEL"
            token_id = 3
        else:
            token_type = "NOTE"
            token_id = 5
            
        final_tokens.append({
            "id": token_id,
            "type": token_type,
            "value": current_phrase_text,
            "x": current_phrase_bbox[0],
            "y": current_phrase_bbox[1],
            "w": current_phrase_bbox[2],
            "h": current_phrase_bbox[3]
        })


    # clean the final tokens
    notes = [t for t in final_tokens if t["type"] == "NOTE"]
    heights = np.array([t["h"] for t in notes])
    median_h = np.median(heights)
    max_h = np.max(heights)

    img = cv2.imread("medical_form.jpg")
    page_height, page_width = img.shape[:2]

    PAGE_WIDTH = page_width

    def is_form_title(t, median_h):
        return (
            t["h"] >= median_h * 1.3 and   # ← lower
            t["y"] < 300 and
            t["w"] > 0.5 * PAGE_WIDTH      # ← lower
        )

    
    def is_section_title(t, median_h):
        return (
            t["h"] >= median_h * 0.80 and          # similar to notes
            t["w"] >= 0.10 * PAGE_WIDTH and         # wider than labels
            len(t["value"].split()) <= 6 and
            not t["value"].strip().endswith(":")   # exclude labels
        )


    for t in final_tokens:
        if t["type"] != "NOTE":
            continue

        if is_form_title(t, median_h):
            t["type"] = "FORM_TITLE"
            t["id"] = 1
        elif is_section_title(t, median_h):
            t["type"] = "SECTION_TITLE"
            t["id"] = 2
        else:
            t["type"] = "NOTE"

    
    return final_tokens

In [165]:
text_tokens = process_ocr_data(data)

for obj in text_tokens:
    print(obj)

{'id': 1, 'type': 'FORM_TITLE', 'value': 'Medical Consent Form (ABC Medics)', 'x': 153, 'y': 168, 'w': 1344, 'h': 79}
{'id': 2, 'type': 'SECTION_TITLE', 'value': 'Patient Information', 'x': 155, 'y': 391, 'w': 575, 'h': 45}
{'id': 3, 'type': 'FIELD_LABEL', 'value': 'Full Name:', 'x': 154, 'y': 508, 'w': 226, 'h': 33}
{'id': 3, 'type': 'FIELD_LABEL', 'value': 'Date of Birth:', 'x': 1539, 'y': 508, 'w': 273, 'h': 33}
{'id': 3, 'type': 'FIELD_LABEL', 'value': 'Address:', 'x': 150, 'y': 707, 'w': 185, 'h': 33}
{'id': 2, 'type': 'SECTION_TITLE', 'value': 'Emergency Contact', 'x': 155, 'y': 1108, 'w': 587, 'h': 56}
{'id': 3, 'type': 'FIELD_LABEL', 'value': 'Emergency Contact Name:', 'x': 154, 'y': 1224, 'w': 570, 'h': 42}
{'id': 3, 'type': 'FIELD_LABEL', 'value': 'Phone Number:', 'x': 1677, 'y': 1224, 'w': 330, 'h': 33}
{'id': 2, 'type': 'SECTION_TITLE', 'value': 'Medical Details', 'x': 155, 'y': 1427, 'w': 462, 'h': 45}
{'id': 3, 'type': 'FIELD_LABEL', 'value': 'Medical Provider Name:', 'x'

## Open CV Implementation

In [None]:
import cv2
import numpy as np

img = cv2.imread("medical_form.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

In [97]:


# binary image (invert so lines are white)
_, bw = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

# Detect horizontal lines
h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
h_lines = cv2.morphologyEx(bw, cv2.MORPH_OPEN, h_kernel)

# Detect vertical lines
v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
v_lines = cv2.morphologyEx(bw, cv2.MORPH_OPEN, v_kernel)

# Combine lines
lines = cv2.add(h_lines, v_lines)

# Find contours
contours, _ = cv2.findContours(lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

visual_tokens = []
for cnt in contours:
    x, y, w, h = cv2.boundingRect(cnt)
    if w > 70 and h > 5:
        # Store as a dictionary or object to match your Token style
        visual_tokens.append({
            'type': 'FIELD_SPACE', # or CHECKBOX depending on shape
            'value': '____',
            'x': x, 'y': y, 'w': w, 'h': h
        })

# CRITICAL STEP: SORT BY Y (Top-to-Bottom), THEN X (Left-to-Right)
# We use y // 10 to allow for slight "wobble" in alignment (Row Clustering)
visual_tokens.sort(key=lambda b: (b['y'] // 10, b['x']))

print(visual_tokens)
cv2.imwrite("detected_fields.jpg", img)

[{'type': 'FIELD_SPACE', 'value': '____', 'x': 403, 'y': 493, 'w': 1053, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 1855, 'y': 497, 'w': 613, 'h': 61}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 410, 'y': 706, 'w': 2059, 'h': 233}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 768, 'y': 1211, 'w': 798, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 2058, 'y': 1211, 'w': 430, 'h': 55}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 728, 'y': 1534, 'w': 798, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 148, 'y': 1726, 'w': 2324, 'h': 354}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 1823, 'y': 2369, 'w': 574, 'h': 72}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 649, 'y': 2370, 'w': 798, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 648, 'y': 2472, 'w': 797, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 1825, 'y': 2472, 'w': 574, 'h': 72}]


True

## Merging and Sorting

In [98]:
def merge_and_sort_tokens(text_tokens, visual_tokens, row_tolerance=20):
    # 1. Unified List
    all_tokens = text_tokens + visual_tokens

    # 2. Initial Sort by Y (Top to Bottom)
    # This gets them roughly in order so we can cluster them
    all_tokens.sort(key=lambda t: t['y'])

    rows = []
    current_row = []
    
    # We track the "average Y" of the current row to handle drift
    current_row_y = 0 

    if all_tokens:
        current_row = [all_tokens[0]]
        current_row_y = all_tokens[0]['y']

    for i in range(1, len(all_tokens)):
        token = all_tokens[i]
        
        # 3. Check Vertical Distance
        # If this token is within 'row_tolerance' pixels of the current row's Y...
        if abs(token['y'] - current_row_y) <= row_tolerance:
            current_row.append(token)
            
            # Optional: Update average Y (moving average) to follow the line's drift
            # current_row_y = (current_row_y + token['y']) / 2
        else:
            # It's a new line! 
            # a. Sort the OLD row by X (Left to Right)
            current_row.sort(key=lambda t: t['x'])
            rows.append(current_row)
            
            # b. Start the NEW row
            current_row = [token]
            current_row_y = token['y']

    # Don't forget the last row
    if current_row:
        current_row.sort(key=lambda t: t['x'])
        rows.append(current_row)

    # 4. Flatten into a single stream
    # This turns [[Row1_Item1, Row1_Item2], [Row2_Item1]] into [Item1, Item2, Item1...]
    final_stream = [token for row in rows for token in row]
    
    return final_stream

In [101]:
tokens = merge_and_sort_tokens(text_tokens, visual_tokens)

In [102]:
for token in tokens:
    print(token)

{'type': 'TEXT_FRAGMENT', 'value': 'Medical Consent Form (ABC Medics)', 'x': 153, 'y': 168, 'w': 1344, 'h': 79}
{'type': 'TEXT_FRAGMENT', 'value': 'Patient Information', 'x': 155, 'y': 391, 'w': 575, 'h': 45}
{'type': 'FIELD_LABEL', 'value': 'Full Name:', 'x': 154, 'y': 508, 'w': 226, 'h': 33}
{'type': 'FIELD_SPACE', 'value': '____', 'x': 403, 'y': 493, 'w': 1053, 'h': 59}
{'type': 'FIELD_LABEL', 'value': 'Date of Birth:', 'x': 1539, 'y': 508, 'w': 273, 'h': 33}
{'type': 'FIELD_SPACE', 'value': '____', 'x': 1855, 'y': 497, 'w': 613, 'h': 61}
{'type': 'FIELD_LABEL', 'value': 'Address:', 'x': 150, 'y': 707, 'w': 185, 'h': 33}
{'type': 'FIELD_SPACE', 'value': '____', 'x': 410, 'y': 706, 'w': 2059, 'h': 233}
{'type': 'TEXT_FRAGMENT', 'value': 'Emergency Contact', 'x': 155, 'y': 1108, 'w': 587, 'h': 56}
{'type': 'FIELD_LABEL', 'value': 'Emergency Contact Name:', 'x': 154, 'y': 1224, 'w': 570, 'h': 42}
{'type': 'FIELD_SPACE', 'value': '____', 'x': 768, 'y': 1211, 'w': 798, 'h': 59}
{'type': 

In [103]:
# 2. Iterate through your token stream
for t in tokens:
    x, y, w, h = t['x'], t['y'], t['w'], t['h']
    
    # COLOR CODING SCHEME (BGR Format)
    if t['type'] == 'FIELD_SPACE':
        color = (0, 0, 255)      # Red for Input Boxes
        thickness = 2
    elif t['type'] == 'CHECKBOX':
        color = (0, 0, 255)      # Red for Checkboxes
        thickness = 2
    elif t['type'] == 'FIELD_LABEL':
        color = (0, 255, 0)      # Green for Labels
        thickness = 2
    elif t['type'] == 'SECTION_TITLE':
        color = (255, 0, 0)      # Blue for Titles
        thickness = 3
    else:
        color = (255, 255, 0)    # Cyan for Fragments/Notes
        thickness = 1

    # 3. Draw the Rectangle
    cv2.rectangle(img, (x, y), (x + w, y + h), color, thickness)
    
    # 4. Draw the Label (Tiny text above the box)
    # This helps you see if the SYSTEM thinks it's a "Label" or a "Note"
    label_text = f"{t['type']} ({t['value'][:10]}...)"
    cv2.putText(img, label_text, (x, y - 5), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

# 5. Save the result
cv2.imwrite("final_detected.jpg", img)


True