In [58]:
import cv2
import numpy as np

img = cv2.imread("medical_form.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

In [None]:


# binary image (invert so lines are white)
_, bw = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)

# Detect horizontal lines
h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
h_lines = cv2.morphologyEx(bw, cv2.MORPH_OPEN, h_kernel)

# Detect vertical lines
v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
v_lines = cv2.morphologyEx(bw, cv2.MORPH_OPEN, v_kernel)

# Combine lines
lines = cv2.add(h_lines, v_lines)

# Find contours
contours, _ = cv2.findContours(lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

fields = []
for cnt in contours:
    x, y, w, h = cv2.boundingRect(cnt)
    if w > 70 and h > 5:
        # Store as a dictionary or object to match your Token style
        fields.append({
            'type': 'FIELD_SPACE', # or CHECKBOX depending on shape
            'value': '____',
            'x': x, 'y': y, 'w': w, 'h': h
        })

# CRITICAL STEP: SORT BY Y (Top-to-Bottom), THEN X (Left-to-Right)
# We use y // 10 to allow for slight "wobble" in alignment (Row Clustering)
fields.sort(key=lambda b: (b['y'] // 10, b['x']))

print(fields)
cv2.imwrite("detected_fields.jpg", img)

[{'type': 'FIELD_SPACE', 'value': '____', 'x': 403, 'y': 493, 'w': 1053, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 1855, 'y': 497, 'w': 613, 'h': 61}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 410, 'y': 706, 'w': 2059, 'h': 233}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 768, 'y': 1211, 'w': 798, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 2058, 'y': 1211, 'w': 430, 'h': 55}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 728, 'y': 1534, 'w': 798, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 148, 'y': 1726, 'w': 2324, 'h': 354}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 1823, 'y': 2369, 'w': 574, 'h': 72}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 649, 'y': 2370, 'w': 798, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 648, 'y': 2472, 'w': 797, 'h': 59}, {'type': 'FIELD_SPACE', 'value': '____', 'x': 1825, 'y': 2472, 'w': 574, 'h': 72}]


True

In [None]:
def merge_and_sort_tokens(text_tokens, visual_tokens, row_tolerance=20):
    # 1. Unified List
    all_tokens = text_tokens + visual_tokens

    # 2. Initial Sort by Y (Top to Bottom)
    # This gets them roughly in order so we can cluster them
    all_tokens.sort(key=lambda t: t['y'])

    rows = []
    current_row = []
    
    # We track the "average Y" of the current row to handle drift
    current_row_y = 0 

    if all_tokens:
        current_row = [all_tokens[0]]
        current_row_y = all_tokens[0]['y']

    for i in range(1, len(all_tokens)):
        token = all_tokens[i]
        
        # 3. Check Vertical Distance
        # If this token is within 'row_tolerance' pixels of the current row's Y...
        if abs(token['y'] - current_row_y) <= row_tolerance:
            current_row.append(token)
            
            # Optional: Update average Y (moving average) to follow the line's drift
            # current_row_y = (current_row_y + token['y']) / 2
        else:
            # It's a new line! 
            # a. Sort the OLD row by X (Left to Right)
            current_row.sort(key=lambda t: t['x'])
            rows.append(current_row)
            
            # b. Start the NEW row
            current_row = [token]
            current_row_y = token['y']

    # Don't forget the last row
    if current_row:
        current_row.sort(key=lambda t: t['x'])
        rows.append(current_row)

    # 4. Flatten into a single stream
    # This turns [[Row1_Item1, Row1_Item2], [Row2_Item1]] into [Item1, Item2, Item1...]
    final_stream = [token for row in rows for token in row]
    
    return final_stream