In [13]:
import json
import math
import re
from statistics import median

# Read the sample OCR result from disk and parse it into `ocr`.
with open('../data/ocr_jsons/sample.json', encoding='utf-8') as ocr_file:
    ocr = json.loads(ocr_file.read())

# Convert the annotations of a common OCR JSON layout (e.g. Google Vision API)
# into the {'text', 'bbox'} word records the rest of the script works with.

# Under 'textAnnotations', entry 0 holds the entire text block and the
# remaining entries are the individual words.
word_annotations = ocr.get('textAnnotations', [])

words = []
# A missing or empty annotation list simply leaves `words` empty.
for entry in (word_annotations[1:] if word_annotations else []):
    corners = entry.get('boundingPoly', {}).get('vertices', [])
    words.append({
        'text': entry.get('description', ''),
        # A vertex without an 'x' or 'y' key contributes 0 for that coordinate.
        'bbox': [(pt.get('x', 0), pt.get('y', 0)) for pt in corners],
    })


# Attach a centroid ('cx', 'cy') to every word: the mean of its bbox points.
# The centroid is what the line grouping sorts and compares on.
for word in words:
    points = word['bbox']
    if not points:
        # No geometry available: place the word at the origin.
        word['cx'] = 0
        word['cy'] = 0
        continue
    word['cx'] = sum(pt[0] for pt in points) / len(points)
    word['cy'] = sum(pt[1] for pt in points) / len(points)

# Group into lines by Y coordinate
def group_words_into_lines(words, y_tol=10):
    """Cluster OCR words into text lines by vertical position.

    Words are visited top-to-bottom (ties broken left-to-right). Each word
    joins the first existing line whose running median Y lies within
    ``y_tol`` of the word's ``cy``; if none qualifies, the word starts a new
    line. A line's median Y is recomputed every time a word is added to it.

    Args:
        words: Iterable of dicts with the keys ``'text'``, ``'bbox'`` (a
            sequence of ``(x, y)`` points, possibly empty), ``'cx'`` and
            ``'cy'``. The word dicts themselves are not modified.
        y_tol: Largest absolute difference between a word's ``cy`` and a
            line's median Y for the word to be placed on that line.

    Returns:
        A list of line dicts sorted by ``'median_y'``. Each line has:
        ``'median_y'``, ``'words'`` (its words sorted by ``cx``), ``'text'``
        (the word texts joined by single spaces), and ``'x_min'`` /
        ``'x_max'`` (horizontal extent over all bbox points of its words).
        When no word on the line has any bbox point, both extents are 0,
        the same fallback the script uses for the centroid of such words.

    Raises:
        KeyError: If a word lacks one of the required keys.
    """
    # Sort words by y then x
    words_sorted = sorted(words, key=lambda w: (w['cy'], w['cx']))
    lines = []
    for w in words_sorted:
        for line in lines:
            # Check if the word belongs to an existing line
            if abs(line['median_y'] - w['cy']) <= y_tol:
                line['words'].append(w)
                # Recalculate the median Y for the line
                line['median_y'] = median(wd['cy'] for wd in line['words'])
                break
        else:
            # No existing line was close enough: start a new one.
            lines.append({'median_y': w['cy'], 'words': [w]})

    # Convert word groups to text strings and get line bounds
    for line in lines:
        line['words'].sort(key=lambda wd: wd['cx'])  # Sort words in the line by X
        line['text'] = ' '.join(wd['text'] for wd in line['words'])
        xs = [p[0] for wd in line['words'] for p in wd['bbox']]
        # Words may carry an empty bbox; without `default`, min()/max() would
        # raise ValueError for a line made up only of such words.
        line['x_min'] = min(xs, default=0)
        line['x_max'] = max(xs, default=0)

    # Sort lines by their Y position before returning
    return sorted(lines, key=lambda ln: ln['median_y'])

# Rebuild the page text, one grouped line per row, and print a preview of it.
lines = group_words_into_lines(words, y_tol=12)
text_reconstructed = '\n'.join(line['text'] for line in lines)
print("--- Reconstructed Text (First 1000 Chars) ---", text_reconstructed[:1000], sep='\n')
# Separator between this preview and the output that follows.
print("\n" + "=" * 50 + "\n")


# Simple fixes
# Each entry is a (regex pattern, replacement) pair; entries are applied in
# list order by apply_common_fixes().
COMMON_FIXES = [
    # Collapse every whitespace run (newlines included) to a single space.
    (r'\s+', ' '),
    # OCR confusion of "of" with a leading zero. Anchored to word boundaries
    # so that tokens such as "10ft" or "0x0f" are left untouched.
    (r"\b0f\b", "of"),
    # OCR confusion of "m" with "rn".
    (r"Narne", "Name"),
]

def apply_common_fixes(text):
    """Apply every COMMON_FIXES substitution to *text*, in order.

    Args:
        text: The string to clean.

    Returns:
        The text after all substitutions, with leading and trailing
        whitespace stripped.
    """
    for pat, rep in COMMON_FIXES:
        text = re.sub(pat, rep, text)
    return text.strip()

# Run the cleanup pass over the reconstructed text and print a preview.
clean_text = apply_common_fixes(text_reconstructed)
print("--- Cleaned Text (First 800 Chars) ---", clean_text[:800], sep='\n')

--- Reconstructed Text (First 1000 Chars) ---



--- Cleaned Text (First 800 Chars) ---

