# Word-Level Grounding Using Agentic Document Extraction (ADE) from LandingAI

Extract a single field from a PDF and visualize where its source text appears at the word level for more precise grounding, using ADE's Parsing & Field Extraction Python library.

In [None]:
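# Install required packages
# Note: pytesseract is only a Python wrapper; the Tesseract OCR engine itself
# must also be installed on the system and available on PATH.
# !pip install landingai-ade python-dotenv pillow pymupdf matplotlib pytesseract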

In [12]:
from __future__ import annotations

import os
import json
from io import BytesIO
from pathlib import Path

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from PIL import Image, ImageDraw, ImageFont
import pymupdf
import matplotlib.pyplot as plt
import pytesseract

from landingai_ade import LandingAIADE
from landingai_ade.lib import pydantic_to_json_schema

# Load variables from a .env file (if one is found) into the process
# environment; later cells read their settings through os.environ.
load_dotenv()
print("Dependencies loaded")

Dependencies loaded


## Parse Document (Run Once)

In [None]:
# Create the ADE client; the API key is read from the environment
client = LandingAIADE(apikey=os.environ.get("VISION_AGENT_API_KEY"))
pdf_path = Path("demo_pdf.pdf")

print(f"Parsing {pdf_path}...")
# The model can be overridden through the ADE_MODEL environment variable
ade_model = os.environ.get("ADE_MODEL", "dpt-2-latest")
parse_response = client.parse(document=pdf_path, model=ade_model)

print("Parsed successfully!")
print(f"  - Markdown: {len(parse_response.markdown)} chars")
print(f"  - Chunks: {len(parse_response.chunks)}")

# Write the markdown to disk so the extraction cell can read it from a file
markdown_path = Path("parsed_output.md")
with markdown_path.open("w", encoding="utf-8") as md_file:
    md_file.write(parse_response.markdown)
print(f"Saved markdown to {markdown_path}")

# Flatten each chunk's grounding (type, page, box edges) into plain dicts
# so it can be stored as JSON and reloaded by the visualization cells
grounding_data = {
    chunk_id: {
        'type': entry.type,
        'page': entry.page,
        'box': {
            'left': entry.box.left,
            'top': entry.box.top,
            'right': entry.box.right,
            'bottom': entry.box.bottom,
        },
    }
    for chunk_id, entry in parse_response.grounding.items()
}

grounding_path = Path("grounding_data.json")
with grounding_path.open("w", encoding="utf-8") as json_file:
    json.dump(grounding_data, json_file, indent=2)
print(f"Saved grounding data to {grounding_path}")

## Extract the Field

In [13]:
# Create the ADE client; the API key is read from the environment
client = LandingAIADE(apikey=os.environ.get("VISION_AGENT_API_KEY"))

print("Extracting field from saved markdown file...")

# JSON Schema describing the single field to extract. It is built as a dict
# and serialized to a JSON string when handed to extract().
field_description = "The percentage of the human genome that was left unfinished before the Telomere-to-Telomere (T2T) Consortium's work"
schema_dict = {
    "type": "object",
    "properties": {
        "remaining_human_genome": {
            "type": "string",
            "description": field_description,
        },
    },
    "required": ["remaining_human_genome"],
}

# Run the extraction against the markdown file saved by the parse step
markdown_path = Path("parsed_output.md")
schema_json = json.dumps(schema_dict)
extract_response = client.extract(schema=schema_json, markdown=markdown_path)

banner = "=" * 60
remaining = extract_response.extraction.get('remaining_human_genome', 'N/A')
print("\n" + banner)
print("EXTRACTED DATA")
print(banner)
print(f"\nRemaining Human Genome: {remaining}")
print("\n" + banner)

Extracting field from saved markdown file...

EXTRACTED DATA

Remaining Human Genome: 8%



In [14]:
# Convert PDF to images: one PIL image per page, keyed by 0-based page index
pdf_path = Path("demo_pdf.pdf")
page_images = {}

# Scale both axes by 2 when rasterizing so the rendered page has more pixels
# for the OCR step than the default 1x render.
render_matrix = pymupdf.Matrix(2, 2)

# The context manager closes the document even if rendering a page raises,
# which the previous explicit pdf.close() call did not guarantee.
with pymupdf.open(pdf_path) as pdf:
    for page_num, page in enumerate(pdf):
        # alpha=False (the default) yields 3-channel samples, matching "RGB"
        pix = page.get_pixmap(matrix=render_matrix, alpha=False)
        page_images[page_num] = Image.frombytes(
            "RGB", (pix.width, pix.height), pix.samples
        )

print(f"Loaded {len(page_images)} page(s)")

Loaded 1 page(s)


In [15]:
# Read the saved grounding entries back from disk for the visualization
grounding_data = json.loads(Path("grounding_data.json").read_text(encoding="utf-8"))

# Reconstruct grounding objects
from types import SimpleNamespace


class Grounding:
    """Attribute-style view of one grounding entry loaded from JSON.

    Args:
        data: Mapping with 'type', 'page' and 'box' keys. 'box' is itself a
            mapping whose keys (e.g. 'left', 'top', 'right', 'bottom') become
            attributes of ``self.box``.

    Raises:
        KeyError: If 'type', 'page' or 'box' is missing from ``data``.
    """

    def __init__(self, data):
        self.type = data['type']
        self.page = data['page']
        # SimpleNamespace exposes the box keys as instance attributes without
        # manufacturing a new throwaway class for every grounding entry.
        self.box = SimpleNamespace(**data['box'])

    def __repr__(self):
        return f"Grounding(type={self.type!r}, page={self.page!r}, box={self.box!r})"

# Wrap every saved entry so its fields can be read as attributes
grounding_dict = {
    entry_id: Grounding(entry)
    for entry_id, entry in grounding_data.items()
}
entry_count = len(grounding_dict)
print(f"Loaded {entry_count} grounding entries")

Loaded 16 grounding entries


## Find Chunks with Extracted Value

Check extraction_metadata for chunk references.

In [None]:
from difflib import SequenceMatcher
import pytesseract

# Get the extracted value. Normalize to a string so a null or non-string
# value from the API cannot crash the later .lower() / fuzzy-match calls.
raw_value = extract_response.extraction.get('remaining_human_genome', '')
extracted_value = '' if raw_value is None else str(raw_value)
print(f"Extracted value: '{extracted_value}'")
print("\n" + "="*60)

# Check extraction_metadata for chunk references
print("\nChecking extraction_metadata...")
chunk_ids = []

# A missing attribute and an empty value are treated the same way
metadata = getattr(extract_response, 'extraction_metadata', None)
if metadata:
    # Check if our field has references
    if isinstance(metadata, dict) and 'remaining_human_genome' in metadata:
        field_meta = metadata['remaining_human_genome']
        print(f"Field metadata: {field_meta}")

        # Look for 'references' key (not 'chunk_references')
        if isinstance(field_meta, dict) and 'references' in field_meta:
            # `or []` keeps a null 'references' value from breaking len()
            # and sends it down the search-all-chunks fallback instead.
            chunk_ids = field_meta['references'] or []
            print(f"\n✓ Found {len(chunk_ids)} chunk reference(s) from API: {chunk_ids}")
        else:
            print("No 'references' key in field metadata")
    else:
        print(f"Field not in metadata. Keys: {metadata.keys() if isinstance(metadata, dict) else 'N/A'}")

# Fallback: search all chunks
if not chunk_ids:
    print("\nNo chunk references from API. Will search all chunks.")
    chunk_ids = list(grounding_dict.keys())

print(f"\n{'='*60}")
print(f"Will OCR {len(chunk_ids)} chunk(s)")

In [None]:
def fuzzy_match_ratio(text1, text2):
    """Compare two strings case-insensitively, ignoring surrounding whitespace.

    Returns the difflib SequenceMatcher ratio, a float between 0 and 1.
    """
    left = text1.lower().strip()
    right = text2.lower().strip()
    matcher = SequenceMatcher(isjunk=None, a=left, b=right)
    return matcher.ratio()

print("Running OCR on identified chunks...")
ocr_matches = []

# OCR words below this Tesseract confidence are ignored
MIN_OCR_CONFIDENCE = 30
# A word must score above this similarity to be recorded as a match
MIN_SIMILARITY = 0.5
# Similarity floor given to words that contain, or are contained in, the value
SUBSTRING_SIMILARITY = 0.85

# Lower-case the target once instead of once per OCR word. An empty target
# must never take the substring shortcut: '' is contained in every string,
# which would boost every OCR word to SUBSTRING_SIMILARITY.
target_lower = extracted_value.lower()

# Process each identified chunk
for chunk_id in chunk_ids:
    grounding = grounding_dict.get(chunk_id)
    if grounding is None:
        continue

    page_num = grounding.page
    img = page_images.get(page_num)
    if img is None:
        continue

    img_width, img_height = img.size

    # Get chunk bounding box in pixels (box edges are scaled by the rendered
    # image size, i.e. treated as fractions of the page width/height)
    box = grounding.box
    x1 = int(box.left * img_width)
    y1 = int(box.top * img_height)
    x2 = int(box.right * img_width)
    y2 = int(box.bottom * img_height)

    # Skip degenerate boxes: an empty crop gives the OCR engine nothing to read
    if x2 <= x1 or y2 <= y1:
        continue

    # Crop image to chunk area and run OCR only on this chunk
    chunk_img = img.crop((x1, y1, x2, y2))
    ocr_data = pytesseract.image_to_data(chunk_img, output_type=pytesseract.Output.DICT)

    # The OCR result is a dict of parallel lists; walk them in lockstep
    ocr_words = zip(
        ocr_data['text'],
        ocr_data['conf'],
        ocr_data['left'],
        ocr_data['top'],
        ocr_data['width'],
        ocr_data['height'],
    )
    for raw_text, raw_conf, left, top, width, height in ocr_words:
        text = raw_text.strip()
        if not text:
            continue

        conf = float(raw_conf)
        if conf < MIN_OCR_CONFIDENCE:
            continue

        # Fuzzy match with extracted value
        similarity = fuzzy_match_ratio(extracted_value, text)

        # Boost for substring matches (only when there is a target to match)
        text_lower = text.lower()
        if target_lower and (target_lower in text_lower or text_lower in target_lower):
            similarity = max(similarity, SUBSTRING_SIMILARITY)

        if similarity > MIN_SIMILARITY:
            # OCR coordinates are relative to the crop; shift them back into
            # full-page image space
            ocr_matches.append({
                'text': text,
                'similarity': similarity,
                'confidence': conf,
                'page': page_num,
                'chunk_id': chunk_id,
                'box_pixels': {'x': x1 + left, 'y': y1 + top, 'w': width, 'h': height},
            })

# Sort by similarity, best match first
ocr_matches.sort(key=lambda m: m['similarity'], reverse=True)

print(f"\nFound {len(ocr_matches)} OCR matches in {len(chunk_ids)} chunk(s):")
for match in ocr_matches[:10]:
    print(f"  - '{match['text']}': {match['similarity']:.1%} similarity, {match['confidence']:.0f}% confidence")
    print(f"    Chunk {match['chunk_id']}, Page {match['page']}")

print(f"\n{'='*60}")

In [None]:
import numpy as np

# Create heatmap visualization using OCR matches.
# Show the page that holds the first (best-ranked) match rather than always
# page 0, so matches on later pages of a multi-page PDF are not silently
# left out; fall back to the first page when nothing matched.
page_num = ocr_matches[0]['page'] if ocr_matches else 0
img = page_images[page_num].copy().convert('RGBA')
img_width, img_height = img.size

# Create a transparent overlay for highlighter effect
overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
overlay_draw = ImageDraw.Draw(overlay)

# Draw all chunks on this page in very light gray (subtle context)
for chunk_id, grounding_obj in grounding_dict.items():
    if grounding_obj.page != page_num:
        continue
    box = grounding_obj.box
    x1 = int(box.left * img_width)
    y1 = int(box.top * img_height)
    x2 = int(box.right * img_width)
    y2 = int(box.bottom * img_height)
    overlay_draw.rectangle([x1, y1, x2, y2], outline=(220, 220, 220, 60), width=1)

# Highlighter effect for OCR matches on the displayed page
for match in ocr_matches:
    if match['page'] != page_num:
        continue

    box_px = match['box_pixels']
    x1, y1 = box_px['x'], box_px['y']
    x2, y2 = x1 + box_px['w'], y1 + box_px['h']

    # Add padding for highlighter effect (like a marker pen)
    padding = 2
    x1 -= padding
    y1 -= padding
    x2 += padding
    y2 += padding

    similarity = match['similarity']

    # Transparent highlighter fill: higher similarity = more opaque
    if similarity > 0.9:
        # Bright yellow for near-exact matches
        color = (255, 255, 0, 120)
    elif similarity > 0.7:
        # Orange highlighter for good matches
        color = (255, 200, 0, 100)
    else:
        # Light yellow for partial matches
        color = (255, 255, 100, 80)

    # Draw filled rectangle (highlighter effect - no outline)
    overlay_draw.rectangle([x1, y1, x2, y2], fill=color)

    # Add a small similarity badge to the right of strong matches
    if similarity > 0.8:
        label = f"{similarity:.0%}"
        label_x = x2 + 2
        label_y = y1
        # Small red badge with the similarity percentage in white
        overlay_draw.ellipse([label_x, label_y, label_x+20, label_y+20], fill=(255, 0, 0, 200))
        overlay_draw.text((label_x+3, label_y+3), label, fill=(255, 255, 255, 255))

# Composite overlay with base image (both are RGBA and the same size)
img = Image.alpha_composite(img, overlay)

# Display
plt.figure(figsize=(14, 18))
plt.imshow(img)
plt.title(f"Highlighted: '{extracted_value}' ({len(ocr_matches)} matches found)", 
          fontsize=14, fontweight='bold')
plt.axis('off')
plt.tight_layout()
plt.show()

print(f"\nHighlighted {len(ocr_matches)} word-level matches")
if ocr_matches:
    best = ocr_matches[0]
    print(f"Best match: '{best['text']}' ({best['similarity']:.0%} similarity, {best['confidence']:.0f}% confidence)")