# AWS Textract Integration for PDF Form Analysis

This notebook demonstrates how to use AWS Textract to:
1. Automatically detect form fields in PDFs
2. Extract key-value pairs
3. Get precise field locations
4. Map extracted data to populate forms

AWS Textract is especially useful for:
- Complex forms with unknown structure
- Scanned documents
- Documents with mixed languages (Korean + English)
- Automatic field detection without manual mapping

## Prerequisites

Before running this notebook:

1. Install boto3 and PyMuPDF (imported below as `fitz`):
```bash
pip install boto3 pymupdf
```

2. Configure AWS credentials:
```bash
aws configure
```

3. Ensure you have AWS Textract permissions in your IAM role

In [None]:
import json
import os
from pathlib import Path

try:
    import boto3
    from botocore.exceptions import ClientError, NoCredentialsError
    print("✓ boto3 imported successfully")
except ImportError:
    print("✗ boto3 not installed. Run: pip install boto3")
    print("  This notebook requires AWS SDK for Python.")

import fitz  # PyMuPDF for PDF manipulation
print("✓ PyMuPDF imported successfully")

In [None]:
# Input locations
PDF_INPUT = 'pdf/A0124_pages_1_to_4.pdf'
JSON_INPUT = 'inputs/test.json'

# Region of the Textract endpoint; switch to the region you work in
AWS_REGION = 'us-east-1'

# Echo the active configuration in a single write
print(
    f"PDF: {PDF_INPUT}\n"
    f"JSON: {JSON_INPUT}\n"
    f"AWS Region: {AWS_REGION}"
)

## Step 1: Analyze PDF with AWS Textract

In [None]:
def _split_pdf_into_single_pages(pdf_path):
    """Return one standalone single-page PDF (as bytes) per page of pdf_path."""
    page_payloads = []
    source = fitz.open(pdf_path)
    try:
        for index in range(len(source)):
            single = fitz.open()  # new empty PDF
            try:
                single.insert_pdf(source, from_page=index, to_page=index)
                page_payloads.append(single.tobytes())
            finally:
                single.close()
    finally:
        source.close()
    return page_payloads


def analyze_document_with_textract(pdf_path, region='us-east-1'):
    """
    Use AWS Textract to analyze a PDF document for form key-value pairs.

    The synchronous AnalyzeDocument operation only accepts single-page
    documents, so a multi-page PDF is split with PyMuPDF and submitted one
    page at a time; the per-page blocks are merged into one response-shaped
    dict.  A single-page PDF is sent unchanged, exactly as before.

    Args:
        pdf_path: Path of the PDF file to analyze.
        region: AWS region used for the Textract client.

    Returns:
        dict containing a 'Blocks' list.  For a single-page PDF this is the
        unmodified Textract response.  For a multi-page PDF it is a dict with
        'DocumentMetadata' ({'Pages': n}) and 'Blocks', where every block's
        'Page' is its 1-based page number in the source PDF.

    Raises:
        Errors from open(), PyMuPDF and boto3 are not caught here; they
        propagate to the caller.
    """
    # Initialize Textract client
    textract = boto3.client('textract', region_name=region)

    # Read PDF file
    with open(pdf_path, 'rb') as document:
        pdf_bytes = document.read()

    page_payloads = _split_pdf_into_single_pages(pdf_path)

    # Call Textract - analyze document for forms
    print("Sending document to AWS Textract...")
    if len(page_payloads) <= 1:
        # Single page: send the original bytes untouched.
        response = textract.analyze_document(
            Document={'Bytes': pdf_bytes},
            FeatureTypes=['FORMS']  # Can also use 'TABLES' for table extraction
        )
    else:
        merged_blocks = []
        for page_number, payload in enumerate(page_payloads, start=1):
            page_response = textract.analyze_document(
                Document={'Bytes': payload},
                FeatureTypes=['FORMS']
            )
            # Each call sees a one-page document, so restore the real page
            # number before merging.
            for block in page_response['Blocks']:
                block['Page'] = page_number
            merged_blocks.extend(page_response['Blocks'])
        response = {
            'DocumentMetadata': {'Pages': len(page_payloads)},
            'Blocks': merged_blocks,
        }

    print("✓ Textract analysis complete!")
    print(f"  Blocks found: {len(response['Blocks'])}")

    return response

# Run the Textract analysis and persist the raw response.
try:
    print("="*60)
    print("AWS Textract Analysis")
    print("="*60 + "\n")

    textract_response = analyze_document_with_textract(PDF_INPUT, AWS_REGION)

    # Save raw response for analysis.  The output directory is created first:
    # otherwise a fresh checkout fails here with FileNotFoundError, which the
    # generic handler below would report as a plain "Error".
    os.makedirs('results', exist_ok=True)
    with open('results/textract_response.json', 'w', encoding='utf-8') as f:
        json.dump(textract_response, f, indent=2, ensure_ascii=False)
    print("\n✓ Raw Textract response saved to: results/textract_response.json")

except NoCredentialsError:
    print("\n✗ AWS credentials not found!")
    print("  Please configure AWS credentials using 'aws configure'")
except ClientError as e:
    print(f"\n✗ AWS Error: {e}")
    print("  Check your AWS permissions and region settings.")
except Exception as e:
    # Notebook-level boundary: report anything else instead of a traceback.
    print(f"\n✗ Error: {e}")

## Step 2: Extract Key-Value Pairs from Textract Response

In [None]:
def _child_texts(block, blocks_by_id, include_selection=False):
    """Collect the text of the blocks referenced by block's CHILD relationships.

    WORD children contribute their 'Text'.  When include_selection is true,
    SELECTION_ELEMENT children contribute their 'SelectionStatus', so a
    checkbox value is reported instead of silently coming back empty.
    """
    texts = []
    for relationship in block.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id.get(child_id)
            if not child:
                continue
            if child['BlockType'] == 'WORD':
                texts.append(child.get('Text', ''))
            elif include_selection and child['BlockType'] == 'SELECTION_ELEMENT':
                texts.append(child.get('SelectionStatus', ''))
    return texts


def extract_key_value_pairs(textract_response):
    """
    Extract key-value pairs from a Textract response.

    Every KEY_VALUE_SET block whose EntityTypes contains 'KEY' is resolved to
    its key text (the WORD children of the key block) and its value text (the
    WORD / SELECTION_ELEMENT children of the blocks referenced through the
    key's VALUE relationships).

    Args:
        textract_response: dict with a 'Blocks' list; each block needs at
            least 'Id' and 'BlockType'.

    Returns:
        dict mapping key text to value text.  Keys with no text are skipped.
        A key without a value maps to ''.  If the same key text occurs more
        than once, the later occurrence overwrites the earlier one.
    """
    # Map block IDs to blocks so relationships can be followed.
    blocks_by_id = {block['Id']: block for block in textract_response['Blocks']}

    key_value_pairs = {}

    for block in textract_response['Blocks']:
        if block['BlockType'] != 'KEY_VALUE_SET':
            continue
        if 'KEY' not in (block.get('EntityTypes') or []):
            continue

        key_text = ' '.join(_child_texts(block, blocks_by_id)).strip()

        # Follow VALUE relationships to the value blocks and gather their text.
        value_parts = []
        for relationship in block.get('Relationships') or []:
            if relationship['Type'] != 'VALUE':
                continue
            for value_id in relationship['Ids']:
                value_block = blocks_by_id.get(value_id)
                if value_block:
                    value_parts.extend(
                        _child_texts(value_block, blocks_by_id, include_selection=True)
                    )
        value_text = ' '.join(value_parts).strip()

        if key_text:
            key_value_pairs[key_text] = value_text

    return key_value_pairs

# Extract and persist key-value pairs once a Textract response is available.
if 'textract_response' in locals():
    print("\n" + "="*60)
    print("Extracted Key-Value Pairs")
    print("="*60 + "\n")

    kv_pairs = extract_key_value_pairs(textract_response)

    if kv_pairs:
        print(f"Found {len(kv_pairs)} key-value pairs:\n")
        for key, value in kv_pairs.items():
            print(f"  {key}: {value}")

        # Save to JSON; create the output directory first so the write does
        # not fail with FileNotFoundError when 'results/' is missing.
        os.makedirs('results', exist_ok=True)
        with open('results/textract_key_values.json', 'w', encoding='utf-8') as f:
            json.dump(kv_pairs, f, indent=2, ensure_ascii=False)
        print("\n✓ Saved to: results/textract_key_values.json")
    else:
        print("No key-value pairs found.")
        print("The document might not have structured form fields.")
else:
    print("Textract response not available. Run the previous cell first.")

## Step 3: Extract All Text with Positions

In [None]:
def extract_text_with_geometry(textract_response):
    """
    Collect every LINE and WORD block together with where it sits on the page.

    Each returned dict carries the block's text, block type, confidence,
    page number and bounding box (left/top/width/height).  Missing fields
    fall back to '' for text, 0 for numbers and 1 for the page.
    """
    wanted_types = ('LINE', 'WORD')

    def describe(block):
        # Flatten one Textract block into the simplified record.
        box = block.get('Geometry', {}).get('BoundingBox', {})
        position = {
            'left': box.get('Left', 0),
            'top': box.get('Top', 0),
            'width': box.get('Width', 0),
            'height': box.get('Height', 0),
        }
        return {
            'text': block.get('Text', ''),
            'type': block['BlockType'],
            'confidence': block.get('Confidence', 0),
            'page': block.get('Page', 1),
            'geometry': position,
        }

    return [
        describe(block)
        for block in textract_response['Blocks']
        if block['BlockType'] in wanted_types
    ]

# List positioned text elements and persist them once a response is available.
if 'textract_response' in locals():
    print("\n" + "="*60)
    print("Text Elements with Geometry")
    print("="*60 + "\n")

    text_elements = extract_text_with_geometry(textract_response)

    print(f"Found {len(text_elements)} text elements")
    print("\nFirst 10 elements:")
    for elem in text_elements[:10]:
        print(f"  '{elem['text']}' ({elem['type']}) - Page {elem['page']}")
        print(f"    Position: ({elem['geometry']['left']:.3f}, {elem['geometry']['top']:.3f})")
        print(f"    Confidence: {elem['confidence']:.1f}%")

    # Save to JSON; create the output directory first so the write does not
    # fail with FileNotFoundError when 'results/' is missing.
    os.makedirs('results', exist_ok=True)
    with open('results/textract_text_elements.json', 'w', encoding='utf-8') as f:
        json.dump(text_elements, f, indent=2, ensure_ascii=False)
    print("\n✓ Saved to: results/textract_text_elements.json")

## Step 4: Smart Mapping - Match JSON Data to Textract Fields

In [None]:
def _label_has_keyword(label_lower, keyword):
    """Return True when keyword occurs in the lower-cased field label.

    Two-letter ASCII keywords (currently only 'id') must not be embedded in a
    longer ASCII word, so 'id' matches "ID No." but not "Valid" or
    "Residence".  All other keywords use plain substring matching.
    """
    import re  # local import: 're' is not among the file's top-level imports

    if len(keyword) <= 2 and keyword.isascii() and keyword.isalpha():
        pattern = r'(?<![a-z])' + re.escape(keyword) + r'(?![a-z])'
        return re.search(pattern, label_lower) is not None
    return keyword in label_lower


def smart_match_fields(json_data, textract_kv_pairs):
    """
    Match JSON fields with Textract-detected form fields by label keywords.

    Args:
        json_data: Path of a JSON file; the values to fill in are read from
            its top-level 'parsedJson' object.
        textract_kv_pairs: dict mapping detected field label to the value
            currently found on the form.

    Returns:
        list of dicts with keys 'pdf_field', 'json_field', 'current_value',
        'new_value' and 'match_confidence' (always 'high'), in the iteration
        order of textract_kv_pairs.  A label is assigned to the first field
        whose keywords match; it is skipped if the JSON has no truthy value
        for that field.
    """
    # Load JSON data
    with open(json_data, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Tolerate a missing or null 'parsedJson'.
    parsed_data = data.get('parsedJson') or {}

    # Matching keywords, checked in this order.  'id_number' comes last on
    # purpose: its generic keywords ('number', '번호') also occur in labels
    # such as "Phone Number" or "전화번호", which must resolve to the more
    # specific field first.
    field_keywords = {
        'name': ['name', '성명', 'full name', 'fullname', 'nome', 'nombre'],
        'address': ['address', '주소', 'addr', 'location'],
        'phone': ['phone', 'tel', '전화', 'telephone', 'mobile', 'contact'],
        'id_number': ['id', 'resident', '주민', 'registration', 'number', '번호'],
    }

    mappings = []

    # Try to match each Textract field with JSON data
    for textract_key, textract_value in textract_kv_pairs.items():
        textract_key_lower = textract_key.lower()

        for json_field, keywords in field_keywords.items():
            if not any(_label_has_keyword(textract_key_lower, kw) for kw in keywords):
                continue

            json_value = parsed_data.get(json_field)
            if json_value:
                mappings.append({
                    'pdf_field': textract_key,
                    'json_field': json_field,
                    'current_value': textract_value,
                    'new_value': json_value,
                    'match_confidence': 'high'
                })
            # The first matching field wins, even when it has no JSON value.
            break

    return mappings

# Map detected fields to JSON data once key-value pairs are available.
if 'kv_pairs' in locals() and kv_pairs:
    print("\n" + "="*60)
    print("Smart Field Mapping")
    print("="*60 + "\n")

    mappings = smart_match_fields(JSON_INPUT, kv_pairs)

    if mappings:
        print(f"Found {len(mappings)} field mappings:\n")
        for mapping in mappings:
            print(f"PDF Field: '{mapping['pdf_field']}'")
            print(f"  → Maps to JSON: {mapping['json_field']}")
            print(f"  → Current: '{mapping['current_value']}'")
            print(f"  → New: '{mapping['new_value']}'")
            print()

        # Save mappings; create the output directory first so the write does
        # not fail with FileNotFoundError when 'results/' is missing.
        os.makedirs('results', exist_ok=True)
        with open('results/field_mappings.json', 'w', encoding='utf-8') as f:
            json.dump(mappings, f, indent=2, ensure_ascii=False)
        print("✓ Mappings saved to: results/field_mappings.json")
    else:
        print("No automatic mappings found.")
        print("You may need to create manual mappings.")
else:
    print("No key-value pairs available for mapping.")

## Step 5: Populate PDF Using Textract-Detected Positions

In [None]:
def populate_using_textract_positions(input_pdf, output_pdf, mappings, text_elements):
    """
    Write mapped values into a PDF next to the labels Textract detected.

    For each mapping, the first text element whose text contains the mapped
    label (case-insensitive) is located, and the new value is drawn in blue
    just to the right of it, on the page that element belongs to.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the populated PDF is saved to.
        mappings: Iterable of dicts with 'pdf_field' (label text) and
            'new_value' (value to write).
        text_elements: Iterable of dicts with 'text', 'page' (1-based) and
            'geometry' ('left', 'top', 'width', 'height' as fractions of the
            page size).

    Returns:
        None.  The result is written to output_pdf.
    """
    doc = fitz.open(input_pdf)
    try:
        for mapping in mappings:
            field_text = mapping['pdf_field']
            new_value = mapping['new_value']
            needle = field_text.lower()

            for elem in text_elements:
                if needle not in elem['text'].lower():
                    continue

                # Draw on the page the label was found on, not always the
                # first page; skip elements pointing outside the document.
                page_index = elem.get('page', 1) - 1
                if not 0 <= page_index < len(doc):
                    continue
                page = doc[page_index]

                # Convert normalized coordinates to PDF coordinates using
                # the dimensions of that page.
                geom = elem['geometry']
                x = (geom['left'] + geom['width'] + 0.02) * page.rect.width  # Add offset
                y = (geom['top'] + geom['height'] / 2) * page.rect.height

                # NOTE(review): no fontname is passed, so PyMuPDF's default
                # font is used - confirm it can render non-Latin (e.g.
                # Korean) values.
                page.insert_text(
                    fitz.Point(x, y),
                    str(new_value),
                    fontsize=10,
                    color=(0, 0, 1)  # Blue color to distinguish from original
                )

                print(f"  ✓ Added '{new_value}' near '{field_text}'")
                break

        doc.save(output_pdf)
    finally:
        # Release the document even if drawing or saving fails.
        doc.close()

# Populate the PDF once both the mappings and the positioned text exist.
if 'mappings' in locals() and mappings and 'text_elements' in locals():
    print("\n" + "="*60)
    print("Populating PDF with Textract Positions")
    print("="*60 + "\n")

    output_pdf = "results/populated_textract.pdf"

    try:
        # Create the output directory first so saving does not fail when
        # 'results/' is missing.
        os.makedirs('results', exist_ok=True)
        populate_using_textract_positions(PDF_INPUT, output_pdf, mappings, text_elements)
        print(f"\n✓ Output saved to: {output_pdf}")
        print("\nNote: New text is in BLUE to distinguish from original content.")
    except Exception as e:
        # Notebook-level boundary: report the failure instead of a traceback.
        print(f"✗ Error: {e}")
else:
    print("Required data not available. Run previous cells first.")

## Bonus: Analyze Multiple Pages

In [None]:
def get_page_summary(textract_response):
    """
    Group block content by page number.

    Returns a dict keyed by page number (1 when a block has no 'Page').
    Each entry holds the LINE texts under 'lines', the WORD texts under
    'words', and the number of KEY_VALUE_SET blocks under 'key_values'.
    Every block creates an entry for its page, whatever its type.
    """
    text_buckets = {'LINE': 'lines', 'WORD': 'words'}
    summary = {}

    for block in textract_response['Blocks']:
        entry = summary.setdefault(
            block.get('Page', 1),
            {'lines': [], 'words': [], 'key_values': 0},
        )

        kind = block['BlockType']
        if kind in text_buckets:
            entry[text_buckets[kind]].append(block.get('Text', ''))
        elif kind == 'KEY_VALUE_SET':
            entry['key_values'] += 1

    return summary

# Print a per-page overview once a Textract response is available
if 'textract_response' in locals():
    print(
        "\n" + "="*60,
        "Page-by-Page Summary",
        "="*60 + "\n",
        sep="\n",
    )

    page_summary = get_page_summary(textract_response)

    # Walk the pages in ascending page-number order
    for page_num, content in sorted(page_summary.items(), key=lambda item: item[0]):
        print(
            f"Page {page_num}:\n"
            f"  Lines: {len(content['lines'])}\n"
            f"  Words: {len(content['words'])}\n"
            f"  Key-Value pairs: {content['key_values']}"
        )
        if content['lines']:
            print(f"  First line: {content['lines'][0][:60]}...")
        print()

## Summary

### AWS Textract Advantages:

1. **Automatic field detection** - No need to know field names in advance
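2. **Multi-language support** - Handles documents with mixed content, but check Textract's list of supported languages: verify recognition quality for Korean text on your own documents before relying on it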
3. **Precise positioning** - Gets exact coordinates of text
4. **High accuracy** - Especially for printed documents
5. **Handles scanned documents** - Works with images embedded in PDFs

### Cost Considerations:

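- **Textract pricing**: the Forms feature used in this notebook is billed at roughly $50 per 1,000 pages; the ~$1.50 per 1,000 pages rate applies to plain text detection (Detect Document Text). Check the AWS pricing page for current rates in your region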
- **Analyze Document**: Pay per page analyzed
- Consider caching results for development

### Next Steps:

1. Review the extracted key-value pairs in `results/textract_key_values.json`
2. Check field mappings in `results/field_mappings.json`
3. Adjust mapping logic if needed for your specific form
4. For production: Implement error handling and retry logic
5. Consider using S3 + Lambda for automated processing