# Universal Text Extractor - Jupyter Notebook Example

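This notebook walks through using the Universal Text Extractor from Jupyter: checking supported formats, extracting text from documents and images (OCR), the quick helper functions, batch processing, and advanced OCR options.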

## 1. Import the Text Extractor

In [None]:
# Import the text extractor API
from text_extractor_api import TextExtractor, extract_text, extract_from_image

# Instantiate the extractor once; later cells reuse this same object.
extractor = TextExtractor()

# Report each capability entry returned by get_supported_formats(),
# one "Label: value" line per entry, in a fixed order.
print("Supported formats:")
formats = extractor.get_supported_formats()
for label, key in (
    ("Documents", "documents"),
    ("Images", "images"),
    ("OCR Available", "ocr_available"),
    ("OCR Backends", "ocr_backends"),
):
    print(f"{label}: {formats[key]}")

## 2. Extract Text from Documents

In [None]:
# Example: run the extractor on a PDF and show a short preview of the text
pdf_path = "your_document.pdf"  # Replace with your file path

# The whole cell stays inside the try so that any failure (in the call or
# while reading the result dict) is reported instead of raising.
try:
    result = extractor.extract_from_file(pdf_path)

    if not result["success"]:
        # Failure path: the result dict carries the error message.
        print(f"Error: {result['error']}")
    else:
        print(f"File Type: {result['file_type']}")
        print(f"OCR Used: {result['ocr_used']}")

        extracted = result['text']
        print(f"Extracted Text ({len(extracted)} characters):")
        print("=" * 50)
        # Show at most 500 characters; the ellipsis marks a truncated preview.
        if len(extracted) > 500:
            print(extracted[:500] + "...")
        else:
            print(extracted)

except Exception as exc:
    print(f"Exception: {exc}")

## 3. Extract Text from Images (OCR)

In [None]:
# Example: OCR a single image with the extractor instance
image_path = "your_image.jpg"  # Replace with your image path

# Everything stays inside the try so any failure is printed rather than raised.
try:
    result = extractor.extract_from_image(image_path, language="eng")

    if not result["success"]:
        # Failure path: the result dict carries the error message.
        print(f"Error: {result['error']}")
    else:
        print(f"OCR Used: {result['ocr_used']}")
        print("Extracted Text:")
        print("=" * 50)
        print(result['text'])

except Exception as exc:
    print(f"Exception: {exc}")

## 4. Quick Functions (Simplified Usage)

In [None]:
# Quick text extraction: extract_text() hands back the text directly
try:
    text = extract_text("your_document.pdf")
    print(f"Extracted {len(text)} characters")
    # Preview at most 200 characters; the ellipsis marks a truncated preview.
    preview = text if len(text) <= 200 else text[:200] + "..."
    print(preview)
except Exception as exc:
    print(f"Error: {exc}")

In [None]:
# Quick image OCR via the module-level helper
try:
    text = extract_from_image("your_image.jpg")
    # Header and OCR output on separate lines, emitted by a single print call.
    print("OCR Result:", text, sep="\n")
except Exception as exc:
    print(f"Error: {exc}")

## 5. Batch Processing

In [None]:
# Process multiple files at once
file_paths = [
    "document1.pdf",
    "document2.docx",
    "image1.jpg",
]

# The return value is iterated below as a mapping of filename -> result dict.
results = extractor.batch_extract(file_paths)

for filename, result in results.items():
    print(f"\n{filename}:")
    if result["success"]:
        extracted = result['text']
        print(f"  Type: {result['file_type']}")
        print(f"  OCR: {result['ocr_used']}")
        print(f"  Text length: {len(extracted)} characters")
        # Append the ellipsis only when the preview was actually cut short;
        # previously "..." was added unconditionally, which made short texts
        # look truncated and disagreed with the previews in the other cells.
        preview = extracted[:100] + "..." if len(extracted) > 100 else extracted
        print(f"  Preview: {preview}")
    else:
        # Failure path: the per-file result dict carries the error message.
        print(f"  Error: {result['error']}")

## 6. Advanced OCR Options

In [None]:
# Compare OCR output for the same image under several configurations
image_path = "handwritten_note.jpg"

# Each entry holds the OCR keyword arguments plus a display-only "name".
settings = [
    {"language": "eng", "handwriting_mode": False, "name": "Standard English"},
    {"language": "eng", "handwriting_mode": True, "name": "Handwriting Mode"},
    {"language": "fra", "handwriting_mode": False, "name": "French"},
]

for setting in settings:
    label = setting["name"]
    # Everything except the display name is forwarded to the OCR call,
    # i.e. language=... and handwriting_mode=...
    ocr_options = {key: value for key, value in setting.items() if key != "name"}

    # A failure in one configuration is reported and the loop moves on.
    try:
        result = extractor.extract_from_image(image_path, **ocr_options)

        print(f"\n{label}:")
        if not result["success"]:
            print(f"Error: {result['error']}")
        else:
            recognized = result['text']
            # Preview at most 200 characters; the ellipsis marks truncation.
            if len(recognized) > 200:
                print(recognized[:200] + "...")
            else:
                print(recognized)

    except Exception as exc:
        print(f"Exception with {label}: {exc}")