# API Testing Notebook - Entity Extraction from Real Files

This notebook tests the Resume NER API with actual test files and visualizes extracted entities.

**Note:** For comprehensive error handling and edge case testing, see `tests/integration/api/test_api_local_server.py`.

## Prerequisites

Before running this notebook, start the API server. See [`docs/api_testing_prerequisites.md`](../docs/api_testing_prerequisites.md) for detailed setup instructions.

**Quick start (Terminal - Recommended):**
```bash
# Activate environment
source /opt/conda/etc/profile.d/conda.sh
conda activate resume-ner-training

# Find an ONNX model (takes the first match returned by find, which is not necessarily the newest)
ONNX_MODEL=$(find outputs/conversion -name "model.onnx" -type f | head -1)

# Extract spec hash and find matching checkpoint
SPEC_HASH=$(echo "$ONNX_MODEL" | sed -n 's|.*\(spec-[a-f0-9]\{8\}_exec-[a-f0-9]\{8\}\).*|\1|p')
CHECKPOINT_DIR=$(find outputs/final_training -path "*${SPEC_HASH}*/checkpoint" -type d | head -1)

# Set PYTHONPATH and start server (PYTHONPATH required for infrastructure imports)
export PYTHONPATH="$(pwd)/src:$(pwd)"
python -m src.deployment.api.cli.run_api \
  --onnx-model "$ONNX_MODEL" \
  --checkpoint "$CHECKPOINT_DIR"
```



## 1. Setup and Configuration


In [None]:
# Install OCR dependencies for image processing
# Note: EasyOCR is the default OCR extractor. If you prefer pytesseract, install that instead.
%pip install easyocr pillow


In [None]:
import sys
from pathlib import Path
from typing import Any, Optional
from IPython.display import display, Markdown, JSON
import pandas as pd

# Make the project root and its ``src`` directory importable.
# This must run before any import from ``src`` or ``tests`` below.
current_dir = Path.cwd()
# When the notebook is launched from the "notebooks" folder, the root is one level up
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir
src_path = project_root / "src"

# ``src`` is handled first and the root second, each going to index 0 when
# absent, so a newly added root ends up ahead of a newly added ``src``
for _path_entry in (str(src_path), str(project_root)):
    if _path_entry not in sys.path:
        sys.path.insert(0, _path_entry)

# Import test fixtures
from tests.test_data.fixtures import (
    get_text_fixture,
    get_file_fixture,
    get_batch_text_fixture,
    get_batch_file_fixture,
    TEXT_FIXTURES,
    FILE_FIXTURES
)

# Import API utilities
from src.deployment.api.tools.model_finder import (
    find_latest_onnx_model,
    find_matching_checkpoint,
    find_model_pair,
    list_available_models,
)
from src.deployment.api.tools.server_launcher import (
    check_server_health,
    get_server_info,
    start_api_server,
    wait_for_server,
)
from src.deployment.api.tools.notebook_helpers import (
    display_entities,
    make_request,
)
from src.deployment.api.tools.notebook_config import (
    NotebookConfig,
    get_default_config,
)

# Helper functions for model discovery and server management
def find_and_display_models(outputs_dir: Optional[Path] = None, verbose: bool = False) -> tuple[Optional[Path], Optional[Path]]:
    """
    Locate an ONNX model / checkpoint pair and optionally report what was found.

    Args:
        outputs_dir: Root outputs directory (default: project_root / "outputs")
        verbose: If True, print the model counts and the outcome of the search

    Returns:
        Tuple of (onnx_path, checkpoint_path) when both were found;
        (None, None) whenever either of the two is missing
    """
    search_root = project_root / "outputs" if outputs_dir is None else outputs_dir

    # The inventory is always collected, but only used for the verbose summary
    inventory = list_available_models(search_root)
    if verbose:
        print(f"Found {len(inventory['onnx_models'])} ONNX model(s), {len(inventory['checkpoints'])} checkpoint(s)")

    onnx_path, checkpoint_path = find_model_pair(search_root)

    # Guard clause: an incomplete pair is reported (if requested) and discarded
    if not (onnx_path and checkpoint_path):
        if verbose:
            if not onnx_path:
                print("✗ No ONNX models found")
            if not checkpoint_path:
                print("✗ No matching checkpoint found")
        return None, None

    if verbose:
        print(f"✓ ONNX: {onnx_path}\n  Checkpoint: {checkpoint_path}")
    return onnx_path, checkpoint_path


def start_api_server_interactive(
    onnx_path: Optional[Path] = None,
    checkpoint_path: Optional[Path] = None,
    outputs_dir: Optional[Path] = None,
    host: str = "0.0.0.0",
    port: int = 8000,
    background: bool = True,
    verbose: bool = True,
) -> Optional[Any]:
    """
    Launch the API server, discovering the model files when they are not given.

    If either ``onnx_path`` or ``checkpoint_path`` is missing, BOTH are replaced
    by the pair returned from ``find_model_pair``; a single supplied path is not
    kept on its own.

    Args:
        onnx_path: Path to ONNX model (looked up when not provided)
        checkpoint_path: Path to checkpoint (looked up when not provided)
        outputs_dir: Root outputs directory (default: project_root / "outputs")
        host: Server host (default: "0.0.0.0")
        port: Server port (default: 8000)
        background: Run in background (default: True)
        verbose: If True, print status messages

    Returns:
        The value returned by start_api_server (presumably a subprocess.Popen
        handle in background mode - confirm against the launcher), or None when
        no model pair could be found or starting the server raised
    """
    search_root = project_root / "outputs" if outputs_dir is None else outputs_dir

    # Discover the model pair unless the caller supplied both halves
    if not (onnx_path and checkpoint_path):
        if verbose:
            print("Finding model pair...")
        discovered_onnx, discovered_checkpoint = find_model_pair(search_root)
        if not (discovered_onnx and discovered_checkpoint):
            if verbose:
                print("✗ Could not find model pair")
            return None
        onnx_path, checkpoint_path = discovered_onnx, discovered_checkpoint

    if verbose:
        print(f"Starting server on {host}:{port}...")

    try:
        server_proc = start_api_server(
            onnx_path=onnx_path,
            checkpoint_dir=checkpoint_path,
            host=host,
            port=port,
            background=background,
        )

        if background and server_proc:
            if verbose:
                print(f"✓ Server started (PID: {server_proc.pid})")
            # NOTE(review): wait_for_server is called without host/port - confirm
            # that it polls the right address when a non-default port is used
            became_ready = wait_for_server(timeout=30)
            if verbose:
                if became_ready:
                    print("✓ Server ready")
                else:
                    print("⚠ Server may still be starting")
        elif verbose:
            # Also reached when background=True but no process handle came back
            print("✓ Server started in foreground")

        return server_proc
    except Exception as exc:
        # Best-effort helper for interactive use: report the failure, do not raise
        if verbose:
            print(f"✗ Failed to start server: {exc}")
        return None


def verify_server_running(base_url: str = "http://localhost:8000", verbose: bool = True) -> bool:
    """
    Check the server's health endpoint and optionally print the outcome.

    Args:
        base_url: Base URL of the server (default: "http://localhost:8000")
        verbose: If True, print status messages

    Returns:
        True if check_server_health reports the server as healthy, False otherwise
    """
    # Normalise to a real bool so callers always get True/False back
    healthy = bool(check_server_health(base_url))

    if verbose:
        if healthy:
            print("✓ Server is healthy")
        else:
            print(f"✗ Server not running on {base_url}")

    return healthy


In [None]:
# API configuration: load the notebook defaults once and expose the base URL
# and request timeout as module-level constants
config: NotebookConfig = get_default_config()
API_BASE_URL, API_TIMEOUT = config["api_base_url"], config["api_timeout"]


In [None]:
# Bind the configured base URL and timeout into make_request so the remaining
# cells can call it without repeating them (keeps older cells working unchanged)
from functools import partial

_request_defaults = {"base_url": API_BASE_URL, "timeout": API_TIMEOUT}
make_request = partial(make_request, **_request_defaults)

# Health check: point the user at the launcher helper when the server is down
server_is_up = verify_server_running(API_BASE_URL)
if not server_is_up:
    print("⚠️  Server not running. Use start_api_server_interactive() to start it.")


In [None]:
# display_entities is now imported from src.deployment.api.tools.notebook_helpers
# No need to redefine it here


## 2. Single Text Prediction

Test entity extraction from individual text inputs.


### 2.1 Test with Sample Text


In [None]:
# Single-text prediction using the "text_1" fixture
text_1 = get_text_fixture("text_1")
result = make_request("POST", "/predict", json={"text": text_1})

# Entities are rendered only for a 200 response that carries a non-empty payload
payload = result.get("data") if result.get("status_code") == 200 else None
if payload:
    entities = payload.get("entities", [])
    display_entities(entities, source_text=text_1)


In [None]:
# Single-text prediction using the "text_2" fixture (contains email, phone, location)
text_2 = get_text_fixture("text_2")
result = make_request("POST", "/predict", json={"text": text_2})

# Entities are rendered only for a 200 response that carries a non-empty payload
payload = result.get("data") if result.get("status_code") == 200 else None
if payload:
    entities = payload.get("entities", [])
    display_entities(entities, source_text=text_2)


In [None]:
# Single-text prediction using the "text_special" fixture (contains email, phone, URL)
text_special = get_text_fixture("text_special")
result = make_request("POST", "/predict", json={"text": text_special})

# Entities are rendered only for a 200 response that carries a non-empty payload
payload = result.get("data") if result.get("status_code") == 200 else None
if payload:
    entities = payload.get("entities", [])
    display_entities(entities, source_text=text_special)


## 3. Single PDF File Prediction

Test entity extraction from PDF files.


In [None]:
# Upload the "file_1" PDF fixture and show the entities found in its extracted text
file_path = get_file_fixture("file_1", "pdf")
try:
    with open(file_path, "rb") as f:
        file_content = f.read()
    # The "file" entry is a (filename, raw bytes, content type) tuple
    files = {"file": (file_path.name, file_content, "application/pdf")}
    result = make_request("POST", "/predict/file", files=files)

    # Entities are rendered only for a 200 response that carries a non-empty payload
    payload = result.get("data") if result.get("status_code") == 200 else None
    if payload:
        extracted_text = payload.get("extracted_text", "")
        entities = payload.get("entities", [])
        display_entities(entities, source_text=extracted_text)
except Exception as exc:
    # Covers reading the file as well as the request and the rendering
    print(f"Error loading file: {exc}")


In [None]:
# Upload the larger "file_resume_1" PDF fixture and show the extracted entities
file_path = get_file_fixture("file_resume_1", "pdf")
try:
    with open(file_path, "rb") as f:
        file_content = f.read()
    # The "file" entry is a (filename, raw bytes, content type) tuple
    files = {"file": (file_path.name, file_content, "application/pdf")}
    result = make_request("POST", "/predict/file", files=files)

    # Entities are rendered only for a 200 response that carries a non-empty payload
    payload = result.get("data") if result.get("status_code") == 200 else None
    if payload:
        extracted_text = payload.get("extracted_text", "")
        entities = payload.get("entities", [])
        display_entities(entities, source_text=extracted_text)
except Exception as exc:
    # Covers reading the file as well as the request and the rendering
    print(f"Error loading file: {exc}")


## 4. Single Image File Prediction

Test entity extraction from image files (PNG) using OCR.


In [None]:
# Test with PNG image file: upload the "file_1" PNG fixture and show the
# entities found in the text the API extracted from it
file_path = get_file_fixture("file_1", "png")
try:
    with open(file_path, "rb") as f:
        file_content = f.read()
    files = {"file": (file_path.name, file_content, "image/png")}
    result = make_request("POST", "/predict/file", files=files)

    status_code = result.get("status_code")
    # "data" can be absent or None on failures; normalise so .get() is always safe
    data = result.get("data") or {}

    if status_code == 200 and data:
        extracted_text = data.get("extracted_text", "")
        entities = data.get("entities", [])
        # Nothing is rendered when no text came back from the image
        if extracted_text:
            display_entities(entities, source_text=extracted_text)
    elif status_code == 400:
        # Coerce to str: the detail is not guaranteed to be a plain string
        error_detail = str(data.get("detail", ""))
        if any(name in error_detail for name in ("EasyOCR", "pytesseract", "Pillow")):
            print("⚠️  OCR dependencies not installed")
        else:
            # Surface other client errors instead of dropping them silently
            print(f"✗ Request rejected (400): {error_detail}")
except Exception as e:
    print(f"Error loading file: {e}")


## 5. Batch Text Prediction

Test entity extraction from multiple text inputs in a single batch.


### 5.1 Batch with Multiple Texts


In [None]:
# Test batch with multiple texts: send the whole fixture list in one request
# and render the entities of each prediction next to its source text
texts = get_batch_text_fixture("batch_text_small")
result = make_request("POST", "/predict/batch", json={"texts": texts})

if result.get("status_code") == 200 and result.get("data"):
    predictions = result["data"].get("predictions", [])
    # zip pairs inputs with predictions positionally and stops at the shorter
    # of the two; the previous enumerate() counter was never used
    for text, prediction in zip(texts, predictions):
        entities = prediction.get("entities", [])
        display_entities(entities, source_text=text)


## 6. Batch File Prediction

Test entity extraction from multiple files in a single batch.


### 6.1 Batch with PDF Files Only


In [None]:
# Test batch with PDF files: upload every fixture file in one request and
# render the entities for each prediction that came back with extracted text
file_paths = get_batch_file_fixture("batch_file_small", "pdf")
try:
    files_list = []
    for file_path in file_paths:
        with open(file_path, "rb") as f:
            file_content = f.read()
        # Every part is sent under the same "files" field name
        files_list.append(("files", (file_path.name, file_content, "application/pdf")))

    result = make_request("POST", "/predict/file/batch", files=files_list)

    if result.get("status_code") == 200 and result.get("data"):
        predictions = result["data"].get("predictions", [])
        # zip stops at the shorter sequence, so surplus predictions are ignored;
        # the previous enumerate() counter was never used
        for file_path, prediction in zip(file_paths, predictions):
            extracted_text = prediction.get("extracted_text", "")
            entities = prediction.get("entities", [])
            if extracted_text:
                display_entities(entities, source_text=extracted_text)
except Exception as e:
    print(f"Error: {e}")


## 7. Mixed Batch Prediction

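Test entity extraction from a mixture of texts, PDF files, and images. The API exposes separate endpoints for texts and for files, so each group is sent in its own request and the results are combined afterwards.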


In [None]:
# Test mixed content (texts + PDFs + images)
# Note: API endpoints are separate, so we process them separately and combine results

texts = [get_text_fixture("text_1"), get_text_fixture("text_2")]
pdf_files = [get_file_fixture("file_1", "pdf")]
png_files = [get_file_fixture("file_1", "png")]

# Each entry: {"type": ..., "content" or "file": ..., "result": <prediction dict>}
all_results = []

# Process texts
text_result = make_request("POST", "/predict/batch", json={"texts": texts})
if text_result.get("status_code") == 200:
    # "data" can be absent or None; fall back to an empty dict instead of raising
    text_predictions = (text_result.get("data") or {}).get("predictions", [])
    all_results.extend(
        {"type": "text", "content": text, "result": pred}
        for text, pred in zip(texts, text_predictions)
    )

# Process PDF files
try:
    files_list = []
    for file_path in pdf_files:
        with open(file_path, "rb") as f:
            file_content = f.read()
        files_list.append(("files", (file_path.name, file_content, "application/pdf")))

    pdf_result = make_request("POST", "/predict/file/batch", files=files_list)
    if pdf_result.get("status_code") == 200:
        pdf_predictions = (pdf_result.get("data") or {}).get("predictions", [])
        all_results.extend(
            {"type": "pdf", "file": str(fp), "result": pred}
            for fp, pred in zip(pdf_files, pdf_predictions)
        )
except Exception as e:
    print(f"Error processing PDFs: {e}")

# Process image files
try:
    files_list = []
    for file_path in png_files:
        with open(file_path, "rb") as f:
            file_content = f.read()
        files_list.append(("files", (file_path.name, file_content, "image/png")))

    png_result = make_request("POST", "/predict/file/batch", files=files_list)
    if png_result.get("status_code") == 200:
        png_predictions = (png_result.get("data") or {}).get("predictions", [])
        all_results.extend(
            {"type": "image", "file": str(fp), "result": pred}
            for fp, pred in zip(png_files, png_predictions)
        )
    elif png_result.get("status_code") == 400:
        # Coerce to str: the detail is not guaranteed to be a plain string
        error_detail = str((png_result.get("data") or {}).get("detail", ""))
        # Same three markers as the single-image cells ("Pillow" was missing here)
        if any(name in error_detail for name in ("EasyOCR", "pytesseract", "Pillow")):
            print("⚠️  OCR dependencies not installed")
except Exception as e:
    print(f"Error processing images: {e}")

# Display combined results
for item in all_results:
    result = item["result"]
    entities = result.get("entities", [])

    if item["type"] == "text":
        # Text inputs are shown against the text that was sent
        display_entities(entities, source_text=item["content"])
    else:
        # File inputs are shown against the text extracted from them;
        # files without any extracted text are skipped
        extracted_text = result.get("extracted_text", "")
        if extracted_text:
            display_entities(entities, source_text=extracted_text)


## 8. Cross-Format Consistency Test

Test the same content across different formats (text, PDF, PNG) to verify entity extraction consistency and compare performance.


In [None]:
# Test the same content in different formats
# Sends one inline text, one PDF and one PNG to the API, renders the entities
# of each, then compares entity overlap and reported processing times.
sample_text = "John Doe is a software engineer at Google. Email: john.doe@example.com. Phone: +1-555-123-4567. Location: Seattle, WA."

pdf_file = get_file_fixture("file_resume_1", "pdf")
png_file = get_file_fixture("file_resume_1", "png")

# One record per successful request, appended in the order Text, PDF, PNG
results = []

# Test 1: Text format
text_result = make_request("POST", "/predict", json={"text": sample_text})
if text_result.get("status_code") == 200:
    # "data" can be absent or None; fall back to an empty dict instead of raising
    text_data = text_result.get("data") or {}
    results.append({
        "format": "Text",
        "input": sample_text,
        "extracted_text": sample_text,
        "entities": text_data.get("entities", []),
        "processing_time_ms": text_data.get("processing_time_ms", 0),
        "num_entities": len(text_data.get("entities", []))
    })
    display_entities(text_data.get("entities", []), source_text=sample_text)

# Test 2: PDF format
try:
    with open(pdf_file, "rb") as f:
        pdf_content = f.read()
    pdf_files = {"file": (pdf_file.name, pdf_content, "application/pdf")}
    pdf_result = make_request("POST", "/predict/file", files=pdf_files)

    if pdf_result.get("status_code") == 200:
        pdf_data = pdf_result.get("data") or {}
        extracted_text = pdf_data.get("extracted_text", "")
        results.append({
            "format": "PDF",
            "input": str(pdf_file),
            "extracted_text": extracted_text,
            "entities": pdf_data.get("entities", []),
            "processing_time_ms": pdf_data.get("processing_time_ms", 0),
            "num_entities": len(pdf_data.get("entities", []))
        })
        display_entities(pdf_data.get("entities", []), source_text=extracted_text)
except Exception as e:
    print(f"Error loading PDF: {e}")

# Test 3: PNG format
try:
    with open(png_file, "rb") as f:
        png_content = f.read()
    png_files = {"file": (png_file.name, png_content, "image/png")}
    png_result = make_request("POST", "/predict/file", files=png_files)

    if png_result.get("status_code") == 200:
        png_data = png_result.get("data") or {}
        extracted_text = png_data.get("extracted_text", "")
        results.append({
            "format": "PNG",
            "input": str(png_file),
            "extracted_text": extracted_text,
            "entities": png_data.get("entities", []),
            "processing_time_ms": png_data.get("processing_time_ms", 0),
            "num_entities": len(png_data.get("entities", []))
        })
        display_entities(png_data.get("entities", []), source_text=extracted_text)
    elif png_result.get("status_code") == 400:
        # Coerce to str: the detail is not guaranteed to be a plain string
        error_detail = str((png_result.get("data") or {}).get("detail", ""))
        if any(name in error_detail for name in ("EasyOCR", "pytesseract", "Pillow")):
            print("⚠️  OCR dependencies not installed")
except Exception as e:
    print(f"Error loading PNG: {e}")


def _entity_keys(record):
    """Return the set of (text, label) pairs for one result record."""
    return {(e.get("text", ""), e.get("label", "")) for e in record["entities"]}


# Comparison Summary (needs at least two successful formats)
if len(results) >= 2:
    comparison_data = [
        {
            "Format": r["format"],
            "Processing Time (ms)": f"{r['processing_time_ms']:.1f}",
            "Entities Extracted": r["num_entities"],
            "Text Length": len(r["extracted_text"]),
        }
        for r in results
    ]

    comparison_df = pd.DataFrame(comparison_data)
    display(comparison_df)

    # Entity consistency analysis
    # Only when all three requests succeeded; results is then [Text, PDF, PNG]
    if len(results) == 3:
        text_entities = _entity_keys(results[0])
        pdf_entities = _entity_keys(results[1])
        png_entities = _entity_keys(results[2])

        common_all = text_entities & pdf_entities & png_entities
        # Pairs shared by exactly two of the three formats
        text_pdf_only = (text_entities & pdf_entities) - png_entities
        text_png_only = (text_entities & png_entities) - pdf_entities
        pdf_png_only = (pdf_entities & png_entities) - text_entities

        if common_all:
            print(f"\nEntities in all formats ({len(common_all)}):")
            for entity in sorted(common_all):
                print(f"  - '{entity[0]}' ({entity[1]})")

        if text_pdf_only or text_png_only or pdf_png_only:
            print("\nFormat-specific entities:")
            if text_pdf_only:
                print(f"  Text & PDF only ({len(text_pdf_only)}): {', '.join([e[0] for e in sorted(text_pdf_only)])}")
            if text_png_only:
                print(f"  Text & PNG only ({len(text_png_only)}): {', '.join([e[0] for e in sorted(text_png_only)])}")
            if pdf_png_only:
                print(f"  PDF & PNG only ({len(pdf_png_only)}): {', '.join([e[0] for e in sorted(pdf_png_only)])}")

        # Performance comparison
        times = [r["processing_time_ms"] for r in results]
        formats = [r["format"] for r in results]
        min_time = min(times)
        max_time = max(times)

        print(f"\nPerformance: {formats[times.index(min_time)]} fastest ({min_time:.1f}ms), {formats[times.index(max_time)]} slowest ({max_time:.1f}ms)")

        text_time = results[0]["processing_time_ms"]
        png_time = results[2]["processing_time_ms"]
        if png_time > text_time:
            ocr_overhead = png_time - text_time
            if text_time > 0:
                print(f"OCR overhead: {ocr_overhead:.1f}ms ({ocr_overhead / text_time * 100:.1f}%)")
            else:
                # processing_time_ms falls back to 0 when the API omits it; a
                # percentage relative to zero is undefined, so print only the delta
                print(f"OCR overhead: {ocr_overhead:.1f}ms")
